前端之家收集整理的这篇文章主要介绍了在 CentOS 上使用 Java 调用 Tesseract 进行 OCR 识别,前端之家小编觉得挺不错的,现在分享给大家,也给大家做个参考。
1、安装过程:
安装ocr
yum install tesseract-ocr
查找中文包
yum search tesseract-ocr | grep sim
安装中文包
yum install tesseract-langpack-chi_sim
安装版本信息:
[test-ugc-api01 tesseract]$ tesseract -v
tesseract 3.04.00
leptonica-1.72
libgif 4.1.6(?) : libjpeg 6b (libjpeg-turbo 1.2.90) : libpng 1.5.13 : libtiff 4.0.3 : zlib 1.2.7 : libwebp 0.3.0
2、java开发
注意版本匹配:3.04.00,采用tess4j
<dependency>
    <groupId>net.sourceforge.tess4j</groupId>
    <artifactId>tess4j</artifactId>
    <version>3.0.0</version>
</dependency>
简单测试代码:
public
String ocr(String url) { String datapath
=
"
/usr/share/tesseract/
"
; String language
=
"
chi_sim
"
;
//
进行相关的检测
try
{ url
=
url.trim(); System.
out
.println(
"
url is:
"
+
url); URL targetUrl
=
new
URL(url); BufferedImage image
=
ImageIO.read(targetUrl); ByteBuffer buf
=
ImageIOHelper.convertImageData(image);
int
bpp
=
image.getColorModel().getPixelSize();
int
bytespp
=
bpp
/
8
;
int
bytespl
=
(
int
) Math.ceil(image.getWidth()
*
bpp
/
8.0
); System.
out
.println(
"
bpp is:
"
+
bpp
+
"
;bytespp is:
"
+
bytespp
+
"
;bytespl is:
"
+
bytespl);
//
初始化
ITessAPI.TessBaseAPI handle
=
TessAPI1.TessBaseAPICreate(); TessAPI1.TessBaseAPIInit3(handle,datapath,language); TessAPI1.TessBaseAPISetPageSegMode(handle,ITessAPI.TessPageSegMode.PSM_AUTO); Pointer utf8Text
=
TessAPI1.TessBaseAPIRect(handle,buf,bytespp,bytespl,
0
,image.getWidth(),image.getHeight()); String result
=
utf8Text.getString(
0
); TessAPI1.TessDeleteText(utf8Text); TessAPI1.TessBaseAPIDelete(handle); System.
out
.println(
"
==============================================
"
); System.
out
.println(
"
result is:
"
+
result); System.
out
.println(
"
==============================================
"
);
if
(result.equalsIgnoreCase(
""
)){ System.
out
.println(
"
no detected words!!
"
); }
return
result; }
catch
(Exception ex){ ex.printStackTrace(); }
return
"
no detected words!!
"
; }
# 查询相关包
[test-ugc-api01 tesseract]$ rpm -qa | grep tesseract
tesseract-langpack-chi_sim-3.04.00-3.el7.noarch
tesseract-3.04.00-3.el7.x86_64
# 查询包具体安装位置
[test-ugc-api01 tesseract]$ rpm -ql tesseract-3.04.00-3.el7.x86_64
/usr/bin/ambiguous_words
/usr/bin/classifier_tester
/usr/bin/cntraining
/usr/bin/combine_tessdata
/usr/bin/dawg2wordlist
/usr/bin/mftraining
/usr/bin/set_unicharset_properties
/usr/bin/shapeclustering
/usr/bin/tesseract
/usr/bin/text2image
/usr/bin/unicharset_extractor
/usr/bin/wordlist2dawg
/usr/lib64/libtesseract.so.3
/usr/lib64/libtesseract.so.3.0.4
/usr/share/doc/tesseract-3.04.00
/usr/share/doc/tesseract-3.04.00/AUTHORS
/usr/share/doc/tesseract-3.04.00/ChangeLog
/usr/share/doc/tesseract-3.04.00/NEWS
/usr/share/doc/tesseract-3.04.00/README
/usr/share/doc/tesseract-3.04.00/eurotext.tif
/usr/share/doc/tesseract-3.04.00/phototest.tif
/usr/share/licenses/tesseract-3.04.00
/usr/share/licenses/tesseract-3.04.00/COPYING
/usr/share/man/man1/ambiguous_words.1.gz
/usr/share/man/man1/cntraining.1.gz
/usr/share/man/man1/combine_tessdata.1.gz
/usr/share/man/man1/dawg2wordlist.1.gz
/usr/share/man/man1/mftraining.1.gz
/usr/share/man/man1/shapeclustering.1.gz
/usr/share/man/man1/tesseract.1.gz
/usr/share/man/man1/unicharset_extractor.1.gz
/usr/share/man/man1/wordlist2dawg.1.gz
/usr/share/man/man5/unicharambigs.5.gz
/usr/share/man/man5/unicharset.5.gz
/usr/share/tesseract
/usr/share/tesseract/tessdata
/usr/share/tesseract/tessdata/configs
/usr/share/tesseract/tessdata/configs/ambigs.train
/usr/share/tesseract/tessdata/configs/api_config
/usr/share/tesseract/tessdata/configs/bigram
/usr/share/tesseract/tessdata/configs/box.train
/usr/share/tesseract/tessdata/configs/box.train.stderr
/usr/share/tesseract/tessdata/configs/digits
/usr/share/tesseract/tessdata/configs/hocr
/usr/share/tesseract/tessdata/configs/inter
/usr/share/tesseract/tessdata/configs/kannada
/usr/share/tesseract/tessdata/configs/linebox
/usr/share/tesseract/tessdata/configs/logfile
/usr/share/tesseract/tessdata/configs/makebox
/usr/share/tesseract/tessdata/configs/pdf
/usr/share/tesseract/tessdata/configs/quiet
/usr/share/tesseract/tessdata/configs/rebox
/usr/share/tesseract/tessdata/configs/strokewidth
/usr/share/tesseract/tessdata/configs/unlv
/usr/share/tesseract/tessdata/eng.cube.bigrams
/usr/share/tesseract/tessdata/eng.cube.fold
/usr/share/tesseract/tessdata/eng.cube.lm
/usr/share/tesseract/tessdata/eng.cube.nn
/usr/share/tesseract/tessdata/eng.cube.params
/usr/share/tesseract/tessdata/eng.cube.size
/usr/share/tesseract/tessdata/eng.cube.word-freq
/usr/share/tesseract/tessdata/eng.tesseract_cube.nn
/usr/share/tesseract/tessdata/eng.traineddata
/usr/share/tesseract/tessdata/pdf.ttf
/usr/share/tesseract/tessdata/tessconfigs
/usr/share/tesseract/tessdata/tessconfigs/batch
/usr/share/tesseract/tessdata/tessconfigs/batch.nochop
/usr/share/tesseract/tessdata/tessconfigs/matdemo
/usr/share/tesseract/tessdata/tessconfigs/msdemo
/usr/share/tesseract/tessdata/tessconfigs/nobatch
/usr/share/tesseract/tessdata/tessconfigs/segdemo