今天看同事的PPT,其中提到了图片识别,用的是 tesseract-ocr,觉得不错,试一下;如果效果好,可以用来做验证码的识别。
http://code.google.com/p/tesseract-ocr/
tesseract是一款开源工具,我安装了Windows版试水先
1、首先登录首页在‘下载’页面下载
- tesseract-ocr-setup-xx.xx.exe
- chi_sim.traineddata.gz 中文语言包
2、双击安装包即可安装 tesseract-ocr。
3、安装中文语言包,将语言包 chi_sim.traineddata.gz 解压到 Tesseract-OCR下
4、写测试代码:
- package com.taobao.voc.tesseract;
-
- import java.io.BufferedReader;
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.InputStreamReader;
- import java.util.ArrayList;
- import java.util.List;
-
- import org.jdesktop.swingx.util.OS;
-
- public class OCR {
- private final String LANG_OPTION = "-l";
- private final String EOL = System.getProperty("line.separator");
- private String tessPath = "D://java_tools//Tesseract-OCR";
-
-
- public String recognizeText(File imageFile,String imageFormat)throws Exception{
- File tempImage = ImageIOHelper.createImage(imageFile,imageFormat);
- File outputFile = new File(imageFile.getParentFile(),"output");
- StringBuffer strB = new StringBuffer();
- List<String> cmd = new ArrayList<String>();
- if(OS.isWindowsXP()){
- cmd.add(tessPath+"//tesseract");
- }else if(OS.isLinux()){
- cmd.add("tesseract");
- }else{
- cmd.add(tessPath+"//tesseract");
- }
- cmd.add("");
- cmd.add(outputFile.getName());
- cmd.add(LANG_OPTION);
-
- cmd.add("eng");
-
- ProcessBuilder pb = new ProcessBuilder();
- pb.directory(imageFile.getParentFile());
-
- cmd.set(1, tempImage.getName());
- pb.command(cmd);
- pb.redirectErrorStream(true);
-
- Process process = pb.start();
-
- int w = process.waitFor();
-
-
- tempImage.delete();
-
- if(w==0){
- BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(outputFile.getAbsolutePath()+".txt"),"UTF-8"));
-
- String str;
- while((str = in.readLine())!=null){
- strB.append(str).append(EOL);
- }
- in.close();
- }else{
- String msg;
- switch(w){
- case 1:
- msg = "Errors accessing files.There may be spaces in your image‘s filename.";
- break;
- case 29:
- msg = "Cannot recongnize the image or its selected region.";
- break;
- case 31:
- msg = "Unsupported image format.";
- break;
- default:
- msg = "Errors occurred.";
- }
- tempImage.delete();
- throw new RuntimeException(msg);
- }
- new File(outputFile.getAbsolutePath()+".txt").delete();
- return strB.toString();
- }
- }
- package com.taobao.voc.tesseract;
-
- import java.awt.image.BufferedImage;
- import java.io.File;
- import java.io.IOException;
- import java.util.Iterator;
- import java.util.Locale;
-
- import javax.imageio.IIOImage;
- import javax.imageio.ImageIO;
- import javax.imageio.ImageReader;
- import javax.imageio.ImageWriteParam;
- import javax.imageio.ImageWriter;
- import javax.imageio.metadata.IIOMetadata;
- import javax.imageio.stream.ImageInputStream;
- import javax.imageio.stream.ImageOutputStream;
-
- import com.sun.media.imageio.plugins.tiff.TIFFImageWriteParam;
-
- public class ImageIOHelper {
-
- public static File createImage(File imageFile, String imageFormat) {
- File tempFile = null;
- try {
- Iterator<ImageReader> readers = ImageIO.getImageReadersByFormatName(imageFormat);
- ImageReader reader = readers.next();
-
- ImageInputStream iis = ImageIO.createImageInputStream(imageFile);
- reader.setInput(iis);
-
- IIOMetadata streamMetadata = reader.getStreamMetadata();
-
-
- TIFFImageWriteParam tiffWriteParam = new TIFFImageWriteParam(Locale.CHINESE);
- tiffWriteParam.setCompressionMode(ImageWriteParam.MODE_DISABLED);
-
-
- Iterator<ImageWriter> writers = ImageIO.getImageWritersByFormatName("tiff");
- ImageWriter writer = writers.next();
-
- BufferedImage bi = reader.read(0);
- IIOImage image = new IIOImage(bi,null,reader.getImageMetadata(0));
- tempFile = tempImageFile(imageFile);
- ImageOutputStream ios = ImageIO.createImageOutputStream(tempFile);
- writer.setOutput(ios);
- writer.write(streamMetadata, image, tiffWriteParam);
- ios.close();
-
- writer.dispose();
- reader.dispose();
-
- } catch (IOException e) {
- e.printStackTrace();
- }
- return tempFile;
- }
-
- private static File tempImageFile(File imageFile) {
- String path = imageFile.getPath();
- StringBuffer strB = new StringBuffer(path);
- strB.insert(path.lastIndexOf(‘.‘),0);
- return new File(strB.toString().replaceFirst("(?<=//.)(//w+)$", "tif"));
- }
-
- }
测试代码
- package com.taobao.voc.tesseract;
- import java.io.File;
- import java.io.IOException;
-
- public class TestOCR {
-
-
- public static void main(String[] args) {
- String path = "d://test4.jpg";
- try {
- String valCode = new OCR().recognizeText(new File(path), "jpg");
- System.out.println(valCode);
- } catch (IOException e) {
- e.printStackTrace();
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
-
- }
将 String path = "d://test4.jpg"; 修改为需要测试的图片路径,图片内容最好全是中文;如果是英文,请更换语言包,即修改 OCR 类中的语言参数(中文为 cmd.add("chi_sim");,英文为 cmd.add("eng");)。