tess4j OCR文字识别配合HanLP实现分词、关键字提取
程序员文章站
2022-07-06 19:56:09
...
HanLP: Han Language Processing
地址:https://github.com/hankcs/HanLP/tree/1.x
tess4j
依赖
<!-- https://mvnrepository.com/artifact/net.sourceforge.tess4j/tess4j -->
<dependency>
<groupId>net.sourceforge.tess4j</groupId>
<artifactId>tess4j</artifactId>
<version>4.4.0</version>
</dependency>
语言库下载
https://github.com/tesseract-ocr/tessdata
Demo
package com.example.demo;
import com.alibaba.fastjson.JSON;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.tokenizer.IndexTokenizer;
import com.hankcs.hanlp.tokenizer.NLPTokenizer;
import net.sourceforge.tess4j.Tesseract;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RestController;
import net.sourceforge.tess4j.ITesseract;
import java.io.File;
import java.util.*;
/**
* @author 贾镇泽 [email protected]
* @packagename com.example.demo
* @date 2020/11/4
* @time 8:57
*/
@RestController
public class FileCheck {

    /**
     * Demo endpoint: runs Tess4J OCR on a fixed image file, strips ASCII spaces from the
     * recognized text, and feeds the text to several HanLP analyses.
     *
     * <p>The image path and the tessdata directory are hard-coded below and must be adjusted
     * to the local machine before this endpoint can work.
     *
     * @return a JSON object (serialized with fastjson) whose keys name the HanLP analysis and
     *         whose values are the corresponding results for the recognized text
     * @throws Exception if OCR or any of the later processing steps fails
     */
    @GetMapping("/test")
    public String test() throws Exception {
        // Source image to recognize.
        File file = new File("D:\\d7e2d8a5-8322-4237-b804-33ce4a33fdda.png");

        // Create the Tess4J engine.
        ITesseract instance = new Tesseract();
        // Directory holding the tessdata language files; point this at your own copy.
        instance.setDatapath("E:\\Code_tmp\\tessdata");
        // Language data to use for recognition.
        instance.setLanguage("chi_sim");

        long startMillis = System.currentTimeMillis();

        // Run OCR, then drop the ASCII spaces from the recognized text.
        String result = instance.doOCR(file);
        result = result.replace(" ", "");

        // Segment once and reuse the list for both the console output and the response
        // (the result was previously computed twice on the same text).
        List<?> standardSegments = HanLP.segment(result);

        System.out.println(standardSegments);
        System.out.println(result);
        System.out.println("用时:" + (System.currentTimeMillis() - startMillis) + "ms");

        // LinkedHashMap keeps the keys in insertion order, so the JSON output is stable
        // from run to run instead of following HashMap's arbitrary iteration order.
        Map<String, Object> response = new LinkedHashMap<>();

        // Basic HanLP usage.
        response.put("标准分词", standardSegments);
        // Alternative NLP tokenizer calls, left disabled as in the original demo:
        // response.put("NLP分词", NLPTokenizer.segment(result));
        // response.put("NLP分词1", NLPTokenizer.analyze(result).toSimpleWordList());
        // response.put("NLP分词2", NLPTokenizer.analyze(result));
        response.put("索引分词", IndexTokenizer.segment(result));
        response.put("关键词提取", HanLP.extractKeyword(result, 10));
        response.put("自动摘要", HanLP.extractSummary(result, 10));
        response.put("短语提取", HanLP.extractPhrase(result, 10));

        return JSON.toJSONString(response);
    }
}
Tess4j 识别普通文字的识别率还算可以,但识别表格的效果惨不忍睹。