Lucene分词器测试
程序员文章站
2022-07-01 15:39:09
...
Maven依赖
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>6.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
<version>6.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-smartcn</artifactId>
<version>6.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-highlighter</artifactId>
<version>6.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queries</artifactId>
<version>6.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
<version>6.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-memory</artifactId>
<version>6.0.0</version>
</dependency>
<!--IK分析器依赖-->
<dependency>
<groupId>com.janeluo</groupId>
<artifactId>ikanalyzer</artifactId>
<version>2012_u6</version>
</dependency>
测试代码
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
 * Demo that runs the same Chinese sample sentence through several Lucene
 * analyzers (standard, whitespace, simple, CJK bigram, keyword, stop-word,
 * and SmartChinese) and prints the resulting tokens for comparison.
 */
public class AnalyzerPrint {
    /** Sample text to tokenize (Chinese sentence with embedded digits). */
    private static String strCh = "*简称中国,是一个有13亿人口的国家";

    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer(); // standard analyzer: single CJK chars + digit runs
        System.out.println("标准分词:" + analyzer.getClass());
        printAnalyzer(analyzer);
        analyzer = new WhitespaceAnalyzer(); // splits on whitespace only
        System.out.println("空格分词:" + analyzer.getClass());
        printAnalyzer(analyzer);
        analyzer = new SimpleAnalyzer(); // splits on non-letters, lowercases (drops digits)
        System.out.println("简单分词:" + analyzer.getClass());
        printAnalyzer(analyzer);
        analyzer = new CJKAnalyzer(); // CJK bigram (overlapping two-character) tokenizer
        System.out.println("二分法分词:" + analyzer.getClass());
        printAnalyzer(analyzer);
        analyzer = new KeywordAnalyzer(); // whole input emitted as a single token
        System.out.println("关键字分词:" + analyzer.getClass());
        printAnalyzer(analyzer);
        analyzer = new StopAnalyzer(); // SimpleAnalyzer behavior + English stop-word removal
        System.out.println("停用词分词:" + analyzer.getClass());
        printAnalyzer(analyzer);
        analyzer = new SmartChineseAnalyzer(); // probabilistic Chinese word segmentation
        System.out.println("中文智能分词:" + analyzer.getClass());
        printAnalyzer(analyzer);
    }

    /**
     * Tokenizes {@link #strCh} with the given analyzer and prints each term
     * separated by '|'. Closes both the token stream and the analyzer.
     *
     * @param analyzer the analyzer to exercise; closed before returning
     * @throws IOException if the token stream fails while advancing
     */
    public static void printAnalyzer(Analyzer analyzer) throws IOException {
        StringReader reader = new StringReader(strCh);
        // First argument is the FIELD NAME, not the text being analyzed.
        // The original passed strCh here, which happens to work (none of these
        // analyzers vary per field) but is misleading; use a plain field name.
        // try-with-resources guarantees the stream is closed even on error.
        try (TokenStream toStream = analyzer.tokenStream("content", reader)) {
            toStream.reset(); // mandatory before the first incrementToken()
            CharTermAttribute teAttribute = toStream.getAttribute(CharTermAttribute.class);
            System.out.println("分词结果:");
            while (toStream.incrementToken()) {
                System.out.print(teAttribute.toString() + "|");
            }
            toStream.end(); // per TokenStream contract: end() after the last token
        }
        System.out.println("\n");
        analyzer.close();
    }
}
结果
标准分词:class org.apache.lucene.analysis.standard.StandardAnalyzer
分词结果:
中|华|人|民|共|和|国|简|称|中|国|是|一|个|有|13|亿|人|口|的|国|家|
空格分词:class org.apache.lucene.analysis.core.WhitespaceAnalyzer
分词结果:
*简称中国,是一个有13亿人口的国家|
简单分词:class org.apache.lucene.analysis.core.SimpleAnalyzer
分词结果:
*简称中国|是一个有|亿人口的国家|
二分法分词:class org.apache.lucene.analysis.cjk.CJKAnalyzer
分词结果:
中华|华人|人民|民共|共和|和国|国简|简称|称中|中国|是一|一个|个有|13|亿人|人口|口的|的国|国家|
关键字分词:class org.apache.lucene.analysis.core.KeywordAnalyzer
分词结果:
*简称中国,是一个有13亿人口的国家|
停用词分词:class org.apache.lucene.analysis.core.StopAnalyzer
分词结果:
*简称中国|是一个有|亿人口的国家|
中文智能分词:class org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer
分词结果:
*|简称|中国|是|一个|有|13|亿|人口|的|国家|
下一篇: webrtc 计算解码时间的方法