IK分词器
程序员文章站
2024-02-22 16:20:52
...
1、引入依赖
<dependency>
<groupId>com.janeluo</groupId>
<artifactId>ikanalyzer</artifactId>
<version>2012_u6</version>
</dependency>
2、IKUtil工具类
import com.asiainfo.biapp.aiop.web.product.config.CustomConfiguration;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
import org.wltea.analyzer.lucene.IKAnalyzer;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;
/**
 * Utility methods for splitting text into words with the IK Analyzer tokenizer.
 *
 * <p>Both entry points return the distinct tokens they found. Tokens are collected in a
 * {@link HashSet}, so the order of the returned array is unspecified.
 */
public class IKUtil {

    private static final Logger logger = LoggerFactory.getLogger(IKUtil.class);

    /**
     * Tokenizes every given text through the Lucene {@link IKAnalyzer} (smart mode) and returns
     * the distinct tokens of all texts combined.
     *
     * <p>An {@link IOException} raised while tokenizing one text is logged and that text is
     * abandoned; tokens gathered so far are kept and the remaining texts are still processed.
     *
     * @param textArr the texts to tokenize; a {@code null} array or {@code null} elements are
     *                ignored
     * @return the distinct tokens, never {@code null}
     * @deprecated flagged as deprecated by the original author; {@link #parse(String)} is the
     *             non-deprecated entry point of this class
     */
    @Deprecated
    public static String[] separate(String... textArr) {
        Set<String> wordSet = new HashSet<>();
        if (textArr == null) {
            return new String[0];
        }
        // One analyzer serves every text and is closed when done.
        // Constructor flag: true = smart segmentation, false = finest-grained segmentation.
        try (Analyzer analyzer = new IKAnalyzer(true)) {
            for (String text : textArr) {
                if (text != null) {
                    collectTokens(analyzer, text, wordSet);
                }
            }
        }
        return wordSet.toArray(new String[0]);
    }

    /** Runs one text through the analyzer and adds each emitted term to {@code wordSet}. */
    private static void collectTokens(Analyzer analyzer, String text, Set<String> wordSet) {
        // The stream must be closed before the analyzer hands out the next one,
        // so it is managed by try-with-resources together with the reader.
        try (StringReader reader = new StringReader(text);
             TokenStream ts = analyzer.tokenStream("", reader)) {
            // reset() has to be called before the first incrementToken(), otherwise it fails.
            ts.reset();
            CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
            while (ts.incrementToken()) {
                wordSet.add(term.toString());
            }
            // Completes the TokenStream consumer workflow: reset -> incrementToken* -> end -> close.
            ts.end();
        } catch (IOException e) {
            logger.error("分词异常", e);
        }
    }

    /**
     * Tokenizes a text with {@link IKSegmenter}, configured by
     * {@code CustomConfiguration.getInstance()}, and returns the distinct tokens.
     *
     * <p>An {@link IOException} raised during segmentation is logged; the tokens gathered up to
     * that point are returned.
     *
     * @param text the text to tokenize; {@code null} yields an empty array
     * @return the distinct tokens, never {@code null}
     */
    public static String[] parse(String text) {
        if (text == null) {
            return new String[0];
        }
        Set<String> wordSet = new HashSet<>();
        try (StringReader sr = new StringReader(text)) {
            IKSegmenter ikSegmenter = new IKSegmenter(sr, CustomConfiguration.getInstance());
            Lexeme word;
            // next() returns null once the input is exhausted.
            while ((word = ikSegmenter.next()) != null) {
                wordSet.add(word.getLexemeText());
            }
        } catch (IOException e) {
            logger.error("获取分词结果异常", e);
        }
        return wordSet.toArray(new String[0]);
    }

    /** Small manual demo: tokenizes a sample sentence and prints one token per line. */
    public static void main(String[] args) {
        String text = "我是网管智慧中台的老大";
        String[] arr = IKUtil.parse(text);
        for (String s : arr) {
            System.out.println(s);
        }
    }
}
3、自定义配置类
import org.wltea.analyzer.cfg.Configuration;
import java.util.ArrayList;
import java.util.List;
/**
 * IK Analyzer {@link Configuration} that points the tokenizer at project-specific
 * dictionary files instead of the library defaults.
 *
 * <p>The class is a singleton: {@link #getInstance()} always returns the same object, so a
 * call to {@link #setUseSmart(boolean)} changes the segmentation mode for every user of
 * that instance.
 *
 * <p>Created 2020/7/1.
 *
 * @author xulong3
 */
public class CustomConfiguration implements Configuration {

    // Dictionary locations handed to IK.
    // NOTE(review): presumably resolved as classpath resources by IK — confirm.
    private static final String PATH_DIC_MAIN = "words/main.dic";
    private static final String PATH_DIC_STOPWORD = "words/stopword.dic";
    private static final String PATH_DIC_QUANTIFIER = "org/wltea/analyzer/dic/quantifier.dic";

    /** The single shared instance, created when the class is initialized. */
    private static final CustomConfiguration customConfiguration = new CustomConfiguration();

    /** Segmentation mode flag; {@code false} until {@link #setUseSmart(boolean)} is called. */
    private boolean useSmart;

    /** Not instantiable from outside; use {@link #getInstance()}. */
    private CustomConfiguration() {
    }

    /**
     * Returns the shared configuration instance.
     *
     * @return the singleton, never {@code null}
     */
    public static CustomConfiguration getInstance() {
        return customConfiguration;
    }

    /**
     * @return the current value of the smart-segmentation flag
     */
    @Override
    public boolean useSmart() {
        return useSmart;
    }

    /**
     * Sets the smart-segmentation flag on the shared instance.
     *
     * @param useSmart the new flag value
     */
    @Override
    public void setUseSmart(boolean useSmart) {
        this.useSmart = useSmart;
    }

    /**
     * @return the path of the main dictionary, {@code words/main.dic}
     */
    @Override
    public String getMainDictionary() {
        return PATH_DIC_MAIN;
    }

    /**
     * The method name (including its spelling) is dictated by the {@link Configuration}
     * interface.
     *
     * @return the path of the quantifier dictionary
     */
    @Override
    public String getQuantifierDicionary() {
        return PATH_DIC_QUANTIFIER;
    }

    /**
     * No extension dictionaries are configured.
     *
     * @return an empty list (rather than {@code null}) so callers can iterate without a
     *         null check
     */
    @Override
    public List<String> getExtDictionarys() {
        return new ArrayList<>();
    }

    /**
     * @return a new list containing the single stop-word dictionary path,
     *         {@code words/stopword.dic}
     */
    @Override
    public List<String> getExtStopWordDictionarys() {
        List<String> pathList = new ArrayList<>();
        pathList.add(PATH_DIC_STOPWORD);
        return pathList;
    }
}
上一篇: .NET中方法的注意事项总结
下一篇: Flash页面如何通过校验