欢迎您访问程序员文章站,本站旨在为大家分享程序员计算机编程知识!
您现在的位置是: 首页

IK分词器

程序员文章站 2024-02-22 16:20:52
...

1、引入依赖

<dependency>
    <groupId>com.janeluo</groupId>
    <artifactId>ikanalyzer</artifactId>
    <version>2012_u6</version>
</dependency>

 

2、IKUtil工具类

import com.asiainfo.biapp.aiop.web.product.config.CustomConfiguration;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
import org.wltea.analyzer.lucene.IKAnalyzer;

import java.io.IOException;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;


/**
 * Utility for Chinese word segmentation backed by the IK Analyzer
 * (com.janeluo:ikanalyzer 2012_u6).
 */
public class IKUtil {

    private static final Logger logger = LoggerFactory.getLogger(IKUtil.class);

    /**
     * Tokenizes the given texts through Lucene's {@link IKAnalyzer} and returns
     * the de-duplicated tokens across all inputs.
     *
     * @param textArr one or more texts to tokenize
     * @return the distinct tokens (order unspecified); tokens from a text that
     *         fails with an I/O error are skipped and the error is logged
     * @deprecated use {@link #parse(String)} instead; it goes through
     *             {@link IKSegmenter} with the custom dictionary configuration
     *             and does not depend on Lucene's TokenStream API
     */
    @Deprecated
    public static String[] separate(String... textArr) {
        Set<String> wordSet = new HashSet<>();

        // true = smart segmentation, false = finest-grained segmentation.
        // Analyzer and TokenStream are both Closeable; the previous version
        // leaked them and re-created the Analyzer for every input text, so
        // create it once and manage both with try-with-resources.
        try (Analyzer analyzer = new IKAnalyzer(true)) {
            for (String text : textArr) {
                try (TokenStream ts = analyzer.tokenStream("", new StringReader(text))) {
                    // reset() is mandatory before incrementToken(), otherwise
                    // the stream throws an IllegalStateException.
                    ts.reset();
                    CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
                    while (ts.incrementToken()) {
                        wordSet.add(term.toString());
                    }
                    // Per the TokenStream contract, end() must be called after
                    // the last incrementToken() and before close().
                    ts.end();
                } catch (IOException e) {
                    logger.error("分词异常", e);
                }
            }
        }
        return wordSet.toArray(new String[0]);
    }

    /**
     * Tokenizes {@code text} with an {@link IKSegmenter} driven by the
     * {@link CustomConfiguration} singleton (custom main/stop-word
     * dictionaries) and returns the de-duplicated tokens.
     *
     * @param text the text to tokenize
     * @return the distinct tokens (order unspecified); may be partial if an
     *         I/O error occurs, which is logged
     */
    public static String[] parse(String text) {
        Set<String> wordSet = new HashSet<>();

        StringReader sr = new StringReader(text);
        IKSegmenter ikSegmenter = new IKSegmenter(sr, CustomConfiguration.getInstance());
        try {
            Lexeme word;
            while ((word = ikSegmenter.next()) != null) {
                wordSet.add(word.getLexemeText());
            }
        } catch (IOException e) {
            logger.error("获取分词结果异常", e);
        }
        return wordSet.toArray(new String[0]);
    }

    /** Quick manual smoke test: prints the tokens of a sample sentence. */
    public static void main(String[] args) {
        String text = "我是网管智慧中台的老大";
        String[] arr = IKUtil.parse(text);
        for (String s : arr) {
            System.out.println(s);
        }
    }

}

 

3、自定义配置类

import org.wltea.analyzer.cfg.Configuration;

import java.util.ArrayList;
import java.util.List;

/**
 * Singleton IK Analyzer {@link Configuration} that points the segmenter at
 * custom main and stop-word dictionaries on the classpath, while reusing the
 * quantifier dictionary bundled inside the ikanalyzer jar.
 *
 * @author xulong3
 * @date 2020/7/1 17:14
 */
public class CustomConfiguration implements Configuration {

    // Classpath locations of the custom dictionaries.
    private static final String PATH_DIC_MAIN = "words/main.dic";
    private static final String PATH_DIC_STOPWORD = "words/stopword.dic";
    // Quantifier dictionary shipped inside the ikanalyzer jar itself.
    private static final String PATH_DIC_QUANTIFIER = "org/wltea/analyzer/dic/quantifier.dic";

    private static final CustomConfiguration INSTANCE = new CustomConfiguration();

    private CustomConfiguration() {
    }

    public static CustomConfiguration getInstance() {
        return INSTANCE;
    }

    // volatile so a toggle via setUseSmart on one thread is visible to
    // segmenters reading it on other threads.
    private volatile boolean useSmart;

    @Override
    public boolean useSmart() {
        return useSmart;
    }

    @Override
    public void setUseSmart(boolean useSmart) {
        this.useSmart = useSmart;
    }

    @Override
    public String getMainDictionary() {
        return PATH_DIC_MAIN;
    }

    // NOTE: the misspelled name ("Dicionary") is dictated by the upstream
    // Configuration interface and must stay as-is to satisfy @Override.
    @Override
    public String getQuantifierDicionary() {
        return PATH_DIC_QUANTIFIER;
    }

    @Override
    public List<String> getExtDictionarys() {
        // Return an empty list instead of null so callers can iterate safely;
        // there are no extension dictionaries beyond the custom main one.
        return new ArrayList<>();
    }

    @Override
    public List<String> getExtStopWordDictionarys() {
        List<String> pathList = new ArrayList<>();
        pathList.add(PATH_DIC_STOPWORD);
        return pathList;
    }
}