lucene使用hanlp分词

程序员文章站 2022-07-09 09:35:24

...

maven依赖

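<modelVersion>4.0.0</modelVersion>

<groupId>ff</groupId>
<artifactId>dd</artifactId>
<version>0.0.1-SNAPSHOT</version>
<packaging>jar</packaging>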

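<dependency>
  <groupId>org.apache.lucene</groupId>
  <artifactId>lucene-core</artifactId>
  <version>${lucene.version}</version>
</dependency>
<dependency>
  <groupId>org.apache.lucene</groupId>
  <artifactId>lucene-queryparser</artifactId>
  <version>${lucene.version}</version>
</dependency>
<dependency>
  <groupId>org.apache.lucene</groupId>
  <artifactId>lucene-analyzers-smartcn</artifactId>
  <version>${lucene.version}</version>
</dependency>
<dependency>
  <groupId>org.apache.lucene</groupId>
  <artifactId>lucene-analyzers-common</artifactId>
  <version>${lucene.version}</version>
</dependency>
<dependency>
  <groupId>com.hankcs.nlp</groupId>
  <artifactId>hanlp-lucene-plugin</artifactId>
  <version>1.1.2</version>
</dependency>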

<dependency>
  <groupId>junit</groupId>
  <artifactId>junit</artifactId>
  <version>3.8.1</version>
  <scope>test</scope>
</dependency>

/**
 * Demo entry point: tokenizes a fixed sample sentence with the two HanLP
 * Lucene analyzers and prints the results to standard output.
 *
 * <p>Output, in order:
 * <ol>
 *   <li>one line per token produced by {@code HanLPAnalyzer}
 *       (term, start offset, end offset, position increment);</li>
 *   <li>the marker line {@code 11111111};</li>
 *   <li>one line per token produced by {@code HanLPIndexAnalyzer};</li>
 *   <li>the query obtained by parsing the sample text with the index analyzer;</li>
 *   <li>the elapsed wall-clock time in milliseconds.</li>
 * </ol>
 *
 * @param args unused
 * @throws ParseException if the query parser rejects the sample text
 * @throws IOException    if a token stream fails while being consumed
 */
public static void main( String[] args ) throws ParseException, IOException
{
    long time = System.currentTimeMillis();

    String text = "以前发布过HanLP的Lucene插件，后来很多人跟我说山东人比武汉人听说过吃一线，长一智更好其实Solr更流行（反正我是觉得既然Solr是Lucene的子项目，那么稍武汉轻工大学微改改配置就能红安以及黄石路支持Solr），于是就抽空做了个Solr插件出来，开源在Github上，欢迎改进来自王宝强";

    // Analyzers are Closeable; try-with-resources releases them even when
    // tokenization or query parsing throws.
    try (Analyzer analyzer = new HanLPAnalyzer();
         Analyzer indexAnalyzer = new HanLPIndexAnalyzer()) {

        // Standard analyzer (per the author's note: long words are not split further).
        printTokens(analyzer, text);
        System.out.println(11111111);

        // Index analyzer (per the author's note: long words are fully split).
        printTokens(indexAnalyzer, text);

        // Inspect the tokenization result through a parsed query.
        Query query = new QueryParser("txt", indexAnalyzer).parse(text);
        System.out.println(query.toString("txt"));

        System.out.println(System.currentTimeMillis() - time);
    }
}

/**
 * Runs {@code text} through {@code analyzer} and prints each token as
 * {@code term  startOffset  endOffset  positionIncrement}.
 *
 * <p>Follows the TokenStream consumer workflow: obtain attributes, {@code reset()},
 * loop on {@code incrementToken()}, {@code end()}, then {@code close()}.
 *
 * @param analyzer analyzer used to build the token stream
 * @param text     text to tokenize
 * @throws IOException if the token stream fails while being consumed
 */
private static void printTokens(Analyzer analyzer, String text) throws IOException
{
    try (TokenStream stream = analyzer.tokenStream("field", text)) {
        // Attribute instances are reused across tokens, so fetch them once up front.
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);                       // token text
        OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);                         // character offsets
        PositionIncrementAttribute increment = stream.addAttribute(PositionIncrementAttribute.class); // position increment

        stream.reset();
        while (stream.incrementToken()) {
            System.out.println(term + "  "
                    + offset.startOffset() + "  " + offset.endOffset() + "  "
                    + increment.getPositionIncrement());
        }
        // Signals end-of-stream so the stream can finalize its state before close().
        stream.end();
    }
}

以前 0 2 1
发布 2 4 1
过 4 5 1
HanLP 5 10 1
的 10 11 1
Lucene 11 17 1
插件 17 19 1
， 19 20 1
后来 20 22 1
很多 22 24 1
人 24 25 1
跟 25 26 1
我 26 27 1
说 27 28 1
山东 28 30 1
人 30 31 1
比 31 32 1
武汉 32 34 1
人 34 35 1
听说 35 37 1
过 37 38 1
吃 38 39 1
一线 39 41 1
， 41 42 1
长一智 42 45 1
更好 45 47 1
其实 47 49 1
Solr 49 53 1
更 53 54 1
流行 54 56 1
（ 56 57 1

文章来源于公众号火炎一笑倾城的博客

上一篇： Lucene查询结果高亮

下一篇： lucene 的分词StandardAnalyzer

lucene使用hanlp分词

Net Core使用Lucene.Net和盘古分词器实现全文检索

Lucene的基本使用

使用Discuz关键词服务器实现PHP中文分词

Lucene-分词器简介及IK分词器的使用

python中文分词,使用结巴分词对python进行分词(实例讲解)

使用libmmseg实现Ruby的中文分词功能

ek插件------ik中文分词器的使用

白话Elasticsearch28-IK中文分词器的安装和使用

Python英文文本分词(无空格)模块wordninja的使用实例

Lucene 3.0学习笔记－使用索引查询

lucene使用hanlp分词

Net Core使用Lucene.Net和盘古分词器 实现全文检索

Lucene的基本使用

使用Discuz关键词服务器实现PHP中文分词

Lucene-分词器简介及IK分词器的使用

python中文分词,使用结巴分词对python进行分词(实例讲解)

使用libmmseg实现Ruby的中文分词功能

ek插件------ik中文分词器的使用

白话Elasticsearch28-IK中文分词器的安装和使用

Python英文文本分词(无空格)模块wordninja的使用实例

Lucene 3.0学习笔记－使用索引查询

Net Core使用Lucene.Net和盘古分词器实现全文检索