欢迎您访问程序员文章站!本站旨在为大家提供、分享程序员计算机编程知识!
您现在的位置是: 首页

lucene Analyzer 分词 一

程序员文章站 2022-07-09 09:34:54
...

 

 

 

package com.hb;

import java.io.Reader;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LetterTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;

/**
 * Analyzer that tokenizes with {@link LetterTokenizer}, lower-cases every token and
 * then removes stop words.
 *
 * <p>The stop-word set is either Lucene's built-in English list
 * ({@link StopAnalyzer#ENGLISH_STOP_WORDS_SET}) or that list extended with
 * caller-supplied words.
 *
 * <p>The class is {@code final} because Lucene 3.x's {@code Analyzer} constructor
 * asserts that implementations (or their {@code tokenStream} /
 * {@code reusableTokenStream} methods) are final; a non-final subclass fails with an
 * {@code AssertionError} whenever assertions are enabled.
 */
public final class MyStopAnalyzer extends Analyzer {
	/** Stop words handed to the {@link StopFilter}; assigned once in the constructor. */
	private final Set<?> stops;

	/**
	 * Creates an analyzer whose stop words are the built-in English list plus the
	 * given extra words.
	 *
	 * @param strs additional stop words, matched case-insensitively; {@code null} or
	 *             an empty array means "no extra words"
	 */
	public MyStopAnalyzer(String[] strs) {
		if (strs == null || strs.length == 0) {
			// Nothing to add: share the built-in set instead of copying it.
			stops = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
		} else {
			// makeStopSet converts the array into a fresh set (third argument: ignore case).
			Set<Object> merged = StopFilter.makeStopSet(Version.LUCENE_35, strs, true);
			// Add the built-in English stop words to the custom ones.
			merged.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
			stops = merged;
		}
	}

	/** Creates an analyzer that uses only Lucene's built-in English stop words. */
	public MyStopAnalyzer() {
		stops = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
	}

	/**
	 * Builds the analysis chain for one field:
	 * {@code LetterTokenizer -> LowerCaseFilter -> StopFilter}.
	 *
	 * @param fieldname name of the field being analyzed (not used; every field gets
	 *                  the same chain)
	 * @param reader    source of the text to tokenize
	 * @return a token stream of lower-cased tokens with stop words removed
	 */
	@Override
	public TokenStream tokenStream(String fieldname, Reader reader) {
		TokenStream letters = new LetterTokenizer(Version.LUCENE_35, reader);
		// Lower-case before stop-word filtering so that mixed-case input matches the stop set.
		TokenStream lowered = new LowerCaseFilter(Version.LUCENE_35, letters);
		return new StopFilter(Version.LUCENE_35, lowered, stops);
	}

}