Lucene tokenization with StandardAnalyzer
package analyzer;

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

public class StandardAnalyzerTest {

    /**
     * Constructor
     */
    public StandardAnalyzerTest() {
    }

    public static void main(String[] args) {
        // create a StandardAnalyzer instance
        Analyzer analyzer = new StandardAnalyzer();
        StringReader sr = new StringReader("People are always talking about 'the problem of youth'.");
        // obtain a TokenStream; the field name is just a label here
        TokenStream ts = analyzer.tokenStream("content", sr);
        int i = 0;
        Token t;
        try {
            // call next() repeatedly to fetch each token the analyzer produces
            t = ts.next();
            while (t != null) {
                i++;
                System.out.println("Line " + i + ":" + t.termText());
                t = ts.next();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
From the output we can see that StandardAnalyzer:
1: splits the text on whitespace
2: converts uppercase letters to lowercase
3: filters out stop words such as a, an, and the
4: removes all punctuation
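The listing above uses the old Token / next() / termText() API, which was removed in later Lucene releases. As a rough sketch of the same loop on a recent Lucene version (assuming the no-argument StandardAnalyzer constructor is available), the tokens are read through the attribute-based API instead; the class name StandardAnalyzerAttributeTest and the field name "content" are just placeholders:

package analyzer;

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class StandardAnalyzerAttributeTest {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // tokenStream() takes a field name and the text to analyze
        try (TokenStream ts = analyzer.tokenStream("content",
                new StringReader("People are always talking about 'the problem of youth'."))) {
            // the term text of the current token is exposed through a CharTermAttribute
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset(); // must be called before the first incrementToken()
            int i = 0;
            while (ts.incrementToken()) {
                i++;
                System.out.println("Line " + i + ":" + term.toString());
            }
            ts.end();
        }
        analyzer.close();
    }
}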
The following shows how it behaves on Chinese text:
package analyzer;

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

public class StandardAnalyzerTestForCH {

    public StandardAnalyzerTestForCH() {
    }

    /**
     * @param args
     */
    public static void main(String[] args) {
        Analyzer a = new StandardAnalyzer();
        StringReader sr = new StringReader("龙门石窟位于山西省大同市西 郊,是 '我国古代艺术的宝贝!");
        TokenStream ts = a.tokenStream("content", sr);
        int i = 0;
        try {
            Token t = ts.next();
            while (t != null) {
                i++;
                System.out.println("Line " + i + ":" + t.termText());
                t = ts.next();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
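StandardAnalyzer has no Chinese word segmentation: its tokenizer emits each CJK character as a separate token and drops the punctuation, so the sentence above comes out roughly one character per line, along the lines of the following illustration (the exact output can differ between Lucene versions):

Line 1:龙
Line 2:门
Line 3:石
Line 4:窟
...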