lucene入门-使用JE中文分词
JE 分词器基于词库进行算法分词,是效果较好的中文分词器之一。
package busetoken;
import java.io.IOException;
import jeasy.analysis.MMAnalyzer;
public class UseJe {

    /**
     * Demonstrates Chinese word segmentation with the JE {@code MMAnalyzer}.
     * Segments a sample sentence and prints the tokens separated by "|".
     *
     * @param args command-line arguments (unused)
     * @throws IOException if the segmenter fails while processing the text
     */
    public static void main(String[] args) throws IOException {
        String sample = "编码规范从根本上解决了程序维护员的难题;规范的编码阅读和理解起来更容易,也可以快速的不费力气的借鉴别人的编码。对将来维护你编码的人来说,你的编码越优化,他们就越喜欢你的编码,理解起来也就越快。";
        MMAnalyzer analyzer = new MMAnalyzer();
        System.out.print(analyzer.segment(sample, "|"));
    }
}
效果如下
编码|规范|从根本上|解决|程序|维护|员|难题|规范|编码|阅读|理解|起来|更|容易|也可以|快速|不费力气|借鉴|别人|编码|将来|维护|你|编码|的人|来说|你的|编码|越|优化|他们|就越|喜欢|你的|编码|理解|起来|也就|越快|
建立索引
package bindex;
import java.io.File;
import tool.FileText;
import tool.FileList;
import java.io.*;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.store.LockObtainFailedException;
public class FileIndexer {

    /**
     * Builds a Lucene index (written to the "indexes" directory) over every
     * file found in the "htmls" directory, using the JE Chinese analyzer.
     * For each file three fields are stored: name and content (tokenized),
     * and path (stored only, not searchable).
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        String indexPath = "indexes";
        IndexWriter indexWriter = null;
        try {
            indexWriter = new IndexWriter(indexPath, new MMAnalyzer());
            String[] files = FileList.getFiles("htmls");
            for (int i = 0; i < files.length; i++) {
                File f = new File(files[i]);
                String name = f.getName();
                String path = f.getPath();
                Document doc = new Document();
                doc.add(new Field("name", name, Field.Store.YES, Field.Index.TOKENIZED));
                doc.add(new Field("content", FileText.getText(f), Field.Store.YES, Field.Index.TOKENIZED));
                // Path is kept for display only, so it is stored but not indexed.
                doc.add(new Field("path", path, Field.Store.YES, Field.Index.NO));
                indexWriter.addDocument(doc);
                System.out.println("File:" + path + name + " indexed!");
            }
            System.out.println("OK!");
        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (LockObtainFailedException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close in finally: the original leaked the writer (and kept the
            // index write lock) whenever an exception occurred mid-loop.
            if (indexWriter != null) {
                try {
                    indexWriter.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
搜索
package bindex;
import java.io.IOException;
import java.lang.StringBuffer;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.document.*;
public class BindexSearcher {

    /**
     * Runs a single-term search ("安全") against the "content" field of the
     * index in the "indexes" directory and prints, for every hit, the stored
     * file name and a snippet of at most 100 characters of its content.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        String indexPath = "indexes";
        String searchField = "content";
        String searchPhrase = "安全";
        // Maximum snippet length shown per hit.
        int snippetLength = 100;
        StringBuffer sb = new StringBuffer("");
        IndexSearcher searcher = null;
        try {
            searcher = new IndexSearcher(indexPath);
            Term t = new Term(searchField, searchPhrase);
            Query q = new TermQuery(t);
            Hits hs = searcher.search(q);
            int num = hs.length();
            for (int i = 0; i < num; i++) {
                Document doc = hs.doc(i);
                Field fname = doc.getField("name");
                Field fcontent = doc.getField("content");
                sb.append("name:\n");
                sb.append(fname.stringValue() + "\n");
                sb.append("content:\n");
                // Clamp the snippet: the original substring(0, 100) threw
                // StringIndexOutOfBoundsException for documents shorter
                // than 100 characters.
                String content = fcontent.stringValue();
                sb.append(content.substring(0, Math.min(content.length(), snippetLength)) + "\n");
                sb.append("------------" + "\n");
            }
            System.out.println(sb);
        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close in finally so the searcher is released even when the
            // search itself fails.
            if (searcher != null) {
                try {
                    searcher.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}