/*
 * lucene create index and analyzer query
 * Source: 程序员文章站, 2022-07-09 09:34:12
 */
package com.lucene;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.sql.Connection;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.sql.DataSource;
import org.apache.ibatis.io.Resources;
import org.apache.ibatis.mapping.Environment;
import org.apache.ibatis.session.SqlSessionFactory;
import org.apache.ibatis.session.SqlSessionFactoryBuilder;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
import org.wltea.analyzer.lucene.IKQueryParser;
import org.wltea.analyzer.lucene.IKSimilarity;
import com.dao.core.SqlSessionSingle;
import com.dao.reptile.WriteAlreadyUrlDao;
import com.reptile.util.GlobalContains;
/**
 * Builds a Lucene (3.x) full-text index over crawled pages recorded in the
 * database (accessed through MyBatis) and searches it with the IK Analyzer,
 * highlighting hits in the title field.
 */
public class Lucene {

    public static void main(String[] args) throws SQLException, InvalidTokenOffsetsException {
        Lucene lucene = new Lucene();
        loadMybatis();
        lucene.createIndex(GlobalContains.index_path);
        // lucene.indexSearch(GlobalContains.index_path,"title","111");//"content"
    }

    /**
     * Indexes every page flagged as not yet indexed (is_index = "0") by
     * WriteAlreadyUrlDao. For each row, "title" and "url" are stored and
     * analyzed; the page body is streamed from the file at "path".
     *
     * @param indexFile file-system directory where the index is written
     */
    private void createIndex(String indexFile) {
        Analyzer analyzer = new IKAnalyzer();
        Directory d = null;
        IndexWriter indexWriter = null;
        try {
            WriteAlreadyUrlDao alreadyDao = new WriteAlreadyUrlDao();
            Map paramMap = new HashMap();
            paramMap.put("is_index", "0");
            List list = alreadyDao.queryList(paramMap);
            if (list != null && list.size() > 0) {
                d = FSDirectory.open(new File(indexFile));
                IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_33, analyzer);
                indexWriter = new IndexWriter(d, conf);
                for (int i = 0; i < list.size(); i++) {
                    Map map = (Map) list.get(i);
                    String path = map.get("path").toString();
                    File ff = new File(path);
                    // new File(...) never returns null, so only existence matters.
                    if (!ff.exists()) {
                        System.out.println("文件:" + path + "不存在。");
                        continue;
                    }
                    Document doc = new Document();
                    doc.add(new Field("title", map.get("title").toString(), Store.YES,
                            Index.ANALYZED));
                    doc.add(new Field("url", map.get("url").toString(), Store.YES,
                            Index.ANALYZED));
                    // The Reader is consumed (and closed) by the indexing pipeline.
                    doc.add(new Field("content", new FileReader(ff)));
                    indexWriter.addDocument(doc);
                    System.out.println(map.get("url").toString() + "\tcount:" + "\t当前:" + (i + 1) + ",总共:" + list.size());
                }
            } else {
                System.out.println("没有任何数据需要被索引。");
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Close the writer and directory even if indexing failed part-way;
            // the original only closed them on the success path.
            if (indexWriter != null) {
                try {
                    indexWriter.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (d != null) {
                try {
                    d.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Searches the index for {@code keywork} in field {@code key}, printing
     * each hit and returning a list of LuceneResultBean whose title carries
     * HTML highlighting ({@code <font color='red'>...</font>}).
     *
     * @param indexFile index directory to open
     * @param key       field name to search and highlight
     * @param keywork   query text parsed by IKQueryParser
     * @return list of results, or null when nothing matched or an I/O error occurred
     * @throws InvalidTokenOffsetsException if highlighting fails on a fragment
     */
    public List indexSearch(String indexFile, String key, String keywork) throws InvalidTokenOffsetsException {
        Analyzer analyzer = new IKAnalyzer();
        Directory d = null;
        List resultList = null;
        IndexSearcher isearcher = null;
        try {
            d = FSDirectory.open(new File(indexFile));
            isearcher = new IndexSearcher(d);
            // Use the IK similarity scorer for ranking.
            isearcher.setSimilarity(new IKSimilarity());
            Query query = IKQueryParser.parse(key, keywork);
            // Fetch up to 1000 best-scoring documents.
            TopDocs topDocs = isearcher.search(query, 1000);
            System.out.println("命中:" + topDocs.totalHits);
            ScoreDoc[] result = topDocs.scoreDocs;
            if (result.length > 0) {
                resultList = new ArrayList();
                for (int i = 0; i < result.length; i++) {
                    Document document = isearcher.doc(result[i].doc);
                    System.out.println("找到:" + document.get("url") + "\t" +
                            document.get("title"));
                    String text = document.get("title");
                    System.out.println("key:" + text);
                    if (text != null) {
                        // Wrap matched terms in the stored title with red <font> tags.
                        SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");
                        Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));
                        // One fragment spanning the whole title.
                        highlighter.setTextFragmenter(new SimpleFragmenter(text.length()));
                        TokenStream tokenStream = analyzer.tokenStream(key, new StringReader(text));
                        String highlighterText = highlighter.getBestFragment(tokenStream, text);
                        System.out.println("【高亮显示第】" + (i + 1) + "条,检索结果如下:" + highlighterText);
                        LuceneResultBean luceneResultBean = new LuceneResultBean();
                        luceneResultBean.setUrl(document.get("url"));
                        luceneResultBean.setTitle(highlighterText);
                        resultList.add(luceneResultBean);
                    }
                }
            }
            return resultList;
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (isearcher != null) {
                try {
                    isearcher.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            // Closing the searcher does not close the Directory; the original leaked it.
            if (d != null) {
                try {
                    d.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return null;
    }

    /**
     * Loads the MyBatis configuration manually (no Spring in this project),
     * verifies a database connection can be obtained, and publishes the
     * factory through SqlSessionSingle.
     *
     * @throws SQLException if the data source cannot hand out a connection
     */
    private static void loadMybatis() throws SQLException {
        Reader reader = null;
        try {
            reader = Resources.getResourceAsReader("myBatis3.xml");
            // Build inside the try: the original dereferenced a null reader
            // (NPE) when the config file failed to load.
            SqlSessionFactory sqlSession = new SqlSessionFactoryBuilder()
                    .build(reader);
            Environment en = sqlSession.getConfiguration().getEnvironment();
            DataSource ds = en.getDataSource();
            Connection conn = ds.getConnection();
            try {
                System.out.println("连接:" + conn.toString());
            } finally {
                // Return the probe connection to the pool / close it.
                conn.close();
            }
            SqlSessionSingle.sqlSession = sqlSession;
            System.out.println(sqlSession);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
// 上一篇: Lucene 分词 (scraper navigation text — previous article: "Lucene tokenization")