Lucene Full-Text Search in Practice
I. Back-end interface methods:
1. Creating the index
/**
* Create a knowledge index entry for one uploaded file
*
* @param flag       true to create a brand-new index, false to append to an existing one
* @param path       full path of the file to parse and index
* @param indexPath  root directory under which "/knowledgeIndex" is created
* @param title      title of the knowledge entry
* @param createtime creation time of the knowledge entry
* @param infoType   information type of the knowledge entry
* @param knowlegeid id of the knowledge entry
*/
public void createKnowledgeIndex(boolean flag, String path, String indexPath, String title, Date createtime, String infoType, String knowlegeid) {
// Locate the index directory for this knowledge category and create it if it does not exist
File indexDir = new File(indexPath + "/knowledgeIndex");
if (!indexDir.exists()) {
if (!indexDir.mkdirs()) throw new RuntimeException("Failed to create the index directory");
}
// Build the analyzer (IK Chinese analyzer)
Analyzer luceneAnalyzer = new IK_CAnalyzer();
// Check whether the index directory already contains index files ("segments.gen");
// if none are found, the IndexWriter below is opened in create mode (flag = true)
File[] file = indexDir.listFiles(new FilenameFilter() {
public boolean accept(File dir, String name) {
return name.equals("segments.gen");
}
});
if (file == null || file.length == 0) {
flag = true;
}
IndexWriter indexWriter = null;
long startTime = new Date().getTime();
try {
indexWriter = new IndexWriter(indexDir, luceneAnalyzer, flag);
File dataFiles = new File(path);
String txtReader = "";
if (!"".equals(path) && path != null) {
txtReader = postFix(dataFiles, fileParseDomain);
Document document = new Document();
document.add(new Field("path", dataFiles.getCanonicalPath(),
Field.Store.YES, Field.Index.NOT_ANALYZED));
document.add(new Field("title", title, Field.Store.YES,
Field.Index.ANALYZED,
Field.TermVector.WITH_OFFSETS));
document.add(new Field("createtime", StringUtil.getDateStringYMD(createtime),
Field.Store.YES, Field.Index.ANALYZED));
document.add(new Field("infoType", infoType,
Field.Store.YES, Field.Index.ANALYZED));
document.add(new Field("knowlegeid", knowlegeid,
Field.Store.YES, Field.Index.ANALYZED));
document.add(new Field("contents", txtReader,
Field.Store.COMPRESS, Field.Index.ANALYZED,
Field.TermVector.WITH_OFFSETS));
document.add(new Field("all", title + txtReader,
Field.Store.COMPRESS, Field.Index.ANALYZED,
Field.TermVector.WITH_OFFSETS));
indexWriter.addDocument(document);
}
indexWriter.optimize();
} catch (IOException e) {
log.error("Failed to create the index: " + e.getMessage(), e);
throw new RuntimeException("Failed to create the index: " + e.getMessage());
} finally {
// guard against the IndexWriter constructor having failed
if (indexWriter != null) {
try {
indexWriter.close();
} catch (CorruptIndexException e) {
log.error(e.getMessage());
throw new RuntimeException("Failed to close the index writer: " + e.getMessage());
} catch (IOException e) {
log.error(e.getMessage());
throw new RuntimeException("Failed to close the index writer: " + e.getMessage());
}
}
}
long endTime = new Date().getTime();
log.info("Total indexing time (ms): " + (endTime - startTime));
}
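For reference, a call to this method might look like the sketch below. The service field name (knowledgeIndexDomain) and the concrete paths and ids are made up for illustration; with flag set to false the method appends to an existing index, and it switches to create mode on its own when no segments.gen file is found.
// hypothetical caller: index one uploaded Word document for a knowledge entry
knowledgeIndexDomain.createKnowledgeIndex(
        false,                              // append to the existing index if one is present
        "/data/upload/k-2012-001.doc",      // file to parse and index
        "/data/lucene",                     // index root; "/knowledgeIndex" is appended internally
        "Lucene guide", new Date(), "doc", "k-2012-001");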
2. Deleting an index entry by id and path
public void delKnowledgeIndexByinfoid(String path,String knowlegeid, String indexPath) {
Directory directory = null;
try {
directory = FSDirectory.getDirectory(indexPath + "/knowledgeIndex");
// documents matching either term (file path or knowledge id) will be removed
Term[] termArr = new Term[2];
termArr[0] = new Term("path", path);
termArr[1] = new Term("knowlegeid", knowlegeid);
Analyzer luceneAnalyzer = new IK_CAnalyzer();
IndexWriter indexWriter = new IndexWriter(directory,
luceneAnalyzer, false);
indexWriter.deleteDocuments(termArr);
indexWriter.optimize();
indexWriter.close();
} catch (IOException e) {
log.error("Failed to delete from the index: " + e.getMessage(), e);
throw new RuntimeException("Failed to delete from the index: " + e.getMessage());
}
}
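There is no separate update method: an edited knowledge entry can be re-indexed by deleting the old document first and then calling the create method again. A minimal sketch, using the same hypothetical service field and paths as above:
// hypothetical update: remove the stale document, then index the new version of the file
knowledgeIndexDomain.delKnowledgeIndexByinfoid("/data/upload/k-2012-001.doc", "k-2012-001", "/data/lucene");
knowledgeIndexDomain.createKnowledgeIndex(false, "/data/upload/k-2012-001_v2.doc",
        "/data/lucene", "Lucene guide (revised)", new Date(), "doc", "k-2012-001");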
3. Searching the index
/**
* Search knowledge entries by keyword
*
* @param type      name of the indexed field to search (e.g. "title", "contents" or "all")
* @param keyWord   search keyword entered by the user
* @param indexPath root directory that contains "/knowledgeIndex"
* @param sp        paging helper describing the requested page
* @return a list of Maps holding the stored fields, with the matched field highlighted
*/
public List searchKnowlegeByKey(String type, String keyWord, String indexPath, SplitPage sp) {
// escape Lucene query-syntax characters in the keyword
keyWord = specialStrConvert(keyWord);
// index directory
File indexDir = new File(indexPath + "/knowledgeIndex");
// make sure the index directory exists before trying to open it
if (!indexDir.exists()) {
log.error("Index directory does not exist");
throw new RuntimeException("Index directory does not exist");
}
FSDirectory directory = null;
IndexSearcher searcher = null;
Hits hits = null;
List list = null;
try {
directory = FSDirectory.getDirectory(indexDir, false);
IndexReader reader = IndexReader.open(directory);
searcher = new IndexSearcher(directory);
// build the query against the requested field using the IK analyzer
Analyzer luceneAnalyzer = new IK_CAnalyzer();
QueryParser parser = new QueryParser(type, luceneAnalyzer);
// allow the leading wildcard used by the "*keyword*" query below
parser.setAllowLeadingWildcard(true);
Query query = parser.parse("+(" + type + ":*" + keyWord + "*)");
// run the search, sorting the hits by creation time
Sort sort = new Sort(new SortField[]{new SortField("createtime", false)});
hits = searcher.search(query, sort);
// collect the results for the requested page
list = new ArrayList();
SimpleHTMLFormatter sHtmlF = new SimpleHTMLFormatter(
"<b><span style='background-color:yellow;'>", "</span></b>");
Highlighter highlighter = new Highlighter(sHtmlF, new QueryScorer(
query));
if (hits != null && hits.length() > 0) {
int len = hits.length();
sp.setRecordCount(len);
sp.init();
int endRecord = sp.getStartRecord() + sp.getPageSize();
int con = endRecord > sp.getRecordCount() ? sp.getRecordCount()
: endRecord;
for (int i = sp.getStartRecord(); i < con; i++) {
Document docTemp = hits.doc(i);
String value = docTemp.get(type);
// copy the stored fields; the matched field is replaced below with a highlighted fragment (bold, yellow background)
Map m = new HashMap();
m.put("path", docTemp.get("path"));
m.put("title", docTemp.get("title"));
m.put("createtime", docTemp.get("createtime"));
m.put("infoType", docTemp.get("infoType"));
m.put("knowlegeid", docTemp.get("knowlegeid"));
if (value != null && !type.equals("title")) {
// Lucene uses the stored term vector to speed up highlighting
TermPositionVector termFreqVector = (TermPositionVector) reader
.getTermFreqVector(hits.id(i), type);
TokenStream tokenStream = TokenSources
.getTokenStream(termFreqVector);
String str = highlighter.getBestFragment(tokenStream,
value);
m.put(type, str);
}
list.add(m);
}
}
searcher.close();
reader.close();
} catch (IOException e) {
log.error(e.getMessage(), e);
throw new RuntimeException(e.getMessage());
} catch (ParseException e) {
log.error("Failed to parse the Lucene query: " + e.getMessage(), e);
throw new RuntimeException("Failed to parse the Lucene query: " + e.getMessage());
}
return list;
}
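A sketch of how this method might be called from an action or controller. SplitPage is the project's own paging helper, so the setter names used here (setPageSize, setCurrentPage) are assumptions, as is the service field name:
// hypothetical paged search over the "all" field (title + contents)
SplitPage sp = new SplitPage();
sp.setPageSize(10);        // assumed setter: 10 hits per page
sp.setCurrentPage(1);      // assumed setter: first page
List results = knowledgeIndexDomain.searchKnowlegeByKey("all", "Lucene", "/data/lucene", sp);
for (Object o : results) {
    Map hit = (Map) o;
    System.out.println(hit.get("title") + " -> " + hit.get("path"));
}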
4. Checking whether an index exists
public String isExistsKnowlegeIndex(String path) {
String mes;
// locate the index directory, creating it if necessary
File indexDir = new File(path + "/knowledgeIndex");
if (!indexDir.exists()) {
if (!indexDir.mkdirs()) throw new RuntimeException("Failed to create the index directory");
}
// check whether the directory already contains index files ("segments.gen")
File[] file = indexDir.listFiles(new FilenameFilter() {
public boolean accept(File dir, String name) {
return name.equals("segments.gen");
}
});
if (file == null || file.length == 0) {
mes = "";
} else {
mes = "ok";
}
return mes;
}
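One possible use of this check is deciding up front whether full-text search should be offered at all; a hypothetical caller, reusing the service field name assumed earlier:
// hypothetical guard before running a search
if ("ok".equals(knowledgeIndexDomain.isExistsKnowlegeIndex("/data/lucene"))) {
    // the index already has segments, searching is possible
} else {
    // no index yet, prompt the user to build it first
}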
5. Escaping special characters
/**
* Escape Lucene query-syntax characters in the search keyword
*
* @param str raw keyword entered by the user
* @return the escaped keyword, or "" when the input is null or empty
*/
private static String specialStrConvert(String str) {
// + - && || ! ( ) { } [ ] ^ " ~ * ? : \
if ("".equals(str) || str == null)
return "";
else
return str.replaceAll("\\\\", "\\\\\\\\")
.replaceAll("\\+", "\\\\+").replaceAll("\\-", "\\\\-")
.replaceAll("\\&&", "\\\\&&").replaceAll("\\!", "\\\\!")
.replaceAll("\\(", "\\\\(").replaceAll("\\)", "\\\\)")
.replaceAll("\\{", "\\\\{").replaceAll("\\}", "\\\\}")
.replaceAll("\\[", "\\\\[").replaceAll("\\]", "\\\\]")
.replaceAll("\\^", "\\\\^").replaceAll("\"", "\\\\\"")
.replaceAll("\\~", "\\\\~").replaceAll("\\*", "\\\\*")
.replaceAll("\\?", "\\\\?").replaceAll("\\|\\|", "\\\\||")
.replaceAll("\\:", "\\\\:");
}
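Depending on the Lucene version, QueryParser also ships a static escape(String) helper that covers this character set, so the hand-written chain above can often be replaced with it. A minimal sketch; the package name shown matches the 2.x line this code appears to target, and whether it escapes exactly the same characters should be checked against the version in use:
// equivalent escaping using Lucene's built-in helper (assumed to be available in this Lucene version)
private static String specialStrConvert(String str) {
    if (str == null || "".equals(str)) {
        return "";
    }
    return org.apache.lucene.queryParser.QueryParser.escape(str);
}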
6. Reading different file types (the parse methods are implemented in a utility class covered in the Java basics posts)
private static String postFix(File file, FileParseDomain fileParseDomain) {
// pick the parser that matches the file extension and return the extracted plain text
String txtReader = "";
try {
if (file.getPath().endsWith(".doc")) {
txtReader = fileParseDomain.readWord(file.getCanonicalPath());
} else if (file.getPath().endsWith(".pdf")) {
txtReader = fileParseDomain.readPDF(file.getCanonicalPath());
} else if (file.getPath().endsWith(".xls")) {
txtReader = fileParseDomain.readExcel(file.getCanonicalPath());
} else if (file.getPath().endsWith(".txt")) {
txtReader = fileParseDomain.readTxt(file.getCanonicalPath());
} else if (file.getPath().endsWith(".html")
|| file.getPath().endsWith(".htm")) {
txtReader = fileParseDomain.readHtmlText(file.getCanonicalPath());
}
} catch (IOException e) {
log.error("Failed to read the file: " + e.getMessage(), e);
throw new RuntimeException("Failed to read the file: " + e.getMessage());
}
return txtReader;
}
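FileParseDomain itself is not shown here. As one example of what such a parser can look like, a minimal readTxt built only on the JDK might read the whole file into a string; the charset and the method signature below are assumptions for illustration, not the original implementation:
// hypothetical sketch of FileParseDomain.readTxt
// requires java.io.BufferedReader, FileInputStream, InputStreamReader, IOException
public String readTxt(String filePath) throws IOException {
    StringBuilder sb = new StringBuilder();
    BufferedReader reader = new BufferedReader(
            new InputStreamReader(new FileInputStream(filePath), "UTF-8"));
    try {
        String line;
        while ((line = reader.readLine()) != null) {
            sb.append(line).append('\n');   // keep line breaks so the highlighter gets readable text
        }
    } finally {
        reader.close();
    }
    return sb.toString();
}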