A general-purpose Lucene.Net search utility class written in C#: SearchEngineUtil
I have been busy lately with various projects at work (mostly Spring Cloud based microservice projects), so it has been a while since I last shared any notes. I've been steadily digging into the internals of Spring, Spring Boot, and Spring Cloud, while also keeping an eye on the progress of .NET Core and its latest features. I subscribe to related columns on GeekTime and read them whenever I'm free after work, and I've bought a few paper books as well. Over the past year I've kept absorbing and borrowing from others' distilled experience through technical WeChat accounts (.NET, Java, algorithms, front-end, and so on), GeekTime, and books. As the saying goes, learning is like rowing upstream: not to advance is to fall back. Learn while working, apply what you learn at work, and write it up afterwards; sharing an article is both a summary and the best way to "review the old and learn the new".
Enough preamble; on to the topic of this post: a general-purpose search engine utility class built on Lucene.Net, SearchEngineUtil. For background on what Lucene is, see Baidu Baike; the key point is that Lucene is a full-text search engine framework that provides a complete query engine and indexing engine, and Lucene.Net is its implementation in C# for the .NET runtime (see the official site for details). I won't repeat the basics, which are well covered on the official site and elsewhere. Because the raw Lucene.Net SDK's API is fairly involved and inconvenient to use directly, I wrapped the common operations (add, delete, update, and query, including paged query) while preserving flexibility, which makes working with Lucene.Net a bit simpler. The code itself is not complicated. The complete SearchEngineUtil source is as follows:
using Lucene.Net.Analysis.PanGu;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.QueryParsers;
using Lucene.Net.Search;
using Lucene.Net.Store;
using NLog;
using PanGu;
using PanGu.HighLight;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Reflection;
using System.Text;
namespace CN.Zuowenjun.Blog.Common
{
    /// <summary>
    /// Lucene search engine utility class
    /// Author: zuowenjun
    /// </summary>
    public class SearchEngineUtil
    {
        /// <summary>
        /// Creates (if necessary) and adds an index record
        /// </summary>
        /// <typeparam name="TIndex">type of the source data</typeparam>
        /// <param name="indexDir">index directory path</param>
        /// <param name="indexData">data to index</param>
        /// <param name="setDocFieldsAction">callback that maps the data onto document fields</param>
        public static void AddIndex<TIndex>(string indexDir, TIndex indexData, Action<Document, TIndex> setDocFieldsAction)
        {
            // Create the index directory if it does not exist yet
            if (!System.IO.Directory.Exists(indexDir))
            {
                System.IO.Directory.CreateDirectory(indexDir);
            }
            FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexDir), new NativeFSLockFactory());
            bool isUpdate = IndexReader.IndexExists(directory);
            if (isUpdate)
            {
                // If the index directory is locked (e.g. the indexing process exited abnormally), unlock it first
                if (IndexWriter.IsLocked(directory))
                {
                    IndexWriter.Unlock(directory);
                }
            }
            using (IndexWriter writer = new IndexWriter(directory, new PanGuAnalyzer(), !isUpdate, IndexWriter.MaxFieldLength.UNLIMITED))
            {
                Document document = new Document();
                setDocFieldsAction(document, indexData);
                writer.AddDocument(document);
                writer.Optimize(); // optimize the index
            }
        }
        /// <summary>
        /// Deletes an index record
        /// </summary>
        /// <param name="indexDir">index directory path</param>
        /// <param name="keyFieldName">name of the key field</param>
        /// <param name="keyFieldValue">value of the key field</param>
        public static void DeleteIndex(string indexDir, string keyFieldName, object keyFieldValue)
        {
            FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexDir), new NativeFSLockFactory());
            if (!IndexReader.IndexExists(directory))
            {
                return;
            }
            using (IndexWriter iw = new IndexWriter(directory, new PanGuAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED))
            {
                iw.DeleteDocuments(new Term(keyFieldName, keyFieldValue.ToString()));
                // Deleted documents are not removed from disk right away; a .del file is generated,
                // and Optimize() must be called to purge them. Before purging, UndeleteAll() can restore them.
                iw.Optimize();
            }
        }
        /// <summary>
        /// Updates an index record
        /// </summary>
        /// <param name="indexDir">index directory path</param>
        /// <param name="keyFieldName">name of the key field</param>
        /// <param name="keyFieldValue">value of the key field</param>
        /// <param name="doc">replacement document</param>
        public static void UpdateIndex(string indexDir, string keyFieldName, object keyFieldValue, Document doc)
        {
            FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexDir), new NativeFSLockFactory());
            if (!IndexReader.IndexExists(directory))
            {
                return;
            }
            using (IndexWriter iw = new IndexWriter(directory, new PanGuAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED))
            {
                iw.UpdateDocument(new Term(keyFieldName, keyFieldValue.ToString()), doc);
                iw.Optimize();
            }
        }
        /// <summary>
        /// Checks whether the specified document exists in the index
        /// </summary>
        /// <param name="indexDir">index directory path</param>
        /// <param name="keyFieldName">name of the key field</param>
        /// <param name="keyFieldValue">value of the key field</param>
        /// <returns>true if a matching document exists</returns>
        public static bool ExistsDocument(string indexDir, string keyFieldName, object keyFieldValue)
        {
            FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexDir), new NativeFSLockFactory());
            if (!IndexReader.IndexExists(directory))
            {
                return false;
            }
            using (var reader = IndexReader.Open(directory, true)) // read-only reader, disposed after use
            {
                return reader.DocFreq(new Term(keyFieldName, keyFieldValue.ToString())) > 0;
            }
        }
        /// <summary>
        /// Searches the index and returns the matching records
        /// </summary>
        /// <typeparam name="TResult">type of the result items</typeparam>
        /// <param name="indexDir">index directory path</param>
        /// <param name="buildQueryAction">callback that fills the BooleanQuery and returns the field/keyword pairs used for highlighting</param>
        /// <param name="getSortFieldsFunc">callback that returns the sort fields (may return null)</param>
        /// <param name="buildResultFunc">callback that maps a Document to a result item</param>
        /// <param name="needHighlight">whether to highlight the keywords in the results</param>
        /// <param name="topCount">maximum number of records to return; 0 means all</param>
        /// <returns>list of matching records</returns>
        public static List<TResult> SearchIndex<TResult>(string indexDir, Func<BooleanQuery, IDictionary<string, string>> buildQueryAction,
            Func<IEnumerable<SortField>> getSortFieldsFunc, Func<Document, TResult> buildResultFunc, bool needHighlight = true, int topCount = 0)
        {
            FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexDir), new NoLockFactory());
            if (!IndexReader.IndexExists(directory))
            {
                return new List<TResult>();
            }
            IndexReader reader = IndexReader.Open(directory, true);
            IndexSearcher searcher = new IndexSearcher(reader);
            BooleanQuery bQuery = new BooleanQuery();
            var keywords = buildQueryAction(bQuery);
            Sort sort = null;
            var sortFields = getSortFieldsFunc();
            if (sortFields != null)
            {
                sort = new Sort();
                sort.SetSort(sortFields.ToArray());
            }
            topCount = topCount > 0 ? topCount : int.MaxValue; // when no top value is given, use int.MaxValue to fetch everything
            TopDocs resultDocs = null;
            if (sort != null)
            {
                resultDocs = searcher.Search(bQuery, null, topCount, sort);
            }
            else
            {
                resultDocs = searcher.Search(bQuery, null, topCount);
            }
            if (topCount > resultDocs.TotalHits)
            {
                topCount = resultDocs.TotalHits;
            }
            Dictionary<string, PropertyInfo> highlightProps = null;
            List<TResult> results = new List<TResult>();
            if (resultDocs != null)
            {
                for (int i = 0; i < topCount; i++)
                {
                    Document doc = searcher.Doc(resultDocs.ScoreDocs[i].Doc);
                    var model = buildResultFunc(doc);
                    if (needHighlight)
                    {
                        model = SetHighlighter(keywords, model, ref highlightProps);
                    }
                    results.Add(model);
                }
            }
            return results;
        }
        /// <summary>
        /// Searches the index and returns one page of matching records
        /// </summary>
        /// <typeparam name="TResult">type of the result items</typeparam>
        /// <param name="indexDir">index directory path</param>
        /// <param name="buildQueryAction">callback that fills the BooleanQuery and returns the field/keyword pairs used for highlighting</param>
        /// <param name="getSortFieldsFunc">callback that returns the sort fields (may return null)</param>
        /// <param name="buildResultFunc">callback that maps a Document to a result item</param>
        /// <param name="pageSize">page size</param>
        /// <param name="page">1-based page number</param>
        /// <param name="totalCount">total number of matching records</param>
        /// <param name="needHighlight">whether to highlight the keywords in the results</param>
        /// <returns>list of matching records on the requested page</returns>
        public static List<TResult> SearchIndexByPage<TResult>(string indexDir, Func<BooleanQuery, IDictionary<string, string>> buildQueryAction,
            Func<IEnumerable<SortField>> getSortFieldsFunc, Func<Document, TResult> buildResultFunc, int pageSize, int page, out int totalCount, bool needHighlight = true)
        {
            FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexDir), new NoLockFactory());
            if (!IndexReader.IndexExists(directory))
            {
                totalCount = 0;
                return new List<TResult>();
            }
            IndexReader reader = IndexReader.Open(directory, true);
            IndexSearcher searcher = new IndexSearcher(reader);
            BooleanQuery bQuery = new BooleanQuery();
            var keywords = buildQueryAction(bQuery);
            Sort sort = null;
            var sortFields = getSortFieldsFunc();
            if (sortFields != null)
            {
                sort = new Sort();
                sort.SetSort(sortFields.ToArray());
            }
            // Run a minimal search first just to obtain the total hit count
            TopScoreDocCollector docCollector = TopScoreDocCollector.Create(1, true);
            searcher.Search(bQuery, docCollector);
            totalCount = docCollector.TotalHits;
            if (totalCount <= 0) return new List<TResult>(); // keep the return type consistent with the empty-index case
            TopDocs resultDocs;
            if (sort != null)
            {
                resultDocs = searcher.Search(bQuery, null, pageSize * page, sort);
            }
            else
            {
                resultDocs = searcher.Search(bQuery, null, pageSize * page); // guard against passing a null Sort
            }
            Dictionary<string, PropertyInfo> highlightProps = null;
            List<TResult> results = new List<TResult>();
            int indexStart = (page - 1) * pageSize;
            int indexEnd = indexStart + pageSize;
            if (totalCount < indexEnd) indexEnd = totalCount;
            if (resultDocs != null)
            {
                for (int i = indexStart; i < indexEnd; i++)
                {
                    Document doc = searcher.Doc(resultDocs.ScoreDocs[i].Doc);
                    var model = buildResultFunc(doc);
                    if (needHighlight)
                    {
                        model = SetHighlighter(keywords, model, ref highlightProps);
                    }
                    results.Add(model);
                }
            }
            return results;
        }
        /// <summary>
        /// Highlights the matched keywords in the result model
        /// </summary>
        /// <typeparam name="T">type of the result model</typeparam>
        /// <param name="dicKeywords">field name / keyword pairs</param>
        /// <param name="model">result model to decorate</param>
        /// <param name="props">property cache, built lazily across calls</param>
        /// <returns>the model with highlighted string properties</returns>
        private static T SetHighlighter<T>(IDictionary<string, string> dicKeywords, T model, ref Dictionary<string, PropertyInfo> props)
        {
            SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<font color=\"red\">", "</font>");
            Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new Segment());
            highlighter.FragmentSize = 250;
            Type modelType = typeof(T);
            foreach (var item in dicKeywords)
            {
                if (!string.IsNullOrWhiteSpace(item.Value))
                {
                    if (props == null)
                    {
                        props = new Dictionary<string, PropertyInfo>();
                    }
                    if (!props.ContainsKey(item.Key))
                    {
                        props[item.Key] = modelType.GetProperty(item.Key, BindingFlags.IgnoreCase | BindingFlags.Public | BindingFlags.Instance);
                    }
                    var modelProp = props[item.Key];
                    if (modelProp.PropertyType == typeof(string))
                    {
                        string newValue = highlighter.GetBestFragment(item.Value, modelProp.GetValue(model).ToString());
                        if (!string.IsNullOrEmpty(newValue))
                        {
                            modelProp.SetValue(model, newValue);
                        }
                    }
                }
            }
            return model;
        }
        /// <summary>
        /// Splits a phrase into space-separated keywords, weighted by word rank
        /// </summary>
        /// <param name="keyword">phrase to split</param>
        /// <returns>space-separated keywords in the form word^boost</returns>
        public static string GetKeywordsSplitBySpace(string keyword)
        {
            PanGuTokenizer ktTokenizer = new PanGuTokenizer();
            StringBuilder result = new StringBuilder();
            ICollection<WordInfo> words = ktTokenizer.SegmentToWordInfos(keyword);
            foreach (WordInfo word in words)
            {
                if (word == null)
                {
                    continue;
                }
                // boost each term by 3^rank, e.g. "term^27.0"
                result.AppendFormat("{0}^{1}.0 ", word.Word, (int)Math.Pow(3, word.Rank));
            }
            return result.ToString().Trim();
        }
        /// <summary>
        /// [Helper] Creates a PanGu query for a single field
        /// </summary>
        /// <param name="field">field name to query</param>
        /// <param name="keyword">keyword(s) to search for</param>
        /// <param name="needSplit">whether the keyword still needs to be split by PanGu</param>
        /// <returns>the parsed query</returns>
        public static Query CreatePanGuQuery(string field, string keyword, bool needSplit = true)
        {
            if (needSplit)
            {
                keyword = GetKeywordsSplitBySpace(keyword);
            }
            QueryParser parse = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, field, new PanGuAnalyzer());
            parse.DefaultOperator = QueryParser.Operator.OR;
            Query query = parse.Parse(keyword);
            return query;
        }
        /// <summary>
        /// [Helper] Creates a PanGu query over multiple fields
        /// </summary>
        /// <param name="keyword">keyword(s) to search for</param>
        /// <param name="needSplit">whether the keyword still needs to be split by PanGu</param>
        /// <param name="fields">field names to query</param>
        /// <returns>the parsed query</returns>
        public static Query CreatePanGuMultiFieldQuery(string keyword, bool needSplit, params string[] fields)
        {
            if (needSplit)
            {
                keyword = GetKeywordsSplitBySpace(keyword);
            }
            QueryParser parse = new MultiFieldQueryParser(Lucene.Net.Util.Version.LUCENE_30, fields, new PanGuAnalyzer());
            parse.DefaultOperator = QueryParser.Operator.OR;
            Query query = parse.Parse(keyword);
            return query;
        }
    }
}
Besides the Lucene.Net NuGet package, the class also references the PanGu analyzer and its related components, because in most cases our content contains Chinese text. I won't walk through the code again, since the comments explain it fairly well. Below are some practical usages.
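One note first: the samples call a GetSearchIndexDir() helper that isn't shown in this post; it just resolves the physical path of the index directory. A minimal sketch, assuming the index lives in a "SearchIndex" folder under the application root (the path is my assumption, not from the original code):
private static string GetSearchIndexDir()
{
    // Hypothetical helper: resolve the Lucene index directory.
    // Assumption: a "SearchIndex" folder under the application base directory.
    return Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "SearchIndex");
}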
Create an index:
SearchEngineUtil.AddIndex(GetSearchIndexDir(), post, (doc, data) => BuildPostSearchDocument(data, doc));
private Document BuildPostSearchDocument(Post post, Document doc = null)
{
    if (doc == null)
    {
        doc = new Document(); // create the document
    }
    doc.Add(new Field("id", post.Id.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.Add(new Field("title", post.Title, Field.Store.YES, Field.Index.ANALYZED));
    doc.Add(new Field("summary", post.Summary, Field.Store.YES, Field.Index.ANALYZED));
    doc.Add(new Field("createtime", post.CreateTime.ToString("yyyy/MM/dd HH:mm"), Field.Store.YES, Field.Index.NO));
    doc.Add(new Field("author", post.IsOriginal ? (post.Creator ?? userQueryService.FindByName(post.CreateBy)).NickName : post.SourceBy, Field.Store.YES, Field.Index.NO));
    return doc;
}
Delete an index record:
SearchEngineUtil.DeleteIndex(GetSearchIndexDir(), "id", post.Id);
Update an index record:
SearchEngineUtil.UpdateIndex(GetSearchIndexDir(), "id", post.Id, BuildPostSearchDocument(post));
Paged search:
var keyword = SearchEngineUtil.GetKeywordsSplitBySpace("梦在旅途 中国梦");
var searchResult = SearchEngineUtil.SearchIndexByPage(indexDir, (bQuery) =>
{
    var query = SearchEngineUtil.CreatePanGuMultiFieldQuery(keyword, false, "title", "summary");
    bQuery.Add(query, Occur.SHOULD);
    return new Dictionary<string, string> {
        { "title", keyword }, { "summary", keyword }
    };
}, () =>
{
    return new[] { new SortField("id", SortField.INT, true) };
}, doc =>
{
    return new PostSearchInfoDto
    {
        Id = doc.Get("id"),
        Title = doc.Get("title"),
        Summary = doc.Get("summary"),
        Author = doc.Get("author"),
        CreateTime = doc.Get("createtime")
    };
}, pageSize, pageNo, out totalCount);
The remaining methods, such as checking whether a given document exists in the index and querying all matching index documents, are not shown in usage form above; a quick sketch follows, and you are welcome to copy the class into your own project and try them out.
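Here is a minimal sketch of those two calls (my own illustration, not from the original post; it reuses PostSearchInfoDto and the assumed GetSearchIndexDir() helper from the examples above):
// Check whether a post's document exists in the index
bool exists = SearchEngineUtil.ExistsDocument(GetSearchIndexDir(), "id", post.Id);

// Top-10 relevance-ranked search over the title field only
var keyword = SearchEngineUtil.GetKeywordsSplitBySpace("中国梦");
var topPosts = SearchEngineUtil.SearchIndex(GetSearchIndexDir(), bQuery =>
{
    // keyword was already split above, hence needSplit = false
    bQuery.Add(SearchEngineUtil.CreatePanGuQuery("title", keyword, false), Occur.SHOULD);
    return new Dictionary<string, string> { { "title", keyword } };
},
() => null, // no explicit sort: results come back ranked by score
doc => new PostSearchInfoDto { Id = doc.Get("id"), Title = doc.Get("title") },
needHighlight: true, topCount: 10);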
Here is what the search scenario looks like in my own project (a complete redesign of my blog, still under development):
One last note: Lucene is not a complete, ready-to-run full-text search engine, but understanding it does help when learning Elasticsearch or Solr. In real production projects these days, the higher-level Elasticsearch or Solr is usually used instead.
(I actually wrote the code in this post quite some time ago this year; I'm only sharing it today.)
I like wrapping commonly used components; past examples include:
A MongoDbCsharpHelper class (a CRUD class) built on the official MongoDB C# driver
A reusable RabbitMQ ConnectionPool built on the RabbitMQ.Client component