Lucene+springboot 实现一个简单的搜索
程序员文章站
2022-03-04 23:28:28
...
1、背景:网站需要实现一个检索,但是mysql的like已经不能满足需求,需要类似全文检索,在之前简单的接触过elasticsearch,感觉类似elasticsearch的搜索可以满足,最后决定集成lucene实现搜索。(可以直接使用es,为什么没有使用就不多说了)
2、环境:java8、springboot2.2,maven,lucene7.6
3、在pom文件中添加依赖
<!-- Lucene -->
<!--核心包-->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>7.6.0</version>
</dependency>
<!--对分词索引查询解析-->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
<version>7.6.0</version>
</dependency>
<!--一般分词器,适用于英文分词-->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
<version>7.6.0</version>
</dependency>
<!--检索关键字高亮显示 -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-highlighter</artifactId>
<version>7.6.0</version>
</dependency>
<!-- smartcn中文分词器 -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-smartcn</artifactId>
<version>7.6.0</version>
</dependency>
4、创建查询索引
package 包名
import java.io.File;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.List;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.RequestMapping;
import com.cuiyanet.entity.inter.EncyRetrievalIN;
import com.cuiyanet.service.EncyRetrievalService;
/**
 * @Description Builds the Lucene search index from database records.
 *
 * NOTE(review): the title field is stored under the name "tilte" (typo for
 * "title"). It is kept byte-identical here because the search/display code
 * must read the exact same field name; rename in both places together.
 */
@Controller
@RequestMapping("/test/lucene")
public class test {
    private static final Logger logger = LoggerFactory.getLogger(test.class);

    // Filesystem location of the generated index; the search side opens the same path.
    private static final String INDEX_PATH = "D:\\lucene\\lucene_index";

    @Autowired
    private EncyRetrievalService encyRetrievalService;

    /**
     * @Description Creates the query index: loads all retrieval rows from the
     * database and writes one Lucene document per row.
     * @throws IOException if the index directory cannot be opened or written
     */
    @RequestMapping("/creat")
    public void indexCreate() throws IOException {
        logger.info("===================>索引位置:{}", Paths.get(INDEX_PATH));
        // Analyzer used at index time; queries must use the same analyzer.
        SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer();
        // try-with-resources guarantees both the writer and the directory are
        // closed even if indexing throws midway (the original leaked both on failure).
        try (Directory directory = FSDirectory.open(Paths.get(INDEX_PATH));
             IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(analyzer))) {
            // Rows to index come from the database; for a quick test a few
            // hard-coded strings work just as well.
            List<EncyRetrievalIN> retrievalList =
                    encyRetrievalService.searchEncyRetrievalIN(new EncyRetrievalIN());
            for (EncyRetrievalIN item : retrievalList) {
                Document doc = new Document();
                // TextField: analyzed and indexed; Store.YES keeps the raw value retrievable.
                doc.add(new TextField("tilte", item.getTitle(), Field.Store.YES)); // NOTE(review): "tilte" typo kept for compatibility
                doc.add(new TextField("content", item.getProfiles(), Field.Store.YES));
                doc.add(new TextField("keyWord", item.getKeyWord(), Field.Store.YES));
                // StringField: indexed as a single un-analyzed token (exact-match values).
                doc.add(new StringField("clicks", String.valueOf(item.getClicks()), Field.Store.YES));
                doc.add(new StringField("id", String.valueOf(item.getId()), Field.Store.YES));
                // NumericDocValuesField enables sort-by-field at query time.
                doc.add(new NumericDocValuesField("sortid", item.getId()));
                doc.add(new NumericDocValuesField("click", item.getClicks()));
                indexWriter.addDocument(doc);
            }
            logger.info("================IndexWriter创建索引个数===============》:{}", indexWriter.numDocs());
        }
    }
}
5、查询
package 包名;
import java.io.File;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopFieldCollector;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery.Builder;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Controller;
import org.springframework.ui.Model;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.ModelAttribute;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.ResponseBody;
import com.alibaba.fastjson.JSONObject;
import com.cuiyanet.common.util.PageBean;
import com.cuiyanet.common.util.PageInfo;
import com.cuiyanet.entity.WebResult;
import com.cuiyanet.entity.inter.EncyRetrievalIN;
import com.cuiyanet.entity.vo.EncyRetrievalSearchVO;
import com.cuiyanet.service.EncyRetrievalSearchService;
import com.cuiyanet.service.EncyRetrievalService;
import com.github.pagehelper.PageHelper;
import io.swagger.annotations.ApiImplicitParam;
import io.swagger.annotations.ApiImplicitParams;
import io.swagger.annotations.ApiOperation;
/**
 * @Description Search endpoint over the Lucene index built by the
 * index-creation controller. Field names and analyzer must match those used
 * at index time or no hits are returned.
 */
@Controller
@RequestMapping("/test/lucene")
public class test {
    private static final Logger logger = LoggerFactory.getLogger(test.class);

    @Autowired
    private EncyRetrievalService encyRetrievalService;
    @Autowired
    private EncyRetrievalSearchService encyRetrievalSearchService;

    /**
     * @Description Paged full-text search.
     *
     * @param page  1-based page number
     * @param limit page size
     * @param title user query string; blank skips the search entirely
     */
    @GetMapping("/searchList")
    public void searchList(@RequestParam(value = "page", defaultValue = "1") int page, //分页
            @RequestParam(value = "limit", defaultValue = "7") int limit, //分页
            @RequestParam(value = "title", defaultValue = "") String title //搜索内容
    ) {
        try {
            // BUGFIX: original used `title != ""` — a reference comparison that is
            // true for any request parameter; compare content instead.
            if (!title.isEmpty()) {
                if (page <= 0) {
                    page = 1;
                }
                if (limit <= 0) {
                    limit = 10;
                }
                int startIndex = (page - 1) * limit;
                // Must be the same analyzer the index was built with, otherwise
                // tokens will not match.
                SmartChineseAnalyzer smartChineseAnalyzer = new SmartChineseAnalyzer();
                // BUGFIX: the index stores the fields "tilte" (sic) and "keyWord";
                // the original searched "title"/"keyword" and could never match.
                String[] fields = new String[] { "tilte", "keyWord" };
                BooleanClause.Occur[] flags = { BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD };
                String[] queries = new String[] { title, title };
                Query query = MultiFieldQueryParser.parse(queries, fields, flags, smartChineseAnalyzer);
                Builder builder = new BooleanQuery.Builder();
                // NOTE(review): "typeid" is never written by the visible indexing
                // code; if the field is absent this MUST clause filters out every
                // hit — confirm against the real index schema.
                builder.add(new QueryParser("typeid", smartChineseAnalyzer).parse(String.valueOf(11)), Occur.MUST); //业务需求固定查询11
                builder.add(query, Occur.MUST);
                // MUST+MUST = intersection; SHOULD+SHOULD = union; MUST_NOT excludes.
                // try-with-resources: the original never closed the reader/directory.
                try (Directory directory = FSDirectory.open(Paths.get("D:\\lucene\\lucene_index"));
                     IndexReader indexReader = DirectoryReader.open(directory)) {
                    IndexSearcher indexSearcher = new IndexSearcher(indexReader);
                    // Relevance first, then click count descending.
                    // BUGFIX: the docvalues field is named "click" in the index,
                    // not "clicks" (which is an un-sortable StringField).
                    SortField byScore = SortField.FIELD_SCORE;
                    SortField byClicks = new SortField("click", SortField.Type.LONG, true);
                    Sort sort = new Sort(byScore, byClicks);
                    // Collector-based paging: collect up to 100000 hits, then slice.
                    TopFieldCollector collector = TopFieldCollector.create(sort, 100000, false, false, false, false);
                    indexSearcher.search(builder.build(), collector);
                    // Total hit count comes from the collector — the original ran a
                    // second full search just for an unused TopDocs.
                    logger.info("total hits: {}", collector.getTotalHits());
                    // Slice (startIndex, limit), e.g. (5,10) = ten docs after the fifth.
                    ScoreDoc[] hits = collector.topDocs(startIndex, limit).scoreDocs;
                    for (ScoreDoc sdoc : hits) {
                        Document hitDoc = indexSearcher.doc(sdoc.doc); // fetch stored fields by doc id
                        System.out.println(hitDoc.get("tilte")); // stored field name matches the index ("tilte")
                    }
                }
            }
        } catch (Exception e) {
            logger.error("Exception", e);
        }
    }
}
6、除了BooleanQuery查询,其余的查询还有好多,如短语、模糊、范围等,但是目前网上的低版本比较多,查询的方法有很多不好用,而且一般简单的搜索满足不了业务,这里只介绍了创建索引,和查询的基础方法,更新和删除不做介绍。
上一篇: p2p学习记录