欢迎您访问程序员文章站,本站旨在为大家提供和分享程序员计算机编程知识!
您现在的位置是: 首页

Lucene+springboot 实现一个简单的搜索

程序员文章站 2022-03-04 23:28:28
...

1、背景:网站需要实现检索功能,但 MySQL 的 LIKE 查询已经不能满足需求,需要类似全文检索的能力。之前简单接触过 Elasticsearch,感觉类似 Elasticsearch 的搜索可以满足需求,最后决定集成 Lucene 实现搜索。(也可以直接使用 ES,这里为什么没有使用就不多说了)

2、环境:java8、springboot2.2,maven,lucene7.6

3、在pom文件中添加依赖

        <!-- Lucene -->
		<!-- Core library -->
		<dependency>
		  <groupId>org.apache.lucene</groupId>
		  <artifactId>lucene-core</artifactId>
		  <version>7.6.0</version>
		</dependency>
		<!-- Query parser: parses query strings for searching the analyzed index -->
		<dependency>
		  <groupId>org.apache.lucene</groupId>
		  <artifactId>lucene-queryparser</artifactId>
		  <version>7.6.0</version>
		</dependency>
		<!-- Common analyzers, suited to English tokenization -->
		<dependency>
		  <groupId>org.apache.lucene</groupId>
		  <artifactId>lucene-analyzers-common</artifactId>
		  <version>7.6.0</version>
		</dependency>
		<!-- Highlighting of matched keywords in search results -->
		<dependency>
		  <groupId>org.apache.lucene</groupId>
		  <artifactId>lucene-highlighter</artifactId>
		  <version>7.6.0</version>
		</dependency>
		<!-- smartcn Chinese analyzer -->
		<dependency>
		  <groupId>org.apache.lucene</groupId>
		  <artifactId>lucene-analyzers-smartcn</artifactId>
		  <version>7.6.0</version>
		</dependency>

4、创建查询索引

package 包名;
import java.io.File;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.List;

import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.RequestMapping;

import com.cuiyanet.entity.inter.EncyRetrievalIN;
import com.cuiyanet.service.EncyRetrievalService;

/**
 * @Description lucene搜索引擎的实现方法
 */
@Controller
@RequestMapping("/test/lucene")
public class test {
	private static final Logger logger = LoggerFactory.getLogger(test.class);
	
	@Autowired
	private EncyRetrievalService encyRetrievalService;

	/**
	 * @Description 创建查询索引		
	 * @throws IOException
	 */
	@RequestMapping("/creat")
	public void indexCreate() throws IOException {
		
		////指定索引的生成位置,后面查询会使用
		Directory directory = FSDirectory.open(Paths.get(new File("D:\\lucene\\lucene_index").getPath()));		
        logger.info("===================>索引位置:"+Paths.get(new File("D:\\lucene\\lucene_index").getPath()));
        
        //创建一个分词器,表示你存入的内容使用的是该分词器
        SmartChineseAnalyzer smartChineseAnalyzer = new SmartChineseAnalyzer();
        //创建indexwriterConfig(参数分词器)
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(smartChineseAnalyzer);
        //创建indexwrite 对象(文件对象,索引配置对象)
        IndexWriter indexWriter = new IndexWriter(directory,indexWriterConfig);
		
        //此处是从数据库中查询你需要建索引的东西
		List<EncyRetrievalIN> retrievalList = encyRetrievalService.searchEncyRetrievalIN(new EncyRetrievalIN());
		//如果做测试可以直接写几个字符串就可以了
		
		//循环生成索引
		for(int i = 0; i < retrievalList.size(); i ++) {
			
			/**这个很关键,忘记了从哪个文章看到的
	         * 1、LongPoint(String name, int... point) : 在Lucene 6.0中,LongField替换为LongPoint,IntField替换为IntPoint,FloatField替换为FloatPoint,DoubleField替换为DoublePoint。对int型字段索引,索引不存储,提供了一些静态工厂方法用于创建一般的查询,提供了不同于文本的数值类型存储方式,使用KD-trees索引
	         * 2、StringField(FieldName, FieldValue,Store.YES)) : 只索引但不分词,会将整个串存储在索引中,比如(订单号,身份证号等)是否存储在文档中用Store.YES或Store.NO决定
	         * 3、StoredField(FieldName, FieldValue) : 存储Field的值,不分析,不索引,可以用IndexSearcher.doc和IndexReader.document来获取存储的Field和存储的值
	         * 4、TextField(FieldName,FieldValue, Store.NO) :索引并分词,不包括term vectors(词向量,下面会讲),例如通常用于一个body Field
	         */
			Field titleField = new TextField("tilte", retrievalList.get(i).getTitle() , Field.Store.YES);
			Field contentField = new TextField("content", retrievalList.get(i).getProfiles() , Field.Store.YES);
			Field keyWordField = new TextField("keyWord", retrievalList.get(i).getKeyWord() , Field.Store.YES);
			Field clicksField = new StringField("clicks", String.valueOf(retrievalList.get(i).getClicks()) , Field.Store.YES);
			Field idField = new StringField("id", String.valueOf(retrievalList.get(i).getId()) , Field.Store.YES);
			Document doc = new Document();
			doc.add(titleField);
			doc.add(contentField);
			doc.add(keyWordField);
			doc.add(clicksField);
			doc.add(idField);
			doc.add(new NumericDocValuesField("sortid",retrievalList.get(i).getId())); 		//这个东西是后面查询的时候可以用这个排序用的
			doc.add(new NumericDocValuesField("click",retrievalList.get(i).getClicks()));  
			indexWriter.addDocument(doc);							//写入文档	
		}
		
	    // 查看IndexWriter里面有多少个索引
	    logger.info("================IndexWriter创建索引个数===============》:"+indexWriter.numDocs());
	    // 关闭索引
	    indexWriter.close();
	}
	
}

5、查询

package 包名;
import java.io.File;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopFieldCollector;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery.Builder;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Controller;
import org.springframework.ui.Model;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.ModelAttribute;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.ResponseBody;

import com.alibaba.fastjson.JSONObject;
import com.cuiyanet.common.util.PageBean;
import com.cuiyanet.common.util.PageInfo;
import com.cuiyanet.entity.WebResult;
import com.cuiyanet.entity.inter.EncyRetrievalIN;
import com.cuiyanet.entity.vo.EncyRetrievalSearchVO;
import com.cuiyanet.service.EncyRetrievalSearchService;
import com.cuiyanet.service.EncyRetrievalService;
import com.github.pagehelper.PageHelper;

import io.swagger.annotations.ApiImplicitParam;
import io.swagger.annotations.ApiImplicitParams;
import io.swagger.annotations.ApiOperation;

/**
 * Demo controller that searches the Lucene index built by the indexing endpoint.
 */
@Controller
@RequestMapping("/test/lucene")
public class test {
	private static final Logger logger = LoggerFactory.getLogger(test.class);

	/** Location of the on-disk index; must be the directory the indexer wrote to. */
	private static final String INDEX_DIR = "D:\\lucene\\lucene_index";

	/** Upper bound on how deep into the result list a request may page. */
	private static final int MAX_HITS = 100000;

	// Field names must match exactly what was written at index time: the indexer stores the
	// title under "tilte" (sic), the keywords under "keyWord", and the sortable click count
	// as numeric doc values under "click" ("clicks" is a plain string field and cannot be
	// sorted as a LONG).
	private static final String FIELD_TITLE = "tilte";
	private static final String FIELD_KEYWORD = "keyWord";
	private static final String FIELD_CLICK_SORT = "click";

	@Autowired
	private EncyRetrievalService encyRetrievalService;
	@Autowired
	private EncyRetrievalSearchService encyRetrievalSearchService;

	/**
	 * Searches title and keywords for the given text and logs the titles of one result page.
	 *
	 * <p>Results are ordered by relevance first and by click count (descending) second.
	 * Nothing is searched when {@code title} is empty. Any failure is logged, not rethrown.
	 *
	 * @param page  1-based page number; values below 1 are treated as 1
	 * @param limit page size; values below 1 are treated as 10
	 * @param title free-text search input; query-syntax characters are escaped
	 */
	@GetMapping("/searchList")
	public void searchList(@RequestParam(value = "page", defaultValue = "1") int page,
			@RequestParam(value = "limit", defaultValue = "7") int limit,
			@RequestParam(value = "title", defaultValue = "") String title
	) {
		// Compare string content, not references: `title != ""` is true for any request value.
		if (title == null || title.isEmpty()) {
			return;
		}
		if (page < 1) {
			page = 1;
		}
		if (limit < 1) {
			limit = 10;
		}
		// long arithmetic so a huge page number cannot overflow into a negative offset
		long offset = (long) (page - 1) * limit;
		int startIndex = (int) Math.min(offset, MAX_HITS);
		// Collect only as many hits as this page needs (capped), not MAX_HITS on every request.
		int numHits = (int) Math.min(offset + limit, MAX_HITS);

		// Use the same analyzer as at index time, otherwise terms may not match.
		try (SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer();
				Directory directory = FSDirectory.open(Paths.get(INDEX_DIR));
				IndexReader indexReader = DirectoryReader.open(directory)) {

			// Escape query-syntax characters so input such as "C++" or "a:b" is searched
			// literally instead of failing with a ParseException.
			String escaped = QueryParser.escape(title);
			Query textQuery = MultiFieldQueryParser.parse(
					new String[] { escaped, escaped },
					new String[] { FIELD_TITLE, FIELD_KEYWORD },
					new BooleanClause.Occur[] { Occur.SHOULD, Occur.SHOULD },
					analyzer);

			// How clauses combine in a BooleanQuery:
			// 1. MUST + MUST: intersection of both clauses.
			// 2. MUST + MUST_NOT: MUST results minus everything matching MUST_NOT.
			// 3. SHOULD + MUST_NOT: behaves like MUST + MUST_NOT.
			// 4. SHOULD + MUST: results of MUST; SHOULD only influences ranking.
			// 5. SHOULD + SHOULD: union ("or") of both clauses.
			// 6. MUST_NOT + MUST_NOT: meaningless, matches nothing.
			BooleanQuery.Builder builder = new BooleanQuery.Builder();
			// Business rule: only documents with typeid 11.
			// NOTE(review): the indexing code shown alongside this class does not write a
			// "typeid" field; unless it is indexed elsewhere this MUST clause matches nothing.
			builder.add(new QueryParser("typeid", analyzer).parse(String.valueOf(11)), Occur.MUST);
			builder.add(textQuery, Occur.MUST);
			Query query = builder.build();

			IndexSearcher indexSearcher = new IndexSearcher(indexReader);

			// Most relevant first, then highest click count first.
			Sort sort = new Sort(SortField.FIELD_SCORE, new SortField(FIELD_CLICK_SORT, SortField.Type.LONG, true));
			// Arguments: fillFields, trackDocScores, trackMaxScore, trackTotalHits.
			TopFieldCollector collector = TopFieldCollector.create(sort, numHits, false, false, false, true);
			indexSearcher.search(query, collector);

			// The collector already knows the total; no second search is needed for it.
			logger.info("total hits: {}", collector.getTotalHits());

			// Paging: skip startIndex hits, then take up to `limit` hits.
			for (ScoreDoc hit : collector.topDocs(startIndex, limit).scoreDocs) {
				Document hitDoc = indexSearcher.doc(hit.doc); // load stored fields by internal doc id
				logger.info("{}", hitDoc.get(FIELD_TITLE));
			}
		} catch (Exception e) {
			logger.error("Exception", e);
		}
	}

}

6、除了 BooleanQuery 查询,其余的查询方式还有很多,如短语查询、模糊查询、范围查询等。但是目前网上的资料以低版本居多,其中很多查询方法在新版本中已经不好用,而且一般简单的搜索也满足不了业务需求。这里只介绍了创建索引和查询的基础方法,更新和删除不做介绍。

相关标签: 全文检索