Lucene构建索引
程序员文章站
2022-03-23 14:52:08
...
1. 添加、删除、修改文档
2. 文档域加权
新建 Maven 项目：New Maven Project -> Create a simple project -> Group Id: com.andrew.lucene, Artifact Id: Lucene02, Version: 0.0.1-SNAPSHOT, Packaging: jar
pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <!-- Project coordinates; these match the values entered in the project wizard. -->
  <groupId>com.andrew.lucene</groupId>
  <artifactId>Lucene02</artifactId>
  <version>0.0.1-SNAPSHOT</version>

  <dependencies>
    <!-- Lucene modules; all three are pinned to the same version, 5.3.1. -->
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-core</artifactId>
      <version>5.3.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-queryparser</artifactId>
      <version>5.3.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-analyzers-common</artifactId>
      <version>5.3.1</version>
    </dependency>

    <!-- JUnit 4, used by the @Test classes of this project. -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.12</version>
    </dependency>
  </dependencies>
</project>
IndexingTest.java代码

package com.andrew.lucene;

import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

/**
 * Demonstrates adding, counting, deleting and updating documents with a
 * Lucene {@link IndexWriter}.
 *
 * <p>Every test starts from a freshly rebuilt three-document index (see
 * {@link #setUp()}), so the printed counts do not depend on earlier runs or on
 * the order in which JUnit executes the tests.
 */
public class IndexingTest {

    private final String[] ids = { "1", "2", "3" };
    private final String[] citys = { "qingdao", "nanjing", "shanghai" };
    private final String[] descs = {
        "Qingdao is a beautiful city.",
        "Nanjing is a city of culture.",
        "Shanghai is a bustling city."
    };

    /** Index directory shared by the fixture and the tests; opened in setUp, closed in tearDown. */
    private Directory dir;

    /**
     * Opens the index directory and rebuilds the index with one document per
     * entry of {@code ids}.
     *
     * @throws Exception if the directory cannot be opened or written
     */
    @Before
    public void setUp() throws Exception {
        dir = FSDirectory.open(Paths.get("E:\\lucene2"));

        // CREATE discards whatever a previous test or run left in the directory.
        // With the default CREATE_OR_APPEND every setUp would append three more
        // documents and the counts printed by the tests would keep growing.
        try (IndexWriter writer = getWriter(IndexWriterConfig.OpenMode.CREATE)) {
            for (int i = 0; i < ids.length; i++) {
                Document doc = new Document();
                doc.add(new StringField("id", ids[i], Field.Store.YES));
                doc.add(new StringField("city", citys[i], Field.Store.YES));
                doc.add(new TextField("desc", descs[i], Field.Store.NO));
                writer.addDocument(doc); // add the document to the index
            }
        }
    }

    /**
     * Closes the index directory opened by {@link #setUp()}.
     *
     * @throws Exception if closing the directory fails
     */
    @After
    public void tearDown() throws Exception {
        if (dir != null) {
            dir.close();
            dir = null;
        }
    }

    /**
     * Creates an {@link IndexWriter} on {@code dir} that uses a
     * {@link StandardAnalyzer} and the default open mode, i.e. it appends to
     * the index built by {@link #setUp()}.
     *
     * @return a new writer; the caller is responsible for closing it
     * @throws Exception if the writer cannot be opened
     */
    private IndexWriter getWriter() throws Exception {
        return getWriter(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    }

    /**
     * Creates an {@link IndexWriter} on {@code dir} with the given open mode.
     *
     * @param mode whether to overwrite or append to an existing index
     * @return a new writer; the caller is responsible for closing it
     * @throws Exception if the writer cannot be opened
     */
    private IndexWriter getWriter(IndexWriterConfig.OpenMode mode) throws Exception {
        Analyzer analyzer = new StandardAnalyzer(); // standard tokenizer/analyzer
        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
        iwc.setOpenMode(mode);
        return new IndexWriter(dir, iwc);
    }

    /** Prints how many documents the writer sees in the index. */
    @Test
    public void testIndexWriter() throws Exception {
        try (IndexWriter writer = getWriter()) {
            System.out.println("写入了" + writer.numDocs() + "个文档");
        }
    }

    /** Prints the reader's maxDoc and numDocs values. */
    @Test
    public void testIndexReader() throws Exception {
        try (IndexReader reader = DirectoryReader.open(dir)) {
            System.out.println("最大文档数:" + reader.maxDoc());
            System.out.println("实际文档数:" + reader.numDocs());
        }
    }

    /**
     * Deletes the document with id "1" and commits without forcing a merge,
     * then prints maxDoc and numDocs.
     */
    @Test
    public void testDeleteBeforeMerge() throws Exception {
        try (IndexWriter writer = getWriter()) {
            System.out.println("删除前:" + writer.numDocs());
            writer.deleteDocuments(new Term("id", "1"));
            writer.commit();
            System.out.println("writer.maxDoc():" + writer.maxDoc());
            System.out.println("writer.numDocs():" + writer.numDocs());
        }
    }

    /**
     * Deletes the document with id "1", forces the deletes to be merged away,
     * commits, then prints maxDoc and numDocs.
     */
    @Test
    public void testDeleteAfterMerge() throws Exception {
        try (IndexWriter writer = getWriter()) {
            System.out.println("删除前:" + writer.numDocs());
            writer.deleteDocuments(new Term("id", "1"));
            writer.forceMergeDeletes(); // force merging of segments that contain deletions
            writer.commit();
            System.out.println("writer.maxDoc():" + writer.maxDoc());
            System.out.println("writer.numDocs():" + writer.numDocs());
        }
    }

    /** Replaces the document whose id is "1" with a new version of it. */
    @Test
    public void testUpdate() throws Exception {
        try (IndexWriter writer = getWriter()) {
            Document doc = new Document();
            doc.add(new StringField("id", "1", Field.Store.YES));
            doc.add(new StringField("city", "qingdao", Field.Store.YES));
            doc.add(new TextField("desc", "dsss is a city.", Field.Store.NO));
            writer.updateDocument(new Term("id", "1"), doc);
        }
    }
}

执行结果
最大文档数:3
实际文档数:3
删除前:3
writer.maxDoc():3
writer.numDocs():2
删除前:3
writer.maxDoc():2
writer.numDocs():2
2. 文档域加权
IndexingTest2.java代码 package com.andrew.lucene; import java.nio.file.Paths; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.junit.Test; public class IndexingTest2 { private String ids[] = { "1", "2", "3", "4" }; private String authors[] = { "Jack", "Marry", "John", "Json" }; private String positions[] = { "accounting", "technician", "salesperson", "boss" }; private String titles[] = { "Java is a good language.", "Java is a cross platform language", "Java powerful", "You should learn java" }; private String contents[] = { "If possible, use the same JRE major version at both index and search time.", "When upgrading to a different JRE major version, consider re-indexing. 
", "Different JRE major versions may implement different versions of Unicode,", "For example: with Java 1.4, `LetterTokenizer` will split around the character U+02C6," }; private Directory dir; // 获取IndexWriter实例 private IndexWriter getWriter() throws Exception { Analyzer analyzer = new StandardAnalyzer(); // 标准分词器 IndexWriterConfig iwc = new IndexWriterConfig(analyzer); IndexWriter writer = new IndexWriter(dir, iwc); return writer; } // 生成索引 @Test public void index() throws Exception { dir = FSDirectory.open(Paths.get("E:\\lucene3")); IndexWriter writer = getWriter(); for (int i = 0; i < ids.length; i++) { Document doc = new Document(); doc.add(new StringField("id", ids[i], Field.Store.YES)); doc.add(new StringField("author", authors[i], Field.Store.YES)); doc.add(new StringField("position", positions[i], Field.Store.YES)); // 加权操作 TextField field = new TextField("title", titles[i], Field.Store.YES); // if ("boss".equals(positions[i])) { // field.setBoost(1.5f); // } doc.add(field); doc.add(new TextField("content", contents[i], Field.Store.NO)); writer.addDocument(doc); // 添加文档 } writer.close(); } // 查询 @Test public void search() throws Exception { dir = FSDirectory.open(Paths.get("E:\\lucene3")); IndexReader reader = DirectoryReader.open(dir); IndexSearcher is = new IndexSearcher(reader); String searchField = "title"; String q = "java"; Term t = new Term(searchField, q); Query query = new TermQuery(t); TopDocs hits = is.search(query, 10); System.out.println("匹配 '" + q + "',总共查询到" + hits.totalHits + "个文档"); for (ScoreDoc scoreDoc : hits.scoreDocs) { Document doc = is.doc(scoreDoc.doc); System.out.println(doc.get("author")); } reader.close(); } } 执行结果 未使用加权: 匹配 'java',总共查询到4个文档 John Jack Marry Json 使用加权: 匹配 'java',总共查询到4个文档 Json John Jack Marry
下一篇: 教你搞定php逻辑运算符的短路运算