使用 Lucene 为本地文件建立索引并进行搜索
程序员文章站
2022-05-13 16:26:32
...
添加如下的 Maven 依赖:
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-core</artifactId>
    <version>6.1.0</version>
</dependency>
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-queryparser</artifactId>
    <version>6.1.0</version>
</dependency>
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-analyzers-common</artifactId>
    <version>6.1.0</version>
</dependency>
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-demo</artifactId>
    <version>6.1.0</version>
</dependency>
例子:
package com.tch.test.lucene; import java.io.BufferedReader; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; import java.nio.file.FileVisitResult; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.SimpleFileVisitor; import java.nio.file.attribute.BasicFileAttributes; import java.util.Date; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.LongPoint; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.index.Term; import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.FSDirectory; public class MyLuceneTest { /** * '文件内容'的索引名词 */ public static final String CONTENT_INDEX_NAME = "contents"; /** * '文件修改时间'的索引名词 */ public static final String MODIFIED_INDEX_NAME = "modified"; /** * '文件路径'的索引名词 */ public static final String PATH_INDEX_NAME = "path"; public static void main(String[] args) throws Exception { String indexPath = "/media/tch/disk1/study/temp/lucene-index"; index("/media/tch/disk1/study/mytigase", indexPath); search(indexPath, "handleRichMJPacket"); } /** * 建立索引 * @param filePath 被索引的目录 * @param indexPath 索引存放目录 * @param create * @throws 
IOException */ public static void index(String filePath, String indexPath) throws IOException { System.out.println("开始在目录 '" + indexPath + "' 下面建立索引文件..."); final Path path = Paths.get(filePath); Date start = new Date(); //获取IndexWriter IndexWriter writer = getIndexWriter(indexPath); index4Folder(writer, path); // writer.forceMerge(1); writer.close(); System.out.println("创建索引一共用了" + (new Date().getTime() - start.getTime())/1000 + " 秒"); } /** * 获取IndexWriter * @param docDir * @param indexPath * @return * @throws IOException */ public static IndexWriter getIndexWriter(String indexPath) throws IOException{ return new IndexWriter(FSDirectory.open(Paths.get(indexPath)), getIndexWriterConfig()); } /** * 获取IndexWriterConfig * @return */ public static IndexWriterConfig getIndexWriterConfig(){ IndexWriterConfig indexWriterConfig = new IndexWriterConfig(getAnalyzer()); indexWriterConfig.setOpenMode(OpenMode.CREATE_OR_APPEND); return indexWriterConfig; } /** * 为指定目录的所有文件创建索引 * @param writer * @param path * @throws IOException */ public static void index4Folder(final IndexWriter writer, Path path) throws IOException { if (Files.isDirectory(path)) { Files.walkFileTree(path, new SimpleFileVisitor<Path>() { @Override public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { try { index4File(writer, file.toUri().getPath(), attrs.lastModifiedTime().toMillis()); } catch (IOException ignore) { // don't index files that can't be read. 
} return FileVisitResult.CONTINUE; } }); } else { index4File(writer, path.toUri().getPath(), Files.getLastModifiedTime(path).toMillis()); } } /** * 为单个文件创建索引 * @param writer * @param file * @param lastModified * @throws IOException */ public static void index4File(IndexWriter writer, String filePath, long lastModified) throws IOException { System.out.println("开始为文件 " + filePath + " 创建索引"); Document document = new Document(); //文件路径 document.add(new StringField(PATH_INDEX_NAME, filePath, Field.Store.YES)); //修改时间 document.add(new LongPoint(MODIFIED_INDEX_NAME, lastModified)); //文件内容 document.add(new TextField(CONTENT_INDEX_NAME, getFileContent(filePath), Store.YES)); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { //添加doc writer.addDocument(document); } else { //更新doc writer.updateDocument(new Term("path", filePath), document); } } public static String getFileContent(String filePath) throws IOException{ StringBuilder builder = new StringBuilder(""); BufferedReader bufferedReader = null; try { bufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(filePath), StandardCharsets.UTF_8)); String line = null; while((line = bufferedReader.readLine()) != null){ builder.append(line); builder.append("\r\n"); } } finally { if(bufferedReader != null){ bufferedReader.close(); } } return builder.toString(); } /** * 通过索引搜索字符串 * @param indexPath * @param targetString * @throws Exception */ public static void search(String indexPath, String targetString) throws Exception { //在索引的contents字段上面进行搜索 String indexField = "contents"; search(indexPath, targetString, indexField); } /** * 通过索引搜索字符串 * @param indexPath 索引存放路径 * @param searchStr 要搜索的字符串 * @throws Exception */ public static void search(String indexPath, String searchStr, String indexField) throws Exception { //解析器 QueryParser parser = new QueryParser(indexField, getAnalyzer()); //Query Query query = parser.parse(searchStr); //索引reader IndexReader reader = getIndexReader(indexPath); //索引搜索器 
IndexSearcher searcher = new IndexSearcher(reader); doSearch(searcher, query); reader.close(); } /** * 根据搜索字符串和索引列名称获取Query * @param searchStr * @param indexField * @return * @throws ParseException */ public static Query getQuery(String searchStr, String indexField) throws ParseException{ //解析器 QueryParser parser = new QueryParser(indexField, getAnalyzer()); return parser.parse(searchStr); } /** * 获取IndexReader * @param indexPath 索引存放路径 * @return * @throws IOException */ public static IndexReader getIndexReader(String indexPath) throws IOException{ return DirectoryReader.open(FSDirectory.open(Paths.get(indexPath))); } /** * 获取Analyzer * @return */ public static Analyzer getAnalyzer(){ return new StandardAnalyzer(); } /** * 执行搜索 * @param searcher * @param query * @throws IOException */ public static void doSearch(IndexSearcher searcher, Query query) throws IOException { TopDocs results = searcher.search(query, 50); ScoreDoc[] hits = results.scoreDocs; System.out.println("一共搜索到 " + results.totalHits + " 条结果, 下面展示 " + hits.length + "条结果"); for (int i = 0; i < hits.length; i++) { //System.out.println("doc=" + hits[i].doc + " score=" + hits[i].score); Document document = searcher.doc(hits[i].doc); System.out.println((i + 1) + ". 文件路径: " + document.get(PATH_INDEX_NAME)); System.out.println((i + 1) + ". 文件内容: " + document.get(CONTENT_INDEX_NAME)); } } }
上一篇: elastic search index和type
下一篇: php获取汉字首字母
推荐阅读