欢迎您访问程序员文章站！本站旨在为大家提供和分享程序员计算机编程知识。
您现在的位置是: 首页

使用 Lucene 为本地文件建立索引并进行搜索

程序员文章站 2022-05-13 16:26:32
...

 

 

添加如下的maven依赖:

		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-core</artifactId>
			<version>6.1.0</version>
		</dependency>
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-queryparser</artifactId>
			<version>6.1.0</version>
		</dependency>
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-analyzers-common</artifactId>
			<version>6.1.0</version>
		</dependency>
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-demo</artifactId>
			<version>6.1.0</version>
		</dependency>

 

 

例子:

 

package com.tch.test.lucene;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.Date;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

/**
 * Small Lucene demo: indexes the files below a directory into an on-disk
 * index and then runs a query-parser search against that index, printing the
 * hits to standard output.
 */
public class MyLuceneTest {
	
	/**
	 * Name of the index field that holds the file content.
	 */
	public static final String CONTENT_INDEX_NAME = "contents";
	/**
	 * Name of the index field that holds the file's last-modified time.
	 */
	public static final String MODIFIED_INDEX_NAME = "modified";
	/**
	 * Name of the index field that holds the file path.
	 */
	public static final String PATH_INDEX_NAME = "path";

	/**
	 * Indexes a hard-coded source directory and then searches the resulting
	 * index for a hard-coded term.
	 *
	 * @param args unused
	 * @throws Exception if indexing or searching fails
	 */
	public static void main(String[] args) throws Exception {
		String indexPath = "/media/tch/disk1/study/temp/lucene-index";
		index("/media/tch/disk1/study/mytigase", indexPath);
		search(indexPath, "handleRichMJPacket");
	}

	/**
	 * Builds (or updates) the index for a file or directory tree.
	 *
	 * @param filePath file or directory to be indexed
	 * @param indexPath directory in which the index is stored
	 * @throws IOException if the index cannot be opened or written
	 */
	public static void index(String filePath, String indexPath) throws IOException {
		System.out.println("开始在目录 '" + indexPath + "' 下面建立索引文件...");
		final Path path = Paths.get(filePath);
		long startMillis = System.currentTimeMillis();
		// try-with-resources: the writer is closed even when indexing fails part-way.
		try (IndexWriter writer = getIndexWriter(indexPath)) {
			index4Folder(writer, path);
		}
		System.out.println("创建索引一共用了" + (System.currentTimeMillis() - startMillis) / 1000 + " 秒");
	}
	
	/**
	 * Opens an {@link IndexWriter} on the given index directory, configured by
	 * {@link #getIndexWriterConfig()}. The caller is responsible for closing it.
	 *
	 * @param indexPath directory in which the index is stored
	 * @return a new writer for that directory
	 * @throws IOException if the directory or the writer cannot be opened
	 */
	public static IndexWriter getIndexWriter(String indexPath) throws IOException{
		return new IndexWriter(FSDirectory.open(Paths.get(indexPath)), getIndexWriterConfig());
	}
	
	/**
	 * Creates the writer configuration: the analyzer from {@link #getAnalyzer()}
	 * and open mode {@link OpenMode#CREATE_OR_APPEND}.
	 *
	 * @return a fresh configuration instance
	 */
	public static IndexWriterConfig getIndexWriterConfig(){
		IndexWriterConfig indexWriterConfig = new IndexWriterConfig(getAnalyzer());
		indexWriterConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
		return indexWriterConfig;
	}

	/**
	 * Indexes every file below {@code path} when it is a directory, or
	 * {@code path} itself otherwise.
	 *
	 * <p>While walking a directory, a file whose indexing fails with an
	 * {@link IOException} is reported on standard error and skipped; the walk
	 * continues with the next file.
	 *
	 * @param writer writer that receives the documents
	 * @param path file or directory to index
	 * @throws IOException if the directory walk itself fails, or if
	 *         {@code path} is a single file that cannot be indexed
	 */
	public static void index4Folder(final IndexWriter writer, Path path) throws IOException {
		if (Files.isDirectory(path)) {
			Files.walkFileTree(path, new SimpleFileVisitor<Path>() {
				@Override
				public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
					try {
						index4File(writer, file.toUri().getPath(), attrs.lastModifiedTime().toMillis());
					} catch (IOException e) {
						// Best effort: a file that cannot be indexed must not abort
						// the whole walk, but the skip is reported rather than hidden.
						System.err.println("无法为文件 " + file + " 创建索引, 已跳过: " + e);
					}
					return FileVisitResult.CONTINUE;
				}
			});
		} else {
			index4File(writer, path.toUri().getPath(), Files.getLastModifiedTime(path).toMillis());
		}
	}

	/**
	 * Indexes a single file as one document with three fields: the path
	 * (stored, as a {@link StringField}), the last-modified time (as a
	 * {@link LongPoint}) and the full file content (stored, as a
	 * {@link TextField}).
	 *
	 * @param writer writer that receives the document
	 * @param filePath path of the file to index
	 * @param lastModified last-modified time of the file in milliseconds
	 * @throws IOException if the file cannot be read or the index cannot be written
	 */
	public static void index4File(IndexWriter writer, String filePath, long lastModified) throws IOException {
		System.out.println("开始为文件 " + filePath + " 创建索引");
		Document document = new Document();
		// File path
		document.add(new StringField(PATH_INDEX_NAME, filePath, Field.Store.YES));
		// Last-modified time
		document.add(new LongPoint(MODIFIED_INDEX_NAME, lastModified));
		// File content
		document.add(new TextField(CONTENT_INDEX_NAME, getFileContent(filePath), Store.YES));
		if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
			// Writer configured with CREATE: simply add the document.
			writer.addDocument(document);
		} else {
			// Otherwise replace any document already indexed under the same path.
			// The term must use the same field name the path was indexed under.
			writer.updateDocument(new Term(PATH_INDEX_NAME, filePath), document);
		}
	}
	
	/**
	 * Reads a file as UTF-8 text and returns its content with every line
	 * terminated by {@code "\r\n"}.
	 *
	 * @param filePath path of the file to read
	 * @return the file content; empty if the file has no lines
	 * @throws IOException if the file cannot be opened or read
	 */
	public static String getFileContent(String filePath) throws IOException{
		StringBuilder builder = new StringBuilder();
		try (BufferedReader bufferedReader = new BufferedReader(
				new InputStreamReader(new FileInputStream(filePath), StandardCharsets.UTF_8))) {
			String line;
			while ((line = bufferedReader.readLine()) != null) {
				builder.append(line);
				builder.append("\r\n");
			}
		}
		return builder.toString();
	} 

	/**
	 * Searches the {@link #CONTENT_INDEX_NAME} field of the index and prints
	 * the hits.
	 *
	 * @param indexPath directory in which the index is stored
	 * @param targetString query string to search for
	 * @throws Exception if the query cannot be parsed or the index cannot be read
	 */
	public static void search(String indexPath, String targetString) throws Exception {
		search(indexPath, targetString, CONTENT_INDEX_NAME);
	}
	
	/**
	 * Searches the given field of the index and prints the hits.
	 *
	 * @param indexPath directory in which the index is stored
	 * @param searchStr query string to search for
	 * @param indexField default field the query is parsed against
	 * @throws Exception if the query cannot be parsed or the index cannot be read
	 */
	public static void search(String indexPath, String searchStr, String indexField) throws Exception {
		// Parse first, so that an invalid query never opens the index.
		Query query = getQuery(searchStr, indexField);
		// try-with-resources: the reader is closed even when the search fails.
		try (IndexReader reader = getIndexReader(indexPath)) {
			IndexSearcher searcher = new IndexSearcher(reader);
			doSearch(searcher, query);
		}
	}
	
	/**
	 * Parses a query string with a {@link QueryParser} that uses the analyzer
	 * from {@link #getAnalyzer()}.
	 *
	 * @param searchStr query string to parse
	 * @param indexField default field for terms in the query
	 * @return the parsed query
	 * @throws ParseException if the query string cannot be parsed
	 */
	public static Query getQuery(String searchStr, String indexField) throws ParseException{
		QueryParser parser = new QueryParser(indexField, getAnalyzer());
		return parser.parse(searchStr);
	}
	
	/**
	 * Opens an {@link IndexReader} on the given index directory. The caller is
	 * responsible for closing it.
	 *
	 * @param indexPath directory in which the index is stored
	 * @return a reader for that directory
	 * @throws IOException if the index cannot be opened
	 */
	public static IndexReader getIndexReader(String indexPath) throws IOException{
		return DirectoryReader.open(FSDirectory.open(Paths.get(indexPath)));
	}
	
	/**
	 * Creates the analyzer used both for indexing and for query parsing.
	 *
	 * @return a new {@link StandardAnalyzer}
	 */
	public static Analyzer getAnalyzer(){
		return new StandardAnalyzer();
	}
	
	/**
	 * Runs the query, fetching at most 50 hits, and prints the stored path and
	 * content of each hit.
	 *
	 * @param searcher searcher to run the query on
	 * @param query query to execute
	 * @throws IOException if the index cannot be read
	 */
	public static void doSearch(IndexSearcher searcher, Query query) throws IOException {
		TopDocs results = searcher.search(query, 50);
		ScoreDoc[] hits = results.scoreDocs;
		System.out.println("一共搜索到 " + results.totalHits + " 条结果, 下面展示 " + hits.length + "条结果");
		for (int i = 0; i < hits.length; i++) {
			Document document = searcher.doc(hits[i].doc);
			System.out.println((i + 1) + ". 文件路径: " + document.get(PATH_INDEX_NAME));
			System.out.println((i + 1) + ". 文件内容: " + document.get(CONTENT_INDEX_NAME));
		}
	}
}