lucene全文检索
程序员文章站
2022-07-01 21:48:38
...
lucene全文检索
可识别txt,doc,docx,xls,xlsx,pdf
package com.abc;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.poi.xwpf.usermodel.XWPFTableCell;
import org.apache.poi.xwpf.usermodel.XWPFTableRow;
import org.wltea.analyzer.lucene.IKAnalyzer;
/**
 * Lucene full-text indexing and search for txt, doc, docx, xls, xlsx and pdf files.
 *
 * @author ymy
 */
public class IndexManager{
/**
 * Builds a Lucene index for every supported file found under the data directory.
 *
 * <p>Each file becomes one document with three stored text fields:
 * {@code filename}, {@code content} and {@code path}. One {@link IndexWriter}
 * is shared by all files and is closed in a {@code finally} block, so a failure
 * on a single file neither aborts the run nor leaves the writer open.
 * Nothing is opened or written when no files are found.
 *
 * @param DATA_DIR  directory that is scanned for files to index
 * @param INDEX_DIR directory in which the index is stored; created if missing
 * @return {@code true} if all documents were added and committed,
 *         {@code false} if the index could not be opened or any write failed
 */
public static boolean createIndex(String DATA_DIR,String INDEX_DIR){
    long startMillis = System.currentTimeMillis();
    List<File> fileList = getFileList(DATA_DIR);
    System.out.println("文件数量:"+fileList.size());
    boolean success = true;
    if (!fileList.isEmpty()) {
        Directory directory = null;
        IndexWriter indexWriter = null;
        try {
            File indexFile = new File(INDEX_DIR);
            if (!indexFile.exists()) {
                indexFile.mkdirs();
            }
            directory = FSDirectory.open(indexFile);
            Analyzer analyzer = new IKAnalyzer();
            IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_40, analyzer);
            // One writer for the whole batch instead of one per file.
            indexWriter = new IndexWriter(directory, config);
            for (File file : fileList) {
                try {
                    Document document = new Document();
                    document.add(new TextField("filename", file.getName(), Store.YES));
                    document.add(new TextField("content", extractContent(file), Store.YES));
                    document.add(new TextField("path", file.getPath(), Store.YES));
                    indexWriter.addDocument(document);
                } catch (Exception e) {
                    // Best effort: report the failure and continue with the next file.
                    success = false;
                    e.printStackTrace();
                }
            }
            indexWriter.commit();
        } catch (Exception e) {
            success = false;
            e.printStackTrace();
        } finally {
            if (indexWriter != null) {
                try {
                    indexWriter.close();
                } catch (Exception e) {
                    success = false;
                    e.printStackTrace();
                }
            }
            if (directory != null) {
                try {
                    directory.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
    }
    System.out.println("创建索引-----耗时:" + (System.currentTimeMillis() - startMillis) + "ms\n");
    return success;
}

/**
 * Extracts the text of one file, choosing the reader by the file name's extension.
 *
 * @param file file to read
 * @return the extracted text; never {@code null} (empty for unknown extensions
 *         or when the reader returned {@code null})
 */
private static String extractContent(File file) {
    String name = file.getName();
    // Text after the last dot; the whole name when there is no dot.
    String type = name.substring(name.lastIndexOf(".") + 1);
    System.out.println("type :"+type);
    String content = null;
    if ("txt".equalsIgnoreCase(type)) {
        content = txt2String(file);
    } else if ("doc".equalsIgnoreCase(type) || "docx".equalsIgnoreCase(type)) {
        content = doc2String(file, type);
    } else if ("xls".equalsIgnoreCase(type) || "xlsx".equalsIgnoreCase(type)) {
        content = xls2String(file);
    } else if ("pdf".equalsIgnoreCase(type)) {
        try {
            content = pdf2String(file);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    // Guard against a null result so the literal text "null" is never indexed.
    return content == null ? "" : content;
}
/**
 * Reads a text file line by line.
 *
 * <p>Every line is prefixed with {@code "\n"}, so a non-empty result starts
 * with a newline. The file is decoded by {@link FileReader}, i.e. with the
 * JVM's default charset. I/O errors are printed and whatever was read before
 * the error is returned.
 *
 * @param file file to read
 * @return the file's text, or an empty string if nothing could be read
 */
public static String txt2String(File file){
    StringBuilder text = new StringBuilder();
    BufferedReader reader = null;
    try {
        reader = new BufferedReader(new FileReader(file));
        String line;
        while ((line = reader.readLine()) != null) {
            text.append("\n").append(line);
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Close on every path so a read error does not leak the file handle.
        if (reader != null) {
            try {
                reader.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
    return text.toString();
}
/**
 * Reads the text of a Word document.
 *
 * <p>{@code doc} files are read through {@link HWPFDocument}; {@code docx}
 * files are delegated to {@link #readByDocx(File)}. Errors are printed and an
 * empty string is returned.
 *
 * @param file file to read
 * @param type file extension, compared case-insensitively with {@code "doc"}
 *             and {@code "docx"}; any other value yields an empty string
 * @return the document text, or an empty string on failure
 */
public static String doc2String(File file,String type){
    String result = "";
    try {
        if ("doc".equalsIgnoreCase(type)) {
            // The stream is only needed for the binary format; docx opens its own.
            InputStream fis = new FileInputStream(file);
            try {
                HWPFDocument doc = new HWPFDocument(fis);
                Range range = doc.getRange();
                result = range.text();
            } finally {
                fis.close();
            }
        } else if ("docx".equalsIgnoreCase(type)) {
            result = readByDocx(file);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return result;
}
/**
 * Reads the text of a docx file through {@link XWPFDocument}.
 *
 * <p>The text of every paragraph is emitted first, followed by the text of
 * every table cell. Each paragraph and each cell is terminated with a newline
 * so that the last word of one piece and the first word of the next are not
 * fused into a single token when the text is analyzed.
 *
 * @param file docx file to read
 * @return the concatenated paragraph and table-cell text
 * @throws Exception if the file cannot be opened or parsed
 */
public static String readByDocx(File file) throws Exception {
    StringBuilder text = new StringBuilder();
    InputStream is = new FileInputStream(file);
    try {
        XWPFDocument doc = new XWPFDocument(is);
        for (XWPFParagraph para : doc.getParagraphs()) {
            text.append(para.getText()).append('\n');
        }
        for (XWPFTable table : doc.getTables()) {
            List<XWPFTableRow> rows = table.getRows();
            if (rows == null) {
                continue;
            }
            for (XWPFTableRow row : rows) {
                List<XWPFTableCell> cells = row.getTableCells();
                if (cells == null) {
                    continue;
                }
                for (XWPFTableCell cell : cells) {
                    text.append(cell.getText()).append('\n');
                }
            }
        }
    } finally {
        // Release the file handle even when parsing fails.
        is.close();
    }
    return text.toString();
}
/**
 * Extracts the text of all pages of a PDF file.
 *
 * <p>Failures are printed and reported as an empty string, matching the other
 * extractors in this class. The document is closed on every path.
 *
 * @param file PDF file to read
 * @return the extracted text, or an empty string if the file could not be read
 * @throws Exception kept in the signature for existing callers; failures while
 *                   loading or reading are caught and printed instead
 */
public static String pdf2String(File file) throws Exception {
    PDDocument document = null;
    try {
        document = PDDocument.load(file);
        PDFTextStripper stripper = new PDFTextStripper();
        // Emit text sorted by position on the page.
        stripper.setSortByPosition(true);
        stripper.setStartPage(1);
        stripper.setEndPage(document.getNumberOfPages());
        String content = stripper.getText(document);
        return content == null ? "" : content;
    } catch (Exception e) {
        System.out.println(e);
        return "";
    } finally {
        if (document != null) {
            try {
                document.close();
            } catch (Exception e) {
                System.out.println(e);
            }
        }
    }
}
/**
 * Reads the cell text of every sheet in an Excel workbook.
 *
 * <p>A path ending in {@code xls} (case-insensitive) is opened with
 * {@link HSSFWorkbook}, anything else with {@link XSSFWorkbook}. Rows are
 * visited from index 0 up to the sheet's last row number and missing rows are
 * skipped, so a gap in the sheet does not hide the rows below it. Cells are
 * separated by a space and rows by a newline so that neighbouring values are
 * not fused into one token. Errors are printed and an empty string is returned.
 *
 * @param file workbook file to read
 * @return the workbook's cell text, or an empty string on failure
 */
public static String xls2String(File file){
    String result = "";
    try {
        FileInputStream fis = new FileInputStream(file);
        try {
            StringBuilder sb = new StringBuilder();
            boolean isExcel2003 = file.getPath().toLowerCase().endsWith("xls");
            Workbook workbook;
            if (isExcel2003) {
                workbook = new HSSFWorkbook(fis);
            } else {
                workbook = new XSSFWorkbook(fis);
            }
            int sheetCount = workbook.getNumberOfSheets();
            for (int k = 0; k < sheetCount; k++) {
                Sheet sheet = workbook.getSheetAt(k);
                int lastRow = sheet.getLastRowNum();
                for (int i = 0; i <= lastRow; i++) {
                    Row row = sheet.getRow(i);
                    if (row == null) {
                        continue;
                    }
                    for (int j = 0; j < row.getLastCellNum(); j++) {
                        sb.append(getCellValue(row.getCell(j))).append(' ');
                    }
                    sb.append('\n');
                }
            }
            result = sb.toString();
        } finally {
            // The single stream handed to the workbook is closed on every path.
            fis.close();
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return result;
}
/**
 * Renders a spreadsheet cell as text.
 *
 * <p>A numeric cell is first switched to the string cell type (this modifies
 * the cell); the original author's note says this avoids reading 1 as 1.0.
 * The text is then chosen by the cell's resulting type.
 *
 * @param cell cell to render; may be {@code null}
 * @return the cell's text; an empty string for {@code null} or blank cells
 */
public static String getCellValue(Cell cell){
    if (cell == null) {
        return "";
    }
    if (Cell.CELL_TYPE_NUMERIC == cell.getCellType()) {
        cell.setCellType(Cell.CELL_TYPE_STRING);
    }
    final int kind = cell.getCellType();
    if (kind == Cell.CELL_TYPE_NUMERIC) {
        // Presumably not reached for cells converted above; kept from the original.
        return String.valueOf(cell.getNumericCellValue());
    }
    if (kind == Cell.CELL_TYPE_STRING) {
        return String.valueOf(cell.getStringCellValue());
    }
    if (kind == Cell.CELL_TYPE_BOOLEAN) {
        return String.valueOf(cell.getBooleanCellValue());
    }
    if (kind == Cell.CELL_TYPE_FORMULA) {
        // The formula text itself, not its evaluated value.
        return String.valueOf(cell.getCellFormula());
    }
    if (kind == Cell.CELL_TYPE_BLANK) {
        return "";
    }
    if (kind == Cell.CELL_TYPE_ERROR) {
        return "非法字符";
    }
    return "未知类型";
}
/**
 * Searches the {@code content} field of the index and returns the hits as text.
 *
 * <p>For each hit the result contains four lines followed by a blank line:
 * the stored {@code filename}, the stored {@code path}, the hit's score, and
 * the best fragment of the stored {@code content} with matching terms wrapped
 * in {@code <font color='red'>…</font>} (an empty line when no fragment is
 * found). Errors are printed; whatever was collected before the error is
 * returned. The reader and directory are closed on every path.
 *
 * @param INDEX_DIR directory that holds the index
 * @param text      query string, parsed with the classic {@link QueryParser}
 * @param topN      maximum number of hits to return
 * @return the formatted hits, or an empty string when nothing matched or the
 *         search failed
 */
public static String searchIndex(String INDEX_DIR,String text,int topN){
    System.out.println("开始检索..................");
    StringBuilder buffer = new StringBuilder();
    Directory directory = null;
    DirectoryReader ireader = null;
    try {
        directory = FSDirectory.open(new File(INDEX_DIR));
        Analyzer analyzer = new IKAnalyzer();
        ireader = DirectoryReader.open(directory);
        IndexSearcher isearcher = new IndexSearcher(ireader);
        // The second argument is the default field for the query.
        QueryParser parser = new QueryParser(Version.LUCENE_40, "content", analyzer);
        Query query = parser.parse(text);
        TopDocs topDocs = isearcher.search(query, topN);
        System.out.println("查找到的文档总共有:"+topDocs.totalHits);

        Formatter formatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");
        Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
        Fragmenter fragmenter = new SimpleFragmenter(100);
        highlighter.setTextFragmenter(fragmenter);

        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            Document document = isearcher.doc(scoreDoc.doc);
            String content = document.get("content");
            String fragment = null;
            if (content != null) {
                // Highlight the document's own text, not the query string.
                fragment = highlighter.getBestFragment(analyzer, "content", content);
            }
            buffer.append(document.get("filename")).append('\n')
                  .append(document.get("path")).append('\n')
                  .append(scoreDoc.score).append('\n')
                  .append(fragment == null ? "" : fragment).append("\n\n");
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        if (ireader != null) {
            try {
                ireader.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        if (directory != null) {
            try {
                directory.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
    return buffer.toString();
}
/**
 * Collects the files under a directory that pass the file-type filter.
 *
 * @param dirPath directory to scan
 * @return a new list holding the accepted files
 */
public static List<File> getFileList(String dirPath) {
    // getFileList2 fills the list it is given and hands the same list back.
    return getFileList2(dirPath, new ArrayList<File>());
}
/**
 * Recursively adds the files under {@code path} that pass
 * {@link #isTxtFile(String)} to {@code fileList}.
 *
 * <p>Nothing is added when {@code path} does not exist, is not a directory,
 * or cannot be listed.
 *
 * @param path     directory to scan
 * @param fileList list that receives the accepted files
 * @return the same {@code fileList} instance
 */
public static List<File> getFileList2(String path,List<File> fileList) {
    // listFiles() yields null for a missing path, a regular file or an I/O error.
    File[] entries = new File(path).listFiles();
    if (entries == null) {
        return fileList;
    }
    for (File entry : entries) {
        if (entry.isDirectory()) {
            System.out.println("文件夹:" + entry.getAbsolutePath());
            getFileList2(entry.getAbsolutePath(), fileList);
        } else {
            System.out.println("文件:" + entry.getAbsolutePath());
            if (isTxtFile(entry.getName())) {
                fileList.add(entry);
            }
        }
    }
    return fileList;
}
/**
 * Tells whether a file name has one of the supported extensions:
 * txt, xls, xlsx, doc, docx or pdf (case-insensitive).
 *
 * <p>The extension must be the end of the name and must be preceded by at
 * least one character, so {@code "my.docs.zip"}, {@code "a.txt.bak"} and a
 * bare {@code ".txt"} are all rejected.
 *
 * @param fileName file name to test
 * @return {@code true} if the name ends with a supported extension
 */
public static boolean isTxtFile(String fileName) {
    String[] extensions = {".txt", ".xls", ".xlsx", ".doc", ".docx", ".pdf"};
    for (String ext : extensions) {
        int start = fileName.length() - ext.length();
        // Case-insensitive suffix comparison without locale-dependent lowercasing.
        if (start > 0 && fileName.regionMatches(true, start, ext, 0, ext.length())) {
            return true;
        }
    }
    return false;
}
/**
 * Deletes a file, or a directory together with everything below it.
 *
 * @param file file or directory to delete
 * @return {@code true} if {@code file} no longer exists afterwards and every
 *         entry below it was deleted; {@code false} if a directory could not
 *         be listed or something could not be deleted
 */
public static boolean deleteDir(File file){
    boolean childrenDeleted = true;
    if (file.isDirectory()) {
        File[] children = file.listFiles();
        if (children == null) {
            // The directory could not be listed (I/O error or access denied).
            childrenDeleted = false;
        } else {
            for (File child : children) {
                if (!deleteDir(child)) {
                    childrenDeleted = false;
                }
            }
        }
    }
    boolean deleted = file.delete();
    // A path that never existed counts as successfully removed.
    return childrenDeleted && (deleted || !file.exists());
}
/**
 * Demo entry point: clears the index directory, indexes the data directory
 * and runs one sample search.
 *
 * @param args unused
 */
public static void main(String[] args){
    final String DATA_DIR = "D:/data";
    final String INDEX_DIR = "D:/index";
    final int topN = 100; // maximum number of results

    // Start from an empty index directory; it is recreated whatever deleteDir returns.
    File fileIndex = new File(INDEX_DIR);
    deleteDir(fileIndex);
    fileIndex.mkdir();

    createIndex(DATA_DIR, INDEX_DIR);
    searchIndex(INDEX_DIR, "检索关键词", topN);
}
}