lucene全文检索

程序员文章站 2022-07-01 21:48:38
...
lucene全文检索

可识别txt,doc,docx,xls,xlsx,pdf

lucene全文检索
package com.abc;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.poi.xwpf.usermodel.XWPFTableCell;
import org.apache.poi.xwpf.usermodel.XWPFTableRow;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * Lucene full-text indexing and search over a directory tree.
 *
 * <p>Text is extracted from txt, doc, docx, xls, xlsx and pdf files, indexed with the
 * IK analyzer into the fields {@code filename}, {@code content} and {@code path}, and
 * searched on the {@code content} field.
 *
 * @author ymy
 */
public class IndexManager{
    /** File extensions (without the dot) that this class can convert to text. */
    private static final String[] SUPPORTED_EXTENSIONS = {"txt", "doc", "docx", "xls", "xlsx", "pdf"};

    /**
     * Indexes every supported file found (recursively) under {@code DATA_DIR}.
     *
     * <p>The index is opened once for the whole run and always closed again, so the
     * Lucene write lock is released even when indexing fails part-way.
     *
     * @param DATA_DIR  directory whose files are indexed
     * @param INDEX_DIR directory that holds the Lucene index; created if missing
     * @return {@code true} if every file was added and committed; {@code false} if the
     *         index could not be opened/committed or any document could not be added
     */
    public static boolean createIndex(String DATA_DIR,String INDEX_DIR){
        long start = System.currentTimeMillis();
        List<File> fileList = getFileList(DATA_DIR);
        System.out.println("文件数量："+fileList.size());
        boolean success = true;
        if(!fileList.isEmpty()){
            File indexFile = new File(INDEX_DIR);
            if (!indexFile.exists()) {
                indexFile.mkdirs();
            }
            Directory directory = null;
            IndexWriter indexWriter = null;
            try{
                directory = FSDirectory.open(indexFile);
                Analyzer analyzer = new IKAnalyzer();
                IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_40, analyzer);
                indexWriter = new IndexWriter(directory, config);
                for (File file : fileList) {
                    String type = getExtension(file.getName());
                    System.out.println("type :"+type);
                    String content = extractContent(file, type);
                    try{
                        Document document = new Document();
                        document.add(new TextField("filename", file.getName(), Store.YES));
                        document.add(new TextField("content", content, Store.YES));
                        document.add(new TextField("path", file.getPath(), Store.YES));
                        indexWriter.addDocument(document);
                    }catch(Exception e){
                        // Best effort: one bad document must not stop the remaining files.
                        e.printStackTrace();
                        success = false;
                    }
                }
                indexWriter.commit();
            }catch(Exception e){
                e.printStackTrace();
                success = false;
            }finally{
                closeQuietly(indexWriter);
                closeQuietly(directory);
            }
        }
        long elapsed = System.currentTimeMillis() - start;
        System.out.println("创建索引-----耗时：" + elapsed + "ms\n");
        return success;
    }

    /**
     * Converts one file to plain text according to its extension.
     *
     * @param file file to read
     * @param type file extension without the dot
     * @return the extracted text; never {@code null} (empty when nothing could be read)
     */
    private static String extractContent(File file, String type){
        String content = null;
        if("txt".equalsIgnoreCase(type)){
            content = txt2String(file);
        }else if("doc".equalsIgnoreCase(type) || "docx".equalsIgnoreCase(type)){
            content = doc2String(file,type);
        }else if("xls".equalsIgnoreCase(type) || "xlsx".equalsIgnoreCase(type)){
            content = xls2String(file);
        }else if("pdf".equalsIgnoreCase(type)){
            try {
                content = pdf2String(file);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        // pdf2String returns null on failure; never let the literal "null" reach the index.
        return content == null ? "" : content;
    }

    /**
     * Returns the text after the last dot of a file name.
     *
     * @param fileName simple file name
     * @return the extension without the dot, or "" when the name has no dot, starts
     *         with its only dot (e.g. ".txt") or ends with a dot
     */
    private static String getExtension(String fileName){
        int dot = fileName.lastIndexOf('.');
        if (dot <= 0 || dot == fileName.length() - 1) {
            return "";
        }
        return fileName.substring(dot + 1);
    }

    /**
     * Closes a resource, printing (not propagating) any failure.
     *
     * @param resource resource to close; may be {@code null}
     */
    private static void closeQuietly(Closeable resource){
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads a text file using the platform default charset.
     *
     * @param file file to read
     * @return the file content with a "\n" written before every line; whatever was read
     *         before a failure is still returned
     */
    public static String txt2String(File file){
        StringBuilder result = new StringBuilder();
        BufferedReader br = null;
        try{
            br = new BufferedReader(new FileReader(file));
            String line;
            while((line = br.readLine())!=null){
                result.append('\n').append(line);
            }
        }catch(Exception e){
            e.printStackTrace();
        }finally{
            closeQuietly(br);
        }
        return result.toString();
    }

    /**
     * Reads the text of a Word document.
     *
     * @param file file to read
     * @param type "doc" for the binary format or "docx" for the XML format
     *             (case-insensitive); any other value yields ""
     * @return the document text, or "" if it could not be read
     */
    public static String doc2String(File file,String type){
        String result = "";
        try{
            if("doc".equalsIgnoreCase(type)){
                InputStream fis = new FileInputStream(file);
                try{
                    HWPFDocument doc = new HWPFDocument(fis);
                    Range rang = doc.getRange();
                    result = rang.text();
                }finally{
                    closeQuietly(fis);
                }
            }else if("docx".equalsIgnoreCase(type)){
                // readByDocx opens (and closes) its own stream.
                result = readByDocx(file);
            }
        }catch(Exception e){
            e.printStackTrace();
        }
        return result;
    }

    /**
     * Reads a .docx file through XWPFDocument: first all body paragraphs, then the
     * cells of every table.
     *
     * <p>Each paragraph is followed by a newline, each table cell by a space and each
     * table row by a newline, so that neighbouring pieces of text are not fused into
     * one token when the result is analyzed.
     *
     * @param file file to read
     * @return the extracted text
     * @throws Exception if the file cannot be opened or parsed
     */
    public static String readByDocx(File file) throws Exception {
        StringBuilder result = new StringBuilder();
        InputStream is = new FileInputStream(file);
        try{
            XWPFDocument doc = new XWPFDocument(is);
            for (XWPFParagraph para : doc.getParagraphs()) {
                result.append(para.getText()).append('\n');
            }
            for (XWPFTable table : doc.getTables()) {
                List<XWPFTableRow> rows = table.getRows();
                if (rows == null) {
                    continue;
                }
                for (XWPFTableRow row : rows) {
                    List<XWPFTableCell> cells = row.getTableCells();
                    if (cells == null) {
                        continue;
                    }
                    for (XWPFTableCell cell : cells) {
                        result.append(cell.getText()).append(' ');
                    }
                    result.append('\n');
                }
            }
        }finally{
            closeQuietly(is);
        }
        return result.toString();
    }

    /**
     * Extracts the text of all pages of a PDF, sorted by position on the page.
     *
     * @param file file to read
     * @return the extracted text, or {@code null} if the file could not be read
     * @throws Exception declared for compatibility; failures are printed and reported
     *                   through the {@code null} return value
     */
    public static String pdf2String(File file) throws Exception {
        PDDocument document = null;
        try{
            document = PDDocument.load(file);
            int pages = document.getNumberOfPages();
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setSortByPosition(true);
            stripper.setStartPage(1);
            stripper.setEndPage(pages);
            return stripper.getText(document);
        }catch(Exception e){
            e.printStackTrace();
        }finally{
            // Also runs on failure, so the document is never left open.
            closeQuietly(document);
        }
        return null;
    }

    /**
     * Reads the cell values of every sheet of an Excel workbook.
     *
     * <p>All rows that exist in a sheet are visited, including those after a blank row.
     * Each cell is followed by a space and each row by a newline.
     *
     * @param file file to read; an "xls" extension selects the binary format, anything
     *             else is read as xlsx
     * @return the extracted text, or "" if the workbook could not be read
     */
    public static String xls2String(File file){
        String result = "";
        FileInputStream fis = null;
        try{
            fis = new FileInputStream(file);
            Workbook workbook;
            if("xls".equalsIgnoreCase(getExtension(file.getName()))){
                workbook = new HSSFWorkbook(fis);
            }else{
                workbook = new XSSFWorkbook(fis);
            }
            StringBuilder sb = new StringBuilder();
            int sheetnum = workbook.getNumberOfSheets();
            for(int k=0;k<sheetnum;k++){
                Sheet sheet = workbook.getSheetAt(k);
                // Iterating the sheet yields only the rows that exist, so a gap in the
                // row numbers no longer ends the scan.
                for (Row row : sheet) {
                    int lastCell = row.getLastCellNum();
                    for(int j=0;j<lastCell; j++){
                        sb.append(getCellValue(row.getCell(j))).append(' ');
                    }
                    sb.append('\n');
                }
            }
            result = sb.toString();
        }catch(Exception e){
            e.printStackTrace();
        }finally{
            closeQuietly(fis);
        }
        return result;
    }

    /**
     * Returns the content of a cell as text.
     *
     * @param cell cell to read; may be {@code null}
     * @return the cell text; "" for a {@code null} or blank cell, the formula text for a
     *         formula cell
     */
    public static String getCellValue(Cell cell){
        String cellValue = "";
        if(cell == null){
            return cellValue;
        }
        // Read numbers as strings so that 1 is not rendered as 1.0.
        if(cell.getCellType() == Cell.CELL_TYPE_NUMERIC){
            cell.setCellType(Cell.CELL_TYPE_STRING);
        }
        switch (cell.getCellType()){
            case Cell.CELL_TYPE_NUMERIC: // number
                cellValue = String.valueOf(cell.getNumericCellValue());
                break;
            case Cell.CELL_TYPE_STRING: // string
                cellValue = String.valueOf(cell.getStringCellValue());
                break;
            case Cell.CELL_TYPE_BOOLEAN: // boolean
                cellValue = String.valueOf(cell.getBooleanCellValue());
                break;
            case Cell.CELL_TYPE_FORMULA: // formula
                cellValue = String.valueOf(cell.getCellFormula());
                break;
            case Cell.CELL_TYPE_BLANK: // blank
                cellValue = "";
                break;
            case Cell.CELL_TYPE_ERROR: // error
                cellValue = "非法字符";
                break;
            default:
                cellValue = "未知类型";
                break;
        }
        return cellValue;
    }

    /**
     * Searches the {@code content} field of the index and reports the best hits.
     *
     * <p>For every hit four lines are written: file name, path, score, and the best
     * fragment of the stored content with the matched terms wrapped in
     * {@code <font color='red'>...</font>} (empty when no fragment is found). Hits are
     * separated by a blank line.
     *
     * @param INDEX_DIR directory that holds the Lucene index
     * @param text      query in Lucene query-parser syntax
     * @param topN      maximum number of hits to report
     * @return the formatted hits; "" when nothing matched or the search failed
     */
    public static String searchIndex(String INDEX_DIR,String text,int topN){
        System.out.println("开始检索..................");
        StringBuilder buffer = new StringBuilder();
        Directory directory = null;
        DirectoryReader ireader = null;
        try{
            directory = FSDirectory.open(new File(INDEX_DIR));
            Analyzer analyzer = new IKAnalyzer();
            ireader = DirectoryReader.open(directory);
            IndexSearcher isearcher = new IndexSearcher(ireader);
            QueryParser parser = new QueryParser(Version.LUCENE_40, "content", analyzer);
            Query query = parser.parse(text);

            TopDocs topDocs = isearcher.search(query, topN);
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;
            System.out.println("查找到的文档总共有："+topDocs.totalHits);

            Formatter formatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");
            QueryScorer fragmentScorer = new QueryScorer(query);
            Highlighter highlighter = new Highlighter(formatter, fragmentScorer);
            Fragmenter fragmenter = new SimpleFragmenter(100);
            highlighter.setTextFragmenter(fragmenter);

            for (ScoreDoc scoreDoc : scoreDocs) {
                Document document = isearcher.doc(scoreDoc.doc);
                String content = document.get("content");
                String fragment = null;
                if (content != null) {
                    // Highlight the stored document text, not the query string.
                    fragment = highlighter.getBestFragment(analyzer, "content", content);
                }
                buffer.append(document.get("filename")).append('\n');
                buffer.append(document.get("path")).append('\n');
                buffer.append(scoreDoc.score).append('\n');
                buffer.append(fragment == null ? "" : fragment).append("\n\n");
            }
        }catch(Exception e){
            e.printStackTrace();
        }finally{
            closeQuietly(ireader);
            closeQuietly(directory);
        }
        return buffer.toString();
    }

    /**
     * Collects all supported files under a directory, descending into sub-directories.
     *
     * @param dirPath directory to scan
     * @return the supported files found; empty if the directory is missing or unreadable
     */
    public static List<File> getFileList(String dirPath) {
        List<File> fileList = new ArrayList<File>();
        getFileList2(dirPath,fileList);
        return fileList;
    }

    /**
     * Recursive worker of {@link #getFileList(String)}.
     *
     * @param path     directory to scan
     * @param fileList list that receives the supported files
     * @return {@code fileList}
     */
    public static List<File> getFileList2(String path,List<File> fileList) {
        // listFiles() is null when the path is missing, is not a directory or cannot be read.
        File[] files = new File(path).listFiles();
        if (files == null) {
            return fileList;
        }
        for (File child : files) {
            if (child.isDirectory()) {
                System.out.println("文件夹:" + child.getAbsolutePath());
                getFileList2(child.getAbsolutePath(),fileList);
            } else {
                System.out.println("文件:" + child.getAbsolutePath());
                if (isTxtFile(child.getName())) {
                    fileList.add(child);
                }
            }
        }
        return fileList;
    }

    /**
     * Tells whether a file can be indexed, judged by its extension
     * (txt, doc, docx, xls, xlsx or pdf, case-insensitive).
     *
     * <p>Only the text after the last dot counts, so "a.txt.bak" is rejected.
     *
     * @param fileName simple file name
     * @return {@code true} if the extension is supported; {@code false} otherwise,
     *         including for {@code null}
     */
    public static boolean isTxtFile(String fileName) {
        if (fileName == null) {
            return false;
        }
        String extension = getExtension(fileName);
        for (String supported : SUPPORTED_EXTENSIONS) {
            if (supported.equalsIgnoreCase(extension)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Deletes a file, or a directory together with everything below it.
     *
     * @param file file or directory to delete
     * @return {@code true} if nothing is left at that path afterwards (including when
     *         it did not exist in the first place); {@code false} if deletion failed
     */
    public static boolean deleteDir(File file){
        if(file.isDirectory()){
            File[] files = file.listFiles();
            if(files != null){
                for (File child : files) {
                    deleteDir(child);
                }
            }
        }
        file.delete();
        return !file.exists();
    }

    /**
     * Demo: rebuilds the index of D:/data in D:/index from scratch, then runs one
     * query and prints the hits.
     *
     * @param args unused
     */
    public static void main(String[] args){
        String DATA_DIR = "D:/data";
        String INDEX_DIR = "D:/index";
        File fileIndex = new File(INDEX_DIR);
        // Start from an empty index directory on every run.
        deleteDir(fileIndex);
        fileIndex.mkdir();
        int topN = 100; // number of hits to show
        createIndex(DATA_DIR,INDEX_DIR);
        System.out.println(searchIndex(INDEX_DIR,"检索关键词",topN));
    }
}
lucene全文检索

lucene全文检索

可识别txt,doc,docx,xls,xlsx,pdf

解决MySQL数据库中文模糊检索问题的方法

对JavaScript的全文搜索实现相关度评分的功能的方法

关于Sphinx创建全文检索的索引介绍

oracle10g全文索引自动同步语句使用方法

SQL必知必会笔记检索和排序数据

Lucene.Net实现搜索结果分类统计功能(中小型网站)

巧用站内检索深挖需求提升销量

Spring Boot与Kotlin 整合全文搜索引擎Elasticsearch的示例代码

在Python的Flask框架中实现全文搜索功能

Linux中利用grep命令如何检索文件内容详解