欢迎您访问程序员文章站!本站旨在为大家提供并分享程序员与计算机编程相关的知识。
您现在的位置是: 首页

lucene全文检索

程序员文章站 2022-07-01 21:48:38
...

lucene全文检索

本示例可识别并索引以下格式的文件:txt、doc、docx、xls、xlsx、pdf。

lucene全文检索

Lucene 原理与代码分析完整版.pdf

package com.abc;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.poi.xwpf.usermodel.XWPFTableCell;
import org.apache.poi.xwpf.usermodel.XWPFTableRow;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * Lucene full-text indexing and search utility.
 *
 * <p>Walks a data directory, extracts plain text from txt, doc, docx, xls,
 * xlsx and pdf files, stores it in a Lucene index on disk and runs keyword
 * queries against that index with highlighted result fragments.
 *
 * <p>All methods are static. Errors are reported with
 * {@code printStackTrace()}, which is the convention used throughout this file.
 *
 * @author ymy
 */
public class IndexManager{
    /** File extensions (without the dot) that can be extracted and indexed. */
    private static final String[] SUPPORTED_EXTENSIONS = {"txt", "doc", "docx", "xls", "xlsx", "pdf"};

    /**
     * Builds (or appends to) the index for every supported file under a directory.
     *
     * <p>A single {@link IndexWriter} is opened for the whole run and is always
     * closed, so a failure on one file can neither leak the writer nor leave
     * the index write lock behind for the following files.
     *
     * @param DATA_DIR  directory that is scanned recursively for files to index
     * @param INDEX_DIR directory holding the Lucene index; created when missing
     * @return {@code true} when every file was added and the index was committed
     *         and closed without error (also when there was nothing to index);
     *         {@code false} otherwise
     */
    public static boolean createIndex(String DATA_DIR,String INDEX_DIR){
        long startMillis = System.currentTimeMillis();
        List<File> fileList = getFileList(DATA_DIR);
        System.out.println("文件数量:"+fileList.size());
        boolean success = true;
        if(!fileList.isEmpty()){
            Directory directory = null;
            IndexWriter indexWriter = null;
            try{
                File indexFile = new File(INDEX_DIR);
                if (!indexFile.exists()) {
                    indexFile.mkdirs();
                }
                directory = FSDirectory.open(indexFile);
                Analyzer analyzer = new IKAnalyzer();
                IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_40, analyzer);
                indexWriter = new IndexWriter(directory, config);
                for (File file : fileList) {
                    try{
                        indexWriter.addDocument(buildDocument(file));
                    }catch(Exception e){
                        // Best effort: one bad file must not abort the whole run.
                        success = false;
                        e.printStackTrace();
                    }
                }
                indexWriter.commit();
            }catch(Exception e){
                success = false;
                e.printStackTrace();
            }finally{
                if (indexWriter != null) {
                    try{
                        // Closing the writer also releases the index write lock.
                        indexWriter.close();
                    }catch(Exception e){
                        success = false;
                        e.printStackTrace();
                    }
                }
                closeQuietly(directory);
            }
        }
        System.out.println("创建索引-----耗时:" + (System.currentTimeMillis() - startMillis) + "ms\n");
        return success;
    }

    /** Builds the Lucene document (filename, content, path) for one file. */
    private static Document buildDocument(File file){
        Document document = new Document();
        document.add(new TextField("filename", file.getName(), Store.YES));
        document.add(new TextField("content", extractContent(file), Store.YES));
        document.add(new TextField("path", file.getPath(), Store.YES));
        return document;
    }

    /** Extracts the text of a file based on its extension; never returns {@code null}. */
    private static String extractContent(File file){
        String name = file.getName();
        // Everything after the last dot is treated as the file type.
        String type = name.substring(name.lastIndexOf(".")+1);
        System.out.println("type :"+type);
        String content = null;
        if("txt".equalsIgnoreCase(type)){
            content = txt2String(file);
        }else if("doc".equalsIgnoreCase(type) || "docx".equalsIgnoreCase(type)){
            content = doc2String(file,type);
        }else if("xls".equalsIgnoreCase(type) || "xlsx".equalsIgnoreCase(type)){
            content = xls2String(file);
        }else if("pdf".equalsIgnoreCase(type)){
            try {
                content = pdf2String(file);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        // pdf2String reports failure as null; index an empty text instead of
        // the literal string "null".
        return content == null ? "" : content;
    }

    /** Closes a resource, ignoring {@code null} and any failure raised by {@code close()}. */
    private static void closeQuietly(java.io.Closeable resource){
        if (resource == null) {
            return;
        }
        try{
            resource.close();
        }catch(Exception ignored){
            // Nothing useful can be done when closing fails.
        }
    }

    /**
     * Reads a text file line by line.
     *
     * <p>The file is decoded with the platform default charset (that is what
     * {@link FileReader} uses). Every line is prefixed with {@code "\n"}, so a
     * non-empty result starts with a line break.
     *
     * @param file the file to read
     * @return the file content; whatever was read before a failure, possibly empty
     */
    public static String txt2String(File file){
        StringBuilder result = new StringBuilder();
        BufferedReader br = null;
        try{
            br = new BufferedReader(new FileReader(file));
            String line;
            while((line = br.readLine())!=null){
                result.append("\n").append(line);
            }
        }catch(Exception e){
            e.printStackTrace();
        }finally{
            closeQuietly(br);
        }
        return result.toString();
    }

    /**
     * Reads the text of a Word document.
     *
     * @param file the file to read
     * @param type file extension: {@code "doc"} or {@code "docx"} (case-insensitive)
     * @return the document text, or an empty string for an unknown type or on failure
     */
    public static String doc2String(File file,String type){
        String result = "";
        InputStream fis = null;
        try{
            if("doc".equalsIgnoreCase(type)){
                fis = new FileInputStream(file);
                HWPFDocument doc = new HWPFDocument(fis);
                Range range = doc.getRange();
                result += range.text();
            }else if("docx".equalsIgnoreCase(type)){
                // readByDocx opens and closes its own stream.
                result = readByDocx(file);
            }
        }catch(Exception e){
            e.printStackTrace();
        }finally{
            closeQuietly(fis);
        }
        return result;
    }

    /**
     * Reads a docx file through {@link XWPFDocument}: all paragraphs first,
     * then the cells of all tables.
     *
     * <p>Paragraphs and table rows are terminated with a line break and table
     * cells with a tab, so that words from neighbouring paragraphs or cells are
     * not fused into a single token when the text is analyzed.
     *
     * @param file the docx file to read
     * @return the extracted text
     * @throws Exception if the file cannot be opened or parsed
     */
    public static String readByDocx(File file) throws Exception {
        StringBuilder result = new StringBuilder();
        InputStream is = new FileInputStream(file);
        try{
            XWPFDocument doc = new XWPFDocument(is);
            for (XWPFParagraph para : doc.getParagraphs()) {
                result.append(para.getText()).append("\n");
            }
            for (XWPFTable table : doc.getTables()) {
                List<XWPFTableRow> rows = table.getRows();
                if(rows==null){
                    continue;
                }
                for (XWPFTableRow row : rows) {
                    List<XWPFTableCell> cells = row.getTableCells();
                    if(cells==null){
                        continue;
                    }
                    for (XWPFTableCell cell : cells) {
                        result.append(cell.getText()).append("\t");
                    }
                    result.append("\n");
                }
            }
        }finally{
            closeQuietly(is);
        }
        return result.toString();
    }

    /**
     * Extracts the text of all pages of a PDF document, sorted by position.
     *
     * @param file the PDF file to read
     * @return the extracted text, or {@code null} when loading or extraction fails
     *         (the failure is printed to standard output)
     * @throws Exception declared for compatibility with existing callers;
     *         failures inside the method are caught and reported as {@code null}
     */
    public static String pdf2String(File file) throws Exception {
        PDDocument document = null;
        try
        {
            document=PDDocument.load(file);
            int pages = document.getNumberOfPages();
            PDFTextStripper stripper=new PDFTextStripper();
            // Emit the text in reading order rather than in content-stream order.
            stripper.setSortByPosition(true);
            stripper.setStartPage(1);
            stripper.setEndPage(pages);
            return stripper.getText(document);
        }
        catch(Exception e)
        {
            System.out.println(e);
            return null;
        }
        finally
        {
            // Close the document on the failure path as well.
            closeQuietly(document);
        }
    }

    /**
     * Reads the cell values of every sheet of an Excel workbook.
     *
     * <p>Files whose path ends with {@code xls} are opened as HSSF workbooks,
     * everything else as XSSF. All existing rows are visited, including rows
     * that follow a gap of empty rows. Cells are separated by a tab and rows by
     * a line break so that neighbouring values do not merge into one token.
     *
     * @param file the workbook file to read
     * @return the extracted text, or an empty string on failure
     */
    public static String xls2String(File file){
        StringBuilder sb = new StringBuilder();
        InputStream fis = null;
        try{
            fis = new FileInputStream(file);
            boolean isExcel2003 = file.getPath().toLowerCase().endsWith("xls");
            Workbook workbook;
            if(isExcel2003){
                workbook = new HSSFWorkbook(fis);
            }else{
                workbook = new XSSFWorkbook(fis);
            }
            int sheetnum = workbook.getNumberOfSheets();
            for(int k=0;k<sheetnum;k++){
                Sheet sheet = workbook.getSheetAt(k);
                // Iterating the sheet visits the rows that exist; stopping at the
                // first missing row index would drop everything after a blank row.
                for (Row row : sheet) {
                    for(int j=0;j<row.getLastCellNum(); j++){
                        sb.append(getCellValue(row.getCell(j))).append("\t");
                    }
                    sb.append("\n");
                }
            }
            return sb.toString();
        }catch(Exception e){
            e.printStackTrace();
            return "";
        }finally{
            closeQuietly(fis);
        }
    }

    /**
     * Returns the content of a cell as text.
     *
     * <p>Note that numeric cells are switched to the string cell type on the
     * in-memory workbook as a side effect.
     *
     * @param cell the cell to read; may be {@code null}
     * @return the cell text; an empty string for a {@code null} or blank cell
     */
    public static String getCellValue(Cell cell){
        String cellValue = "";
        if(cell == null){
            return cellValue;
        }
        // Read numbers as strings so that 1 is not rendered as 1.0.
        if(cell.getCellType() == Cell.CELL_TYPE_NUMERIC){
            cell.setCellType(Cell.CELL_TYPE_STRING);
        }
        switch (cell.getCellType()){
            case Cell.CELL_TYPE_NUMERIC: // number; normally converted to string above
                cellValue = String.valueOf(cell.getNumericCellValue());
                break;
            case Cell.CELL_TYPE_STRING: // string
                cellValue = String.valueOf(cell.getStringCellValue());
                break;
            case Cell.CELL_TYPE_BOOLEAN: // boolean
                cellValue = String.valueOf(cell.getBooleanCellValue());
                break;
            case Cell.CELL_TYPE_FORMULA: // formula: the formula text, not its result
                cellValue = String.valueOf(cell.getCellFormula());
                break;
            case Cell.CELL_TYPE_BLANK: // blank
                cellValue = "";
                break;
            case Cell.CELL_TYPE_ERROR: // error cell
                cellValue = "非法字符";
                break;
            default:
                cellValue = "未知类型";
                break;
        }
        return cellValue;
    }

    /**
     * Searches the {@code content} field of the index and describes the best hits.
     *
     * <p>For every hit the result contains the file name, the file path, the
     * relevance score and, when available, the best matching fragment of the
     * stored content with the query terms wrapped in
     * {@code <font color='red'>...</font>}. Hits are separated by an empty line.
     *
     * @param INDEX_DIR directory holding the Lucene index
     * @param text      query string in Lucene query parser syntax
     * @param topN      maximum number of hits to return
     * @return the formatted hits; an empty string when nothing matched or the
     *         search failed
     */
    public static String searchIndex(String INDEX_DIR,String text,int topN){
        System.out.println("开始检索..................");
        StringBuilder buffer = new StringBuilder();
        Directory directory = null;
        DirectoryReader ireader = null;
        try{
            directory = FSDirectory.open(new File(INDEX_DIR));
            // The same analyzer as at index time, so query terms match indexed terms.
            Analyzer analyzer = new IKAnalyzer();
            ireader = DirectoryReader.open(directory);
            IndexSearcher isearcher = new IndexSearcher(ireader);
            QueryParser parser = new QueryParser(Version.LUCENE_40, "content", analyzer);
            Query query = parser.parse(text);

            TopDocs topDocs = isearcher.search(query, topN);
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;
            System.out.println("查找到的文档总共有:"+topDocs.totalHits);

            // Highlighting: fragments of about 100 characters, matches in red.
            Formatter formatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");
            QueryScorer fragmentScorer = new QueryScorer(query);
            Highlighter highlighter = new Highlighter(formatter, fragmentScorer);
            Fragmenter fragmenter = new SimpleFragmenter(100);
            highlighter.setTextFragmenter(fragmenter);

            for (ScoreDoc scoreDoc : scoreDocs) {
                Document document = isearcher.doc(scoreDoc.doc);
                // Highlight the stored document text, not the query string.
                String content = document.get("content");
                String fragment = null;
                if (content != null) {
                    fragment = highlighter.getBestFragment(analyzer, "content", content);
                }
                buffer.append(document.get("filename")).append("\n");
                buffer.append(document.get("path")).append("\n");
                buffer.append("score: ").append(scoreDoc.score).append("\n");
                if (fragment != null) {
                    buffer.append(fragment).append("\n");
                }
                buffer.append("\n");
            }
        }catch(Exception e){
            e.printStackTrace();
        }finally{
            closeQuietly(ireader);
            closeQuietly(directory);
        }
        return buffer.toString();
    }

    /**
     * Collects all supported files below a directory, recursively.
     *
     * @param dirPath directory to scan
     * @return the supported files found; empty when the path does not exist,
     *         is not a directory or cannot be listed
     */
    public static List<File> getFileList(String dirPath) {
        List<File> fileList = new ArrayList<File>();
        getFileList2(dirPath,fileList);
        return fileList;
    }

    /**
     * Recursive worker of {@link #getFileList(String)}.
     *
     * @param path     directory to scan
     * @param fileList list that receives the supported files
     * @return the {@code fileList} argument
     */
    public static List<File> getFileList2(String path,List<File> fileList) {
        File[] files = new File(path).listFiles();
        // listFiles() yields null for a missing path, a plain file or an
        // unreadable directory.
        if (files == null) {
            return fileList;
        }
        for (File file2 : files) {
            if (file2.isDirectory()) {
                System.out.println("文件夹:" + file2.getAbsolutePath());
                getFileList2(file2.getAbsolutePath(),fileList);
            } else {
                System.out.println("文件:" + file2.getAbsolutePath());
                if (isTxtFile(file2.getName())) {
                    fileList.add(file2);
                }
            }
        }
        return fileList;
    }

    /**
     * Tells whether a file name has one of the supported extensions
     * (txt, doc, docx, xls, xlsx, pdf), ignoring case.
     *
     * <p>Only the text after the last dot counts, so {@code "a.txt.bak"} is
     * rejected. A name that starts with its only dot (such as {@code ".txt"})
     * is rejected as well.
     *
     * @param fileName file name to test
     * @return {@code true} when the extension is supported
     */
    public static boolean isTxtFile(String fileName) {
        int dot = fileName.lastIndexOf('.');
        if (dot <= 0) {
            return false;
        }
        String extension = fileName.substring(dot + 1);
        for (String supported : SUPPORTED_EXTENSIONS) {
            if (supported.equalsIgnoreCase(extension)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Deletes a file, or a directory together with everything below it.
     *
     * @param file the file or directory to delete
     * @return {@code true} when nothing is left to delete, that is when the path
     *         did not exist or was removed completely; {@code false} when some
     *         entry could not be deleted
     */
    public static boolean deleteDir(File file){
        if(!file.exists()){
            return true;
        }
        boolean childrenDeleted = true;
        if(file.isDirectory()){
            File[] files = file.listFiles();
            // listFiles() yields null when the directory cannot be listed.
            if(files != null){
                for (File child : files) {
                    childrenDeleted &= deleteDir(child);
                }
            }
        }
        return file.delete() && childrenDeleted;
    }

    /**
     * Demo entry point: rebuilds the index of {@code D:/data} in
     * {@code D:/index} from scratch and prints the hits of a sample query.
     */
    public static void main(String[] args){
        String DATA_DIR = "D:/data";
        String INDEX_DIR = "D:/index";
        File fileIndex = new File(INDEX_DIR);
        // Start from an empty index directory on every run.
        deleteDir(fileIndex);
        fileIndex.mkdirs();
        int topN = 100; // maximum number of results to show
        createIndex(DATA_DIR,INDEX_DIR);
        System.out.println(searchIndex(INDEX_DIR,"检索关键词",topN));
    }
}
相关标签: lucene 全文检索