欢迎您访问程序员文章站!本站旨在为大家提供和分享程序员计算机编程知识。
您现在的位置是: 首页

Lucene全文检索应用

程序员文章站 2022-03-04 23:29:22
...

一、后台接口方法:

   1.创建索引

     

 /**
     * Adds one knowledge document to the Lucene index stored in
     * {@code indexPath + "/knowledgeIndex"}.
     *
     * <p>The index directory is created when missing. When the directory holds no
     * {@code segments.gen} file, {@code flag} is forced to {@code true} before it is
     * handed to the {@code IndexWriter} constructor. When {@code path} is null or
     * empty no document is added; the index is still optimized.
     *
     * @param flag       value passed to the {@code IndexWriter} constructor as its
     *                   third argument; forced to {@code true} when no
     *                   {@code segments.gen} file is found
     * @param path       path of the source file whose text is indexed; may be null
     *                   or empty, in which case nothing is added
     * @param indexPath  parent directory of the {@code knowledgeIndex} folder
     * @param title      value of the {@code title} field
     * @param createtime creation date, stored via {@code StringUtil.getDateStringYMD}
     * @param infoType   value of the {@code infoType} field
     * @param knowlegeid value of the {@code knowlegeid} field
     * @throws RuntimeException if the index directory cannot be created, or if
     *                          writing or closing the index fails
     */
    public void createKnowledgeIndex(boolean flag, String path, String indexPath, String title, Date createtime, String infoType, String knowlegeid) {
        File indexDir = new File(indexPath + "/knowledgeIndex");
        if (!indexDir.exists() && !indexDir.mkdirs()) {
            throw new RuntimeException("索引文件夹创建出错");
        }
        Analyzer luceneAnalyzer = new IK_CAnalyzer();
        // Look for the segments.gen marker to decide whether an index already exists.
        File[] file = indexDir.listFiles(new FilenameFilter() {
            public boolean accept(File dir, String name) {
                return name.equals("segments.gen");
            }
        });
        if (file == null || file.length == 0) {
            flag = true;
        }
        IndexWriter indexWriter = null;
        long startTime = System.currentTimeMillis();
        try {
            indexWriter = new IndexWriter(indexDir, luceneAnalyzer, flag);
            // Check the path before building a File from it: new File(null) throws.
            if (path != null && !"".equals(path)) {
                File dataFiles = new File(path);
                String txtReader = postFix(dataFiles, fileParseDomain);
                Document document = new Document();
                document.add(new Field("path", dataFiles.getCanonicalPath(),
                        Field.Store.YES, Field.Index.NOT_ANALYZED));
                document.add(new Field("title", title, Field.Store.YES,
                        Field.Index.ANALYZED,
                        Field.TermVector.WITH_OFFSETS));
                document.add(new Field("createtime", StringUtil.getDateStringYMD(createtime),
                        Field.Store.YES, Field.Index.ANALYZED));
                document.add(new Field("infoType", infoType,
                        Field.Store.YES, Field.Index.ANALYZED));
                document.add(new Field("knowlegeid", knowlegeid,
                        Field.Store.YES, Field.Index.ANALYZED));
                document.add(new Field("contents", txtReader,
                        Field.Store.COMPRESS, Field.Index.ANALYZED,
                        Field.TermVector.WITH_OFFSETS));
                // "all" concatenates title and body so one field can be searched for both.
                document.add(new Field("all", title + txtReader,
                        Field.Store.COMPRESS, Field.Index.ANALYZED,
                        Field.TermVector.WITH_OFFSETS));
                indexWriter.addDocument(document);
            }
            indexWriter.optimize();
        } catch (IOException e) {
            log.error("索引创建出错" + e.getMessage(), e);
            throw new RuntimeException("索引创建出错" + e.getMessage(), e);
        } finally {
            // The writer is still null when its constructor threw; closing it blindly
            // would replace the real error with a NullPointerException.
            if (indexWriter != null) {
                try {
                    indexWriter.close();
                } catch (IOException e) {
                    log.error(e.getMessage(), e);
                    throw new RuntimeException("关闭写索引流出错" + e.getMessage(), e);
                }
            }
        }
        long endTime = System.currentTimeMillis();
        log.info("创建索引总时间:" + (endTime - startTime));
    }

 

2.根据id和路径删除索引:

  

 /**
     * Deletes documents from the index in {@code indexPath + "/knowledgeIndex"}
     * using two terms: {@code path} and {@code knowlegeid}.
     *
     * <p>NOTE(review): Lucene documents {@code deleteDocuments(Term[])} as removing
     * documents that contain ANY of the given terms, so this looks like
     * "path OR knowlegeid" rather than "path AND knowlegeid" - confirm that is the
     * intent.
     *
     * @param path       value matched against the {@code path} field
     * @param knowlegeid value matched against the {@code knowlegeid} field
     * @param indexPath  parent directory of the {@code knowledgeIndex} folder
     * @throws RuntimeException if opening, modifying or closing the index fails
     */
    public void delKnowledgeIndexByinfoid(String path,String knowlegeid, String indexPath) {
        try {
            Directory directory = FSDirectory.getDirectory(indexPath + "/knowledgeIndex");
            Term[] termArr = new Term[] {
                new Term("path", path),
                new Term("knowlegeid", knowlegeid)
            };
            Analyzer luceneAnalyzer = new IK_CAnalyzer();
            IndexWriter indexWriter = new IndexWriter(directory,
                    luceneAnalyzer, false);
            try {
                indexWriter.deleteDocuments(termArr);
                indexWriter.optimize();
            } finally {
                // Always close the writer, even when delete/optimize fails, so it is
                // not left open. A close failure is an IOException and is handled by
                // the catch below like any other failure.
                indexWriter.close();
            }
        } catch (IOException e) {
            log.error("索引删除出错" + e.getMessage(), e);
            throw new RuntimeException("索引删除出错" + e.getMessage(), e);
        }
    }

 

3.查询索引

  

/**
     * Searches the knowledge index for documents whose {@code type} field matches
     * {@code *keyWord*} and returns the requested page of results.
     *
     * <p>Results are sorted on the {@code createtime} field. Each result is a
     * {@code Map} holding the stored {@code path}, {@code title},
     * {@code createtime}, {@code infoType} and {@code knowlegeid} values. When
     * {@code type} is not {@code "title"} and the document has a stored value for
     * it, the map also gets an entry keyed by {@code type} containing the best
     * highlighted fragment (which may be null when the highlighter finds none).
     *
     * @param type      name of the field to search and highlight
     * @param keyWord   raw user keyword; query-syntax characters are escaped first
     * @param indexPath parent directory of the {@code knowledgeIndex} folder
     * @param sp        paging state; its record count is set and {@code init()} is
     *                  called only when there is at least one hit
     * @return the maps for the current page; empty when nothing matched
     * @throws RuntimeException if the index directory does not exist, the index
     *                          cannot be read, or the query cannot be parsed
     */
    public List searchKnowlegeByKey(String type, String keyWord, String indexPath, SplitPage sp) {
        keyWord = specialStrConvert(keyWord);
        File indexDir = new File(indexPath + "/knowledgeIndex");
        // Fail fast on a missing index, before anything is opened.
        if (!indexDir.exists()) {
            log.debug("索引文件不存在");
            throw new RuntimeException("索引文件不存在");
        }
        IndexReader reader = null;
        IndexSearcher searcher = null;
        List list = new ArrayList();
        try {
            FSDirectory directory = FSDirectory.getDirectory(indexDir, false);
            reader = IndexReader.open(directory);
            // Search through the same reader that serves the term vectors below, so
            // the document ids from the hits refer to the same view of the index.
            searcher = new IndexSearcher(reader);

            Analyzer luceneAnalyzer = new IK_CAnalyzer();
            QueryParser parser = new QueryParser(type, luceneAnalyzer);
            // The query starts with '*', which the parser rejects unless enabled.
            parser.setAllowLeadingWildcard(true);
            Query query = parser.parse("+(" + type + ":*" + keyWord + "*)");

            Sort sort = new Sort(new SortField[]{new SortField("createtime", false)});
            Hits hits = searcher.search(query, sort);

            SimpleHTMLFormatter sHtmlF = new SimpleHTMLFormatter(
                    "<b><span style='background-color:yellow;'>", "</span></b>");
            Highlighter highlighter = new Highlighter(sHtmlF, new QueryScorer(
                    query));
            if (hits != null && hits.length() > 0) {
                sp.setRecordCount(hits.length());
                sp.init();
                int endRecord = sp.getStartRecord() + sp.getPageSize();
                // Clamp the page end to the number of hits.
                int con = endRecord > sp.getRecordCount() ? sp.getRecordCount()
                        : endRecord;
                for (int i = sp.getStartRecord(); i < con; i++) {
                    Document docTemp = hits.doc(i);
                    String value = docTemp.get(type);
                    Map m = new HashMap();
                    m.put("path", docTemp.get("path"));
                    m.put("title", docTemp.get("title"));
                    m.put("createtime", docTemp.get("createtime"));
                    m.put("infoType", docTemp.get("infoType"));
                    m.put("knowlegeid", docTemp.get("knowlegeid"));
                    if (value != null && !type.equals("title")) {
                        // Highlight from the stored term vector. Skip highlighting
                        // when the field has no positional vector instead of failing
                        // on the cast or on a null vector.
                        Object vector = reader.getTermFreqVector(hits.id(i), type);
                        if (vector instanceof TermPositionVector) {
                            TokenStream tokenStream = TokenSources
                                    .getTokenStream((TermPositionVector) vector);
                            String str = highlighter.getBestFragment(tokenStream,
                                    value);
                            m.put(type, str);
                        }
                    }
                    list.add(m);
                }
            }
        } catch (IOException e) {
            log.debug(e.getMessage(), e);
            throw new RuntimeException(e.getMessage(), e);
        } catch (ParseException e) {
            log.debug("lucene分词转换出错" + e.getMessage(), e);
            throw new RuntimeException("lucene分词转换出错" + e.getMessage(), e);
        } finally {
            // Release index resources on every path. Close failures are logged rather
            // than thrown so they cannot hide an exception already in flight.
            if (searcher != null) {
                try {
                    searcher.close();
                } catch (IOException e) {
                    log.error(e.getMessage(), e);
                }
            }
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    log.error(e.getMessage(), e);
                }
            }
        }
        return list;
    }

 

4.判定是否存在索引

  

 /**
     * Reports whether the folder {@code path + "/knowledgeIndex"} contains an entry
     * named {@code segments.gen}.
     *
     * <p>The folder is created when it does not exist yet.
     *
     * @param path parent directory of the {@code knowledgeIndex} folder
     * @return {@code "ok"} when a {@code segments.gen} entry is present, otherwise
     *         the empty string
     * @throws RuntimeException if the folder is missing and cannot be created
     */
    public String isExistsKnowlegeIndex(String path) {
        File indexDir = new File(path + "/knowledgeIndex");
        boolean missing = !indexDir.exists();
        if (missing && !indexDir.mkdirs()) {
            throw new RuntimeException("索引文件夹创建出错");
        }
        // Only the marker's presence matters, so listing names is enough.
        String[] markers = indexDir.list(new FilenameFilter() {
            public boolean accept(File dir, String name) {
                return "segments.gen".equals(name);
            }
        });
        boolean found = markers != null && markers.length > 0;
        return found ? "ok" : "";
    }

 

5.替换特殊字符

  

 /**
     * Backslash-escapes query-syntax characters in a keyword.
     *
     * <p>Each of {@code \ + - ! ( ) { } [ ] ^ " ~ * ? :} gets a backslash in front
     * of it. {@code &} and {@code |} are escaped only as adjacent pairs
     * ({@code &&}, {@code ||}), with one backslash before the pair; a single
     * {@code &} or {@code |} is left untouched.
     *
     * @param str keyword to escape; may be null
     * @return the escaped keyword, or the empty string for null or empty input
     */
    private static String specialStrConvert(String str) {
        if (str == null || str.length() == 0) {
            return "";
        }
        final String singles = "\\+-!(){}[]^\"~*?:";
        int length = str.length();
        StringBuilder escaped = new StringBuilder(length + 8);
        for (int i = 0; i < length; i++) {
            char c = str.charAt(i);
            boolean doubled = (c == '&' || c == '|')
                    && i + 1 < length && str.charAt(i + 1) == c;
            if (doubled) {
                // Consume both characters of the pair, scanning left to right.
                escaped.append('\\').append(c).append(c);
                i++;
            } else if (singles.indexOf(c) >= 0) {
                escaped.append('\\').append(c);
            } else {
                escaped.append(c);
            }
        }
        return escaped.toString();
    }

 

6.按文件类型读取文件内容(具体解析方法在“Java基础”的工具类中实现)

 

  /**
     * Extracts the text of {@code file}, picking the reader from the file
     * extension: {@code .doc}, {@code .pdf}, {@code .xls}, {@code .txt},
     * {@code .html} or {@code .htm}.
     *
     * <p>The extension is matched without regard to case, so {@code REPORT.PDF} is
     * handled like {@code report.pdf}.
     *
     * @param file            file to read
     * @param fileParseDomain parser that provides the per-format read methods
     * @return the text returned by the matching reader, or the empty string when
     *         the extension is not one of the supported ones
     * @throws RuntimeException if reading the file fails with an {@code IOException}
     */
    private static String postFix(File file, FileParseDomain fileParseDomain) {
        String name = file.getPath();
        try {
            if (hasExtension(name, ".doc")) {
                return fileParseDomain.readWord(file.getCanonicalPath());
            }
            if (hasExtension(name, ".pdf")) {
                return fileParseDomain.readPDF(file.getCanonicalPath());
            }
            if (hasExtension(name, ".xls")) {
                return fileParseDomain.readExcel(file.getCanonicalPath());
            }
            if (hasExtension(name, ".txt")) {
                return fileParseDomain.readTxt(file.getCanonicalPath());
            }
            if (hasExtension(name, ".html") || hasExtension(name, ".htm")) {
                return fileParseDomain.readHtmlText(file.getCanonicalPath());
            }
        } catch (IOException e) {
            log.error("文件读取出错" + e.getMessage(), e);
            throw new RuntimeException("文件读取出错" + e.getMessage(), e);
        }
        // Unsupported extension: index the document with empty content.
        return "";
    }

    /** Returns true when {@code name} ends with {@code extension}, ignoring case. */
    private static boolean hasExtension(String name, String extension) {
        int offset = name.length() - extension.length();
        // regionMatches returns false for a negative offset, i.e. a too-short name.
        return name.regionMatches(true, offset, extension, 0, extension.length());
    }

 

相关标签: 全文检索 lucene