Lucene Full-Text Search in Practice
I. Back-end interface methods:
1. Creating the index
/**
* Create a knowledge index entry for one uploaded file
*
* @param flag       true to create a brand-new index, false to append to an existing one
* @param path       full path of the file to parse and index
* @param indexPath  root directory under which "/knowledgeIndex" is created
* @param title      title of the knowledge entry
* @param createtime creation time of the knowledge entry
* @param infoType   information type of the knowledge entry
* @param knowlegeid id of the knowledge entry
*/
public void createKnowledgeIndex(boolean flag, String path, String indexPath, String title, Date createtime, String infoType, String knowlegeid) {
// Locate the index directory for this knowledge category and create it if it does not exist
File indexDir = new File(indexPath + "/knowledgeIndex");
if (!indexDir.exists()) {
if (!indexDir.mkdirs()) throw new RuntimeException("Failed to create the index directory");
}
// Build the analyzer (IK Chinese analyzer)
Analyzer luceneAnalyzer = new IK_CAnalyzer();
// Check whether the index directory already contains index files ("segments.gen");
// if none are found, the IndexWriter below is opened in create mode (flag = true)
File[] file = indexDir.listFiles(new FilenameFilter() {
public boolean accept(File dir, String name) {
return name.equals("segments.gen");
}
});
if (file == null || file.length == 0) {
flag = true;
}
IndexWriter indexWriter = null;
long startTime = new Date().getTime();
try {
indexWriter = new IndexWriter(indexDir, luceneAnalyzer, flag);
File dataFiles = new File(path);
String txtReader = "";
if (!"".equals(path) && path != null) {
txtReader = postFix(dataFiles, fileParseDomain);
Document document = new Document();
document.add(new Field("path", dataFiles.getCanonicalPath(),
Field.Store.YES, Field.Index.NOT_ANALYZED));
document.add(new Field("title", title, Field.Store.YES,
Field.Index.ANALYZED,
Field.TermVector.WITH_OFFSETS));
document.add(new Field("createtime", StringUtil.getDateStringYMD(createtime),
Field.Store.YES, Field.Index.ANALYZED));
document.add(new Field("infoType", infoType,
Field.Store.YES, Field.Index.ANALYZED));
document.add(new Field("knowlegeid", knowlegeid,
Field.Store.YES, Field.Index.ANALYZED));
document.add(new Field("contents", txtReader,
Field.Store.COMPRESS, Field.Index.ANALYZED,
Field.TermVector.WITH_OFFSETS));
document.add(new Field("all", title + txtReader,
Field.Store.COMPRESS, Field.Index.ANALYZED,
Field.TermVector.WITH_OFFSETS));
indexWriter.addDocument(document);
}
indexWriter.optimize();
} catch (IOException e) {
log.error("Failed to create the index: " + e.getMessage(), e);
throw new RuntimeException("Failed to create the index: " + e.getMessage());
} finally {
// guard against the IndexWriter constructor having failed
if (indexWriter != null) {
try {
indexWriter.close();
} catch (CorruptIndexException e) {
log.error(e.getMessage());
throw new RuntimeException("Failed to close the index writer: " + e.getMessage());
} catch (IOException e) {
log.error(e.getMessage());
throw new RuntimeException("Failed to close the index writer: " + e.getMessage());
}
}
}
long endTime = new Date().getTime();
log.info("Total indexing time (ms): " + (endTime - startTime));
}
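For reference, a call to this method might look like the sketch below. The service field name (knowledgeIndexDomain) and the concrete paths and ids are made up for illustration; with flag set to false the method appends to an existing index, and it switches to create mode on its own when no segments.gen file is found.
// hypothetical caller: index one uploaded Word document for a knowledge entry
knowledgeIndexDomain.createKnowledgeIndex(
        false,                              // append to the existing index if one is present
        "/data/upload/k-2012-001.doc",      // file to parse and index
        "/data/lucene",                     // index root; "/knowledgeIndex" is appended internally
        "Lucene guide", new Date(), "doc", "k-2012-001");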
2. Deleting an index entry by id and path
public void delKnowledgeIndexByinfoid(String path,String knowlegeid, String indexPath) {
Directory directory = null;
try {
directory = FSDirectory.getDirectory(indexPath + "/knowledgeIndex");
// documents matching either term (file path or knowledge id) will be removed
Term[] termArr = new Term[2];
termArr[0] = new Term("path", path);
termArr[1] = new Term("knowlegeid", knowlegeid);
Analyzer luceneAnalyzer = new IK_CAnalyzer();
IndexWriter indexWriter = new IndexWriter(directory,
luceneAnalyzer, false);
indexWriter.deleteDocuments(termArr);
indexWriter.optimize();
indexWriter.close();
} catch (IOException e) {
log.error("Failed to delete from the index: " + e.getMessage(), e);
throw new RuntimeException("Failed to delete from the index: " + e.getMessage());
}
}
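There is no separate update method: an edited knowledge entry can be re-indexed by deleting the old document first and then calling the create method again. A minimal sketch, using the same hypothetical service field and paths as above:
// hypothetical update: remove the stale document, then index the new version of the file
knowledgeIndexDomain.delKnowledgeIndexByinfoid("/data/upload/k-2012-001.doc", "k-2012-001", "/data/lucene");
knowledgeIndexDomain.createKnowledgeIndex(false, "/data/upload/k-2012-001_v2.doc",
        "/data/lucene", "Lucene guide (revised)", new Date(), "doc", "k-2012-001");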
3. Searching the index
/**
* Search knowledge entries by keyword
*
* @param type      name of the indexed field to search (e.g. "title", "contents" or "all")
* @param keyWord   search keyword entered by the user
* @param indexPath root directory that contains "/knowledgeIndex"
* @param sp        paging helper describing the requested page
* @return a list of Maps holding the stored fields, with the matched field highlighted
*/
public List searchKnowlegeByKey(String type, String keyWord, String indexPath, SplitPage sp) {
// escape Lucene query-syntax characters in the keyword
keyWord = specialStrConvert(keyWord);
// index directory
File indexDir = new File(indexPath + "/knowledgeIndex");
// make sure the index directory exists before trying to open it
if (!indexDir.exists()) {
log.error("Index directory does not exist");
throw new RuntimeException("Index directory does not exist");
}
FSDirectory directory = null;
IndexSearcher searcher = null;
Hits hits = null;
List list = null;
try {
directory = FSDirectory.getDirectory(indexDir, false);
IndexReader reader = IndexReader.open(directory);
searcher = new IndexSearcher(directory);
// build the query against the requested field using the IK analyzer
Analyzer luceneAnalyzer = new IK_CAnalyzer();
QueryParser parser = new QueryParser(type, luceneAnalyzer);
// allow the leading wildcard used by the "*keyword*" query below
parser.setAllowLeadingWildcard(true);
Query query = parser.parse("+(" + type + ":*" + keyWord + "*)");
// run the search, sorting the hits by creation time
Sort sort = new Sort(new SortField[]{new SortField("createtime", false)});
hits = searcher.search(query, sort);
// collect the results for the requested page
list = new ArrayList();
SimpleHTMLFormatter sHtmlF = new SimpleHTMLFormatter(
"<b><span style='background-color:yellow;'>", "</span></b>");
Highlighter highlighter = new Highlighter(sHtmlF, new QueryScorer(
query));
if (hits != null && hits.length() > 0) {
int len = hits.length();
sp.setRecordCount(len);
sp.init();
int endRecord = sp.getStartRecord() + sp.getPageSize();
int con = endRecord > sp.getRecordCount() ? sp.getRecordCount()
: endRecord;
for (int i = sp.getStartRecord(); i < con; i++) {
Document docTemp = hits.doc(i);
String value = docTemp.get(type);
// copy the stored fields; the matched field is replaced below with a highlighted fragment (bold, yellow background)
Map m = new HashMap();
m.put("path", docTemp.get("path"));
m.put("title", docTemp.get("title"));
m.put("createtime", docTemp.get("createtime"));
m.put("infoType", docTemp.get("infoType"));
m.put("knowlegeid", docTemp.get("knowlegeid"));
if (value != null && !type.equals("title")) {
// Lucene uses the stored term vector to speed up highlighting
TermPositionVector termFreqVector = (TermPositionVector) reader
.getTermFreqVector(hits.id(i), type);
TokenStream tokenStream = TokenSources
.getTokenStream(termFreqVector);
String str = highlighter.getBestFragment(tokenStream,
value);
m.put(type, str);
}
list.add(m);
}
}
searcher.close();
reader.close();
} catch (IOException e) {
log.error(e.getMessage(), e);
throw new RuntimeException(e.getMessage());
} catch (ParseException e) {
log.error("Failed to parse the Lucene query: " + e.getMessage(), e);
throw new RuntimeException("Failed to parse the Lucene query: " + e.getMessage());
}
return list;
}
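A sketch of how this method might be called from an action or controller. SplitPage is the project's own paging helper, so the setter names used here (setPageSize, setCurrentPage) are assumptions, as is the service field name:
// hypothetical paged search over the "all" field (title + contents)
SplitPage sp = new SplitPage();
sp.setPageSize(10);        // assumed setter: 10 hits per page
sp.setCurrentPage(1);      // assumed setter: first page
List results = knowledgeIndexDomain.searchKnowlegeByKey("all", "Lucene", "/data/lucene", sp);
for (Object o : results) {
    Map hit = (Map) o;
    System.out.println(hit.get("title") + " -> " + hit.get("path"));
}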
4. Checking whether an index exists
public String isExistsKnowlegeIndex(String path) {
String mes;
// locate the index directory, creating it if necessary
File indexDir = new File(path + "/knowledgeIndex");
if (!indexDir.exists()) {
if (!indexDir.mkdirs()) throw new RuntimeException("Failed to create the index directory");
}
// check whether the directory already contains index files ("segments.gen")
File[] file = indexDir.listFiles(new FilenameFilter() {
public boolean accept(File dir, String name) {
return name.equals("segments.gen");
}
});
if (file == null || file.length == 0) {
mes = "";
} else {
mes = "ok";
}
return mes;
}
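One possible use of this check is deciding up front whether full-text search should be offered at all; a hypothetical caller, reusing the service field name assumed earlier:
// hypothetical guard before running a search
if ("ok".equals(knowledgeIndexDomain.isExistsKnowlegeIndex("/data/lucene"))) {
    // the index already has segments, searching is possible
} else {
    // no index yet, prompt the user to build it first
}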
5. Escaping special characters
/**
* Escape Lucene query-syntax characters in the search keyword
*
* @param str raw keyword entered by the user
* @return the escaped keyword, or "" when the input is null or empty
*/
private static String specialStrConvert(String str) {
// + - && || ! ( ) { } [ ] ^ " ~ * ? : \
if ("".equals(str) || str == null)
return "";
else
return str.replaceAll("\\\\", "\\\\\\\\")
.replaceAll("\\+", "\\\\+").replaceAll("\\-", "\\\\-")
.replaceAll("\\&&", "\\\\&&").replaceAll("\\!", "\\\\!")
.replaceAll("\\(", "\\\\(").replaceAll("\\)", "\\\\)")
.replaceAll("\\{", "\\\\{").replaceAll("\\}", "\\\\}")
.replaceAll("\\[", "\\\\[").replaceAll("\\]", "\\\\]")
.replaceAll("\\^", "\\\\^").replaceAll("\"", "\\\\\"")
.replaceAll("\\~", "\\\\~").replaceAll("\\*", "\\\\*")
.replaceAll("\\?", "\\\\?").replaceAll("\\|\\|", "\\\\||")
.replaceAll("\\:", "\\\\:");
}
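Depending on the Lucene version, QueryParser also ships a static escape(String) helper that covers this character set, so the hand-written chain above can often be replaced with it. A minimal sketch; the package name shown matches the 2.x line this code appears to target, and whether it escapes exactly the same characters should be checked against the version in use:
// equivalent escaping using Lucene's built-in helper (assumed to be available in this Lucene version)
private static String specialStrConvert(String str) {
    if (str == null || "".equals(str)) {
        return "";
    }
    return org.apache.lucene.queryParser.QueryParser.escape(str);
}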
6. Reading different file types (the parse methods are implemented in a utility class covered in the Java basics posts)
private static String postFix(File file, FileParseDomain fileParseDomain) {
// pick the parser that matches the file extension and return the extracted plain text
String txtReader = "";
try {
if (file.getPath().endsWith(".doc")) {
txtReader = fileParseDomain.readWord(file.getCanonicalPath());
} else if (file.getPath().endsWith(".pdf")) {
txtReader = fileParseDomain.readPDF(file.getCanonicalPath());
} else if (file.getPath().endsWith(".xls")) {
txtReader = fileParseDomain.readExcel(file.getCanonicalPath());
} else if (file.getPath().endsWith(".txt")) {
txtReader = fileParseDomain.readTxt(file.getCanonicalPath());
} else if (file.getPath().endsWith(".html")
|| file.getPath().endsWith(".htm")) {
txtReader = fileParseDomain.readHtmlText(file.getCanonicalPath());
}
} catch (IOException e) {
log.error("Failed to read the file: " + e.getMessage(), e);
throw new RuntimeException("Failed to read the file: " + e.getMessage());
}
return txtReader;
}
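FileParseDomain itself is not shown here. As one example of what such a parser can look like, a minimal readTxt built only on the JDK might read the whole file into a string; the charset and the method signature below are assumptions for illustration, not the original implementation:
// hypothetical sketch of FileParseDomain.readTxt
// requires java.io.BufferedReader, FileInputStream, InputStreamReader, IOException
public String readTxt(String filePath) throws IOException {
    StringBuilder sb = new StringBuilder();
    BufferedReader reader = new BufferedReader(
            new InputStreamReader(new FileInputStream(filePath), "UTF-8"));
    try {
        String line;
        while ((line = reader.readLine()) != null) {
            sb.append(line).append('\n');   // keep line breaks so the highlighter gets readable text
        }
    } finally {
        reader.close();
    }
    return sb.toString();
}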