HighLighter 博客分类: java lucene Apache .net Gmail Ant
今天搞了一个关于Lucene的例子,权当入门教程。网上有很多资料,但是要么不全、要么不好用,所以这里把全部代码以及依赖的包贴上来了。
功能包括:创建索引、检索索引、高亮显示查询结果。分词使用的是庖丁解牛。
使用前先下载相关的LuceneCore jar包、LuceneHighLighter jar包、庖丁解牛分词jar包、庖丁解牛词典,并设定环境变量PAODING_DIC_HOME指向词典位置。
前两个可以到官方网站找,庖丁去http://code.google.com/p/paoding/downloads/list下载。
Lucene庖丁整合方式1:
1、将paoding-analysis.jar拷贝到项目的WEB-INF/lib目录;
2、接着需要设置环境变量PAODING_DIC_HOME,变量名:PAODING_DIC_HOME 变量值:E:\paoding\dic
3、第三步将E:\paoding\src目录下的paoding-dic-home.properties属性文件拷贝到项目的src目录下,添加2行
paoding.dic.home.config-fisrt=this
paoding.dic.home=E:/paoding/dic
Lucene庖丁整合方式2:
修改E:\paoding\src\paoding-dic-home.properties,增加一行
paoding.dic.home=classpath:dic
然后运行ant重新生成一个庖丁jar,拷贝到lib下就OK了。
第一种方式便于更新字典,第二种便于移植。本例使用第二种方法整合。
关于庖丁环境的设置可以参考net\paoding\analysis\Constants.java。
使用时注意LuceneCore和LuceneHighLighter的版本配置。我开始使用lucene-core-2.3.2.jar+Highlighter 2.4,后台报错,明显的版本问题。现在使用的是Lucene 2.3.2 + Highlighter 2.2.0。
主要代码实现:
CreateIndex:创建索引文件
- package demo;
- import java.io.BufferedReader;
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.IOException;
- import java.io.InputStreamReader;
- import java.util.Date;
- import net.paoding.analysis.analyzer.PaodingAnalyzer;
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.document.Field;
- import org.apache.lucene.index.IndexWriter;
- /**
- * 建立索引
- *
- */
- public class CreateIndex {
- public void createIndex() throws Exception {
- /* 指明要索引文件夹的位置,这里是C盘的S文件夹下 */
- File surceFileDir = new File("D:\\save\\source");
- /* 这里放索引文件的位置 */
- File indexFileDir = new File("D:\\save");
- //Analyzer luceneAnalyzer = new StandardAnalyzer();
- Analyzer luceneAnalyzer = new PaodingAnalyzer();//使用庖丁解牛分词法
- IndexWriter indexWriter = new IndexWriter(indexFileDir, luceneAnalyzer, true);///参数isEmpty是false表示增量索引
- File[] sourceFextFiles = surceFileDir.listFiles();
- long startTime = new Date().getTime();
- // 增加document到索引去
- for (int i = 0; i < sourceFextFiles.length; i++) {
- if (sourceFextFiles[i].isFile()
- && sourceFextFiles[i].getName().endsWith(".txt")) {
- System.out.println("File " + sourceFextFiles[i].getCanonicalPath() + "正在被索引....");
- String temp = FileReaderAll(sourceFextFiles[i].getCanonicalPath(), "GBK");
- System.out.println(temp);
- Document document = new Document();
- Field FieldPath = new Field("path", sourceFextFiles[i].getPath(), Field.Store.YES, Field.Index.NO);
- Field FieldBody = new Field("body", temp, Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
- Field FieldTitle = new Field("title", temp, Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
- document.add(FieldPath);
- document.add(FieldBody);document.add(FieldTitle);
- indexWriter.addDocument(document);
- }
- }
- // optimize()方法是对索引进行优化
- indexWriter.optimize();
- indexWriter.close();
- // 测试一下索引的时间
- long endTime = new Date().getTime();
- System.out.println("这花费了" + (endTime - startTime) + " 毫秒来把文档增加到索引里面去!"
- + indexFileDir.getPath());
- }
- public static String FileReaderAll(String FileName, String charset)
- throws IOException {
- BufferedReader reader = new BufferedReader(new InputStreamReader(
- new FileInputStream(FileName), charset));
- String line = new String();
- String temp = new String();
- while ((line = reader.readLine()) != null) {
- temp += line;
- }
- reader.close();
- return temp;
- }
- /**
- * @param args
- */
- public static void main(String[] args) {
- try {
- new CreateIndex().createIndex();
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
- }
package demo;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Date;
import net.paoding.analysis.analyzer.PaodingAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;

/**
 * Builds a Lucene index from the plain-text files of a source directory.
 *
 * <p>Every regular file whose name ends in {@code .txt} and that sits directly inside the source
 * directory is decoded as GBK, tokenized with the Paoding analyzer and added to the index as one
 * document with the fields {@code path}, {@code body} and {@code title}.
 */
public class CreateIndex {
    /** Directory that is scanned (non-recursively) for {@code .txt} files to index. */
    private static final String SOURCE_DIR = "D:\\save\\source";
    /** Directory the index files are written to. */
    private static final String INDEX_DIR = "D:\\save";
    /** Character set used to decode the source files. */
    private static final String SOURCE_CHARSET = "GBK";

    /**
     * Recreates the index in {@link #INDEX_DIR} from the {@code .txt} files in {@link #SOURCE_DIR}
     * and prints the elapsed time.
     *
     * @throws IOException if the source directory cannot be listed or a file cannot be read
     * @throws Exception   if the analyzer or the index writer fails
     */
    public void createIndex() throws Exception {
        File sourceDir = new File(SOURCE_DIR);
        File indexDir = new File(INDEX_DIR);

        // listFiles() returns null when the path is missing or is not a directory. Check this
        // before opening the writer, because opening it with create=true discards the old index.
        File[] candidates = sourceDir.listFiles();
        if (candidates == null) {
            throw new IOException("Cannot list source directory: " + sourceDir.getPath());
        }

        // Analyzer luceneAnalyzer = new StandardAnalyzer();
        Analyzer analyzer = new PaodingAnalyzer(); // Paoding Chinese word segmentation
        // Third argument "create": true rebuilds the index from scratch, false appends to it.
        IndexWriter indexWriter = new IndexWriter(indexDir, analyzer, true);
        long startTime = System.currentTimeMillis();
        try {
            for (int i = 0; i < candidates.length; i++) {
                File candidate = candidates[i];
                if (!candidate.isFile() || !candidate.getName().endsWith(".txt")) {
                    continue;
                }
                String canonicalPath = candidate.getCanonicalPath();
                System.out.println("File " + canonicalPath + "正在被索引....");
                String content = FileReaderAll(canonicalPath, SOURCE_CHARSET);
                System.out.println(content);

                Document document = new Document();
                // The path is stored for display only; it is not searchable.
                document.add(new Field("path", candidate.getPath(), Field.Store.YES, Field.Index.NO));
                document.add(new Field("body", content, Field.Store.YES, Field.Index.TOKENIZED,
                        Field.TermVector.WITH_POSITIONS_OFFSETS));
                // NOTE(review): "title" holds the full file content, exactly like "body" --
                // confirm whether a real title (e.g. the file name) was intended.
                document.add(new Field("title", content, Field.Store.YES, Field.Index.TOKENIZED,
                        Field.TermVector.WITH_POSITIONS_OFFSETS));
                indexWriter.addDocument(document);
            }
            // Merge the index segments once all documents have been added.
            indexWriter.optimize();
        } finally {
            // Always close the writer so the index write lock is released, even on failure.
            indexWriter.close();
        }

        long endTime = System.currentTimeMillis();
        System.out.println("这花费了" + (endTime - startTime) + " 毫秒来把文档增加到索引里面去!"
                + indexDir.getPath());
    }

    /**
     * Reads a whole text file into a single string.
     *
     * <p>Line terminators are not preserved: the lines are concatenated directly.
     *
     * @param FileName path of the file to read
     * @param charset  name of the character set used to decode the file
     * @return the concatenated lines of the file; empty if the file has no content
     * @throws IOException if the file cannot be opened or read, or the charset is unsupported
     */
    public static String FileReaderAll(String FileName, String charset)
            throws IOException {
        FileInputStream in = new FileInputStream(FileName);
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, charset));
            StringBuffer content = new StringBuffer();
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line);
            }
            return content.toString();
        } finally {
            // Closing the underlying stream releases the file handle on every exit path.
            in.close();
        }
    }

    /**
     * Runs the indexer and prints the stack trace of any failure.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            new CreateIndex().createIndex();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
QueryHighLighter:检索关键字并高亮显示
- package demo;
- import java.io.StringReader;
- import net.paoding.analysis.analyzer.PaodingAnalyzer;
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.analysis.TokenStream;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.queryParser.QueryParser;
- import org.apache.lucene.search.BooleanClause;
- import org.apache.lucene.search.IndexSearcher;
- import org.apache.lucene.search.Query;
- import org.apache.lucene.search.ScoreDoc;
- import org.apache.lucene.search.TopDocCollector;
- import org.apache.lucene.search.highlight.Highlighter;
- import org.apache.lucene.search.highlight.QueryScorer;
- import org.apache.lucene.search.highlight.SimpleFragmenter;
- import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
- import test.TestLuceneHighlighter2;
- /**
- * 高亮显示检索结果
- * Lucene 2.3.2 + Highlighter 2.2.0 的分页+高亮显示代码例子.<br>
- * Lucene和Highlighter不是最新版本可以升级。
- */
- public class QueryHighLighter {
- private static final String FIELD_TITLE = "title";
- private static final String FIELD_BODY = "body";
- public synchronized Analyzer getAnalyzer() {
- return new PaodingAnalyzer();// 此处使用"庖丁解牛"分词法,另外一种是中科院分词法
- }
- public String test(String queryString, int begin, int number) {
- StringBuffer sb = new StringBuffer();
- IndexSearcher isearcher = null;
- try {
- isearcher = new IndexSearcher("D:\\save");
- /* 下面这个表示要同时搜索这两个域,而且只要一个域里面有满足我们搜索的内容就行 */
- BooleanClause.Occur[] clauses = { BooleanClause.Occur.SHOULD,
- BooleanClause.Occur.SHOULD };
- TopDocCollector collector = new TopDocCollector(10);
- /*Query query = MultiFieldQueryParser.parse(queryString,
- new String[] { FIELD_TITLE, FIELD_BODY }, clauses,
- getAnalyzer());*/
- QueryParser queryParse = new QueryParser(FIELD_TITLE, getAnalyzer());
- Query query = queryParse.parse(queryString);
- isearcher.search(query, collector);
- ScoreDoc[] hits = collector.topDocs().scoreDocs;
- // 用这个进行高亮显示,默认是<b>..</b>
- // 用这个指定<read>..</read>
- SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<b><font color='red'>", "</font></b>");
- // 构造高亮
- // 指定高亮的格式
- // 指定查询评分
- Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));
- // 这个一般等于你要返回的,高亮的数据长度
- // 如果太小,则只有数据的开始部分被解析并高亮,且返回的数据也少
- // 太大,有时太浪费了。
- highlighter.setTextFragmenter(new SimpleFragmenter(Integer.MAX_VALUE));
- for (int i = begin; i < hits.length && i < begin + number; i++) {
- Document doc = isearcher.doc(hits[i].doc);
- String value = doc.get(FIELD_TITLE);
- String value2 = doc.get(FIELD_BODY);
- // 有三个参数
- // 分析器
- // 要解析的字段名
- // 要解析的数据
- //System.out.println(highlighter.getBestFragment(getAnalyzer(),
- // FIELD_TITLE, doc.get(FIELD_TITLE)));
- if (value != null) {
- TokenStream tokenStream = getAnalyzer().tokenStream(FIELD_TITLE, new StringReader(value));
- String str = highlighter.getBestFragment(tokenStream, value);
- sb.append("<li><li>").append(str).append("<br/>");
- System.out.println(str);
- }
- }
- } catch (Exception e) {
- e.printStackTrace();
- } finally {
- if (isearcher != null) {
- try {
- isearcher.close();
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
- }
- return sb.toString();
- }
- public static void main(String[] args){
- TestLuceneHighlighter2 t = new TestLuceneHighlighter2();
- String queryString = "*";
- int begin = 0;
- int number = 10;
- t.test(queryString, begin, number);
- }
- }
- package demo;
- import java.io.StringReader;
- import net.paoding.analysis.analyzer.PaodingAnalyzer;
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.analysis.TokenStream;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.queryParser.QueryParser;
- import org.apache.lucene.search.BooleanClause;
- import org.apache.lucene.search.IndexSearcher;
- import org.apache.lucene.search.Query;
- import org.apache.lucene.search.ScoreDoc;
- import org.apache.lucene.search.TopDocCollector;
- import org.apache.lucene.search.highlight.Highlighter;
- import org.apache.lucene.search.highlight.QueryScorer;
- import org.apache.lucene.search.highlight.SimpleFragmenter;
- import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
- import test.TestLuceneHighlighter2;
- /**
- * 高亮显示检索结果
- * Lucene 2.3.2 + Highlighter 2.2.0 的分页+高亮显示代码例子.<br>
- * Lucene和Highlighter不是最新版本可以升级。
- */
- public class QueryHighLighter {
- private static final String FIELD_TITLE = "title";
- private static final String FIELD_BODY = "body";
- public synchronized Analyzer getAnalyzer() {
- return new PaodingAnalyzer();// 此处使用"庖丁解牛"分词法,另外一种是中科院分词法
- }
- public String test(String queryString, int begin, int number) {
- StringBuffer sb = new StringBuffer();
- IndexSearcher isearcher = null;
- try {
- isearcher = new IndexSearcher("D:\\save");
- /* 下面这个表示要同时搜索这两个域,而且只要一个域里面有满足我们搜索的内容就行 */
- BooleanClause.Occur[] clauses = { BooleanClause.Occur.SHOULD,
- BooleanClause.Occur.SHOULD };
- TopDocCollector collector = new TopDocCollector(10);
- /*Query query = MultiFieldQueryParser.parse(queryString,
- new String[] { FIELD_TITLE, FIELD_BODY }, clauses,
- getAnalyzer());*/
- QueryParser queryParse = new QueryParser(FIELD_TITLE, getAnalyzer());
- Query query = queryParse.parse(queryString);
- isearcher.search(query, collector);
- ScoreDoc[] hits = collector.topDocs().scoreDocs;
- // 用这个进行高亮显示,默认是<b>..</b>
- // 用这个指定<read>..</read>
- SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<b><font color='red'>", "</font></b>");
- // 构造高亮
- // 指定高亮的格式
- // 指定查询评分
- Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));
- // 这个一般等于你要返回的,高亮的数据长度
- // 如果太小,则只有数据的开始部分被解析并高亮,且返回的数据也少
- // 太大,有时太浪费了。
- highlighter.setTextFragmenter(new SimpleFragmenter(Integer.MAX_VALUE));
- for (int i = begin; i < hits.length && i < begin + number; i++) {
- Document doc = isearcher.doc(hits[i].doc);
- String value = doc.get(FIELD_TITLE);
- String value2 = doc.get(FIELD_BODY);
- // 有三个参数
- // 分析器
- // 要解析的字段名
- // 要解析的数据
- //System.out.println(highlighter.getBestFragment(getAnalyzer(),
- // FIELD_TITLE, doc.get(FIELD_TITLE)));
- if (value != null) {
- TokenStream tokenStream = getAnalyzer().tokenStream(FIELD_TITLE, new StringReader(value));
- String str = highlighter.getBestFragment(tokenStream, value);
- sb.append("<li><li>").append(str).append("<br/>");
- System.out.println(str);
- }
- }
- } catch (Exception e) {
- e.printStackTrace();
- } finally {
- if (isearcher != null) {
- try {
- isearcher.close();
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
- }
- return sb.toString();
- }
- public static void main(String[] args){
- TestLuceneHighlighter2 t = new TestLuceneHighlighter2();
- String queryString = "*";
- int begin = 0;
- int number = 10;
- t.test(queryString, begin, number);
- }
- }
package demo;

import java.io.StringReader;
import net.paoding.analysis.analyzer.PaodingAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocCollector;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import test.TestLuceneHighlighter2;

/**
 * Searches the index and renders the matching documents with the query terms highlighted.
 *
 * <p>Paging + highlighting example written against Lucene 2.3.2 and Highlighter 2.2.0; both
 * libraries can be upgraded to newer versions.
 */
public class QueryHighLighter {
    private static final String FIELD_TITLE = "title";
    private static final String FIELD_BODY = "body";
    /** Directory that holds the index to search. */
    private static final String INDEX_DIR = "D:\\save";

    /**
     * Creates the analyzer used both to parse queries and to re-tokenize stored text.
     *
     * @return a new Paoding analyzer
     */
    public synchronized Analyzer getAnalyzer() {
        return new PaodingAnalyzer(); // Paoding word segmentation
    }

    /**
     * Searches the {@code title} field and returns one page of highlighted hits as HTML.
     *
     * <p>Failures while searching are printed to standard error and whatever was rendered up to
     * that point is returned.
     *
     * @param queryString query in Lucene query-parser syntax
     * @param begin       zero-based rank of the first hit to return
     * @param number      maximum number of hits to return
     * @return one {@code <li>} element per hit, or an empty string when there is nothing to show
     */
    public String test(String queryString, int begin, int number) {
        StringBuffer sb = new StringBuffer();
        // An empty, negative or overflowing page can never contain a hit.
        if (begin < 0 || number <= 0 || begin > Integer.MAX_VALUE - number) {
            return sb.toString();
        }
        IndexSearcher isearcher = null;
        try {
            isearcher = new IndexSearcher(INDEX_DIR);
            // One analyzer serves both query parsing and the highlighting of every hit.
            Analyzer analyzer = getAnalyzer();
            // Only the title field is searched.
            Query query = new QueryParser(FIELD_TITLE, analyzer).parse(queryString);

            // Collect enough hits to reach the end of the requested page.
            TopDocCollector collector = new TopDocCollector(begin + number);
            isearcher.search(query, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;

            // Markup wrapped around every highlighted term.
            SimpleHTMLFormatter formatter =
                    new SimpleHTMLFormatter("<b><font color='red'>", "</font></b>");
            // Score against the rewritten query so that multi-term queries (wildcard, prefix, ...)
            // are expanded into the concrete terms the highlighter can match.
            Highlighter highlighter =
                    new Highlighter(formatter, new QueryScorer(isearcher.rewrite(query)));
            // A fragment size of Integer.MAX_VALUE keeps the whole field value in one fragment.
            highlighter.setTextFragmenter(new SimpleFragmenter(Integer.MAX_VALUE));

            int end = Math.min(hits.length, begin + number);
            for (int i = begin; i < end; i++) {
                Document doc = isearcher.doc(hits[i].doc);
                String value = doc.get(FIELD_TITLE);
                if (value == null) {
                    continue;
                }
                TokenStream tokenStream = analyzer.tokenStream(FIELD_TITLE, new StringReader(value));
                String str = highlighter.getBestFragment(tokenStream, value);
                if (str == null) {
                    // No query term was found in the text; show it without highlighting.
                    str = value;
                }
                sb.append("<li>").append(str).append("<br/></li>");
                System.out.println(str);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (isearcher != null) {
                try {
                    isearcher.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
        return sb.toString();
    }

    /**
     * Runs a sample search and prints the first ten highlighted hits.
     *
     * @param args optional; the first argument replaces the default query
     */
    public static void main(String[] args) {
        QueryHighLighter t = new QueryHighLighter();
        // NOTE(review): the default "*" starts with a wildcard, which QueryParser presumably
        // rejects unless setAllowLeadingWildcard(true) is called -- confirm, or pass a real term.
        String queryString = args.length > 0 ? args[0] : "*";
        int begin = 0;
        int number = 10;
        t.test(queryString, begin, number);
    }
}
附加上传net\paoding\analysis\Constants.java便于理解参数设置:
- package net.paoding.analysis;
- import java.util.HashMap;
- import java.util.Map;
- import java.util.Properties;
- /**
- *
- * @author Zhiliang Wang [qieqie.wang@gmail.com]
- *
- * @since 2.0.0
- */
- (代码清单在此处被截断,完整内容请参见庖丁解牛源码中的 net\paoding\analysis\Constants.java)
推荐阅读
-
HighLighter 博客分类: java luceneApache.netGmailAnt
-
JVM命令之jstack深入讲解 博客分类: java开发
-
实用的java 串口通信程序 博客分类: java Java数据结构thread
-
Hadoop 博客分类: java HadoopMapreducelucene编程搜索引擎
-
JSR and JCP 博客分类: Java Foundation JavaComet软件测试SUNJ2SE
-
XMPP 博客分类: java 网络协议Google应用服务器网络应用互联网
-
搭建spring mvc rest返回json,xml遇到的问题,及解决办法 博客分类: java
-
JNI完全手册 --摘抄 博客分类: java基础 JNIJavaC#C++C
-
动态调用 简单Java Bean 的get/set 方法 博客分类: java基础 BeanJavaDAOSQLF#
-
GlassFish替换Tomcat 博客分类: Java GlassfishTomcatEclipseJSP浏览器