windows构建网页版搜索引擎 Nutch+Lucene+Mysql+Tomcat(二)
一、前言
通过前一篇的学习,我们学会如何搭建nutch平台,并且知道如何将nutch爬取下来的网站存储到数据库中,效果图如下:
接下来我们将通过mysql存下来的网页,利用lucene构建索引并进行查询。
二、Lucene介绍
Lucene是一个用Java开发的开源全文检索引擎,官网是http://lucene.apache.org/ 。Lucene不是一个完整的全文检索应用(与之对应的完整应用是Solr),而是一个用Java写的全文检索引擎工具包,它可以方便地嵌入到各种应用中,实现针对应用的全文索引/检索功能,更多介绍大家可以自行搜索。
三、关于Lucene软件包的使用
其中最常用的五个文件:
第一个,Lucene-analyzers-common-4.0.0.jar,这里面包含了各种语言的词法分析器,用于对文件内容进行关键字切分,提取。
第二个,也是最重要的,Lucene-core-4.0.0.jar,其中包括了常用的文档,索引,搜索,存储等相关核心代码。
第三个,Lucene-highlighter-4.0.0.jar,这个jar包主要用于搜索出的内容高亮显示。
第四个和第五个,Lucene-queries-4.0.0.jar和Lucene-queryparser-4.0.0.jar,提供了搜索相关的代码,用于各种搜索,比如模糊搜索、范围搜索等等。
除了以上五个文件之外,还需要IKAnalyzer2012_FF.jar,这是一个国人写的中文分词工具,因为Lucene自带的分词器对中文支持不好。注意,这个jar包网上的版本比较乱,它与具体的Lucene版本有关,随便从网上下载的话可能不兼容,初学者建议直接用我demo里面整理好的jar包。
四、Lucene倒排索引的建立
Lucene对于索引的构建是通过倒排的方式进行构建,首先我们先来看看一张图:
从上图可以看出Lucene是如何构建索引的:先对文档内容进行分词,再为每个词记录包含它的文档列表,查询时根据词就可以直接定位到相关文档。
五、 建立索引
import java.io.File;
import java.nio.file.FileSystems;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.wltea.analyzer.lucene.IKAnalyzer;
/**
 * Builds a Lucene index over the files found under {@link #SCAN_PATH} and
 * stores it in {@link #INDEX_PATH}.
 */
public class lucene_index {
    /** Directory in which the Lucene index files are stored. */
    public static final String INDEX_PATH = "D:\\lucene";
    /** Directory that is scanned for files to index; put several files here when testing. */
    public static final String SCAN_PATH = "D:\\text";

    /**
     * Rebuilds the index from scratch: every previously indexed document is
     * removed, then one document is added per file returned by
     * {@code FileUtil.listAllFiles(SCAN_PATH)}.
     *
     * <p>Each document stores four analyzed text fields: {@code content},
     * {@code fileName}, {@code filePath} and {@code updateTime}.
     *
     * <p>Any failure is printed to standard error and not rethrown.
     */
    public void creatIndex()
    {
        // try-with-resources closes the writer, the analyzer and the directory
        // (in that order) even when indexing fails half-way; the original code
        // only closed the writer and leaked the other two.
        try (Directory directory = FSDirectory.open(FileSystems.getDefault().getPath(INDEX_PATH));
             // IK tokenizer for Chinese text; "true" selects its smart segmentation mode.
             // A StandardAnalyzer could be used instead for non-Chinese content.
             Analyzer analyzer = new IKAnalyzer(true);
             IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(analyzer)))
        {
            // Drop whatever was indexed by earlier runs.
            indexWriter.deleteAll();

            // FileUtil is defined elsewhere in the project; per the original
            // comment it returns all files under the directory, sub-directories included.
            List<File> files = FileUtil.listAllFiles(SCAN_PATH);
            for (File file : files)
            {
                String absolutePath = file.getAbsolutePath();
                Document document = new Document();
                document.add(new Field("content", FileUtil.readFile(absolutePath, "utf-8"), TextField.TYPE_STORED));
                document.add(new Field("fileName", file.getName(), TextField.TYPE_STORED));
                document.add(new Field("filePath", absolutePath, TextField.TYPE_STORED));
                document.add(new Field("updateTime", String.valueOf(file.lastModified()), TextField.TYPE_STORED));
                indexWriter.addDocument(document);
            }
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Runs a sample search against the existing index.
     *
     * <p>The index is NOT rebuilt here; call {@code new lucene_index().creatIndex()}
     * first if the index under {@link #INDEX_PATH} does not exist yet.
     */
    public static void main(String args[])
    {
        Search s = new Search();
        s.search("是");
    }
}
FileUtil工具类的实现在我的源代码中。
执行完creatIndex()之后,就会在指定目录下生成索引文件,以后的搜索就靠它们了:
六、简单的搜索
import java.nio.file.FileSystems;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryRescorer;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.wltea.analyzer.lucene.IKAnalyzer;
/**
 * Runs keyword searches against the index created by {@code lucene_index}
 * and prints the hits with highlighted fragments.
 */
public class Search {
    /** Maximum number of characters printed when no highlighted fragment is available. */
    private static final int PREVIEW_LENGTH = 100;

    /**
     * Searches the {@code fileName} and {@code content} fields for the given
     * query string and prints up to 100 hits to standard output. For every hit
     * the file name, the file path and the best highlighted fragment of the
     * content are printed.
     *
     * <p>Any failure is printed to standard error and not rethrown.
     *
     * @param keyWord query text in Lucene query-parser syntax; terms separated
     *                by spaces (e.g. "上海 中国") are parsed as separate terms
     */
    public void search(String keyWord)
    {
        // try-with-resources closes analyzer, reader and directory (in that
        // order) on every path; the original only closed the reader.
        try (Directory directory = FSDirectory.open(FileSystems.getDefault().getPath(lucene_index.INDEX_PATH));
             DirectoryReader directoryReader = DirectoryReader.open(directory);
             // Must be the same kind of analyzer that was used to build the index.
             Analyzer analyzer = new IKAnalyzer(true))
        {
            IndexSearcher indexSearcher = new IndexSearcher(directoryReader);

            // Fields to search; one Occur flag is required per field.
            // MUST = and, MUST_NOT = not, SHOULD = or.
            String[] fields = {"fileName", "content"};
            BooleanClause.Occur[] clauses = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD};
            Query multiFieldQuery = MultiFieldQueryParser.parse(keyWord, fields, clauses, analyzer);

            // Fetch at most the 100 best hits.
            TopDocs topDocs = indexSearcher.search(multiFieldQuery, 100);
            // totalHits counts every matching document in the index, whereas
            // scoreDocs only holds the returned top hits (at most 100 here).
            System.out.println("共找到匹配处:" + topDocs.totalHits);
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;
            System.out.println("共找到匹配文档数:" + scoreDocs.length);

            QueryScorer scorer = new QueryScorer(multiFieldQuery, "content");
            // Custom highlight markup. The CSS property was misspelled
            // "backgroud" before, so browsers ignored the style.
            SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter("<span style=\"background:red\">", "</span>");
            Highlighter highlighter = new Highlighter(htmlFormatter, scorer);
            highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));

            for (ScoreDoc scoreDoc : scoreDocs)
            {
                Document document = indexSearcher.doc(scoreDoc.doc);
                String content = document.get("content");
                // getBestFragment returns null when the content itself has no
                // matching term (e.g. the hit came from fileName only).
                String fragment = content == null
                        ? null
                        : highlighter.getBestFragment(analyzer, "content", content);

                System.out.println("-----------------------------------------");
                System.out.println(document.get("fileName") + ":" + document.get("filePath"));
                System.out.println(fragment != null ? fragment : preview(content));
                System.out.println("");
            }
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
    }

    /** Returns the leading part of {@code text} (empty string for null) as a non-highlighted fallback. */
    private static String preview(String text)
    {
        if (text == null)
        {
            return "";
        }
        return text.length() <= PREVIEW_LENGTH ? text : text.substring(0, PREVIEW_LENGTH);
    }
}
测试
// Test entry point: rebuilds the file index, then searches it.
// The article defines indexing in lucene_index and searching in Search;
// the FileSearchDemo class referenced previously is not defined in this article.
public static void main(String args[])
{
    new lucene_index().creatIndex();
    new Search().search("读取 导出");
}
七、对数据库进行搜索
通过前面几节的介绍,我们已经大致了解了lucene是如何构建索引的,以及简单的搜索功能是如何实现的。接下来我们将用lucene对mysql数据库中的数据构建索引并进行搜索。
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.nio.file.FileSystems;
import java.sql.ResultSet;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.search.spell.SpellChecker;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.wltea.analyzer.lucene.IKAnalyzer;
/**
 * Database search demo based on Lucene 5.5.4: indexes the rows of the
 * {@code webpage} table and runs highlighted multi-field queries against
 * the resulting index.
 *
 * @author liuxianan
 */
public class DbSearchDemo
{
    /** Directory in which the Lucene index for the database content is stored. */
    public static final String INDEX_PATH = "D:\\lucene-db";
    public static final String JDBC_URL = "jdbc:mysql://localhost:3306/nutch?useUnicode=true&characterEncoding=utf-8";
    public static final String USER = "root";
    public static final String PWD = "root";

    /** Maximum number of characters printed when no highlighted fragment is available. */
    private static final int PREVIEW_LENGTH = 100;

    /**
     * Rebuilds the index from scratch: removes every previously indexed
     * document, then adds one document per row of {@code webpage} with the
     * stored text fields {@code baseUrl}, {@code title} and {@code text}.
     * SQL NULL values are indexed as empty strings.
     *
     * <p>Any failure is printed to standard error and not rethrown.
     */
    public void creatIndex()
    {
        JdbcUtil jdbc = null;
        // try-with-resources closes writer, analyzer and directory (in that
        // order) on every path; the original only closed the writer.
        try (Directory directory = FSDirectory.open(FileSystems.getDefault().getPath(INDEX_PATH));
             Analyzer analyzer = new IKAnalyzer(true);
             IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(analyzer)))
        {
            // Drop whatever was indexed by earlier runs.
            indexWriter.deleteAll();

            // If Class.forName("com.mysql.jdbc.Driver") fails, add
            // mysql-connector-java-5.1.18.jar to the project's classpath.
            jdbc = new JdbcUtil(JDBC_URL, USER, PWD);
            ResultSet rs = jdbc.query("select * from webpage");
            while (rs.next())
            {
                Document document = new Document();
                // Each column is guarded by its OWN null check. Previously the
                // baseUrl guard tested "title", so a NULL baseUrl could reach
                // the Field constructor and a NULL title discarded the baseUrl.
                document.add(storedText("baseUrl", rs.getString("baseUrl")));
                document.add(storedText("title", rs.getString("title")));
                document.add(storedText("text", rs.getString("text")));
                // The "tags" and "url" columns are currently not indexed.
                indexWriter.addDocument(document);
            }
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
        finally
        {
            // Release the JDBC resources even when indexing failed.
            // NOTE(review): closeAll() is presumed to close the ResultSet as
            // well as the connection; confirm against JdbcUtil.
            try
            {
                if (jdbc != null)
                {
                    jdbc.closeAll();
                }
            }
            catch (Exception e)
            {
                e.printStackTrace();
            }
        }
    }

    /** Creates a stored, analyzed text field, mapping a null value to the empty string. */
    private static Field storedText(String name, String value)
    {
        return new Field(name, value == null ? "" : value, TextField.TYPE_STORED);
    }

    /**
     * Prints the UTF-8 text file at {@code filePath} line by line to standard
     * output. Prints a message instead when the path is not an existing
     * regular file or when reading fails.
     *
     * <p>NOTE(review): despite its name this method returns the path it was
     * given, not the file's content; kept as is because callers may rely on it.
     *
     * @param filePath path of the file to print
     * @return {@code filePath}, unchanged
     */
    public static String readTxt(String filePath) {
        try {
            File file = new File(filePath);
            if (file.isFile()) {
                // try-with-resources closes the stream even if reading fails.
                try (FileInputStream in = new FileInputStream(file);
                     BufferedReader br = new BufferedReader(new InputStreamReader(in, "utf-8"))) {
                    String lineTxt;
                    while ((lineTxt = br.readLine()) != null) {
                        System.out.println(lineTxt);
                    }
                }
            } else {
                System.out.println("文件不存在!");
            }
        } catch (Exception e) {
            System.out.println("文件读取错误!");
            // Keep the cause visible instead of hiding it behind the message.
            e.printStackTrace();
        }
        return filePath;
    }

    /**
     * Searches the {@code baseUrl}, {@code text} and {@code title} fields for
     * the given query string and prints up to 100 hits to standard output,
     * each with its title, address and the best highlighted fragment of its text.
     *
     * <p>Any failure is printed to standard error and not rethrown.
     *
     * @param keyWord query text in Lucene query-parser syntax; terms separated
     *                by spaces (e.g. "上海 中国") are parsed as separate terms
     */
    public void search(String keyWord)
    {
        try (Directory directory = FSDirectory.open(FileSystems.getDefault().getPath(INDEX_PATH));
             DirectoryReader directoryReader = DirectoryReader.open(directory);
             // Must be the same kind of analyzer that was used to build the index.
             Analyzer analyzer = new IKAnalyzer(true))
        {
            IndexSearcher indexSearcher = new IndexSearcher(directoryReader);

            // Fields to search; one Occur flag is required per field.
            // MUST = and, MUST_NOT = not, SHOULD = or.
            String[] fields = {"baseUrl", "text", "title"};
            BooleanClause.Occur[] clauses = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD};
            Query multiFieldQuery = MultiFieldQueryParser.parse(keyWord, fields, clauses, analyzer);

            // Fetch at most the 100 best hits.
            TopDocs topDocs = indexSearcher.search(multiFieldQuery, 100);
            // totalHits counts every matching document in the index, whereas
            // scoreDocs only holds the returned top hits (at most 100 here).
            System.out.println("共找到匹配处:" + topDocs.totalHits);
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;
            System.out.println("共找到匹配文档数:" + scoreDocs.length);

            QueryScorer scorer = new QueryScorer(multiFieldQuery, "text");
            // The CSS property was misspelled "backgroud" before, so browsers
            // ignored the highlight style.
            SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter("<span style=\"background:red\">", "</span>");
            Highlighter highlighter = new Highlighter(htmlFormatter, scorer);
            highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));

            for (ScoreDoc scoreDoc : scoreDocs)
            {
                Document document = indexSearcher.doc(scoreDoc.doc);
                String text = document.get("text");
                // getBestFragment returns null when the text itself has no
                // matching term (e.g. empty text, or a hit in title/baseUrl only).
                String fragment = text == null
                        ? null
                        : highlighter.getBestFragment(analyzer, "text", text);

                System.out.println("-----------------------------------------");
                System.out.println("文章标题:" + document.get("title"));
                System.out.println("文章地址:" + document.get("baseUrl"));
                System.out.println("文章内容:");
                System.out.println(fragment != null ? fragment : preview(text));
                System.out.println("");
            }
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
    }

    /** Returns the leading part of {@code text} (empty string for null) as a non-highlighted fallback. */
    private static String preview(String text)
    {
        if (text == null)
        {
            return "";
        }
        return text.length() <= PREVIEW_LENGTH ? text : text.substring(0, PREVIEW_LENGTH);
    }

    /** Rebuilds the index from the database, then runs a sample search. */
    public static void main(String args[])
    {
        DbSearchDemo demo = new DbSearchDemo();
        demo.creatIndex();
        demo.search("秦死皇");
    }
}
运行的效果图如下:
八、jsp版搜索构建
关于jsp版的构建,只是将搜索的结果放入到前端,有兴趣的同学可以参考一下我的源代码
效果图如下:
上一篇: hive搭建和基本使用