lucene搜索
程序员文章站
2022-07-08 19:32:00
...
这里的 Lucene 示例根据数据库表(SEARCHLINK)中保存的网址链接抓取网页,并为抓取到的内容生成索引。
定时任务配置文件(Spring + Quartz)
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE beans PUBLIC "-//SPRING//DTD BEAN//EN" "http://www.springframework.org/dtd/spring-beans.dtd">
<!-- blogindextop -->
<beans>
<!-- (Loads the timers) Scheduler factory: registers the triggers listed below. -->
<!-- NOTE(review): TranslateCheckInfoTrigger is referenced here but is not defined in this excerpt; confirm it is declared elsewhere. -->
<bean class="org.springframework.scheduling.quartz.SchedulerFactoryBean">
<property name="triggers">
<list>
<ref bean="TaskStatTrigger" />
<ref bean="TranslateCheckInfoTrigger" />
</list>
</property>
</bean>
<!-- (Timer) Cron trigger that runs its job once a day at a fixed time. -->
<bean id="TaskStatTrigger" class="org.springframework.scheduling.quartz.CronTriggerBean">
<property name="jobDetail">
<!-- Job: invokes translate() on the TranslateLoadAttendanceInfoService bean; concurrent=false keeps runs from overlapping. -->
<bean class="org.springframework.scheduling.quartz.MethodInvokingJobDetailFactoryBean">
<property name="targetObject" ref="TranslateLoadAttendanceInfoService" />
<property name="targetMethod" value="translate" />
<property name="concurrent" value="false" />
</bean>
</property>
<!-- Quartz cron fields, in order: seconds, minutes, hours (24-hour clock),
day of month, month, day of week, optional year.
"?" means no specific value (no constraint on that day field); "*" means every value
(in the month field: every month from 1 to 12); FRI is Friday.
Examples:
"0 15 10 * * ? *" every day at 10:15:00
"0 15 10 ? * MON-FRI" Monday through Friday at 10:15:00
"34 10 9 ? * SUN" every Sunday at 09:10:34
"34 10 9 5 * ?" the 5th of every month at 09:10:34
The expression configured below fires every day at 00:00:00.
-->
<property name="cronExpression" value="0 00 00 * * ?" />
</bean>
<!-- (Timer) Scheduled service class: adds attendance personnel. -->
<!-- The parent bean txProxyTemplate is not defined in this excerpt; presumably it wraps the target in a transactional proxy (confirm). -->
<bean id="TranslateLoadAttendanceInfoService" parent="txProxyTemplate">
<property name="target">
<!-- Target implementation, wired with the three DAOs it uses. -->
<bean class="com.sdfxw.office.service.TranslateLoadAttendanceInfoServiceImp">
<property name="attendancelDAO">
<ref bean="AttendancelDAO" />
</property>
<property name="personNelInfoDAO">
<ref bean="PersonNelInfoDAO" />
</property>
<property name="attendanceRuleDAO">
<ref bean="AttendanceRuleDAO" />
</property>
</bean>
</property>
</bean>
struts配置文件
<!-- Search engine: /search is handled by SearchAction, a DispatchAction; the request parameter named by parameter="actionName" selects the method to run. -->
<action parameter="actionName" path="/search" type="com.sdfxw.search.action.SearchAction" >
<!-- Result page for the page-index search. -->
<forward name="searchview" path="/jsp/search/search.jsp" />
<!-- Result page for the custom-index search. -->
<forward name="customview" path="/jsp/search/custom.jsp" />
</action>
建表语句
-- Create table
-- SEARCHLINK holds the links that the indexer fetches and indexes.
-- NOTE(review): no primary key is declared although LINKID is used as the key column by the DAO; confirm one is added elsewhere.
create table SEARCHLINK
(
-- Row identifier (required).
LINKID VARCHAR2(50) not null,
-- Link to fetch; the indexer prepends a configured prefix to it before requesting the page.
URL VARCHAR2(200),
-- Model code the link belongs to.
MODEL VARCHAR2(50)
)
先做action
package com.sdfxw.search.action;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import org.apache.struts.action.ActionForm;
import org.apache.struts.action.ActionForward;
import org.apache.struts.action.ActionMapping;
import org.apache.struts.actions.DispatchAction;
import org.carf.common.spring.ApplicationFactory;
import org.carf.util.common.ParameterUtil;
import org.carf.util.page.PageViewContext;
import com.sdfxw.search.service.SearchService;
/**
 * Struts dispatch action behind the site search pages.
 *
 * <p>Both handlers read the {@code keyword} and {@code page} request
 * parameters, build a Lucene query string, run it through
 * {@link SearchService} and expose the resulting page under the request
 * attribute {@code "PP"}.
 *
 * <p>NOTE(review): the keyword is concatenated into the query string as-is,
 * so a blank keyword or one containing query-syntax characters is passed
 * straight to the query parser; confirm how the JSPs and
 * {@code ParameterUtil} deal with that before adding a guard here.
 */
public class SearchAction extends DispatchAction
{
    /** Number of hits requested per result page. */
    private static final int PAGE_SIZE = 20;

    /**
     * Searches the page index on the {@code Content} field.
     *
     * @return the {@code searchview} forward
     * @throws Exception whatever the service lookup or the query raises
     */
    public ActionForward search(ActionMapping mapping, ActionForm form,
            HttpServletRequest request, HttpServletResponse response) throws Exception
    {
        final String keyword = ParameterUtil.getParameter(request, "keyword");
        final String page = ParameterUtil.getParameter(request, "page");
        final SearchService searchService = lookupSearchService();

        final PageViewContext result =
                searchService.query("Content:" + keyword, PAGE_SIZE, page);
        request.setAttribute("PP", result);
        return mapping.findForward("searchview");
    }

    /**
     * Searches the custom index on the {@code Content} and {@code Caption}
     * fields.
     *
     * @return the {@code customview} forward
     * @throws Exception whatever the service lookup or the query raises
     */
    public ActionForward custom(ActionMapping mapping, ActionForm form,
            HttpServletRequest request, HttpServletResponse response) throws Exception
    {
        final String keyword = ParameterUtil.getParameter(request, "keyword");
        final String page = ParameterUtil.getParameter(request, "page");
        final SearchService searchService = lookupSearchService();

        final String luceneQuery = "Content:" + keyword + " OR Caption:" + keyword;
        request.setAttribute("PP",
                searchService.querycustom(luceneQuery, PAGE_SIZE, page));
        return mapping.findForward("customview");
    }

    /** Fetches the search service from the application factory. */
    private static SearchService lookupSearchService()
    {
        return (SearchService) ApplicationFactory.getService("SearchService");
    }
}
service代码(生成索引,搜索索引)
package com.sdfxw.search.service;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.demo.html.HTMLParser;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.carf.util.common.WebFormatter;
import org.carf.util.page.PageViewContext;
import org.carf.util.page.PageViewUtil;
import org.springframework.core.io.Resource;
import com.sdfxw.search.dao.URLDao;
public class SearchService
{
// Model codes. Declared static so they exist once per class instead of once
// per service instance; access through an instance keeps compiling.
// NOTE(review): presumably these match the values stored in SEARCHLINK.MODEL;
// confirm against the data.
public static final String MODEL_SAFE = "1";
public static final String MODEL_EXPERT = "13";
public static final String MODEL_PRODUCT = "2";
public static final String MODEL_ING = "31";
public static final String MODEL_CHANCE = "41";
public static final String MODEL_STORY = "42";
public static final String MODEL_ANGEL = "43";
public static final String MODEL_MONTHLY = "53";
public static final String MODEL_JOB = "75";
public static final String MODEL_CUSTOM = "16";
public static final String MODEL_MEDIUM = "76";
public static final String MODEL_AGENCY = "77";
public static final String MODEL_DOWNLOAD = "78";

// Location of the live page index. Sibling directories are derived from it by
// suffix: "_tmp", "_custom" and "_customtmp".
private Resource indexDir;
// Prefix prepended to every stored link before the page is requested.
private String indexPrefix;
// DAO for the SEARCHLINK table.
private URLDao urldao;
// Characters and operators that are special in Lucene query syntax. The
// backslash comes first so that backslashes added while escaping the later
// entries are not escaped a second time.
private static final String[] specialChar = new String[] { "\\", "+", "-", "&&",
        "||", "!", "(", ")", "{", "}", "[", "]", "^", "\"", "~", "*", "?", ":" };
/**
 * Runs a query against the page index and returns one page of hits.
 *
 * @param queryStr query in Lucene query-parser syntax; terms without a field
 *                 prefix are searched in {@code Content}
 * @param pageSize page size, passed through to {@code PageViewUtil}
 * @param pageNum  requested page, passed through to {@code PageViewUtil}
 * @return the page view built from the hits
 * @throws IOException    if the index cannot be opened or read
 * @throws ParseException if {@code queryStr} is not a valid query
 */
public PageViewContext query(String queryStr, int pageSize, String pageNum)
        throws IOException, ParseException
{
    IndexSearcher indexSearcher = new IndexSearcher(indexDir.getFile().getPath());
    try
    {
        QueryParser queryParser = new QueryParser("Content", new StandardAnalyzer());
        Query query = queryParser.parse(queryStr);
        Hits hits = indexSearcher.search(query);
        // The page view is built while the searcher is still open, exactly as
        // before; only the close moved into the finally block.
        return PageViewUtil.getPageViewContext(pageSize, pageNum, hits);
    }
    finally
    {
        // Previously skipped whenever parsing or searching threw, which left
        // the index files open.
        indexSearcher.close();
    }
}
/**
 * Runs a query against the custom index (the index directory path with the
 * suffix {@code "_custom"}) and returns one page of hits.
 *
 * @param queryStr query in Lucene query-parser syntax; terms without a field
 *                 prefix are searched in {@code Content}
 * @param pageSize page size, passed through to {@code PageViewUtil}
 * @param pageNum  requested page, passed through to {@code PageViewUtil}
 * @return the page view built from the hits
 * @throws IOException    if the index cannot be opened or read
 * @throws ParseException if {@code queryStr} is not a valid query
 */
public PageViewContext querycustom(String queryStr, int pageSize, String pageNum)
        throws IOException, ParseException
{
    IndexSearcher indexSearcher = new IndexSearcher(indexDir.getFile().getPath()
            + "_custom");
    try
    {
        QueryParser queryParser = new QueryParser("Content", new StandardAnalyzer());
        Query query = queryParser.parse(queryStr);
        Hits hits = indexSearcher.search(query);
        // Built while the searcher is still open, as before.
        return PageViewUtil.getPageViewContext(pageSize, pageNum, hits);
    }
    finally
    {
        // Previously skipped whenever parsing or searching threw, which left
        // the index files open.
        indexSearcher.close();
    }
}
/**
 * Rebuilds the page index from every row returned by the URL DAO.
 *
 * <p>Documents are written to a temporary directory (index path plus
 * {@code "_tmp"}); once all links have been processed the live index
 * directory is deleted and the temporary one is renamed into its place.
 * A failure on one link is logged and does not stop the others. Nothing is
 * thrown for I/O problems; they are reported on {@code System.out}.
 */
public void createIndex()
{
    List list = urldao.getURL();

    File liveDir;
    File tmpDir;
    try
    {
        liveDir = indexDir.getFile();
        tmpDir = new File(liveDir.getPath() + "_tmp");
        // A temporary directory left behind by a failed run would otherwise
        // be appended to, duplicating every document in the new index.
        if (tmpDir.exists())
        {
            FileUtils.forceDelete(tmpDir);
        }
    }
    catch (Exception e)
    {
        e.printStackTrace(System.out);
        return;
    }

    for (int i = 0; i < list.size(); i++)
    {
        Map map = (Map) list.get(i);
        String urlstr = (String) map.get("URL");
        // NOTE(review): the key is "Model" here but "MODEL" in
        // insertOrupdateByUrl; verify which case the row maps use.
        String model = (String) map.get("Model");
        try
        {
            createOneItem(urlstr, model);
        }
        catch (Exception e)
        {
            e.printStackTrace(System.out);
        }
    }

    // Never remove the live index unless there is something to replace it.
    if (!tmpDir.exists())
    {
        System.out.println("createIndex: no temporary index at " + tmpDir.getPath()
                + ", keeping the existing index");
        return;
    }
    try
    {
        if (liveDir.exists())
        {
            // A failure here used to be swallowed silently; it is now
            // reported by the catch below and the rename is skipped.
            FileUtils.forceDelete(liveDir);
        }
        if (!tmpDir.renameTo(liveDir))
        {
            System.out.println("createIndex: could not rename " + tmpDir.getPath()
                    + " to " + liveDir.getPath());
        }
    }
    catch (Exception e)
    {
        e.printStackTrace(System.out);
    }
}
/**
 * Fetches one link and adds it as a document to the temporary page index
 * (index path plus {@code "_tmp"}).
 *
 * <p>The document stores {@code URL}, {@code Title}, {@code Summary} (first
 * 200 characters of the whitespace-stripped text) and, when present,
 * {@code Model}; {@code Content} is indexed but not stored. A failure while
 * fetching or parsing the page is logged and the document is skipped.
 *
 * <p>NOTE(review): a writer is opened, optimized and closed for every single
 * document, which is expensive; batching it belongs to the caller and is
 * left unchanged here. The page bytes are decoded with the platform default
 * charset.
 *
 * @param urlstr link relative to the index prefix; a leading slash is added
 *               when missing and backslashes are turned into slashes
 * @param model  model code to store with the document, may be {@code null}
 * @throws Exception if the index directory cannot be resolved or the writer
 *                   cannot be opened or closed
 */
private void createOneItem(String urlstr, String model) throws Exception
{
    Analyzer luceneAnalyzer = new StandardAnalyzer();
    String path = indexDir.getFile().getPath() + "_tmp";
    // Create the index for the first document, append for the following ones.
    boolean create = !indexExist(path);
    IndexWriter indexWriter = new IndexWriter(path, luceneAnalyzer, create);
    try
    {
        indexWriter.setMergeFactor(1500);
        Document doc = new Document();
        if (!urlstr.startsWith("\\") && !urlstr.startsWith("/"))
            urlstr = "/" + urlstr;
        urlstr = StringUtils.replace(urlstr, "\\", "/");
        Field f_url = new Field("URL", urlstr, Field.Store.YES,
                Field.Index.UN_TOKENIZED);
        try
        {
            // Fetch the page once and release the response stream; the old
            // code requested every page twice and never closed either stream.
            String html;
            InputStream is = getHtmlStream(urlstr);
            try
            {
                html = IOUtils.toString(is);
            }
            finally
            {
                IOUtils.closeQuietly(is);
            }
            String content = WebFormatter.html2text(html);
            content = StringUtils.deleteWhitespace(content);
            String summary = StringUtils.abbreviate(content, 200);
            // The title parser reads the markup already in memory.
            HTMLParser parser = new HTMLParser(IOUtils.toInputStream(html));
            Field f_title = new Field("Title", parser.getTitle(), Field.Store.YES,
                    Field.Index.TOKENIZED);
            Field f_summary = new Field("Summary", summary, Field.Store.YES,
                    Field.Index.UN_TOKENIZED);
            Field f_content = new Field("Content", content, Field.Store.NO,
                    Field.Index.TOKENIZED);
            doc.add(f_url);
            doc.add(f_title);
            // The Model field used to be built but never added, and a null
            // model made the whole document fail inside this try block.
            if (model != null)
            {
                doc.add(new Field("Model", model, Field.Store.YES,
                        Field.Index.UN_TOKENIZED));
            }
            doc.add(f_summary);
            doc.add(f_content);
            indexWriter.addDocument(doc);
            indexWriter.optimize();
        }
        catch (Exception e)
        {
            e.printStackTrace(System.out);
        }
    }
    finally
    {
        indexWriter.close();
    }
}
/**
 * Requests {@code indexPrefix + urlstr} with an HTTP POST and hands back the
 * response body stream. The target address is echoed to {@code System.out}
 * before the request is sent.
 *
 * @param urlstr link to append to the index prefix
 * @return the response body stream as returned by the HTTP method
 * @throws Exception if executing the request fails
 */
private InputStream getHtmlStream(String urlstr) throws Exception
{
    HttpClient client = new HttpClient();
    final String target = indexPrefix + urlstr;
    System.out.println(target);
    PostMethod request = new PostMethod(target);
    client.executeMethod(request);
    return request.getResponseBodyAsStream();
}
/**
 * Tells whether a Lucene index is present in the given directory.
 *
 * @param indexDir path of the directory to check
 * @return the result of {@code IndexReader.indexExists} for that path
 */
public boolean indexExist(String indexDir)
{
    final boolean present = IndexReader.indexExists(indexDir);
    return present;
}
/**
 * Prefixes every Lucene query-syntax special sequence in {@code str} with a
 * backslash.
 *
 * <p>Strings are immutable, so the value returned by each replacement has to
 * be carried into the next one. The previous version discarded it and
 * returned the input unchanged. The entries of {@code specialChar} are
 * applied in array order.
 *
 * @param str text to escape, may be {@code null}
 * @return the escaped text, or {@code null} for {@code null} input
 */
private String EscapSpecialChar(String str)
{
    String escaped = str;
    for (int i = 0; i < specialChar.length; i++)
    {
        escaped = StringUtils.replace(escaped, specialChar[i], "\\" + specialChar[i]);
    }
    return escaped;
}
/**
 * Builds a query clause of the form {@code name:"value"}, passing the value
 * through {@code EscapSpecialChar} first.
 *
 * @param fName  field name
 * @param fValue field value
 * @return the assembled clause
 */
public String genFiled(String fName, String fValue)
{
    StringBuffer clause = new StringBuffer();
    clause.append(fName);
    clause.append(":\"");
    clause.append(EscapSpecialChar(fValue));
    clause.append("\"");
    return clause.toString();
}
/** Returns the prefix that is prepended to each link before it is fetched. */
public String getIndexPrefix()
{
return indexPrefix;
}
/** Sets the prefix that is prepended to each link before it is fetched. */
public void setIndexPrefix(String indexPrefix)
{
this.indexPrefix = indexPrefix;
}
/** Returns the DAO used for the SEARCHLINK table. */
public URLDao getUrldao()
{
return urldao;
}
/** Sets the DAO used for the SEARCHLINK table. */
public void setUrldao(URLDao urldao)
{
this.urldao = urldao;
}
/** Sets the resource that locates the live index directory (there is no getter). */
public void setIndexDir(Resource indexDir)
{
this.indexDir = indexDir;
}
/**
 * Stores a new link row through the DAO.
 *
 * @param modelID value stored under the key {@code "Model"}
 * @param url     value stored under the key {@code "URL"}
 */
public void insertSearchLink(String modelID, String url)
{
    final Map row = new HashMap();
    row.put("Model", modelID);
    row.put("URL", url);
    urldao.insert(row);
}
/**
 * Removes the link rows of one model by delegating to the DAO.
 *
 * @param modelID model code whose rows are deleted
 */
public void deleteSearchLink(String modelID)
{
    urldao.deleteByModelID(modelID);
}
/**
 * Rebuilds the custom index from the given rows.
 *
 * <p>Each row is a {@code Map} read with the keys {@code ID},
 * {@code Caption}, {@code Content} and {@code IssueDate}. The index is
 * written to a temporary directory (index path plus {@code "_customtmp"})
 * and then swapped in as index path plus {@code "_custom"}. Problems during
 * the swap are reported on {@code System.out} and not thrown.
 *
 * @param list rows to index
 * @throws Exception if the index directory cannot be resolved or writing the
 *                   temporary index fails
 */
public void createCustomIndex(List list) throws Exception
{
    Analyzer luceneAnalyzer = new StandardAnalyzer();
    String basePath = indexDir.getFile().getPath();
    String temppath = basePath + "_customtmp";
    String path = basePath + "_custom";
    // Always start from an empty temporary index. Appending to a directory
    // left over from a failed swap would duplicate every document.
    IndexWriter indexWriter = new IndexWriter(temppath, luceneAnalyzer, true);
    try
    {
        indexWriter.setMergeFactor(1500);
        for (int i = 0; i < list.size(); i++)
        {
            Map map = (Map) list.get(i);
            // A missing caption or body is indexed as empty text instead of
            // aborting the whole rebuild.
            String caption = StringUtils.defaultString((String) map.get("Caption"));
            String content = StringUtils.defaultString((String) map.get("Content"));
            String id = (String) map.get("ID");
            Object issueDate = map.get("IssueDate");
            // Keep at most the first 10 characters (the date part); shorter
            // values no longer raise StringIndexOutOfBoundsException.
            String issueDateStr = "";
            if (issueDate != null)
            {
                issueDateStr = StringUtils.left(issueDate.toString(), 10);
            }
            Document doc = new Document();
            doc.add(new Field("ID", id, Field.Store.YES,
                    Field.Index.UN_TOKENIZED));
            doc.add(new Field("Caption", caption, Field.Store.YES,
                    Field.Index.TOKENIZED));
            doc.add(new Field("Content", content, Field.Store.NO,
                    Field.Index.TOKENIZED));
            doc.add(new Field("IssueDate", issueDateStr, Field.Store.YES,
                    Field.Index.UN_TOKENIZED));
            indexWriter.addDocument(doc);
        }
        // Optimize once after all documents are added, not once per document.
        indexWriter.optimize();
    }
    finally
    {
        indexWriter.close();
    }
    File ftemp = new File(temppath);
    File f = new File(path);
    try
    {
        if (f.exists())
        {
            // A failure here used to be swallowed silently; it is now
            // reported by the catch below and the rename is skipped.
            FileUtils.forceDelete(f);
        }
        if (!ftemp.renameTo(f))
        {
            System.out.println("createCustomIndex: could not rename " + temppath
                    + " to " + path);
        }
    }
    catch (Exception e)
    {
        e.printStackTrace(System.out);
    }
}
/**
 * Returns the link rows for the given URL.
 *
 * <p>The parameter used to be ignored and every row was returned. A blank
 * {@code url} still returns all rows, so callers that relied on that with
 * an empty argument keep working.
 *
 * @param url URL to look up; {@code null} or blank selects all rows
 * @return the rows found by the DAO
 */
public List getURL(String url)
{
    if (StringUtils.isBlank(url))
    {
        return urldao.getURL();
    }
    return urldao.getURL(url);
}
/**
 * Hands the given row map to the DAO for an update.
 *
 * @param map row values, passed through unchanged
 */
public void update(Map map)
{
    this.urldao.update(map);
}
/**
 * Inserts a link row when the URL is not stored yet, otherwise updates the
 * first row found for it.
 *
 * <p>The lookup now passes the URL unchanged. The former rewrite of
 * {@code &} into {@code '||chr(38)||'} only works when the text is pasted
 * raw into a SQL string, but {@code URLDao.getURL(String)} escapes single
 * quotes first. The rewritten value was therefore compared literally, never
 * matched, and URLs containing {@code &} were inserted again on every call.
 *
 * @param url   URL to insert or update
 * @param model model code for a new row; when blank nothing is inserted
 */
public void insertOrupdateByUrl(String url, String model)
{
    List results = urldao.getURL(url);
    boolean found = results != null && results.size() > 0;
    if (!found && StringUtils.isNotBlank(model))
    {
        Map map = new HashMap();
        map.put("URL", url);
        map.put("MODEL", model);
        urldao.insert(map);
    }
    else if (found && StringUtils.isNotBlank(url))
    {
        Map map = (Map) results.get(0);
        map.put("URL", url);
        urldao.update(map);
    }
}
/**
 * Deletes the link rows stored for a URL.
 *
 * <p>Every {@code &} is rewritten to {@code '||chr(38)||'} before the value
 * is handed to the DAO.
 * NOTE(review): this looks like a workaround tied to how the DAO builds its
 * DELETE statement; keep the two in step if either one changes.
 *
 * @param url URL whose rows are removed
 */
public void deleteByUrl(String url)
{
    final String rewritten = StringUtils.replace(url, "&", "'||chr(38)||'");
    urldao.deleteByUrl(rewritten);
}
}
dao层代码
package com.sdfxw.search.dao;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang.StringEscapeUtils;
import org.carf.common.jdbc.JdbcDaoSupportEx;
import org.carf.common.jdbc.JdbcTemplateEx;
/**
 * JDBC access to the SEARCHLINK table (columns LINKID, URL, MODEL).
 */
public class URLDao extends JdbcDaoSupportEx
{
    /**
     * Turns the legacy {@code '||chr(38)||'} fragment back into {@code &}.
     * Some callers rewrite ampersands that way because the URL used to be
     * pasted raw into the SQL text.
     */
    private static String restoreAmpersand(String url)
    {
        if (url == null)
        {
            return null;
        }
        return url.replaceAll("'\\|\\|chr\\(38\\)\\|\\|'", "&");
    }

    /**
     * Returns every row of SEARCHLINK.
     *
     * @return one map per row
     */
    public List getURL()
    {
        JdbcTemplateEx template = this.getJdbcTemplate();
        String sql = "select * from SEARCHLINK";
        return template.queryForList(sql);
    }

    /**
     * Returns the rows whose URL equals the given value.
     *
     * <p>A legacy-rewritten ampersand is restored before the value is
     * escaped; left as-is it would be compared literally and never match.
     * NOTE(review): a bind parameter would be preferable to quote escaping
     * if {@code JdbcTemplateEx} offers a matching {@code queryForList}
     * overload.
     *
     * @param url URL to look up
     * @return one map per matching row
     */
    public List getURL(String url)
    {
        JdbcTemplateEx template = this.getJdbcTemplate();
        String sql = "select * from SEARCHLINK where URL ='"
                + StringEscapeUtils.escapeSql(restoreAmpersand(url)) + "'";
        return template.queryForList(sql);
    }

    /**
     * Inserts a row built from the map. The map is modified: LINKID is set
     * to "-2" before the insert.
     * NOTE(review): presumably "-2" asks the template to generate the key;
     * confirm in {@code JdbcTemplateEx.insertMap}.
     *
     * @param map column values for the new row
     */
    public void insert(Map map)
    {
        JdbcTemplateEx template = this.getJdbcTemplate();
        map.put("LINKID", "-2");
        template.insertMap(map, "SEARCHLINK", "LINKID");
    }

    /**
     * Updates a row from the map, passing LINKID as the key column name.
     *
     * @param map column values including LINKID
     */
    public void update(Map map)
    {
        JdbcTemplateEx template = this.getJdbcTemplate();
        template.updateMap(map, "SEARCHLINK", "LINKID");
    }

    /**
     * Deletes the rows whose URL equals the given value.
     *
     * <p>The URL is sent as a bind parameter; it used to be concatenated
     * into the statement, so a quote in the URL broke or altered the SQL.
     * Callers that still pass the {@code '||chr(38)||'} rewrite are handled
     * by restoring the ampersand first.
     *
     * @param url URL whose rows are deleted
     */
    public void deleteByUrl(String url)
    {
        JdbcTemplateEx template = this.getJdbcTemplate();
        String sql = "DELETE FROM SEARCHLINK WHERE URL=?";
        template.update(sql, new Object[] { restoreAmpersand(url) });
    }

    /**
     * Deletes the rows of one model.
     *
     * @param modelID model code whose rows are deleted
     */
    public void deleteByModelID(String modelID)
    {
        JdbcTemplateEx template = this.getJdbcTemplate();
        String sql = "DELETE FROM SEARCHLINK WHERE Model=?";
        template.update(sql, new Object[] { modelID });
    }
}
上一篇: PHP获取文件行数的方法
下一篇: 痔疮痒该怎么办 快速治疗痔疮痒的方法