Lucene---全文检索(处理一对多去重问题 )
程序员文章站
2022-07-09 11:51:11
...
在处理如"问答"功能时,以答案进行搜索,这时就会出现去重问题--->http://www.iteye.com/problems/56869
解决方案:
一,写个线程管理器,用来存储当前查出的重复数据
/*
* CopyRright (c) www.fdauto.com
*/
package com.fdauto.bws.business.module.lucene.duplicate;
import java.util.ArrayList;
import java.util.List;
/**
 * Shares one list of {@link DuplicateModel} per thread via a {@link ThreadLocal},
 * so that DuplicateExtendFilter (which produces duplicate groups) and
 * DuplicateQuery (which consumes them) running on the same search thread can
 * exchange state without explicit wiring.
 *
 * @author 09817(wu_quanyin)
 * @date 2011-01-06
 * @version 1.0
 */
public class DuplicateManager {

    // ThreadLocal invokes initialValue() at most once per thread (on the first
    // get() after creation/remove()), so no synchronization is needed here;
    // the original 'synchronized' modifier was unnecessary and misleading.
    private static final ThreadLocal<List<DuplicateModel>> threadLocalList =
            new ThreadLocal<List<DuplicateModel>>() {
                @Override
                protected List<DuplicateModel> initialValue() {
                    return new ArrayList<DuplicateModel>();
                }
            };

    /** Returns the list bound to the calling thread, creating it on first access. */
    public static List<DuplicateModel> getCurrentThreadList() {
        return threadLocalList.get();
    }

    /**
     * Discards the calling thread's list. Call this when a search finishes;
     * on pooled threads a stale list would otherwise leak into the next task.
     * Safe to call repeatedly - a later get() simply re-creates the list.
     */
    public static void removeCurrentThreadListValue() {
        threadLocalList.remove();
    }

    /** Demo: each thread observes its own independent list. */
    public static void main(String[] args) throws Exception {
        Thread thread1 = new Thread() {
            public void run() {
                DuplicateManager.getCurrentThreadList().add(new DuplicateModel());
                System.out.println(DuplicateManager.getCurrentThreadList().size());
                // remove() twice on purpose: demonstrates it is idempotent.
                DuplicateManager.removeCurrentThreadListValue();
                DuplicateManager.removeCurrentThreadListValue();
                System.out.println("当前线程的值被删除了." + DuplicateManager.getCurrentThreadList().size());
            }
        };
        Thread thread2 = new Thread() {
            public void run() {
                // Runs on a different thread, so it sees an empty, separate list.
                System.out.println(DuplicateManager.getCurrentThreadList().size());
            }
        };
        thread1.start();
        Thread.sleep(1000L);
        thread2.start();
        System.out.println("-------------------------------");
        Runnable runnable1 = new Runnable() {
            public void run() {
                DuplicateManager.getCurrentThreadList().add(new DuplicateModel());
                System.out.println(DuplicateManager.getCurrentThreadList().size());
            }
        };
        // Same Runnable, two threads: each still gets its own list (size 1 both times).
        Thread thread3 = new Thread(runnable1);
        Thread thread4 = new Thread(runnable1);
        thread3.start();
        Thread.sleep(1000L);
        thread4.start();
    }
}
二, 设置一个model存储相同值的doc,并设置开关,只出现一次
/*
* CopyRright (c) www.fdauto.com
*/
package com.fdauto.bws.business.module.lucene.duplicate;
import java.util.ArrayList;
import java.util.List;
/**
 * Groups the Lucene document ids that share one duplicated field value.
 * <p>
 * {@code mainDoc} is the first document encountered for the value; every
 * document carrying the same value (the main one included) is collected in
 * {@code assistantDocs}. The {@code checked} flag records whether this group
 * has already contributed a hit, so each group is returned only once.
 *
 * @author 09817(wu_quanyin)
 * @date 2011-01-06
 * @version 1.0
 */
public class DuplicateModel {

    // First document id seen for the duplicated value.
    private int mainDoc;

    // Flipped to true once any member of the group has been returned;
    // remaining members are then skipped by the query.
    private boolean isChecked = false;

    // Every document id sharing the duplicated value.
    private List<Integer> assistantDocs = new ArrayList<Integer>();

    public DuplicateModel() {
    }

    public DuplicateModel(int mainDoc) {
        this.mainDoc = mainDoc;
    }

    public int getMainDoc() {
        return mainDoc;
    }

    public void setMainDoc(int mainDoc) {
        this.mainDoc = mainDoc;
    }

    public List<Integer> getAssistantDocs() {
        return assistantDocs;
    }

    public void setAssistantDocs(List<Integer> assistantDocs) {
        this.assistantDocs = assistantDocs;
    }

    public boolean isChecked() {
        return isChecked;
    }

    public void setChecked(boolean isChecked) {
        this.isChecked = isChecked;
    }
}
三,扩展duplicatefilter类
/*
* CopyRright (c) www.fdauto.com
*/
package com.fdauto.bws.business.module.lucene.duplicate;
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.OpenBitSet;
/**
 * Collects groups of documents whose {@code fieldName} value is duplicated and
 * stores each group as a {@link DuplicateModel} in the current thread's list
 * (see {@link DuplicateManager}), for later comparison in DuplicateQuery.
 * The returned bit set marks the first document of every term.
 *
 * @author wu_quanyin(09817)
 * @version 1.0
 * @date 2011-01-06
 */
public class DuplicateExtendFilter extends Filter {

    private static final long serialVersionUID = 1601875802819099276L;

    /** Name of the field whose duplicate values are grouped. */
    String fieldName;

    public DuplicateExtendFilter(String fieldName) {
        this.fieldName = fieldName;
    }

    @Override
    public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
        return correctBits(reader);
    }

    private OpenBitSet correctBits(IndexReader reader) throws IOException {
        OpenBitSet bits = new OpenBitSet(reader.maxDoc()); // all bits start unset
        Term startTerm = new Term(fieldName);
        TermEnum te = reader.terms(startTerm);
        if (te != null) {
            try {
                Term currTerm = te.term();
                // Term field names are interned by Lucene, so == comparison is safe.
                while ((currTerm != null) && (currTerm.field() == startTerm.field())) {
                    int firstDoc = -1;
                    TermDocs td = reader.termDocs(currTerm);
                    try {
                        // The first document of every term value is always kept.
                        if (td.next()) {
                            firstDoc = td.doc();
                            bits.set(firstDoc);
                        }
                        // Any further documents mean the value is duplicated:
                        // collect the whole group for DuplicateQuery to arbitrate.
                        DuplicateModel duplicateModel = new DuplicateModel();
                        boolean isDuplicate = false;
                        while (td.next()) {
                            isDuplicate = true;
                            duplicateModel.getAssistantDocs().add(td.doc());
                        }
                        if (isDuplicate) {
                            duplicateModel.setMainDoc(firstDoc);
                            // The group contains the first doc as well.
                            duplicateModel.getAssistantDocs().add(firstDoc);
                            DuplicateManager.getCurrentThreadList().add(duplicateModel);
                        }
                    } finally {
                        td.close(); // BUGFIX: TermDocs was never closed (resource leak)
                    }
                    if (!te.next()) {
                        break;
                    }
                    currTerm = te.term();
                }
            } finally {
                te.close(); // BUGFIX: TermEnum was never closed (resource leak)
            }
        }
        return bits;
    }

    public String getFieldName() {
        return fieldName;
    }

    public void setFieldName(String fieldName) {
        this.fieldName = fieldName;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj)
            return true;
        if ((obj == null) || (obj.getClass() != this.getClass()))
            return false;
        DuplicateExtendFilter other = (DuplicateExtendFilter) obj;
        return (fieldName == other.fieldName || (fieldName != null && fieldName
                .equals(other.fieldName)));
    }

    @Override
    public int hashCode() {
        int hash = 217;
        // BUGFIX: equals() tolerates a null fieldName, so hashCode() must not NPE.
        hash = 31 * hash + (fieldName == null ? 0 : fieldName.hashCode());
        return hash;
    }
}
四,在query中,根据自己的逻辑进行判断,扩展filterquery类
/*
* CopyRright (c) www.fdauto.com
*/
package com.fdauto.bws.business.module.lucene.duplicate;
import java.io.IOException;
import java.util.Iterator;
import java.util.Set;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.FilteredQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.ToStringUtils;
/**
* 与DuplicateExtendFilter配合使用,
* 对有重复的一个个比较过去..
*
* @author 09817(wu_quanyin)
* @date 2010-12-25 上午09:02:44
* @version 1.0
*/
public class DuplicateQuery extends Query {
/**
*
*/
private static final long serialVersionUID = 4610299680666587077L;
Query query;
Filter filter;
/**
* Constructs a new query which applies a filter to the results of the
* original query. Filter.getDocIdSet() will be called every time this query
* is used in a search.
*
* @param query
* Query to be filtered, cannot be <code>null</code>.
* @param filter
* Filter to apply to query results, cannot be <code>null</code>.
*/
public DuplicateQuery(Query query, Filter filter) {
this.query = query;
this.filter = filter;
}
/**
* Returns a Weight that applies the filter to the enclosed query's Weight.
* This is accomplished by overriding the Scorer returned by the Weight.
*/
@Override
public Weight createWeight(final Searcher searcher) throws IOException {
final Weight weight = query.createWeight(searcher);
final Similarity similarity = query.getSimilarity(searcher);
return new Weight() {
private static final long serialVersionUID = 3001781092877864947L;
private float value;
// pass these methods through to enclosed query's weight
@Override
public float getValue() {
return value;
}
@Override
public float sumOfSquaredWeights() throws IOException {
return weight.sumOfSquaredWeights() * getBoost() * getBoost();
}
@Override
public void normalize(float v) {
weight.normalize(v);
value = weight.getValue() * getBoost();
}
@Override
public Explanation explain(IndexReader ir, int i)
throws IOException {
Explanation inner = weight.explain(ir, i);
if (getBoost() != 1) {
Explanation preBoost = inner;
inner = new Explanation(inner.getValue() * getBoost(),
"product of:");
inner.addDetail(new Explanation(getBoost(), "boost"));
inner.addDetail(preBoost);
}
Filter f = DuplicateQuery.this.filter;
DocIdSet docIdSet = f.getDocIdSet(ir);
DocIdSetIterator docIdSetIterator = docIdSet == null ? DocIdSet.EMPTY_DOCIDSET
.iterator()
: docIdSet.iterator();
if (docIdSetIterator == null) {
docIdSetIterator = DocIdSet.EMPTY_DOCIDSET.iterator();
}
if (docIdSetIterator.advance(i) == i) {
return inner;
} else {
Explanation result = new Explanation(0.0f,
"failure to match filter: " + f.toString());
result.addDetail(inner);
return result;
}
}
// return this query
@Override
public Query getQuery() {
return DuplicateQuery.this;
}
// return a filtering scorer
@Override
public Scorer scorer(IndexReader indexReader,
boolean scoreDocsInOrder, boolean topScorer)
throws IOException {
final Scorer scorer = weight.scorer(indexReader, true, false);
if (scorer == null) {
return null;
}
DocIdSet docIdSet = filter.getDocIdSet(indexReader);
if (docIdSet == null) {
return null;
}
final DocIdSetIterator docIdSetIterator = docIdSet.iterator();
if (docIdSetIterator == null) {
return null;
}
return new Scorer(similarity) {
private int doc = -1;
private int advanceToCommon(int scorerDoc, int disiDoc)
throws IOException {
while (scorerDoc != disiDoc) {
if (scorerDoc < disiDoc) {
scorerDoc = scorer.advance(disiDoc);
} else {
disiDoc = docIdSetIterator.advance(scorerDoc);
}
}
return scorerDoc;
}
@Override
public int nextDoc() throws IOException {
int scorerDoc = -1;
//对每一个重复的进行比较
while ((scorerDoc = scorer.nextDoc()) != NO_MORE_DOCS) {
boolean ignoreDocment=false;
for (Iterator<DuplicateModel> duplicateIterator = DuplicateManager
.getCurrentThreadList().iterator(); duplicateIterator
.hasNext();) {
DuplicateModel duplicateModel = duplicateIterator.next();
if(duplicateModel.getAssistantDocs()
.contains(scorerDoc)){
if(!duplicateModel.isChecked()){
duplicateModel.setChecked(true);
return scorerDoc;
}else{
//如果已经被检查过的忽略
ignoreDocment=true;
break;
}
}
}
// 如果该doc已经加入过,则忽略
if (ignoreDocment) {
continue;
}
return scorerDoc;
}
return NO_MORE_DOCS;
}
@Override
public int docID() {
return doc;
}
@Override
public int advance(int target) throws IOException {
int disiDoc, scorerDoc;
return doc = (disiDoc = docIdSetIterator
.advance(target)) != NO_MORE_DOCS
&& (scorerDoc = scorer.advance(disiDoc)) != NO_MORE_DOCS
&& advanceToCommon(scorerDoc, disiDoc) != NO_MORE_DOCS ? scorer
.docID()
: NO_MORE_DOCS;
}
@Override
public float score() throws IOException {
return getBoost() * scorer.score();
}
};
}
};
}
/** Rewrites the wrapped query. */
@Override
public Query rewrite(IndexReader reader) throws IOException {
Query rewritten = query.rewrite(reader);
if (rewritten != query) {
DuplicateQuery clone = (DuplicateQuery) this.clone();
clone.query = rewritten;
return clone;
} else {
return this;
}
}
public Query getQuery() {
return query;
}
public Filter getFilter() {
return filter;
}
// inherit javadoc
@Override
public void extractTerms(Set<Term> terms) {
getQuery().extractTerms(terms);
}
/** Prints a user-readable version of this query. */
@Override
public String toString(String s) {
StringBuilder buffer = new StringBuilder();
buffer.append("filtered(");
buffer.append(query.toString(s));
buffer.append(")->");
buffer.append(filter);
buffer.append(ToStringUtils.boost(getBoost()));
return buffer.toString();
}
/** Returns true iff <code>o</code> is equal to this. */
@Override
public boolean equals(Object o) {
if (o instanceof FilteredQuery) {
DuplicateQuery fq = (DuplicateQuery) o;
return (query.equals(fq.query) && filter.equals(fq.filter) && getBoost() == fq
.getBoost());
}
return false;
}
/** Returns a hash code value for this object. */
@Override
public int hashCode() {
return query.hashCode() ^ filter.hashCode()
+ Float.floatToRawIntBits(getBoost());
}
}
使用:
DuplicateExtendFilter filter = new DuplicateExtendFilter(
uniqueField);
duplicateQuery = new DuplicateQuery(query, filter);
下一篇: lucene基础