Lucene/Solr(5.0) 源码初探- Lucene Facet SortedSetDocValues (三) Lucene/Solr 4.0+FacetSortedSetDocValues
程序员文章站
2024-03-18 20:42:40
...
前面粗略研究了SortedSetDocValues如何index,这章研究粗略看下如何在搜索过程中做facet,还是以lucene 5.0自带的例子做为开头:
先看看SortedSetDocValuesReaderState 都做了什么?
回想下前面讲过,所有的dim和label会拼接在一起生成一个长字符串存储,在例子中,用户一般是根据其自定义的dim来查找比如例子里面根据"Author" 或者"Publish Year"来查找,所以在读取term的的时候要确定前几个term是属于同一个dim(例如"Author"),prefixToOrdRange 就是为了确定每一个dim在所有的存储docvalues的范围,以便做后续统计。
再来看SortedSetDocValuesFacetCounts 做了什么:
SortedSetDocValues 的初始话过程主要就是每个term的频率统计,深入看看segValues的查找每个term的结构:
最后看归并的过程:
到此所有的SortedSetDocValues 查找和索引都基本分析完结了。
//SimpleSortedSetFacetsExample private List<FacetResult> search() throws IOException { DirectoryReader indexReader = DirectoryReader.open(indexDir); IndexSearcher searcher = new IndexSearcher(indexReader); /**建立SortedSetDocValuesReader 实例,并且将SortedSetDocValues 做了一部分预处理*/ SortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState(indexReader); // Aggregatses the facet counts FacetsCollector fc = new FacetsCollector(); // MatchAllDocsQuery is for "browsing" (counts facets // for all non-deleted docs in the index); normally // you'd use a "normal" query: //先对所有的文档进行搜索,拿出所有的文档id,再来进行facet FacetsCollector.search(searcher, new MatchAllDocsQuery(), 10, fc); // Retrieve results //对每个文档的$facets field 值进行统计 Facets facets = new SortedSetDocValuesFacetCounts(state, fc); List<FacetResult> results = new ArrayList<>(); //开始对特定条件的facet进行归并 results.add(facets.getTopChildren(10, "Author")); results.add(facets.getTopChildren(10, "Publish Year")); indexReader.close(); return results; }
先看看SortedSetDocValuesReaderState 都做了什么?
/** Creates this, pulling doc values from the specified * field. */ public DefaultSortedSetDocValuesReaderState(IndexReader reader, String field) throws IOException { this.field = field; this.origReader = reader; // We need this to create thread-safe MultiSortedSetDV // per collector: topReader = SlowCompositeReaderWrapper.wrap(reader); /**读取上章存储的数据字典集主要包括上章提及的OrdMap,ordinalCountsm,OrdinalIndex*/ SortedSetDocValues dv = topReader.getSortedSetDocValues(field); if (dv == null) { throw new IllegalArgumentException("field \"" + field + "\" was not indexed with SortedSetDocValues"); } if (dv.getValueCount() > Integer.MAX_VALUE) { throw new IllegalArgumentException("can only handle valueCount < Integer.MAX_VALUE; got " + dv.getValueCount()); } //已经存储多少个docvalues(去重) valueCount = (int) dv.getValueCount(); // TODO: we can make this more efficient if eg we can be // "involved" when OrdinalMap is being created? Ie see // each term/ord it's assigning as it goes... String lastDim = null; int startOrd = -1; // TODO: this approach can work for full hierarchy?; // TaxoReader can't do this since ords are not in // "sorted order" ... but we should generalize this to // support arbitrary hierarchy: //循环查找所有存储的docvalues,确定同一个dim 在存储的binary的范围 for(int ord=0;ord<valueCount;ord++) { final BytesRef term = dv.lookupOrd(ord); String[] components = FacetsConfig.stringToPath(term.utf8ToString()); if (components.length != 2) { throw new IllegalArgumentException("this class can only handle 2 level hierarchy (dim/value); got: " + Arrays.toString(components) + " " + term.utf8ToString()); } if (!components[0].equals(lastDim)) { if (lastDim != null) { prefixToOrdRange.put(lastDim, new OrdRange(startOrd, ord-1)); } startOrd = ord; lastDim = components[0]; } } if (lastDim != null) { prefixToOrdRange.put(lastDim, new OrdRange(startOrd, valueCount-1)); } }
回想下前面讲过,所有的dim和label会拼接在一起生成一个长字符串存储,在例子中,用户一般是根据其自定义的dim来查找比如例子里面根据"Author" 或者"Publish Year"来查找,所以在读取term的的时候要确定前几个term是属于同一个dim(例如"Author"),prefixToOrdRange 就是为了确定每一个dim在所有的存储docvalues的范围,以便做后续统计。
再来看SortedSetDocValuesFacetCounts 做了什么:
public SortedSetDocValuesFacetCounts(SortedSetDocValuesReaderState state, FacetsCollector hits) throws IOException { this.state = state; this.field = state.getField(); dv = state.getDocValues(); counts = new int[state.getSize()]; //System.out.println("field=" + field); //开始统计所有的term在文档的出现次数 count(hits.getMatchingDocs()); } private final void count(List<MatchingDocs> matchingDocs) throws IOException { //System.out.println("ssdv count"); MultiDocValues.OrdinalMap ordinalMap; // TODO: is this right? really, we need a way to // verify that this ordinalMap "matches" the leaves in // matchingDocs... if (dv instanceof MultiSortedSetDocValues && matchingDocs.size() > 1) { ordinalMap = ((MultiSortedSetDocValues) dv).mapping; } else { ordinalMap = null; } IndexReader origReader = state.getOrigReader(); for(MatchingDocs hits : matchingDocs) { LeafReader reader = hits.context.reader(); //System.out.println(" reader=" + reader); // LUCENE-5090: make sure the provided reader context "matches" // the top-level reader passed to the // SortedSetDocValuesReaderState, else cryptic // AIOOBE can happen: if (ReaderUtil.getTopLevelContext(hits.context).reader() != origReader) { throw new IllegalStateException("the SortedSetDocValuesReaderState provided to this class does not match the reader being searched; you must create a new SortedSetDocValuesReaderState every time you open a new IndexReader"); } SortedSetDocValues segValues = reader.getSortedSetDocValues(field); if (segValues == null) { continue; } DocIdSetIterator docs = hits.bits.iterator(); // TODO: yet another option is to count all segs // first, only in seg-ord space, and then do a // merge-sort-PQ in the end to only "resolve to // global" those seg ords that can compete, if we know // we just want top K? ie, this is the same algo // that'd be used for merging facets across shards // (distributed faceting). but this has much higher // temp ram req'ts (sum of number of ords across all // segs) if (ordinalMap != null) { final int segOrd = hits.context.ord; final LongValues ordMap = ordinalMap.getGlobalOrds(segOrd); int numSegOrds = (int) segValues.getValueCount(); if (hits.totalHits < numSegOrds/10) { //System.out.println(" remap as-we-go"); // Remap every ord to global ord as we iterate: int doc; while ((doc = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { //System.out.println(" doc=" + doc); segValues.setDocument(doc); int term = (int) segValues.nextOrd(); while (term != SortedSetDocValues.NO_MORE_ORDS) { //System.out.println(" segOrd=" + segOrd + " ord=" + term + " globalOrd=" + ordinalMap.getGlobalOrd(segOrd, term)); counts[(int) ordMap.get(term)]++; term = (int) segValues.nextOrd(); } } } else { //System.out.println(" count in seg ord first"); // First count in seg-ord space: final int[] segCounts = new int[numSegOrds]; int doc; while ((doc = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { //System.out.println(" doc=" + doc); segValues.setDocument(doc); int term = (int) segValues.nextOrd(); while (term != SortedSetDocValues.NO_MORE_ORDS) { //System.out.println(" ord=" + term); segCounts[term]++; term = (int) segValues.nextOrd(); } } // Then, migrate to global ords: for(int ord=0;ord<numSegOrds;ord++) { int count = segCounts[ord]; if (count != 0) { //System.out.println(" migrate segOrd=" + segOrd + " ord=" + ord + " globalOrd=" + ordinalMap.getGlobalOrd(segOrd, ord)); counts[(int) ordMap.get(ord)] += count; } } } } else { // No ord mapping (e.g., single segment index): // just aggregate directly into counts: int doc; while ((doc = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { //设置该文档属于term的边界。 segValues.setDocument(doc); //读取第一个term int term = (int) segValues.nextOrd(); while (term != SortedSetDocValues.NO_MORE_ORDS) { //判定termid 有效,统计该term的count counts[term]++; //读取下一个term term = (int) segValues.nextOrd(); } } } } }
SortedSetDocValues 的初始话过程主要就是每个term的频率统计,深入看看segValues的查找每个term的结构:
//Lucene50DocValuesProducer @Override public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { SortedSetEntry ss = sortedSets.get(field.name); if (ss.format == SORTED_SINGLE_VALUED) { final SortedDocValues values = getSorted(field); return DocValues.singleton(values); } else if (ss.format != SORTED_WITH_ADDRESSES) { throw new AssertionError(); } final long valueCount = binaries.get(field.name).count; // we keep the byte[]s and list of ords on disk, these could be large final LongBinaryDocValues binary = (LongBinaryDocValues) getBinary(field); final LongValues ordinals = getNumeric(ords.get(field.name)); // but the addresses to the ord stream are in RAM final MonotonicBlockPackedReader ordIndex = getOrdIndexInstance(field, ordIndexes.get(field.name)); return new RandomAccessOrds() { long startOffset; long offset; long endOffset; @Override public long nextOrd() { if (offset == endOffset) { return NO_MORE_ORDS; } else { //通过确定的边界查找该文档包含准确的termid,?Ordinals =pending ords long ord = ordinals.get(offset); offset++; return ord; } } @Override public void setDocument(int docID) { /**通过文档id查找在Ords中的边界,OrdIndex = pendingCounts ordCounts*/ startOffset = offset = ordIndex.get(docID); endOffset = ordIndex.get(docID+1L); } @Override public BytesRef lookupOrd(long ord) { //返回存储的term内容,ord传入的是termid ,同样查找过程需要OrdMap辅助 return binary.get(ord); } @Override public long getValueCount() { return valueCount; } @Override public long lookupTerm(BytesRef key) { if (binary instanceof CompressedBinaryDocValues) { return ((CompressedBinaryDocValues)binary).lookupTerm(key); } else { return super.lookupTerm(key); } } @Override public TermsEnum termsEnum() { if (binary instanceof CompressedBinaryDocValues) { return ((CompressedBinaryDocValues)binary).getTermsEnum(); } else { return super.termsEnum(); } } @Override public long ordAt(int index) { return ordinals.get(startOffset + index); } @Override public int cardinality() { return (int) (endOffset - startOffset); } }; }
最后看归并的过程:
@Override public FacetResult getTopChildren(int topN, String dim, String... path) throws IOException { if (topN <= 0) { throw new IllegalArgumentException("topN must be > 0 (got: " + topN + ")"); } if (path.length > 0) { throw new IllegalArgumentException("path should be 0 length"); } //每个dim在存储的docvalues的范围 OrdRange ordRange = state.getOrdRange(dim); if (ordRange == null) { throw new IllegalArgumentException("dimension \"" + dim + "\" was not indexed"); } return getDim(dim, ordRange, topN); } private final FacetResult getDim(String dim, OrdRange ordRange, int topN) { TopOrdAndIntQueue q = null; int bottomCount = 0; int dimCount = 0; int childCount = 0; TopOrdAndIntQueue.OrdAndValue reuse = null; //System.out.println("getDim : " + ordRange.start + " - " + ordRange.end); /**在符合客户定义的dim的范围查看其term的频率,由于涉及到topN的定义,返回 前N个频率最多的必然涉及到最小堆的排序,TopOrdAndIntQueue就是用来排TopN。 for(int ord=ordRange.start; ord<=ordRange.end; ord++) { //System.out.println(" ord=" + ord + " count=" + counts[ord]); if (counts[ord] > 0) { dimCount += counts[ord]; childCount++; if (counts[ord] > bottomCount) { if (reuse == null) { reuse = new TopOrdAndIntQueue.OrdAndValue(); } reuse.ord = ord; reuse.value = counts[ord]; if (q == null) { // Lazy init, so we don't create this for the // sparse case unnecessarily q = new TopOrdAndIntQueue(topN); } reuse = q.insertWithOverflow(reuse); if (q.size() == topN) { bottomCount = q.top().value; } } } } if (q == null) { return null; } LabelAndValue[] labelValues = new LabelAndValue[q.size()]; for(int i=labelValues.length-1;i>=0;i--) { TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop(); final BytesRef term = dv.lookupOrd(ordAndValue.ord); String[] parts = FacetsConfig.stringToPath(term.utf8ToString()); labelValues[i] = new LabelAndValue(parts[1], ordAndValue.value); } return new FacetResult(dim, new String[0], dimCount, labelValues, childCount); }
到此所有的SortedSetDocValues 查找和索引都基本分析完结了。
上一篇: Gas 博客分类: 以太坊