FST源代码解读6——FST的读取

程序员文章站 2022-03-30 15:24:50

...

lucene fst 源代码解读

在lucene的官方文档里，有读取的代码的介绍，直接用了一个Util类，代码如下，方法名字是：org.apache.lucene.util.fst.Util.get(FST<T>, IntsRef)

/** Looks up the output for this input, or null if the input is not accepted. */
public static <T> T get(FST<T> fst, IntsRef input) throws IOException {
	// TODO: would be nice not to alloc this on every lookup
	final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());// 获得一个指向root的虚拟arc
	final BytesReader fstReader = fst.getBytesReader();//获得一个reader，如果是没有压缩的，则是从后面读，否则从前面开始读
	// Accumulate output as we go
	T output = fst.outputs.getNoOutput();
	for (int i = 0; i < input.length; i++) {
		if (fst.findTargetArc(input.ints[input.offset + i], arc, arc, fstReader) == null) {// 获得一个lebal是指定值的arc，如果这个找不到，则认为要查找的input是不存在的，直接返回null，
			return null;
		}
		output = fst.outputs.add(output, arc.output);//找到了，累加输出
	}
	if (arc.isFinal()) {//都找到了，需要判断原来的输入是不是final的，如果不是也不存在。
		return fst.outputs.add(output, arc.nextFinalOutput);//是final的，再累加final输出
	} else {
		return null;
	}
}

/**
 * 读的时候调用，找到一个lebal是指定lebal的arc <br/>
 * Finds an arc leaving the incoming arc, replacing the arc in place. This returns null if the arc was not found, else the incoming arc.
 * @param follow 查找的条件的arc
 * @param arc 最终将查询的结果写入到这个里面来
 */
private Arc<T> findTargetArc(int labelToMatch, Arc<T> follow, Arc<T> arc, BytesReader in, boolean useRootArcCache) throws IOException {

	if (labelToMatch == END_LABEL) {//TODO 这个没有用到过，忽略
		if (follow.isFinal()) {
			if (follow.target <= 0) {
				arc.flags = BIT_LAST_ARC;
			} else {
				arc.flags = 0;
				// NOTE: nextArc is a node (not an address!) in this case:
				arc.nextArc = follow.target;
				arc.node = follow.target;
			}
			arc.output = follow.nextFinalOutput;
			arc.label = END_LABEL;
			return arc;
		} else {
			return null;
		}
	}

	// Short-circuit if this arc is in the root arc cache:
	if (useRootArcCache && cachedRootArcs != null && follow.target == startNode	&& labelToMatch < cachedRootArcs.length) {//从缓存中查找
		final Arc<T> result = cachedRootArcs[labelToMatch];
		// LUCENE-5152: detect tricky cases where caller modified previously returned cached root-arcs:
		assert assertRootCachedArc(labelToMatch, result);
		if (result == null) {
			return null;
		} else {
			arc.copyFrom(result);
			return arc;
		}
	}
	
	if (!targetHasArcs(follow)) {//判断指向的节点是否存在符合条件的arc
		return null;
	}
	in.setPosition(getNodeAddress(follow.target));//准备从bytes中读取，提前找好位置，定位到taget节点的开始的位置
	arc.node = follow.target;

	if (in.readByte() == ARCS_AS_FIXED_ARRAY) {//fixedArray格式的，采用二分查找
		// Arcs are full array; do binary search:
		arc.numArcs = in.readVInt();//arc的数量
		if (packed || version >= VERSION_VINT_TARGET) {//一个arc占用的内存的大小
			arc.bytesPerArc = in.readVInt();
		} else {
			arc.bytesPerArc = in.readInt();
		}
		arc.posArcsStart = in.getPosition();//这个节点中的arc开始的位置
		int low = 0;//开始用二分法查找
		int high = arc.numArcs - 1;
		while (low <= high) {
			int mid = (low + high) >>> 1;
			in.setPosition(arc.posArcsStart);
			in.skipBytes(arc.bytesPerArc * mid + 1);//先指向开始，然后再根据每个arc的大小和是第几个arc移动指针，这样就移动到了中间的一个arc上
			int midLabel = readLabel(in);//读取这个arc的lebal
			final int cmp = midLabel - labelToMatch;//根据label的大小判断该怎么移动指针。
			if (cmp < 0) {
				low = mid + 1;
			} else if (cmp > 0) {
				high = mid - 1;
			} else {
				arc.arcIdx = mid - 1;//正好匹配，则读取这个arc。这里减一是因为后面再readNextRealArc中会自动加一
				return readNextRealArc(arc, in);
			}
		}
		return null;
	}

	// Linear scan  如果不是fixedArray的，则先读取第一个，读取到arc中
	readFirstRealTargetArc(follow.target, arc, in);

	while (true) {//遍历
		// TODO: we should fix this code to not have to create object for the output of every arc we scan... only for the matching arc, if found
		if (arc.label == labelToMatch) {//相同的
			return arc;
		} else if (arc.label > labelToMatch) {//因为是从小到大的排列每个节点上的arc，所以如果一个arc的值大于目标值则标识找不到
			return null;
		} else if (arc.isLast()) {//找不到
			return null;
		} else {//继续读取下一个arc的属性到arc中
			readNextRealArc(arc, in);
		}
	}
}

还有一个功能是根据输出查找输入，即：Util.getByOutput(FST<Long>, long)，不过他是有条件的，只有当加入到fst的key-value中所有的value是按照从小到大的顺序添加时才可以使用这个方法，且只有当输出值是long时。这种的使用情况为：用在文件指针上。代码如下：

/**
 * Reverse lookup (lookup by output instead of by input), in the special case when your FSTs outputs are strictly ascending. This locates the input/output pair where the output is equal to the target, and will return null if that output does not exist.
 * <p>
 * NOTE: this only works with {@code FST<Long>}, only works when the outputs are ascending in order with the inputs. For example, simple ordinals (0,1, 2, ...), or file offets (when appending to a file) fit this.  只有long的输出才可以用。
 */
public static IntsRef getByOutput(FST<Long> fst, long targetOutput) throws IOException {
	
	final BytesReader in = fst.getBytesReader();

	// TODO: would be nice not to alloc this on every lookup  指向root的虚拟arc
	FST.Arc<Long> arc = fst.getFirstArc(new FST.Arc<Long>());

	FST.Arc<Long> scratchArc = new FST.Arc<>();

	final IntsRefBuilder result = new IntsRefBuilder();
	return getByOutput(fst, targetOutput, in, arc, scratchArc, result);//
}

/**
 * 别忘了前提是按照output严格递增的顺序来添加的，这个特性导致arc上的输出也是不会减小的，而且不会出现负数的输出。在读取的时候如果一个节点上任何arc的输出值和之前读取的所有的arc上的输出的累加和
 * 不等于targetOutput，则此节点上选择的arc是  输出值和之前所有的arc的输出值的和比targetOutput要小的集合中的最大的一个，解释：首先不能是超过targetOutput的，因为这种fst的所有的arc的输出都不会有负数，
 * 所以一旦超过targetOutput，则不会变小；第二：因为所有的输出是按照严格递增来的，那么同一个节点上发出的arc中，顺序靠后的arc上的输出一定比   在这个arc之前的任意arc开始到结束节点的所有arc上的输出的和要大
 * ，比如在最大的一个都不够大的时候，必须选最大的一个才有可能是相等的，不选最大的那个则一定是比targetOutput小的。
 * @param fst
 * @param targetOutput	查找的目标值，也即是输出
 * @param in			读取fst的reader
 * @param arc			指向root的虚拟的arc
 * @param scratchArc	        scratchArc是一个复用的对象，用来记录上一个读取的arc
 * @param result		将查找到的结果写入到里面
 * @return
 * @throws IOException
 */
public static IntsRef getByOutput(FST<Long> fst, long targetOutput, BytesReader in, Arc<Long> arc, Arc<Long> scratchArc, IntsRefBuilder result) throws IOException {
	
	long output = arc.output;//读取的lebal
	int upto = 0;

	while (true) {
		
		// 判断之前读取的输出是否匹配targetOutput
		if (arc.isFinal()) {
			final long finalOutput = output + arc.nextFinalOutput;//留心final output
			if (finalOutput == targetOutput) {//如果相等，则找到了
				result.setLength(upto);
				return result.get();
			} else if (finalOutput > targetOutput) {
				return null;
			}
		}//如果不是final的，则继续读取当前的arc的指向的节点的所有的arc

		if (FST.targetHasArcs(arc)) {
			result.grow(1 + upto);//还没有查找到，而且有新的查找可能，所以要增大

			fst.readFirstRealTargetArc(arc.target, arc, in);//读取arc.target节点上的第一个arc，将结果读取再arc上

			if (arc.bytesPerArc != 0) {//这个节点是采用fixedArray格式存储的

				int low = 0;
				int high = arc.numArcs - 1;
				int mid = 0;
				boolean exact = false;
				while (low <= high) {//在所有的arc中查找；如果找不到的话
					mid = (low + high) >>> 1;
					in.setPosition(arc.posArcsStart);// 指向这个节点的开始
					in.skipBytes(arc.bytesPerArc * mid);//移动到这个arc的开始
					final byte flags = in.readByte();//读取这个arc的flag
					fst.readLabel(in);//读取label
					final long minArcOutput;
					if ((flags & FST.BIT_ARC_HAS_OUTPUT) != 0) {//有输出
						final long arcOutput = fst.outputs.read(in);//读取输出
						minArcOutput = output + arcOutput;//当前查找到的输出
					} else {
						minArcOutput = output;
					}
					if (minArcOutput == targetOutput) {
						exact = true;
						break;
					} else if (minArcOutput < targetOutput) {
						low = mid + 1;
					} else {
						high = mid - 1;
					}
				}

				if (high == -1) {// 这个是连第一个都比targetOutput大的时候，此时说明找不到，因为后面的只能更大
					return null;
				} else if (exact) {//他的目的是读取下标是mid的arc，但是因为下面的readNextRealArc会加一，所以这里要减一
					arc.arcIdx = mid - 1;
				} else {// 这个需要好好分析一下二分查找，如果找不到的话，low永远是在准确值的左边的一个，所以减一就可以了，因为下面的readNextRealArc中会加一，所以这里要减二。
					arc.arcIdx = low - 2;
				}

				fst.readNextRealArc(arc, in);// 读取下一个arc（读了之后可能是准确的，可能是不准确的，如果不准确，则是输出最紧接于targetOutput的arc）
				result.setIntAt(upto++, arc.label);//将读取的lebal写在result中
				output += arc.output;//将读取的输出添加到output

			} else {//当不是fixedArray格式存储的时候，挨个比对，要查找output+arc.ouput < targetOutput中最大的一个（或者说最后的一个），也就是下面的pervArc，然后再去继续读取他指向的node上的arc
				FST.Arc<Long> prevArc = null;
				while (true) {//遍历当前节点上的所有的arc
					// This is the min output we'd hit if we follow this arc:
					final long minArcOutput = output + arc.output;//从小的开始匹配
					if (minArcOutput == targetOutput) {//相等了，不能再对比下去了，因为后面的一定更大。所以要退出让前面的程序判断是否满足条件，即是否是final的
						// Recurse on this arc:
						output = minArcOutput;
						result.setIntAt(upto++, arc.label);
						break;
					} else if (minArcOutput > targetOutput) {//我们的目的就是找到一个比targetOutput大的，然后确定他的前一个，也就是prevArc。
						if (prevArc == null) {//prevArc不存在，说明是第一次遍历这个node，当前的minArcOutput是这个节点任意的arc所能提供的最小的值，最小的值都比targetOutput，则一定不满足
							return null;
						} else {//优先看最后一个else，就很好明白了
							// Recurse on previous arc:
							arc.copyFrom(prevArc);
							result.setIntAt(upto++, arc.label);
							output += arc.output;
							break;
						}
					} else if (arc.isLast()) {// 小于targetOutput且是最后一个，则需要继续读取这个arc指向的节点。所以要break，退出对当前节点的遍历。如果不是最后一个，则进入下面的else，继续遍历当前节点的arc
						// Recurse on this arc:
						output = minArcOutput;
						result.setIntAt(upto++, arc.label);
						break;
					} else {// 结果比targetOutput要小，则记录这个arc为prev，因为下一个arc有可能比targetOutput要大，如果是的话，那么prev就是我们要找的arc。
						// Read next arc in this node:
						prevArc = scratchArc;//scratchArc就是当前的arc，传入的时候是一个对象
						prevArc.copyFrom(arc);
						fst.readNextRealArc(arc, in);//继续读取这个节点的下一个arc，然后继续判断这个是否比targetOutput大
					}
				}
			}
		} else {//如果这个节点没有输出，说明是最后一个节点了，还不匹配，则找不到
			return null;
		}
	}
}

这个的代码不难，他的关键就是在查找一个节点的arc的时候，使用所有 arc输出和之前的输出的和小于目标值的所有的arc中，和最大的一个arc，然后再顺着这个思路查找这个arc的下一个节点。不过用处我觉得不如根据input查找out多。

如果懂了写的代码，读的代码是很容易理解的，FST就这样看完了，花了我足足一周的时间，不过好开心哈哈哈哈哈哈。如果有地方错误，请联系我的qq：1308567317

FST源代码解读6——FST的读取

FST源代码解读5——FST的压缩

FST源代码解读6——FST的读取

FST源代码解读4——结束添加

FST源代码解读2——FST的生成

FST源代码解读5——FST的压缩

FST源代码解读3——编译节点

FST源代码解读6——FST的读取