FST源代码解读3——编译节点
程序员文章站
2022-03-23 14:26:47
...
上一篇看完了FST.Builder的代码了,这里写FST代码了。先看编译节点时是如何操作的,即org.apache.lucene.util.fst.FST.addNode(Builder<T>, UnCompiledNode<T>)方法:
long addNode(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn) throws IOException { T NO_OUTPUT = outputs.getNoOutput(); if (nodeIn.numArcs == 0) {// 如果添加的是没有arc的,也就是最后一个,返回的都是一样的,这样就可以认为所有的结束节点是同一个节点。 if (nodeIn.isFinal) { return FINAL_END_NODE; } else { return NON_FINAL_END_NODE;// 不知道为啥还有这种情况,没有arc却不是final的 } } // 往下走都是有arc的 final long startAddress = builder.bytes.getPosition();// 写这个节点的开始的position,这里的bytes是保存编译后的二进制的byte[]的,还没有详细的介绍, // 怎么存储这个node的arc,如果有太多的,则使用fixedArray的格式,每个arc用相同的空间,因为arc上的label一定是按照顺序排序的,这样能用二分查找 final boolean doFixedArray = shouldExpand(builder, nodeIn); if (doFixedArray) { if (builder.reusedBytesPerArc.length < nodeIn.numArcs) {// 将用来记录每个arc占用多少内存的数组扩容 builder.reusedBytesPerArc = new int[ArrayUtil.oversize(nodeIn.numArcs, 1)]; } } builder.arcCount += nodeIn.numArcs;// 计算arc的数量 final int lastArc = nodeIn.numArcs - 1;// long lastArcStart = builder.bytes.getPosition();// 记录一个arc写入的开始 int maxBytesPerArc = 0;// 所有的arc中,最大的一个arc使用的byte[]的大小 for (int arcIdx = 0; arcIdx < nodeIn.numArcs; arcIdx++) { // 遍历这个节点所有的arc,将每个arc写入到byteStore中去,记录每个arc使用的byte[],记录最大的arc使用的byte[], final Builder.Arc<T> arc = nodeIn.arcs[arcIdx];// arc final Builder.CompiledNode target = (Builder.CompiledNode) arc.target;// 这个arc指向的节点 int flags = 0; if (arcIdx == lastArc) {// 是否是该节点的最后一个arc flags += BIT_LAST_ARC; } if (builder.lastFrozenNode == target.node && !doFixedArray) {// 刚刚写入到byte[]中的节点就是下一个节点,此时不用存储target,两个节点的node是有规律的,node值差一,根据这个node就可以计算出下一个node,然后就可以获得位置;否则该ARC需要存储target信息 // TODO: for better perf (but more RAM used) we could avoid this // except when arc is "near" the last arc: flags += BIT_TARGET_NEXT;// bytes中的下一个Node就是此边的目标节点 } if (arc.isFinal) { flags += BIT_FINAL_ARC; if (arc.nextFinalOutput != NO_OUTPUT) {// 只有是final的才会有final flags += BIT_ARC_HAS_FINAL_OUTPUT; } } else { assert arc.nextFinalOutput == NO_OUTPUT; } boolean targetHasArcs = target.node > 0;// 大于0说明不是最后一个node if (!targetHasArcs) {// 这个arc指向的node没有arc,说明是最后一个arc flags += BIT_STOP_NODE; } else if (inCounts != null) {// 不是最后一个arc inCounts.set((int) target.node, inCounts.get((int) target.node) + 1);//增加指向这个节点的arc的数量 } if (arc.output != NO_OUTPUT) { flags += BIT_ARC_HAS_OUTPUT; } builder.bytes.writeByte((byte) flags); writeLabel(builder.bytes, arc.label);// 写入标记为,表示这个arc都是存储了什么,等读的时候解析用 if (arc.output != NO_OUTPUT) {// outputs.write(arc.output, builder.bytes);// 将输出写入到builder.bytes中 } if (arc.nextFinalOutput != NO_OUTPUT) {// 将finalOutput写入 outputs.writeFinalOutput(arc.nextFinalOutput, builder.bytes); } if (targetHasArcs && (flags & BIT_TARGET_NEXT) == 0) {// 写入目标节点的编号。第一个targetHasArcs说明这个arc的目标节点不是最后node,(flags & BIT_TARGET_NEXT) == 0 表示他的目标节点不是紧挨着的下一个节点,此时需要单独记录下一个节点的编号 assert target.node > 0; builder.bytes.writeVLong(target.node); } // just write the arcs "like normal" on first pass, but record how many bytes each one took, and max byte size: if (doFixedArray) {//如果是使用fixedArray格式的话,需要记录每个arc使用的内存大小 builder.reusedBytesPerArc[arcIdx] = (int) (builder.bytes.getPosition() - lastArcStart); lastArcStart = builder.bytes.getPosition(); maxBytesPerArc = Math.max(maxBytesPerArc, builder.reusedBytesPerArc[arcIdx]);//计算arc占用内存最大的值 } } if (doFixedArray) {// 按照固定的长度重新写一遍,每个arc都占用maxBytesPerArc个byte,覆盖之前写入的arc。写之前先写入header final int MAX_HEADER_SIZE = 11; // header(byte) + numArcs(vint) + numBytes(vint) assert maxBytesPerArc > 0; // 2nd pass just "expands" all arcs to take up a fixed byte size // create the header // TODO: clean this up: or just rewind+reuse and deal with it byte header[] = new byte[MAX_HEADER_SIZE]; ByteArrayDataOutput bad = new ByteArrayDataOutput(header); // write a "false" first arc: bad.writeByte(ARCS_AS_FIXED_ARRAY);// 其实是0,这个是第一个写入的,但是在reverse之后就成了最后一个了。 bad.writeVInt(nodeIn.numArcs); bad.writeVInt(maxBytesPerArc); int headerLen = bad.getPosition(); final long fixedArrayStart = startAddress + headerLen;// 写完header之后的position,也就是写第一个arc的开始 // expand the arcs in place, backwards long srcPos = builder.bytes.getPosition();// 现在不按照fixedArray写的postion,这个是一定小于destPos的,因为destPos是将每个arc按照最大的长度写的。 long destPos = fixedArrayStart + nodeIn.numArcs * maxBytesPerArc;// 写完所有的arc之后的position assert destPos >= srcPos; if (destPos > srcPos) { // 一定进入 builder.bytes.skipBytes((int) (destPos - srcPos));// 先在bytesStore中留出足够的空间。 for (int arcIdx = nodeIn.numArcs - 1; arcIdx >= 0; arcIdx--) {// 从后往前写每个arc,将之前写的移动到新的位置上 destPos -= maxBytesPerArc;// 这个arc即将要写入的开始,也就是最后减去n*maxBytesPerArc srcPos -= builder.reusedBytesPerArc[arcIdx];// 回退到之前写的这个arc的开始位置, if (srcPos != destPos) {// 可能会相等,绝大部分不相等 assert destPos > srcPos : "destPos=" + destPos + " srcPos=" + srcPos + " arcIdx=" + arcIdx + " maxBytesPerArc=" + maxBytesPerArc + " reusedBytesPerArc[arcIdx]=" + builder.reusedBytesPerArc[arcIdx] + " nodeIn.numArcs=" + nodeIn.numArcs; builder.bytes.copyBytes(srcPos, destPos, builder.reusedBytesPerArc[arcIdx]);// 自己copy自己, } } } // now write the header 虽然是后写,但是写在arcs之前。 builder.bytes.writeBytes(startAddress, header, 0, headerLen); } final long thisNodeAddress = builder.bytes.getPosition() - 1;// 这个节点结束的位置 // 将刚刚写入的一片byte[]都反转过来,这是因为在调用这个方法的时候,添加的节点的顺序就是倒叙的,从frontier中最后的一个node开始的,生成的编译后的byte[]当然也是倒叙的, // 而刚刚添加的这个node的所有的arc都是正序的,所以把这个node的所有的arc都倒叙,这样就统一顺序了,下面再生成一个reader的时候,就可以直接从后面开始顺序读取了。 builder.bytes.reverse(startAddress, thisNodeAddress); // PackedInts uses int as the index, so we cannot handle > 2.1B nodes when packing: if (nodeAddress != null && builder.nodeCount == Integer.MAX_VALUE) { throw new IllegalStateException("cannot create a packed FST with more than 2.1 billion nodes"); } builder.nodeCount++; final long node; if (nodeAddress != null) {// 带有压缩的(nodeAddres下面有解释) // Nodes are addressed by 1+ord: if ((int) builder.nodeCount == nodeAddress.size()) {// 扩容nodeAddress nodeAddress = nodeAddress.resize(ArrayUtil.oversize(nodeAddress.size() + 1, nodeAddress.getBitsPerValue())); inCounts = inCounts.resize(ArrayUtil.oversize(inCounts.size() + 1, inCounts.getBitsPerValue())); } nodeAddress.set((int) builder.nodeCount, thisNodeAddress);// 第n个节点结束的位置 node = builder.nodeCount;//node其实就是加入到fst中的序号,是一个递增的值 } else { node = thisNodeAddress;//如果没有压缩的,则node就是这个节点在byte[]中的开始位置。 } // 通过返回的值可以确定下一个节点 return node;// 返回的值,如果是压缩的,则返回的是编号(通过编号可以获得这个节点在byteStore中的结束位置),如果不是,则是这个节点在byteStore中的结束位置-1(也就是倒序开始读的位置) }
上面的方法还是很好理解的,将一个没有编译的节点传入这个参数,如果这个节点是最后一个节点,则永远返回一个小于1的数字,这样所有的结束的节点就可以都看做是同一个。如果不是最后一个节点,将这个节点的多个发出的arc的属性写在BytesStore bytes对象中(这个类下面有介绍),写的时候可能有两种方式,一个是每个arc占固定的大小,一个是可变的,固定大小的好处是可以使用二分法查找,加快查找速度,所以他只有在arc数量很多的时候才会使用;写入的内容包括标记、label、输出、final输出、指向的节点的开始。写完后再reverse过来,方便后面的统一读取,因为未编译的节点写入fst的顺序就是倒叙的,所以这里也要变为倒叙的。写完之后,需要返回这个未编译节点在bytes中的开始位置,这里也有两种情况,一种是直接返回绝对的位置(对应于上面的没有压缩的情况),一种是返回这个节点的序号,然后使用一个单独的对象记录序号到绝对位置的映射(也就是上面的nodeAddress,这个类其实就是封装的packedInts,可以将其理解为一个int[])。frontier中未编译的节点的arc根据这个序号(或者是绝对位置)就能找到他的目标节点在bytes中的开始位置了。
BytesStore的介绍:这个其实就是封装了byte[],提供适合FST的方法,代码如下:
class BytesStore extends DataOutput implements Accountable { private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(BytesStore.class) + RamUsageEstimator.shallowSizeOfInstance(ArrayList.class); /** 先写入current,当current写的到blockSize时,就加入到blocks中,然后创建一个新的 */ private final List<byte[]> blocks = new ArrayList<>(); private final int blockSize; private final int blockBits; private final int blockMask; private byte[] current; private int nextWrite; public BytesStore(int blockBits) { this.blockBits = blockBits; blockSize = 1 << blockBits; blockMask = blockSize - 1; nextWrite = blockSize; } /** Pulls bytes from the provided IndexInput. */ public BytesStore(DataInput in, long numBytes, int maxBlockSize) throws IOException { int blockSize = 2; int blockBits = 1; while (blockSize < numBytes && blockSize < maxBlockSize) { blockSize *= 2; blockBits++; } this.blockBits = blockBits; this.blockSize = blockSize; this.blockMask = blockSize - 1; long left = numBytes; while (left > 0) { final int chunk = (int) Math.min(blockSize, left); byte[] block = new byte[chunk]; in.readBytes(block, 0, block.length); blocks.add(block); left -= chunk; } // So .getPosition still works nextWrite = blocks.get(blocks.size() - 1).length; } /**覆盖写,将制定位置的byte设置为某个值 <br/> Absolute write byte; you must ensure dest is < max position written so far. */ public void writeByte(int dest, byte b) { int blockIndex = dest >> blockBits; byte[] block = blocks.get(blockIndex); block[dest & blockMask] = b; } /* 追加写 */ @Override public void writeByte(byte b) { if (nextWrite == blockSize) { current = new byte[blockSize]; blocks.add(current); nextWrite = 0; } current[nextWrite++] = b; } @Override public void writeBytes(byte[] b, int offset, int len) { while (len > 0) { int chunk = blockSize - nextWrite; if (len <= chunk) { assert b != null; assert current != null; System.arraycopy(b, offset, current, nextWrite, len); nextWrite += len; break; } else { if (chunk > 0) { System.arraycopy(b, offset, current, nextWrite, chunk); offset += chunk; len -= chunk; } current = new byte[blockSize]; blocks.add(current); nextWrite = 0; } } } int getBlockBits() { return blockBits; } /** 覆盖写,用b来覆盖,从b的offset开始,写len个 <br/> Absolute writeBytes without changing the current position. Note: this cannot "grow" the bytes, so you must only call it on already written parts. */ void writeBytes(long dest, byte[] b, int offset, int len) { assert dest + len <= getPosition() : "dest=" + dest + " pos=" + getPosition() + " len=" + len; final long end = dest + len;//最后一个位置 int blockIndex = (int) (end >> blockBits);//最后一个位置第几个块上 int downTo = (int) (end & blockMask);//每个块中写入的最后的位置 if (downTo == 0) {//如果恰好是整除 blockIndex--; downTo = blockSize; } byte[] block = blocks.get(blockIndex);//结束的位置 while (len > 0) {//从最后开始复制的, if (len <= downTo) { System.arraycopy(b, offset, block, downTo - len, len); break; } else { len -= downTo; System.arraycopy(b, offset + len, block, 0, downTo); blockIndex--; block = blocks.get(blockIndex); downTo = blockSize; } } } /** * Absolute copy bytes self to self, without changing the position. Note: this cannot "grow" the bytes, so must only call it on already written parts. <br/> * 从src开始复制,复制len个,从dest开始写,自己copy自己 */ public void copyBytes(long src, long dest, int len) { long end = src + len; int blockIndex = (int) (end >> blockBits);// 复制的块的下标 int downTo = (int) (end & blockMask);// 复制的块的截止的下标 if (downTo == 0) { blockIndex--; downTo = blockSize; } byte[] block = blocks.get(blockIndex);// 要复制的最后一个 while (len > 0) { if (len <= downTo) {//只复制这一个就够了 writeBytes(dest, block, downTo - len, len); break; } else {//只复制这一个不够,还要继续复制,则把这一个复制完。 len -= downTo; writeBytes(dest + len, block, 0, downTo);//覆盖写 blockIndex--; block = blocks.get(blockIndex); downTo = blockSize; } } } /** 覆盖写,在指定的位置写入一个int <br/> Writes an int at the absolute position without changing the current pointer. */ public void writeInt(long pos, int value) { int blockIndex = (int) (pos >> blockBits);// 写入的block的下标 int upto = (int) (pos & blockMask);// 写入的block的的offset byte[] block = blocks.get(blockIndex); int shift = 24; for (int i = 0; i < 4; i++) { block[upto++] = (byte) (value >> shift); shift -= 8; if (upto == blockSize) { upto = 0; blockIndex++; block = blocks.get(blockIndex); } } } /** 将指定的区间倒叙 <br/> Reverse from srcPos, inclusive, to destPos, inclusive. */ public void reverse(long srcPos, long destPos) { assert srcPos < destPos; assert destPos < getPosition(); int srcBlockIndex = (int) (srcPos >> blockBits); int src = (int) (srcPos & blockMask); byte[] srcBlock = blocks.get(srcBlockIndex); int destBlockIndex = (int) (destPos >> blockBits); int dest = (int) (destPos & blockMask); byte[] destBlock = blocks.get(destBlockIndex); int limit = (int) (destPos - srcPos + 1) / 2; for (int i = 0; i < limit; i++) { byte b = srcBlock[src]; srcBlock[src] = destBlock[dest]; destBlock[dest] = b; src++; if (src == blockSize) { srcBlockIndex++; srcBlock = blocks.get(srcBlockIndex); src = 0; } dest--; if (dest == -1) { destBlockIndex--; destBlock = blocks.get(destBlockIndex); dest = blockSize - 1; } } } /** 跳过len个位置,挪动指针,目的是创造足够的空间大小 */ public void skipBytes(int len) { while (len > 0) { int chunk = blockSize - nextWrite;// if (len <= chunk) { nextWrite += len; break; } else { len -= chunk; current = new byte[blockSize]; blocks.add(current); nextWrite = 0; } } } /** 返回现在的指针 */ public long getPosition() { return ((long) blocks.size() - 1) * blockSize + nextWrite; } /** * 将指定位置以后的清除 <br/> Pos must be less than the max position written so far! Ie, you cannot "grow" the file with this! */ public void truncate(long newLen) { assert newLen <= getPosition(); assert newLen >= 0; int blockIndex = (int) (newLen >> blockBits);//找到最后一个block和在其中的下标 nextWrite = (int) (newLen & blockMask); if (nextWrite == 0) { blockIndex--; nextWrite = blockSize; } // 将最后一个block的后面的block都清空 blocks.subList(blockIndex + 1, blocks.size()).clear(); if (newLen == 0) { current = null; } else { current = blocks.get(blockIndex); } assert newLen == getPosition(); } public void finish() { if (current != null) { byte[] lastBuffer = new byte[nextWrite]; System.arraycopy(current, 0, lastBuffer, 0, nextWrite); blocks.set(blocks.size() - 1, lastBuffer); current = null; } } /** Writes all of our bytes to the target {@link DataOutput}. */ public void writeTo(DataOutput out) throws IOException { for (byte[] block : blocks) { out.writeBytes(block, 0, block.length); } } /** 从前向后读的reader */ public FST.BytesReader getForwardReader() { if (blocks.size() == 1) { return new ForwardBytesReader(blocks.get(0)); } return new FST.BytesReader() { private byte[] current; private int nextBuffer; private int nextRead = blockSize; @Override public byte readByte() { if (nextRead == blockSize) { current = blocks.get(nextBuffer++); nextRead = 0; } return current[nextRead++]; } @Override public void skipBytes(long count) { setPosition(getPosition() + count); } @Override public void readBytes(byte[] b, int offset, int len) { while (len > 0) { int chunkLeft = blockSize - nextRead; if (len <= chunkLeft) { System.arraycopy(current, nextRead, b, offset, len); nextRead += len; break; } else { if (chunkLeft > 0) { System.arraycopy(current, nextRead, b, offset, chunkLeft); offset += chunkLeft; len -= chunkLeft; } current = blocks.get(nextBuffer++); nextRead = 0; } } } @Override public long getPosition() { return ((long) nextBuffer - 1) * blockSize + nextRead; } @Override public void setPosition(long pos) { int bufferIndex = (int) (pos >> blockBits); nextBuffer = bufferIndex + 1; current = blocks.get(bufferIndex); nextRead = (int) (pos & blockMask); assert getPosition() == pos; } @Override public boolean reversed() { return false; } }; } public FST.BytesReader getReverseReader() { return getReverseReader(true); } /** 从后向前读的, */ FST.BytesReader getReverseReader(boolean allowSingle) { if (allowSingle && blocks.size() == 1) { return new ReverseBytesReader(blocks.get(0)); } return new FST.BytesReader() { /** 当前读取的 */ private byte[] current = blocks.size() == 0 ? null : blocks.get(0); /** 下一个byte[]的下标 */ private int nextBuffer = -1; /** 当前读取的下标 */ private int nextRead = 0; @Override public byte readByte() {//向前读,倒着读 if (nextRead == -1) { current = blocks.get(nextBuffer--); nextRead = blockSize - 1; } return current[nextRead--]; } @Override public void skipBytes(long count) { setPosition(getPosition() - count); } @Override public void readBytes(byte[] b, int offset, int len) { for (int i = 0; i < len; i++) { b[offset + i] = readByte(); } } @Override public long getPosition() { return ((long) nextBuffer + 1) * blockSize + nextRead; } @Override public void setPosition(long pos) {//当读取的时候先调用这个方法,指向某个节点的开始位置(是倒序的,从后面开始读) // NOTE: a little weird because if you // setPosition(0), the next byte you read is // bytes[0] ... but I would expect bytes[-1] (ie, // EOF)...? int bufferIndex = (int) (pos >> blockBits); nextBuffer = bufferIndex - 1; current = blocks.get(bufferIndex); nextRead = (int) (pos & blockMask); assert getPosition() == pos : "pos=" + pos + " getPos()=" + getPosition(); } @Override public boolean reversed() { return true; } }; } }看完了上面的内容,很多节点写入FST的过程就可以理解了,下面再看下一步,结束fst的写入,即调用Builder.finish方法:当所有的term都写完之后调用这个方法:
/** * 将frontier写入到fst,从0开始冷冻 * Returns final FST. NOTE: this will return null if nothing is accepted by the FST. */ public FST<T> finish() throws IOException { final UnCompiledNode<T> root = frontier[0];//root,也就是第一个节点,开始节点 // minimize nodes in the last word's suffix freezeTail(0);//将最后的frontier写入到fst中,虽然传入的是0,但是从第二个node(下标是1)开始编译到fst if (root.inputCount < minSuffixCount1 || root.inputCount < minSuffixCount2 || root.numArcs == 0) {//不进入 if (fst.emptyOutput == null) { return null; } else if (minSuffixCount1 > 0 || minSuffixCount2 > 0) { // empty string got pruned return null; } } else { if (minSuffixCount2 != 0) {//不进入 compileAllTargets(root, lastInput.length()); } } fst.finish(compileNode(root, lastInput.length()).node);//将root写入fst。 if (doPackFST) {//后续会介绍 return fst.pack(this, 3, Math.max(10, (int) (getNodeCount() / 4)), acceptableOverheadRatio); } else { return fst; } }里面还是调用了freezeTail方法,虽然穿的是0,但是freezeTail方法中会提高到1,调用这个方法的目的是将现在的frontier写入到fst中。调用完之后所有的term才都写入到fst中了,然后调用fst的finish方法,调用之前对root节点进行编译,因为之前调用freezeTail的时候都是从第一个开始编译的,所有从root发出的arc都还没有编译呢,所以要编译root,也就是所有的从root发出的arc,获得一个编译后的节点的序号(或者绝对位置),然后调用fst.finish,代码在下一篇博客中介绍避免篇幅太长了。
如果读者发现文中有错误,请联系我的QQ:1308567317