Java实现AC自动机全文检索示例
程序员文章站
2024-03-08 08:10:27
第一步,构建trie树,定义node类型:
/**
* created by zhaoyy on 2017/2/7.
*/
interface node...
第一步,构建trie树,定义node类型:
/** * created by zhaoyy on 2017/2/7. */ interface node { char value(); boolean exists(); boolean isroot(); node parent(); node childof(char c); node fail(); void setfail(node node); void setexists(boolean exists); void add(node child); list<node> children(); }
第二步,实现两种node,如果词汇全是可打印的ascii字符,就采用asciinode,否则(比如包含汉字),使用基于hash表的mapnode;这两种node均集成自abstractnode:
/** * created by zhaoyy on 2017/2/8. */ abstract class abstractnode implements node { private static final char empty = '\0'; private final char value; private final node parent; private boolean exists; private node fail; public abstractnode(node parent, char value) { this.parent = parent; this.value = value; this.exists = false; this.fail = null; } public abstractnode() { this(null, empty); } private static string tab(int n) { stringbuilder builder = new stringbuilder(); for (int i = 0; i < n; i++) { builder.append('\t'); } return builder.tostring(); } private static string tostring(node node, int depth) { stringbuilder builder = new stringbuilder(); string tab = tab(depth); node fail = node.fail(); node parent = node.parent(); builder .append(tab) .append('<') .append(node.value()) .append(" exists=\"") .append(node.exists()) .append('"') .append(" parent=\"") .append(parent == null ? "null" : parent.value()) .append('"') .append(" fail=\"") .append(fail == null ? "null" : fail.value()) .append("\">\r\n"); for (node child : node.children()) builder.append(tostring(child, depth + 1)); builder .append(tab) .append("</") .append(node.value()) .append(">\r\n"); return builder.tostring(); } @override public char value() { return value; } @override public boolean exists() { return exists; } @override public boolean isroot() { return value == empty; } @override public node parent() { return parent; } @override public node fail() { return fail; } @override public void setfail(node node) { this.fail = node; } @override public void setexists(boolean exists) { this.exists = exists; } @override public string tostring() { return tostring(this, 0); } } ///////////////////////////////////////////////////////////////////////////////////////////// /** * created by zhaoyy on 2017/2/8. */ final class asciinode extends abstractnode implements node { private static final char from = 32; private static final char to = 126; private final node[] children; public asciinode() { super(); this.children = new node[to - from + 1]; } public asciinode(node parent, char value) { super(parent, value); this.children = new node[to - from + 1]; } @override public node childof(char c) { if (c >= from && c <= to) return children[(int) c - from]; else return null; } @override public void add(node child) { int index = (int) child.value(); children[index - from] = child; } @override public list<node> children() { list<node> nodes = new arraylist<node>(); for (node child : children) if (child != null) nodes.add(child); return nodes; } } ////////////////////////////////////////////////////////////////////////////////////////////// /** * created by zhaoyy on 2017/2/8. */ final class mapnode extends abstractnode implements node { private final map<character, node> children; public mapnode() { super(); this.children = new hashmap<character, node>(); } public mapnode(node parent, char value) { super(parent, value); this.children = new hashmap<character, node>(); } @override public node childof(char c) { return children.get(c); } @override public void add(node child) { children.put(child.value(), child); } @override public list<node> children() { list<node> nodes = new arraylist<node>(); nodes.addall(children.values()); return nodes; } }
第三步,
首先定义一个node构造器:
/** * created by zhaoyy on 2017/2/8. */ public interface nodemaker { node make(node parent, char value); node makeroot(); }
然后构建ac自动机,实现创建及查找方法:
/** * created by zhaoyy on 2017/2/7. */ public final class wordtable { private final node root; private wordtable(collection<? extends charsequence> words, nodemaker maker) { node root = buildtrie(words, maker); setfailnode(root); this.root = root; } public static wordtable compile(collection<? extends charsequence> words) { if (words == null || words.isempty()) throw new illegalargumentexception(); final nodemaker maker; if (isascii(words)) maker = new nodemaker() { @override public node make(node parent, char value) { return new asciinode(parent, value); } @override public node makeroot() { return new asciinode(); } }; else maker = new nodemaker() { @override public node make(node parent, char value) { return new mapnode(parent, value); } @override public node makeroot() { return new mapnode(); } }; return new wordtable(words, maker); } private static boolean isascii(collection<? extends charsequence> words) { for (charsequence word : words) { int len = word.length(); for (int i = 0; i < len; i++) { int c = (int) word.charat(i); if (c < 32 || c > 126) return false; } } return true; } private static node buildtrie(collection<? extends charsequence> sequences, nodemaker maker) { node root = maker.makeroot(); for (charsequence sequence : sequences) { int len = sequence.length(); node current = root; for (int i = 0; i < len; i++) { char c = sequence.charat(i); node node = current.childof(c); if (node == null) { node = maker.make(current, c); current.add(node); } current = node; if (i == len - 1) node.setexists(true); } } return root; } private static void setfailnode(final node root) { root.setfail(null); queue<node> queue = new linkedlist<node>(); queue.add(root); while (!queue.isempty()) { node parent = queue.poll(); node temp; for (node child : parent.children()) { if (parent.isroot()) child.setfail(root); else { temp = parent.fail(); while (temp != null) { node node = temp.childof(child.value()); if (node != null) { child.setfail(node); break; } temp = temp.fail(); } if (temp == null) child.setfail(root); } queue.add(child); } } } public boolean findanyin(charsequence cs) { int len = cs.length(); node node = root; for (int i = 0; i < len; i++) { node next = node.childof(cs.charat(i)); if (next == null) { next = node.fail(); if (next == null) { node = root; continue; } } if (next.exists()) return true; } return false; } public list<matchinfo> search(charsequence cs) { if (cs == null || cs.length() == 0) return collections.emptylist(); list<matchinfo> result = new arraylist<matchinfo>(); int len = cs.length(); node node = root; for (int i = 0; i < len; i++) { node next = node.childof(cs.charat(i)); if (next == null) { next = node.fail(); if (next == null) { node = root; continue; } } if (next.exists()) { matchinfo info = new matchinfo(i, next); result.add(info); node = root; continue; } node = next; } return result; } @override public string tostring() { return root.tostring(); } }
定义一个保存查找结果的实体:
/** * created by zhaoyy on 2017/2/7. */ public final class matchinfo { private final int index; private final string word; public matchinfo(int index, string word) { this.index = index; this.word = word; } public matchinfo(int index, node node) { stringbuilder builder = new stringbuilder(); while (node != null) { if (!node.isroot()) builder.append(node.value()); node = node.parent(); } string word = builder.reverse().tostring(); this.index = index + 1 - word.length(); this.word = word; } public int getindex() { return index; } public string getword() { return word; } @override public string tostring() { return index + ":" + word; } }
第四步,调用demo:
public static void main(string[] args) { list<string> list = arrays.aslist("say", "her", "he", "she", "shr", "alone"); wordtable table = wordtable.compile(list); system.out.println(table); system.out.println(table.search("1shesaynothingabouthislivinghimalone")); }
以下是输出结果:
< exists="false" parent="null" fail="null"> <s exists="false" parent=" " fail=" "> <a exists="false" parent="s" fail="a"> <y exists="true" parent="a" fail=" "> </y> </a> <h exists="false" parent="s" fail="h"> <e exists="true" parent="h" fail="e"> </e> <r exists="true" parent="h" fail=" "> </r> </h> </s> <h exists="false" parent=" " fail=" "> <e exists="true" parent="h" fail=" "> <r exists="true" parent="e" fail=" "> </r> </e> </h> <a exists="false" parent=" " fail=" "> <l exists="false" parent="a" fail=" "> <o exists="false" parent="l" fail=" "> <n exists="false" parent="o" fail=" "> <e exists="true" parent="n" fail=" "> </e> </n> </o> </l> </a> </ > [1:she, 4:say, 31:alone]
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持。