C# 实现敏感词过滤
程序员文章站
2022-06-13 08:25:08
实现 该 敏感词过滤 采用的是 DFA算法,参考文章:https://blog.csdn.net/chenssy/article/details/26961957 具体 实现 步骤 如下: 第一步,构建 敏感词库(WordsLibrary) 类: using System.Collections.G ......
该敏感词过滤的实现采用的是 DFA 算法,参考文章:https://blog.csdn.net/chenssy/article/details/26961957
具体实现步骤如下:
第一步,构建敏感词库(WordsLibrary)类:
using System.Collections.Generic;
using System.Linq;
using System;

namespace ContentSafe.SensitiveWord
{
    /// <summary>
    /// Sensitive-word dictionary. The words are indexed as a character trie
    /// (one node per character) so that a checker can walk the text and the
    /// trie in lock-step.
    /// </summary>
    public class WordsLibrary
    {
        /// <summary>
        /// A single trie node.
        /// </summary>
        public class ItemTree
        {
            /// <summary>Character stored at this node.</summary>
            public char Item { get; set; }

            /// <summary><c>true</c> when the path from the root to this node spells a complete word.</summary>
            public bool IsEnd { get; set; }

            /// <summary>Child nodes; <c>null</c> for a leaf that has no continuation.</summary>
            public List<ItemTree> Child { get; set; }
        }

        /// <summary>
        /// Root of the trie. The root itself carries no word character; the
        /// first characters of all words are found in its <see cref="ItemTree.Child"/> list.
        /// </summary>
        public ItemTree Library { get; private set; }

        /// <summary>
        /// The raw sensitive words the trie was built from.
        /// </summary>
        public string[] Words { get; protected set; }

        /// <summary>
        /// Builds the library from whatever <see cref="LoadWords"/> puts into <see cref="Words"/>.
        /// </summary>
        public WordsLibrary()
        {
            // NOTE: LoadWords is virtual and is invoked from the constructor, so an
            // override runs before the derived class' own constructor body.
            LoadWords();
            Init();
        }

        /// <summary>
        /// Builds the library from an explicit word list.
        /// </summary>
        /// <param name="words">
        /// Sensitive words. When <c>null</c>, the words provided by <see cref="LoadWords"/> (if any) are used.
        /// </param>
        public WordsLibrary(string[] words)
        {
            // LoadWords is still called so that overrides keep being invoked exactly
            // as they were when this constructor chained to the parameterless one.
            LoadWords();

            // The words must be assigned BEFORE Init() runs; otherwise the trie is
            // built from an empty list and nothing is ever detected.
            if (words != null)
                Words = words;

            Init();
        }

        /// <summary>
        /// Hook for loading the sensitive words; override it and assign <see cref="Words"/>
        /// to customise where the words come from. The default implementation does nothing.
        /// </summary>
        public virtual void LoadWords()
        {
        }

        /// <summary>
        /// Builds the trie from <see cref="Words"/>.
        /// </summary>
        private void Init()
        {
            if (Words == null)
                Words = new[] { "" };

            Library = new ItemTree()
            {
                Item = 'r',
                IsEnd = false,
                Child = CreateTree(Words)
            };
        }

        /// <summary>
        /// Creates the first level of the trie and inserts every non-empty word.
        /// </summary>
        /// <param name="words">Sensitive words; <c>null</c> and empty entries are skipped.</param>
        /// <returns>The list of first-character nodes; never <c>null</c>, possibly empty.</returns>
        private List<ItemTree> CreateTree(string[] words)
        {
            // Always return a list so that the root's Child can be searched safely,
            // even when there are no words at all.
            List<ItemTree> roots = new List<ItemTree>();
            if (words == null)
                return roots;

            foreach (string word in words)
            {
                if (!string.IsNullOrEmpty(word))
                    AddWord(roots, word);
            }

            return roots;
        }

        /// <summary>
        /// Inserts one word into the trie, reusing existing nodes for shared prefixes.
        /// </summary>
        /// <param name="roots">First-level node list of the trie.</param>
        /// <param name="word">A non-empty sensitive word.</param>
        private static void AddWord(List<ItemTree> roots, string word)
        {
            ItemTree parent = null;
            List<ItemTree> siblings = roots;

            for (int i = 0; i < word.Length; i++)
            {
                char cha = word[i];

                // A leaf has Child == null; create its child list lazily so that
                // leaves stay recognisable by a null Child.
                if (siblings == null)
                {
                    siblings = new List<ItemTree>();
                    parent.Child = siblings;
                }

                ItemTree node = siblings.Find(e => e.Item == cha);
                if (node == null)
                {
                    node = new ItemTree() { Item = cha, IsEnd = false, Child = null };
                    siblings.Add(node);
                }

                parent = node;
                siblings = node.Child;
            }

            // word is non-empty, so parent is the node of its last character.
            parent.IsEnd = true;
        }
    }
}
第二步,构建敏感词检测(ContentCheck)类:
using System.Collections.Generic;
using System.Linq;
using System;

namespace ContentSafe.SensitiveWord
{
    /// <summary>
    /// Scans text for sensitive words using the trie of a <see cref="WordsLibrary"/>,
    /// and can either list the words found or mask them with a replacement character.
    /// </summary>
    public class ContentCheck
    {
        /// <summary>
        /// One hit in the scanned text: where it starts and how many characters it covers.
        /// </summary>
        private struct Match
        {
            public readonly int Start;
            public readonly int Length;

            public Match(int start, int length)
            {
                Start = start;
                Length = length;
            }
        }

        /// <summary>
        /// Text used by the parameterless <c>FindSensitiveWords()</c> / <c>SensitiveWordsReplace()</c> overloads.
        /// </summary>
        public string Text { private get; set; }

        /// <summary>
        /// Root node of the sensitive-word trie.
        /// </summary>
        public WordsLibrary.ItemTree Library { private get; set; }

        /// <summary>
        /// Creates a checker with no library and no text; set <see cref="Library"/> before use.
        /// </summary>
        public ContentCheck() { }

        /// <summary>
        /// Creates a checker bound to a word library.
        /// </summary>
        /// <param name="library">Sensitive-word library.</param>
        /// <exception cref="ArgumentNullException"><paramref name="library"/> is <c>null</c>.</exception>
        /// <exception cref="InvalidOperationException">The library has no trie.</exception>
        public ContentCheck(WordsLibrary library)
        {
            if (library == null)
                throw new ArgumentNullException("library");
            if (library.Library == null)
                throw new InvalidOperationException("敏感词库未初始化");

            Library = library.Library;
        }

        /// <summary>
        /// Creates a checker bound to a word library and a text to scan.
        /// </summary>
        /// <param name="library">Sensitive-word library.</param>
        /// <param name="text">Text to scan.</param>
        /// <exception cref="ArgumentNullException"><paramref name="library"/> or <paramref name="text"/> is <c>null</c>.</exception>
        public ContentCheck(WordsLibrary library, string text) : this(library)
        {
            if (text == null)
                throw new ArgumentNullException("text", "检测文本不能为null");

            Text = text;
        }

        /// <summary>
        /// Finds all non-overlapping sensitive words in <paramref name="text"/>, scanning
        /// left to right and preferring the longest word at each start position.
        /// </summary>
        /// <param name="text">Text to scan.</param>
        /// <returns>The hits in ascending start order; empty when nothing matched.</returns>
        /// <exception cref="InvalidOperationException">No trie has been set.</exception>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> is <c>null</c>.</exception>
        private List<Match> FindMatches(string text)
        {
            if (Library == null)
                throw new InvalidOperationException("未设置敏感词库 词树");
            if (text == null)
                throw new ArgumentNullException("text", "检测文本不能为null");

            List<Match> matches = new List<Match>();
            int start = 0;
            while (start < text.Length)
            {
                int length = LongestMatchAt(text, start);
                if (length > 0)
                {
                    matches.Add(new Match(start, length));
                    // Resume right after the hit so hits never overlap.
                    start += length;
                }
                else
                {
                    // Every start position is tried; a failed partial match
                    // never causes a position to be skipped.
                    start++;
                }
            }

            return matches;
        }

        /// <summary>
        /// Walks the trie from the root along <paramref name="text"/> beginning at
        /// <paramref name="start"/> and returns the length of the longest word that ends on the way.
        /// </summary>
        /// <param name="text">Text being scanned.</param>
        /// <param name="start">Index of the first character to test.</param>
        /// <returns>Length of the longest word starting at <paramref name="start"/>, or 0 when there is none.</returns>
        private int LongestMatchAt(string text, int start)
        {
            List<WordsLibrary.ItemTree> children = Library.Child;
            int best = 0;

            for (int pos = start; pos < text.Length && children != null; pos++)
            {
                char cha = text[pos];
                WordsLibrary.ItemTree node = children.Find(e => e.Item == cha);
                if (node == null)
                    break;

                // Remember the word ending here but keep walking: a longer word may
                // follow, and if it does not pan out the shorter one is still reported.
                // A node without children is treated as a word end as well.
                if (node.IsEnd || node.Child == null)
                    best = pos - start + 1;

                children = node.Child;
            }

            return best;
        }

        /// <summary>
        /// Replaces every character of every hit with <paramref name="newChar"/>.
        /// </summary>
        /// <param name="text">Original text.</param>
        /// <param name="matches">Hits inside <paramref name="text"/>.</param>
        /// <param name="newChar">Replacement character.</param>
        /// <returns>The masked text, or <paramref name="text"/> itself when there are no hits.</returns>
        private static string Mask(string text, List<Match> matches, char newChar)
        {
            if (matches.Count == 0)
                return text;

            char[] chars = text.ToCharArray();
            foreach (Match match in matches)
            {
                int end = match.Start + match.Length;
                for (int i = match.Start; i < end; i++)
                    chars[i] = newChar;
            }

            return new string(chars);
        }

        /// <summary>
        /// Masks the sensitive words of <paramref name="text"/> using the given library.
        /// </summary>
        /// <param name="library">Sensitive-word library.</param>
        /// <param name="text">Text to scan.</param>
        /// <param name="newChar">Replacement character.</param>
        /// <returns>The text with every character of every sensitive word replaced.</returns>
        public static string SensitiveWordsReplace(WordsLibrary library, string text, char newChar = '*')
        {
            return new ContentCheck(library).SensitiveWordsReplace(text, newChar);
        }

        /// <summary>
        /// Masks the sensitive words of <paramref name="text"/>.
        /// </summary>
        /// <param name="text">Text to scan.</param>
        /// <param name="newChar">Replacement character.</param>
        /// <returns>The text with every character of every sensitive word replaced.</returns>
        public string SensitiveWordsReplace(string text, char newChar = '*')
        {
            return Mask(text, FindMatches(text), newChar);
        }

        /// <summary>
        /// Masks the sensitive words of <see cref="Text"/>.
        /// </summary>
        /// <param name="newChar">Replacement character.</param>
        /// <returns>The text with every character of every sensitive word replaced.</returns>
        /// <exception cref="InvalidOperationException"><see cref="Text"/> has not been set.</exception>
        public string SensitiveWordsReplace(char newChar = '*')
        {
            if (Text == null)
                throw new InvalidOperationException("未设置检测文本");

            return SensitiveWordsReplace(Text, newChar);
        }

        /// <summary>
        /// Lists the sensitive words found in <paramref name="text"/> using the given library.
        /// </summary>
        /// <param name="library">Sensitive-word library.</param>
        /// <param name="text">Text to scan.</param>
        /// <returns>The distinct words found, or <c>null</c> when there are none.</returns>
        public static List<string> FindSensitiveWords(WordsLibrary library, string text)
        {
            ContentCheck check = new ContentCheck(library, text);
            return check.FindSensitiveWords();
        }

        /// <summary>
        /// Lists the sensitive words found in <paramref name="text"/>.
        /// </summary>
        /// <param name="text">Text to scan.</param>
        /// <returns>
        /// The distinct words found, in order of first appearance, or <c>null</c> when
        /// there are none (kept for compatibility with existing callers).
        /// </returns>
        public List<string> FindSensitiveWords(string text)
        {
            List<Match> matches = FindMatches(text);
            if (matches.Count == 0)
                return null;

            // Each hit is cut out by its own bounds, so two words that sit directly
            // next to each other in the text are reported separately.
            return matches
                .Select(m => text.Substring(m.Start, m.Length))
                .Distinct()
                .ToList();
        }

        /// <summary>
        /// Lists the sensitive words found in <see cref="Text"/>.
        /// </summary>
        /// <returns>The distinct words found, or <c>null</c> when there are none.</returns>
        /// <exception cref="InvalidOperationException"><see cref="Text"/> has not been set.</exception>
        public List<string> FindSensitiveWords()
        {
            if (Text == null)
                throw new InvalidOperationException("未设置检测文本");

            return FindSensitiveWords(Text);
        }
    }
}
第三步,测试与使用方法:
// Sensitive words to detect; larger word lists can be found and downloaded online.
string[] words = new[] { "敏感词1", "敏感词2", "含有", "垃圾" };

// WordsLibrary can be subclassed: override LoadWords to customise how the words are loaded.
var library = new WordsLibrary(words);

string text = "在任意一个文本中都可能包含敏感词1、2、3等等,只要含有敏感词都会被找出来,比如:垃圾";

// Bind the checker to the library and the text to scan.
ContentCheck check = new ContentCheck(library, text);

// List of the sensitive words found in the text.
var list = check.FindSensitiveWords();

// The text with the sensitive words masked by the default replacement character '*'.
var str = check.SensitiveWordsReplace();
该实现方案不止这一种使用方法,更多使用方法可自行研究。