Web 系统安全运营之基础——基于 DFA 算法的高性能敏感词、脏词检测过滤算法类(C#)
程序员文章站
2022-06-21 20:28:40
想持久运营一款web或移动端的产品,对内容进行必要的把关必不可少。这里分享一个基于DFA算法的高性能的敏感词,脏词的检测过滤算法类(c#). ......
【概述】要做好一个 Web 系统的安全运维,除了常规的防注入、防入侵等措施之外,还有一项必不可少的工作:检测并过滤敏感词、脏词。这件事做得不好,轻则引发投诉或纠纷,重则导致产品被勒令关闭停运。
废话少说,先看下代码,可以拿过去直接使用。
using Microsoft.VisualBasic;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;

namespace OpenCore.ContentSecurity
{
    /// <summary>
    /// Detects forbidden keywords in text and masks them with a replacement character.
    /// Keywords are indexed by their first character; the remainder of each keyword is
    /// matched against the text while tolerating separator characters between letters.
    /// Design reference: https://blog.csdn.net/u011966339/article/details/72832197
    /// Change log:
    ///   2020-4-15: dictionaries are loaded once through <see cref="Init"/> instead of per use;
    ///              several dictionary files can be loaded together;
    ///              matching details hardened.
    /// </summary>
    public class SensitiveWordFilter
    {
        /// <summary>
        /// One slot per UTF-16 code unit, so every char value (char.MaxValue included)
        /// is a valid index into the lexicon.
        /// </summary>
        private const int LexiconSize = char.MaxValue + 1;

        private static string[] dictionaryPathList = null;

        /// <summary>
        /// In-memory lexicon: slot [c] holds the tails of all keywords starting with c.
        /// Null until a dictionary has been loaded. A finished lexicon is never mutated;
        /// reloading builds a new array and swaps the reference.
        /// </summary>
        private static volatile WordGroup[] memoryLexicon = null;

        /// <summary>Serialises dictionary (re)loading.</summary>
        private static readonly object lockObj = new object();

        /// <summary>Ensures a failing simplified-Chinese conversion is logged once, not per word.</summary>
        private static bool conversionFailureLogged = false;

        /// <summary>
        /// Configures the dictionary files and loads them into memory.
        /// </summary>
        /// <param name="sDictionaryFileName">Paths of the keyword files, one keyword per line.</param>
        public static void Init(string[] sDictionaryFileName)
        {
            lock (lockObj)
            {
                dictionaryPathList = sDictionaryFileName;
                LoadDictionary();
            }
        }

        public SensitiveWordFilter()
        {
        }

        private readonly List<string> illegalWords = new List<string>();

        /// <summary>
        /// Keywords found by the most recent <see cref="GetDataByFilter"/> call on this instance,
        /// as they appeared in the input (separators included).
        /// </summary>
        public List<string> IllegalWords
        {
            get { return illegalWords; }
        }

        /// <summary>
        /// True for characters in the CJK ideograph range 0x4E00-0x9FA5.
        /// </summary>
        private static bool IsChs(char character)
        {
            return character >= 0x4e00 && character <= 0x9fa5;
        }

        /// <summary>
        /// True for ASCII digits.
        /// </summary>
        private static bool IsNum(char character)
        {
            return character >= '0' && character <= '9';
        }

        /// <summary>
        /// True for ASCII letters.
        /// </summary>
        private static bool IsAlphabet(char character)
        {
            return (character >= 'a' && character <= 'z') || (character >= 'A' && character <= 'Z');
        }

        /// <summary>
        /// True when the character counts as part of a word. Everything else is treated
        /// as a separator that may be skipped while matching.
        /// </summary>
        private static bool IsWordChar(char character)
        {
            return IsChs(character) || IsNum(character) || IsAlphabet(character);
        }

        /// <summary>
        /// Normalises text for comparison: full-width characters become half-width and
        /// letters become lower case.
        /// </summary>
        /// <param name="input">Any non-null string.</param>
        /// <returns>The normalised string; it has exactly the same length as the input.</returns>
        /// <remarks>
        /// The full-width space is 12288 and the half-width space is 32.
        /// Other half-width characters (33-126) differ from their full-width forms
        /// (65281-65374) by 65248.
        /// </remarks>
        private static string ToDBC(string input)
        {
            char[] c = input.ToCharArray();
            for (int i = 0; i < c.Length; i++)
            {
                if (c[i] == 12288)
                {
                    c[i] = (char)32;
                    continue;
                }
                if (c[i] > 65280 && c[i] < 65375)
                    c[i] = (char)(c[i] - 65248);
                // Per-character, culture-independent lowering keeps indexes aligned with
                // the original text and gives the same result on every machine.
                c[i] = char.ToLowerInvariant(c[i]);
            }
            return new string(c);
        }

        /// <summary>
        /// Converts text to simplified Chinese. Returns the input unchanged when the
        /// conversion is unavailable or fails.
        /// </summary>
        /// <param name="sInput">Text to convert.</param>
        /// <returns>The converted text, the input on failure, or an empty string for null/empty input.</returns>
        private static string ToSimplifiedChinese(string sInput)
        {
            if (string.IsNullOrEmpty(sInput))
            {
                return string.Empty;
            }
            try
            {
                return Strings.StrConv(sInput, VbStrConv.SimplifiedChinese, 0);
            }
            catch (Exception ex)
            {
                // Best effort: the conversion depends on platform support, so a failure must
                // not stop dictionary loading. Log once to avoid one log line per keyword.
                if (!conversionFailureLogged)
                {
                    conversionFailureLogged = true;
                    SaveLog("SensitiveWordFilter.ToSimplifiedChinese failed; dictionary entries are used as written: " + ex.Message);
                }
            }
            return sInput;
        }

        /// <summary>
        /// Appends a message to a daily log file under the application base directory.
        /// Logging is best effort and never throws.
        /// </summary>
        /// <param name="msg">Message text.</param>
        private static void SaveLog(string msg)
        {
            try
            {
                string sPath = Path.Combine(AppDomain.CurrentDomain.SetupInformation.ApplicationBase, "securitylog");
                // CreateDirectory does nothing when the directory already exists.
                Directory.CreateDirectory(sPath);
                // "MM" is the month; the lower-case "mm" would be minutes.
                sPath = Path.Combine(sPath, DateTime.Now.ToString("yyyyMMdd") + ".log");
                File.AppendAllText(sPath, "[" + DateTime.Now.ToString() + "]" + msg + "\r\n");
            }
            catch
            {
                // A filter must not fail because its log cannot be written.
            }
        }

        /// <summary>
        /// Loads all configured dictionary files into a new lexicon and publishes it.
        /// When the configuration is empty or a file is missing, the problem is logged and
        /// the previously loaded lexicon (if any) stays in effect.
        /// </summary>
        private static void LoadDictionary()
        {
            string[] paths = dictionaryPathList;
            if (paths == null || paths.Length == 0)
            {
                SaveLog("sensitivewordfilter.loaddictionary.字典路径配置为空");
                return;
            }
            foreach (string sFileName in paths)
            {
                if (!File.Exists(sFileName))
                {
                    SaveLog($"sensitivewordfilter.loaddictionary.路径:{sFileName}不是一个有效的文件");
                    return;
                }
            }

            // Ordinal set: removes duplicates across files without depending on the culture.
            HashSet<string> words = new HashSet<string>(StringComparer.Ordinal);
            foreach (string sDictionaryFile in paths)
            {
                // NOTE(review): Encoding.Default differs between .NET Framework (ANSI code page)
                // and .NET Core (UTF-8); files with a byte-order mark are detected either way.
                foreach (string line in File.ReadAllLines(sDictionaryFile, Encoding.Default))
                {
                    if (string.IsNullOrWhiteSpace(line))
                        continue;
                    // Trim so trailing blanks in the file do not become part of the keyword.
                    string key = ToDBC(line.Trim());
                    words.Add(key);
                    // Also index the simplified-Chinese form of traditional-Chinese entries.
                    string keySimple = ToSimplifiedChinese(key);
                    if (keySimple != key)
                    {
                        words.Add(keySimple);
                    }
                }
            }

            WordGroup[] lexicon = new WordGroup[LexiconSize];
            foreach (string word in words)
            {
                if (word.Length == 0)
                    continue;
                WordGroup group = lexicon[word[0]];
                if (group == null)
                {
                    group = new WordGroup();
                    lexicon[word[0]] = group;
                }
                group.Add(word.Substring(1));
            }

            // Publish the finished lexicon in one step so concurrent readers never see a
            // half-built dictionary.
            memoryLexicon = lexicon;
        }

        /// <summary>
        /// Tries to match the tail of a keyword against the text following position
        /// <paramref name="start"/>, skipping separator characters between keyword characters.
        /// </summary>
        /// <param name="text">Normalised text being scanned.</param>
        /// <param name="start">Index of the already matched first keyword character.</param>
        /// <param name="tail">Keyword without its first character; may be empty.</param>
        /// <returns>
        /// Number of text characters consumed after <paramref name="start"/> (skipped separators
        /// included), or -1 when the keyword does not match completely.
        /// </returns>
        private static int MatchLength(string text, int start, string tail)
        {
            int pos = start + 1;
            for (int i = 0; i < tail.Length; i++)
            {
                char wanted = tail[i];
                // Skip separators, but stop on one that is itself the wanted character so
                // keywords containing punctuation can still match.
                while (pos < text.Length && text[pos] != wanted && !IsWordChar(text[pos]))
                {
                    pos++;
                }
                // Running out of text before the keyword is complete is a miss, not a hit.
                if (pos >= text.Length || text[pos] != wanted)
                {
                    return -1;
                }
                pos++;
            }
            return pos - start - 1;
        }

        /// <summary>
        /// Finds forbidden keywords in the input and replaces every character of each hit
        /// with <paramref name="replaceChar"/>. The hits are available through
        /// <see cref="IllegalWords"/> afterwards.
        /// </summary>
        /// <param name="sSourceInput">Text to check.</param>
        /// <param name="replaceChar">Masking character, for example '*'.</param>
        /// <returns>
        /// The masked text. The input is returned unchanged when it is null or empty, or when
        /// no dictionary has been loaded yet.
        /// </returns>
        public string GetDataByFilter(string sSourceInput, char replaceChar)
        {
            // Reset first so an early return never leaves results of a previous call behind.
            illegalWords.Clear();

            if (string.IsNullOrEmpty(sSourceInput))
            {
                return sSourceInput;
            }

            // Work on a snapshot so a concurrent reload cannot change the lexicon mid-scan.
            WordGroup[] lexicon = memoryLexicon;
            if (lexicon == null)
            {
                SaveLog("sensitivewordfilter.getdatabyfilter.内存字典为空");
                return sSourceInput;
            }

            // Normalise once; the result is index-aligned with the original text.
            string normalized = ToDBC(sSourceInput);
            char[] masked = sSourceInput.ToCharArray();

            int i = 0;
            while (i < normalized.Length)
            {
                int best = -1;
                WordGroup group = lexicon[normalized[i]];
                if (group != null)
                {
                    // Take the longest keyword starting here so a short keyword cannot leave
                    // part of a longer one unmasked.
                    for (int z = 0; z < group.Count(); z++)
                    {
                        int length = MatchLength(normalized, i, group.GetWord(z));
                        if (length > best)
                        {
                            best = length;
                        }
                    }
                }

                if (best < 0)
                {
                    i++;
                    continue;
                }

                int hitLength = best + 1;
                illegalWords.Add(sSourceInput.Substring(i, hitLength));
                for (int pos = i; pos < i + hitLength; pos++)
                {
                    masked[pos] = replaceChar;
                }
                i += hitLength;
            }
            return new string(masked);
        }
    }

    /// <summary>
    /// The set of keyword tails that share the same first character.
    /// </summary>
    public class WordGroup
    {
        /// <summary>Tails in insertion order, addressed by index.</summary>
        private readonly List<string> groupList = new List<string>();

        /// <summary>Mirror of <see cref="groupList"/> for constant-time duplicate checks.</summary>
        private readonly HashSet<string> known = new HashSet<string>(StringComparer.Ordinal);

        public WordGroup()
        {
        }

        /// <summary>
        /// Adds a tail unless it is already present.
        /// </summary>
        /// <param name="word">Keyword without its first character.</param>
        public void Add(string word)
        {
            if (known.Add(word))
            {
                groupList.Add(word);
            }
        }

        /// <summary>
        /// Number of tails in the group.
        /// </summary>
        public int Count()
        {
            return groupList.Count;
        }

        /// <summary>
        /// Returns the tail at the given index.
        /// </summary>
        /// <param name="index">Zero-based index, less than <see cref="Count"/>.</param>
        public string GetWord(int index)
        {
            return groupList[index];
        }
    }
}
上面是一个完整的,独立的实现类。 下面给一个简单的调用示例:
// Global configuration: load the dictionaries once for the whole program;
// filter instances created afterwards use the loaded dictionaries.
SensitiveWordFilter.Init(new string[] {
    @"c:\users\x\downloads\网站需要过滤的敏感词\mgck-master\暴恐词库.txt",
    @"c:\users\x\downloads\网站需要过滤的敏感词\mgck-master\反动词库.txt",
    @"c:\users\x\downloads\网站需要过滤的敏感词\mgck-master\民生词库.txt",
    @"c:\users\x\downloads\网站需要过滤的敏感词\mgck-master\色情词库.txt",
    @"c:\users\x\downloads\网站需要过滤的敏感词\mgck-master\贪腐词库.txt",
    @"c:\users\x\downloads\网站需要过滤的敏感词\mgck-master\其他词库.txt"
});

// Filters can be created wherever they are needed. Each instance keeps its own
// result list, so use one instance per concurrent caller.
SensitiveWordFilter wordFilter = new SensitiveWordFilter();

// A few sample inputs to show the effect (only the keys are used).
Dictionary<string, string> dictTestData = new Dictionary<string, string>();
dictTestData["杀^人游戏,有人找一夜q"] = "";
dictTestData["数学学习课堂"] = "";
dictTestData["打击法0功有,法0功毒害大众"] = "";

Dictionary<string, string> dictResult = new Dictionary<string, string>();
foreach (string sKey in dictTestData.Keys)
{
    // Filter first: IllegalWords reflects the most recent GetDataByFilter call.
    string masked = wordFilter.GetDataByFilter(sKey, '|');
    string found = string.Join(",", wordFilter.IllegalWords);
    dictResult[sKey] = $"替换后:{masked}, ------------检测违禁词:{found}";
}

// JsonConverter and Utils are helpers from the author's project; they are not shown here.
string sResultJson = JsonConverter.SerializeObject(dictResult);
Utils.SaveLog(sResultJson);
最后,给一下打印的结果:
"杀^人游戏,有人找一夜q": 替换后: "杀^人游戏,有人找|||", ------------检测违禁词:一夜q",
"数学学习课堂": 替换后:"数学学习课堂", ------------检测违禁词:,
"打击法0功有,法0功毒害大众": 替换后:"打击|||有,|||毒害大众", ------------检测违禁词:法0功,法0功"
-------------附
词库下载地址:https://codeload.github.com/chason777777/mgck/zip/master