敏感词汇过滤DFA算法
2016-03-06 12:53
471 查看
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace SensitiveWordFilter
{
    /// <summary>
    /// Sensitive-word filter built on a character trie (the "DFA" model).
    /// Each trie node is a <c>Dictionary&lt;char, object&gt;</c> whose entries are child
    /// nodes keyed by character, plus one reserved entry keyed by the marker
    /// character whose value is the string "1" (a word ends here) or "0" (it does not).
    /// </summary>
    public class SensitiveWord
    {
        // Reserved key that stores the end-of-word flag inside every non-root node.
        // Because it is reserved, it is ignored both in dictionary words and in
        // scanned text; looking it up as a normal character would return the flag
        // string instead of a child node.
        private const char IsEndChar = '$';

        /// <summary>
        /// Loads the word list and builds the trie.
        /// </summary>
        public class SensitiveWordInit
        {
            // Name of the character encoding used to read the word list.
            private static readonly String ENCODING = "UTF-8";

            /// <summary>
            /// Reads the word list from disk and builds the trie from it.
            /// </summary>
            /// <returns>The root node of the trie.</returns>
            public Dictionary<char, object> initKeyWord()
            {
                HashSet<String> wordSet = readSensitiveWordFile();
                return addSensitiveWordToHashMap(wordSet);
            }

            /// <summary>
            /// Builds the trie from a set of words. For the words
            /// 中国, 中国人民, 中国男人 and 五星红旗 the result looks like:
            /// <code>
            /// 中 = { isEnd = 0
            ///   国 = { isEnd = 1
            ///     人 = { isEnd = 0
            ///       民 = { isEnd = 1 } }
            ///     男 = { isEnd = 0
            ///       人 = { isEnd = 1 } } } }
            /// 五 = { isEnd = 0
            ///   星 = { isEnd = 0
            ///     红 = { isEnd = 0
            ///       旗 = { isEnd = 1 } } } }
            /// </code>
            /// </summary>
            /// <param name="wordSet">Words to insert; null or empty entries are skipped.</param>
            /// <returns>The root node of the trie.</returns>
            private Dictionary<char, object> addSensitiveWordToHashMap(HashSet<String> wordSet)
            {
                // Pre-size the root to limit rehashing while inserting.
                Dictionary<char, object> wordMap = new Dictionary<char, object>(wordSet.Count);

                foreach (String word in wordSet)
                {
                    if (string.IsNullOrEmpty(word))
                    {
                        continue;
                    }

                    Dictionary<char, object> node = wordMap;
                    foreach (char keyChar in word)
                    {
                        // The marker character is reserved and never stored as a child key.
                        if (keyChar == IsEndChar)
                        {
                            continue;
                        }

                        object child;
                        if (node.TryGetValue(keyChar, out child))
                        {
                            node = (Dictionary<char, object>)child;
                        }
                        else
                        {
                            // New nodes start as "not a word end"; the flag is raised below.
                            Dictionary<char, object> newMap = new Dictionary<char, object>();
                            newMap.Add(IsEndChar, "0");
                            node.Add(keyChar, newMap);
                            node = newMap;
                        }
                    }

                    // Flag the node reached after the last stored character. Doing this
                    // after the loop (rather than on the last index) also covers words
                    // whose final character is the skipped marker. The root is never
                    // flagged: it is reached only when the word consisted of markers.
                    if (!ReferenceEquals(node, wordMap))
                    {
                        node[IsEndChar] = "1";
                    }
                }

                return wordMap;
            }

            /// <summary>
            /// Reads the word list from the file "dic.txt" (relative path), one word
            /// per line, into a set. Empty lines are skipped.
            /// </summary>
            /// <returns>The distinct non-empty lines of the file.</returns>
            private HashSet<String> readSensitiveWordFile()
            {
                HashSet<String> wordSet = new HashSet<string>();
                foreach (string line in File.ReadAllLines("dic.txt", Encoding.GetEncoding(ENCODING)))
                {
                    if (line.Length > 0)
                    {
                        wordSet.Add(line);
                    }
                }
                return wordSet;
            }
        }

        /// <summary>
        /// Detects, extracts and masks sensitive words in text using the trie
        /// produced by <see cref="SensitiveWordInit"/>.
        /// </summary>
        public class SensitivewordFilter
        {
            private readonly Dictionary<char, object> sensitiveWordMap;

            // Minimum-match rule (see CheckSensitiveWord for the exact semantics).
            // The misspelled name is kept because it is part of the public surface.
            public static int minMatchTYpe = 1;

            // Maximum-match rule: always take the longest word.
            public static int maxMatchType = 2;

            // Lazily created shared instance. Creation is not synchronized.
            private static SensitivewordFilter inst = null;

            /// <summary>
            /// Builds the trie from the word list on disk.
            /// </summary>
            private SensitivewordFilter()
            {
                sensitiveWordMap = new SensitiveWordInit().initKeyWord();
            }

            /// <summary>
            /// Returns the shared filter instance, creating it on first use.
            /// </summary>
            public static SensitivewordFilter getInstance()
            {
                if (null == inst)
                {
                    inst = new SensitivewordFilter();
                }
                return inst;
            }

            /// <summary>
            /// Tells whether the text contains at least one sensitive word.
            /// </summary>
            /// <param name="txt">Text to scan; null or empty yields false.</param>
            /// <param name="matchType">Matching rule, <see cref="minMatchTYpe"/> or <see cref="maxMatchType"/>.</param>
            /// <returns>True when a sensitive word starts at some position of the text.</returns>
            public bool isContaintSensitiveWord(String txt, int matchType = 1)
            {
                if (string.IsNullOrEmpty(txt))
                {
                    return false;
                }

                for (int i = 0; i < txt.Length; i++)
                {
                    // The first hit settles the answer; no need to scan the rest.
                    if (CheckSensitiveWord(txt, i, matchType) > 0)
                    {
                        return true;
                    }
                }
                return false;
            }

            /// <summary>
            /// Collects the distinct sensitive words found in the text, scanning left
            /// to right and resuming after each match (matches do not overlap).
            /// </summary>
            /// <param name="txt">Text to scan; null or empty yields an empty set.</param>
            /// <param name="matchType">Matching rule, <see cref="minMatchTYpe"/> or <see cref="maxMatchType"/>.</param>
            /// <returns>The matched substrings of <paramref name="txt"/>.</returns>
            public HashSet<String> getSensitiveWord(String txt, int matchType = 1)
            {
                HashSet<String> sensitiveWordList = new HashSet<String>();
                if (string.IsNullOrEmpty(txt))
                {
                    return sensitiveWordList;
                }

                int i = 0;
                while (i < txt.Length)
                {
                    int length = CheckSensitiveWord(txt, i, matchType);
                    if (length > 0)
                    {
                        sensitiveWordList.Add(txt.Substring(i, length));
                        i += length;
                    }
                    else
                    {
                        i++;
                    }
                }
                return sensitiveWordList;
            }

            /// <summary>
            /// Returns a copy of the text in which every character of every matched
            /// sensitive word is replaced by <paramref name="replaceChar"/>.
            /// </summary>
            /// <param name="txt">Text to mask; null or empty is returned unchanged.</param>
            /// <param name="replaceChar">Replacement emitted once per masked character; may be any length.</param>
            /// <param name="matchType">Matching rule, <see cref="minMatchTYpe"/> or <see cref="maxMatchType"/>.</param>
            /// <returns>The masked text.</returns>
            public String replaceSensitiveWord(String txt, String replaceChar, int matchType = 1)
            {
                if (string.IsNullOrEmpty(txt))
                {
                    return txt;
                }

                // The output is assembled by appending instead of editing a copy in
                // place, so positions in txt stay valid even when the replacement is
                // not exactly one character long.
                StringBuilder sb = new StringBuilder(txt.Length);
                int i = 0;
                while (i < txt.Length)
                {
                    int length = CheckSensitiveWord(txt, i, matchType);
                    if (length > 0)
                    {
                        sb.Append(getReplaceChars(replaceChar, length));
                        i += length;
                    }
                    else
                    {
                        sb.Append(txt[i]);
                        i++;
                    }
                }
                return sb.ToString();
            }

            /// <summary>
            /// Repeats <paramref name="replaceChar"/> <paramref name="length"/> times.
            /// </summary>
            private String getReplaceChars(String replaceChar, int length)
            {
                StringBuilder sb = new StringBuilder();
                for (int i = 0; i < length; i++)
                {
                    sb.Append(replaceChar);
                }
                return sb.ToString();
            }

            /// <summary>
            /// Tells whether the node carries the "a word ends here" flag.
            /// </summary>
            private static bool isWordEnd(Dictionary<char, object> node)
            {
                object marker;
                return node.TryGetValue(IsEndChar, out marker)
                    && string.Equals("1", marker as string, StringComparison.Ordinal);
            }

            /// <summary>
            /// Checks whether a sensitive word starts at <paramref name="beginIndex"/>.
            /// <para>
            /// With <see cref="minMatchTYpe"/> the scan stops at the first complete word
            /// that cannot be extended (no longer word shares it as a prefix) or that
            /// ends the text. If the first complete word is a prefix of longer words,
            /// the scan tries to reach the next complete word and falls back to the
            /// shorter one when that fails. With any other value the longest word is taken.
            /// </para>
            /// <para>
            /// The marker character '$' inside the text is skipped while matching but is
            /// included in the returned span; a match never starts on it. Words with
            /// fewer than two matched characters are never reported.
            /// </para>
            /// </summary>
            /// <param name="txt">Text to inspect.</param>
            /// <param name="beginIndex">Position at which the word must start.</param>
            /// <param name="matchType">Matching rule, <see cref="minMatchTYpe"/> or <see cref="maxMatchType"/>.</param>
            /// <returns>
            /// The number of characters of <paramref name="txt"/> covered by the match,
            /// starting at <paramref name="beginIndex"/>, or 0 when there is no match,
            /// the text is null or empty, or the index is outside the text.
            /// </returns>
            public int CheckSensitiveWord(String txt, int beginIndex, int matchType)
            {
                if (string.IsNullOrEmpty(txt) || beginIndex < 0 || beginIndex >= txt.Length)
                {
                    return 0;
                }

                // Starting on a skipped marker would make the span begin one character
                // too early; the caller finds the word at the next index instead.
                if (txt[beginIndex] == IsEndChar)
                {
                    return 0;
                }

                bool minMatch = (SensitivewordFilter.minMatchTYpe == matchType);
                int lastIndex = txt.Length - 1;
                Dictionary<char, object> node = sensitiveWordMap;

                int matchedChars = 0;   // characters consumed from the trie so far
                int acceptedChars = 0;  // matchedChars at the last complete word
                int acceptedSpan = 0;   // text length covered by the last complete word
                bool pending = false;   // a complete word was seen but the scan went on

                for (int i = beginIndex; i <= lastIndex; i++)
                {
                    char current = txt[i];
                    if (current == IsEndChar)
                    {
                        continue;
                    }

                    object child;
                    if (!node.TryGetValue(current, out child))
                    {
                        // Dead end: whatever complete word was recorded last stands.
                        break;
                    }

                    node = (Dictionary<char, object>)child;
                    matchedChars++;

                    if (!isWordEnd(node))
                    {
                        continue;
                    }

                    acceptedChars = matchedChars;
                    acceptedSpan = i - beginIndex + 1;

                    // node.Count == 1 means the node holds only the flag entry,
                    // i.e. no longer word continues from here.
                    if (minMatch && (pending || node.Count == 1 || i == lastIndex))
                    {
                        break;
                    }
                    pending = true;
                }

                return acceptedChars >= 2 ? acceptedSpan : 0;
            }
        }
    }

    /// <summary>
    /// Console demo: masks the sensitive words of a sample sentence.
    /// </summary>
    class Program
    {
        static void Main(string[] args)
        {
            SensitiveWord.SensitivewordFilter filter = SensitiveWord.SensitivewordFilter.getInstance();
            String txt = "$fuckfuck you你麻痹e菜太菜了fuckyou从飞啊 fuck you";
            String hou = filter.replaceSensitiveWord(txt, "*");
            Console.WriteLine("替换前的文字为:" + txt);
            Console.WriteLine("替换后的文字为:" + hou);
            Console.ReadKey();
        }
    }
}
相关文章推荐
- 高效敏感词过滤JAVA实现(DFA算法)
- java敏感词过滤-使用HashMap实现DFA算法
- Java使用DFA算法实现敏感词过滤
- JavaWeb敏感词过滤实现
- php实现简单的基于DFA算法的敏感词过滤
- 简单实现java DFA算法对敏感词过滤
- Java使用DFA算法实现过滤多家公司自定义敏感字功能详解
- java利用DFA算法实现敏感词过滤功能
- 敏感词汇过滤DFA算法
- JavaScript 实现瀑布流原理和效果, 滚动加载图片【图文解析 附源码】
- python os 常用命令
- BestCoder #74 B (div2)
- Android 常用代码
- 编译php5.4的时候出现错误----configure: error: in `/usr/local/src/php540/php-5.4.0':
- 历届试题 连号区间数
- 课程设计__复数的计算
- 机器学习技法 笔记五 Kernel Logistic Regression
- VBA中将数字列转换成字母的方法
- 构建之法问题
- hdu 5233 Gunner II (map的简单用法)