您的位置:首页 > 其它

敏感词汇过滤DFA算法

2016-03-06 12:53 471 查看
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace SensitiveWordFilter
{
public class SensitiveWord
{
// Reserved key under which every node of the word tree stores its end-of-word
// flag ("0" or "1"). Because it doubles as a key, this character is skipped
// when it occurs inside a dictionary word or inside the text being scanned.
private static readonly char IsEndChar = '$';

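/**
* Initializes the sensitive-word library.<br>
* Reads the sensitive words and stores them in nested dictionaries<br>
* to build the DFA model used for matching.
*
* @author dxm
*
*/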
public class SensitiveWordInit
{

// Name of the character encoding used when reading the word file
private static readonly  String ENCODING = "UTF-8";

/**
 * Builds the sensitive-word lookup structure.
 *
 * Reads the word list from disk and hands it straight to the tree builder.
 *
 * @return root dictionary of the word tree, keyed by the first character of each word
 */
public Dictionary<char, object> initKeyWord()
{
    return addSensitiveWordToHashMap(readSensitiveWordFile());
}

/**
 * Builds the DFA model (a tree of nested dictionaries) from a set of words.
 *
 * Every node is a Dictionary<char, object> whose IsEndChar entry holds "1" when
 * the path from the root to that node spells a complete word and "0" otherwise;
 * every other entry maps a character to the child node. For the words
 * 中国, 中国人民, 中国男人 and 五星红旗 the result looks like:<br>
 * 中 = {
 *       isEnd = 0
 *       国 = {
 *             isEnd = 1
 *             人 = {
 *                   isEnd = 0
 *                   民 = {
 *                         isEnd = 1
 *                   }
 *             }
 *             男 = {
 *                   isEnd = 0
 *                   人 = {
 *                         isEnd = 1
 *                   }
 *             }
 *       }
 * }
 * 五 = {
 *       isEnd = 0
 *       星 = {
 *             isEnd = 0
 *             红 = {
 *                    isEnd = 0
 *                    旗 = {
 *                           isEnd = 1
 *                    }
 *              }
 *       }
 * }
 *
 * Occurrences of IsEndChar inside a word are ignored because that character is
 * reserved as the flag key. Null or empty entries are skipped.
 *
 * @param wordSet words to add; null is treated as an empty set
 * @return root dictionary of the tree (it carries no IsEndChar entry itself)
 */
private Dictionary<char, object> addSensitiveWordToHashMap(HashSet<String> wordSet)
{
    if (wordSet == null)
    {
        return new Dictionary<char, object>();
    }

    // Pre-size the root with the word count to reduce rehashing.
    Dictionary<char, object> wordMap = new Dictionary<char, object>(wordSet.Count);

    foreach (String word in wordSet)
    {
        if (string.IsNullOrEmpty(word))
        {
            continue;
        }

        Dictionary<char, object> nowMap = wordMap;
        bool addedAny = false;

        foreach (char keyChar in word)
        {
            // The marker is reserved for the end flag and can never be a path key.
            if (keyChar == IsEndChar)
            {
                continue;
            }

            object existing;
            nowMap.TryGetValue(keyChar, out existing);
            Dictionary<char, object> child = existing as Dictionary<char, object>;

            if (child == null)
            {
                // New node: not the end of a word until proven otherwise.
                child = new Dictionary<char, object>();
                child.Add(IsEndChar, "0");
                nowMap[keyChar] = child;
            }

            nowMap = child;
            addedAny = true;
        }

        // Flag the node reached by the last real character. Doing this after the
        // loop (rather than on "i == word.Length - 1") keeps the flag from being
        // lost when the word ends with the marker character.
        if (addedAny)
        {
            nowMap[IsEndChar] = "1";
        }
    }

    return wordMap;
}

/**
 * Reads the sensitive-word file "dic.txt" (one word per line, decoded with
 * ENCODING) into a HashSet.
 *
 * Leading and trailing whitespace is removed from every line and blank lines
 * are skipped, so stray spaces in the file do not produce unmatchable words.
 * Lines are streamed instead of loading the whole file into one string first.
 *
 * @return the distinct, non-empty words found in the file
 * @throws IOException (or another File.ReadLines exception) when the file cannot be read
 */
private HashSet<String> readSensitiveWordFile()
{
    HashSet<String> wordSet = new HashSet<string>();

    foreach (string line in File.ReadLines("dic.txt", Encoding.GetEncoding(ENCODING)))
    {
        string word = line.Trim();
        if (word.Length > 0)
        {
            wordSet.Add(word);
        }
    }

    return wordSet;
}
}

public class SensitivewordFilter
{

// Root of the sensitive-word tree; assigned in the constructor
private Dictionary<char, object> sensitiveWordMap = null;

// matchType value that selects the minimum-match rule (the default of the public methods)
public static int minMatchTYpe = 1;

// matchType value that selects the maximum-match rule
public static int maxMatchType = 2;

// The single shared instance, created on first call to getInstance()
private static SensitivewordFilter inst = null;

/**
 * Private constructor: loads the sensitive-word tree once for this instance.
 */
private SensitivewordFilter()
{
    SensitiveWordInit loader = new SensitiveWordInit();
    this.sensitiveWordMap = loader.initKeyWord();
}

/**
 * Returns the shared filter, creating it on the first call.
 * No locking is performed here.
 *
 * @return the shared SensitivewordFilter instance
 */
public static SensitivewordFilter getInstance()
{
    if (inst != null)
    {
        return inst;
    }

    inst = new SensitivewordFilter();
    return inst;
}

/**
 * Tells whether the text contains at least one sensitive word.
 *
 * @param txt       text to scan; null or empty yields false
 * @param matchType match rule passed through to CheckSensitiveWord
 * @return true as soon as a sensitive word is found at any position, false otherwise
 */
public bool isContaintSensitiveWord(String txt, int matchType = 1)
{
    if (string.IsNullOrEmpty(txt))
    {
        return false;
    }

    for (int i = 0; i < txt.Length; i++)
    {
        // One hit is enough; there is no need to scan the rest of the text.
        if (CheckSensitiveWord(txt, i, matchType) > 0)
        {
            return true;
        }
    }

    return false;
}

/**
 * Collects the distinct sensitive words found in the text.
 *
 * The text is scanned left to right; after a match the scan resumes right
 * behind it, so reported words do not overlap.
 *
 * @param txt       text to scan; null or empty yields an empty set
 * @param matchType match rule passed through to CheckSensitiveWord
 * @return set of matched substrings of txt (never null)
 */
public HashSet<String> getSensitiveWord(String txt, int matchType = 1)
{
    HashSet<String> sensitiveWordList = new HashSet<String>();
    if (string.IsNullOrEmpty(txt))
    {
        return sensitiveWordList;
    }

    int i = 0;
    while (i < txt.Length)
    {
        int length = CheckSensitiveWord(txt, i, matchType);
        if (length > 0)
        {
            sensitiveWordList.Add(txt.Substring(i, length));
            i += length;
        }
        else
        {
            i++;
        }
    }

    return sensitiveWordList;
}

/**
 * Returns a copy of the text in which every sensitive word is masked.
 *
 * Each matched character position is replaced by one copy of replaceChar.
 * The result is assembled in a fresh buffer, so replaceChar may have any
 * length (including empty or null, which removes the word) without the
 * positions of later matches drifting.
 *
 * @param txt         text to filter; null or empty is returned unchanged
 * @param replaceChar string written once per masked character
 * @param matchType   match rule passed through to CheckSensitiveWord
 * @return the filtered text
 */
public String replaceSensitiveWord(String txt, String replaceChar, int matchType = 1)
{
    if (string.IsNullOrEmpty(txt))
    {
        return txt;
    }

    StringBuilder result = new StringBuilder(txt.Length);
    int i = 0;
    while (i < txt.Length)
    {
        int length = CheckSensitiveWord(txt, i, matchType);
        if (length > 0)
        {
            // Mask the whole match and continue right behind it.
            result.Append(getReplaceChars(replaceChar, length));
            i += length;
        }
        else
        {
            result.Append(txt[i]);
            i++;
        }
    }

    return result.ToString();
}

/**
 * Builds the mask string: replaceChar repeated length times.
 *
 * @param replaceChar string to repeat (null contributes nothing)
 * @param length      number of repetitions; zero or less gives an empty string
 * @return the concatenated repetitions
 */
private String getReplaceChars(String replaceChar, int length)
{
    if (length <= 0)
    {
        return string.Empty;
    }

    return string.Concat(Enumerable.Repeat(replaceChar, length));
}

/**
 * Checks whether a sensitive word starts at beginIndex of txt.
 *
 * The word tree is walked one text character at a time. The reserved marker
 * character (IsEndChar) is skipped when it appears inside the text and does
 * not count as a matched character; a match never starts on the marker.
 *
 * Match rules:
 *  - matchType equal to minMatchTYpe: a complete word is accepted at once when
 *    no longer word continues from it. If longer words do continue from it,
 *    the scan looks ahead; the next complete word reached is accepted, and if
 *    none is reached the shorter word is used.
 *  - any other matchType: the longest complete word is used.
 *
 * Words with fewer than two matched characters are never reported.
 *
 * @param txt        text to inspect; null or empty yields 0
 * @param beginIndex index in txt where the word must start; an index at or past the end yields 0
 * @param matchType  match rule, see above
 * @return number of characters of txt covered by the match, counted from
 *         beginIndex (equal to the word length when no marker character is
 *         skipped), or 0 when no sensitive word starts there
 */
public int CheckSensitiveWord(String txt, int beginIndex, int matchType)
{
    if (string.IsNullOrEmpty(txt) || sensitiveWordMap == null || beginIndex >= txt.Length)
    {
        return 0;
    }

    // A match must begin on a real character, otherwise the reported span
    // would start with a skipped marker.
    if (txt[beginIndex] == IsEndChar)
    {
        return 0;
    }

    bool stopEarly = SensitivewordFilter.minMatchTYpe == matchType;
    Dictionary<char, object> nowMap = sensitiveWordMap;

    int matchedChars = 0; // tree characters consumed so far
    int bestChars = 0;    // matchedChars at the last complete word seen
    int bestSpan = 0;     // characters of txt covered by that word

    for (int i = beginIndex; i < txt.Length; i++)
    {
        char word = txt[i];

        if (word == IsEndChar)
        {
            continue;
        }

        object next;
        nowMap.TryGetValue(word, out next);
        Dictionary<char, object> nextMap = next as Dictionary<char, object>;
        if (nextMap == null)
        {
            // No word continues with this character; fall back to the best
            // complete word recorded so far (if any).
            break;
        }

        nowMap = nextMap;
        matchedChars++;

        object endFlag;
        if (!nowMap.TryGetValue(IsEndChar, out endFlag) || !"1".Equals(endFlag as string))
        {
            continue;
        }

        // A complete word ends here: remember it, including the span it covers.
        bool hadShorterWord = bestChars > 0;
        bestChars = matchedChars;
        bestSpan = i - beginIndex + 1;

        // A node holding only the flag entry has no children, so nothing longer can follow.
        bool isLeaf = nowMap.Count == 1;
        if (stopEarly && (isLeaf || hadShorterWord))
        {
            break;
        }
    }

    // Single-character matches are not treated as words.
    return bestChars < 2 ? 0 : bestSpan;
}
}
}
}

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace SensitiveWordFilter
{
// Console demo: filters a sample sentence and prints it before and after.
class Program
{
    static void Main(string[] args)
    {
        var wordFilter = SensitiveWord.SensitivewordFilter.getInstance();

        String original = "$fuckfuck you你麻痹e菜太菜了fuckyou从飞啊 fuck you";
        String masked = wordFilter.replaceSensitiveWord(original, "*");

        Console.WriteLine(string.Concat("替换前的文字为:", original));
        Console.WriteLine(string.Concat("替换后的文字为:", masked));

        // Keep the console window open until a key is pressed.
        Console.ReadKey();
    }
}
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  DFA算法