C++ 简单中文敏感词检测工具类
2016-06-24 23:33
435 查看
具体思路:
1->敏感词库,可从数据库读取,也可以从文件加载.
2->将敏感词转化为gbk编码,因为gbk严格按照英文字符一个字节、汉字两个字节的格式编码,便于切分文字.
3->将所有敏感词以首个字符[英文一字节,汉字两字节]转换为一个整数,然后按照这个整数给所有敏感词建立索引,索引的value用list,因为考虑到同一个整数对应多个关键字.
4->检测一段文字内容时,也先转化为gbk,然后逐个字符[英文一字节,汉字两字节]检测是否有以该字符为首的敏感词.
代码.h
View Code
测试效果:
![](https://images2015.cnblogs.com/blog/627533/201606/627533-20160624234226469-2078216570.jpg)
完整VS2013工程:http://download.csdn.net/detail/tangxin19930330/9558997
1->敏感词库,可从数据库读取,也可以从文件加载.
2->将敏感词转化为gbk编码,因为gbk严格按照英文字符一个字节、汉字两个字节的格式编码,便于切分文字.
3->将所有敏感词以首个字符[英文一字节,汉字两字节]转换为一个整数,然后按照这个整数给所有敏感词建立索引,索引的value用list,因为考虑到同一个整数对应多个关键字.
4->检测一段文字内容时,也先转化为gbk,然后逐个字符[英文一字节,汉字两字节]检测是否有以该字符为首的敏感词.
代码.h
// Implementation of SensitiveWordsChecker: a simple sensitive-word filter.
//
// Approach used by this file:
//   1. The word list is loaded from a separator-delimited file.
//   2. Words are held in GBK, where an ASCII character is one byte and a
//      Chinese character is two bytes, which makes character splitting easy.
//   3. Every word is indexed in mapWords by an integer derived from its first
//      character; each map entry holds the list of words sharing that key.
//   4. Text to check is converted to GBK and scanned character by character;
//      every match is overwritten with '*' (one '*' per GBK byte).
//
// NOTE(review): the header name below is spelled "Senditive" in the original;
// it is kept as-is because the real file name cannot be seen from here.
// NOTE(review): several functions keep multiple enmMaxWordsFileLength-sized
// arrays on the stack at once (LoadWords* -> GetWords -> BuildWordMap ->
// GenTestData/Test); confirm the constant is small enough for the stack.
#include "SenditiveWordsChecker.h"
#include "stdio.h"
#include "string.h"
#include "iconv.h"
#include <stdarg.h>
#include <stdint.h>
#include <stdlib.h>
#include <new>

namespace {

// Decodes the GBK character that starts at text[0].
//   text     - points at the first byte of a character; at least one byte
//              must be readable.
//   avail    - number of readable bytes starting at text (>= 1).
//   char_len - receives the number of bytes the character occupies (1 or 2).
// Returns the lookup key of the character: the byte value for a single-byte
// character (or for a lead byte that has no trail byte left), otherwise
// (lead << 8) | trail. Bytes are read as unsigned so the result does not
// depend on whether plain char is signed.
uint32_t DecodeGbkChar(const char *text, size_t avail, size_t *char_len)
{
    const unsigned char lead = static_cast<unsigned char>(text[0]);
    if (lead < 0x80 || avail < 2) {
        *char_len = 1;
        return lead;
    }
    const unsigned char trail = static_cast<unsigned char>(text[1]);
    *char_len = 2;
    return (static_cast<uint32_t>(lead) << 8) | trail;
}

} // namespace

// Loads the word list from a UTF-8 encoded file, converts it to GBK and
// builds the index. Words are separated by ','. Nothing is loaded when the
// file cannot be read or cannot be converted.
void SensitiveWordsChecker::LoadWordsFromUTF8File(const char *file_name)
{
    char utf8_buf[enmMaxWordsFileLength];
    char gbk_buf[enmMaxWordsFileLength];

    if (LoadFile(utf8_buf, enmMaxWordsFileLength, file_name) != 0) {
        return;
    }

    // Skip a UTF-8 byte order mark if the editor wrote one.
    char *text = utf8_buf;
    if (strncmp(text, "\xEF\xBB\xBF", 3) == 0) {
        text += 3;
    }

    if (UTF8_To_GBK(text, strlen(text), gbk_buf, enmMaxWordsFileLength) != 0) {
        // A partial conversion could end in the middle of a word, so do not
        // load a truncated list.
        fputs("Encoding conversion error\n", stderr);
        return;
    }
    GetWords(gbk_buf, enmMaxWordsFileLength, ',');
}

// Loads the word list from a file that is already GBK encoded and builds the
// index. Words are separated by ','.
void SensitiveWordsChecker::LoadWordsFromGBKFile(const char *file_name)
{
    char gbk_buf[enmMaxWordsFileLength];
    if (LoadFile(gbk_buf, enmMaxWordsFileLength, file_name) != 0) {
        return;
    }
    GetWords(gbk_buf, enmMaxWordsFileLength, ',');
}

// Reads a whole file into buf and NUL-terminates it.
//   buf       - destination buffer.
//   buf_size  - capacity of buf in bytes; the file must be strictly smaller.
//   file_name - path of the file to read (opened in binary mode).
// Returns 0 on success, -1 on any failure. On failure buf holds an empty
// string, so callers that ignore the result never read uninitialised memory.
int32_t SensitiveWordsChecker::LoadFile(char buf[], const uint32_t buf_size, const char *file_name)
{
    if (buf == NULL || buf_size == 0) {
        return -1;
    }
    buf[0] = '\0';
    if (file_name == NULL) {
        return -1;
    }

    FILE *pFile = NULL;
    fopen_s(&pFile, file_name, "rb"); // MSVC secure CRT variant of fopen
    if (pFile == NULL) {
        fputs("File error\n", stderr);
        return -1;
    }

    int32_t ret = -1;

    // Obtain the file size; ftell reports failure with a negative value.
    long file_size = -1;
    if (fseek(pFile, 0, SEEK_END) == 0) {
        file_size = ftell(pFile);
    }

    if (file_size < 0) {
        fputs("File error\n", stderr);
    } else if (static_cast<unsigned long>(file_size) >= buf_size) {
        fputs("file too large\n", stderr);
    } else {
        rewind(pFile);
        const size_t wanted = static_cast<size_t>(file_size);
        const size_t got = fread(buf, 1, wanted, pFile);
        if (got != wanted) {
            fputs("Reading error\n", stderr);
            buf[0] = '\0';
        } else {
            buf[wanted] = '\0';
            ret = 0;
        }
    }

    // The stream is closed on every path, not only on success.
    if (fclose(pFile) != 0 && ret == 0) {
        ret = -1;
    }
    return ret;
}

// Converts inbuf from from_charset to to_charset using iconv.
//   inbuf/inlen   - source bytes and their count.
//   outbuf/outlen - destination buffer and its capacity; one byte is always
//                   kept for the terminating NUL.
// Returns 0 on success, -1 on failure. outbuf is NUL-terminated in both
// cases; after a failed conversion it holds whatever was converted so far.
int32_t SensitiveWordsChecker::CodeConvert(char *from_charset, char *to_charset, char *inbuf, size_t inlen, char *outbuf, size_t outlen)
{
    if (outbuf == NULL || outlen == 0) {
        return -1;
    }
    memset(outbuf, 0, outlen);
    if (from_charset == NULL || to_charset == NULL || inbuf == NULL) {
        return -1;
    }

    // iconv_open signals failure with (iconv_t)-1, not with 0.
    const iconv_t cd = iconv_open(to_charset, from_charset);
    if (cd == reinterpret_cast<iconv_t>(-1)) {
        return -1;
    }

    char *in_ptr = inbuf;
    char *out_ptr = outbuf;
    size_t in_left = inlen;
    size_t out_left = outlen - 1; // reserve room for the terminator
    const size_t rc = iconv(cd, &in_ptr, &in_left, &out_ptr, &out_left);

    // Release the descriptor before looking at the result so it cannot leak.
    iconv_close(cd);

    *out_ptr = '\0';
    return (rc == static_cast<size_t>(-1)) ? -1 : 0;
}

// Converts UTF-8 text to GBK; see CodeConvert for the contract.
int32_t SensitiveWordsChecker::UTF8_To_GBK(char *inbuf, size_t inlen, char *outbuf, size_t outlen)
{
    // CodeConvert takes non-const char*, so the names live in writable arrays
    // instead of being passed as string literals.
    char from_charset[] = "utf-8";
    char to_charset[] = "gbk";
    return CodeConvert(from_charset, to_charset, inbuf, inlen, outbuf, outlen);
}

// Converts GBK text to UTF-8; see CodeConvert for the contract.
int32_t SensitiveWordsChecker::GBK_To_UTF8(char *inbuf, size_t inlen, char *outbuf, size_t outlen)
{
    char from_charset[] = "gbk";
    char to_charset[] = "utf-8";
    return CodeConvert(from_charset, to_charset, inbuf, inlen, outbuf, outlen);
}

// Counts how many times separator occurs in the NUL-terminated string buf.
// Scanning stops at the terminator or after buf_size bytes, whichever comes
// first. Returns 0 for a null buffer.
uint32_t SensitiveWordsChecker::GetWordsCount(char buf[], const uint32_t buf_size, char separator)
{
    if (buf == NULL) {
        return 0;
    }
    uint32_t count = 0;
    for (uint32_t i = 0; i < buf_size && buf[i] != '\0'; ++i) {
        if (buf[i] == separator) {
            ++count;
        }
    }
    return count;
}

// Writes buf_size bytes of buf to file_name, replacing any existing file.
// Returns 0 on success, -1 on failure.
int32_t SensitiveWordsChecker::WriteToFile(const char buf[], const int32_t buf_size, const char *file_name)
{
    if (buf == NULL || buf_size < 0 || file_name == NULL) {
        return -1;
    }

    FILE *pFile = NULL;
    fopen_s(&pFile, file_name, "wb"); // MSVC secure CRT variant of fopen
    if (pFile == NULL) {
        fputs("File error\n", stderr);
        return -1;
    }

    int32_t ret = 0;
    const size_t wanted = static_cast<size_t>(buf_size);
    if (fwrite(buf, 1, wanted, pFile) != wanted) {
        fputs("Writing error\n", stderr);
        ret = -1;
    }

    // Close on every path; a failed close can mean lost buffered data.
    if (fclose(pFile) != 0) {
        ret = -1;
    }
    return ret;
}

// Splits a GBK word list into arrSensitiveWord and builds the lookup index.
//   gbk_buf   - NUL-terminated list of words.
//   buf_size  - capacity of gbk_buf (not needed; the text is NUL-terminated).
//   separator - character that separates the words.
// Line breaks are removed before splitting. Empty words are skipped, as are
// words that do not fit into SensitiveWord::szWord. A final word that is not
// followed by a separator is loaded as well.
// Returns 0 on success, -1 on a null buffer or allocation failure.
int32_t SensitiveWordsChecker::GetWords(char gbk_buf[], const uint32_t buf_size, char separator)
{
    (void)buf_size;
    if (gbk_buf == NULL) {
        return -1;
    }

    // Drop line breaks; '\r' is removed too so CRLF files do not leave a
    // carriage return at the start of a word.
    char buf[enmMaxWordsFileLength];
    StrcpyExcludeChar(buf, enmMaxWordsFileLength, gbk_buf, "\r\n");

    const uint32_t nWordsCount = GetWordsCount(buf, enmMaxWordsFileLength, separator);
    printf("words_count=%u\n", nWordsCount);

    // One extra slot for a last word that has no trailing separator.
    // NOTE(review): a previously loaded array is not freed here because this
    // file does not show how arrSensitiveWord is initialised or destroyed.
    SensitiveWord *words = new (std::nothrow) SensitiveWord[nWordsCount + 1];
    if (words == NULL) {
        return -1;
    }
    arrSensitiveWord = words;
    nSensitiveWordCnt = 0;

    // szWord is assumed to be a fixed-size char array member (the original
    // code copied into it directly) - TODO confirm against the header.
    const size_t max_word_len = sizeof(words[0].szWord) - 1;

    const char *start = buf;
    for (;;) {
        const char *end = strchr(start, separator);
        const size_t len = (end != NULL) ? static_cast<size_t>(end - start) : strlen(start);

        if (len > max_word_len) {
            fputs("word too long, skipped\n", stderr);
        } else if (len > 0) {
            char *dst = arrSensitiveWord[nSensitiveWordCnt].szWord;
            memcpy(dst, start, len);
            dst[len] = '\0';
            ++nSensitiveWordCnt;
        }

        // Stop at the end of the text (also covers separator == '\0').
        if (end == NULL || *end == '\0') {
            break;
        }
        start = end + 1;
    }

    BuildWordMap();
    return 0;
}

// Copies src to dst, leaving out every character listed in exclude_list.
//   dst/dst_len  - destination buffer and its capacity; the result is always
//                  NUL-terminated and truncated if it does not fit.
//   src          - NUL-terminated source string.
//   exclude_list - NUL-terminated set of characters to drop; may be NULL.
// Returns dst, or NULL when dst or src is NULL or dst_len is 0.
char * SensitiveWordsChecker::StrcpyExcludeChar(char *dst, const uint32_t dst_len, const char *src, const char *exclude_list)
{
    if (dst == NULL || src == NULL || dst_len == 0) {
        return NULL;
    }
    if (dst == src) {
        return dst;
    }

    uint32_t written = 0;
    // "written + 1 < dst_len" keeps one byte free for the terminator.
    for (uint32_t i = 0; src[i] != '\0' && written + 1 < dst_len; ++i) {
        const bool excluded = (exclude_list != NULL) && (strchr(exclude_list, src[i]) != NULL);
        if (!excluded) {
            dst[written++] = src[i];
        }
    }
    dst[written] = '\0';
    return dst;
}

// Returns the lookup key of the first character of a GBK string: the byte
// value for a one-byte character, (lead << 8) | trail for a two-byte
// character, and 0 for an empty or null string.
uint32_t SensitiveWordsChecker::GetFirstCharFromGBK(char gbk_buf[])
{
    if (gbk_buf == NULL) {
        return 0;
    }
    const size_t len = strlen(gbk_buf);
    if (len == 0) {
        return 0;
    }
    size_t char_len = 0;
    return DecodeGbkChar(gbk_buf, len, &char_len);
}

// Returns the bytes of the first UTF-8 sequence of utf8_buf packed into one
// integer, most significant byte first (a three-byte Chinese character gives
// b0 << 16 | b1 << 8 | b2). Returns 0 for an empty or null string. A
// sequence that is cut short by the end of the string is packed as far as it
// goes.
uint32_t SensitiveWordsChecker::GetFirstCharFromTUF8(char utf8_buf[])
{
    if (utf8_buf == NULL) {
        return 0;
    }
    const size_t len = strlen(utf8_buf);
    if (len == 0) {
        return 0;
    }

    // The lead byte tells how many bytes the sequence occupies.
    const unsigned char lead = static_cast<unsigned char>(utf8_buf[0]);
    size_t seq_len = 1;
    if (lead >= 0xF0) {
        seq_len = 4;
    } else if (lead >= 0xE0) {
        seq_len = 3;
    } else if (lead >= 0xC0) {
        seq_len = 2;
    }
    if (seq_len > len) {
        seq_len = len;
    }

    uint32_t code = 0;
    for (size_t k = 0; k < seq_len; ++k) {
        code = (code << 8) | static_cast<unsigned char>(utf8_buf[k]);
    }
    return code;
}

// Returns the first byte of buf as an unsigned value, or 0 for an empty or
// null string.
uint32_t SensitiveWordsChecker::GetFirstChar(char buf[])
{
    if (buf == NULL || buf[0] == '\0') {
        return 0;
    }
    return static_cast<unsigned char>(buf[0]);
}

// Rebuilds mapWords from arrSensitiveWord: every word is appended to the
// list stored under the key of its first GBK character. Afterwards the word
// count is printed and the built-in demo (GenTestData + Test) is run.
void SensitiveWordsChecker::BuildWordMap()
{
    // Start from an empty index so a rebuild leaves no stale or duplicate
    // entries; the lists were allocated by this function.
    for (WordMap::iterator it = mapWords.begin(); it != mapWords.end(); ++it) {
        delete it->second;
    }
    mapWords.clear();

    for (uint32_t i = 0; i < nSensitiveWordCnt; ++i) {
        const uint32_t code = GetFirstCharFromGBK(arrSensitiveWord[i].szWord);

        WordList *wordList = NULL;
        WordMap::iterator it = mapWords.find(code);
        if (it == mapWords.end()) {
            wordList = new WordList();
            mapWords[code] = wordList;
        } else {
            wordList = it->second;
        }
        wordList->push_back(&arrSensitiveWord[i]);
    }

    DumpWordMap();
    // NOTE(review): the demo below runs on every load and needs "poem.txt" in
    // the working directory; consider moving it out of the loading path.
    GenTestData();
    Test();
}

// Prints the total number of words held in the index.
void SensitiveWordsChecker::DumpWordMap()
{
    uint32_t word_cnt = 0;
    for (WordMap::const_iterator it = mapWords.begin(); it != mapWords.end(); ++it) {
        word_cnt += static_cast<uint32_t>(it->second->size());
    }
    printf("word_cnt = %u\n", word_cnt);
}

// Masks every sensitive word found in in_utf8_buf.
//   out_utf8_buf - receives the masked UTF-8 text; must provide at least
//                  enmMaxContentLength bytes.
//   in_utf8_buf  - NUL-terminated UTF-8 text to check.
// The text is converted to GBK and scanned character by character; each
// match is replaced by one '*' per GBK byte of the word, then the result is
// converted back to UTF-8.
// Returns 0 on success, -1 on a null argument or when a conversion failed
// (the output then holds the part that could be processed).
int32_t SensitiveWordsChecker::CheckSensitiveWord(char out_utf8_buf[], char in_utf8_buf[])
{
    if (out_utf8_buf == NULL || in_utf8_buf == NULL) {
        return -1;
    }

    int32_t status = 0;

    // Convert the text to GBK first.
    char gbk_buf[enmMaxContentLength];
    char out_gbk_buf[enmMaxContentLength];
    if (UTF8_To_GBK(in_utf8_buf, strlen(in_utf8_buf), gbk_buf, enmMaxContentLength) != 0) {
        status = -1;
    }

    // Masking never changes the length, so the output starts as a copy of
    // the input and matches are overwritten in place.
    const size_t gbk_len = strlen(gbk_buf);
    memcpy(out_gbk_buf, gbk_buf, gbk_len + 1);

    // Take each GBK character and look up the words starting with it.
    size_t i = 0;
    while (i < gbk_len) {
        size_t char_len = 1;
        const uint32_t code = DecodeGbkChar(&gbk_buf[i], gbk_len - i, &char_len);

        size_t step = char_len;
        const SensitiveWord *sensitiveWord = FindSensitiveWord(code, &gbk_buf[i]);
        if (sensitiveWord != NULL) {
            const size_t word_len = strlen(sensitiveWord->szWord);
            // The length checks keep the scan advancing and inside the text.
            if (word_len > 0 && word_len <= gbk_len - i) {
                memset(&out_gbk_buf[i], '*', word_len);
                step = word_len;
            }
        }
        i += step;
    }

    if (GBK_To_UTF8(out_gbk_buf, gbk_len, out_utf8_buf, enmMaxContentLength) != 0) {
        status = -1;
    }
    return status;
}

// Looks for a sensitive word that starts at pos.
//   code - lookup key of the character at pos.
//   pos  - NUL-terminated GBK text positioned at that character.
// Returns the first word in the matching list that is a prefix of pos, or
// NULL when there is none.
const SensitiveWord* SensitiveWordsChecker::FindSensitiveWord(uint32_t code, const char *pos)
{
    if (pos == NULL) {
        return NULL;
    }
    WordMap::const_iterator it = mapWords.find(code);
    if (it == mapWords.end() || it->second == NULL) {
        return NULL;
    }

    const WordList *wordList = it->second;
    for (size_t i = 0; i < wordList->size(); ++i) {
        const SensitiveWord *sensitiveWord = (*wordList)[i];
        const size_t word_len = strlen(sensitiveWord->szWord);
        // strncmp stops at the end of the text, so a word longer than the
        // remaining text cannot cause a read past the terminator. Empty words
        // are ignored because they would match everywhere.
        if (word_len > 0 && strncmp(sensitiveWord->szWord, pos, word_len) == 0) {
            return sensitiveWord;
        }
    }
    return NULL;
}

// Builds demo input: reads GBK text from "poem.txt", inserts a randomly
// chosen sensitive word in front of ASCII characters whose byte offset is a
// multiple of 4, and writes the result as UTF-8 to "test_data.txt".
void SensitiveWordsChecker::GenTestData()
{
    // rand() % 0 is undefined, so there is nothing to do without words.
    if (nSensitiveWordCnt == 0) {
        return;
    }

    char in_gbk_buf[enmMaxWordsFileLength];
    char out_gbk_buf[enmMaxWordsFileLength];
    if (LoadFile(in_gbk_buf, enmMaxWordsFileLength, "poem.txt") != 0) {
        return;
    }

    const size_t capacity = enmMaxWordsFileLength;
    const size_t len = strlen(in_gbk_buf);
    size_t n = 0;
    size_t i = 0;
    while (i < len) {
        // Walk whole characters so a word is never inserted between the two
        // bytes of a Chinese character.
        size_t char_len = 1;
        DecodeGbkChar(&in_gbk_buf[i], len - i, &char_len);
        const bool is_ascii = static_cast<unsigned char>(in_gbk_buf[i]) < 0x80;

        if (i % 4 == 0 && is_ascii) {
            const int32_t nRandIndex = rand() % nSensitiveWordCnt;
            const SensitiveWord &sensitiveWord = arrSensitiveWord[nRandIndex];
            const size_t word_len = strlen(sensitiveWord.szWord);
            // Insert the word only if it fits completely.
            if (n + word_len + char_len < capacity) {
                memcpy(&out_gbk_buf[n], sensitiveWord.szWord, word_len);
                n += word_len;
            }
        }

        // Keep one byte free for the terminator.
        if (n + char_len >= capacity) {
            break;
        }
        memcpy(&out_gbk_buf[n], &in_gbk_buf[i], char_len);
        n += char_len;
        i += char_len;
    }
    out_gbk_buf[n] = '\0';

    char out_utf8_buf[enmMaxWordsFileLength];
    GBK_To_UTF8(out_gbk_buf, n, out_utf8_buf, enmMaxWordsFileLength);
    WriteToFile(out_utf8_buf, static_cast<int32_t>(strlen(out_utf8_buf)), "test_data.txt");
}

// Runs the demo: reads "test_data.txt" (UTF-8) line by line, prints each line
// and its masked form to the console in GBK, and writes the masked lines to
// "test_data_ret.txt". Only lines terminated by '\n' are processed.
void SensitiveWordsChecker::Test()
{
    const int32_t max_line_len = 1024;
    char utf8_buf[enmMaxWordsFileLength];
    char out_utf8_buf[enmMaxWordsFileLength];
    out_utf8_buf[0] = '\0';

    if (LoadFile(utf8_buf, enmMaxWordsFileLength, "test_data.txt") != 0) {
        return;
    }

    const char *line_start = utf8_buf;
    const char *line_end = NULL;
    uint32_t offset = 0;
    while ((line_end = strchr(line_start, '\n')) != NULL) {
        const size_t line_len = static_cast<size_t>(line_end - line_start);
        const char *current = line_start;
        line_start = line_end + 1;

        // Cutting a long line could split a multi-byte character, so lines
        // that do not fit are skipped instead of truncated.
        if (line_len >= static_cast<size_t>(max_line_len)) {
            fputs("line too long, skipped\n", stderr);
            continue;
        }

        char in_utf8_line[max_line_len] = { 0 };
        char gbk_line[max_line_len] = { 0 };
        // CheckSensitiveWord writes up to enmMaxContentLength bytes.
        char out_utf8_line[enmMaxContentLength] = { 0 };

        memcpy(in_utf8_line, current, line_len);
        UTF8_To_GBK(in_utf8_line, strlen(in_utf8_line), gbk_line, max_line_len);
        printf("%s\n", gbk_line);

        CheckSensitiveWord(out_utf8_line, in_utf8_line);

        char gbk[enmMaxContentLength];
        UTF8_To_GBK(out_utf8_line, strlen(out_utf8_line), gbk, enmMaxContentLength);
        printf("%s\n", gbk);

        // NOTE(review): lines are appended without a line break, as in the
        // original; confirm whether the result file should keep them.
        StrAppend(out_utf8_buf, enmMaxWordsFileLength, offset, "%s", out_utf8_line);
    }
    WriteToFile(out_utf8_buf, static_cast<int32_t>(offset), "test_data_ret.txt");
}

// Appends printf-style formatted text to buf at position offset.
//   buf/bufLen - destination buffer and its capacity.
//   offset     - current write position; advanced by the number of characters
//                written. Left unchanged when nothing could be written.
//   fmt, ...   - printf-style format string and arguments.
void SensitiveWordsChecker::StrAppend(char buf[], const uint32_t bufLen, uint32_t &offset, const char *fmt, ...)
{
    va_list argptr;
    va_start(argptr, fmt);
    if (buf != NULL && fmt != NULL && offset < bufLen) {
        // vsprintf_s reports failure with a negative value, which must not be
        // added to the unsigned offset.
        const int written = vsprintf_s(buf + offset, bufLen - offset, fmt, argptr);
        if (written > 0) {
            offset += static_cast<uint32_t>(written);
        }
    }
    va_end(argptr);
}
View Code
测试效果:
![](https://images2015.cnblogs.com/blog/627533/201606/627533-20160624234226469-2078216570.jpg)
完整VS2013工程:http://download.csdn.net/detail/tangxin19930330/9558997
相关文章推荐
- C++11中的std::function
- 剑指offer 面试题2 Singleton模式 C++实现
- ubuntu下的c/c++环境搭建
- c++知识点2
- c++学习知识点
- c语言学习笔记36
- C++ Primer Plus 第六版_编程练习(1)(Chapter_two 1-7)
- RAII惯用法:C++资源管理的利器
- C/C++ 多线程编程方法
- 268. Missing Number
- 提高项目9.1-歌手大奖赛计分系列2
- 提高项目9-歌手大奖赛计分系列1
- 提高项目8-k次方之和
- C++设计模式系列之二结构型模式
- 提高项目7-太乐
- GeekBand c++學習筆記之八(泛型算法)
- C++引用与指针/值传递的比较
- OPTICS算法的C语言实现
- C++类构造函数初始化列表
- C++拷贝构造函数(深拷贝,浅拷贝)