您的位置:首页 > 编程语言 > Python开发

使用python开发敏感词检测过滤系统

2014-10-07 00:52 363 查看
这里使用 Python 的 bottle 框架来做一个简易的敏感词过滤系统,算法采用成熟的 DFA 关键词匹配算法。本系统只提供一套基于 HTTP 的 API,可以提供给各个应用使用。这里我只实现了最核心的业务,其他部分以后再完善。

检索算法是网上找的DFA算法的python实现版本

smallgfw.py

#encoding=utf-8
#DFA based text filter
#version=0.3
class GFW(object):
    """DFA (trie) based keyword filter.

    Keywords are stored in ``self.d`` as a trie of nested dicts keyed by
    lower-cased characters. A ``chr(11)`` key inside a node marks the end of
    a keyword. Matching is therefore case-insensitive, and scanning stops at
    the first (shortest) keyword found from a given start position.
    """

    # Sentinel key meaning "a keyword ends at this node" (vertical tab).
    _END = chr(11)

    def __init__(self):
        # Root of the keyword trie.
        self.d = {}

    def set(self, keywords):
        """Add every word in *keywords* to the trie.

        Calling this repeatedly accumulates keywords. Empty words are
        skipped: they would place the end marker on the root node, where it
        can only collide with a literal ``chr(11)`` in scanned text.

        :param keywords: iterable of keyword strings.
        """
        for word in keywords:
            if not word:
                continue
            node = self.d
            for char in word:
                node = node.setdefault(char.lower(), {})
            # setdefault keeps an existing subtree if the marker is already
            # present (e.g. the same word added twice).
            node.setdefault(self._END, '')

    def _scan(self, text):
        """Yield ``(start, length)`` for each non-overlapping match in *text*.

        After a match, scanning resumes right behind it. After a failed
        partial match, scanning restarts one character after the previous
        start position, so no candidate start is skipped.
        """
        end_marker = self._END
        root = self.d
        node = root
        start = 0
        length = 0
        total = len(text)
        while start + length < total:
            char = text[start + length].lower()
            if char not in node:
                # Dead end: drop the partial match and try the next start.
                start += 1
                length = 0
                node = root
                continue
            node = node[char]
            length += 1
            if end_marker in node:
                yield start, length
                start += length
                length = 0
                node = root

    def replace(self, text, mask):
        """Return *text* with every matched keyword replaced by *mask*.

        :param text: string to filter.
        :param mask: replacement inserted once per matched keyword.
        :returns: the filtered string.

        >>> gfw = GFW()
        >>> gfw.set(["sexy","girl","love","shit"])
        >>> s = gfw.replace("Shit!,Cherry is a sexy girl. She loves python.","*")
        >>> print(s)
        *!,Cherry is a * *. She *s python.
        """
        pieces = []
        tail = 0  # index just past the previous match
        for start, length in self._scan(text):
            pieces.append(text[tail:start])
            pieces.append(mask)
            tail = start + length
        pieces.append(text[tail:])
        return "".join(pieces)

    def check(self, text):
        """Return all keyword matches found in *text*.

        :param text: string to scan.
        :returns: list of ``(start, length, matched_text)`` tuples in the
            order the matches occur.

        >>> gfw = GFW()
        >>> gfw.set(["abd","defz","bcz"])
        >>> gfw.check("xabdabczabdxaadefz")
        [(1, 3, 'abd'), (5, 3, 'bcz'), (8, 3, 'abd'), (14, 4, 'defz')]
        """
        return [
            (start, length, text[start:start + length])
            for start, length in self._scan(text)
        ]

if __name__ == "__main__":
    # Run the doctests embedded in this module's docstrings.
    import doctest
    import sys

    this_module = sys.modules[__name__]
    doctest.testmod(this_module)


然后编写bottle框架的api文件代码,localbottle.py

#-*- coding:utf-8 -*-
#localhost testing
#caroltc 2014/10/7
from bottle import route, run, request
from smallgfw import *
import json
import sys

def initWords():
    """Load the keyword list from ``words.txt`` in the working directory.

    The file holds one keyword per line. Only the line terminator
    (``\\n`` or ``\\r\\n``) is removed, so a final line without a trailing
    newline keeps its last character. Blank lines are skipped.

    NOTE(review): lines are returned exactly as read (no decoding here),
    while the request handlers match against UTF-8 encoded text - confirm
    the encoding of words.txt.

    :returns: list of keyword strings in file order.
    """
    path = 'words.txt'
    word_list = []
    # The context manager closes the file even if reading raises.
    with open(path, 'r') as fp:
        for line in fp:
            word = line.rstrip('\r\n')
            if word:
                word_list.append(word)
    return word_list

@route('/replace', method="POST")
def replace():
    """POST /replace: return the submitted text with keywords masked.

    Reads the ``words`` request parameter (empty string when missing),
    loads the keyword list via ``initWords()`` on every request, and
    returns the UTF-8 encoded text with each matched keyword replaced
    by ``**``.
    """
    # Python 2-only idiom that switches the interpreter-wide default string
    # encoding to UTF-8.
    # NOTE(review): nothing below appears to rely on implicit str/unicode
    # conversion - confirm and consider removing.
    reload(sys)
    sys.setdefaultencoding('utf8')
    getwords = request.params.words or ""
    gfw = GFW()
    gfw.set(initWords())  # load the sensitive-word list
    # The filter is run once; the earlier extra check() pass whose result
    # was never used has been dropped.
    return gfw.replace(getwords.encode('utf8'), "**")

@route('/check',method="POST")
def check():
    """POST /check: report the keywords found in the submitted text as JSON.

    The response is a JSON object with ``count`` (number of matches) and
    ``datas`` (the match list returned by ``GFW.check``). Matching runs on
    the UTF-8 encoded form of the ``words`` parameter, so the reported
    positions refer to that encoded text.
    """
    # Python 2-only idiom that switches the interpreter-wide default string
    # encoding to UTF-8.
    reload(sys)
    sys.setdefaultencoding('utf8')
    submitted = request.params.words or ""
    word_filter = GFW()
    word_filter.set(initWords())  # load the sensitive-word list
    hits = word_filter.check(submitted.encode('utf8'))
    return json.dumps({'count': len(hits), 'datas': hits})

@route('/test')
def test():
    """GET /test: serve a minimal HTML form that posts ``words`` to /replace."""
    # Python 2-only idiom that switches the interpreter-wide default string
    # encoding to UTF-8; kept so this handler's side effects are unchanged.
    reload(sys)
    sys.setdefaultencoding('utf8')
    # The closing tag was misspelled "</from>", leaving the form unclosed.
    webdata = '<h1>check</h1><form action="/replace" method="post"><input type="text" name="words" /><input type="submit"></form>'
    return webdata

# Start the bottle server on localhost:80 with debug mode enabled.
# NOTE(review): there is no __main__ guard, so this executes whenever the
# module is loaded; binding port 80 may require elevated privileges - confirm.
run(host='localhost', port=80, debug=True)


敏感词文件为words.txt,每行一个词就可以了,该文件采用gb2312编码,程序均为utf8编码

测试一下api,均为POST请求

过滤敏感词API,直接返回过滤后的数据



检测敏感词API,返回json格式数据



用bottle来开发这样的小工具相当快,而且敏感词检测系统在很多应用场景都需要,独立出来写成接口可以提高效率,并且易于维护,国内目前第三方敏感词检测服务还不多,天朝的需求又很旺盛,可以试试搞个在线敏感词检测服务平台。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: