您的位置:首页 > 其它

朴素贝叶斯---过滤垃圾邮件

2016-06-08 10:20 302 查看
在 bayes.py 中添加以下代码:

# Naive Bayes bag-of-words model
def bagOfWords2VecMN(vocabList, inputSet):
    """Return a bag-of-words count vector for ``inputSet`` over ``vocabList``.

    Args:
        vocabList: Sequence of vocabulary words; position i of the result
            counts occurrences of ``vocabList[i]``.
        inputSet: Iterable of words (duplicates allowed) from one document.
            Words must be hashable.

    Returns:
        A list of ints with ``len(vocabList)`` entries. Words that are not
        in the vocabulary are ignored.
    """
    # Build the word -> position map once so each lookup is O(1) instead of
    # scanning vocabList twice per word (``in`` followed by ``.index``).
    # setdefault keeps the FIRST position of a repeated vocabulary word,
    # which is what list.index() would have returned.
    positionOf = {}
    for position, vocabWord in enumerate(vocabList):
        positionOf.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = positionOf.get(word)
        if position is not None:
            returnVec[position] += 1
    return returnVec

# Parse raw text into tokens
def textParse(bigString):
    """Split ``bigString`` into lowercase word tokens longer than two characters.

    Args:
        bigString: The raw text to tokenize.

    Returns:
        A list of lowercased tokens, in order of appearance, keeping only
        tokens whose length is greater than 2.
    """
    import re
    # Split on runs of one or more non-word characters. The pattern must not
    # be able to match the empty string: r'\W*' splits between every single
    # character on Python 3.7+, which leaves no token longer than 2.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

# Spam-filter test function
def spamTest():
    """Train and evaluate the naive Bayes spam classifier on a random hold-out split.

    Reads ``email/spam/1.txt`` .. ``email/spam/25.txt`` (labelled 1) and
    ``email/ham/1.txt`` .. ``email/ham/25.txt`` (labelled 0), randomly holds
    out 10 of the 50 documents as the test set, trains on the rest, and
    prints every misclassified document followed by the error rate.

    Relies on ``createVocabList``, ``trainNB0``, ``classifyNB``, ``array`` and
    ``random`` being available in the enclosing module.

    Returns:
        None. Results are printed.
    """
    # Load and parse the text; documents alternate spam, ham for each i.
    docList = []
    classList = []
    sources = (('email/spam/%d.txt', 1), ('email/ham/%d.txt', 0))
    for i in range(1, 26):
        for pathPattern, label in sources:
            # Use a context manager so the file handle is closed promptly.
            with open(pathPattern % i) as emailFile:
                wordList = textParse(emailFile.read())
            docList.append(wordList)
            classList.append(label)
    vocabList = createVocabList(docList)  # build the vocabulary list

    # The training set starts as every document index; it must be a real
    # list because entries are deleted from it below.
    trainingSet = list(range(len(docList)))
    testSet = []
    # Randomly move 10 of the 50 emails from the training set to the test set.
    for _ in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]  # remove from the training set

    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        # Build a word-count vector over the vocabulary for each email.
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    # Compute the probabilities needed for classification.
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))

    # Classify the held-out emails and count the mistakes.
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print("classification error %s" % (docList[docIndex],))
    # Convert BEFORE dividing: int / int truncates to 0 under Python 2, which
    # would hide every error rate below 100%.
    errorRate = float(errorCount) / len(testSet)
    print('the error rate is :  %s' % errorRate)


测试:

>>> import bayes
>>> bayes.spamTest()
the error rate is :  0.0
>>>
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: