
Classifying Review Scores with Naive Bayes in Python

2016-08-14 11:00
Naive Bayes can be used to classify documents, for example to filter spam. The case below applies it to review scores: after training, the classifier judges what score, between 0 and 5, a sentence should receive.
First, a crawler collects the sample data, which comes from reviews of open online courses.


# coding=utf-8

import urllib2
from sgmllib import SGMLParser
import jieba


class CommentParser(SGMLParser):
    def __init__(self):
        SGMLParser.__init__(self)
        self.__start_table = False
        self.__start_p = False
        self.__value_p = ''
        self.__value_div = ''
        self.__p_state = 0
        self.data = []

    def start_table(self, attr):
        # Reviews live in a table with class "table table-hover".
        for k, v in attr:
            if k == 'class' and v == 'table table-hover':
                self.__start_table = True

    def end_table(self):
        if self.__start_table:
            self.data.append([self.__value_p, self.__value_div])
            self.__value_p = ''
            self.__value_div = ''
            self.__p_state = 0
            self.__start_table = False

    def start_div(self, attr):
        # The score is carried in a div's data-score attribute.
        if self.__start_table:
            for k, v in attr:
                if k == 'data-score':
                    self.__value_div = v

    def end_div(self):
        pass

    def start_p(self, attrs):
        if self.__start_table:
            self.__p_state += 1
            self.__start_p = True

    def end_p(self):
        if self.__start_table:
            self.__start_p = False

    def handle_data(self, data):
        # The review text sits in the third <p> inside the table.
        if self.__start_table and self.__start_p and self.__p_state == 3:
            self.__value_p += data


def get_page(url):
    page = urllib2.urlopen(url).read()
    parser = CommentParser()
    parser.feed(page)
    return parser.data


def download():
    url = 'http://coursegraph.com/reviews/'
    for i in range(1, 9):
        value = get_page(url + str(i))
        with open('result.txt', 'a+') as f:
            for row in value:
                # One line per review: "[score]comment text"
                f.write('[' + row[1] + ']' + row[0].strip().replace('\n', '').replace('\r', '') + '\n')


def jieba_chn():
    all_value = open('result.txt', 'r+').readlines()
    with open('result1.txt', 'w+') as f:
        for row in all_value:
            # The score sits between the brackets, e.g. "[4.5]" -> "4.5".
            value = row[1:4]
            jb = jieba.cut_for_search(row[5:])
            for word in jb:
                # Dropping single-character tokens also removes punctuation.
                if len(word) > 1:
                    value += ',' + word
            f.write(value.encode('utf-8') + '\n')


# Download the data
# download()
# Segment the downloaded documents into words
jieba_chn()


This is a very simple web crawler; **jieba** segmentation then splits each document into words, and punctuation and similar noise is stripped out. The full results can be inspected by downloading the case files.
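For reference, here is a minimal sketch of what `jieba.cut_for_search` produces and how the length filter in `jieba_chn()` discards punctuation; the sample sentence is made up:

# coding=utf-8
import jieba

# A made-up review sentence, for illustration only.
sentence = u'这门课程非常不错,老师讲得很好!'
# cut_for_search produces a fine-grained segmentation suited to indexing.
tokens = jieba.cut_for_search(sentence)
# Keeping only tokens longer than one character, as jieba_chn() does,
# also drops punctuation and most single-character function words.
words = [w for w in tokens if len(w) > 1]
print(','.join(words).encode('utf-8'))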
Next comes the actual classification with Naive Bayes.
First, read in the document data:


def load_data_set():
    dataSet = []
    labels = []
    with open('result1.txt', 'r+') as f:
        for row in f.readlines():
            t = row.strip().replace('\n', '').split(',')
            labels.append(round(float(t[0]), 1))
            dataSet.append(t[1:])
    return dataSet, labels
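Assuming result1.txt contains lines of the form score,word,word,... (for example 4.5,课程,不错 — a made-up line), the function returns the word lists and the scores side by side:

dataSet, labels = load_data_set()
print(dataSet[0])  # a word list, e.g. ['课程', '不错']
print(labels[0])   # the matching score, e.g. 4.5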


Create the vocabulary list and its matching labels:


def create_vocab_list(dataSet, labels):
    vocabSet = []
    labelSet = []
    for index, document in enumerate(dataSet):
        vocabSet.extend(list(set(document)))
        labelSet.extend([labels[index] for i in range(len(set(document)))])
    return vocabSet, labelSet
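Note that vocabSet is not deduplicated across documents: each document contributes its own set of words, and labelSet repeats that document's score once per word so the two lists stay aligned. A sketch with made-up data:

docs = [[u'好', u'课程'], [u'好', u'一般']]
scores = [4.5, 2.0]
vocab, vlabels = create_vocab_list(docs, scores)
print(vocab)    # e.g. ['好', '课程', '好', '一般'] (order inside a set() may vary)
print(vlabels)  # [4.5, 4.5, 2.0, 2.0]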


Turn a document into a vector, writing its label into each slot where one of its words occurs:

def set_of_words2_vec(vocabList, label, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        for index, r in enumerate(vocabList):
            if r == word:
                returnVec[index] += label
    return returnVec
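Unlike the usual 0/1 set-of-words vector, this variant writes the document's score into every matching slot and accumulates it when a word repeats. A quick sketch with made-up values:

vocab = [u'好', u'课程', u'一般']
# A 4.5-point document containing '好' and '课程':
print(set_of_words2_vec(vocab, 4.5, [u'好', u'课程']))   # [4.5, 4.5, 0]
# A 2.0-point document mentioning '一般' twice:
print(set_of_words2_vec(vocab, 2.0, [u'一般', u'一般']))  # [0, 0, 4.0]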


Train on the data, computing the probability distribution of words for each score class:


from numpy import *  # provides ones(), log(), array() and random used below


def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    labelSet = list(set(trainCategory))
    # Prior probability of each score class.
    pAbusive = {}
    for r in labelSet:
        pAbusive[str(r)] = len([row for row in trainCategory if row == r]) \
            / float(numTrainDocs)
    # Laplace smoothing: start counts at 1 and denominators at 2.0.
    pNumber = {}
    pDenom = {}
    for row in labelSet:
        pNumber[str(row)] = ones(numWords)
        pDenom[str(row)] = 2.0
    for i in range(numTrainDocs):
        # The vectors were weighted by the label, so divide it back out
        # to recover plain word counts.
        pNumber[str(trainCategory[i])] += [row / trainCategory[i] for row in trainMatrix[i]]
        pDenom[str(trainCategory[i])] += sum(trainMatrix[i]) / trainCategory[i]

    ret = {}
    for i in range(len(labelSet)):
        # Take the log so the per-word terms can be summed in classifyNB.
        ret[str(labelSet[i])] = log(pNumber[str(labelSet[i])] / pDenom[str(labelSet[i])])

    return ret, pAbusive
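The ones(numWords) / 2.0 initialization is Laplace smoothing: a word never seen in a class still gets probability (0 + 1) / (N + 2) instead of zero, which would otherwise wipe out the class's whole score. A toy run on made-up vectors:

# Two label-weighted document vectors (made-up numbers):
trainMatrix = [[4.5, 0.0, 4.5], [0.0, 2.0, 2.0]]
trainCategory = [4.5, 2.0]
pV, pAb = trainNB0(trainMatrix, trainCategory)
print(pAb)        # class priors, 0.5 each here
print(pV['4.5'])  # per-word log-probability vector for the 4.5 class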


To classify, compute the probability of the test words under each class and pick the class with the highest probability; that class is the predicted score.


def classifyNB(vec2Classify, pVec, pClass, trainCategory):
    labelSet = list(set(trainCategory))
    p = {}
    for row in labelSet:
        # log-likelihood of the words plus the log prior of the class
        p[str(row)] = sum(vec2Classify * pVec[str(row)]) + log(pClass[str(row)])
    # The class with the highest posterior wins.
    m = sorted(p.items(), key=lambda k: k[1], reverse=True)
    return float(m[0][0])
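Written out, this is the standard Naive Bayes decision rule: for each class c, score(c) = Σ_j x_j · log p(w_j|c) + log p(c), and the highest-scoring class becomes the prediction. Continuing the made-up example from trainNB0 above:

testVec = array([1, 0, 1])  # first and third vocabulary words present
print(classifyNB(testVec, pV, pAb, trainCategory))  # prints 4.5 for this toy data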


The following code tests the classifier:


def testingNB():
    dataSet, labels = load_data_set()
    vocabSet, labelSet = create_vocab_list(dataSet, labels)
    trainMatrix = []
    for index, row in enumerate(dataSet):
        trainMatrix.append(set_of_words2_vec(vocabSet, labels[index], row))
    pV, pAb = trainNB0(trainMatrix, labels)
    testEntry = ['学习', '很棒', '真不错']
    testEntry = list(set(testEntry))
    thisDoc = array(set_of_words2_vec(vocabSet, 1, testEntry))
    print testEntry, 'classified as: ', classifyNB(thisDoc, pV, pAb, labels)


def test(number):
    '''
    Check the accuracy of the algorithm.
    :param number: fraction of the data to hold out as the test set
    :return:
    '''
    dataSet, labels = load_data_set()
    test_number = int(len(dataSet) * number)
    testSet = []
    # Randomly move test_number samples from the training set to the test set.
    for i in range(test_number):
        randIndex = int(random.uniform(0, len(dataSet)))
        testSet.append([dataSet[randIndex], labels[randIndex]])
        del (dataSet[randIndex])
        del (labels[randIndex])
    # Train
    vocabSet, labelSet = create_vocab_list(dataSet, labels)
    trainMatrix = []
    for index, row in enumerate(dataSet):
        trainMatrix.append(set_of_words2_vec(vocabSet, labels[index], row))
    pV, pAb = trainNB0(trainMatrix, labels)
    # Test
    errorCount = 0
    for row in testSet:
        testEntry = row[0]
        testEntry = list(set(testEntry))
        thisDoc = array(set_of_words2_vec(vocabSet, 1, testEntry))
        ret = classifyNB(thisDoc, pV, pAb, labels)
        if ret != row[1]:
            print "classification error", row[1], ret
            errorCount += 1
    print 'the error rate is: ', float(errorCount) / len(testSet)


test(0.1)
# testingNB()


Sadly, the test results are far from ideal. Maybe Chinese text cannot simply be segmented like this, or maybe some detail is off somewhere; pointers from more experienced readers are welcome. Either way, treat it as a learning exercise!
[Download the case files](http://download.csdn.net/detail/u010154424/9602826)