
Classifying Internet Finance News with Naive Bayes (a Python Implementation)

2014-11-25
        Internet finance is developing rapidly in China. 2014 was the year it got off the ground, and in that short time internet finance startups have raised record amounts of funding, while a dazzling array of products has sprung up like bamboo shoots after rain. With the barrier to entry falling, internet finance is set to sweep China in the coming year, 2015.

        With internet finance heating up, I hoped that, even as an engineering student with no finance background, I could catch this fast-moving train and get ahead of my peers. A chance internship brought me into the field, and here I'd like to share part of what I learned: classifying internet finance news with the naive Bayes algorithm.

        Classification is supervised learning, so before classifying we need a training set with fixed categories; the first step is therefore to decide the categories for internet finance news. Traditional finance divides news into its three major fields: banking, securities, and insurance. Next come the internet finance categories: its three hottest forms are P2P lending, crowdfunding, and payments. Three more categories come from terms that frequently appear in finance news: investment, wealth management, and (for virtual currency) Bitcoin. The training set thus covers nine categories in total: banking (银行), securities (证券), insurance (保险), P2P lending (网贷), crowdfunding (众筹), payments (支付), investment (投资), wealth management (理财), and Bitcoin (比特币).

        The training data is gathered by crawling news articles from the web, extracting each article's keywords with the tf-idf algorithm, and using those keywords to find articles belonging to the nine target categories. Each article is then turned into a vector. Rather than building the vector over every word in the article, it is built from the 20 extracted keywords, which still captures the article's content while sharply reducing dimensionality. (This relies on some background on the naive Bayes algorithm that I won't belabor here; it may help to review it before reading on.) Once the training data is in place, what remains is the concrete naive Bayes implementation. The code follows for reference:
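        For readers who want just the gist of that background: the code below implements the standard naive Bayes decision rule with Laplace smoothing (sketched here for reference; the notation is mine, not the original post's). Writing $x_i \in \{0,1\}$ for whether vocabulary word $w_i$ appears among an article's extracted keywords, the predicted class is

$$\hat{c} = \arg\max_{c}\Big(\log P(c) + \sum_{i} x_i \log P(w_i \mid c)\Big), \qquad P(w_i \mid c) = \frac{N_{c,i} + 1}{N_c + 2},$$

where $P(c)$ is the fraction of training documents labeled $c$, $N_{c,i}$ counts the class-$c$ training vectors containing $w_i$, and $N_c$ is the total keyword count over class $c$. The $+1$ and $+2$ are the Laplace smoothing terms, which show up in the training code below as the ones(...) initialization and the 2.0 denominators.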

'''
@author: lecheng
2014/11/15
'''
from numpy import *
from data import get_content             # helper: fetch the body text of a news page by url
from slicewords import extract_keywords  # helper: extract an article's keywords (shown further below)
import db                                # mongo database wrapper

# Category labels (index = class id):
# 保险 (insurance) 0, 比特币 (Bitcoin) 1, 理财 (wealth management) 2,
# 投资 (investment) 3, 网贷 (P2P lending) 4, 银行 (banking) 5,
# 证券 (securities) 6, 支付 (payments) 7, 众筹 (crowdfunding) 8
CATEGORIES = ["保险", "比特币", "理财", "投资", "网贷", "银行", "证券", "支付", "众筹"]

def get_data_by_type(type):
    '''Fetch the keyword lists of all news items of a given category from the database.'''
    table = db.db['m0_content']  # m0_content is the mongo collection holding the training set
    word_lists = []
    contents = table.find({'type': type})
    for content in contents:
        keywords = extract_keywords(content.get('content'))
        word_lists += [keywords]
    return word_lists

def load_data():
    '''Build the training set: dataset[0] holds keyword lists, dataset[1] the matching labels.'''
    dataset = [[], []]
    for label, name in enumerate(CATEGORIES):
        for keywords in get_data_by_type(name):
            dataset[0] += [keywords]
            dataset[1] += [label]
        print('load %s success' % name)
    print("load data complete")
    return dataset

def url_to_wordlist(url):
    '''Fetch a news article by url and return its extracted keywords.'''
    content = get_content(url)
    keywords = extract_keywords(content)
    print(keywords)
    return keywords

def create_vocablist(dataset):
    '''Build the vocabulary: the union of all keywords seen in the training set.'''
    vocabset = set([])
    for document in dataset:
        vocabset = vocabset | set(document)
    return list(vocabset)

def news_to_vec(vocablist, inputset):
    '''Convert a news keyword list into a binary vector over the vocabulary.'''
    returnvec = [0] * len(vocablist)
    for word in inputset:
        if word in vocablist:
            returnvec[vocablist.index(word)] = 1
        else:
            print('keyword %s is not in the vocabulary' % word)
    return returnvec

def trainNB0(trainmatrix, traincategory):
    '''Naive Bayes training: returns one log-likelihood vector per class plus the class priors.'''
    numtraindocs = len(trainmatrix)
    numwords = len(trainmatrix[0])
    numclasses = max(traincategory) + 1  # 9 in this application
    # class priors P(c)
    pabusive = [traincategory.count(c) / float(numtraindocs) for c in range(numclasses)]
    # Laplace smoothing: word counts start at 1, denominators at 2
    pnum = ones((numclasses, numwords))
    pdenom = 2.0 * ones(numclasses)
    for i in range(numtraindocs):
        c = traincategory[i]
        pnum[c] += trainmatrix[i]
        pdenom[c] += sum(trainmatrix[i])
    # per-class log conditional probabilities log P(w|c)
    pvect = [log(pnum[c] / pdenom[c]).tolist() for c in range(numclasses)]
    print("train success")
    return pvect, pabusive

def classifyNB(vec2classify, pvect, pabusive):
    '''Naive Bayes classification: return the index of the most probable class.

    vec2classify is the news vector to classify.
    '''
    vec2classify = array(vec2classify)
    # score per class: log P(c) + sum over present keywords of log P(w|c)
    p = [sum(vec2classify * array(pv)) + log(pa) for pv, pa in zip(pvect, pabusive)]
    return p.index(max(p))
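
As a quick sanity check of the three core functions, here is a minimal, self-contained sketch on toy data (the keyword lists and labels are made up purely for illustration):

import bayes

# hypothetical toy data: label 0 for sports-style docs, label 1 for finance-style docs
docs = [['进球', '比赛'], ['比赛', '冠军'], ['股票', '大盘'], ['利率', '股票']]
labels = [0, 0, 1, 1]

vocablist = bayes.create_vocablist(docs)
trainmatrix = [bayes.news_to_vec(vocablist, d) for d in docs]
pvect, pabusive = bayes.trainNB0(trainmatrix, labels)

# classify a new keyword list; '股票' pushes it toward the finance class
vec = bayes.news_to_vec(vocablist, ['股票', '冠军'])
print(bayes.classifyNB(vec, pvect, pabusive))   # prints 1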

The extract_keywords function imported above lives in its own module (slicewords); it uses jieba for word segmentation and tf-idf keyword extraction:

import jieba.posseg as pseg
import jieba.analyse
import jieba
import sys
import os

def load_dict():
    '''Load every user dictionary under the userdict directory into jieba.'''
    files = os.listdir("userdict")
    path = sys.path[0] + "/userdict/"
    for filename in files:
        jieba.load_userdict(path + filename)

load_dict()  # register the custom dictionaries at import time

def extract_keywords(content):
    '''Extract an article's keywords: keep nouns and verbs, then rank by tf-idf.'''
    words = pseg.cut(content)
    candidate = []
    for w in words:
        # keep nouns (any flag containing 'n') and verbs; the 'eng' flag
        # also contains 'n', so English tokens must be excluded explicitly
        if ('n' in w.flag or w.flag == 'v') and w.flag != 'eng':
            candidate += [w.word]
    keywords = jieba.analyse.extract_tags("".join(candidate))
    return keywords
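
Note that jieba.analyse.extract_tags returns the top 20 tf-idf keywords by default (topK=20), which is where the 20-keyword vectors mentioned earlier come from. A quick usage sketch, assuming the userdict directory that load_dict expects is present (the headline is made up, and the exact output depends on your jieba dictionaries):

from slicewords import extract_keywords

keywords = extract_keywords("央行宣布下调金融机构人民币贷款基准利率")
print(keywords)   # at most 20 keywords, e.g. ['央行', '贷款', '利率', ...]
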
Finally, a driver script ties everything together: train once, save the model to disk, then classify news by url:

import bayes
import json

def init_data():
    '''Initialize the training data and train the classifier.'''
    print("load data start...")
    dataset = bayes.load_data()
    vocablist = bayes.create_vocablist(dataset[0])
    print(len(dataset[0]))
    print(len(dataset[1]))
    trainmatrix = []
    for i in dataset[0]:
        trainmatrix.append(bayes.news_to_vec(vocablist, i))
    print("train start...")
    pvect, pabusive = bayes.trainNB0(trainmatrix, dataset[1])
    # save the training results to the classify.txt file as JSON
    d = {"vocablist": vocablist, "pvect": pvect, "pabusive": pabusive}
    with open("classify.txt", "w") as f:
        json.dump(d, f)

def classify_by_url(news_to_classify_url):
    '''Classify the news article at the given url.'''
    with open("classify.txt") as f:
        data = json.load(f)
    pvect = data['pvect']
    vocablist = data['vocablist']
    pabusive = data['pabusive']
    inputset = bayes.url_to_wordlist(news_to_classify_url)
    news_to_classify_vec = bayes.news_to_vec(vocablist, inputset)
    category = bayes.classifyNB(news_to_classify_vec, pvect, pabusive)
    return category

def result(category):
    '''Map a class index back to its category name.'''
    return bayes.CATEGORIES[category]

if __name__ == "__main__":
    # init_data()  # run once, on first use, to build classify.txt
    category = classify_by_url("http://www.csai.cn/if/743378.html")
    print(result(category))