基于朴素贝叶斯的关于互联网金融新闻分类(python实现)
2014-11-25 20:38
751 查看
中国互联网金融发展迅速,2014年是中国互联网金融起步的一年,但在短短的一年时间内,互联网金融创业者们融资额度一再创高,雨后春笋般涌现出各类互联网金融产品让用户眼花缭乱,随着创业门槛的降低,在即将到来的2015年,互联网金融必将在中国掀起热潮。
在当下互联网金融持续升温的今天,作为一个毫无金融背景的工科生,也希望能够搭上互联网金融这趟快速列车,跑在同龄人的前面。在一份偶然的实习机会中,有幸接触了这一领域,也把自己的一部分收获与大家分享,使用朴素贝叶斯算法对互联网金融的新闻进行分类。
分类属于监督学习,进行分类前需要有固定分类的训练集,那么第一步就是对互联网金融新闻划分类别。从传统金融意义上来讲,对新闻可划分为:银行、证券以及保险。此三类作为传统金融的三大领域。接下来就是划分互联网金融的类别,互联网金融的三大热门形式是网贷、众筹和支付。根据金融常出现的另外三个词:投资、理财以及虚拟货币中的比特币,再分出三类。所以训练集中的新闻总共分为9类:银行、证券、保险、网贷、众筹、支付、投资、理财、比特币。
训练集数据的获取自然是通过爬虫从网站中爬取新闻正文,然后提取关键词进行判断,关键词提取可以采用tf-idf算法进行提取,找到所爬正文中属于我们要找的9类的文章,然后为这些文章建立向量,在这里向量的构建并没有采取针对整篇文章的词语建立的方式,而是针对提取的20个关键字来建立向量(这里涉及到朴素贝叶斯算法的一些背景知识,在这里不做过多赘述,参考文章前可以了解一下相关背景知识),既能够基本反映出文章内容,又起到降维的作用。训练集数据收集完成之后,就是具体使用朴素贝叶斯算法实现的过程。直接贴出代码以供参考:
'''
@author: lecheng
2014/11/15
'''
import os
import sys
from numpy import *
from data import get_content
from slicewords import extract_keywords
import db
#保险 0
#比特币 1
#理财 2
#投资 3
#网贷 4
#银行 5
#证券 6
#支付 7
#众筹 8
def get_data_by_type(type):
    """Fetch every news article of the given category from MongoDB and
    return a list of keyword lists (one list of keywords per article)."""
    # m0_content is the mongo collection that stores the training-set data
    table = db.db['m0_content']
    return [extract_keywords(doc.get('content'))
            for doc in table.find({'type': type})]
def load_data():
    """Build the training set.

    Returns a two-element list ``[word_lists, labels]`` where
    ``word_lists[i]`` is the keyword list of article *i* and
    ``labels[i]`` is its numeric category (0-8, see mapping above).
    """
    # (DB category name, numeric label, romanised name used in log output).
    # The romanised strings reproduce the original log messages exactly
    # (including the historical 'baoxin' spelling).
    categories = [
        ("保险", 0, "baoxin"),
        ("比特币", 1, "bitebi"),
        ("理财", 2, "licai"),
        ("投资", 3, "touzi"),
        ("网贷", 4, "wangdai"),
        ("银行", 5, "yinhang"),
        ("证券", 6, "zhengquan"),
        ("支付", 7, "zhifu"),
        ("众筹", 8, "zhongchou"),
    ]
    dataset = [[], []]
    for name, label, roman in categories:
        for keywords in get_data_by_type(name):
            dataset[0].append(keywords)
            dataset[1].append(label)
        print('load %s success' % roman)
    print("load data complete")
    return dataset
def url_to_wordlist(url):
    """Fetch the news article at *url* and return its extracted keywords."""
    content = get_content(url)
    keywords = extract_keywords(content)
    print(keywords)
    return keywords
def create_vocablist(dataset):
    """Return the deduplicated list of every keyword appearing in *dataset*."""
    vocab = set()
    for document in dataset:
        vocab.update(document)
    return list(vocab)
def news_to_vec(vocablist, inputset):
    """Convert a news keyword list into a 0/1 presence vector over *vocablist*.

    Words not found in the vocabulary are reported and ignored.
    """
    # Build word -> position once; calling list.index inside the loop
    # would be O(len(vocablist)) per word.  setdefault keeps the index of
    # the FIRST occurrence, matching list.index semantics exactly.
    index = {}
    for i, word in enumerate(vocablist):
        index.setdefault(word, i)
    returnvec = [0] * len(vocablist)
    for word in inputset:
        if word in index:
            returnvec[index[word]] = 1
        else:
            print('关键字:%s不在字典集中' % word)
    return returnvec
def trainNB0(trainmatrix, traincategory):
    """Train a 9-class naive Bayes classifier.

    trainmatrix  -- list of 0/1 presence vectors (one per document)
    traincategory -- list of numeric labels (0-8), parallel to trainmatrix

    Returns (pvect, pabusive):
    pvect[c]    -- per-word log conditional probabilities for class c
    pabusive[c] -- prior probability of class c
    Uses Laplace smoothing (counts start at 1, denominators at 2.0),
    matching the scheme from "Machine Learning in Action".
    """
    num_classes = 9
    numtraindocs = len(trainmatrix)
    numwords = len(trainmatrix[0])
    # Class priors: fraction of training docs in each class.
    pabusive = [traincategory.count(c) / float(numtraindocs)
                for c in range(num_classes)]
    # Smoothed word counts and denominators per class.
    pnum = [ones(numwords) for _ in range(num_classes)]
    pdenom = [2.0] * num_classes
    for vec, cat in zip(trainmatrix, traincategory):
        # Labels outside 0-8 are silently skipped, exactly as the
        # original if/elif chain did.
        if 0 <= cat < num_classes:
            pnum[cat] += vec
            pdenom[cat] += sum(vec)
    # Log probabilities to avoid underflow when many terms are multiplied.
    pvect = [log(pnum[c] / pdenom[c]).tolist() for c in range(num_classes)]
    print("train success")
    return pvect, pabusive
def classifyNB(vec2classify, pvect, pabusive):
    """Return the index of the most probable category for a news vector.

    vec2classify -- 0/1 presence vector of the news to classify
    pvect        -- per-class log conditional probability vectors
    pabusive     -- per-class prior probabilities

    Scores each class by log P(class) + sum of the log probabilities of
    the present words, and returns the argmax.  Generalised to any
    number of classes (len(pabusive)) instead of a hard-coded 9.
    """
    vec = array(vec2classify)
    scores = [sum(vec * pvect[c]) + log(pabusive[c])
              for c in range(len(pabusive))]
    return scores.index(max(scores))
在当下互联网金融持续升温的今天,作为一个毫无金融背景的工科生,也希望能够搭上互联网金融这趟快速列车,跑在同龄人的前面。在一份偶然的实习机会中,有幸接触了这一领域,也把自己的一部分收获与大家分享,使用朴素贝叶斯算法对互联网金融的新闻进行分类。
分类属于监督学习,进行分类前需要有固定分类的训练集,那么第一步就是对互联网金融新闻划分类别。从传统金融意义上来讲,对新闻可划分为:银行、证券以及保险。此三类作为传统金融的三大领域。接下来就是划分互联网金融的类别,互联网金融的三大热门形式是网贷、众筹和支付。根据金融常出现的另外三个词:投资、理财以及虚拟货币中的比特币,再分出三类。所以训练集中的新闻总共分为9类:银行、证券、保险、网贷、众筹、支付、投资、理财、比特币。
训练集数据的获取自然是通过爬虫从网站中爬取新闻正文,然后提取关键词进行判断,关键词提取可以采用tf-idf算法进行提取,找到所爬正文中属于我们要找的9类的文章,然后为这些文章建立向量,在这里向量的构建并没有采取针对整篇文章的词语建立的方式,而是针对提取的20个关键字来建立向量(这里涉及到朴素贝叶斯算法的一些背景知识,在这里不做过多赘述,参考文章前可以了解一下相关背景知识),既能够基本反映出文章内容,又起到降维的作用。训练集数据收集完成之后,就是具体使用朴素贝叶斯算法实现的过程。直接贴出代码以供参考:
'''
@author: lecheng
2014/11/15
'''
import os
import sys
from numpy import *
from data import get_content
from slicewords import extract_keywords
import db
#保险 0
#比特币 1
#理财 2
#投资 3
#网贷 4
#银行 5
#证券 6
#支付 7
#众筹 8
def get_data_by_type(type):
    """Fetch every news article of the given category from MongoDB and
    return a list of keyword lists (one list of keywords per article)."""
    # m0_content is the mongo collection that stores the training-set data
    table = db.db['m0_content']
    return [extract_keywords(doc.get('content'))
            for doc in table.find({'type': type})]
def load_data():
    """Build the training set.

    Returns a two-element list ``[word_lists, labels]`` where
    ``word_lists[i]`` is the keyword list of article *i* and
    ``labels[i]`` is its numeric category (0-8, see mapping above).
    """
    # (DB category name, numeric label, romanised name used in log output).
    # The romanised strings reproduce the original log messages exactly
    # (including the historical 'baoxin' spelling).
    categories = [
        ("保险", 0, "baoxin"),
        ("比特币", 1, "bitebi"),
        ("理财", 2, "licai"),
        ("投资", 3, "touzi"),
        ("网贷", 4, "wangdai"),
        ("银行", 5, "yinhang"),
        ("证券", 6, "zhengquan"),
        ("支付", 7, "zhifu"),
        ("众筹", 8, "zhongchou"),
    ]
    dataset = [[], []]
    for name, label, roman in categories:
        for keywords in get_data_by_type(name):
            dataset[0].append(keywords)
            dataset[1].append(label)
        print('load %s success' % roman)
    print("load data complete")
    return dataset
def url_to_wordlist(url):
    """Fetch the news article at *url* and return its extracted keywords."""
    content = get_content(url)
    keywords = extract_keywords(content)
    print(keywords)
    return keywords
def create_vocablist(dataset):
    """Return the deduplicated list of every keyword appearing in *dataset*."""
    vocab = set()
    for document in dataset:
        vocab.update(document)
    return list(vocab)
def news_to_vec(vocablist, inputset):
    """Convert a news keyword list into a 0/1 presence vector over *vocablist*.

    Words not found in the vocabulary are reported and ignored.
    """
    # Build word -> position once; calling list.index inside the loop
    # would be O(len(vocablist)) per word.  setdefault keeps the index of
    # the FIRST occurrence, matching list.index semantics exactly.
    index = {}
    for i, word in enumerate(vocablist):
        index.setdefault(word, i)
    returnvec = [0] * len(vocablist)
    for word in inputset:
        if word in index:
            returnvec[index[word]] = 1
        else:
            print('关键字:%s不在字典集中' % word)
    return returnvec
def trainNB0(trainmatrix, traincategory):
    """Train a 9-class naive Bayes classifier.

    trainmatrix  -- list of 0/1 presence vectors (one per document)
    traincategory -- list of numeric labels (0-8), parallel to trainmatrix

    Returns (pvect, pabusive):
    pvect[c]    -- per-word log conditional probabilities for class c
    pabusive[c] -- prior probability of class c
    Uses Laplace smoothing (counts start at 1, denominators at 2.0),
    matching the scheme from "Machine Learning in Action".
    """
    num_classes = 9
    numtraindocs = len(trainmatrix)
    numwords = len(trainmatrix[0])
    # Class priors: fraction of training docs in each class.
    pabusive = [traincategory.count(c) / float(numtraindocs)
                for c in range(num_classes)]
    # Smoothed word counts and denominators per class.
    pnum = [ones(numwords) for _ in range(num_classes)]
    pdenom = [2.0] * num_classes
    for vec, cat in zip(trainmatrix, traincategory):
        # Labels outside 0-8 are silently skipped, exactly as the
        # original if/elif chain did.
        if 0 <= cat < num_classes:
            pnum[cat] += vec
            pdenom[cat] += sum(vec)
    # Log probabilities to avoid underflow when many terms are multiplied.
    pvect = [log(pnum[c] / pdenom[c]).tolist() for c in range(num_classes)]
    print("train success")
    return pvect, pabusive
def classifyNB(vec2classify, pvect, pabusive):
    """Return the index of the most probable category for a news vector.

    vec2classify -- 0/1 presence vector of the news to classify
    pvect        -- per-class log conditional probability vectors
    pabusive     -- per-class prior probabilities

    Scores each class by log P(class) + sum of the log probabilities of
    the present words, and returns the argmax.  Generalised to any
    number of classes (len(pabusive)) instead of a hard-coded 9.
    """
    vec = array(vec2classify)
    scores = [sum(vec * pvect[c]) + log(pabusive[c])
              for c in range(len(pabusive))]
    return scores.index(max(scores))
import jieba.posseg as pseg
import jieba.analyse
import jieba
import db
import sys
import os


def load_dict():
    """Load every custom dictionary file under userdict/ into jieba."""
    files = os.listdir("userdict")
    path = sys.path[0] + "/userdict/"
    for filename in files:
        jieba.load_userdict(path + filename)


load_dict()  # register the custom dictionaries at import time


def extract_keywords(content):
    """Extract keywords from an article.

    POS-tags the text, keeps noun-ish and verb tokens (dropping
    English-tagged ones), then runs jieba's TF-IDF keyword extraction
    over the filtered text.
    """
    words = pseg.cut(content)
    candidate = []
    for w in words:
        # keep nouns ('n' appears in the flag) and verbs, skip English tokens
        if ('n' in w.flag or w.flag == 'v') and w.flag != 'eng':
            candidate += [w.word]
    keywords = jieba.analyse.extract_tags("".join(candidate))
    return keywords
import bayes
import db
import json


def init_data():
    """Build the training data, train the classifier, and persist the
    model (vocabulary, log-probabilities, priors) to classify.txt."""
    print("load data start...")
    dataset = bayes.load_data()
    vocablist = bayes.create_vocablist(dataset[0])
    print(len(dataset[0]))
    print(len(dataset[1]))
    trainmatrix = []
    for i in dataset[0]:
        trainmatrix.append(bayes.news_to_vec(vocablist, i))
    print("train start...")
    pvect, pabusive = bayes.trainNB0(trainmatrix, dataset[1])
    d = {}
    # Values are stored via str() and re-parsed on load -- fragile but
    # kept for compatibility with existing classify.txt files.
    d["vocablist"] = str(vocablist)
    d["pvect"] = str(pvect)
    d["pabusive"] = str(pabusive)
    # 'with' guarantees the file is closed even if json.dump raises.
    with open("classify.txt", "w") as f:
        json.dump(d, f)


def classify_by_url(news_to_classify_url):
    """Classify the news article at the given URL.

    Loads the persisted model from classify.txt and returns the numeric
    category (0-8).
    """
    with open("classify.txt") as f:
        data = json.load(f)
    pvect = json.loads(data.get('pvect'))
    # vocablist was saved with str(list) (single quotes), so swap the
    # quote style to make it parseable as JSON.
    s1 = data.get('vocablist').replace('\'', '"')
    vocablist = json.loads(s1)
    pabusive = json.loads(data.get('pabusive'))
    inputset = bayes.url_to_wordlist(news_to_classify_url)
    news_to_classify_vec = bayes.news_to_vec(vocablist, inputset)
    category = bayes.classifyNB(news_to_classify_vec, pvect, pabusive)
    return category


# Numeric label -> Chinese category name (same mapping as training).
_CATEGORY_NAMES = {
    0: "保险",
    1: "比特币",
    2: "理财",
    3: "投资",
    4: "网贷",
    5: "银行",
    6: "证券",
    7: "支付",
    8: "众筹",
}


def result(category):
    """Map a numeric category (0-8) back to its Chinese name.

    Returns None for unknown labels, matching the original if/elif chain."""
    return _CATEGORY_NAMES.get(category)


if __name__ == "__main__":
    # init_data()  # run once on first use to build and persist the model
    category = classify_by_url("http://www.csai.cn/if/743378.html")
    print(result(category))
相关文章推荐
- Python实现基于朴素贝叶斯的垃圾邮件分类
- 贝叶斯分类方法学习三 python+jieba+mongodb实现朴素贝叶斯新闻文本自动分类
- Python实现基于朴素贝叶斯的垃圾邮件分类 标签: python朴素贝叶斯垃圾邮件分类 2016-04-20 15:09 2750人阅读 评论(1) 收藏 举报 分类: 机器学习(19) 听说
- 机器学习—— 基于朴素贝叶斯分类算法构建文本分类器的Python实现
- 《机器学习实战》基于朴素贝叶斯分类算法构建文本分类器的Python实现
- 基于朴素贝叶斯分类器的文本分类算法的实现过程分析
- 机器学习之朴素贝叶斯(NB)分类算法与Python实现
- 神经网络与深度学习 使用Python实现基于梯度下降算法的神经网络和自制仿MNIST数据集的手写数字分类可视化程序 web版本
- 朴素贝叶斯分类的Python实现
- python基于urllib实现按照百度音乐分类下载mp3的方法
- 【机器学习实战-python3】基于概率论的分类方法:朴素贝叶斯
- 分类算法-----朴素贝叶斯原理和python实现
- 文本分类的python实现-基于SVM算法
- 基于文本密度的新闻正文抽取方法之Python实现
- 基于Python的朴素贝叶斯算法实现
- 基于朴素贝叶斯分类器的文本分类算法的实现过程分析
- 机器学习实战python版第四章基于概率论的分类方法 朴素贝叶斯
- 朴素贝叶斯分类算法的Python实现
- 基于朴素贝叶斯分类算法实现垃圾邮箱分类
- 朴素贝叶斯的概率理论及其python代码实现文本分类的实例