
Generating a Chinese Word Cloud and Word-Frequency Statistics with Python + wordcloud + jieba + docx



Taking the report to the 19th National Congress (.docx format) as an example, this article walks through the complete process of generating a Chinese word cloud and word-frequency statistics. Three core Python libraries are required:

wordcloud: a word-cloud generator for Python

jieba: a Chinese word-segmentation tool

docx: the python-docx package, for reading .docx documents in Python

Preparation

Download Python 3.x from the official Python site and install it (this article uses python-3.6.3).

Add the Python installation directory to the PATH environment variable.

Run pip install python-docx to install the python-docx module.

Running pip install wordcloud to install the wordcloud module failed with an error (screenshot not preserved):



Workaround:

Download a prebuilt .whl file from http://www.lfd.uci.edu/~gohlke/pythonlibs/#wordcloud

Choose wordcloud-1.3.2-cp36-cp36m-win32.whl (pick the build that matches your Python version and architecture), cd into the directory containing the .whl file, and run:

python -m pip install wordcloud-1.3.2-cp36-cp36m-win32.whl

Run pip install jieba to install the jieba module.

Prepare china_map.png as the shape mask for the word cloud, preferably in PNG format (wordcloud treats pure-white areas of the mask as masked out, so words are drawn only inside the non-white region):



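With everything in place, a quick sanity check (not part of the original post) confirms that all three libraries import cleanly:

# check.py -- a minimal import check; if any line fails, revisit the steps above
import jieba
import docx       # installed as python-docx, imported as docx
import wordcloud

print("jieba, python-docx and wordcloud imported successfully")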
Code

The code below saves each intermediate result (the raw segmentation, the segmentation after stop-word removal, and the word-frequency statistics) to a local file; you can skip these steps as needed. Pay attention to the file-storage paths and change them to suit your own setup.
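Since every path below is hard-coded, one optional refactor (a sketch, not part of the original code) is to gather the paths in one place so they only need to be changed once:

# paths.py -- hypothetical helper module, not in the original code
from pathlib import Path

BASE_DIR = Path(r"F:\python")                   # change to your own directory
DOC_PATH = BASE_DIR / "十九大报告.docx"          # input document
SEG_PATH = BASE_DIR / "分词结果.txt"             # segmentation output
FREQ_PATH = BASE_DIR / "词频统计(去停用词).txt"  # word-frequency output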

# fenci.py
# -*- coding: utf-8 -*-

import docx
import jieba
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator
import drawBar

def readDocument():
    '''
    Open the document and read its content into doc, paragraph by paragraph.
    '''
    file = docx.Document("F:\\python\\十九大报告.docx")
    doc = ""
    for para in file.paragraphs:
        doc = doc + para.text

    return doc

def segment(doc):
    '''
    Segment the input document with jieba and save the result locally
    (this step can be skipped).
    '''
    seg_list = " ".join(jieba.cut(doc, cut_all=False))  # seg_list is a str

    document_after_segment = open('分词结果.txt', 'w+', encoding='utf-8')
    document_after_segment.write(seg_list)
    document_after_segment.close()

    return seg_list

def wordCount(segment_list):
    '''
    Count word frequencies and save the result to a local file.
    Not needed for the word cloud itself; it feeds the word-frequency
    bar chart.
    '''
    word_dict = {}
    with open(r'F:\python\词频统计(去停用词).txt', 'w', encoding='utf-8') as wf2:
        for word in segment_list.split(' '):
            if word not in word_dict:
                word_dict[word] = 1
            else:
                word_dict[word] += 1

        # Sort by frequency, descending (collections.Counter.most_common
        # would achieve the same)
        word_dict_sorted = dict(sorted(word_dict.items(),
                                       key=lambda item: item[1], reverse=True))
        for key in word_dict_sorted:
            wf2.write(key + ' ' + str(word_dict_sorted[key]) + '\n')

def drawWordCloud(seg_list):
    '''
    Build the word cloud and configure its parameters.
    '''
    # Read the mask image; scipy.misc.imread was removed in SciPy >= 1.2,
    # so PIL + numpy are used here instead. Mind the path.
    color_mask = np.array(Image.open("china_map.png"))
    wc = WordCloud(
        # A font must be set, or Chinese characters render as boxes; mind the path
        font_path="simkai.ttf",
        # Background color
        background_color='white',
        # Word-cloud shape
        mask=color_mask,
        # Maximum number of words
        max_words=2000,
        # Largest font size
        max_font_size=60
    )
    wc.generate(seg_list)  # generate the word cloud
    image_colors = ImageColorGenerator(color_mask)
    wc.to_file("ciyun.jpg")  # save the image
    # Display the word cloud
    plt.imshow(wc, interpolation="bilinear")
    plt.axis('off')

    # Display it again, recolored with the colors of the mask image
    plt.figure()
    plt.imshow(wc.recolor(color_func=image_colors), interpolation="bilinear")
    plt.axis("off")

    plt.show()

def removeStopWords(seg_list):
    '''
    Remove stop words. Download a stop-word list such as stopwords1893.txt
    yourself (one stop word per line).
    '''
    wordlist_stopwords_removed = []

    # Adjust the encoding to match your stop-word file
    stop_words = open(r'F:\python\stopwords1893.txt', encoding='utf-8')
    stop_words_text = stop_words.read()

    stop_words.close()

    stop_words_text_list = stop_words_text.split('\n')
    after_seg_text_list = seg_list.split(' ')

    for word in after_seg_text_list:
        if word not in stop_words_text_list:
            wordlist_stopwords_removed.append(word)

    with open(r'F:\python\分词结果(去停用词).txt', 'w', encoding='utf-8') as without_stopwords:
        without_stopwords.write(' '.join(wordlist_stopwords_removed))
    return ' '.join(wordlist_stopwords_removed)

if __name__ == "__main__":
    doc = readDocument()
    segment_list = segment(doc)
    segment_list_remove_stopwords = removeStopWords(segment_list)
    drawWordCloud(segment_list_remove_stopwords)
    wordCount(segment_list_remove_stopwords)
    drawBar.drawStatBarh()
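
A note on the cut_all parameter used in segment(): jieba offers several segmentation modes. The short demo below (not part of the original post; any sentence works as input) shows the difference:

# jieba_modes.py -- demo of jieba's segmentation modes
import jieba

text = "我来到北京清华大学"
print("/".join(jieba.cut(text, cut_all=False)))  # precise mode, used in segment() above
print("/".join(jieba.cut(text, cut_all=True)))   # full mode: every possible word
print("/".join(jieba.cut_for_search(text)))      # search-engine mode: further splits long words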


# drawBar.py
# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import numpy as np

def drawStatBarh():
    '''
    Draw a horizontal bar chart of the top-N word frequencies,
    shaded with a color gradient.
    '''
    fig, ax = plt.subplots()
    myfont = FontProperties(fname=r'F:\python\simkai.ttf')
    N = 30
    words = []
    counts = []
    for line in open(r'F:\python\词频统计(去停用词).txt', encoding='utf-8'):
        words.append(line.split(' ')[0])
        counts.append(int(line.split(' ')[1].strip('\n')))

    y_pos = np.arange(N)

    # Gradient effect: start from the base color #FA8072 and decrement
    # the trailing digits once per bar
    colors = ['#FA8072']
    for i in range(len(words[:N]) - 1):
        colors.append('#FA' + str(int(colors[-1][3:]) - 1))

    rects = ax.barh(y_pos, counts[:N], align='center', color=colors)

    ax.set_yticks(np.arange(N))
    ax.set_yticklabels(words[:N], fontproperties=myfont)
    ax.invert_yaxis()  # labels read top-to-bottom
    # Title: "High-frequency words in the 19th National Congress report"
    ax.set_title('十九大报告中的高频词汇', fontproperties=myfont, fontsize=17)
    # X label: "number of occurrences"
    ax.set_xlabel(u"出现次数", fontproperties=myfont)

    autolabel(rects, ax)
    plt.show()

def autolabel(rects, ax):
    """
    Attach a count label to each bar.
    """
    for rect in rects:
        width = rect.get_width()
        ax.text(1.03 * width, rect.get_y() + rect.get_height() / 2.,
                '%d' % int(width), ha='center', va='center')
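
The hex-decrement trick in drawStatBarh() happens to work for the base color #FA8072 and small N, but it would break for base colors whose trailing digits contain hex letters. A more robust alternative (a sketch using a standard matplotlib colormap, not the original approach) is:

# Alternative gradient: sample a matplotlib colormap, one RGBA color per bar
import numpy as np
import matplotlib.pyplot as plt

N = 30
colors = plt.cm.Reds(np.linspace(0.9, 0.3, N))  # dark-to-light reds
# then, inside drawStatBarh():
# rects = ax.barh(y_pos, counts[:N], align='center', color=colors)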


Results

Word cloud in the shape of china_map



Word cloud in the shape of china_map, recolored from the mask image



Word-frequency statistics



Conclusion

This article drew on a number of existing write-ups and code samples, and the result can turn any Chinese Word document into a word cloud plus word-frequency statistics. The word-cloud options are described in detail on the wordcloud project site and allow for very polished output; an interactive version would be an interesting next step, which I plan to try.