您的位置:首页 > 编程语言 > Python开发

python 分词计算文档TF-IDF值并排序

2017-04-17 13:22 519 查看
文章来自于我的个人博客:python 分词计算文档TF-IDF值并排序

该程序实现的功能是:首先读取一些文档,然后通过jieba进行分词,将分词结果存入文件,再通过sklearn计算每一个分词文档中各个词的tf-idf值,最后将所有结果排序后输出到一个大文件里

依赖包:

sklearn

jieba

注:此程序参考了一位同行的程序后进行了改动

# -*- coding: utf-8 -*-
"""
@author: jiangfuqiang
"""

import io
import os
import re
import string
import sys
import time

import jieba
import jieba.posseg as pseg
from sklearn import feature_extraction
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
# Python 2 only: force the implicit str<->unicode conversions to use UTF-8.
# On Python 3 the default encoding is already UTF-8 and neither the `reload`
# builtin nor `sys.setdefaultencoding` exists, so the hack must be skipped.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('utf-8')

def getFileList(path):
    """Return the visible entries of *path* together with *path* itself.

    Entries whose name starts with a dot (hidden files) are skipped.  The
    names are returned sorted because ``os.listdir`` gives them in arbitrary
    order, and callers that number their output by list position need a
    reproducible ordering.

    Args:
        path: Directory to list.

    Returns:
        A ``(filelist, path)`` tuple: the sorted list of entry names (not
        full paths) and the directory that was passed in.

    Raises:
        OSError: If *path* cannot be listed.
    """
    filelist = sorted(f for f in os.listdir(path) if not f.startswith('.'))
    return filelist, path

def _clean_segments(segments):
    """Strip whitespace from each token and drop the ones that are noise.

    A token is dropped when it is empty after whitespace removal, when it is
    one of the punctuation marks ``= [ ] ( )``, or when it contains an ASCII
    letter, digit or underscore.
    """
    noise = ('=', '[', ']', '(', ')')
    # Explicit ASCII class instead of r'\w': on Python 3 (and with re.UNICODE)
    # \w also matches CJK characters and would discard every Chinese token.
    ascii_word = re.compile(r'[A-Za-z0-9_]')
    cleaned = []
    for seg in segments:
        # Removes spaces and line breaks, including ones inside the token.
        seg = ''.join(seg.split())
        if not seg or seg in noise or ascii_word.search(seg):
            continue
        cleaned.append(seg)
    return cleaned


def fenci(filename, path, segPath):
    """Segment one document with jieba and save the space-joined tokens.

    The document ``path/filename`` is cut in jieba's full mode
    (``cut_all=True``), the tokens are filtered by ``_clean_segments`` and
    the result is written to ``segPath/<filename>-seg.txt`` as UTF-8.

    Args:
        filename: Name of the document inside *path*.
        path: Directory holding the source documents.
        segPath: Directory that receives the segmented files; created if it
            does not exist.
    """
    # Read raw bytes and let jieba decode them, as the original script did
    # by passing it an undecoded Python 2 str.
    with open(os.path.join(path, filename), 'rb') as src:
        raw_text = src.read()

    # Folder that stores the segmentation results.
    if not os.path.isdir(segPath):
        os.makedirs(segPath)

    # Segment the document, then discard whitespace and noise tokens.
    tokens = _clean_segments(jieba.cut(raw_text, cut_all=True))

    # Save the tokens separated by single spaces.
    out_path = os.path.join(segPath, filename + "-seg.txt")
    with io.open(out_path, 'w', encoding='utf-8') as dst:
        dst.write(u' '.join(tokens))

# Read the already-segmented documents and compute their TF-IDF weights.
def Tfidf(filelist, sFilePath, path):
    """Compute TF-IDF for the segmented documents and write one file each.

    For every name in *filelist* the file ``path/<name>-seg.txt`` is read as
    UTF-8.  All documents are vectorized together with ``CountVectorizer``
    and weighted with ``TfidfTransformer``.  Document number ``i`` (its
    position in *filelist*) is written to ``sFilePath/<i zero-padded to 5
    digits>.txt`` with one ``term  weight`` line per vocabulary term.

    Args:
        filelist: Names of the original documents (without ``-seg.txt``).
        sFilePath: Directory that receives the result files; created if it
            does not exist.
        path: Directory holding the ``-seg.txt`` files.
    """
    corpus = []
    for name in filelist:
        # os.path.join supplies the separator that plain concatenation of
        # directory and file name would omit.
        seg_file = os.path.join(path, name + "-seg.txt")
        with io.open(seg_file, 'r', encoding='utf-8') as seg:
            corpus.append(seg.read())

    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(corpus)
    weight = TfidfTransformer().fit_transform(counts).toarray()
    # Vocabulary of the whole corpus.  Newer scikit-learn releases replaced
    # get_feature_names() with get_feature_names_out(); support both.
    if hasattr(vectorizer, "get_feature_names_out"):
        word = vectorizer.get_feature_names_out()
    else:
        word = vectorizer.get_feature_names()

    if not os.path.isdir(sFilePath):
        os.makedirs(sFilePath)

    for i, row in enumerate(weight):
        out_path = os.path.join(sFilePath, str(i).zfill(5) + ".txt")
        # Spacing matches what the original comma-separated print emitted.
        print(u'----------writing all the tf-idf in the  {0} file into  {1}'
              .format(i, out_path))
        with io.open(out_path, 'w', encoding='utf-8') as out:
            out.writelines(
                term + u"  " + str(score) + u"\n"
                for term, score in zip(word, row)
            )

if __name__ == "__main__":
    # Folder for the TF-IDF results; the timestamp gives every run its own.
    sFilePath = "/home/lifeix/soft/allfile/tfidffile" + str(time.time())
    # Folder for the segmented documents.
    segPath = '/home/lifeix/soft/allfile/segfile'
    (allfile, path) = getFileList('/home/lifeix/soft/allkeyword')
    for ff in allfile:
        print("Using jieba on " + ff)
        fenci(ff, path, segPath)

    # Pass the directory with a trailing separator so the segmented files
    # written above resolve correctly even when the callee concatenates
    # directory and file name directly.
    Tfidf(allfile, sFilePath, os.path.join(segPath, ""))
    # Merge all per-document files and sort every line by its weight
    # (field 2), numerically and in descending order.
    os.system("sort -nrk 2 " + sFilePath + "/*.txt >" + sFilePath + "/sorted.txt")
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: