
Computing the cosine similarity of two documents (tf-idf)

2017-06-29 11:26
# -*- coding:utf-8 -*-

"""
@author: Linlifang
"""

import os
import jieba
import sys
import re
import string
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
# Python 2 hack: make utf-8 the default encoding so the mixed str/unicode
# operations below do not raise UnicodeDecodeError
reload(sys)
sys.setdefaultencoding('utf-8')

'''
Read the documents in a folder and segment each one with jieba, saving the
segmentation results to files. Then use sklearn to compute the tf-idf weight
of every term in every document, writing one weight file per document.
Finally, pick any two of those txt files and compute their cosine similarity.
'''
def getFileList(path):
    # Collect every non-hidden file name in the directory
    filelist = []
    for f in os.listdir(path):
        if f[0] != '.':
            filelist.append(f)
    return filelist, path

def segment(filename, path, segPath):
    # Read the raw document
    f = open(path + "/" + filename, 'r')
    text = f.read()
    f.close()

    if not os.path.exists(segPath):
        os.mkdir(segPath)

    # Segment the document with jieba (accurate mode)
    seg_list = jieba.cut(text, cut_all=False)
    result = []
    for seg in seg_list:
        seg = ''.join(seg.split())  # strip spaces and newlines inside a token
        r = re.search(r'\w+', seg)  # tokens with ASCII letters/digits/underscores
        if seg != '' and seg not in ('=', '[', ']', '(', ')') and not r:
            result.append(seg)

    # Drop stopwords (one per line in stopworda.txt) and keep only tokens
    # whose first character falls in the CJK Unified Ideographs range
    finalresult = []
    stopwords = set(line.strip().decode('utf-8') for line in open('stopworda.txt'))
    for word in result:
        if word in stopwords:
            continue
        if u'\u4e00' <= word <= u'\u9fa5':
            finalresult.append(word)

    # Save the segmented document, tokens separated by single spaces
    f = open(segPath + "/" + filename + "-seg.txt", "w+")
    f.write(' '.join(finalresult))
    f.close()
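
# A quick feel for what jieba.cut yields (the canonical example from jieba's
# README; the exact segmentation depends on the dictionary and jieba version):
#
#   seg = jieba.cut(u"我来到北京清华大学", cut_all=False)
#   print "/".join(seg)   # -> 我/来到/北京/清华大学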

# Read the segmented documents and compute tf-idf weights
def Tfidf(filelist, sFilePath, path):
    # Read every segmented document into the corpus
    corpus = []
    for ff in filelist:
        f = open(path + "/" + ff + "-seg.txt", 'r')
        corpus.append(f.read())
        f.close()

    # Count term frequencies, then reweight the counts into tf-idf
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    word = vectorizer.get_feature_names()  # the full corpus vocabulary
    weight = tfidf.toarray()               # rows: documents, columns: terms

    if not os.path.exists(sFilePath):
        os.mkdir(sFilePath)

    # Write one file per document; each line holds a term and its weight
    for i in range(len(weight)):
        print "writing the tf-idf of document", i, "into", sFilePath + "/" + string.zfill(i, 2) + ".txt"
        f = open(sFilePath + "/" + string.zfill(i, 2) + ".txt", 'w+')
        for j in range(len(word)):
            f.write(word[j] + "  " + str(weight[i][j]) + "\n")
        f.close()
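
# The CountVectorizer + TfidfTransformer pair above can be collapsed into a
# single TfidfVectorizer. A minimal sketch (this helper name is mine, not
# part of the original program); it yields the same vocabulary and matrix:
def tfidf_matrix(corpus):
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform(corpus)
    return vectorizer.get_feature_names(), tfidf.toarray()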

def coutcos(file1, file2):
    # Each line of a tf-idf file is "term  weight"; collect the weight column.
    # All files share the same vocabulary in the same order, so the two lists
    # line up as vectors over the corpus vocabulary.
    list1 = [float(x.split()[-1]) for x in open(file1).readlines()]
    list2 = [float(x.split()[-1]) for x in open(file2).readlines()]

    # Cosine similarity: dot(A, B) / (|A| * |B|)
    dot_product = 0.0
    normA = 0.0
    normB = 0.0
    for a, b in zip(list1, list2):
        dot_product += a * b
        normA += a ** 2
        normB += b ** 2
    if normA == 0.0 or normB == 0.0:
        return None
    return dot_product / ((normA * normB) ** 0.5)
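
# Worked example of the formula above on two tiny vectors:
#   A = [1, 2, 0], B = [2, 1, 1]
#   dot(A, B) = 1*2 + 2*1 + 0*1 = 4
#   |A| = sqrt(1 + 4 + 0) = sqrt(5), |B| = sqrt(4 + 1 + 1) = sqrt(6)
#   cos(A, B) = 4 / sqrt(30) ≈ 0.7303
# coutcos returns 1.0 for identical documents and 0.0 for documents that
# share no weighted terms.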

if __name__ == "__main__":
    # Folder for the tf-idf result files
    sFilePath = "C:/Users/llfang1/PycharmProjects/untitled2/corpus/tfidffile"
    # Folder for the segmented documents
    segPath = 'C:/Users/llfang1/PycharmProjects/untitled2/corpus/segfile'
    (allfile, path) = getFileList('C:/Users/llfang1/PycharmProjects/untitled2/corpus/allkeyword')
    for ff in allfile:
        print "Using jieba on " + ff
        segment(ff, path, segPath)
    Tfidf(allfile, sFilePath, segPath)
    file1 = sFilePath + "/" + "04.txt"
    file2 = sFilePath + "/" + "05.txt"
    similar = coutcos(file1, file2)
    print similar
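
For comparison: once the segmented corpus is in memory, scikit-learn can produce the same pairwise similarity without the intermediate tf-idf files. A minimal sketch, assuming `corpus` is the same list of space-joined segmented documents that Tfidf builds (indices 4 and 5 stand in for 04.txt and 05.txt):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def pairwise_cosine(corpus, i, j):
    # Vectorize the whole corpus, then compare two rows of the sparse matrix
    tfidf = TfidfVectorizer().fit_transform(corpus)
    return cosine_similarity(tfidf[i], tfidf[j])[0][0]

# e.g. pairwise_cosine(corpus, 4, 5) should agree with coutcos(file1, file2),
# since cosine similarity is invariant to how the tf-idf rows are normalized.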

Note: this program was adapted from a fellow developer's code, with modifications and some additions.


                                            