
VSM (Vector Space Model) Implementation

2016-03-26 11:33
Introduction

Code Implementation

Summary

1. Introduction

Retrieval mainly involves two core problems:

I. Similarity computation

II. Index construction

Reference for index construction:

/article/2807955.html

Here we focus on the first problem.

1.1 The overall workflow is shown in the figure below:



1.2 Within the whole workflow, steps 2 and 3 have a large impact on the final results, so they are critical.

The workflow of the classic VSM similarity model is shown in the figure below:



After step 3, every document has a unique representation over the dictionary (vocabulary), expressed as a long vector.
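To make the vector representation concrete, here is a minimal sketch (with made-up toy documents, not taken from this post) of turning two documents into term-frequency vectors over a shared vocabulary; it uses the same Counter idea as the code in section 2:

# Minimal illustration with toy data: two documents as TF vectors over one shared vocabulary
from collections import Counter

doc1 = "the cat sat on the mat".split()
doc2 = "the dog sat".split()

vocabulary = sorted(set(doc1) | set(doc2))    # shared dictionary: ['cat', 'dog', 'mat', 'on', 'sat', 'the']
tf1 = [Counter(doc1)[w] for w in vocabulary]  # [1, 0, 1, 1, 1, 2]
tf2 = [Counter(doc2)[w] for w in vocabulary]  # [0, 1, 0, 0, 1, 1]
print(tf1)
print(tf2)

Both documents now live in the same vector space, so any vector similarity measure can compare them directly.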

Reference for step 4:

/article/10504204.html
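Concretely, step 4 in both code listings below computes the cosine similarity of the two term-frequency vectors. Writing $d_1(w)$ and $d_2(w)$ for the counts of word $w$ over the shared vocabulary $V$, the value printed at the end is

$$\operatorname{sim}(d_1, d_2) = \frac{\sum_{w \in V} d_1(w)\, d_2(w)}{\sqrt{\sum_{w \in V} d_1(w)^2}\; \sqrt{\sum_{w \in V} d_2(w)^2}}$$

Note that dividing every entry by the same constant (the code divides by the global wordsCount) cancels out of this ratio, so it does not change the result.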

2. Code Implementation

#!/usr/bin/python
# copyright(c) youfuwen
# Date: 2016.03.26
# E-Mail: yfwen@bjtu.edu.cn
# first: count word frequencies
import math
import ast
from collections import Counter

wordsCount = 0  # total number of tokens seen (shared across both documents)


def CountKeyByWen(fileName1):
    global wordsCount
    f1 = open(fileName1, 'r')
    f2 = open(fileName2, 'r')
    table = {}
    for lines in f1:
        for line in lines.split(' '):
            if line != ' ' and table.has_key(line):
                table[line] += 1
                wordsCount += 1
            elif line != ' ':
                wordsCount += 1
                table[line] = 1
    dic = sorted(table.iteritems(), key=lambda asd: asd[1], reverse=True)
    # print len(dic)  # code for testing
    return dic


# second: create vocabulary
def CreateVocabulary(dic1=None, dic2=None):
    vocabulary = []
    for dicEle in dic1:
        if dicEle[0] not in vocabulary:
            vocabulary.append(dicEle[0])
    for dicEle in dic2:
        if dicEle[0] not in vocabulary:
            vocabulary.append(dicEle[0])
    # print len(vocabulary)  # code for testing
    return vocabulary


# third: build the document vector (output: a vector)
# In this code we just use TF for computing similarity
def ComputeVector(dic1=None, vocabulary=None):
    # 3.1 term frequencies were already counted (the global wordsCount holds the total)
    # 3.2 create the vector: zero entries for every vocabulary word, then add dic1's counts
    dicVector = {}
    for elem in vocabulary:
        dicVector[elem] = 0
    dicTemp1, dicTemp2 = Counter(dicVector), Counter(dic1)
    dicTemp = dict(dicTemp1 + dicTemp2)
    return dicTemp


# fourth: compute similarity
def ComputeSimilarity(dic1Vector=None, dic2Vector=None):
    x = 0.0   # numerator
    # denominator
    y1 = 0.0
    y2 = 0.0
    for k in dic1Vector:  # dic1Vector and dic2Vector have exactly the same keys
        temp1 = float(dic1Vector[k]) / float(wordsCount)
        temp2 = float(dic2Vector[k]) / float(wordsCount)
        x = x + (temp1 * temp2)
        y1 += pow(temp1, 2)
        y2 += pow(temp2, 2)
    return x / math.sqrt(y1 * y2)


if __name__ == '__main__':
    fileName1 = 'amanda_all.txt'
    fileName2 = 'amanda_all.txt'
    dic1 = CountKeyByWen(fileName1)
    dic2 = CountKeyByWen(fileName2)
    vocabulary = CreateVocabulary(dic1, dic2)
    dic1Vector = ComputeVector(dic1, vocabulary)
    dic2Vector = ComputeVector(dic2, vocabulary)
    for elem in dic1Vector:
        print "<" + elem[0], ',', str(elem[1]) + ">"
    sim = ComputeSimilarity(dic1Vector, dic2Vector)
    print sim


Tips: readers pointed out that the code above has problems, so I revised it; criticism and corrections are very welcome! The main changes are that CountKeyByWen now returns the frequency dictionary itself rather than a sorted list of (word, count) pairs, the document vector is built with a small union_dict helper instead of adding Counters, and the output loop prints whole words instead of single characters. The revised code is as follows:

#!/usr/bin/python
# copyright(c) youfuwen
# Date: 2016.03.29  revised version
# E-Mail: yfwen@bjtu.edu.cn
# first: count word frequencies
import math

wordsCount = 0  # total number of tokens seen (shared across both documents)


def CountKeyByWen(fileName1):
    global wordsCount
    f1 = open(fileName1, 'r')
    table = {}
    for lines in f1:
        for line in lines.split():  # split on any whitespace, dropping empty tokens and newlines
            if line in table:
                table[line] += 1
            else:
                table[line] = 1
            wordsCount += 1
    f1.close()
    # print len(table)  # code for testing
    return table


# second: create vocabulary
def CreateVocabulary(dic1=None, dic2=None):
    vocabulary = []
    for dicEle in dic1:
        if dicEle not in vocabulary:
            vocabulary.append(dicEle)
    for dicEle in dic2:
        if dicEle not in vocabulary:
            vocabulary.append(dicEle)
    # print len(vocabulary)  # code for testing
    return vocabulary


# third: build the document vector over the vocabulary
# In this code we just use TF for computing similarity
def union_dict(*objs):
    # merge several dicts, summing the values of any key that appears in more than one
    _keys = set(sum([list(obj.keys()) for obj in objs], []))
    _total = {}
    for _key in _keys:
        _total[_key] = sum([obj.get(_key, 0) for obj in objs])
    return _total


def ComputeVector(dic1=None, vocabulary=None):
    # 3.1 start from a zero vector over the whole vocabulary
    dicVector = {}
    for elem in vocabulary:
        dicVector[elem] = 0
    # 3.2 add the document's term frequencies on top of the zero vector
    dicTemp = union_dict(dicVector, dic1)
    return dicTemp


# fourth: compute (cosine) similarity
def ComputeSimilarity(dic1Vector=None, dic2Vector=None):
    x = 0.0   # numerator
    # denominator
    y1 = 0.0
    y2 = 0.0
    for k in dic1Vector:  # dic1Vector and dic2Vector have exactly the same keys
        temp1 = float(dic1Vector[k]) / float(wordsCount)
        temp2 = float(dic2Vector[k]) / float(wordsCount)
        x = x + (temp1 * temp2)
        y1 += pow(temp1, 2)
        y2 += pow(temp2, 2)
    return x / math.sqrt(y1 * y2)


if __name__ == '__main__':
    fileName1 = 'a.txt'
    fileName2 = 'b.txt'
    dic1 = CountKeyByWen(fileName1)
    dic2 = CountKeyByWen(fileName2)
    vocabulary = CreateVocabulary(dic1, dic2)
    dic1Vector = ComputeVector(dic1, vocabulary)
    dic2Vector = ComputeVector(dic2, vocabulary)
    for elem in dic1Vector:
        print "<" + elem, ',', str(dic1Vector[elem]) + ">"
    sim = ComputeSimilarity(dic1Vector, dic2Vector)
    print "similarity=" + str(sim)
#####################################
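The comments above mention TF-IDF, but both listings weight terms by raw frequency only. As a rough sketch of what adding the IDF weight could look like (my own addition, not part of the original post; the documents argument and the +1 smoothing are illustrative choices):

# Rough TF-IDF sketch (illustrative, not from the original post)
import math
from collections import Counter

def tfidf_vectors(documents):
    # documents: a list of token lists, e.g. [["the", "cat"], ["the", "dog"]]
    n = len(documents)
    counts = [Counter(doc) for doc in documents]
    vocabulary = sorted(set(w for doc in documents for w in doc))
    # document frequency: in how many documents does each word occur?
    df = {}
    for w in vocabulary:
        df[w] = sum(1 for c in counts if w in c)
    vectors = []
    for c in counts:
        total = float(sum(c.values()))
        vec = {}
        for w in vocabulary:
            tf = c[w] / total                        # normalized term frequency
            idf = math.log(float(n) / df[w]) + 1.0   # +1 keeps words that occur everywhere from vanishing
            vec[w] = tf * idf
        vectors.append(vec)
    return vocabulary, vectors

Each resulting vector can be fed to the same cosine-similarity computation; rare words then contribute more to the score than words that occur in every document.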


3. Summary

I. Everything, like the ways of heaven and earth, involves a process of accumulation; hard work is a necessary precondition for success. II. Let us keep working hard together, and tomorrow will be better!