您的位置:首页 > 其它

推荐系统小记20150123

2015-01-23 23:09 148 查看
一.此前的数据集

movies.dat

1::Toy Story (1995)::Animation|Children's|Comedy

2::Jumanji (1995)::Adventure|Children's|Fantasy

...

ratings.dat

1::1193::5::978300760

1::661::3::978302109

...

users.dat(记录用户的属性,本文的推荐算法不需要用到)

1::F::1::10::48067

2::M::56::16::70072

...

二.当前使用的、含有信任矩阵的数据集(初始信任矩阵很稀疏,且不包含项目名称)

ratings_data.txt

1 100 4

1 101 5

...

trust_data.txt

22605 42915 1

22605 5052 1

...

三.Python代码,初步版本,试用数据集1

from math import sqrt

def sim_distance(prefs, person1, person2):
    """Return a distance-based similarity score for person1 and person2.

    prefs maps each person to a dict of {item: rating}.  The score is
    1 / (1 + sum of squared rating differences) taken over the items both
    people have rated, so identical ratings give 1.0 and larger
    differences approach 0.

    Returns 0 when the two people have no rated items in common.
    Raises KeyError if either person is missing from prefs.
    """
    ratings1 = prefs[person1]
    ratings2 = prefs[person2]

    # Items rated by both people.
    shared = [item for item in ratings1 if item in ratings2]
    if not shared:
        return 0

    sum_of_squares = sum((ratings1[item] - ratings2[item]) ** 2
                         for item in shared)

    # 1.0 forces true division: with integer ratings, Python 2's "/" would
    # truncate every non-identical score down to 0.
    return 1.0 / (1 + sum_of_squares)

def sim_pearson(prefs, p1, p2):
    """Return the Pearson correlation coefficient for p1 and p2.

    prefs maps each person to a dict of {item: rating}.  The correlation
    is computed over the items both people have rated.

    Returns 0 when there are no items in common, or when either person's
    ratings over the shared items have no variance (the coefficient is
    undefined there).  Raises KeyError if either person is missing from
    prefs.
    """
    ratings1 = prefs[p1]
    ratings2 = prefs[p2]

    # Items rated by both people.
    shared = [item for item in ratings1 if item in ratings2]
    if not shared:
        return 0

    # float() keeps every "/ n" below a true division even when all
    # ratings are integers under Python 2.
    n = float(len(shared))

    # Sums of the ratings, of their squares, and of their products.
    sum1 = sum(ratings1[it] for it in shared)
    sum2 = sum(ratings2[it] for it in shared)
    sum1_sq = sum(ratings1[it] ** 2 for it in shared)
    sum2_sq = sum(ratings2[it] ** 2 for it in shared)
    p_sum = sum(ratings1[it] * ratings2[it] for it in shared)

    num = p_sum - sum1 * sum2 / n
    var1 = sum1_sq - sum1 ** 2 / n
    var2 = sum2_sq - sum2 ** 2 / n

    # Rounding can leave a variance term at zero or slightly negative;
    # sqrt() of a negative product would raise ValueError.
    if var1 <= 0 or var2 <= 0:
        return 0

    return num / sqrt(var1 * var2)

def topMatches(prefs, td, person, n=10, similarity=sim_pearson):
    """Return the n best matches for person as (score, other) tuples.

    prefs maps each person to {item: rating}; td maps each person to
    {other: trust value}.  The score for every other person in prefs is
    0.7 * similarity(prefs, person, other), plus 0.3 * td[person][other]
    when person has a trust entry for that other.

    The result is sorted by score, highest first, and cut to n entries.
    The number of results and the similarity function are optional.
    """
    # The trust data is sparse: a person with no outgoing trust has no
    # row in td at all, so fall back to an empty row instead of raising.
    trusted = td.get(person, {})

    scores = []
    for other in prefs:
        if other == person:
            continue
        sim = 0.7 * similarity(prefs, person, other)
        if other in trusted:
            sim += 0.3 * trusted[other]
        scores.append((sim, other))

    scores.sort(reverse=True)
    return scores[:n]

def trustPropagate(td):
    """Propagate direct trust one hop through the trust dict, in place.

    td maps each person to {other: trust value}, where a value of 1 is a
    direct trust statement.  If p1 directly trusts p2 and p2 directly
    trusts p3, then p1 is given a reduced trust of 0.5 in p3.

    Existing entries are never overwritten, so a direct trust of 1 is not
    downgraded to 0.5, and nobody is given trust in themselves.  Only
    direct (== 1) edges are followed, so propagated 0.5 edges do not
    cascade and the result does not depend on dict iteration order.

    Returns None; td is modified in place.
    """
    for p1 in td:
        row = td[p1]
        # Snapshot the direct trustees first, because row is extended below.
        direct = [p2 for p2 in row if row[p2] == 1]
        for p2 in direct:
            if p2 == p1:
                continue
            # The matrix is sparse: p2 may have no outgoing trust row.
            for p3, trust in td.get(p2, {}).items():
                if trust == 1 and p3 != p1 and p3 not in row:
                    row[p3] = 0.5  # reduced trust for a two-hop path

def getRecommendations(prefs, td, person, similarity=sim_pearson):
    """Return up to 10 recommendations for person as (score, item) tuples.

    prefs maps each person to {item: rating}; td maps each person to
    {other: trust value}.  The neighbours are the (score, other) pairs
    returned by topMatches(prefs, td, person, similarity=similarity).

    For every item a neighbour has rated and person has not (or has
    rated 0), the predicted score is

        person's average rating
        + sum(sim * (neighbour's rating - neighbour's average rating))
          / sum(abs(sim))

    The result is sorted by predicted score, highest first.  The averages
    and the neighbour list are printed as debug output.
    """
    own = prefs[person]

    # Person's average rating; float() avoids Python 2 integer division
    # and the guard avoids dividing by zero for a person with no ratings.
    if own:
        personEverage = sum(own.values()) / float(len(own))
    else:
        personEverage = 0.0
    # %-formatting yields the same text under Python 2 and Python 3.
    print('personEverage= %s' % personEverage)

    totals = {}
    simSums = {}

    # Forward the caller's similarity function instead of silently
    # falling back to topMatches' default.
    scores = topMatches(prefs, td, person, similarity=similarity)
    for sim, other in scores:
        print('     %s   %s' % (sim, other))

        theirs = prefs[other]
        if not theirs:
            # No ratings: nothing to contribute, and no average to compute.
            continue

        # Neighbour's average rating, used to mean-centre their ratings.
        otherEverage = sum(theirs.values()) / float(len(theirs))
        print(otherEverage)

        for item in theirs:
            # Only score items person has not rated yet.
            if item not in own or own[item] == 0:
                # Similarity * mean-centred rating.
                totals.setdefault(item, 0)
                totals[item] += (theirs[item] - otherEverage) * sim
                # Normalise by the absolute similarity: the scores can be
                # negative, and a signed sum could cancel to zero or flip
                # the sign of the prediction.
                simSums.setdefault(item, 0)
                simSums[item] += abs(sim)

    # Create the normalized list, skipping items with no usable weight.
    rankings = []
    for item, total in totals.items():
        weight = simSums[item]
        if weight == 0:
            continue
        rankings.append((personEverage + total / weight, item))

    # Return the sorted list, best first.
    rankings.sort(reverse=True)
    return rankings[:10]

def loadRatings(path='C:/data'):
    """Load the ratings file into a nested dict.

    Reads path + '/ratings_data.dat', where each non-blank line holds
    whitespace-separated "user item rating" fields (extra trailing fields
    are ignored).

    Returns {user: {item: rating}} with user and item kept as strings and
    rating converted to float.  Raises ValueError for a non-blank line
    with fewer than three fields or a non-numeric rating.
    """
    prefs = {}
    # "with" closes the file even if a line fails to parse.
    with open(path + '/ratings_data.dat') as ratings_file:
        for line in ratings_file:
            fields = line.split()
            if not fields:
                continue  # skip blank lines
            user, movieid, rating = fields[:3]
            # Parse the whole field: taking only its first character would
            # turn "10" into 1 and "4.5" into 4.
            prefs.setdefault(user, {})[movieid] = float(rating)
    return prefs

def loadTrustData(path='C:/data'):
    """Load the trust file into a nested dict.

    Reads path + '/trust_data.dat', where each non-blank line holds
    whitespace-separated "truster trustee value" fields.  Leading
    whitespace on a line is ignored, so lines are accepted both with and
    without a leading space.

    Returns {truster: {trustee: value}} with the user ids kept as strings
    and value converted to float.  Raises ValueError for a non-blank line
    with fewer than three fields or a non-numeric value.
    """
    td = {}
    # "with" closes the file even if a line fails to parse.
    with open(path + '/trust_data.dat') as trust_file:
        for line in trust_file:
            # split() with no argument drops the empty leading field that
            # split(' ') produces for a line starting with a space.
            fields = line.split()
            if not fields:
                continue  # skip blank lines
            user1, user2, rating = fields[:3]
            td.setdefault(user1, {})[user2] = float(rating)
    return td

def test(prefs, user='1'):
    """Debug helper: print every item rated by user, then the item count.

    prefs maps each person to {item: rating}.  user defaults to '1', the
    id this helper was originally hard-coded to.  Raises KeyError if user
    is not in prefs.
    """
    items = prefs[user]
    for item in items:
        # A single parenthesised argument prints identically under
        # Python 2 and Python 3.
        print(item)
    print(len(items))
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: