推荐系统小记20150123
2015-01-23 23:09
148 查看
一.此前的数据集
movies.dat
1::Toy Story (1995)::Animation|Children's|Comedy
2::Jumanji (1995)::Adventure|Children's|Fantasy
...
ratings.dat
1::1193::5::978300760
1::661::3::978302109
...
users.dat 记录用户的属性,不需要
1::F::1::10::48067
2::M::56::16::70072
...
二.当前的含有信任矩阵的数据集,初始信任很稀疏,不包含项目名字
ratings_data.txt
1 100 4
1 101 5
...
trust_data.txt
22605 42915 1
22605 5052 1
...
三.Python代码,初步版本,试用数据集1
movies.dat
1::Toy Story (1995)::Animation|Children's|Comedy
2::Jumanji (1995)::Adventure|Children's|Fantasy
...
ratings.dat
1::1193::5::978300760
1::661::3::978302109
...
users.dat 记录用户的属性,不需要
1::F::1::10::48067
2::M::56::16::70072
...
二.当前的含有信任矩阵的数据集,初始信任很稀疏,不包含项目名字
ratings_data.txt
1 100 4
1 101 5
...
trust_data.txt
22605 42915 1
22605 5052 1
...
三.Python代码,初步版本,试用数据集1(注:下面的代码实际读取的是第二节中以空格分隔的评分/信任数据格式,文件名为 ratings_data.dat 和 trust_data.dat)
from math import sqrt

# Relative weights used by topMatches when blending rating similarity with
# explicit trust into a single neighbour score.
_SIMILARITY_WEIGHT = 0.7
_TRUST_WEIGHT = 0.3


def _shared_items(prefs, p1, p2):
    """Return the list of items rated by both p1 and p2."""
    return [item for item in prefs[p1] if item in prefs[p2]]


def _mean_rating(ratings):
    """Return the mean of an {item: rating} dict, or 0 when it is empty."""
    if not ratings:
        return 0
    return sum(ratings.values()) / float(len(ratings))


def _load_nested(filename):
    """Read 'key1 key2 value' lines into a {key1: {key2: float(value)}} dict.

    Fields are split on any run of whitespace, so a leading separator or
    repeated spaces do not shift the columns. Blank lines are skipped; a
    non-blank line with fewer than three fields raises ValueError.
    """
    table = {}
    # The context manager closes the file even if a line fails to parse.
    with open(filename) as handle:
        for line in handle:
            fields = line.split()
            if not fields:
                continue
            first, second, value = fields[:3]
            # Parse the whole field: taking only its first character would
            # turn "10" into 1.0 and "4.5" into 4.0.
            table.setdefault(first, {})[second] = float(value)
    return table


def sim_distance(prefs, person1, person2):
    """Return a distance-based similarity score for person1 and person2.

    The score is 1 / (1 + sum of squared rating differences) over the items
    both people rated, and 0 when they have no rated item in common.

    Raises:
        KeyError: if either person is missing from prefs.
    """
    shared = _shared_items(prefs, person1, person2)
    if not shared:
        return 0
    sum_of_squares = sum(
        pow(prefs[person1][item] - prefs[person2][item], 2) for item in shared
    )
    # 1.0 keeps the division floating-point even for integer ratings.
    return 1.0 / (1 + sum_of_squares)


def sim_pearson(prefs, p1, p2):
    """Return the Pearson correlation coefficient for p1 and p2.

    Only items rated by both users are considered. Returns 0 when there is
    no common item or when either user's common ratings have no variance.

    Raises:
        KeyError: if either user is missing from prefs.
    """
    shared = _shared_items(prefs, p1, p2)
    if not shared:
        return 0
    n = float(len(shared))

    ratings1 = [prefs[p1][it] for it in shared]
    ratings2 = [prefs[p2][it] for it in shared]

    sum1 = sum(ratings1)
    sum2 = sum(ratings2)
    sum1Sq = sum(pow(r, 2) for r in ratings1)
    sum2Sq = sum(pow(r, 2) for r in ratings2)
    pSum = sum(a * b for a, b in zip(ratings1, ratings2))

    num = pSum - (sum1 * sum2 / n)
    den_squared = (sum1Sq - pow(sum1, 2) / n) * (sum2Sq - pow(sum2, 2) / n)
    # Rounding can push a zero variance slightly negative; treat that like
    # the zero case instead of letting sqrt() raise ValueError.
    if den_squared <= 0:
        return 0
    return num / sqrt(den_squared)


def topMatches(prefs, td, person, n=10, similarity=sim_pearson):
    """Return the best matches for person from the prefs dictionary.

    Every other user is scored as 0.7 * similarity, plus 0.3 * trust when
    td holds a trust value from person to that user.

    Args:
        prefs: {user: {item: rating}}.
        td: {user: {other_user: trust}}; users without a row are allowed.
        person: the user to find neighbours for.
        n: number of results (optional).
        similarity: function(prefs, a, b) returning a score (optional).

    Returns:
        Up to n (score, user) tuples, highest score first.
    """
    # Trust data is sparse: a user may have no row at all.
    trusted = td.get(person, {})
    scores = []
    for other in prefs:
        if other == person:
            continue
        sim = _SIMILARITY_WEIGHT * similarity(prefs, person, other)
        if other in trusted:
            sim += _TRUST_WEIGHT * trusted[other]
        scores.append((sim, other))
    scores.sort(reverse=True)
    return scores[0:n]


def trustPropagate(td, decay=0.5):
    """Propagate direct trust one hop, in place.

    If p1 directly trusts p2 (value 1) and p2 directly trusts p3 (value 1),
    p1 gains a reduced trust of `decay` in p3. Existing entries of p1 are
    never overwritten, and no user gains trust in themselves.

    Args:
        td: {user: {other_user: trust}}; modified in place.
        decay: trust value assigned to propagated edges (default 0.5).
    """
    # Snapshot the direct edges first, so entries added below are neither
    # propagated again nor mutate a dict while it is being iterated.
    direct = {}
    for user, trusted in td.items():
        direct[user] = [other for other, value in trusted.items() if value == 1]

    for p1, friends in direct.items():
        for p2 in friends:
            # p2 may trust nobody and therefore have no row.
            for p3 in direct.get(p2, ()):
                if p3 != p1 and p3 not in td[p1]:
                    td[p1][p3] = decay


def getRecommendations(prefs, td, person, similarity=sim_pearson, n=10):
    """Return recommendations for person from their top-matching users.

    For each item the person has not rated (or rated 0), the prediction is
    the person's mean rating plus the similarity-weighted average of the
    neighbours' deviations from their own mean ratings.

    Progress information is printed to standard output.

    Args:
        prefs: {user: {item: rating}}.
        td: {user: {other_user: trust}}, forwarded to topMatches.
        person: the user to recommend for.
        similarity: similarity function forwarded to topMatches.
        n: maximum number of recommendations returned (default 10).

    Returns:
        Up to n (predicted_rating, item) tuples, highest prediction first.

    Raises:
        KeyError: if person is missing from prefs.
    """
    seen = prefs[person]
    personEverage = _mean_rating(seen)
    print('personEverage= %s' % personEverage)

    totals = {}
    simSums = {}
    # Forward the caller's similarity function; otherwise the argument
    # would have no effect on the result.
    for sim, other in topMatches(prefs, td, person, similarity=similarity):
        print('   %s    %s' % (sim, other))
        otherEverage = _mean_rating(prefs[other])
        print(otherEverage)
        # A zero-weight neighbour contributes nothing and must not create a
        # zero denominator for an item.
        if sim == 0:
            continue
        for item, rating in prefs[other].items():
            # Only score items the person has not rated yet.
            if item not in seen or seen[item] == 0:
                totals[item] = totals.get(item, 0) + (rating - otherEverage) * sim
                # Normalise by |sim| so negative similarities cannot cancel
                # the denominator or flip the sign of the prediction.
                simSums[item] = simSums.get(item, 0) + abs(sim)

    rankings = [
        (personEverage + total / simSums[item], item)
        for item, total in totals.items()
    ]
    rankings.sort(reverse=True)
    return rankings[0:n]


def loadRatings(path='C:/data'):
    """Load '<path>/ratings_data.dat' into {user: {item: rating}}.

    Each non-blank line holds 'user item rating' separated by whitespace.
    User and item ids are kept as strings; ratings become floats.
    """
    # NOTE(review): the dataset description lists this file with a .txt
    # extension - confirm the name used on disk.
    return _load_nested(path + '/ratings_data.dat')


def loadTrustData(path='C:/data'):
    """Load '<path>/trust_data.dat' into {user1: {user2: trust}}.

    Each non-blank line holds 'user1 user2 trust' separated by whitespace;
    a leading separator before user1 is tolerated.
    """
    # NOTE(review): assumes three data columns per line, as in the sample
    # rows of the dataset description - confirm against the real file.
    return _load_nested(path + '/trust_data.dat')


def test(prefs):
    """Print every item rated by user '1', followed by the item count."""
    for i in prefs['1']:
        print(i)
    print(len(prefs['1']))


# Debug helper, not a unit test: stop test runners that collect functions
# by name from picking it up.
test.__test__ = False
相关文章推荐
- 小记:利用单例模式的提升系统整体性能 推荐
- 动态模版系统/引擎(开源)推荐
- 嵌入式系统好书推荐
- VS.NET环境下实现日志系统的几种方式 推荐
- 文章推荐系统(三)
- ZFS大大提高文件系统的稳定性 推荐
- 文章推荐系统(二)
- 【蛙蛙推荐】:由改进一个老旧系统想到的
- 推荐文章系统(一)
- 推荐文章系统(一)
- 蛙蛙推荐:偶做的用户管理系统
- 推荐一个非常不错的.NET系统--C1协同平台
- 推荐一本好书《深入理解计算机系统 Ccomputer Systems A Programmer's Perspective》
- [推荐]网络上通用的调查答卷系统-XML做数据库(将DataSet转化成字符串)
- 日常小记--关于系统演示
- [温润推荐]计算机系统集成项目管理
- WindowsNT/2K/XP/2K3系统实用工具集【推荐】
- 真诚推荐:我写的免疫32种如3721、百度垃圾组件的系统工具! (摘自博客园)
- java Blog系统推荐-roller
- 谈电信领域网管系统的应用和发展 推荐