您的位置:首页 > 其它

Netflix-协同过滤矩阵分解之随机梯度下降(SGD)

2013-04-03 10:34 169 查看
最近开始搞协同过滤CF,觉得自己真是水到家,先在小的netflix数据集上做做SGD

主要参考文章:

Netflix Update: Try This at Home:http://sifter.org/~simon/journal/20061211.html

数据集形式:头两行描述可以忽略跳过,第三行依次是行数,列数,打分个数。

打分矩阵用稀疏的形式存储,按以下形式存储(row_index,col_index,rating)。

13 1 1表示第13行第1列位置的元素是1

%%MatrixMarket matrix coordinate real general
% Generated 28-Aug-2011
95526 3561 3298163
13 1  1
83 1  2
127 1  2
136 1  5
137 1  4
138 1  4
139 1  1
……


验证集也是同样的文件形式,只是不知道打分(最后一列)

%%MatrixMarket matrix coordinate real general
% Generated 28-Aug-2011
95526 3561 545177
135 1  3
140 1  5
141 1  4
154 1  3
162 1  4
167 1  4
169 1  4
……


程序很简单,基本思路如下

(1)确定参数,学习率lrate,正则化系数lambda,特征个数K

(2)随机初始化用户特征fuser,电影特征fmovie

(3)在训练集上进行训练

(4)计算validation set上的rmse

(5)重复(3)(4)步至收敛

#include<iostream>
#include<vector>
#include<stdio.h>
#include<time.h>
#include<string.h>
#include<stdlib.h>
#include<cmath>
using namespace std;

// Matrix dimensions and entry count, filled from the third line of the
// training file by LoadData(): number of rows (users), number of columns
// (movies) and number of stored ratings.
int user=0;
int movie=0;
int rating=0;
// Not referenced by any function visible in this file.
int* usercnt;
int* moviecnt;
// Latent feature tables allocated by InitFeatureVector(): fuser[u] and
// fmovie[m] each point to K doubles. Ids are 1-based, so slot 0 is never
// allocated.
double** fuser;
double** fmovie;
// Not referenced by any function visible in this file.
int ratingcnt=0;
// Number of latent features per user/movie vector.
int K=20;
// Regularization weight applied to each feature in train().
double lambda=0.02;
// Step size (learning rate) of the SGD update in train().
double lrate=0.001;

// Reads the header of the training file and stores the matrix dimensions in
// the globals user, movie and rating.
//
// The first two lines of the file are descriptive and are skipped; the third
// line holds "<rows> <cols> <entries>". On any failure an error is printed
// and the globals are left untouched, so a partially parsed header can never
// leave them in an inconsistent state.
void LoadData()
{
    FILE* fin=fopen("/home/tuywen/graphchi/netflixsmall/smallnetflix_mm","r");
    if(fin==NULL)
    {
        printf("Error:opening the file failed!\n");
        return;
    }

    // Stack buffer: nothing to free, unlike the former heap allocation.
    char header[100];
    int rows=0,cols=0,entries=0;
    const bool ok=fgets(header,sizeof(header),fin)!=NULL
        &&fgets(header,sizeof(header),fin)!=NULL
        &&fscanf(fin,"%d%d%d",&rows,&cols,&entries)==3;
    fclose(fin);

    if(!ok)
    {
        printf("Error:reading the file header failed!\n");
        return;
    }

    user=rows;
    movie=cols;
    rating=entries;
    printf("[user]=%d\n[moive]=%d\n[rating]=%d\n",user,movie,rating);
}

// Allocates the user and movie feature tables and fills every feature with a
// value drawn from drand48(), after seeding the generator with the current
// time.
//
// Both tables have one extra slot because ids are 1-based; slot 0 is left
// unallocated. Relies on the globals user, movie and K already being set.
void InitFeatureVector()
{
    printf("Begin initilzing the feature vectors...\n");
    srand48(time(NULL));

    fuser=new double*[user+1];
    fmovie=new double*[movie+1];

    // User vectors first, then movie vectors, so the random sequence is
    // consumed in a fixed order.
    for(int u=1;u<=user;++u)
    {
        double* row=new double[K];
        for(double* p=row;p<row+K;++p)
            *p=drand48();
        fuser[u]=row;
    }

    for(int m=1;m<=movie;++m)
    {
        double* row=new double[K];
        for(double* p=row;p<row+K;++p)
            *p=drand48();
        fmovie[m]=row;
    }

    printf("Finish  initilzing the feature vectors...\n");
}

// Returns the predicted rating of movie `movieid` by user `userid`: the dot
// product of their K-dimensional feature vectors, limited to the range [1, 5].
double predict(int userid,int movieid)
{
    const double* uvec=fuser[userid];
    const double* mvec=fmovie[movieid];

    double dot=0;
    for(int k=0;k<K;++k)
        dot+=uvec[k]*mvec[k];

    // Lower bound first, then upper bound. The comparisons are written so
    // that a NaN dot product falls back to the lower bound.
    const double floored=(1.0<dot)?dot:1.0;
    return (floored<5.0)?floored:5.0;
}

// Performs one regularized SGD step on a single observed rating.
//
//   userid  - 1-based row index into fuser
//   movieid - 1-based row index into fmovie
//   rate    - the observed rating
//
// For every feature i the user and movie values are moved along the error
// gradient with step lrate and regularization weight lambda.
void train(int userid,int movieid,int rate)
{
    const double err=rate-predict(userid,movieid);
    double* const uvec=fuser[userid];
    double* const mvec=fmovie[movieid];

    for(int i=0;i<K;i++)
    {
        // Snapshot both values before updating: the movie update must use the
        // user feature from BEFORE this step, not the freshly updated one.
        const double uold=uvec[i];
        const double mold=mvec[i];
        uvec[i]+=lrate*(err*mold-lambda*uold);
        mvec[i]+=lrate*(err*uold-lambda*mold);
    }
}

// Runs one SGD pass over the training file, calling train() once for every
// "<user> <movie> <rating>" triple that follows the three header lines.
//
// The header is parsed into locals so the global dimensions that sized the
// feature tables are not overwritten. Reading stops at the first line that is
// not a complete triple. Triples whose ids fall outside [1, user] x [1, movie]
// are skipped instead of indexing outside the feature tables.
void Interation()
{
    printf("Begin iterating...\n");
    FILE* fin=fopen("/home/tuywen/graphchi/netflixsmall/smallnetflix_mm","r");
    if(fin==NULL)
    {
        printf("Error:opening the file failed!\n");
        return;
    }

    char header[100];
    int rows=0,cols=0,entries=0;
    if(fgets(header,sizeof(header),fin)==NULL
        ||fgets(header,sizeof(header),fin)==NULL
        ||fscanf(fin,"%d%d%d",&rows,&cols,&entries)!=3)
    {
        printf("Error:reading the file header failed!\n");
        fclose(fin);
        return;
    }

    int u,m,r;
    int skipped=0;
    // Require all three fields: comparing against EOF alone would loop
    // forever on a non-numeric token, because fscanf then returns 0.
    while(fscanf(fin,"%d%d%d",&u,&m,&r)==3)
    {
        if(u<1||u>user||m<1||m>movie)
        {
            ++skipped;
            continue;
        }
        train(u,m,r);
    }
    fclose(fin);

    if(skipped>0)
        printf("Warning:skipped %d ratings with out-of-range ids\n",skipped);
    printf("Finish iterating...\n");
}

// Computes and prints the root-mean-square error of predict() over the
// training file.
//
// At most the number of entries declared in the header is read. If the file
// ends early or contains a malformed line, the RMSE is computed over the
// triples actually read, so stale values are never counted. Nothing is
// printed as a result when no rating could be read.
void TrainRMSE()
{
    FILE* fin=fopen("/home/tuywen/graphchi/netflixsmall/smallnetflix_mm","r");
    if(fin==NULL)
    {
        printf("Error:opening the file failed!\n");
        return;
    }

    char header[100];
    int rows=0,cols=0,declared=0;
    if(fgets(header,sizeof(header),fin)==NULL
        ||fgets(header,sizeof(header),fin)==NULL
        ||fscanf(fin,"%d%d%d",&rows,&cols,&declared)!=3)
    {
        printf("Error:reading the file header failed!\n");
        fclose(fin);
        return;
    }

    double sqsum=0;
    int seen=0;
    int u,m,r;
    while(seen<declared&&fscanf(fin,"%d%d%d",&u,&m,&r)==3)
    {
        const double err=predict(u,m)-r;
        sqsum+=err*err;
        ++seen;
    }
    fclose(fin);

    // Avoid dividing by zero on an empty or unreadable rating list.
    if(seen==0)
    {
        printf("Error:no ratings were read!\n");
        return;
    }
    printf("Training Set RMSE:%lf\n",sqrt(sqsum/seen));
}

// Computes and prints the root-mean-square error of predict() over the
// validation file.
//
// At most the number of entries declared in the header is read. If the file
// ends early or contains a malformed line, the RMSE is computed over the
// triples actually read, so stale values are never counted. Nothing is
// printed as a result when no rating could be read.
void ValidationRMSE()
{
    FILE* fin=fopen("/home/tuywen/graphchi/netflixsmall/smallnetflix_mme","r");
    if(fin==NULL)
    {
        printf("Error:opening the file failed!\n");
        return;
    }

    char header[100];
    int rows=0,cols=0,declared=0;
    if(fgets(header,sizeof(header),fin)==NULL
        ||fgets(header,sizeof(header),fin)==NULL
        ||fscanf(fin,"%d%d%d",&rows,&cols,&declared)!=3)
    {
        printf("Error:reading the file header failed!\n");
        fclose(fin);
        return;
    }

    double sqsum=0;
    int seen=0;
    int u,m,r;
    while(seen<declared&&fscanf(fin,"%d%d%d",&u,&m,&r)==3)
    {
        const double err=predict(u,m)-r;
        sqsum+=err*err;
        ++seen;
    }
    fclose(fin);

    // Avoid dividing by zero on an empty or unreadable rating list.
    if(seen==0)
    {
        printf("Error:no ratings were read!\n");
        return;
    }
    printf("Validation Set RMSE:%lf\n",sqrt(sqsum/seen));
}

// Entry point: loads the matrix dimensions, initializes the feature vectors,
// then runs 10 training passes, printing the training and validation RMSE
// after each pass. Finally prints the elapsed wall-clock seconds for the
// whole run (the label says "Load time", but the interval covers everything).
int main()
{
    const time_t t1=time(NULL);
    LoadData();
    InitFeatureVector();
    for(int epoch=0;epoch<10;epoch++)
    {
        Interation();
        TrainRMSE();
        ValidationRMSE();
    }
    const time_t t2=time(NULL);
    // time_t is not int on common platforms; convert explicitly so the
    // argument matches the format specifier.
    printf("Load time:%ld\n",static_cast<long>(t2-t1));
    return 0;
}


  
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: