多线程版本的Item_KNN的C语言实现
2015-05-11 15:04
429 查看
这里模仿了Word2vec里面构建Hash索引的思想。版本1里面由于IO密集型和CPU密集型工作同时进行,导致效率很低;所以在版本2中,所有线程首先完成CPU密集型的计算,然后再由主线程把结果写到文件中。经过测试,在6核(23虚拟核)的服务器上,开100个线程执行效率最佳。
版本1:
版本2:
版本1:
/*
 * Multithreaded item-based KNN, version 1.
 *
 * Input  ("data_1w.txt"): one line per user, space separated:
 *         <userToken> <itemId> <itemId> ...
 *         The first character of <userToken> is skipped and the rest is
 *         read as a decimal user id.
 * Output ("output.txt"): for every item, up to SIMILARITY_ITEM lines
 *         "<itemId> <neighbourItemId> <similarity>", where
 *         similarity = common / sqrt(|users(i)| * |users(j)|) and only pairs
 *         with more than MinCommonUsers common users are considered.
 *
 * Each worker thread handles a contiguous slice of the item array and writes
 * its results to its own temporary file; FileJoin() concatenates those files.
 */
#include <math.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define MaxString 50
#define MaxUserSize (1024 * 1024 * 10)
#define SIMILARITY_ITEM 30
#define MaxLen 100
#define MaxTokens 10000   /* maximum number of tokens kept per input line */
#define MinCommonUsers 5  /* a pair needs MORE than this many common users */

/* Per-thread temporary output files, indexed by thread id. */
char filename[20][15] = {
    "output1.txt",  "output2.txt",  "output3.txt",  "output4.txt",
    "output5.txt",  "output6.txt",  "output7.txt",  "output8.txt",
    "output9.txt",  "output10.txt", "output11.txt", "output12.txt",
    "output13.txt", "output14.txt", "output15.txt", "output16.txt",
    "output17.txt", "output18.txt", "output19.txt", "output20.txt"};

const int item_hash_size = 30000000;

struct itemInfo {
    char itemId[MaxString];
    int totalUser;                 /* number of valid entries in userList */
    unsigned long long *userList;  /* ids of the users who have this item */
    int max_user;                  /* allocated capacity of userList */
};

unsigned long long max_item = 1000, item_size = 0;
struct itemInfo *item;            /* item table, item_size entries in use */
int *item_hash;                   /* open-addressing table: slot -> item index, -1 = empty */
char str1[MaxTokens][MaxString];  /* tokens of the line currently being parsed */
int num_threads = 20;
char str2[MaxLen];                /* copy buffer used by FileJoin */

/* Prints msg to stderr and terminates the process. */
static void Die(const char *msg)
{
    fprintf(stderr, "%s\n", msg);
    exit(1);
}

/* Returns the hash slot (0 .. item_hash_size-1) at which probing for itemId starts. */
int GetWordHash(const char *itemId)
{
    unsigned long long hash = 0;
    for (const char *p = itemId; *p != '\0'; p++)
        hash = hash * 257 + *p;
    return (int)(hash % item_hash_size);
}

/* Returns the index of itemId in item[], or -1 if it has not been added. */
int SearchItem(const char *itemId)
{
    unsigned int hash = GetWordHash(itemId);
    while (item_hash[hash] != -1) {
        if (strcmp(itemId, item[item_hash[hash]].itemId) == 0)
            return item_hash[hash];
        hash = (hash + 1) % item_hash_size; /* linear probing */
    }
    return -1;
}

/*
 * Appends a new item whose first user is userId, registers it in the hash
 * table and returns its index in item[]. Terminates the process when memory
 * runs out or the hash table is full.
 */
int AddItemIdToItem(const char *itemId, unsigned long long userId)
{
    /* Keep at least one empty slot, otherwise probing would never terminate. */
    if (item_size + 1 >= (unsigned long long)item_hash_size)
        Die("item hash table is full.");

    struct itemInfo *fresh = &item[item_size];
    fresh->max_user = 100;
    fresh->userList = calloc((size_t)fresh->max_user, sizeof *fresh->userList);
    if (fresh->userList == NULL)
        Die("userList allocate failed.");
    snprintf(fresh->itemId, sizeof fresh->itemId, "%s", itemId);
    fresh->userList[0] = userId;
    fresh->totalUser = 1;
    item_size++;

    /* Grow the item table ahead of need. */
    if (item_size + 2 >= max_item) {
        unsigned long long capacity = max_item + 1000;
        struct itemInfo *grown = realloc(item, capacity * sizeof *grown);
        if (grown == NULL)
            Die("item reallocate failed.");
        item = grown;
        max_item = capacity;
    }

    unsigned int hash = GetWordHash(itemId);
    while (item_hash[hash] != -1)
        hash = (hash + 1) % item_hash_size;
    item_hash[hash] = (int)(item_size - 1);
    return (int)(item_size - 1);
}

/*
 * Splits line on spaces into str1[] and returns the number of tokens.
 * Parsing stops at CR, LF or the terminating NUL. Empty tokens are skipped,
 * tokens longer than MaxString-1 characters are truncated, and tokens beyond
 * MaxTokens are ignored.
 */
static int SplitLine(const char *line)
{
    int count = 0;
    const char *p = line;
    while (count < MaxTokens) {
        while (*p == ' ')
            p++;
        if (*p == '\0' || *p == '\n' || *p == '\r')
            break;
        int n = 0;
        while (*p != '\0' && *p != ' ' && *p != '\n' && *p != '\r') {
            if (n < MaxString - 1)
                str1[count][n++] = *p;
            p++;
        }
        str1[count][n] = '\0';
        count++;
    }
    return count;
}

/* Skips the first character of token and reads the rest as a decimal number. */
static unsigned long long ParseUserId(const char *token)
{
    unsigned long long value = 0;
    size_t n = strlen(token);
    for (size_t j = 1; j < n; j++)
        value = value * 10 + (token[j] - '0');
    return value;
}

/* Appends userId to the user list of an existing item, growing it if needed. */
static void AppendUser(struct itemInfo *it, unsigned long long userId)
{
    it->totalUser++;
    if (it->totalUser + 2 >= it->max_user) {
        int capacity = it->max_user + 100;
        unsigned long long *grown =
            realloc(it->userList, (size_t)capacity * sizeof *grown);
        if (grown == NULL)
            Die("userList reallocate failed.");
        it->userList = grown;
        it->max_user = capacity;
    }
    it->userList[it->totalUser - 1] = userId;
}

/* qsort comparator: ascending order of unsigned long long values. */
static int CompareUserId(const void *lhs, const void *rhs)
{
    unsigned long long a = *(const unsigned long long *)lhs;
    unsigned long long b = *(const unsigned long long *)rhs;
    return (a > b) - (a < b);
}

/*
 * Reads "data_1w.txt" and fills item[] / item_hash. init() must have been
 * called first. Returns 0; terminates the process on I/O or memory errors.
 */
int ReadItemInfo()
{
    FILE *fin = fopen("data_1w.txt", "rb");
    if (fin == NULL) {
        printf("The input file doesn't exist.\n");
        exit(1);
    }

    /* init() normally allocates the table already; do not allocate it twice. */
    if (item == NULL)
        item = malloc(max_item * sizeof *item);
    if (item == NULL) {
        printf("item allocate failed.\n");
        exit(1);
    }

    /* One line holds the complete purchase record of one user. */
    char *line = malloc(MaxUserSize);
    if (line == NULL)
        Die("line buffer allocate failed.");

    while (fgets(line, MaxUserSize, fin) != NULL) {
        int tokens = SplitLine(line);
        if (tokens == 0)
            continue; /* blank line */

        unsigned long long userId = ParseUserId(str1[0]);

        /* Register the user with every item on the line. */
        for (int i = 1; i < tokens; ++i) {
            int index = SearchItem(str1[i]);
            if (index == -1)
                AddItemIdToItem(str1[i], userId);
            else
                AppendUser(&item[index], userId);
        }
    }

    free(line);
    fclose(fin);

    /* binary_search() needs ascending lists, whatever the order of the input lines. */
    for (unsigned long long i = 0; i < item_size; i++)
        qsort(item[i].userList, (size_t)item[i].totalUser,
              sizeof *item[i].userList, CompareUserId);
    return 0;
}

/* Allocates the hash table (all slots empty) and the initial item table. */
void init()
{
    item_hash = malloc((size_t)item_hash_size * sizeof *item_hash);
    if (item_hash == NULL)
        Die("item_hash allocate failed.");
    for (int i = 0; i < item_hash_size; ++i)
        item_hash[i] = -1;

    if (item == NULL)
        item = malloc(max_item * sizeof *item);
    if (item == NULL)
        Die("item allocate failed.");
}

/*
 * Returns an index of value in the ascending array of n elements,
 * or -1 if value is not present.
 */
int binary_search(unsigned long long array[], int n, unsigned long long value)
{
    int left = 0;
    int right = n - 1;
    while (left <= right) {
        int middle = left + ((right - left) >> 1); /* avoids overflow of left+right */
        if (array[middle] > value)
            right = middle - 1;
        else if (array[middle] < value)
            left = middle + 1;
        else
            return middle;
    }
    return -1;
}

/* Counts the entries of a's user list that also occur in b's user list. */
static int CountCommonUsers(const struct itemInfo *a, const struct itemInfo *b)
{
    int common = 0;
    for (int t = 0; t < a->totalUser; ++t) {
        if (binary_search(b->userList, b->totalUser, a->userList[t]) != -1)
            common++;
    }
    return common;
}

/*
 * Inserts (similarity, itemId) into the descending top-SIMILARITY_ITEM list,
 * dropping the last entry when the list is full. Does nothing when the
 * similarity is not greater than any stored entry.
 */
static void InsertNeighbour(float bestSim[], char bestId[][MaxString],
                            float similarity, const char *itemId)
{
    for (int k = 0; k < SIMILARITY_ITEM; ++k) {
        if (similarity > bestSim[k]) {
            for (int q = SIMILARITY_ITEM - 1; q > k; q--) {
                bestSim[q] = bestSim[q - 1];
                strcpy(bestId[q], bestId[q - 1]);
            }
            bestSim[k] = similarity;
            strcpy(bestId[k], itemId);
            return;
        }
    }
}

/*
 * Thread entry point. a carries the thread id (0 .. num_threads-1) as an
 * integer stored in the pointer value. Computes the nearest neighbours of
 * the items in this thread's slice and writes them to filename[id].
 */
void *CalItemSim(void *a)
{
    int id = (int)(intptr_t)a;
    FILE *fout = fopen(filename[id], "w");
    if (fout == NULL) {
        perror(filename[id]);
        return NULL;
    }

    float bestSim[SIMILARITY_ITEM];
    char bestUserID[SIMILARITY_ITEM][MaxString];

    int total = (int)item_size;
    int chunk = total / num_threads;
    int left = chunk * id;
    int right = chunk * (id + 1) - 1;
    if (id == num_threads - 1)
        right = total - 1; /* the last thread also takes the remainder */

    for (int i = left; i <= right; ++i) {
        for (int w = 0; w < SIMILARITY_ITEM; ++w) {
            bestSim[w] = -1;
            bestUserID[w][0] = 0;
        }

        for (int j = 0; j < total; ++j) {
            if (i == j)
                continue;
            int common = CountCommonUsers(&item[i], &item[j]);
            if (common <= MinCommonUsers)
                continue;
            /* Multiply in double: the product of two counts can overflow int. */
            float p = (float)sqrt((double)item[i].totalUser *
                                  (double)item[j].totalUser);
            InsertNeighbour(bestSim, bestUserID, common / p, item[j].itemId);
        }

        for (int c = 0; c < SIMILARITY_ITEM && bestSim[c] != -1; ++c)
            fprintf(fout, "%s %s %f\n", item[i].itemId, bestUserID[c], bestSim[c]);
    }

    fclose(fout);
    return NULL;
}

/* Starts num_threads workers running CalItemSim and waits for all of them. */
void CreatMulThread()
{
    if (num_threads < 1 ||
        (size_t)num_threads > sizeof filename / sizeof filename[0])
        Die("num_threads must be between 1 and the number of output files.");

    pthread_t *pt = malloc((size_t)num_threads * sizeof *pt);
    if (pt == NULL)
        Die("thread table allocate failed.");

    for (int t = 0; t < num_threads; t++) {
        if (pthread_create(&pt[t], NULL, CalItemSim, (void *)(intptr_t)t) != 0)
            Die("pthread_create failed.");
    }
    for (int t = 0; t < num_threads; t++)
        pthread_join(pt[t], NULL);

    free(pt);
}

/* Concatenates the per-thread files into "output.txt" and removes them. */
void FileJoin()
{
    FILE *fout = fopen("output.txt", "w");
    if (fout == NULL) {
        perror("output.txt");
        return;
    }

    for (int i = 0; i < num_threads; ++i) {
        FILE *fin = fopen(filename[i], "r");
        if (fin == NULL) {
            perror(filename[i]);
            continue;
        }
        while (fgets(str2, MaxLen, fin) != NULL)
            fputs(str2, fout);
        fclose(fin);

        if (remove(filename[i]) == 0)
            printf("Removed %s\n", filename[i]);
        else
            perror("remove");
    }
    fclose(fout);
}

int main()
{
    init();
    ReadItemInfo();
    CreatMulThread();
    FileJoin();
    return 0;
}
版本2:
/*
 * Multithreaded item-based KNN, version 2.
 *
 * Input  ("record_20w.txt"): one line per user, space separated:
 *         <userToken> <itemId> <itemId> ...
 *         The first character of <userToken> is skipped and the rest is
 *         read as a decimal user id.
 * Output ("output_item_knn.txt"): for every item, up to SIMILARITY_ITEM lines
 *         "<itemId> <neighbourItemId> <similarity>", where
 *         similarity = common / sqrt(|users(i)| * |users(j)|) and only pairs
 *         with more than MinCommonUsers common users are considered.
 *
 * Worker threads only compute: each one stores its results in its own
 * in-memory buffer (record[id]). The main thread writes all buffers to the
 * output file after every worker has been joined.
 */
#include <math.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define MaxString 50
#define MaxUserSize (1024 * 1024 * 10)
#define SIMILARITY_ITEM 30
#define MaxLen 100
#define Num_Thread 100
#define MaxTokens 10000   /* maximum number of tokens kept per input line */
#define MinCommonUsers 5  /* a pair needs MORE than this many common users */

const int item_hash_size = 30000000;

struct itemInfo {
    char itemId[MaxString];
    int totalUser;                 /* number of valid entries in userList */
    unsigned long long *userList;  /* ids of the users who have this item */
    int max_user;                  /* allocated capacity of userList */
};

struct recordInfo {
    char itemId1[MaxString];
    char itemId2[MaxString];
    float similarity;
};

/* Per-thread result buffers; slot id is touched only by worker id until it is joined. */
struct recordInfo *record[Num_Thread];
int len[Num_Thread]; /* number of records stored by each thread */
int max[Num_Thread]; /* allocated capacity of each record buffer */

unsigned long long max_item = 1000, item_size = 0;
struct itemInfo *item;            /* item table, item_size entries in use */
int *item_hash;                   /* open-addressing table: slot -> item index, -1 = empty */
char str1[MaxTokens][MaxString];  /* tokens of the line currently being parsed */
char str2[MaxLen];

/* Prints msg to stderr and terminates the process. */
static void Die(const char *msg)
{
    fprintf(stderr, "%s\n", msg);
    exit(1);
}

/* Returns the hash slot (0 .. item_hash_size-1) at which probing for itemId starts. */
int GetWordHash(const char *itemId)
{
    unsigned long long hash = 0;
    for (const char *p = itemId; *p != '\0'; p++)
        hash = hash * 257 + *p;
    return (int)(hash % item_hash_size);
}

/* Returns the index of itemId in item[], or -1 if it has not been added. */
int SearchItem(const char *itemId)
{
    unsigned int hash = GetWordHash(itemId);
    while (item_hash[hash] != -1) {
        if (strcmp(itemId, item[item_hash[hash]].itemId) == 0)
            return item_hash[hash];
        hash = (hash + 1) % item_hash_size; /* linear probing */
    }
    return -1;
}

/*
 * Appends a new item whose first user is userId, registers it in the hash
 * table and returns its index in item[]. Terminates the process when memory
 * runs out or the hash table is full.
 */
int AddItemIdToItem(const char *itemId, unsigned long long userId)
{
    /* Keep at least one empty slot, otherwise probing would never terminate. */
    if (item_size + 1 >= (unsigned long long)item_hash_size)
        Die("item hash table is full.");

    struct itemInfo *fresh = &item[item_size];
    fresh->max_user = 100;
    fresh->userList = calloc((size_t)fresh->max_user, sizeof *fresh->userList);
    if (fresh->userList == NULL)
        Die("userList allocate failed.");
    snprintf(fresh->itemId, sizeof fresh->itemId, "%s", itemId);
    fresh->userList[0] = userId;
    fresh->totalUser = 1;
    item_size++;

    /* Grow the item table ahead of need. */
    if (item_size + 2 >= max_item) {
        unsigned long long capacity = max_item + 1000;
        struct itemInfo *grown = realloc(item, capacity * sizeof *grown);
        if (grown == NULL)
            Die("item reallocate failed.");
        item = grown;
        max_item = capacity;
    }

    unsigned int hash = GetWordHash(itemId);
    while (item_hash[hash] != -1)
        hash = (hash + 1) % item_hash_size;
    item_hash[hash] = (int)(item_size - 1);
    return (int)(item_size - 1);
}

/*
 * Splits line on spaces into str1[] and returns the number of tokens.
 * Parsing stops at CR, LF or the terminating NUL. Empty tokens are skipped,
 * tokens longer than MaxString-1 characters are truncated, and tokens beyond
 * MaxTokens are ignored.
 */
static int SplitLine(const char *line)
{
    int count = 0;
    const char *p = line;
    while (count < MaxTokens) {
        while (*p == ' ')
            p++;
        if (*p == '\0' || *p == '\n' || *p == '\r')
            break;
        int n = 0;
        while (*p != '\0' && *p != ' ' && *p != '\n' && *p != '\r') {
            if (n < MaxString - 1)
                str1[count][n++] = *p;
            p++;
        }
        str1[count][n] = '\0';
        count++;
    }
    return count;
}

/* Skips the first character of token and reads the rest as a decimal number. */
static unsigned long long ParseUserId(const char *token)
{
    unsigned long long value = 0;
    size_t n = strlen(token);
    for (size_t j = 1; j < n; j++)
        value = value * 10 + (token[j] - '0');
    return value;
}

/* Appends userId to the user list of an existing item, growing it if needed. */
static void AppendUser(struct itemInfo *it, unsigned long long userId)
{
    it->totalUser++;
    if (it->totalUser + 2 >= it->max_user) {
        int capacity = it->max_user + 100;
        unsigned long long *grown =
            realloc(it->userList, (size_t)capacity * sizeof *grown);
        if (grown == NULL)
            Die("userList reallocate failed.");
        it->userList = grown;
        it->max_user = capacity;
    }
    it->userList[it->totalUser - 1] = userId;
}

/* qsort comparator: ascending order of unsigned long long values. */
static int CompareUserId(const void *lhs, const void *rhs)
{
    unsigned long long a = *(const unsigned long long *)lhs;
    unsigned long long b = *(const unsigned long long *)rhs;
    return (a > b) - (a < b);
}

/*
 * Reads "record_20w.txt" and fills item[] / item_hash. init() must have been
 * called first. Returns 0; terminates the process on I/O or memory errors.
 */
int ReadItemInfo()
{
    FILE *fin = fopen("record_20w.txt", "rb");
    if (fin == NULL) {
        printf("The input file doesn't exist.\n");
        exit(1);
    }

    /* init() normally allocates the table already; do not allocate it twice. */
    if (item == NULL)
        item = malloc(max_item * sizeof *item);
    if (item == NULL) {
        printf("item allocate failed.\n");
        exit(1);
    }

    /* One line holds the complete purchase record of one user. */
    char *line = malloc(MaxUserSize);
    if (line == NULL)
        Die("line buffer allocate failed.");

    while (fgets(line, MaxUserSize, fin) != NULL) {
        int tokens = SplitLine(line);
        if (tokens == 0)
            continue; /* blank line */

        unsigned long long userId = ParseUserId(str1[0]);

        /* Register the user with every item on the line. */
        for (int i = 1; i < tokens; ++i) {
            int index = SearchItem(str1[i]);
            if (index == -1)
                AddItemIdToItem(str1[i], userId);
            else
                AppendUser(&item[index], userId);
        }
    }

    free(line);
    fclose(fin);

    /* binary_search() needs ascending lists, whatever the order of the input lines. */
    for (unsigned long long i = 0; i < item_size; i++)
        qsort(item[i].userList, (size_t)item[i].totalUser,
              sizeof *item[i].userList, CompareUserId);
    return 0;
}

/*
 * Resets the per-thread bookkeeping, allocates the hash table (all slots
 * empty) and the initial item table.
 */
void init()
{
    for (int i = 0; i < Num_Thread; i++) {
        max[i] = 1000;
        len[i] = 0;
    }

    item_hash = malloc((size_t)item_hash_size * sizeof *item_hash);
    if (item_hash == NULL)
        Die("item_hash allocate failed.");
    for (int i = 0; i < item_hash_size; ++i)
        item_hash[i] = -1;

    if (item == NULL)
        item = malloc(max_item * sizeof *item);
    if (item == NULL)
        Die("item allocate failed.");
}

/*
 * Returns an index of value in the ascending array of n elements,
 * or -1 if value is not present.
 */
int binary_search(unsigned long long array[], int n, unsigned long long value)
{
    int left = 0;
    int right = n - 1;
    while (left <= right) {
        int middle = left + ((right - left) >> 1); /* avoids overflow of left+right */
        if (array[middle] > value)
            right = middle - 1;
        else if (array[middle] < value)
            left = middle + 1;
        else
            return middle;
    }
    return -1;
}

/* Counts the entries of a's user list that also occur in b's user list. */
static int CountCommonUsers(const struct itemInfo *a, const struct itemInfo *b)
{
    int common = 0;
    for (int t = 0; t < a->totalUser; ++t) {
        if (binary_search(b->userList, b->totalUser, a->userList[t]) != -1)
            common++;
    }
    return common;
}

/*
 * Inserts (similarity, itemId) into the descending top-SIMILARITY_ITEM list,
 * dropping the last entry when the list is full. Does nothing when the
 * similarity is not greater than any stored entry.
 */
static void InsertNeighbour(float bestSim[], char bestId[][MaxString],
                            float similarity, const char *itemId)
{
    for (int k = 0; k < SIMILARITY_ITEM; ++k) {
        if (similarity > bestSim[k]) {
            for (int q = SIMILARITY_ITEM - 1; q > k; q--) {
                bestSim[q] = bestSim[q - 1];
                strcpy(bestId[q], bestId[q - 1]);
            }
            bestSim[k] = similarity;
            strcpy(bestId[k], itemId);
            return;
        }
    }
}

/* Appends one result to the buffer of thread id, growing the buffer if needed. */
static void AppendRecord(int id, const char *itemId1, const char *itemId2,
                         float similarity)
{
    if (len[id] + 5 >= max[id]) {
        int capacity = max[id] + 1000;
        struct recordInfo *grown =
            realloc(record[id], (size_t)capacity * sizeof *grown);
        if (grown == NULL)
            Die("record reallocate failed.");
        record[id] = grown;
        max[id] = capacity;
    }
    struct recordInfo *slot = &record[id][len[id]];
    strcpy(slot->itemId1, itemId1);
    strcpy(slot->itemId2, itemId2);
    slot->similarity = similarity;
    len[id]++;
}

/*
 * Thread entry point. a points to an int holding the thread id
 * (0 .. Num_Thread-1). Computes the nearest neighbours of the items in this
 * thread's slice and stores them in record[id] / len[id].
 */
void *CalItemSim(void *a)
{
    int id = *(int *)a;
    record[id] = malloc((size_t)max[id] * sizeof *record[id]);
    if (record[id] == NULL)
        Die("record allocate failed.");

    float bestSim[SIMILARITY_ITEM];
    char bestUserID[SIMILARITY_ITEM][MaxString];

    int total = (int)item_size;
    int chunk = total / Num_Thread;
    int left = chunk * id;
    int right = chunk * (id + 1) - 1;
    if (id == Num_Thread - 1)
        right = total - 1; /* the last thread also takes the remainder */

    for (int i = left; i <= right; ++i) {
        for (int w = 0; w < SIMILARITY_ITEM; ++w) {
            bestSim[w] = -1;
            bestUserID[w][0] = 0;
        }

        for (int j = 0; j < total; ++j) {
            if (i == j)
                continue;
            int common = CountCommonUsers(&item[i], &item[j]);
            if (common <= MinCommonUsers)
                continue;
            /* Multiply in double: the product of two counts can overflow int. */
            float p = (float)sqrt((double)item[i].totalUser *
                                  (double)item[j].totalUser);
            InsertNeighbour(bestSim, bestUserID, common / p, item[j].itemId);
        }

        for (int c = 0; c < SIMILARITY_ITEM && bestSim[c] != -1; ++c)
            AppendRecord(id, item[i].itemId, bestUserID[c], bestSim[c]);
    }
    return NULL;
}

/* Starts Num_Thread workers running CalItemSim and waits for all of them. */
void CreatMulThread()
{
    pthread_t *pt = malloc(Num_Thread * sizeof *pt);
    if (pt == NULL)
        Die("thread table allocate failed.");

    /* The ids live on this stack frame; every worker is joined before it is left. */
    int id[Num_Thread];
    for (int t = 0; t < Num_Thread; t++)
        id[t] = t;

    for (int t = 0; t < Num_Thread; t++) {
        if (pthread_create(&pt[t], NULL, CalItemSim, &id[t]) != 0)
            Die("pthread_create failed.");
    }
    for (int t = 0; t < Num_Thread; t++)
        pthread_join(pt[t], NULL);

    free(pt);
}

/* Writes the results of all threads, in thread order, to "output_item_knn.txt". */
void Output()
{
    FILE *fout = fopen("output_item_knn.txt", "w");
    if (fout == NULL) {
        perror("output_item_knn.txt");
        return;
    }
    for (int i = 0; i < Num_Thread; ++i) {
        for (int j = 0; j < len[i]; ++j) {
            fprintf(fout, "%s %s %f\n", record[i][j].itemId1,
                    record[i][j].itemId2, record[i][j].similarity);
        }
    }
    fclose(fout);
}

int main()
{
    init();
    ReadItemInfo();
    printf("sad");
    CreatMulThread();

    /* Report how many records each thread produced, and the total. */
    int sum = 0;
    for (int i = 0; i < Num_Thread; ++i) {
        printf("len%d:%d\n", i + 1, len[i]);
        sum += len[i];
    }
    printf("sum:%d\n", sum);

    Output();
    return 0;
}
相关文章推荐
- 多线程版本的User_KNN的C语言实现
- 设计模式 行为模式 观察者模式 c语言 版本实现
- 【Machine Learning】KNN学习算法与C语言实现
- C语言使用windows api实现多线程
- C语言实现knn
- 一个UUID生成算法的C语言实现 --- WIN32版本
- C语言版本:双链表的实现
- 设计模式 行为模式 责任链 c语言 版本实现
- C语言版本:顺序表的实现
- 一个多线程的Socket通信Demo(C语言实现)
- 一个UUID生成算法的C语言实现——WIN32版本
- 二叉查找树实现(C语言版本)
- Lua BitOp 提供5.1和5.2版本位操作运算 (跨平台C语言实现 说明部分1)
- linux下c语言实现多线程文件复制
- AVL平衡二叉查找树实现(C语言版本)
- 一个UUID生成算法的C语言实现 --- WIN32版本
- 设计模式 行为模式 命令模式 c语言 版本实现
- 一道多线程面试题-C语言实现
- k近邻算法(knn)的c语言实现
- 常见的排序算法对比及实现C语言版本