输入文件 data_5K.txt 的数据格式:
每行一条用户购买记录,字段之间以空格分隔:uid pid1 pid2 …
#include <math.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
/* Size in bytes of a user-id buffer (see struct userInfo.userId). */
#define MaxString 50
/*
 * Size in bytes of the buffer that holds one input line.
 * Parenthesized so the macro stays a single value inside larger
 * expressions (e.g. `x % MaxUserSize`, `MaxUserSize / 2`).
 */
#define MaxUserSize (1024 * 1024 * 10)
/* Number of best-matching users kept per user. */
#define SIMILARITY_USER 20
/* Maximum number of bytes read per fgets call when merging output files. */
#define MaxLen 100
/*
 * Names of the per-thread partial output files; entry k belongs to the
 * worker whose index is k. Each name fits in 15 bytes including the NUL.
 */
char filename[20][15] = {
    "output1.txt",
    "output2.txt",
    "output3.txt",
    "output4.txt",
    "output5.txt",
    "output6.txt",
    "output7.txt",
    "output8.txt",
    "output9.txt",
    "output10.txt",
    "output11.txt",
    "output12.txt",
    "output13.txt",
    "output14.txt",
    "output15.txt",
    "output16.txt",
    "output17.txt",
    "output18.txt",
    "output19.txt",
    "output20.txt",
};
/* One user's purchase record as loaded by ReadUserInfo. */
struct userInfo
{
/* User id: the first token of the input line, NUL-terminated. */
char userId[MaxString];
/* Number of entries in itemList. */
int totalItem;
/* Heap-allocated array of numeric item ids; ReadUserInfo sorts it with qsort. */
unsigned long long * itemList;
};
/* Dynamically allocated array of user records; max_user is its capacity. */
struct userInfo * user;
/* File-scope scratch for a similarity score.
 * NOTE(review): being file-scope, any use from worker threads is shared, unsynchronised state. */
float similarity;
/* maxItemIDUserID: id of the user owning the largest item id seen while loading.
 * str1: token scratch buffer (up to 10000 tokens of at most 49 chars each).
 * NOTE(review): maxItemUserID is not referenced anywhere in the visible code. */
char maxItemUserID[50],maxItemIDUserID[50],str1[10000][50];
/* max_user: capacity of `user`; userNum: number of loaded users;
 * MaxItemIDIndex: index in `user` of the record holding MaxItemID;
 * len, pos: file-scope scratch values (pos shares the threading caveat noted on `similarity`).
 * NOTE(review): maxItem and maxItemUserIndex are not referenced anywhere in the visible code. */
int max_user=1000,userNum=0,maxItem=0,maxItemUserIndex,MaxItemIDIndex,len,pos=-1;
/* MaxItemID: largest item id seen while loading; value: scratch for a parsed item id. */
unsigned long long MaxItemID=0,value;
long long num=0;/* running count of user records stored so far */
/* Number of worker threads; also the number of slices the user array is split into. */
int num_threads=20;
/* Line scratch buffer of 100 bytes. */
char str2[100];
/* Forward declaration; the definition appears further down in this file. */
int binary_search(unsigned long long a[],int n,unsigned long long key);
/*
 * qsort comparator for unsigned long long values, ascending.
 *
 * Returns a negative, zero or positive value when *a is less than, equal to
 * or greater than *b. The result is built from comparisons rather than a
 * subtraction: a 64-bit difference truncated to int can come out zero or with
 * the wrong sign, which would leave the array incorrectly ordered.
 */
int compare123(const void *a, const void *b) {
    const unsigned long long lhs = *(const unsigned long long *)a;
    const unsigned long long rhs = *(const unsigned long long *)b;
    return (lhs > rhs) - (lhs < rhs);
}
/*
 * Binary search over an ascending array.
 *
 * array: the sorted values to search
 * n:     number of elements in array
 * value: the key to look for
 *
 * Returns the index of an element equal to value, or -1 when there is none.
 */
int binary_search(unsigned long long array[], int n, unsigned long long value)
{
    int lo = 0;
    int hi = n - 1;

    /* Closed interval [lo, hi]; it shrinks by at least one element per pass. */
    while (lo <= hi) {
        /* Midpoint computed from the width so lo + hi is never formed. */
        const int mid = lo + ((hi - lo) >> 1);
        const unsigned long long probe = array[mid];

        if (probe == value)
            return mid;

        if (probe < value)
            lo = mid + 1;
        else
            hi = mid - 1;
    }

    return -1;
}
void * CalUserSim(void * a){
FILE * fout = fopen(filename[(int)a],"w");
float bestSim[SIMILARITY_USER];
char bestUserID[SIMILARITY_USER][50];
float p;
int common;
int left = userNum / num_threads * (int)a;
int right = userNum / num_threads *((int)a+1)-1;
if ((int)a==num_threads-1)
right=userNum-1;
for (int i = left; i <= right; ++i)
{
for (int w = 0; w < SIMILARITY_USER; ++w) //初始化
{
bestSim[w]=-1;
bestUserID[w][0]=0;
}
for (int j = 0; j < userNum; ++j) //对于一个i,进程userNum次遍历
{
if (i!=j)
{
common=0;
for (int t = 0; t < user[i].totalItem; ++t) //寻找购买的相同物品个数
{
pos=binary_search(user[j].itemList,user[j].totalItem,user[i].itemList[t]);
if (pos!=-1)
common++;
}
if (common>10) //如果相同物品个数大于10个,判断是否是处于topK
{
p=sqrt(user[i].totalItem * user[j].totalItem);
similarity = common/p;
for (int k = 0; k < SIMILARITY_USER; ++k)
{
if (similarity>bestSim[k])
{
for (int q = SIMILARITY_USER -1; q > k; q--)
{
bestSim[q] = bestSim[q-1];
strcpy(bestUserID[q],bestUserID[q-1]);
}
bestSim[k] =similarity;
strcpy(bestUserID[k],user[j].userId);
break;
}
}
}
}
}
for (int c = 0; (c < SIMILARITY_USER)&&bestSim[c]!=-1; ++c)
{
fprintf(fout,"%s %s %f\n",user[i].userId,bestUserID[c],bestSim[c]);
}
}
fclose(fout);
pthread_exit(NULL);
}
int ReadUserInfo(){
char * str=(char *)malloc(MaxUserSize);
if (str==NULL)
{
printf("Str allocate failed.\n");
exit(1);
}
//打开文件
FILE * fin = fopen("data_5K.txt","rb");
if (fin==NULL)
{
printf("The input file doesn‘t exist.\n");
exit(1);
}
user =(struct userInfo *)malloc(max_user*sizeof(struct userInfo));
if (user==NULL)
{
printf("User allocate failed.\n");
exit(1);
}
while(fgets (str ,MaxUserSize,fin) != NULL)
{
//将一个用户的购买记录按空格分开
memset(str1,0,sizeof(str1));
int cn = 0;
int b = 0;
int c = 0;
while(1){
str1[cn][b] = str[c];
b++;
c++;
str1[cn][b] = 0;
if (str[c] == 10) break;
if (str[c] == ‘ ‘) {
cn++;
b = 0;
c++;
}
}
cn++;
if (cn-1>10)
{
if (num + 2 >= max_user) {
max_user += 1000;
user = (struct userInfo *)realloc(user, max_user * sizeof(struct userInfo));
if (user==NULL)
{
printf("Realloc user memory failed.\n");
exit(1);
}
}
user[num].itemList =(unsigned long long *) malloc((cn-1)*sizeof(unsigned long long));
if(user[num].itemList==NULL){
printf("allocate itemList failed.\n");
exit(1);
}
user[num].totalItem=cn-1;
strcpy(user[num].userId,str1[0]);
for (int i = 1; i < cn; ++i)//去除p的产品id
{
len = strlen(str1[i]);
value=0;
for (int j=1; j < len; j++)
value = value*10+(str1[i][j]-‘0‘);
user[num].itemList[i-1]=value;
if (value>MaxItemID)
{
MaxItemID = value; //找最大的ItemID
MaxItemIDIndex=num;
strcpy(maxItemIDUserID,str1[0]);
}
}
qsort(user[num].itemList, cn-1, sizeof(unsigned long long), compare123);
num++;
}
}
userNum=num;
fclose(fin);
return 0;
}
//将多个输出文件合并成
void FileJoin(){
FILE * fout = fopen("output.txt","w");
for (int i = 0; i < 20; ++i)
{
FILE * fin = fopen(filename[i],"r");
while(fgets (str2 ,MaxLen,fin) != NULL){
fprintf(fout,"%s",str2);
}
fclose(fin);
if( remove(filename[i]) == 0 )
printf("Removed %s\n", filename[i]);
else
perror("remove");
}
fclose(fout);
}
/*
 * Program entry point: load the user records, run num_threads CalUserSim
 * workers over them, print the elapsed time, then merge the partial outputs.
 *
 * Elapsed time is measured with time()/difftime (wall clock, whole seconds).
 * clock() is not used because it reports processor time consumed by the whole
 * process, which with several busy threads is a multiple of the real time.
 *
 * Returns 0; terminates with exit(1) when the thread table cannot be
 * allocated or a worker cannot be started.
 */
int main(){
    const time_t start = time(NULL);

    ReadUserInfo();

    pthread_t *pt = malloc((size_t)num_threads * sizeof *pt);
    if (pt == NULL) {
        printf("Thread table allocate failed.\n");
        exit(1);
    }

    for (int a = 0; a < num_threads; a++) {
        /* The worker index travels inside the pointer argument. */
        const int rc = pthread_create(&pt[a], NULL, CalUserSim, (void *)(intptr_t)a);
        if (rc != 0) {
            fprintf(stderr, "pthread_create failed: %s\n", strerror(rc));
            exit(1);
        }
    }
    for (int a = 0; a < num_threads; a++)
        pthread_join(pt[a], NULL);
    free(pt);

    printf("time:%.0fsecond \n", difftime(time(NULL), start));

    FileJoin();
    return 0;
}
原文地址:http://blog.csdn.net/li8630/article/details/45561989