码迷,mamicode.com
首页 > 编程语言 > 详细

多线程版本的User_KNN的C语言实现

时间:2015-05-07 16:46:16      阅读:209      评论:0      收藏:0      [点我收藏+]

标签:多线程   机器学习   c语言   

输入数据格式(程序读取的文件名为 data_5K.txt):
每行是一个用户的购买记录,字段之间用单个空格分隔:uid pid1 pid2 …(每个商品 ID 的首字符是前缀,读取时会被去掉;商品数不超过 10 个的用户会被忽略)

#include <math.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
/* Capacity in bytes (including the terminating NUL) of a user-ID buffer. */
#define MaxString 50
/*
 * Size in bytes of the buffer that holds one input line. The expansion is
 * parenthesized so the macro evaluates as a single value when it appears
 * next to higher- or equal-precedence operators such as / or %.
 */
#define MaxUserSize (1024*1024*10)
/* Number of most-similar neighbours kept for every user. */
#define SIMILARITY_USER 20
/* Maximum number of characters read per line when merging output files. */
#define MaxLen 100

/*
 * Per-thread output file names: worker thread t writes filename[t], and the
 * merge step reads these files back in index order.
 */
char filename[20][15]={
    "output1.txt",  "output2.txt",  "output3.txt",  "output4.txt",  "output5.txt",
    "output6.txt",  "output7.txt",  "output8.txt",  "output9.txt",  "output10.txt",
    "output11.txt", "output12.txt", "output13.txt", "output14.txt", "output15.txt",
    "output16.txt", "output17.txt", "output18.txt", "output19.txt", "output20.txt"
};

/* One user's purchase record as loaded from the input file. */
struct userInfo
{
    /* User ID string (NUL-terminated), copied from the first field of the line. */
    char userId[MaxString];
    /* Number of entries in itemList. */
    int totalItem;
    /* Heap-allocated array of numeric item IDs; ReadUserInfo sorts it ascending. */
    unsigned long long * itemList;
};

/* Dynamically grown array of loaded users; userNum entries are valid. */
struct userInfo *user;

/* File-scope scratch slot for a similarity score. */
float similarity;

/* User-ID buffers; maxItemUserID is not referenced by the code in this file. */
char maxItemUserID[50];
/* ID of the user whose record contained the largest item ID seen so far. */
char maxItemIDUserID[50];
/* Token buffer: the space-separated fields of the line being parsed. */
char str1[10000][50];

/* Current capacity (in elements) of the user array. */
int max_user = 1000;
/* Number of users loaded; set once reading has finished. */
int userNum = 0;
/* maxItem and maxItemUserIndex are not referenced by the code in this file. */
int maxItem = 0;
int maxItemUserIndex;
/* Index (into user) of the record that contained MaxItemID. */
int MaxItemIDIndex;
/* File-scope scratch: a token length. */
int len;
/* File-scope scratch: a binary_search result (-1 means "not found"). */
int pos = -1;

/* Largest item ID seen while reading the input. */
unsigned long long MaxItemID = 0;
/* File-scope scratch: the item ID currently being parsed. */
unsigned long long value;

/* Running count of users stored so far while reading (user_num). */
long long num = 0;
/* Number of worker threads to start. */
int num_threads = 20;
/* Line buffer used when merging the per-thread output files. */
char str2[100];

/* Defined below: index of key in the sorted array a of n elements, or -1. */
int binary_search(unsigned long long a[], int n, unsigned long long key);


/*
 * qsort comparator that orders unsigned long long values ascending.
 *
 * a, b - pointers to the two unsigned long long elements to compare.
 *
 * Returns a negative value, zero, or a positive value when *a is less than,
 * equal to, or greater than *b.
 *
 * The result is built from two comparisons rather than a subtraction: the
 * difference of two 64-bit unsigned values does not fit in an int, so
 * returning it directly yields the wrong sign (or 0) for large differences.
 */
int compare123(const void *a, const void *b) {
        const unsigned long long lhs = *(const unsigned long long *)a;
        const unsigned long long rhs = *(const unsigned long long *)b;
        return (lhs > rhs) - (lhs < rhs);
}
/*
 * Looks up value in array, which must hold n elements sorted ascending.
 *
 * Returns the index of a matching element, or -1 when value is absent
 * (including when n is 0).
 */
int binary_search(unsigned long long array[],int n,unsigned long long value)  {
    int lo = 0;
    int hi = n - 1;                 /* inclusive upper bound */

    while (lo <= hi) {
        /* lo + half the gap avoids overflowing lo + hi; recomputed every pass */
        const int mid = lo + (hi - lo) / 2;
        const unsigned long long probe = array[mid];

        if (probe == value) {
            return mid;
        }
        if (probe < value) {
            lo = mid + 1;           /* target lies in the upper half */
        } else {
            hi = mid - 1;           /* target lies in the lower half */
        }
    }
    return -1;
}


/*
 * Worker-thread entry point: for one contiguous slice of the global user
 * array, finds each user's most similar other users and writes them to this
 * thread's own output file.
 *
 * a - the zero-based thread index carried in the pointer value (it is not
 *     dereferenced). It selects the slice of users to process and the entry
 *     of filename to write.
 *
 * The similarity of users i and j is  common / sqrt(n_i * n_j), where common
 * is the number of i's items found in j's sorted item list and n_x is
 * totalItem of user x. Only pairs with more than 10 common items are ranked,
 * and at most SIMILARITY_USER neighbours are kept per user.
 *
 * Output lines have the form "<userId> <neighbourId> <similarity>", best
 * neighbour first.
 *
 * All working state is local to the call: several workers run this function
 * concurrently, so nothing shared may be written here.
 *
 * Returns NULL. If the output file cannot be opened the error is reported on
 * stderr and the worker returns without processing its slice.
 */
void * CalUserSim(void * a){
    const int tid = (int)(intptr_t)a;

    FILE * fout = fopen(filename[tid],"w");
    if (fout == NULL)
    {
        perror(filename[tid]);
        return NULL;
    }

    float bestSim[SIMILARITY_USER];
    char  bestUserID[SIMILARITY_USER][sizeof user->userId];

    /* Equal-sized slices; the last worker also takes the remainder. */
    const int left = userNum / num_threads * tid;
    int right = userNum / num_threads * (tid + 1) - 1;
    if (tid == num_threads - 1)
        right = userNum - 1;

    for (int i = left; i <= right; ++i)
    {
        int filled = 0;   /* number of valid entries in bestSim/bestUserID */
        for (int w = 0; w < SIMILARITY_USER; ++w)   /* reset the top-K table */
        {
            bestSim[w] = -1;
            bestUserID[w][0] = 0;
        }

        for (int j = 0; j < userNum; ++j)           /* compare i with every other user */
        {
            if (i == j)
                continue;

            /* Count the items of user i that user j also bought. */
            int common = 0;
            for (int t = 0; t < user[i].totalItem; ++t)
            {
                const int hit = binary_search(user[j].itemList, user[j].totalItem,
                                              user[i].itemList[t]);
                if (hit != -1)
                    common++;
            }
            if (common <= 10)                       /* too little overlap to rank */
                continue;

            /* Multiply in double so the product of the two counts cannot overflow int. */
            const float norm = (float)sqrt((double)user[i].totalItem *
                                           (double)user[j].totalItem);
            const float sim = common / norm;

            /* Insert into the descending top-K table, shifting lower entries down. */
            for (int k = 0; k < SIMILARITY_USER; ++k)
            {
                if (sim > bestSim[k])
                {
                    for (int q = SIMILARITY_USER - 1; q > k; q--)
                    {
                        bestSim[q] = bestSim[q-1];
                        strcpy(bestUserID[q], bestUserID[q-1]);
                    }
                    bestSim[k] = sim;
                    strcpy(bestUserID[k], user[j].userId);
                    if (filled < SIMILARITY_USER)
                        filled++;
                    break;
                }
            }
        }

        for (int c = 0; c < filled; ++c)
        {
            fprintf(fout,"%s %s %f\n",user[i].userId,bestUserID[c],bestSim[c]);
        }
    }

    if (fclose(fout) != 0)
        perror(filename[tid]);
    return NULL;
}

int ReadUserInfo(){

    char * str=(char *)malloc(MaxUserSize);
    if (str==NULL)
    {
        printf("Str allocate failed.\n");
        exit(1);
    }
    //打开文件
    FILE * fin = fopen("data_5K.txt","rb");
    if (fin==NULL)
    {
        printf("The input file doesn‘t exist.\n");
        exit(1);
    }
    user =(struct userInfo *)malloc(max_user*sizeof(struct userInfo));
    if (user==NULL)
    {
        printf("User allocate failed.\n");
        exit(1);
    }   
    while(fgets (str ,MaxUserSize,fin) != NULL)
    {
        //将一个用户的购买记录按空格分开
        memset(str1,0,sizeof(str1));
        int cn = 0;
        int b = 0;
        int c = 0;
        while(1){
            str1[cn][b] = str[c];
            b++;
            c++;
            str1[cn][b] = 0;
            if (str[c] == 10) break;
            if (str[c] == ‘ ‘) {
             cn++;
             b = 0;
             c++;
            }
        }
        cn++;

        if (cn-1>10)
        {
            if (num + 2 >= max_user) {
                max_user += 1000;
                user = (struct userInfo *)realloc(user, max_user * sizeof(struct userInfo));
                if (user==NULL)
                {
                    printf("Realloc user memory failed.\n");
                    exit(1);
                }
            }

            user[num].itemList =(unsigned long long *) malloc((cn-1)*sizeof(unsigned long long));
            if(user[num].itemList==NULL){
                printf("allocate itemList failed.\n");
                exit(1);
            }
            user[num].totalItem=cn-1;
            strcpy(user[num].userId,str1[0]);
            for (int i = 1; i < cn; ++i)//去除p的产品id
            {
                len = strlen(str1[i]);
                value=0; 
                for (int j=1; j < len; j++)
                value = value*10+(str1[i][j]-‘0‘);
                user[num].itemList[i-1]=value;
                if (value>MaxItemID)
                {
                    MaxItemID = value;  //找最大的ItemID
                    MaxItemIDIndex=num; 
                    strcpy(maxItemIDUserID,str1[0]);
                }
            }
            qsort(user[num].itemList, cn-1, sizeof(unsigned long long), compare123);                        
            num++;  
        }
    }
    userNum=num;    
    fclose(fin);        
    return 0;
}
/*
 * Merges the per-thread output files into "output.txt", in filename order,
 * and deletes each part file after it has been copied.
 *
 * A part file that cannot be opened is reported on stderr and skipped, so one
 * missing part does not abort the merge. If "output.txt" itself cannot be
 * created the function reports the error and returns without touching the
 * part files.
 */
void FileJoin(){
    const int parts = (int)(sizeof filename / sizeof filename[0]);
    char line[MaxLen];

    FILE * fout = fopen("output.txt","w");
    if (fout == NULL)
    {
        perror("output.txt");
        return;
    }

    for (int i = 0; i < parts; ++i)
    {
        FILE * fin = fopen(filename[i],"r");
        if (fin == NULL)
        {
            perror(filename[i]);
            continue;
        }
        while (fgets(line, sizeof line, fin) != NULL)
        {
            fputs(line, fout);
        }
        fclose(fin);

        if( remove(filename[i]) == 0 )
            printf("Removed %s\n", filename[i]);
        else
            perror("remove");
    }

    if (fclose(fout) != 0)
        perror("output.txt");
}

/*
 * Program entry point: loads the user records, runs num_threads workers that
 * each compute the nearest neighbours for a slice of the users, prints the
 * elapsed time and merges the per-thread output files.
 *
 * Returns 0 on success, 1 if the thread-handle array cannot be allocated or
 * if not every worker thread could be started.
 */
int main(){
    /* NOTE: clock() measures processor time used by the program, not wall time. */
    clock_t start=clock();
    ReadUserInfo();

    pthread_t *pt = malloc((size_t)num_threads * sizeof *pt);
    if (pt == NULL)
    {
        fprintf(stderr, "Thread handle allocate failed.\n");
        return 1;
    }

    /* The thread index is passed by value inside the pointer argument. */
    int started = 0;
    while (started < num_threads)
    {
        const int rc = pthread_create(&pt[started], NULL, CalUserSim,
                                      (void *)(intptr_t)started);
        if (rc != 0)
        {
            fprintf(stderr, "pthread_create failed for worker %d: %s\n",
                    started, strerror(rc));
            break;
        }
        started++;
    }
    /* Join only the workers that were actually created. */
    for (int t = 0; t < started; t++)
        pthread_join(pt[t], NULL);
    free(pt);

    clock_t now=clock();
    /* clock_t has no printf specifier of its own; convert explicitly for %lu. */
    printf("time:%lusecond \n",(unsigned long)((now-start+1)/CLOCKS_PER_SEC));
    FileJoin();
    return (started == num_threads) ? 0 : 1;
}

多线程版本的User_KNN的C语言实现

标签:多线程   机器学习   c语言   

原文地址:http://blog.csdn.net/li8630/article/details/45561989

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!