本文主要介绍如何使用CUDA并行计算框架编程实现机器学习中的Kmeans算法,Kmeans算法的详细介绍在这里,本文重点在并行实现的过程。
当然还是简单的回顾一下kmeans算法的串行过程:
伪代码:
创建k个点作为起始质心(经常是随机选择) 当任意一个点的簇分配结果发生改变时 对数据集中的每个数据点 对每个质心 计算质心与数据点之间的距离 将数据点分配到距其最近的簇 对每一个簇,计算簇中所有点的均值并将均值作为质心我们可以观察到有两个部分可以并行优化:
①line03-04:将每个数据点到多个质心的距离计算进行并行化
②line05:将数据点到某个执行的距离计算进行并行化
KMEANS类:
class KMEANS { private: int numClusters; int numCoords; int numObjs; int *membership;//[numObjs] char *filename; float **objects;//[numObjs][numCoords] data objects float **clusters;//[numClusters][unmCoords] cluster center float threshold; int loop_iterations; public: KMEANS(int k); void file_read(char *fn); void file_write(); void cuda_kmeans(); inline int nextPowerOfTwo(int n); void free_memory(); virtual ~KMEANS(); };//KMEANS
成员变量:
numClusters:中心点的个数numCoords:每个数据点的维度
numObjs:数据点的个数
membership:每个数据点所属类别的数组,维度为numObjs
filename:读入的文件名
objects:所有数据点,维度为[numObjs][numCoords]
clusters:中心点数据,维度为[numObjs][numCoords]
threshold:控制循环次数的一个域值
loop_iterations:循环的迭代次数
成员函数:
KMEANS(int k):含参构造函数。初始化成员变量
file_read(char *fn):读入文件数据并初始化object以及membership变量
file_write():将计算结果写回到结果文件中去
cuda_kmeans():kmeans计算的入口函数
nextPowerOfTwo(int n):它计算大于等于输入参数n的第一个2的幂次数。
free_memory():释放内存空间
~KMEANS():析构函数
并行的代码主要三个函数:
find_nearest_cluster(...)
compute_delta(...)
euclid_dist_2(...)
首先看一下函数euclid_dist_2(...):
__host__ __device__ inline static float euclid_dist_2(int numCoords,int numObjs,int numClusters,float *objects,float *clusters,int objectId,int clusterId) { int i; float ans = 0; for( i=0;i<numCoords;i++ ) { ans += ( objects[numObjs * i + objectId] - clusters[numClusters*i + clusterId] ) * ( objects[numObjs * i + objectId] - clusters[numClusters*i + clusterId] ) ; } return ans; }这段代码实际上就是并行的计算向量objects[objectId]和clusters[clusterId]之间的距离,即第objectId个数据点到第clusterId个中心点的距离。
再看一下函数compute_delta(...):
/* * numIntermediates:The actual number of intermediates * numIntermediates2:The next power of two */ __global__ static void compute_delta(int *deviceIntermediates,int numIntermediates, int numIntermediates2) { extern __shared__ unsigned int intermediates[]; intermediates[threadIdx.x] = (threadIdx.x < numIntermediates) ? deviceIntermediates[threadIdx.x] : 0 ; __syncthreads(); //numIntermediates2 *must* be a power of two! for(unsigned int s = numIntermediates2 /2 ; s > 0 ; s>>=1) { if(threadIdx.x < s) { intermediates[threadIdx.x] += intermediates[threadIdx.x + s]; } __syncthreads(); } if(threadIdx.x == 0) { deviceIntermediates[0] = intermediates[0]; } }这段代码的意义就是将一个线程块中每个线程的对应的intermediates的数据求和最后放到deviceIntermediates[0]中去然后拷贝回主存块中去。这个问题的更好的解释在这里,实际上就是一个数组求和的问题,应用在这里求得的是有改变的membership中所有数据的和,即改变了簇的点的个数。
最后再看函数finid_nearest_cluster(...):
/* * objects:[numCoords][numObjs] * deviceClusters:[numCoords][numClusters] * membership:[numObjs] */ __global__ static void find_nearest_cluster(int numCoords,int numObjs,int numClusters,float *objects, float *deviceClusters,int *membership ,int *intermediates) { extern __shared__ char sharedMemory[]; unsigned char *membershipChanged = (unsigned char *)sharedMemory; float *clusters = deviceClusters; membershipChanged[threadIdx.x] = 0; int objectId = blockDim.x * blockIdx.x + threadIdx.x; if( objectId < numObjs ) { int index; float dist,min_dist; /*find the cluster id that has min distance to object*/ index = 0; min_dist = euclid_dist_2(numCoords,numObjs,numClusters,objects,clusters,objectId,0); for(int i=0;i<numClusters;i++) { dist = euclid_dist_2(numCoords,numObjs,numClusters,objects,clusters,objectId,i) ; /* no need square root */ if( dist < min_dist ) { min_dist = dist; index = i; } } if( membership[objectId]!=index ) { membershipChanged[threadIdx.x] = 1; } //assign the membership to object objectId membership[objectId] = index; __syncthreads(); //for membershipChanged[] #if 1 //blockDim.x *must* be a power of two! for(unsigned int s = blockDim.x / 2; s > 0 ;s>>=1) { if(threadIdx.x < s) { membershipChanged[threadIdx.x] += membershipChanged[threadIdx.x + s];//calculate all changed values and save result to membershipChanged[0] } __syncthreads(); } if(threadIdx.x == 0) { intermediates[blockIdx.x] = membershipChanged[0]; } #endif } }//find_nearest_cluster这个函数计算的就是第objectId个数据点到numClusters个中心点的距离,然后根据情况比较更新membership。
这三个函数将所有能够并行的地方都进行了并行,实现了整体算法的并行化~
在此呈上全部代码:
kmeans.h:
#ifndef _H_KMEANS #define _H_KMEANS #include <assert.h> #define malloc2D(name, xDim, yDim, type) do { name = (type **)malloc(xDim * sizeof(type *)); assert(name != NULL); name[0] = (type *)malloc(xDim * yDim * sizeof(type)); assert(name[0] != NULL); for (size_t i = 1; i < xDim; i++) name[i] = name[i-1] + yDim; } while (0) double wtime(void); #endif
#include <sys/time.h> #include <stdio.h> #include <stdlib.h> double wtime(void) { double now_time; struct timeval etstart; struct timezone tzp; if (gettimeofday(&etstart, &tzp) == -1) perror("Error: calling gettimeofday() not successful.\n"); now_time = ((double)etstart.tv_sec) + /* in seconds */ ((double)etstart.tv_usec) / 1000000.0; /* in microseconds */ return now_time; }
#include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/types.h> #include <sys/stat.h> #include <unistd.h> #include <iostream> #include <cassert> #include "kmeans.h" using namespace std; const int MAX_CHAR_PER_LINE = 1024; class KMEANS { private: int numClusters; int numCoords; int numObjs; int *membership;//[numObjs] char *filename; float **objects;//[numObjs][numCoords] data objects float **clusters;//[numClusters][unmCoords] cluster center float threshold; int loop_iterations; public: KMEANS(int k); void file_read(char *fn); void file_write(); void cuda_kmeans(); inline int nextPowerOfTwo(int n); void free_memory(); virtual ~KMEANS(); }; KMEANS::~KMEANS() { free(membership); free(clusters[0]); free(clusters); free(objects[0]); free(objects); } KMEANS::KMEANS(int k) { threshold = 0.001; numObjs = 0; numCoords = 0; numClusters = k; filename = NULL; loop_iterations = 0; } void KMEANS::file_write() { FILE *fptr; char outFileName[1024]; //output:the coordinates of the cluster centres sprintf(outFileName,"%s.cluster_centres",filename); printf("Writingcoordinates of K=%d cluster centers to file \"%s\"\n",numClusters,outFileName); fptr = fopen(outFileName,"w"); for(int i=0;i<numClusters;i++) { fprintf(fptr,"%d ",i) ; for(int j=0;j<numCoords;j++) fprintf(fptr,"%f ",clusters[i][j]); fprintf(fptr,"\n"); } fclose(fptr); //output:the closest cluster centre to each of the data points sprintf(outFileName,"%s.membership",filename); printf("writing membership of N=%d data objects to file \"%s\" \n",numObjs,outFileName); fptr = fopen(outFileName,"w"); for(int i=0;i<numObjs;i++) { fprintf(fptr,"%d %d\n",i,membership[i]) ; } fclose(fptr); } inline int KMEANS::nextPowerOfTwo(int n) { n--; n = n >> 1 | n; n = n >> 2 | n; n = n >> 4 | n; n = n >> 8 | n; n = n >> 16 | n; //n = n >> 32 | n; // for 64-bit ints return ++n; } __host__ __device__ inline static float euclid_dist_2(int numCoords,int numObjs,int numClusters,float *objects,float *clusters,int objectId,int clusterId) { int i; float ans = 0; for( i=0;i<numCoords;i++ ) { ans += ( objects[numObjs * i + objectId] - clusters[numClusters*i + clusterId] ) * ( objects[numObjs * i + objectId] - clusters[numClusters*i + clusterId] ) ; } return ans; } /* * numIntermediates:The actual number of intermediates * numIntermediates2:The next power of two */ __global__ static void compute_delta(int *deviceIntermediates,int numIntermediates, int numIntermediates2) { extern __shared__ unsigned int intermediates[]; intermediates[threadIdx.x] = (threadIdx.x < numIntermediates) ? deviceIntermediates[threadIdx.x] : 0 ; __syncthreads(); //numIntermediates2 *must* be a power of two! for(unsigned int s = numIntermediates2 /2 ; s > 0 ; s>>=1) { if(threadIdx.x < s) { intermediates[threadIdx.x] += intermediates[threadIdx.x + s]; } __syncthreads(); } if(threadIdx.x == 0) { deviceIntermediates[0] = intermediates[0]; } } /* * objects:[numCoords][numObjs] * deviceClusters:[numCoords][numClusters] * membership:[numObjs] */ __global__ static void find_nearest_cluster(int numCoords,int numObjs,int numClusters,float *objects, float *deviceClusters,int *membership ,int *intermediates) { extern __shared__ char sharedMemory[]; unsigned char *membershipChanged = (unsigned char *)sharedMemory; float *clusters = deviceClusters; membershipChanged[threadIdx.x] = 0; int objectId = blockDim.x * blockIdx.x + threadIdx.x; if( objectId < numObjs ) { int index; float dist,min_dist; /*find the cluster id that has min distance to object*/ index = 0; min_dist = euclid_dist_2(numCoords,numObjs,numClusters,objects,clusters,objectId,0); for(int i=0;i<numClusters;i++) { dist = euclid_dist_2(numCoords,numObjs,numClusters,objects,clusters,objectId,i) ; /* no need square root */ if( dist < min_dist ) { min_dist = dist; index = i; } } if( membership[objectId]!=index ) { membershipChanged[threadIdx.x] = 1; } //assign the membership to object objectId membership[objectId] = index; __syncthreads(); //for membershipChanged[] #if 1 //blockDim.x *must* be a power of two! for(unsigned int s = blockDim.x / 2; s > 0 ;s>>=1) { if(threadIdx.x < s) { membershipChanged[threadIdx.x] += membershipChanged[threadIdx.x + s];//calculate all changed values and save result to membershipChanged[0] } __syncthreads(); } if(threadIdx.x == 0) { intermediates[blockIdx.x] = membershipChanged[0]; } #endif } }//find_nearest_cluster void KMEANS::cuda_kmeans() { int index,loop = 0; int *newClusterSize;//[numClusters]:no.objects assigned in each new cluster float delta; //% of objects changes their clusters float **dimObjects;//[numCoords][numObjs] float **dimClusters; float **newClusters;//[numCoords][numClusters] float *deviceObjects; //[numCoords][numObjs] float *deviceClusters; //[numCoords][numclusters] int *deviceMembership; int *deviceIntermediates; //Copy objects given in [numObjs][numCoords] layout to new [numCoords][numObjs] layout malloc2D(dimObjects,numCoords,numObjs,float); for(int i=0;i<numCoords;i++) { for(int j=0;j<numObjs;j++) { dimObjects[i][j] = objects[j][i]; } } //pick first numClusters elements of objects[] as initial cluster centers malloc2D(dimClusters, numCoords, numClusters,float); for(int i=0;i<numCoords;i++) { for(int j=0;j<numClusters;j++) { dimClusters[i][j] = dimObjects[i][j]; } } newClusterSize = new int[numClusters]; assert(newClusterSize!=NULL); malloc2D(newClusters,numCoords,numClusters,float); memset(newClusters[0],0,numCoords * numClusters * sizeof(float) ); //To support reduction,numThreadsPerClusterBlock *must* be a power of two, and it *must* be no larger than the number of bits that will fit into an unsigned char ,the type used to keep track of membership changes in the kernel. const unsigned int numThreadsPerClusterBlock = 32; const unsigned int numClusterBlocks = (numObjs + numThreadsPerClusterBlock -1)/numThreadsPerClusterBlock; const unsigned int numReductionThreads = nextPowerOfTwo(numClusterBlocks); const unsigned int clusterBlockSharedDataSize = numThreadsPerClusterBlock * sizeof(unsigned char); const unsigned int reductionBlockSharedDataSize = numReductionThreads * sizeof(unsigned int); cudaMalloc(&deviceObjects,numObjs*numCoords*sizeof(float)); cudaMalloc(&deviceClusters,numClusters*numCoords*sizeof(float)); cudaMalloc(&deviceMembership,numObjs*sizeof(int)); cudaMalloc(&deviceIntermediates,numReductionThreads*sizeof(unsigned int)); cudaMemcpy(deviceObjects,dimObjects[0],numObjs*numCoords*sizeof(float),cudaMemcpyHostToDevice); cudaMemcpy(deviceMembership,membership,numObjs*sizeof(int),cudaMemcpyHostToDevice); do { cudaMemcpy(deviceClusters,dimClusters[0],numClusters*numCoords*sizeof(float),cudaMemcpyHostToDevice); find_nearest_cluster<<<numClusterBlocks,numThreadsPerClusterBlock,clusterBlockSharedDataSize>>>(numCoords,numObjs,numClusters,deviceObjects,deviceClusters,deviceMembership,deviceIntermediates); cudaDeviceSynchronize(); compute_delta<<<1,numReductionThreads,reductionBlockSharedDataSize>>>(deviceIntermediates,numClusterBlocks,numReductionThreads); cudaDeviceSynchronize(); int d; cudaMemcpy(&d,deviceIntermediates,sizeof(int),cudaMemcpyDeviceToHost); delta = (float)d; cudaMemcpy(membership,deviceMembership,numObjs*sizeof(int),cudaMemcpyDeviceToHost); for(int i=0;i<numObjs;i++) { //find the array index of nestest index = membership[i]; //update new cluster centers:sum of objects located within newClusterSize[index]++; for(int j=0;j<numCoords;j++) { newClusters[j][index] += objects[i][j]; } } //average the sum and replace old cluster centers with newClusters for(int i=0;i<numClusters;i++) { for(int j=0;j<numCoords;j++) { if(newClusterSize[i] > 0) dimClusters[j][i] = newClusters[j][i]/newClusterSize[i]; newClusters[j][i] = 0.0;//set back to 0 } newClusterSize[i] = 0 ; //set back to 0 } delta /= numObjs; }while( delta > threshold && loop++ < 500 ); loop_iterations = loop + 1; malloc2D(clusters,numClusters,numCoords,float); for(int i=0;i<numClusters;i++) { for(int j=0;j<numCoords;j++) { clusters[i][j] = dimClusters[j][i]; } } cudaFree(deviceObjects) ; cudaFree(deviceClusters); cudaFree(deviceMembership); cudaFree(deviceMembership); free(dimObjects[0]); free(dimObjects); free(dimClusters[0]); free(dimClusters); free(newClusters[0]); free(newClusters); free(newClusterSize); } void KMEANS::file_read(char *fn) { FILE *infile; char *line = new char[MAX_CHAR_PER_LINE]; int lineLen = MAX_CHAR_PER_LINE; filename = fn; infile = fopen(filename,"r"); assert(infile!=NULL); /*find the number of objects*/ while( fgets(line,lineLen,infile) ) { numObjs++; } /*find the dimension of each object*/ rewind(infile); while( fgets(line,lineLen,infile)!=NULL ) { if( strtok(line," \t\n")!=0 ) { while( strtok(NULL," \t\n") ) numCoords++; break; } } /*allocate space for object[][] and read all objcet*/ rewind(infile); objects = new float*[numObjs]; for(int i=0;i<numObjs;i++) { objects[i] = new float[numCoords]; } int i=0; /*read all object*/ while( fgets(line,lineLen,infile)!=NULL ) { if( strtok(line," \t\n") ==NULL ) continue; for(int j=0;j<numCoords;j++) { objects[i][j] = atof( strtok(NULL," ,\t\n") ) ; } i++; } /* membership: the cluster id for each data object */ membership = new int[numObjs]; assert(membership!=NULL); for(int i=0;i<numObjs;i++) membership[i] = -1; } int main(int argc,char *argv[]) { KMEANS kmeans(atoi(argv[1])); kmeans.file_read(argv[2]); kmeans.cuda_kmeans(); kmeans.file_write(); return 0; }
target: nvcc cuda_kmeans.cu ./a.out 4 ./Image_data/color100.txt
运行代码:
kmeans的cuda实现代码相对复杂,在阅读的过程中可能会有困难,有问题请留言~
Author:忆之独秀
Email:leaguenew@qq.com
注明出处:http://blog.csdn.net/lavorange/article/details/41942323
原文地址:http://blog.csdn.net/lavorange/article/details/41942323