标签:
用cuda计算向量加法A+B=C
流程:
1.申请主机内存。向量A,向量B,计算结果C
2.初始化数据。用0-1之间的随机数初始化向量A,B(向量C用于存放计算结果,无需初始化)
3.GPU内存申请。申请A,B,C需要的GPU内存空间
4.数据拷贝。把数据从主机内存拷贝至GPU内存
5.计算需要的线程数和线程块数。
6.调用GPU加法函数
7.数据拷贝。把结果从GPU内存拷贝至主机内存。
8.在CPU上重新运行一遍,与GPU结果进行对照。
9.释放GPU内存。
10.释放主机内存。
11.重置GPU状态。
要点:内存管理,数据拷贝。
代码:
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include <cuda_runtime.h>

/*
 * Evaluate a CUDA runtime call and abort the program if it did not return
 * cudaSuccess.
 *
 *   call - expression yielding a cudaError_t
 *   fmt  - string literal containing exactly one %s, which receives the
 *          text returned by cudaGetErrorString() for the failing status
 */
#define CUDA_CHECK_MSG(call, fmt)                                   \
    do {                                                            \
        cudaError_t err_ = (call);                                  \
        if (err_ != cudaSuccess) {                                  \
            fprintf(stderr, fmt, cudaGetErrorString(err_));         \
            exit(EXIT_FAILURE);                                     \
        }                                                           \
    } while (0)

/*
 * Element-wise vector addition: C[i] = A[i] + B[i] for 0 <= i < numElements.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per element. The launch
 * must provide at least numElements threads in total; surplus threads in the
 * last block are masked off by the bounds check. No shared memory is used.
 *
 *   A, B         - input vectors (read only)
 *   C            - output vector
 *   numElements  - number of elements to process in each vector
 */
__global__ void
vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < numElements)
    {
        C[i] = A[i] + B[i];
    }
}

/*
 * Host driver: allocates and fills two host vectors, copies them to the
 * device, runs vectorAdd, copies the result back, verifies it against a CPU
 * computation, then releases all resources.
 *
 * Returns 0 on success; any failure prints a message to stderr and exits
 * with EXIT_FAILURE.
 */
int main(void)
{
    // Vector length and the matching buffer size in bytes.
    int numElements = 50000;
    size_t size = numElements * sizeof(float);

    printf("[Vector addition of %d elements ]\n", numElements);

    // Allocate host buffers for A, B and the result C.
    float *host_A = (float *)malloc(size);
    float *host_B = (float *)malloc(size);
    float *host_C = (float *)malloc(size);
    if (host_A == NULL || host_B == NULL || host_C == NULL)
    {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }

    // Fill the inputs with random values in [0, 1]; C is written by the GPU.
    for (int i = 0; i < numElements; ++i)
    {
        host_A[i] = rand()/(float)RAND_MAX;
        host_B[i] = rand()/(float)RAND_MAX;
    }

    // Allocate the device buffers.
    float *device_A = NULL;
    CUDA_CHECK_MSG(cudaMalloc((void **)&device_A, size),
                   "对向量A申请cuda内存空间失败 (错误代码 %s)!\n");
    float *device_B = NULL;
    CUDA_CHECK_MSG(cudaMalloc((void **)&device_B, size),
                   "对向量B申请cuda内存空间失败 (错误代码 %s)!\n");
    float *device_C = NULL;
    CUDA_CHECK_MSG(cudaMalloc((void **)&device_C, size),
                   "对向量C申请cuda内存空间失败 (错误代码 %s)!\n");

    // Copy the inputs from host memory to device memory.
    printf("Copy input data from the host memory to the CUDA device\n");
    CUDA_CHECK_MSG(cudaMemcpy(device_A, host_A, size, cudaMemcpyHostToDevice),
                   "Failed to copy vector A from host to device (error code %s)!\n");
    CUDA_CHECK_MSG(cudaMemcpy(device_B, host_B, size, cudaMemcpyHostToDevice),
                   "Failed to copy vector B from host to device (error code %s)!\n");

    // Launch configuration: fixed block size, grid size rounded up so that
    // every element gets a thread.
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1)/ threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(device_A, device_B, device_C, numElements);

    // A kernel launch returns no status itself; query it explicitly.
    CUDA_CHECK_MSG(cudaGetLastError(),
                   "Failed to launch vectorAdd kernel (error code %s)!\n");
    // A fault inside the kernel is only reported by a later synchronizing
    // call; synchronizing here lets it be reported with its own message
    // instead of showing up as a copy failure below.
    CUDA_CHECK_MSG(cudaDeviceSynchronize(),
                   "Failed to synchronize after vectorAdd kernel (error code %s)!\n");

    // Copy the result from device memory back to host memory.
    printf("Copy output data from the CUDA device to the host memory\n");
    CUDA_CHECK_MSG(cudaMemcpy(host_C, device_C, size, cudaMemcpyDeviceToHost),
                   "计算结果拷贝回主机失败(错误代码:%s)\n");

    // Verify the GPU result against the same sum computed on the CPU.
    for(int i = 0; i < numElements; ++i)
    {
        if(fabs(host_A[i] + host_B[i] - host_C[i]) > 1e-5)
        {
            fprintf(stderr,"验证失败%d\n",i);
            exit(EXIT_FAILURE);
        }
    }
    printf("验证成功\n");

    // Release device memory, checking each call instead of discarding it.
    CUDA_CHECK_MSG(cudaFree(device_A),
                   "Failed to free device vector A (error code %s)!\n");
    CUDA_CHECK_MSG(cudaFree(device_B),
                   "Failed to free device vector B (error code %s)!\n");
    CUDA_CHECK_MSG(cudaFree(device_C),
                   "Failed to free device vector C (error code %s)!\n");

    // Release host memory.
    free(host_A);
    free(host_B);
    free(host_C);

    // Reset the device state before exiting.
    CUDA_CHECK_MSG(cudaDeviceReset(),
                   "Failed to reset the device (error code %s)!\n");
    printf("结束");
    return 0;
}
好烦……GPU搞起来真麻烦
标签:
原文地址:http://www.cnblogs.com/nwpuxuezha/p/4468860.html