#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define ARRAY_SIZE (1024*100-1)
#define WIDTH 32
#define WIDTH2 1024
#define SEED 1 //(unsigned int)clock()
#define CEIL(x,y) (( x - 1 ) / y + 1)

typedef float format; // int or float

void checkCudaError(cudaError input)
{
    if (input != cudaSuccess)
    {
        printf("\n\tfound a CUDA error!");
        exit(1);
    }
    return;
}

// Naive reduction: each thread accumulates a strided slice of the input
// into shared memory, then thread 0 sums the shared array serially.
__global__ void add_naive(format *in, format *out)
{
    extern __shared__ format sdata[];
    sdata[threadIdx.x] = 0;

    int id = threadIdx.x;
    while (id < ARRAY_SIZE)
    {
        sdata[threadIdx.x] += in[id];
        id += blockDim.x;
    }
    __syncthreads();

    if (threadIdx.x == 0) // thread 0 reduces sdata on its own
    {
        for (id = 1; id < blockDim.x; id++)
            sdata[0] += sdata[id];
        *out = sdata[0];
    }
    return;
}

// Tree reduction: halve the active range each step; the (wall, s) pair
// also handles ranges that are not a power of two.
// Note: the kernel ignores blockIdx.x, so every launched block computes
// the full sum and writes the same value to *out (redundant but correct).
__global__ void add_reduce(format *in, format *out)
{
    extern __shared__ format sdata[];
    sdata[threadIdx.x] = 0;

    int id = threadIdx.x;
    int wall = WIDTH2;
    while (id < ARRAY_SIZE)
    {
        sdata[threadIdx.x] += in[id];
        id += blockDim.x;
    }
    __syncthreads();

    for (int s = (wall + 1) / 2; s > 1 && s < wall; wall = (wall + 1) / 2, s = (s + 1) / 2)
    {
        if (threadIdx.x < s)
            sdata[threadIdx.x] += (threadIdx.x + s >= wall) ? 0 : sdata[threadIdx.x + s];
        __syncthreads(); // barrier must sit inside the loop, between reduction
                         // steps; every thread iterates the loop the same
                         // number of times, so the barrier is uniform
    }

    if (threadIdx.x == 0)
        *out = sdata[0] + sdata[1];
    return;
}

int main()
{
    int i;
    format h_in[ARRAY_SIZE], cpu_sum, h_out_naive, h_out_reduce;
    format *d_in, *d_out_naive, *d_out_reduce;
    cudaEvent_t start, stop;
    float elapsedTime1, elapsedTime2;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    checkCudaError(cudaMalloc((void **)&d_in, sizeof(format) * ARRAY_SIZE));
    checkCudaError(cudaMalloc((void **)&d_out_naive, sizeof(format)));
    checkCudaError(cudaMalloc((void **)&d_out_reduce, sizeof(format)));

    srand(SEED);
    for (i = 0, cpu_sum = 0; i < ARRAY_SIZE; i++)
    {
        h_in[i] = rand();
        cpu_sum += h_in[i]; // reference sum on the CPU
        //printf("%f\n", h_in[i]);
    }

    checkCudaError(cudaMemcpy(d_in, h_in, sizeof(format) * ARRAY_SIZE, cudaMemcpyHostToDevice));

    // Time the naive kernel: a single block of WIDTH threads.
    cudaEventRecord(start, 0);
    add_naive <<< 1, WIDTH, sizeof(format) * WIDTH >>> (d_in, d_out_naive);
    checkCudaError(cudaMemcpy(&h_out_naive, d_out_naive, sizeof(format), cudaMemcpyDeviceToHost));
    cudaDeviceSynchronize(); // cudaThreadSynchronize() is deprecated
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime1, start, stop);

    // Time the tree-reduction kernel: WIDTH2 threads per block.
    cudaEventRecord(start, 0);
    add_reduce <<< CEIL(ARRAY_SIZE, WIDTH2), WIDTH2, sizeof(format) * WIDTH2 >>> (d_in, d_out_reduce);
    checkCudaError(cudaMemcpy(&h_out_reduce, d_out_reduce, sizeof(format), cudaMemcpyDeviceToHost));
    cudaDeviceSynchronize();
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime2, start, stop);

    printf("\n\tCPU:\t\t%f\n\tGPU naive:\t%f\tTime:\t%6.2f ms\n\tGPU reduce:\t%f\tTime:\t%6.2f ms\n",
        (float)cpu_sum, (float)h_out_naive, elapsedTime1, (float)h_out_reduce, elapsedTime2);

    cudaFree(d_in);
    cudaFree(d_out_naive);
    cudaFree(d_out_reduce);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    getchar();
    return 0;
}
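A minimal way to build and run the program, assuming the CUDA toolkit is installed and nvcc is on the PATH (the file name reduce.cu is just a placeholder):

    nvcc -o reduce reduce.cu
    ./reduce

With format defined as float, expect the CPU and GPU sums to disagree in the low digits: accumulating roughly 100K values of rand() in single precision rounds at every addition, and the three code paths add the terms in different orders.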
Original article: http://www.cnblogs.com/cuancuancuanhao/p/7660621.html