Prefix sums are computed with three different GPU methods, and the results are checked against a CPU reference.
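(The scan computed here is the inclusive prefix sum: output element i is the sum of input elements 0 through i. For example, the inclusive prefix sum of [3, 1, 4, 1, 5] is [3, 4, 8, 9, 14].)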
#include <stdio.h>
#include <stdlib.h>
#include <windows.h>
#include <time.h>
#include <math.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define ARRAY_SIZE (1024*7+419)
#define SIMPLE_TASK 1024
#define WIDTH 256
#define SEED 1 //(unsigned int)clock()
#define RAND_RANGE 10 // random numbers drawn from [-RAND_RANGE, +RAND_RANGE]
#define MIN(x,y) ((x)<(y)?(x):(y))
#define CEIL(x,y) (int)((((x) - 1) / (y)) + 1)

typedef int format; // int or float

void checkCudaError(cudaError input)
{
    if (input != cudaSuccess)
    {
        printf("\n\tfind a cudaError!");
        exit(1);
    }
    return;
}

int ceil_base2(int input) // round input up to the nearest power of two
{
    int i;
    for (i = 1; input > 1; input = (input + 1) / 2, i *= 2);
    return i;
}

int checkResult(format *in1, format *in2, const int length)
// Returns 0 if the two vectors are equal, otherwise "index of the first mismatch plus one"
// (so a mismatch at element 0 is still reported); subtract 1 before using the index.
{
    int i;
    for (i = 0; i < length; i++)
    {
        if (in1[i] != in2[i])
            return i + 1;
    }
    return 0;
}

void prefixSumCPU(const format *in, format *out, const int array_size)
{
    out[0] = in[0];
    for (int i = 1; i < array_size; i++)
        out[i] = out[i - 1] + in[i];
    return;
}

__global__ void select(const format *in, format *out, const int space) // pick equally spaced elements from in into out
{
    out[threadIdx.x] = (threadIdx.x == 0) ? 0 : in[(threadIdx.x) * space - 1];
    return;
}

__global__ void add(format *in, const format *addend, const int array_size) // add one particular element of addend to every element of the block
{
    __shared__ format share_addend;
    share_addend = addend[blockIdx.x];
    __syncthreads(); // make the shared value visible to every thread before it is used

    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < array_size)
        in[id] += share_addend;
    return;
}

__global__ void prefixSumGPU1(const format *in, format *out, const int array_size) // low-efficiency segmented inclusive scan, handles any length
{
    extern __shared__ format share_in[];
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    int jump;
    share_in[threadIdx.x] = 0;
    if (id < array_size)
        share_in[threadIdx.x] = in[id];
    __syncthreads();

    for (jump = 1; jump < blockDim.x; jump *= 2)
    {
        share_in[threadIdx.x] += (threadIdx.x >= jump) ? share_in[threadIdx.x - jump] : 0;
        __syncthreads();
    }
    if (id < array_size)
        out[id] = share_in[threadIdx.x];
    return;
}

__global__ void prefixSumGPU2(const format *in, format *out, const int array_size) // gather/distribute tree, handles a power-of-two number of elements
{
    extern __shared__ format share_in[];
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    int target;
    int jump;
    share_in[threadIdx.x] = 0;
    if (id < array_size)
        share_in[threadIdx.x] = in[id];
    __syncthreads();

    /* // gather tree, equivalent to the one below, but it uses scattered threads, so read/write efficiency is lower
    for (jump = 1; jump < blockDim.x; jump *= 2)
    {
        share_in[threadIdx.x] += ((threadIdx.x + 1) % (2 * jump) == 0) ? share_in[threadIdx.x - jump] : 0;
        __syncthreads();
    }
    */
    for (jump = 1; jump < blockDim.x; jump *= 2) // gather tree using consecutive threads, which reduces branch divergence
    {
        target = (threadIdx.x + 1) * 2 * jump - 1;
        if (target < blockDim.x)
            share_in[target] += share_in[target - jump];
        __syncthreads();
    }
    for (jump = blockDim.x / 4; jump > 0; jump /= 2) // distribute tree
    {
        target = (threadIdx.x + 1) * 2 * jump - 1;
        if (target + jump < blockDim.x)
            share_in[target + jump] += share_in[target];
        __syncthreads();
    }
    if (id < array_size)
        out[id] = share_in[threadIdx.x];
    return;
}

__global__ void prefixSumGPU3(const format *in, format *out, const int array_size) // improved gather/distribute tree: each block handles twice as many elements as it has threads
{
    extern __shared__ format share_in[];
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    int target;
    int jump;
    share_in[2 * threadIdx.x] = 0;
    share_in[2 * threadIdx.x + 1] = 0;
    if (2 * id < array_size)                    // changed: each thread moves two global-memory elements,
        share_in[2 * threadIdx.x] = in[2 * id]; // with each index checked separately so an odd element count never reads past the end
    if (2 * id + 1 < array_size)
        share_in[2 * threadIdx.x + 1] = in[2 * id + 1];
    __syncthreads();

    for (jump = 1; jump < 2 * blockDim.x; jump *= 2) // changed: raised the upper limit of jump
    {
        target = (threadIdx.x + 1) * 2 * jump - 1;
        if (target < 2 * blockDim.x) // changed: widened the index range of the receiving element
            share_in[target] += share_in[target - jump];
        __syncthreads();
    }
    for (jump = blockDim.x / 2; jump > 0; jump /= 2) // changed: raised the initial value of jump
    {
        target = (threadIdx.x + 1) * 2 * jump - 1;
        if (target + jump < 2 * blockDim.x) // changed: widened the index range of the receiving element
            share_in[target + jump] += share_in[target];
        __syncthreads();
    }
    if (2 * id < array_size)                     // changed: each thread writes two elements back, again bounds-checked separately
        out[2 * id] = share_in[2 * threadIdx.x];
    if (2 * id + 1 < array_size)
        out[2 * id + 1] = share_in[2 * threadIdx.x + 1];
    return;
}

int main()
{
    int i;
    int simple_task = MIN(ceil_base2(ARRAY_SIZE), SIMPLE_TASK); // either round the array size up to the nearest power of two, or cap it at 1024 elements
    format h_in[ARRAY_SIZE], cpu_out[ARRAY_SIZE], gpu_out[ARRAY_SIZE];
    format *d_in, *d_out, *d_temp, *d_temp2;

    cudaStream_t stream;
    cudaStreamCreate(&stream);
    clock_t time;
    cudaEvent_t start, stop;
    float elapsedTime1, elapsedTime2, elapsedTime3, elapsedTime4;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    checkCudaError(cudaMalloc((void **)&d_in, sizeof(format) * ARRAY_SIZE));
    checkCudaError(cudaMalloc((void **)&d_out, sizeof(format) * ARRAY_SIZE));
    int d_temp_length = CEIL(ARRAY_SIZE, WIDTH); // number of thread blocks the array is split into on the first pass
    checkCudaError(cudaMalloc((void **)&d_temp, sizeof(format) * d_temp_length));

    srand(SEED);
    for (i = 0; i < ARRAY_SIZE; i++)
        h_in[i] = rand() % (2 * RAND_RANGE + 1) - RAND_RANGE;

    time = clock();
    prefixSumCPU(h_in, cpu_out, ARRAY_SIZE);
    time = clock() - time;

    cudaMemcpy(d_in, h_in, sizeof(format) * ARRAY_SIZE, cudaMemcpyHostToDevice);

    cudaMemset(d_out, 0, sizeof(format) * ARRAY_SIZE);
    cudaEventRecord(start, 0);
    prefixSumGPU1 <<< 1, simple_task, sizeof(format) * simple_task >>> (d_in, d_out, MIN(ARRAY_SIZE, SIMPLE_TASK));
    cudaMemcpy(gpu_out, d_out, sizeof(format) * MIN(ARRAY_SIZE, SIMPLE_TASK), cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime1, start, stop);
    if ((i = checkResult(cpu_out, gpu_out, MIN(ARRAY_SIZE, SIMPLE_TASK))) != 0)
        printf("\n\tCompute error at i = %d\n\tcpu_out[i] = %10d, gpu_out[i] = %10d\n", i - 1, cpu_out[i - 1], gpu_out[i - 1]);
    else
        printf("\n\tGPU1 Compute correctly!\n");

    cudaMemset(d_out, 0, sizeof(format) * ARRAY_SIZE);
    cudaEventRecord(start, 0);
    prefixSumGPU2 <<< 1, simple_task, sizeof(format) * simple_task >>> (d_in, d_out, MIN(ARRAY_SIZE, SIMPLE_TASK));
    cudaMemcpy(gpu_out, d_out, sizeof(format) * MIN(ARRAY_SIZE, SIMPLE_TASK), cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime2, start, stop);
    if ((i = checkResult(cpu_out, gpu_out, MIN(ARRAY_SIZE, SIMPLE_TASK))) != 0)
        printf("\n\tCompute error at i = %d\n\tcpu_out[i] = %10d, gpu_out[i] = %10d\n", i - 1, cpu_out[i - 1], gpu_out[i - 1]);
    else
        printf("\n\tGPU2 Compute correctly!\n");

    cudaMemset(d_out, 0, sizeof(format) * ARRAY_SIZE);
    cudaEventRecord(start, 0);
    prefixSumGPU3 <<< 1, simple_task / 2, sizeof(format) * simple_task >>> (d_in, d_out, MIN(ARRAY_SIZE, SIMPLE_TASK));
    cudaMemcpy(gpu_out, d_out, sizeof(format) * MIN(ARRAY_SIZE, SIMPLE_TASK), cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime3, start, stop);
    if ((i = checkResult(cpu_out, gpu_out, MIN(ARRAY_SIZE, SIMPLE_TASK))) != 0)
        printf("\n\tCompute error at i = %d\n\tcpu_out[i] = %10d, gpu_out[i] = %10d\n", i - 1, cpu_out[i - 1], gpu_out[i - 1]);
    else
        printf("\n\tGPU3 Compute correctly!\n");

    // Prefix sum of a long vector, assuming its length is at most WIDTH^2 (two passes suffice).
    cudaMemset(d_out, 0, sizeof(format) * ARRAY_SIZE);
    cudaEventRecord(start, 0);
    prefixSumGPU3 <<< d_temp_length, WIDTH / 2, sizeof(format) * WIDTH, stream >>> (d_in, d_out, ARRAY_SIZE);
    select <<< 1, d_temp_length, 0, stream >>> (d_out, d_temp, WIDTH);
    prefixSumGPU3 <<< 1, ceil_base2(d_temp_length), sizeof(format) * 2 * ceil_base2(d_temp_length), stream >>> (d_temp, d_temp, d_temp_length);
    add <<< d_temp_length, WIDTH, 0, stream >>> (d_out, d_temp, ARRAY_SIZE);
    cudaMemcpy(gpu_out, d_out, sizeof(format) * ARRAY_SIZE, cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime4, start, stop);
    if ((i = checkResult(cpu_out, gpu_out, ARRAY_SIZE)) != 0)
        printf("\n\tCompute error at i = %d\n\tcpu_out[i] = %10d, gpu_out[i] = %10d\n", i - 1, cpu_out[i - 1], gpu_out[i - 1]);
    else
        printf("\n\tGPU4 Compute correctly!\n");

    // Prefix sum of a long vector that may need several passes.
    // TODO

    printf("\n\tSpending time:\n\tCPU:\t%10ld ms\
\n\tGPU1:\t%f ms\n\tGPU2:\t%f ms\n\tGPU3:\t%f ms\n\tGPU4:\t%f ms\n",
        time, elapsedTime1, elapsedTime2, elapsedTime3, elapsedTime4);

    cudaFree(d_in);
    cudaFree(d_out);
    cudaFree(d_temp);
    cudaStreamDestroy(stream);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    getchar();
    return 0;
}
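To make the gather/distribute ("up-sweep / down-sweep") structure of prefixSumGPU2 concrete, here is a hand trace that is not part of the original post, for one block of blockDim.x = 8 threads scanning eight 1s:

    initial:              share_in = [1, 1, 1, 1, 1, 1, 1, 1]
    gather,     jump = 1: share_in = [1, 2, 1, 2, 1, 2, 1, 2]
    gather,     jump = 2: share_in = [1, 2, 1, 4, 1, 2, 1, 4]
    gather,     jump = 4: share_in = [1, 2, 1, 4, 1, 2, 1, 8]
    distribute, jump = 2: share_in = [1, 2, 1, 4, 1, 6, 1, 8]
    distribute, jump = 1: share_in = [1, 2, 3, 4, 5, 6, 7, 8]

After the gather phase, every index of the form (k + 1) * 2 * jump - 1 holds the sum of the 2 * jump elements ending there; the distribute phase then pushes those partial sums back down, leaving the inclusive scan. prefixSumGPU3 runs the same two phases but gives each thread two shared-memory slots, so a block of N threads scans 2N elements. The long-vector test ("GPU4") chains these pieces: scan each WIDTH-element block, gather the block totals with select, scan those totals, and add them back with add.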
The results are shown in the figure below. The first method has a bug that cannot be reproduced reliably and only starts to appear once the input array is larger than 512 elements; the cause is unknown. The other methods all produce correct results. For short vectors the third method (the improved gather/distribute tree) is the most efficient, and when the vector is much longer than the chosen threshold of 1024 elements (the maximum handled by a single thread block here), the fourth scheme becomes very efficient. Because of warm-up effects, the fourth scheme can even scan the whole vector faster than the third scheme scans just the first 1024 elements.
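A plausible cause for those intermittent errors, not identified in the original post and offered here only as an assumption: prefixSumGPU1 reads share_in[threadIdx.x - jump] and writes share_in[threadIdx.x] in the same step, so once the block spans more than one warp a thread may read its neighbor's slot either before or after that neighbor has updated it in the current iteration. A double-buffered variant of the same Hillis-Steele scan avoids the hazard. The sketch below reuses the post's format typedef; the kernel name prefixSumGPU1_db and the doubled dynamic shared-memory size are my additions, not code from the post.

__global__ void prefixSumGPU1_db(const format *in, format *out, const int array_size)
{
    // Requires 2 * blockDim.x * sizeof(format) bytes of dynamic shared memory:
    // one half is read while the other half is written, then the roles swap.
    extern __shared__ format buffer[];
    format *src = buffer;
    format *dst = buffer + blockDim.x;
    int id = blockIdx.x * blockDim.x + threadIdx.x;

    src[threadIdx.x] = (id < array_size) ? in[id] : 0;
    __syncthreads();

    for (int jump = 1; jump < blockDim.x; jump *= 2)
    {
        dst[threadIdx.x] = src[threadIdx.x]
            + ((threadIdx.x >= jump) ? src[threadIdx.x - jump] : 0);
        __syncthreads();                        // all writes to dst land before anyone reads it
        format *swap = src; src = dst; dst = swap; // per-thread pointer flip, no extra sync needed
    }
    if (id < array_size)
        out[id] = src[threadIdx.x];             // src now holds the last completed pass
    return;
}

Launched, for instance, as prefixSumGPU1_db<<<1, simple_task, 2 * sizeof(format) * simple_task>>>(d_in, d_out, MIN(ARRAY_SIZE, SIMPLE_TASK)), it computes the same inclusive scan as prefixSumGPU1 without the unsynchronized read-modify-write.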
Original post: http://www.cnblogs.com/cuancuancuanhao/p/7669637.html