标签:des style blog http color io os ar for
1 /* 2 * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. 3 * 4 * NVIDIA Corporation and its licensors retain all intellectual property and 5 * proprietary rights in and to this software and related documentation. 6 * Any use, reproduction, disclosure, or distribution of this software 7 * and related documentation without an express license agreement from 8 * NVIDIA Corporation is strictly prohibited. 9 * 10 * Please refer to the applicable NVIDIA end user license agreement (EULA) 11 * associated with this source code for terms and conditions that govern 12 * your use of this NVIDIA software. 13 * 14 */ 15 16 17 #include "../common/book.h" 18 #include "cuda.h" 19 #include "cuda_runtime.h" 20 #include "device_launch_parameters.h" 21 #include "device_functions.h" 22 #define imin(a,b) (a<b?a:b) 23 24 const int N = 33 * 1024 * 1024; 25 const int threadsPerBlock = 256; 26 const int blocksPerGrid = 27 imin(32, (N + threadsPerBlock - 1) / threadsPerBlock); 28 29 30 __global__ void dot(int size, float *a, float *b, float *c) { 31 __shared__ float cache[threadsPerBlock]; 32 int tid = threadIdx.x + blockIdx.x * blockDim.x; 33 int cacheIndex = threadIdx.x; 34 35 float temp = 0; 36 while (tid < size) { 37 temp += a[tid] * b[tid]; 38 tid += blockDim.x * gridDim.x; 39 } 40 41 // set the cache values 42 cache[cacheIndex] = temp; 43 44 // synchronize threads in this block 45 __syncthreads(); 46 47 // for reductions, threadsPerBlock must be a power of 2 48 // because of the following code 49 int i = blockDim.x / 2; 50 while (i != 0) { 51 if (cacheIndex < i) 52 cache[cacheIndex] += cache[cacheIndex + i]; 53 __syncthreads(); 54 i /= 2; 55 } 56 57 if (cacheIndex == 0) 58 c[blockIdx.x] = cache[0]; 59 } 60 61 62 float malloc_test(int size) { 63 cudaEvent_t start, stop; 64 float *a, *b, c, *partial_c; 65 float *dev_a, *dev_b, *dev_partial_c; 66 float elapsedTime; 67 68 HANDLE_ERROR(cudaEventCreate(&start)); 69 HANDLE_ERROR(cudaEventCreate(&stop)); 70 71 // allocate memory on the CPU side 72 a = (float*)malloc(size*sizeof(float)); 73 b = (float*)malloc(size*sizeof(float)); 74 partial_c = (float*)malloc(blocksPerGrid*sizeof(float)); 75 76 // allocate the memory on the GPU 77 HANDLE_ERROR(cudaMalloc((void**)&dev_a, 78 size*sizeof(float))); 79 HANDLE_ERROR(cudaMalloc((void**)&dev_b, 80 size*sizeof(float))); 81 HANDLE_ERROR(cudaMalloc((void**)&dev_partial_c, 82 blocksPerGrid*sizeof(float))); 83 84 // fill in the host memory with data 85 for (int i = 0; i<size; i++) { 86 a[i] = i; 87 b[i] = i * 2; 88 } 89 90 HANDLE_ERROR(cudaEventRecord(start, 0)); 91 // copy the arrays ‘a‘ and ‘b‘ to the GPU 92 HANDLE_ERROR(cudaMemcpy(dev_a, a, size*sizeof(float), 93 cudaMemcpyHostToDevice)); 94 HANDLE_ERROR(cudaMemcpy(dev_b, b, size*sizeof(float), 95 cudaMemcpyHostToDevice)); 96 97 dot << <blocksPerGrid, threadsPerBlock >> >(size, dev_a, dev_b, 98 dev_partial_c); 99 // copy the array ‘c‘ back from the GPU to the CPU 100 HANDLE_ERROR(cudaMemcpy(partial_c, dev_partial_c, 101 blocksPerGrid*sizeof(float), 102 cudaMemcpyDeviceToHost)); 103 104 HANDLE_ERROR(cudaEventRecord(stop, 0)); 105 HANDLE_ERROR(cudaEventSynchronize(stop)); 106 HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, 107 start, stop)); 108 109 // finish up on the CPU side 110 c = 0; 111 for (int i = 0; i<blocksPerGrid; i++) { 112 c += partial_c[i]; 113 } 114 115 HANDLE_ERROR(cudaFree(dev_a)); 116 HANDLE_ERROR(cudaFree(dev_b)); 117 HANDLE_ERROR(cudaFree(dev_partial_c)); 118 119 // free memory on the CPU side 120 free(a); 121 free(b); 122 free(partial_c); 123 124 // free events 125 HANDLE_ERROR(cudaEventDestroy(start)); 126 HANDLE_ERROR(cudaEventDestroy(stop)); 127 128 printf("Value calculated: %f\n", c); 129 130 return elapsedTime; 131 } 132 133 134 float cuda_host_alloc_test(int size) { 135 cudaEvent_t start, stop; 136 float *a, *b, c, *partial_c; 137 float *dev_a, *dev_b, *dev_partial_c; 138 float elapsedTime; 139 140 HANDLE_ERROR(cudaEventCreate(&start)); 141 HANDLE_ERROR(cudaEventCreate(&stop)); 142 143 // allocate the memory on the CPU 144 HANDLE_ERROR(cudaHostAlloc((void**)&a, 145 size*sizeof(float), 146 cudaHostAllocWriteCombined | 147 cudaHostAllocMapped)); 148 HANDLE_ERROR(cudaHostAlloc((void**)&b, 149 size*sizeof(float), 150 cudaHostAllocWriteCombined | 151 cudaHostAllocMapped)); 152 HANDLE_ERROR(cudaHostAlloc((void**)&partial_c, 153 blocksPerGrid*sizeof(float), 154 cudaHostAllocMapped)); 155 156 // find out the GPU pointers 157 HANDLE_ERROR(cudaHostGetDevicePointer(&dev_a, a, 0)); 158 HANDLE_ERROR(cudaHostGetDevicePointer(&dev_b, b, 0)); 159 HANDLE_ERROR(cudaHostGetDevicePointer(&dev_partial_c, 160 partial_c, 0)); 161 162 // fill in the host memory with data 163 for (int i = 0; i<size; i++) { 164 a[i] = i; 165 b[i] = i * 2; 166 } 167 168 HANDLE_ERROR(cudaEventRecord(start, 0)); 169 170 dot << <blocksPerGrid, threadsPerBlock >> >(size, dev_a, dev_b, 171 dev_partial_c); 172 173 HANDLE_ERROR(cudaThreadSynchronize()); 174 HANDLE_ERROR(cudaEventRecord(stop, 0)); 175 HANDLE_ERROR(cudaEventSynchronize(stop)); 176 HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, 177 start, stop)); 178 179 // finish up on the CPU side 180 c = 0; 181 for (int i = 0; i<blocksPerGrid; i++) { 182 c += partial_c[i]; 183 } 184 185 HANDLE_ERROR(cudaFreeHost(a)); 186 HANDLE_ERROR(cudaFreeHost(b)); 187 HANDLE_ERROR(cudaFreeHost(partial_c)); 188 189 // free events 190 HANDLE_ERROR(cudaEventDestroy(start)); 191 HANDLE_ERROR(cudaEventDestroy(stop)); 192 193 printf("Value calculated: %f\n", c); 194 195 return elapsedTime; 196 } 197 198 199 int main(void) { 200 cudaDeviceProp prop; 201 int whichDevice; 202 HANDLE_ERROR(cudaGetDevice(&whichDevice)); 203 HANDLE_ERROR(cudaGetDeviceProperties(&prop, whichDevice)); 204 if (prop.canMapHostMemory != 1) { 205 printf("Device can not map memory.\n"); 206 return 0; 207 } 208 209 float elapsedTime; 210 211 HANDLE_ERROR(cudaSetDeviceFlags(cudaDeviceMapHost)); 212 213 // try it with malloc 214 elapsedTime = malloc_test(N); 215 printf("Time using cudaMalloc: %3.1f ms\n", 216 elapsedTime); 217 218 // now try it with cudaHostAlloc 219 elapsedTime = cuda_host_alloc_test(N); 220 printf("Time using cudaHostAlloc: %3.1f ms\n", 221 elapsedTime); 222 }
标签:des style blog http color io os ar for
原文地址:http://www.cnblogs.com/liangliangdetianxia/p/3996389.html