In an OpenMP multithreaded program, each CPU thread makes its own CUDA calls to carry out part of the computation. This post is a simple example of combining OpenMP with CUDA.
▶ Source code:
#include <omp.h>
#include <stdio.h>
#include "device_launch_parameters.h"
#include <helper_cuda.h>

using namespace std;

__global__ void kernelAddConstant(int *g_a, const int b)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    g_a[idx] += b;
}

int correctResult(int *data, const int n, const int b)
{
    for (int i = 0; i < n; i++)
    {
        if (data[i] != i + b)
            return 0;
    }
    return 1;
}

int main(int argc, char *argv[])
{
    // detect the available CUDA GPUs and print their names
    int num_gpus = 0;
    printf("%s Starting...\n\n", argv[0]);
    cudaGetDeviceCount(&num_gpus);
    if (num_gpus < 1)
    {
        printf("no CUDA capable devices were detected\n");
        return 1;
    }
    printf("number of host CPUs:\t%d\n", omp_get_num_procs());
    printf("number of CUDA devices:\t%d\n", num_gpus);
    for (int i = 0; i < num_gpus; i++)
    {
        cudaDeviceProp dprop;
        cudaGetDeviceProperties(&dprop, i);
        printf("   %d: %s\n", i, dprop.name);
    }
    printf("---------------------------\n");

    // initialize the host data
    unsigned int n = num_gpus * 8192;
    unsigned int nbytes = n * sizeof(int);
    int *a = 0;
    int b = 3;
    a = (int *)malloc(nbytes);
    if (0 == a)
    {
        printf("couldn't allocate CPU memory\n");
        return 1;
    }
    for (unsigned int i = 0; i < n; i++)
        a[i] = i;

    // computation
    omp_set_num_threads(num_gpus); // use as many CPU threads as there are CUDA devices; more is possible, e.g. 2 * num_gpus

#pragma omp parallel
    {
        unsigned int cpu_thread_id = omp_get_thread_num();    // index of the current CPU thread
        unsigned int num_cpu_threads = omp_get_num_threads(); // total number of CPU threads

        int gpu_id = -1;
        cudaSetDevice(cpu_thread_id % num_gpus); // the % lets one GPU serve more than one CPU thread
        cudaGetDevice(&gpu_id);
        printf("CPU thread %d (of %d) uses CUDA device %d\n", cpu_thread_id, num_cpu_threads, gpu_id);

        int *d_a = 0;
        int *sub_a = a + cpu_thread_id * n / num_cpu_threads; // this thread's slice of the host array
        unsigned int nbytes_per_kernel = nbytes / num_cpu_threads;
        dim3 gpu_threads(128);
        dim3 gpu_blocks(n / (gpu_threads.x * num_cpu_threads));
        cudaMalloc((void **)&d_a, nbytes_per_kernel);
        cudaMemset(d_a, 0, nbytes_per_kernel);
        cudaMemcpy(d_a, sub_a, nbytes_per_kernel, cudaMemcpyHostToDevice);
        kernelAddConstant<<<gpu_blocks, gpu_threads>>>(d_a, b);
        cudaMemcpy(sub_a, d_a, nbytes_per_kernel, cudaMemcpyDeviceToHost);
        cudaFree(d_a);
    }
    printf("---------------------------\n");

    // check the result
    cudaError_t err = cudaGetLastError(); // read the error once: cudaGetLastError() resets the error state
    if (cudaSuccess != err)
        printf("%s\n", cudaGetErrorString(err));
    bool bResult = correctResult(a, n, b);
    if (a)
        free(a); // free CPU memory

    getchar();
    exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
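Two side notes. First, when building a .cu file like this with nvcc, the OpenMP switch has to be forwarded to the host compiler (e.g. -Xcompiler /openmp with MSVC, or -Xcompiler -fopenmp with gcc). Second, the listing only inspects cudaGetLastError() once at the very end; while debugging it is often handy to wrap every runtime call instead. Below is a minimal sketch of such a wrapper; the macro name CUDA_CHECK and the small main() are my own illustration, not part of the original post (the samples' helper_cuda.h that the listing already includes provides a similar checkCudaErrors macro).

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Hypothetical helper macro: print the failing call's location and abort.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error \"%s\" at %s:%d\n",                  \
                    cudaGetErrorString(err_), __FILE__, __LINE__);           \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

int main()
{
    int *d_a = nullptr;
    CUDA_CHECK(cudaMalloc((void **)&d_a, 256 * sizeof(int))); // every runtime call gets checked
    CUDA_CHECK(cudaMemset(d_a, 0, 256 * sizeof(int)));
    CUDA_CHECK(cudaFree(d_a));
    printf("all CUDA runtime calls succeeded\n");
    return 0;
}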
▶ Output (with a single GeForce GTX 1070, num_gpus is 1, so omp_set_num_threads(num_gpus) launches only one CPU thread):
D:\Code\CUDA\ProjectTemp\x64\Debug\ProjectTemp.exe Starting...

number of host CPUs:    8
number of CUDA devices: 1
   0: GeForce GTX 1070
---------------------------
CPU thread 0 (of 1) uses CUDA device 0
---------------------------
▶ Takeaways
● A minimal pattern for using OpenMP:
Requires the header #include <omp.h>.
The parallel code block (a standalone runnable sketch follows after this list):
omp_set_num_threads(threads); // set the number of threads
#pragma omp parallel
{
    unsigned int cpu_thread_id = omp_get_thread_num();    // index of the current thread
    unsigned int num_cpu_threads = omp_get_num_threads(); // total number of threads
    ...
}
The functions involved (prototypes as declared in omp.h):
_OMPIMP void _OMPAPI omp_set_num_threads(int _Num_threads); // set the number of threads

_OMPIMP int _OMPAPI omp_get_thread_num(void); // get the index of the current thread

_OMPIMP int _OMPAPI omp_get_num_threads(void); // get the total number of threads
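To verify the OpenMP side in isolation (no CUDA involved), the three functions above can be exercised with a minimal standalone sketch; the thread count of 4 is just an assumed demo value, and the file must be built with OpenMP enabled (/openmp under MSVC, -fopenmp under gcc/clang).

#include <omp.h>
#include <cstdio>

int main()
{
    omp_set_num_threads(4); // request a team of 4 threads (arbitrary demo value)

#pragma omp parallel
    {
        int cpu_thread_id = omp_get_thread_num();    // index of the current thread, 0..N-1
        int num_cpu_threads = omp_get_num_threads(); // total number of threads in the team
        printf("hello from CPU thread %d of %d\n", cpu_thread_id, num_cpu_threads);
    }
    return 0;
}

The CUDA example above relies on exactly this pattern: each thread uses its cpu_thread_id to pick a device and to locate its slice of the host array.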
Original post: http://www.cnblogs.com/cuancuancuanhao/p/7742951.html