标签:数据 tap 递归调用 erb 范围 break reads compute let
利用CUDA动态并行实现快排算法(有单线程的递归调用)
源代码:
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
// NOTE: these are machine-specific absolute paths to the CUDA samples' helper headers.
// When building on another machine, point them at that machine's samples "common/inc" directory.
#include <D:\Program\CUDA\Samples\common\inc\helper_cuda.h>
#include <D:\Program\CUDA\Samples\common\inc\helper_string.h>

#define MAX_DEPTH 16        // recursion depth at which the kernel stops launching children
#define INSERTION_SORT 32   // ranges with right - left <= this value are sorted directly

// Sorts data[left..right] (both bounds inclusive) in ascending order with selection sort.
// Device code executed by the single calling thread. Used once the recursion depth reaches
// MAX_DEPTH or the range is small enough (see cdp_simple_quicksort).
__device__ void selection_sort(unsigned int *data, int left, int right)
{
    for (int i = left; i <= right; ++i)
    {
        unsigned min_val = data[i];
        int min_idx = i;

        // Find the smallest element in data[i..right] and remember its index.
        for (int j = i + 1; j <= right; ++j)
        {
            unsigned val_j = data[j];

            if (val_j < min_val)
            {
                min_idx = j;
                min_val = val_j;
            }
        }

        // Move the smallest element into slot i.
        if (i != min_idx)
        {
            data[min_idx] = data[i];
            data[i] = min_val;
        }
    }
}

// Quicksort of data[left..right] (both bounds inclusive) using CUDA dynamic parallelism.
//
// Launch layout: every launch in this file uses <<<1, 1>>>, so each invocation runs on a single
// thread and partitions its range sequentially. The two resulting partitions are sorted by child
// launches of this same kernel, each placed in its own non-blocking stream.
//
// data  : device pointer to the array being sorted in place
// left  : index of the first element of the range
// right : index of the last element of the range
// depth : current recursion depth (0 for the launch issued from the host)
//
// main() refuses to run on devices below compute capability 3.5, which this kernel relies on.
__global__ void cdp_simple_quicksort(unsigned int *data, int left, int right, int depth)
{
    // Too deep or too small: finish this range directly without launching children.
    if (depth >= MAX_DEPTH || right - left <= INSERTION_SORT)
    {
        selection_sort(data, left, right);
        return;
    }

    unsigned int *lptr = data + left;
    unsigned int *rptr = data + right;
    // Middle element as pivot; written as left + half-width so the index sum cannot overflow.
    unsigned int pivot = data[left + (right - left) / 2];

    // Partition: everything left of lptr ends up <= pivot, everything right of rptr >= pivot.
    while (lptr <= rptr)
    {
        // Values currently under the left and right cursors.
        unsigned int lval = *lptr;
        unsigned int rval = *rptr;

        // Advance the left cursor past elements smaller than the pivot.
        while (lval < pivot)
        {
            lptr++;
            lval = *lptr;
        }

        // Retreat the right cursor past elements larger than the pivot.
        while (rval > pivot)
        {
            rptr--;
            rval = *rptr;
        }

        // Cursors have not crossed: swap the two out-of-place values and step both cursors.
        if (lptr <= rptr)
        {
            *lptr++ = rval;
            *rptr-- = lval;
        }
    }

    // Index bounds of the two partitions.
    int nright = rptr - data;
    int nleft = lptr - data;

    // Sort each non-trivial partition with a child launch in its own stream.
    // The stream handle is destroyed right after the launch has been issued.
    if (left < nright)
    {
        cudaStream_t s;
        cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking);
        cdp_simple_quicksort<<<1, 1, 0, s>>>(data, left, nright, depth + 1);
        cudaStreamDestroy(s);
    }
    if (nleft < right)
    {
        cudaStream_t s1;
        cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
        cdp_simple_quicksort<<<1, 1, 0, s1>>>(data, nleft, right, depth + 1);
        cudaStreamDestroy(s1);
    }
}

// Host entry point: sorts the nitems values at device pointer `data` in place.
// Launches the kernel with a single thread and blocks until the whole launch tree has finished.
// Any CUDA error (limit setup, launch configuration, execution) is reported by checkCudaErrors.
void run_qsort(unsigned int *data, unsigned int nitems)
{
    // Set the device-runtime sync depth limit to the maximum recursion depth used by the kernel.
    checkCudaErrors(cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, MAX_DEPTH));

    int left = 0;
    int right = nitems - 1;
    std::cout << "Launching kernel on the GPU" << std::endl;
    cdp_simple_quicksort<<<1, 1>>>(data, left, right, 0);
    checkCudaErrors(cudaGetLastError());        // launch-time errors
    checkCudaErrors(cudaDeviceSynchronize());   // errors raised while the kernels were running
}

// Fills dst[0..nitems) with pseudo-random values in [0, nitems), using a fixed seed so that
// every run sorts the same data.
void initialize_data(unsigned int *dst, unsigned int nitems)
{
    srand(2047);
    for (unsigned i = 0; i < nitems; i++)
        dst[i] = rand() % nitems;
}

// Copies the n sorted values back from device pointer results_d and verifies that they are in
// non-decreasing order. Prints "OK" on success; on the first violation prints the offending
// pair and terminates the program with EXIT_FAILURE.
void check_results(int n, unsigned int *results_d)
{
    unsigned int *results_h = new unsigned[n];
    checkCudaErrors(cudaMemcpy(results_h, results_d, n * sizeof(unsigned), cudaMemcpyDeviceToHost));

    for (int i = 1; i < n; ++i)
        if (results_h[i - 1] > results_h[i])
        {
            std::cout << "Invalid item[" << i - 1 << "]: " << results_h[i - 1] << " greater than " << results_h[i] << std::endl;
            delete[] results_h;
            exit(EXIT_FAILURE);
        }

    std::cout << "OK" << std::endl;
    delete[] results_h;
}

// Command-line options (parsed with the helper_string.h helpers):
//   help / h          print usage and exit
//   v                 print the generated input data
//   num_items=<n>     number of elements to sort (default 128, must be >= 1)
//   device=<id>       CUDA device to use (default: first device with compute capability >= 3.5)
int main(int argc, char **argv)
{
    int num_items = 128;
    bool verbose = false;   // whether to print h_data after initialization

    // Help mode.
    if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "h"))
    {
        std::cerr << "Usage: " << argv[0] << " num_items=<num_items>\twhere num_items is the number of items to sort" << std::endl;
        exit(EXIT_SUCCESS);
    }

    // Verbose mode: show the contents of the random input array.
    if (checkCmdLineFlag(argc, (const char **)argv, "v"))
        verbose = true;

    // User-specified array size.
    if (checkCmdLineFlag(argc, (const char **)argv, "num_items"))
    {
        num_items = getCmdLineArgumentInt(argc, (const char **)argv, "num_items");
        if (num_items < 1)
        {
            // The check accepts 1, so the message states the real lower bound.
            std::cerr << "ERROR: num_items has to be at least 1" << std::endl;
            exit(EXIT_FAILURE);
        }
    }

    // Device selection.
    int device_count = 0, device = -1;
    if (checkCmdLineFlag(argc, (const char **)argv, "device"))   // device given on the command line
    {
        device = getCmdLineArgumentInt(argc, (const char **)argv, "device");

        // An invalid device index makes this call fail; without the check `properties`
        // would be read uninitialized below.
        cudaDeviceProp properties;
        checkCudaErrors(cudaGetDeviceProperties(&properties, device));

        if (properties.major > 3 || (properties.major == 3 && properties.minor >= 5))
            std::cout << "Running on GPU " << device << " (" << properties.name << ")" << std::endl;
        else
        {
            std::cout << "ERROR: cdpsimpleQuicksort requires GPU devices with compute SM 3.5 or higher." << std::endl;
            std::cout << "Current GPU device has compute SM" << properties.major << "." << properties.minor << ". Exiting..." << std::endl;
            exit(EXIT_FAILURE);
        }
    }
    else   // no device given: pick the first one that supports dynamic parallelism
    {
        checkCudaErrors(cudaGetDeviceCount(&device_count));
        for (int i = 0; i < device_count; ++i)
        {
            cudaDeviceProp properties;
            checkCudaErrors(cudaGetDeviceProperties(&properties, i));

            if (properties.major > 3 || (properties.major == 3 && properties.minor >= 5))
            {
                device = i;
                std::cout << "Running on GPU " << i << " (" << properties.name << ")" << std::endl;
                break;
            }
            std::cout << "GPU " << i << " (" << properties.name << ") does not support CUDA Dynamic Parallelism" << std::endl;
        }
    }

    if (device == -1)
    {
        std::cerr << "cdpSimpleQuicksort requires GPU devices with compute SM 3.5 or higher. Exiting..." << std::endl;
        exit(EXIT_WAIVED);
    }

    checkCudaErrors(cudaSetDevice(device));

    // Create the data to sort.
    unsigned int *h_data = 0;
    unsigned int *d_data = 0;

    std::cout << "Initializing data." << std::endl;
    h_data = (unsigned int *)malloc(num_items * sizeof(unsigned int));
    if (h_data == 0)
    {
        std::cerr << "ERROR: failed to allocate host memory for " << num_items << " items" << std::endl;
        exit(EXIT_FAILURE);
    }
    initialize_data(h_data, num_items);

    if (verbose)
    {
        for (int i = 0; i < num_items; i++)
            std::cout << "Data [" << i << "]: " << h_data[i] << std::endl;
    }

    // Copy the data into device memory.
    checkCudaErrors(cudaMalloc((void **)&d_data, num_items * sizeof(unsigned int)));
    checkCudaErrors(cudaMemcpy(d_data, h_data, num_items * sizeof(unsigned int), cudaMemcpyHostToDevice));

    // Run the quicksort entry point.
    std::cout << "Running quicksort on " << num_items << " elements" << std::endl;
    run_qsort(d_data, num_items);

    // Validate the result.
    std::cout << "Validating results: ";
    check_results(num_items, d_data);

    free(h_data);
    checkCudaErrors(cudaFree(d_data));

    getchar();   // wait for a key press before exiting
    exit(EXIT_SUCCESS);
}
● 输出结果:
Running on GPU 0 (GeForce GTX 1070) Initializing data. Running quicksort on 128 elements Launching kernel on the GPU Validating results : OK
● 新姿势:
● C++动态数组
// Allocate an array of n unsigned ints whose size is chosen at run time.
unsigned int *h = new unsigned[n];
// Memory obtained with new[] is released with the matching delete[].
delete[] h;
● checkCmdLineFlag 用于检验命令行参数 argv 中是否存在名为 string_ref 的选项(比较前去掉前导的 -,忽略 = 及其后的取值,且不区分大小写;定义于 helper_string.h 中)
// STRNCASECMP: case-insensitive comparison of at most n characters.
// The quoted header maps it to _strnicmp, which MSVC declares in <string.h> as
//   int __cdecl _strnicmp(char const* _String1, char const* _String2, size_t _MaxCount);
// Other platforms provide the same operation as strncasecmp.
#ifndef STRNCASECMP
#if defined(_WIN32) || defined(_WIN64)
#define STRNCASECMP _strnicmp
#else
#define STRNCASECMP strncasecmp
#endif
#endif

// Returns the index of the first character of `string` that follows its leading run of
// `delimiter` characters; used below to skip the "-" / "--" in front of an option.
// Returns 0 (nothing is skipped) when that index is >= strlen(string) - 1.
// NOTE(review): because of that bound, a one-character option such as "-v" is returned with
// its dash intact, so only the bare form "v" matches a one-character flag. Behavior is kept
// exactly as quoted.
inline int stringRemoveDelimiter(char delimiter, const char *string)
{
    int string_start = 0;

    while (string[string_start] == delimiter)
    {
        string_start++;
    }

    if (string_start >= (int)strlen(string) - 1)
    {
        return 0;
    }

    return string_start;
}

// Returns true when any of argv[1..argc-1] names the option `string_ref`.
// For each argument the leading '-' characters are skipped (see stringRemoveDelimiter), the
// name is taken up to an optional '=', and it must equal string_ref in full, ignoring case.
inline bool checkCmdLineFlag(const int argc, const char **argv, const char *string_ref)
{
    bool bFound = false;

    if (argc >= 1)
    {
        for (int i = 1; i < argc; i++)
        {
            int string_start = stringRemoveDelimiter('-', argv[i]);
            const char *string_argv = &argv[i][string_start];

            // Length of the option name: everything before '=' if present, else the whole string.
            const char *equal_pos = strchr(string_argv, '=');
            int argv_length = (int)(equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);

            int length = (int)strlen(string_ref);

            if (length == argv_length && !STRNCASECMP(string_argv, string_ref, length))
            {
                bFound = true;
                continue;
            }
        }
    }

    return bFound;
}
● getCmdLineArgumentInt 用于提取函数参数argv中的整数(定义于helper_string.h中)
1 inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref) 2 { 3 bool bFound = false; 4 int value = -1; 5 6 if (argc >= 1) 7 { 8 for (int i = 1; i < argc; i++) 9 { 10 int string_start = stringRemoveDelimiter(‘-‘, argv[i]); 11 const char *string_argv = &argv[i][string_start]; 12 int length = (int)strlen(string_ref); 13 14 if (!STRNCASECMP(string_argv, string_ref, length)) 15 { 16 if (length + 1 <= (int)strlen(string_argv)) 17 { 18 int auto_inc = (string_argv[length] == ‘=‘) ? 1 : 0; 19 value = atoi(&string_argv[length + auto_inc]); 20 } 21 else 22 value = 0; 23 bFound = true; 24 continue; 25 } 26 } 27 } 28 if (bFound) 29 return value; 30 else 31 return 0;
● cudaDeviceSetLimit 用于设置设备的各项资源限制,源代码中通过 cudaLimitDevRuntimeSyncDepth 指定动态并行时设备运行时的最大同步嵌套深度(与最大递归深度 MAX_DEPTH 对应)
extern __host__ cudaError_t CUDARTAPI cudaDeviceSetLimit(enum cudaLimit limit, size_t value);
● 带有标志参数 flags 的 cudaStreamCreateWithFlags,用于指定流的行为(源代码中使用 cudaStreamNonBlocking,使该流不与默认流隐式同步);设置流优先级的是另一个函数 cudaStreamCreateWithPriority
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags);
对比 cudaStreamCreate
extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream);
●
标签:数据 tap 递归调用 erb 范围 break reads compute let
原文地址:http://www.cnblogs.com/cuancuancuanhao/p/7726121.html