码迷,mamicode.com
首页 > 其他好文 > 详细

cuda流测试=basic_single_stream

时间:2014-09-27 15:25:40      阅读:254      评论:0      收藏:0      [点我收藏+]

标签:des   style   blog   http   color   io   os   ar   for   

cuda流测试

  1 /*
  2 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
  3 *
  4 * NVIDIA Corporation and its licensors retain all intellectual property and
  5 * proprietary rights in and to this software and related documentation.
  6 * Any use, reproduction, disclosure, or distribution of this software
  7 * and related documentation without an express license agreement from
  8 * NVIDIA Corporation is strictly prohibited.
  9 *
 10 * Please refer to the applicable NVIDIA end user license agreement (EULA)
 11 * associated with this source code for terms and conditions that govern
 12 * your use of this NVIDIA software.
 13 *
 14 */
 15 
 16 
 17 #include "../common/book.h"
 18 #include "cuda.h"
 19 #include "cuda_runtime.h"
 20 #include "device_launch_parameters.h"
 21 #define N   (1024*1024)
 22 #define FULL_DATA_SIZE   (N*20)
 23 
 24 
 25 __global__ void kernel(int *a, int *b, int *c) {
 26     int idx = threadIdx.x + blockIdx.x * blockDim.x;
 27     if (idx < N) {
 28         //idx后两个数
 29         int idx1 = (idx + 1) % 256;
 30         int idx2 = (idx + 2) % 256;
 31         float   as = (a[idx] + a[idx1] + a[idx2]) / 3.0f;
 32         float   bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f;
 33         c[idx] = (as + bs) / 2;
 34     }
 35 }
 36 
 37 
 38 int main(void) {
 39     cudaDeviceProp  prop;
 40     int whichDevice;
 41     HANDLE_ERROR(cudaGetDevice(&whichDevice));
 42     HANDLE_ERROR(cudaGetDeviceProperties(&prop, whichDevice));
 43     if (!prop.deviceOverlap) {
 44         printf("Device will not handle overlaps, so no speed up from streams\n");
 45         return 0;
 46     }
 47 
 48     cudaEvent_t     start, stop;
 49     float           elapsedTime;
 50 
 51     cudaStream_t    stream;
 52     int *host_a, *host_b, *host_c;
 53     int *dev_a, *dev_b, *dev_c;
 54 
 55     // start the timers
 56     HANDLE_ERROR(cudaEventCreate(&start));
 57     HANDLE_ERROR(cudaEventCreate(&stop));
 58 
 59     //初始化流
 60     HANDLE_ERROR(cudaStreamCreate(&stream));
 61 
 62     // allocate the memory on the GPU
 63     HANDLE_ERROR(cudaMalloc((void**)&dev_a,
 64         N * sizeof(int)));
 65     HANDLE_ERROR(cudaMalloc((void**)&dev_b,
 66         N * sizeof(int)));
 67     HANDLE_ERROR(cudaMalloc((void**)&dev_c,
 68         N * sizeof(int)));
 69 
 70     //分配由于GPU访问的主机无分页内存(锁定内存页)
 71     HANDLE_ERROR(cudaHostAlloc((void**)&host_a,
 72         FULL_DATA_SIZE * sizeof(int),
 73         cudaHostAllocDefault));
 74     HANDLE_ERROR(cudaHostAlloc((void**)&host_b,
 75         FULL_DATA_SIZE * sizeof(int),
 76         cudaHostAllocDefault));
 77     HANDLE_ERROR(cudaHostAlloc((void**)&host_c,
 78         FULL_DATA_SIZE * sizeof(int),
 79         cudaHostAllocDefault));
 80 
 81     for (int i = 0; i<FULL_DATA_SIZE; i++) {
 82         host_a[i] = rand();
 83         host_b[i] = rand();
 84     }
 85 
 86     HANDLE_ERROR(cudaEventRecord(start, 0));
 87     // now loop over full data, in bite-sized chunks
 88     for (int i = 0; i<FULL_DATA_SIZE; i += N) {
 89         //异步复制主机上内存的值到设备上
 90         HANDLE_ERROR(cudaMemcpyAsync(dev_a, host_a + i,
 91             N * sizeof(int),
 92             cudaMemcpyHostToDevice,
 93             stream));
 94         HANDLE_ERROR(cudaMemcpyAsync(dev_b, host_b + i,
 95             N * sizeof(int),
 96             cudaMemcpyHostToDevice,
 97             stream));
 98 
 99         kernel << <N / 256, 256, 0, stream >> >(dev_a, dev_b, dev_c);
100 
101         //将计算的值复制会主机
102         HANDLE_ERROR(cudaMemcpyAsync(host_c + i, dev_c,
103             N * sizeof(int),
104             cudaMemcpyDeviceToHost,
105             stream));
106 
107     }
108     //从锁定页将结果块复制到主机内存
109     HANDLE_ERROR(cudaStreamSynchronize(stream));
110 
111     HANDLE_ERROR(cudaEventRecord(stop, 0));
112 
113     HANDLE_ERROR(cudaEventSynchronize(stop));
114     HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,
115         start, stop));
116     printf("Time taken:  %3.1f ms\n", elapsedTime);
117 
118     // cleanup the streams and memory
119     HANDLE_ERROR(cudaFreeHost(host_a));
120     HANDLE_ERROR(cudaFreeHost(host_b));
121     HANDLE_ERROR(cudaFreeHost(host_c));
122     HANDLE_ERROR(cudaFree(dev_a));
123     HANDLE_ERROR(cudaFree(dev_b));
124     HANDLE_ERROR(cudaFree(dev_c));
125     HANDLE_ERROR(cudaStreamDestroy(stream));
126 
127     return 0;
128 }

项目打包下载

cuda流测试=basic_single_stream

标签:des   style   blog   http   color   io   os   ar   for   

原文地址:http://www.cnblogs.com/liangliangdetianxia/p/3996333.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!