码迷,mamicode.com
首页 > 其他好文 > 详细

第七章 纹理内存

时间:2017-10-09 23:59:37      阅读:465      评论:0      收藏:0      [点我收藏+]

标签:kernel   自己   etc   har   释放   event   申请   bind   get   

本章介绍了纹理内存的使用,并给出了热传导的两个例子,分别使用了一维和二维纹理单元。

热传导(使用一维纹理)

  1 #include <stdio.h>
  2 #include "cuda_runtime.h"
  3 #include "device_launch_parameters.h"
  4 #include "D:\Code\CUDA\book\common\book.h"
  5 #include "D:\Code\CUDA\book\common\cpu_anim.h"
  6 
  7 #define DIM 1024
  8 #define PI 3.1415926535897932f
  9 #define MAX_TEMP 1.0f
 10 #define MIN_TEMP 0.0001f
 11 #define SPEED   0.25f
 12 
 13 //在全局位置上声明纹理引用,存在于GPU中
 14 texture<float>  texConstSrc;
 15 texture<float>  texIn;
 16 texture<float>  texOut;
 17 
 18 struct DataBlock
 19 {
 20     unsigned char   *output_bitmap;
 21     float           *dev_inSrc;
 22     float           *dev_outSrc;
 23     float           *dev_constSrc;
 24     CPUAnimBitmap  *bitmap;
 25     cudaEvent_t     start, stop;
 26     float           totalTime;
 27     float           frames;
 28 };
 29 
 30 __global__ void blend_kernel(float *dst, bool dstOut)
 31 {
 32     int x = threadIdx.x + blockIdx.x * blockDim.x;
 33     int y = threadIdx.y + blockIdx.y * blockDim.y;
 34     int offset = x + y * blockDim.x * gridDim.x;
 35 
 36     int left = offset - 1;//找到上下左右的块
 37     int right = offset + 1;
 38     int top = offset - DIM;
 39     int bottom = offset + DIM;
 40     if (x == 0)
 41         left++;
 42     if (x == DIM - 1)
 43         right--;
 44     if (y == 0)
 45         top += DIM;
 46     if (y == DIM - 1)
 47         bottom -= DIM;
 48     float   t, l, c, r, b;
 49     if (dstOut)
 50     {
 51         t = tex1Dfetch(texIn, top);
 52         l = tex1Dfetch(texIn, left);
 53         c = tex1Dfetch(texIn, offset);
 54         r = tex1Dfetch(texIn, right);
 55         b = tex1Dfetch(texIn, bottom);
 56     }
 57     else
 58     {
 59         t = tex1Dfetch(texOut, top);
 60         l = tex1Dfetch(texOut, left);
 61         c = tex1Dfetch(texOut, offset);
 62         r = tex1Dfetch(texOut, right);
 63         b = tex1Dfetch(texOut, bottom);
 64     }
 65 
 66     dst[offset] = c + SPEED * (t + b + r + l - 4 * c);
 67 
 68     return;
 69 }
 70 
 71 __global__ void copy_const_kernel(float *iptr)// 将恒温常量矩阵覆盖输入矩阵
 72 {
 73     int x = threadIdx.x + blockIdx.x * blockDim.x;
 74     int y = threadIdx.y + blockIdx.y * blockDim.y;
 75     int offset = x + y * blockDim.x * gridDim.x;
 76 
 77     float c = tex1Dfetch(texConstSrc, offset);
 78     if (c != 0)
 79         iptr[offset] = c;
 80 
 81     return;
 82 }
 83 
 84 void anim_gpu(DataBlock *d, int ticks)
 85 {
 86     cudaEventRecord(d->start, 0);
 87     dim3    blocks(DIM / 16, DIM / 16);
 88     dim3    threads(16, 16);
 89     CPUAnimBitmap  *bitmap = d->bitmap;
 90 
 91     volatile bool dstOut = true;//确定输入矩阵是哪一个,true代表dev_inSrc,false代表ev_outSrc
 92     for (int i = 0; i < 90; i++)
 93     {
 94         float   *in, *out;
 95         if (dstOut)
 96         {
 97             in = d->dev_inSrc;
 98             out = d->dev_outSrc;
 99         }
100         else
101         {
102             in = d->dev_outSrc;
103             out = d->dev_inSrc;
104         }
105 
106         copy_const_kernel << < blocks, threads >> > (in);
107         blend_kernel << < blocks, threads >> > (out, dstOut);
108         dstOut = !dstOut;
109     }
110     float_to_color << < blocks, threads >> > (d->output_bitmap, d->dev_inSrc);
111 
112     cudaMemcpy(bitmap->get_ptr(), d->output_bitmap, bitmap->image_size(), cudaMemcpyDeviceToHost);
113 
114     cudaEventRecord(d->stop, 0);
115     cudaEventSynchronize(d->stop);
116     float   elapsedTime;
117     cudaEventElapsedTime(&elapsedTime, d->start, d->stop);
118     d->totalTime += elapsedTime;
119     ++d->frames;
120     printf("Average Time per frame:  %3.1f ms\n", d->totalTime / d->frames);
121 }
122 
// Exit callback: unbinds the textures and releases every device resource that
// main() created.
//
//   d - simulation state whose device buffers and events are released
void anim_exit(DataBlock *d)
{
    cudaUnbindTexture(texIn);
    cudaUnbindTexture(texOut);
    cudaUnbindTexture(texConstSrc);

    cudaFree(d->dev_inSrc);
    cudaFree(d->dev_outSrc);
    cudaFree(d->dev_constSrc);
    // main() also allocates the pixel buffer with cudaMalloc; release it too.
    cudaFree(d->output_bitmap);

    cudaEventDestroy(d->start);
    cudaEventDestroy(d->stop);
}
136 
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Allocates the device buffers, binds them to the 1D texture references,
// uploads the constant-temperature cells and the initial temperature field,
// then hands control to the animation loop.
// Returns 0 on success and 1 when setup fails.
int main(void)
{
    DataBlock   data;
    CPUAnimBitmap bitmap(DIM, DIM, &data);
    data.bitmap = &bitmap;
    data.totalTime = 0;
    data.frames = 0;

    if (!cuda_ok(cudaEventCreate(&data.start), "cudaEventCreate(start)") ||
        !cuda_ok(cudaEventCreate(&data.stop), "cudaEventCreate(stop)"))
        return 1;

    int imageSize = bitmap.image_size();

    // The float grids are sized with the bitmap's byte size, so that size
    // must be able to hold DIM*DIM floats.
    const size_t gridBytes = sizeof(float) * DIM * DIM;
    if (imageSize < 0 || (size_t)imageSize < gridBytes)
    {
        fprintf(stderr, "bitmap size %d is too small for a %d x %d float grid\n", imageSize, DIM, DIM);
        return 1;
    }

    if (!cuda_ok(cudaMalloc((void**)&data.output_bitmap, imageSize), "cudaMalloc(output_bitmap)") ||
        !cuda_ok(cudaMalloc((void**)&data.dev_inSrc, imageSize), "cudaMalloc(dev_inSrc)") ||
        !cuda_ok(cudaMalloc((void**)&data.dev_outSrc, imageSize), "cudaMalloc(dev_outSrc)") ||
        !cuda_ok(cudaMalloc((void**)&data.dev_constSrc, imageSize), "cudaMalloc(dev_constSrc)"))
        return 1;

    // Bind each device buffer to the texture reference declared at file scope.
    if (!cuda_ok(cudaBindTexture(NULL, texConstSrc, data.dev_constSrc, imageSize), "cudaBindTexture(texConstSrc)") ||
        !cuda_ok(cudaBindTexture(NULL, texIn, data.dev_inSrc, imageSize), "cudaBindTexture(texIn)") ||
        !cuda_ok(cudaBindTexture(NULL, texOut, data.dev_outSrc, imageSize), "cudaBindTexture(texOut)"))
        return 1;

    float *temp = (float*)malloc(imageSize);
    if (temp == NULL)
    {
        fprintf(stderr, "malloc of %d bytes failed\n", imageSize);
        return 1;
    }

    // Constant-temperature cells: a hot square on the left, a cold square in
    // the middle, 0 (meaning "not constant") everywhere else.
    for (int i = 0; i < DIM*DIM; i++)
    {
        temp[i] = 0;
        int x = i % DIM;
        int y = i / DIM;
        if ((x >= 181) && (x < 281) && (y >= 462) && (y < 562))
            temp[i] = MAX_TEMP;
        if ((x >= 462) && (x < 562) && (y >= 462) && (y < 562))
            temp[i] = MIN_TEMP;
    }
    if (!cuda_ok(cudaMemcpy(data.dev_constSrc, temp, imageSize, cudaMemcpyHostToDevice), "cudaMemcpy(dev_constSrc)"))
    {
        free(temp);
        return 1;
    }

    // Initial temperature field: 0.5 everywhere with a hot square on the right.
    for (int i = 0; i < DIM*DIM; i++)
    {
        temp[i] = 0.5f;
        int x = i % DIM;
        int y = i / DIM;
        if ((x >= 718) && (x < 818) && (y >= 462) && (y < 562))
            temp[i] = MAX_TEMP;
    }
    bool uploaded = cuda_ok(cudaMemcpy(data.dev_inSrc, temp, imageSize, cudaMemcpyHostToDevice), "cudaMemcpy(dev_inSrc)");

    free(temp);
    if (!uploaded)
        return 1;

    bitmap.anim_and_exit((void(*)(void*, int))anim_gpu, (void(*)(void*))anim_exit);

    getchar();
    return 0;
}

 

● 输出结果(左侧为恒高温,中间为恒低温,右侧为初始高温点)

技术分享

技术分享

 

● 使用一维纹理内存的过程浓缩一下就变成了以下过程

 1 texture<float>  texSrc;// 在全局位置上声明纹理引用
 2 
 3 float *dev_Src;
 4 cudaMalloc((void**)&dev_Src, sizeof(float)*DIM);// 申请和绑定纹理内存
 5 cudaBindTexture(NULL, texSrc, dev_Src, NULL);
 6 
 7 float *temp = (float *)malloc(sizeof(float)*DIM);// 初始化该内存中的内容
 8 //Initalize data in temp and then free(temp)
 9 
10 cudaMemcpy(dev_Src, temp, sizeof(float)*DIM, cudaMemcpyHostToDevice);
11 
12 //Do something
13 
14 cudaUnbindTexture(texSrc);// 解绑和释放内存
15 cudaFree(dev_Src);

 

● 访问纹理内存不用中括号下标,而是使用 tex1Dfetch()

// Inside a kernel: turn the 2D thread position into a linear index, then read
// through the texture reference instead of subscripting a pointer.
const int col    = blockIdx.x * blockDim.x + threadIdx.x;
const int row    = blockIdx.y * blockDim.y + threadIdx.y;
const int linear = row * blockDim.x * gridDim.x + col;
float c = tex1Dfetch(texSrc, linear);

 

热传导(使用二维纹理)

  1 #include <stdio.h>
  2 #include "cuda_runtime.h"
  3 #include "device_launch_parameters.h"
  4 #include "D:\Code\CUDA\book\common\book.h"
  5 #include "D:\Code\CUDA\book\common\cpu_anim.h"
  6 
  7 #define DIM 1024
  8 #define PI 3.1415926535897932f
  9 #define MAX_TEMP 1.0f
 10 #define MIN_TEMP 0.0001f
 11 #define SPEED   0.25f
 12 
 13 texture<float, 2>  texConstSrc;
 14 texture<float, 2>  texIn;
 15 texture<float, 2>  texOut;
 16 
 17 struct DataBlock
 18 {
 19     unsigned char   *output_bitmap;
 20     float           *dev_inSrc;
 21     float           *dev_outSrc;
 22     float           *dev_constSrc;
 23     CPUAnimBitmap  *bitmap;
 24     cudaEvent_t     start, stop;
 25     float           totalTime;
 26     float           frames;
 27 };
 28 
 29 __global__ void blend_kernel(float *dst,bool dstOut)
 30 {
 31     int x = threadIdx.x + blockIdx.x * blockDim.x;
 32     int y = threadIdx.y + blockIdx.y * blockDim.y;
 33     int offset = x + y * blockDim.x * gridDim.x;
 34 
 35     float   t, l, c, r, b;
 36     if (dstOut)//不需要自己处理边界情况
 37     {
 38         t = tex2D(texIn, x, y - 1);
 39         l = tex2D(texIn, x - 1, y);
 40         c = tex2D(texIn, x, y);
 41         r = tex2D(texIn, x + 1, y);
 42         b = tex2D(texIn, x, y + 1);
 43     }
 44     else
 45     {
 46         t = tex2D(texOut, x, y - 1);
 47         l = tex2D(texOut, x - 1, y);
 48         c = tex2D(texOut, x, y);
 49         r = tex2D(texOut, x + 1, y);
 50         b = tex2D(texOut, x, y + 1);
 51     }
 52     dst[offset] = c + SPEED * (t + b + r + l - 4 * c);
 53 
 54     return;
 55 }
 56 
 57 __global__ void copy_const_kernel(float *iptr)
 58 {
 59     // map from threadIdx/BlockIdx to pixel position
 60     int x = threadIdx.x + blockIdx.x * blockDim.x;
 61     int y = threadIdx.y + blockIdx.y * blockDim.y;
 62     int offset = x + y * blockDim.x * gridDim.x;
 63 
 64     float c = tex2D(texConstSrc, x, y);
 65     if (c != 0)
 66         iptr[offset] = c;
 67 
 68     return;
 69 }
 70 
 71 void anim_gpu(DataBlock *d, int ticks)
 72 {
 73     cudaEventRecord(d->start, 0);
 74     dim3    blocks(DIM / 16, DIM / 16);
 75     dim3    threads(16, 16);
 76     CPUAnimBitmap  *bitmap = d->bitmap;
 77 
 78     volatile bool dstOut = true;
 79     for (int i = 0; i < 90; i++)
 80     {
 81         float   *in, *out;
 82         if (dstOut) {
 83             in  = d->dev_inSrc;
 84             out = d->dev_outSrc;
 85         }
 86         else
 87         {
 88             out = d->dev_inSrc;
 89             in  = d->dev_outSrc;
 90         }
 91         copy_const_kernel << <blocks, threads >> > (in); 
 92         blend_kernel << <blocks, threads >> > (out, dstOut); 
 93         dstOut = !dstOut;
 94     }
 95     float_to_color << <blocks, threads >> > (d->output_bitmap, d->dev_inSrc);
 96 
 97     cudaMemcpy(bitmap->get_ptr(), d->output_bitmap, bitmap->image_size(), cudaMemcpyDeviceToHost);
 98 
 99     cudaEventRecord(d->stop, 0);
100     cudaEventSynchronize(d->stop);
101     
102     float   elapsedTime;
103     cudaEventElapsedTime(&elapsedTime, d->start, d->stop);
104     d->totalTime += elapsedTime;
105     ++d->frames;
106     printf("Average Time per frame:  %3.1f ms\n", d->totalTime / d->frames);
107     
108     return;
109 }
110 
// Exit callback: unbinds the textures and releases every device resource that
// main() created.
//
//   d - simulation state whose device buffers and events are released
void anim_exit(DataBlock *d)
{
    cudaUnbindTexture(texIn);
    cudaUnbindTexture(texOut);
    cudaUnbindTexture(texConstSrc);

    cudaFree(d->dev_inSrc);
    cudaFree(d->dev_outSrc);
    cudaFree(d->dev_constSrc);
    // main() also allocates the pixel buffer with cudaMalloc; release it too.
    cudaFree(d->output_bitmap);

    cudaEventDestroy(d->start);
    cudaEventDestroy(d->stop);
}
124 
125 
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Allocates the device buffers, binds them to the 2D texture references,
// uploads the constant-temperature cells and the initial temperature field,
// then hands control to the animation loop.
// Returns 0 on success and 1 when setup fails.
int main(void)
{
    DataBlock   data;
    CPUAnimBitmap bitmap(DIM, DIM, &data);
    data.bitmap = &bitmap;
    data.totalTime = 0;
    data.frames = 0;

    if (!cuda_ok(cudaEventCreate(&data.start), "cudaEventCreate(start)") ||
        !cuda_ok(cudaEventCreate(&data.stop), "cudaEventCreate(stop)"))
        return 1;

    int imageSize = bitmap.image_size();

    // The float grids are sized with the bitmap's byte size, so that size
    // must be able to hold DIM*DIM floats.
    const size_t gridBytes = sizeof(float) * DIM * DIM;
    if (imageSize < 0 || (size_t)imageSize < gridBytes)
    {
        fprintf(stderr, "bitmap size %d is too small for a %d x %d float grid\n", imageSize, DIM, DIM);
        return 1;
    }

    if (!cuda_ok(cudaMalloc((void**)&data.output_bitmap, imageSize), "cudaMalloc(output_bitmap)") ||
        !cuda_ok(cudaMalloc((void**)&data.dev_inSrc, imageSize), "cudaMalloc(dev_inSrc)") ||
        !cuda_ok(cudaMalloc((void**)&data.dev_outSrc, imageSize), "cudaMalloc(dev_outSrc)") ||
        !cuda_ok(cudaMalloc((void**)&data.dev_constSrc, imageSize), "cudaMalloc(dev_constSrc)"))
        return 1;

    // Bind each buffer as a DIM x DIM float texture; the last argument is the
    // pitch, i.e. the size of one row in bytes.
    cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
    const size_t pitch = sizeof(float) * DIM;
    if (!cuda_ok(cudaBindTexture2D(NULL, texConstSrc, data.dev_constSrc, desc, DIM, DIM, pitch), "cudaBindTexture2D(texConstSrc)") ||
        !cuda_ok(cudaBindTexture2D(NULL, texIn, data.dev_inSrc, desc, DIM, DIM, pitch), "cudaBindTexture2D(texIn)") ||
        !cuda_ok(cudaBindTexture2D(NULL, texOut, data.dev_outSrc, desc, DIM, DIM, pitch), "cudaBindTexture2D(texOut)"))
        return 1;

    float *temp = (float*)malloc(imageSize);
    if (temp == NULL)
    {
        fprintf(stderr, "malloc of %d bytes failed\n", imageSize);
        return 1;
    }

    // Constant-temperature cells: a hot square on the left, a cold square in
    // the middle, 0 (meaning "not constant") everywhere else.
    for (int i = 0; i < DIM*DIM; i++)
    {
        temp[i] = 0;
        int x = i % DIM;
        int y = i / DIM;
        if ((x >= 181) && (x < 281) && (y >= 462) && (y < 562))
            temp[i] = MAX_TEMP;
        if ((x >= 462) && (x < 562) && (y >= 462) && (y < 562))
            temp[i] = MIN_TEMP;
    }
    if (!cuda_ok(cudaMemcpy(data.dev_constSrc, temp, imageSize, cudaMemcpyHostToDevice), "cudaMemcpy(dev_constSrc)"))
    {
        free(temp);
        return 1;
    }

    // Initial temperature field: 0.5 everywhere with a hot square on the right.
    for (int i = 0; i < DIM*DIM; i++)
    {
        temp[i] = 0.5f;
        int x = i % DIM;
        int y = i / DIM;
        if ((x >= 718) && (x < 818) && (y >= 462) && (y < 562))
            temp[i] = MAX_TEMP;
    }
    bool uploaded = cuda_ok(cudaMemcpy(data.dev_inSrc, temp, imageSize, cudaMemcpyHostToDevice), "cudaMemcpy(dev_inSrc)");

    free(temp);
    if (!uploaded)
        return 1;

    bitmap.anim_and_exit((void(*)(void*, int))anim_gpu, (void(*)(void*))anim_exit);

    getchar();
    return 0;
}

 

● 输出结果同一维纹理的情况,速度上没有明显差别

 

● 使用二维纹理内存的过程浓缩一下就变成了以下过程

 1 texture<float, 2>  texSrc;// 在全局位置上声明纹理引用
 2 
 3 float *dev_Src;
 4 cudaMalloc((void**)&dev_Src, DIM*DIM);// 申请和绑定纹理内存
 5 cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
 6 cudaBindTexture2D(NULL, texSrc, dev_Src, desc, DIM, DIM, sizeof(float) * DIM*DIM);
 7 
 8 float *temp = (float*)malloc(sizeof(float)*DIM*DIM);// 初始化该内存中的内容
 9 //Initalize data in temp and then free(temp)
10 
11 cudaMemcpy(dev_Src, temp, sizeof(float)*DIM*DIM, cudaMemcpyHostToDevice);
12 
13 //Do something
14 
15 cudaUnbindTexture(texSrc);// 解绑和释放内存
16 cudaFree(dev_Src);

 

● 访问纹理内存不用中括号下标,而是使用 tex2D()

// Inside a kernel: the 2D thread position is used directly as the texture
// coordinate; no linear offset is needed for the read.
const int col = blockIdx.x * blockDim.x + threadIdx.x;
const int row = blockIdx.y * blockDim.y + threadIdx.y;
float c = tex2D(texSrc, col, row);

 

第七章 纹理内存

标签:kernel   自己   etc   har   释放   event   申请   bind   get   

原文地址:http://www.cnblogs.com/cuancuancuanhao/p/7641428.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!