标签:
一、texture memory
texture memory 通过 texture reference 访问时是只读的,并且在片上有缓存:当一个线程读取某个数据时,该数据周围(空间上相邻)的数据也会被一并存入缓存,因此适合具有空间局部性的访存模式。
二、eg:simulating heat transfer
在一个空间中任意选取几个位置放上恒温的加热器,记录空间中温度的变化。
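下式是计算新温度的公式(与后面 blend_kernel 中的更新一致):
$$T_{\text{new}} = T_{\text{old}} + k \sum_{\text{neighbors}} \left(T_{\text{neighbor}} - T_{\text{old}}\right)$$
对上、下、左、右四个相邻格点展开即 $T_{\text{new}} = T_{\text{old}} + k\,(T_{\text{top}} + T_{\text{bottom}} + T_{\text{left}} + T_{\text{right}} - 4\,T_{\text{old}})$,其中 $k$ 对应代码中的 SPEED。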
过程:
//头文件
#include "cuda.h"
#include "book.h"
#include "cpu_anim.h"
#include "cuda_runtime.h"
// Simulation constants.
#define DIM 1024                    // edge length of the square grid, in cells
#define PI 3.1415926535897932f      // pi (a digit was missing from the original literal)
#define MAX_TEMP 1.0f               // hottest temperature used for heaters / initial field
#define MIN_TEMP 0.0001f            // coldest non-zero temperature used for heaters
#define SPEED 0.25f                 // the rate constant k in the update formula
// Everything the animation callbacks need, bundled so it can be handed
// around through a single pointer.
struct DataBlock
{
    unsigned char *output_bitmap;  // device buffer holding the temperatures converted to colours
    float *dev_inSrc;              // device buffer: temperature field read as input
    float *dev_outSrc;             // device buffer: temperature field written by the update
    float *dev_constSrc;           // device buffer: heater positions and their fixed temperatures
    CPUAnimBitmap *bitmap;         // host-side bitmap that receives the coloured frame
    cudaEvent_t start;             // event recorded at the beginning of a frame
    cudaEvent_t stop;              // event recorded at the end of a frame
    float totalTime;               // accumulated elapsed time over all frames
    float frames;                  // number of frames produced so far
};
// Re-imposes the heaters on a temperature field: every cell whose entry in
// `cptr` is non-zero gets that value written into `iptr`; all other cells
// are left untouched.
// Launch layout: 2D grid of 2D blocks, one thread per cell; the row pitch
// is taken to be blockDim.x * gridDim.x.
__global__ void copy_const_kernel(float * iptr, const float * cptr){
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int idx = col + row * blockDim.x * gridDim.x;

    const float heater = cptr[idx];
    if (heater == 0) {
        return;  // no heater in this cell
    }
    iptr[idx] = heater;
}
// Computes the next temperature of every cell from its four neighbours:
//   out = in + SPEED * (top + bottom + left + right - 4 * in)
// Cells on an edge of the DIM x DIM grid use themselves in place of the
// missing neighbour.
// Launch layout: 2D grid of 2D blocks, one thread per cell.
__global__ void blend_kernel(float *outSrc,const float *inSrc){
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int idx = col + row * blockDim.x * gridDim.x;

    // Neighbour indices, collapsed onto the cell itself at the borders.
    const int west  = (col == 0)       ? idx : idx - 1;
    const int east  = (col == DIM - 1) ? idx : idx + 1;
    const int north = (row == 0)       ? idx : idx - DIM;
    const int south = (row == DIM - 1) ? idx : idx + DIM;

    const float neighbours = inSrc[north] + inSrc[south] + inSrc[west] + inSrc[east];
    outSrc[idx] = inSrc[idx] + SPEED * (neighbours - 4 * inSrc[idx]);
}
// Produces one animation frame: runs 90 simulation steps on the GPU,
// converts the resulting temperatures to colours, copies the image back to
// the host bitmap, and prints the running average time per frame.
//
//   d     - simulation state (device buffers, host bitmap, timing events)
//   ticks - supplied by the animation callback interface; not used here
void anim_gpu(DataBlock *d, int ticks){
    (void)ticks;  // unused
    const int stepsPerFrame = 90;

    HANDLE_ERROR(cudaEventRecord(d->start, 0));
    dim3 blocks(DIM/16, DIM/16);
    dim3 threads(16, 16);
    CPUAnimBitmap * bitmap = d->bitmap;

    for(int i = 0; i < stepsPerFrame; i++){
        // Kernel launches do not return a status, so query it explicitly;
        // otherwise a bad launch only shows up at a later, unrelated call.
        copy_const_kernel<<<blocks, threads>>>(d->dev_inSrc, d->dev_constSrc);
        HANDLE_ERROR(cudaGetLastError());
        blend_kernel<<<blocks, threads>>>(d->dev_outSrc, d->dev_inSrc);
        HANDLE_ERROR(cudaGetLastError());
        // The freshly written field becomes the input of the next step.
        swap(d->dev_inSrc, d->dev_outSrc);
    }

    // Convert temperatures to colours.
    float_to_color<<<blocks, threads>>>(d->output_bitmap, d->dev_inSrc);
    HANDLE_ERROR(cudaGetLastError());

    // Copy the finished image from the GPU to the host bitmap.
    HANDLE_ERROR(cudaMemcpy(bitmap->get_ptr(),
                            d->output_bitmap,
                            bitmap->image_size(),
                            cudaMemcpyDeviceToHost));

    HANDLE_ERROR(cudaEventRecord(d->stop, 0));
    HANDLE_ERROR(cudaEventSynchronize(d->stop));
    float elapsedTime;
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, d->start, d->stop));
    d->totalTime += elapsedTime;
    ++d->frames;
    printf("Average Time per frame: %.3lf ms\n", d->totalTime / d->frames);
}
// Releases every GPU resource held by the simulation: the three temperature
// buffers, the colour output buffer, and the two timing events.
void anim_exit(DataBlock *d){
    HANDLE_ERROR(cudaFree(d->dev_constSrc));
    HANDLE_ERROR(cudaFree(d->dev_inSrc));
    HANDLE_ERROR(cudaFree(d->dev_outSrc));
    // output_bitmap is allocated in main() alongside the other buffers and
    // was previously never released.
    HANDLE_ERROR(cudaFree(d->output_bitmap));
    HANDLE_ERROR(cudaEventDestroy(d->start));
    HANDLE_ERROR(cudaEventDestroy(d->stop));
}
// Sets up the simulation (events, device buffers, heater map, initial
// temperature field) and hands control to the animation loop.
int main(){
    DataBlock data;
    CPUAnimBitmap bitmap(DIM, DIM, &data);
    data.bitmap = &bitmap;
    data.totalTime = 0;
    data.frames = 0;
    HANDLE_ERROR(cudaEventCreate(&data.start));
    HANDLE_ERROR(cudaEventCreate(&data.stop));

    // The colour image is sized by the bitmap; the temperature fields hold
    // one float per cell and are sized explicitly, so they do not depend on
    // the bitmap's bytes-per-pixel.
    const size_t fieldBytes = (size_t)DIM * DIM * sizeof(float);
    HANDLE_ERROR(cudaMalloc((void**)&data.output_bitmap, bitmap.image_size()));
    HANDLE_ERROR(cudaMalloc((void**)&data.dev_inSrc, fieldBytes));
    HANDLE_ERROR(cudaMalloc((void**)&data.dev_outSrc, fieldBytes));
    HANDLE_ERROR(cudaMalloc((void**)&data.dev_constSrc, fieldBytes));

    float *temp = (float*)malloc(fieldBytes);
    if(temp == NULL){
        fprintf(stderr, "host allocation of %lu bytes failed\n",
                (unsigned long)fieldBytes);
        exit(EXIT_FAILURE);
    }

    // Heater map: zero means "no heater"; any other value is the fixed
    // temperature of that cell.
    for(int i = 0; i < DIM * DIM; i++){
        temp[i] = 0;
        int x = i % DIM;
        int y = i / DIM;
        if((x > 300) && (x < 600) && (y >310) && (y < 601))
            temp[i] = MAX_TEMP;
    }
    temp[DIM * 100 + 100] = (MIN_TEMP + MAX_TEMP) / 2;
    temp[DIM * 700 + 100] = MIN_TEMP;
    temp[DIM * 300 + 300] = MIN_TEMP;
    temp[DIM * 200 + 700] = MIN_TEMP;
    for(int y = 800; y < 900; y++){
        for(int x = 400;x < 500; x++){
            temp[x + y * DIM] = MIN_TEMP;
        }
    }
    HANDLE_ERROR(cudaMemcpy(data.dev_constSrc, temp,
                            fieldBytes,
                            cudaMemcpyHostToDevice));

    // Initial temperature field: the heater map plus an extra hot region.
    for(int y = 800; y < DIM; y++){
        for(int x = 0; x < 200; x++)
            temp[x + y * DIM] = MAX_TEMP;
    }
    HANDLE_ERROR(cudaMemcpy(data.dev_inSrc, temp,
                            fieldBytes,
                            cudaMemcpyHostToDevice));
    free(temp);

    bitmap.anim_and_exit( (void (*) (void*, int))anim_gpu,(void(*)(void*))anim_exit);
    return 0;
}
三、使用texture memory
先来看一下如何使用之。
1.declare as texture references(在文件作用域声明):
texture<float> texConstSrc;
texture<float> texIn;
texture<float> texOut;
2.allocation in device memory:
// Allocate the three device buffers; each is later bound to a texture reference.
HANDLE_ERROR(cudaMalloc((void**)&data.dev_inSrc, bitmap.image_size()));
HANDLE_ERROR(cudaMalloc((void**)&data.dev_outSrc, bitmap.image_size()));
HANDLE_ERROR(cudaMalloc((void**)&data.dev_constSrc, bitmap.image_size()));
3.bind the reference and the allocation:
这个绑定做了两件事:指定哪一块 device 内存作为 texture memory 使用,以及通过哪个 texture reference(名字)来访问它。
// Bind each device buffer to its texture reference; the last argument is
// the size of the bound region in bytes.
HANDLE_ERROR(cudaBindTexture(NULL, texConstSrc,
data.dev_constSrc,
bitmap.image_size()));
HANDLE_ERROR(cudaBindTexture(NULL, texIn,
data.dev_inSrc,
bitmap.image_size()));
HANDLE_ERROR(cudaBindTexture(NULL, texOut,
data.dev_outSrc,
bitmap.image_size()));
4.为了从textureM而不是globalM来读取数据,需要用到tex1Dfetch()。
5.texture reference 必须在全局(文件)作用域声明,不能作为函数参数传递;因此 kernel 不再接收输入缓冲区的指针,而是直接通过全局的 texture reference 读取(blend_kernel 用 dstOut 标志来选择读 texIn 还是 texOut)。
6.最后需要解除绑定再释放空间:
// Unbind the texture references first, then free the buffers they were bound to.
cudaUnbindTexture(texIn);
cudaUnbindTexture(texOut);
cudaUnbindTexture(texConstSrc);
cudaFree(d->dev_constSrc);
cudaFree(d->dev_inSrc);
cudaFree(d->dev_outSrc);
最终:
#include "cuda.h"
#include "book.h"
#include "cpu_anim.h"
#include "cuda_runtime.h"
// Simulation constants.
#define DIM 1024                    // edge length of the square grid, in cells
#define PI 3.1415926535897932f      // pi (a digit was missing from the original literal)
#define MAX_TEMP 1.0f               // hottest temperature used for heaters / initial field
#define MIN_TEMP 0.0001f            // coldest non-zero temperature used for heaters
#define SPEED 0.25f                 // the rate constant k in the update formula
// File-scope texture references; kernels read through them with tex1Dfetch().
// main() binds them to dev_constSrc, dev_inSrc and dev_outSrc respectively.
// NOTE(review): texture references are a legacy API and are reportedly
// unavailable in recent CUDA toolkits — confirm against the toolkit in use.
texture<float> texConstSrc; // declares the reference for the heater map
texture<float> texIn;
texture<float> texOut;
// Everything the animation callbacks need, bundled so it can be handed
// around through a single pointer.
struct DataBlock
{
    unsigned char *output_bitmap;  // device buffer holding the temperatures converted to colours
    float *dev_inSrc;              // device buffer bound to texIn
    float *dev_outSrc;             // device buffer bound to texOut
    float *dev_constSrc;           // device buffer bound to texConstSrc (heater map)
    CPUAnimBitmap *bitmap;         // host-side bitmap that receives the coloured frame
    cudaEvent_t start;             // event recorded at the beginning of a frame
    cudaEvent_t stop;              // event recorded at the end of a frame
    float totalTime;               // accumulated elapsed time over all frames
    float frames;                  // number of frames produced so far
};
// Re-imposes the heaters on a temperature field: every cell whose heater
// value (fetched through texConstSrc) is non-zero gets that value written
// into `iptr`; all other cells are left untouched.
// Launch layout: 2D grid of 2D blocks, one thread per cell; the row pitch
// is taken to be blockDim.x * gridDim.x.
__global__ void copy_const_kernel(float * iptr){
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int idx = col + row * blockDim.x * gridDim.x;

    // Read the heater map through the texture reference.
    const float heater = tex1Dfetch(texConstSrc, idx);
    if (heater == 0) {
        return;  // no heater in this cell
    }
    iptr[idx] = heater;
}
// Computes the next temperature of every cell from its four neighbours:
//   dst = c + SPEED * (top + bottom + right + left - 4 * c)
// The input field is read through a texture reference: texIn when `dstOut`
// is true, texOut otherwise. Cells on an edge of the DIM x DIM grid use
// themselves in place of the missing neighbour.
// Launch layout: 2D grid of 2D blocks, one thread per cell.
__global__ void blend_kernel(float *dst, bool dstOut){
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int idx = col + row * blockDim.x * gridDim.x;

    // Neighbour indices, collapsed onto the cell itself at the borders.
    const int west  = (col == 0)       ? idx : idx - 1;
    const int east  = (col == DIM - 1) ? idx : idx + 1;
    const int north = (row == 0)       ? idx : idx - DIM;
    const int south = (row == DIM - 1) ? idx : idx + DIM;

    float up, down, lhs, rhs, centre;
    // A buffer is writable when used as plain global memory but read-only
    // when accessed through its texture reference, so the flag selects
    // which reference holds the current input.
    if (!dstOut) {
        centre = tex1Dfetch(texOut, idx);
        up     = tex1Dfetch(texOut, north);
        down   = tex1Dfetch(texOut, south);
        lhs    = tex1Dfetch(texOut, west);
        rhs    = tex1Dfetch(texOut, east);
    } else {
        centre = tex1Dfetch(texIn, idx);
        up     = tex1Dfetch(texIn, north);
        down   = tex1Dfetch(texIn, south);
        lhs    = tex1Dfetch(texIn, west);
        rhs    = tex1Dfetch(texIn, east);
    }

    dst[idx] = centre + SPEED * (up + down + rhs + lhs - 4 * centre);
}
// Produces one animation frame: runs 90 simulation steps on the GPU,
// converts the resulting temperatures to colours, copies the image back to
// the host bitmap, and prints the running average time per frame.
//
//   d     - simulation state (device buffers, host bitmap, timing events)
//   ticks - supplied by the animation callback interface; not used here
void anim_gpu(DataBlock *d, int ticks){
    (void)ticks;  // unused
    const int stepsPerFrame = 90;

    HANDLE_ERROR(cudaEventRecord(d->start, 0));
    dim3 blocks(DIM/16, DIM/16);
    dim3 threads(16, 16);
    CPUAnimBitmap * bitmap = d->bitmap;

    // The texture bindings are fixed, so the two buffers cannot simply be
    // swapped; instead a flag flips which one is read (through its texture)
    // and which one is written on each step.
    volatile bool dstOut = true;
    for(int i = 0; i < stepsPerFrame; i++){
        float *in, *out;
        if(dstOut){
            in = d->dev_inSrc;
            out = d->dev_outSrc;
        }else{
            in = d->dev_outSrc;
            out = d->dev_inSrc;
        }
        // Kernel launches do not return a status, so query it explicitly;
        // otherwise a bad launch only shows up at a later, unrelated call.
        copy_const_kernel<<<blocks, threads>>>(in);
        HANDLE_ERROR(cudaGetLastError());
        blend_kernel<<<blocks, threads>>>(out, dstOut);
        HANDLE_ERROR(cudaGetLastError());
        dstOut = !dstOut;
    }

    // stepsPerFrame is even, so the most recent field is in dev_inSrc.
    float_to_color<<<blocks, threads>>>(d->output_bitmap, d->dev_inSrc);
    HANDLE_ERROR(cudaGetLastError());

    // Copy the finished image from the GPU to the host bitmap.
    HANDLE_ERROR(cudaMemcpy(bitmap->get_ptr(),
                            d->output_bitmap,
                            bitmap->image_size(),
                            cudaMemcpyDeviceToHost));

    HANDLE_ERROR(cudaEventRecord(d->stop, 0));
    HANDLE_ERROR(cudaEventSynchronize(d->stop));
    float elapsedTime;
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, d->start, d->stop));
    d->totalTime += elapsedTime;
    ++d->frames;
    printf("Average Time per frame: %.3lf ms\n", d->totalTime / d->frames);
}
// Tears the simulation down: unbinds the three texture references, then
// releases the buffers they were bound to, the colour output buffer, and
// the two timing events.
void anim_exit(DataBlock *d){
    HANDLE_ERROR(cudaUnbindTexture(texIn));
    HANDLE_ERROR(cudaUnbindTexture(texOut));
    HANDLE_ERROR(cudaUnbindTexture(texConstSrc));
    HANDLE_ERROR(cudaFree(d->dev_constSrc));
    HANDLE_ERROR(cudaFree(d->dev_inSrc));
    HANDLE_ERROR(cudaFree(d->dev_outSrc));
    // output_bitmap is allocated in main() alongside the other buffers and
    // was previously never released.
    HANDLE_ERROR(cudaFree(d->output_bitmap));
    HANDLE_ERROR(cudaEventDestroy(d->start));
    HANDLE_ERROR(cudaEventDestroy(d->stop));
}
// Sets up the simulation (events, device buffers, texture bindings, heater
// map, initial temperature field) and hands control to the animation loop.
int main(){
    DataBlock data;
    CPUAnimBitmap bitmap(DIM, DIM, &data);
    data.bitmap = &bitmap;
    data.totalTime = 0;
    data.frames = 0;
    HANDLE_ERROR(cudaEventCreate(&data.start));
    HANDLE_ERROR(cudaEventCreate(&data.stop));

    // The colour image is sized by the bitmap; the temperature fields hold
    // one float per cell and are sized explicitly, so they do not depend on
    // the bitmap's bytes-per-pixel.
    const size_t fieldBytes = (size_t)DIM * DIM * sizeof(float);
    HANDLE_ERROR(cudaMalloc((void**)&data.output_bitmap, bitmap.image_size()));

    // Allocate the buffers, then bind each one to its texture reference.
    HANDLE_ERROR(cudaMalloc((void**)&data.dev_inSrc, fieldBytes));
    HANDLE_ERROR(cudaMalloc((void**)&data.dev_outSrc, fieldBytes));
    HANDLE_ERROR(cudaMalloc((void**)&data.dev_constSrc, fieldBytes));
    HANDLE_ERROR(cudaBindTexture(NULL, texConstSrc,
                                 data.dev_constSrc,
                                 fieldBytes));
    HANDLE_ERROR(cudaBindTexture(NULL, texIn,
                                 data.dev_inSrc,
                                 fieldBytes));
    // The original text lost the size argument and closing parentheses of
    // this call.
    HANDLE_ERROR(cudaBindTexture(NULL, texOut,
                                 data.dev_outSrc,
                                 fieldBytes));

    float *temp = (float*)malloc(fieldBytes);
    if(temp == NULL){
        fprintf(stderr, "host allocation of %lu bytes failed\n",
                (unsigned long)fieldBytes);
        exit(EXIT_FAILURE);
    }

    // Heater map: zero means "no heater"; any other value is the fixed
    // temperature of that cell.
    for(int i = 0; i < DIM * DIM; i++){
        temp[i] = 0;
        int x = i % DIM;
        int y = i / DIM;
        if((x > 300) && (x < 600) && (y >310) && (y < 601))
            temp[i] = MAX_TEMP;
    }
    temp[DIM * 100 + 100] = (MIN_TEMP + MAX_TEMP) / 2;
    temp[DIM * 700 + 100] = MIN_TEMP;
    temp[DIM * 300 + 300] = MIN_TEMP;
    temp[DIM * 200 + 700] = MIN_TEMP;
    for(int y = 800; y < 900; y++){
        for(int x = 400;x < 500; x++){
            temp[x + y * DIM] = MIN_TEMP;
        }
    }
    HANDLE_ERROR(cudaMemcpy(data.dev_constSrc, temp,
                            fieldBytes,
                            cudaMemcpyHostToDevice));

    // Initial temperature field: the heater map plus an extra hot region.
    for(int y = 800; y < DIM; y++){
        for(int x = 0; x < 200; x++)
            temp[x + y * DIM] = MAX_TEMP;
    }
    HANDLE_ERROR(cudaMemcpy(data.dev_inSrc, temp,
                            fieldBytes,
                            cudaMemcpyHostToDevice));
    free(temp);

    bitmap.anim_and_exit( (void (*) (void*, int))anim_gpu,(void(*)(void*))anim_exit);
    return 0;
}
》》》如果是使用2D的texture,则声明时:
// Two-dimensional variants of the texture references (second template argument is 2).
texture<float, 2> texConstSrc; // declares the reference
texture<float, 2> texIn;
texture<float, 2> texOut;
读数时:使用tex2D(),并且不需要offset变量,直接使用x,y坐标。
绑定时,书上没有细说:
// Bind each buffer as a DIM x DIM 2D texture of floats.
cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();// channel format descriptor for float elements
HANDLE_ERROR(cudaBindTexture2D(NULL, texConstSrc,
data.dev_constSrc,
desc, DIM, DIM, // descriptor, then the two texture dimensions (both DIM)
sizeof(float) * DIM)); // size in bytes of one row of DIM floats (the pitch)
HANDLE_ERROR(cudaBindTexture2D(NULL, texIn,
data.dev_inSrc,
desc, DIM, DIM,
sizeof(float) * DIM));
HANDLE_ERROR(cudaBindTexture2D(NULL, texOut,
data.dev_outSrc,
desc, DIM, DIM,
sizeof(float) * DIM));
感觉用了textureM比没用要慢很多很多很多
标签:
原文地址:http://blog.csdn.net/small_lwei/article/details/51356190