码迷,mamicode.com
首页 > 其他好文 > 详细

CUDA_by_Examples_Chapter7

时间:2016-05-12 19:54:59      阅读:202      评论:0      收藏:0      [点我收藏+]

标签:

一、texture memory
texture memory 在片上有缓存:当一个线程读取某个数据时,该数据附近的数据也会一并放入缓存,因此适合具有空间局部性的访问模式。
二、eg:simulating heat transfer
在一个空间中任意选取几个位置放上恒温的加热器,记录空间中温度的变化。

技术分享

下式是计算新温度的公式:

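$$T_{\text{new}} = T_{\text{old}} + k\,\bigl(T_{\text{top}} + T_{\text{bottom}} + T_{\text{left}} + T_{\text{right}} - 4\,T_{\text{old}}\bigr)$$

(原文此处为公式图片;上式与下文 blend_kernel() 中的计算一致,其中 k 即代码中的 SPEED。)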

过程:

  1. copy_const_kernel():将放置恒温加热器的位置的温度重新覆盖为加热器设定的恒定温度
  2. blend_kernel():通过上面的公式计算每个小块的温度
  3. 将2中的计算结果作为新的输入输入step1。
//头文件
#include "cuda.h"
#include "book.h"
#include "cpu_anim.h"
#include "cuda_runtime.h"

#define DIM 1024                               // the simulation grid is DIM x DIM cells
#define PI 3.1415926535897932f                 // pi (not referenced by the code in this listing)
#define MAX_TEMP 1.0f                          // highest temperature
#define MIN_TEMP 0.0001f                       // lowest temperature
#define SPEED 0.25f                            // the coefficient k in the update formula

// State shared between main(), the per-frame callback anim_gpu() and the
// cleanup callback anim_exit().
struct DataBlock
{
    // Device buffer that float_to_color writes and that is copied to the host bitmap.
    unsigned char *output_bitmap;
    // Device buffer holding the temperature grid used as input of a step.
    float *dev_inSrc;
    // Device buffer receiving the temperature grid computed by a step.
    float *dev_outSrc;
    // Device buffer of heater cells: a non-zero entry is that cell's fixed temperature.
    float *dev_constSrc;
    // Host-side animation bitmap that receives the rendered frame.
    CPUAnimBitmap *bitmap;
    // Events recorded at the beginning and end of each frame for timing.
    cudaEvent_t start;
    cudaEvent_t stop;
    // Accumulated elapsed time over all frames, as reported by cudaEventElapsedTime.
    float totalTime;
    // Number of frames rendered so far.
    float frames;
};

// Re-imposes the heater temperatures on the grid: every cell whose entry in
// cptr is non-zero has its value in iptr overwritten with that entry.
//
// Launch layout: 2D grid of 2D blocks, one thread per cell. The row pitch is
// taken to be blockDim.x * gridDim.x, and no bounds check is performed, so the
// launch must cover the arrays exactly.
__global__ void copy_const_kernel(float * iptr, const float * cptr){
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int idx = row * blockDim.x * gridDim.x + col;

    const float heater = cptr[idx];
    if(heater == 0) return;   // not a heater cell: leave the grid value untouched
    iptr[idx] = heater;
}

// Computes one update step for every cell: the new value is the old value
// plus SPEED times (sum of the four neighbours minus four times the old value).
// Reads from inSrc, writes to outSrc.
//
// Launch layout: 2D grid of 2D blocks, one thread per cell. The flat index
// uses blockDim.x * gridDim.x as the row pitch while the vertical neighbours
// use DIM, so the launch is expected to be exactly DIM threads wide.
__global__ void blend_kernel(float *outSrc,const float *inSrc){
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int center = col + row * blockDim.x * gridDim.x;

    // On an edge the missing neighbour is replaced by the cell itself.
    const int west  = (col == 0)       ? center : center - 1;
    const int east  = (col == DIM - 1) ? center : center + 1;
    const int north = (row == 0)       ? center : center - DIM;
    const int south = (row == DIM - 1) ? center : center + DIM;

    const float self = inSrc[center];
    const float neighbours = inSrc[north] + inSrc[south] + inSrc[west] + inSrc[east];

    outSrc[center] = self + SPEED * (neighbours - 4 * self);
}

// Per-frame callback: runs 90 simulation steps on the GPU, converts the
// resulting temperatures to colours, copies the image to the host bitmap and
// prints the running average frame time.
//
// d     - simulation state (device buffers, host bitmap, timing events).
// ticks - supplied by the animation framework; not used here.
void anim_gpu(DataBlock *d, int ticks){
    (void)ticks;
    HANDLE_ERROR(cudaEventRecord(d->start, 0));
    dim3 blocks(DIM/16, DIM/16);
    dim3 threads(16, 16);
    CPUAnimBitmap * bitmap = d->bitmap;

    for(int i = 0; i < 90; i++){
        // Restore the heaters, compute the next grid, then make that grid
        // the input of the following step.
        copy_const_kernel<<<blocks, threads>>>(d->dev_inSrc, d->dev_constSrc);
        HANDLE_ERROR(cudaGetLastError());   // a launch does not report errors by itself
        blend_kernel<<<blocks, threads>>>(d->dev_outSrc, d->dev_inSrc);
        HANDLE_ERROR(cudaGetLastError());
        swap(d->dev_inSrc, d->dev_outSrc);
    }
    // Convert temperatures to colours (float_to_color comes from book.h).
    float_to_color<<<blocks, threads>>>(d->output_bitmap, d->dev_inSrc);
    HANDLE_ERROR(cudaGetLastError());
    // Copy the rendered image from the GPU to the host bitmap.
    HANDLE_ERROR(cudaMemcpy(bitmap->get_ptr(),
                            d->output_bitmap,
                            bitmap->image_size(),
                            cudaMemcpyDeviceToHost));
    HANDLE_ERROR(cudaEventRecord(d->stop, 0));
    HANDLE_ERROR(cudaEventSynchronize(d->stop));
    float elapsedTime;
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, d->start, d->stop));
    d->totalTime += elapsedTime;
    ++d->frames;
    printf("Average Time per frame: %.3lf ms\n", d->totalTime / d->frames);
}

// Cleanup callback: releases every device buffer allocated in main() and
// destroys the two timing events.
//
// d - simulation state whose device resources are released.
void anim_exit(DataBlock *d){
    HANDLE_ERROR(cudaFree(d->dev_constSrc));
    HANDLE_ERROR(cudaFree(d->dev_inSrc));
    HANDLE_ERROR(cudaFree(d->dev_outSrc));
    // output_bitmap is cudaMalloc'd in main() as well and must be released too.
    HANDLE_ERROR(cudaFree(d->output_bitmap));

    HANDLE_ERROR(cudaEventDestroy(d->start));
    HANDLE_ERROR(cudaEventDestroy(d->stop));
}
// Sets up the simulation: creates the timing events, allocates the device
// buffers, uploads the heater layout and the initial temperature grid, then
// hands control to the animation loop.
int main(){
    DataBlock data;
    CPUAnimBitmap bitmap(DIM, DIM, &data);
    data.bitmap = &bitmap;
    data.totalTime = 0;
    data.frames = 0;
    HANDLE_ERROR(cudaEventCreate(&data.start));
    HANDLE_ERROR(cudaEventCreate(&data.stop));
    // NOTE(review): the float grids are sized with bitmap.image_size(); this
    // presumably relies on the image using 4 bytes per pixel so that it equals
    // DIM * DIM * sizeof(float) - confirm against cpu_anim.h.
    HANDLE_ERROR(cudaMalloc((void**)&data.output_bitmap, bitmap.image_size()));
    HANDLE_ERROR(cudaMalloc((void**)&data.dev_inSrc, bitmap.image_size()));
    HANDLE_ERROR(cudaMalloc((void**)&data.dev_outSrc, bitmap.image_size()));
    HANDLE_ERROR(cudaMalloc((void**)&data.dev_constSrc, bitmap.image_size()));
    float *temp = (float*)malloc(bitmap.image_size());
    if(temp == NULL){
        // Without the staging buffer nothing can be uploaded; release the
        // device resources and stop instead of dereferencing a null pointer.
        printf("Failed to allocate host staging buffer\n");
        anim_exit(&data);
        return 1;
    }
    // Heater layout: positions and temperatures of the constant-temperature cells.
    for(int i = 0; i < DIM * DIM; i++){
        temp[i] = 0;
        int x = i % DIM;
        int y = i / DIM;
        if((x > 300) && (x < 600) && (y >310) && (y < 601))
            temp[i] = MAX_TEMP;
    }
    temp[DIM * 100 + 100] = (MIN_TEMP + MAX_TEMP) / 2;
    temp[DIM * 700 + 100] = MIN_TEMP;
    temp[DIM * 300 + 300] = MIN_TEMP;
    temp[DIM * 200 + 700] = MIN_TEMP;
    for(int y = 800; y < 900; y++){
        for(int x = 400;x < 500; x++){
            temp[x + y * DIM] = MIN_TEMP;
        }
    }
    HANDLE_ERROR(cudaMemcpy(data.dev_constSrc, temp,
                            bitmap.image_size(),
                            cudaMemcpyHostToDevice));
    // Initial temperature grid: the heater layout plus a hot region at x < 200, y >= 800.
    for(int y = 800; y < DIM; y++){
        for(int x = 0; x < 200; x++)
            temp[x + y * DIM] = MAX_TEMP;
    }
    HANDLE_ERROR(cudaMemcpy(data.dev_inSrc, temp,
                            bitmap.image_size(),
                            cudaMemcpyHostToDevice));
    free(temp);
    bitmap.anim_and_exit( (void (*) (void*, int))anim_gpu,(void(*)(void*))anim_exit);
}

技术分享

三、使用texture memory

先来看一下如何使用之。

1.declare as texture references:

技术分享

2.allocation in device memory:

    // Allocate the three float grids in device memory; each is later bound to a texture reference.
    HANDLE_ERROR(cudaMalloc((void**)&data.dev_inSrc, bitmap.image_size()));
    HANDLE_ERROR(cudaMalloc((void**)&data.dev_outSrc, bitmap.image_size()));
    HANDLE_ERROR(cudaMalloc((void**)&data.dev_constSrc, bitmap.image_size()));

3.bind the reference and the allocation:

这个绑定说明两件事:我们指定哪块内存作为texture memory,以及我们给它起的名字(即对应的texture reference)。

    // Bind each device buffer to its texture reference. The arguments are:
    // offset output (NULL here), the texture reference, the device pointer,
    // and the size of the bound region in bytes.
    HANDLE_ERROR(cudaBindTexture(NULL, texConstSrc,
                                 data.dev_constSrc,
                                 bitmap.image_size()));
    HANDLE_ERROR(cudaBindTexture(NULL, texIn,
                                 data.dev_inSrc,
                                 bitmap.image_size()));
    HANDLE_ERROR(cudaBindTexture(NULL, texOut,
                                 data.dev_outSrc,
                                 bitmap.image_size()));

4.为了从textureM而不是globalM来读取数据,需要用到tex1Dfetch()。

5.texture reference必须在文件作用域(全局)声明,不能作为函数参数传递;因此kernel不再通过参数接收输入缓冲区的指针,而是直接使用全局的texture reference读取数据。

6.最后需要解除绑定再释放空间:

// Unbind the texture references first, then free the buffers they were bound to.
cudaUnbindTexture(texIn);
    cudaUnbindTexture(texOut);
    cudaUnbindTexture(texConstSrc);

    cudaFree(d->dev_constSrc);
    cudaFree(d->dev_inSrc);
    cudaFree(d->dev_outSrc);

最终:

#include "cuda.h"
#include "book.h"
#include "cpu_anim.h"
#include "cuda_runtime.h"
#define DIM 1024                 // the simulation grid is DIM x DIM cells
#define PI 3.1415926535897932f   // pi (not referenced by the code in this listing)
#define MAX_TEMP 1.0f            // highest temperature
#define MIN_TEMP 0.0001f         // lowest temperature
#define SPEED 0.25f              // the coefficient k in the update formula
// File-scope texture references; main() binds each one to a device buffer
// and the kernels read through them with tex1Dfetch().
// NOTE(review): the texture-reference API is deprecated in recent CUDA
// toolkits - confirm the toolkit version this is built with.
texture<float> texConstSrc;                    // heater layout (bound to dev_constSrc)
texture<float> texIn;                          // bound to dev_inSrc
texture<float> texOut;                         // bound to dev_outSrc
// State shared between main(), the per-frame callback anim_gpu() and the
// cleanup callback anim_exit().
struct DataBlock
{
    // Device buffer that float_to_color writes and that is copied to the host bitmap.
    unsigned char *output_bitmap;
    // Device temperature grid; main() binds it to texIn.
    float *dev_inSrc;
    // Device temperature grid; main() binds it to texOut.
    float *dev_outSrc;
    // Device grid of heater cells; main() binds it to texConstSrc.
    float *dev_constSrc;
    // Host-side animation bitmap that receives the rendered frame.
    CPUAnimBitmap *bitmap;
    // Events recorded at the beginning and end of each frame for timing.
    cudaEvent_t start;
    cudaEvent_t stop;
    // Accumulated elapsed time over all frames, as reported by cudaEventElapsedTime.
    float totalTime;
    // Number of frames rendered so far.
    float frames;
};
// Re-imposes the heater temperatures on the grid: every cell whose value in
// the texConstSrc texture is non-zero has its entry in iptr overwritten with
// that value.
//
// Launch layout: 2D grid of 2D blocks, one thread per cell; the row pitch is
// blockDim.x * gridDim.x and no bounds check is performed.
__global__ void copy_const_kernel(float * iptr){
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int idx = row * blockDim.x * gridDim.x + col;

    // Read the heater value through the texture reference.
    const float heater = tex1Dfetch(texConstSrc, idx);
    if(heater == 0) return;   // not a heater cell: leave the grid value untouched
    iptr[idx] = heater;
}
// Computes one update step for every cell and stores it in dst: the new
// value is the old value plus SPEED times (sum of the four neighbours minus
// four times the old value).
//
// dst    - device buffer that receives the new grid.
// dstOut - selects the source texture: true reads texIn, false reads texOut.
//
// A device buffer stays writable through its raw pointer, but is read-only
// when accessed through the texture reference bound to it; that is why the
// destination is passed as a pointer while the source is chosen by a flag.
//
// Launch layout: 2D grid of 2D blocks, one thread per cell; the launch is
// expected to be exactly DIM threads wide (see the use of DIM below).
__global__ void blend_kernel(float *dst, bool dstOut){
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int center = col + row * blockDim.x * gridDim.x;

    // On an edge the missing neighbour is replaced by the cell itself.
    const int west  = (col == 0)       ? center : center - 1;
    const int east  = (col == DIM - 1) ? center : center + 1;
    const int north = (row == 0)       ? center : center - DIM;
    const int south = (row == DIM - 1) ? center : center + DIM;

    const float t = dstOut ? tex1Dfetch(texIn, north)  : tex1Dfetch(texOut, north);
    const float l = dstOut ? tex1Dfetch(texIn, west)   : tex1Dfetch(texOut, west);
    const float c = dstOut ? tex1Dfetch(texIn, center) : tex1Dfetch(texOut, center);
    const float r = dstOut ? tex1Dfetch(texIn, east)   : tex1Dfetch(texOut, east);
    const float b = dstOut ? tex1Dfetch(texIn, south)  : tex1Dfetch(texOut, south);

    dst[center] = c + SPEED * (t + b + r + l - 4 * c);
}
// Per-frame callback: runs 90 simulation steps on the GPU, converts the
// resulting temperatures to colours, copies the image to the host bitmap and
// prints the running average frame time.
//
// d     - simulation state (device buffers, host bitmap, timing events).
// ticks - supplied by the animation framework; not used here.
void anim_gpu(DataBlock *d, int ticks){
    (void)ticks;
    HANDLE_ERROR(cudaEventRecord(d->start, 0));
    dim3 blocks(DIM/16, DIM/16);
    dim3 threads(16, 16);
    CPUAnimBitmap * bitmap = d->bitmap;

    // The buffers cannot be swapped because the texture references stay
    // bound to fixed buffers; a flag tracks which one is the source instead.
    volatile bool dstOut = true;
    for(int i = 0; i < 90; i++){
        float *in, *out;
        if(dstOut){
            in = d->dev_inSrc;
            out = d->dev_outSrc;
        }else{
            in = d->dev_outSrc;
            out = d->dev_inSrc;
        }
        copy_const_kernel<<<blocks, threads>>>(in);
        HANDLE_ERROR(cudaGetLastError());   // a launch does not report errors by itself
        blend_kernel<<<blocks, threads>>>(out, dstOut);
        HANDLE_ERROR(cudaGetLastError());
        dstOut = !dstOut;
    }

    // The step count is even and the first step writes dev_outSrc, so the
    // final step writes dev_inSrc, which therefore holds the latest grid.
    float_to_color<<<blocks, threads>>>(d->output_bitmap, d->dev_inSrc);
    HANDLE_ERROR(cudaGetLastError());

    HANDLE_ERROR(cudaMemcpy(bitmap->get_ptr(),
                            d->output_bitmap,
                            bitmap->image_size(),
                            cudaMemcpyDeviceToHost));
    HANDLE_ERROR(cudaEventRecord(d->stop, 0));
    HANDLE_ERROR(cudaEventSynchronize(d->stop));
    float elapsedTime;
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, d->start, d->stop));
    d->totalTime += elapsedTime;
    ++d->frames;
    printf("Average Time per frame: %.3lf ms\n", d->totalTime / d->frames);
}

// Cleanup callback: unbinds the texture references, releases every device
// buffer allocated in main() and destroys the two timing events.
//
// d - simulation state whose device resources are released.
void anim_exit(DataBlock *d){
    // Unbind before freeing so no texture reference points at freed memory.
    HANDLE_ERROR(cudaUnbindTexture(texIn));
    HANDLE_ERROR(cudaUnbindTexture(texOut));
    HANDLE_ERROR(cudaUnbindTexture(texConstSrc));

    HANDLE_ERROR(cudaFree(d->dev_constSrc));
    HANDLE_ERROR(cudaFree(d->dev_inSrc));
    HANDLE_ERROR(cudaFree(d->dev_outSrc));
    // output_bitmap is cudaMalloc'd in main() as well and must be released too.
    HANDLE_ERROR(cudaFree(d->output_bitmap));

    HANDLE_ERROR(cudaEventDestroy(d->start));
    HANDLE_ERROR(cudaEventDestroy(d->stop));
}
// Sets up the texture-based simulation: creates the timing events, allocates
// the device buffers and binds them to the texture references, uploads the
// heater layout and the initial temperature grid, then hands control to the
// animation loop.
int main(){
    DataBlock data;
    CPUAnimBitmap bitmap(DIM, DIM, &data);
    data.bitmap = &bitmap;
    data.totalTime = 0;
    data.frames = 0;
    HANDLE_ERROR(cudaEventCreate(&data.start));
    HANDLE_ERROR(cudaEventCreate(&data.stop));
    // NOTE(review): the float grids are sized with bitmap.image_size(); this
    // presumably relies on the image using 4 bytes per pixel so that it equals
    // DIM * DIM * sizeof(float) - confirm against cpu_anim.h.
    HANDLE_ERROR(cudaMalloc((void**)&data.output_bitmap, bitmap.image_size()));

    // Allocate the grids, then bind each one to its texture reference.
    HANDLE_ERROR(cudaMalloc((void**)&data.dev_inSrc, bitmap.image_size()));
    HANDLE_ERROR(cudaMalloc((void**)&data.dev_outSrc, bitmap.image_size()));
    HANDLE_ERROR(cudaMalloc((void**)&data.dev_constSrc, bitmap.image_size()));
    HANDLE_ERROR(cudaBindTexture(NULL, texConstSrc,
                                 data.dev_constSrc,
                                 bitmap.image_size()));
    HANDLE_ERROR(cudaBindTexture(NULL, texIn,
                                 data.dev_inSrc,
                                 bitmap.image_size()));
    HANDLE_ERROR(cudaBindTexture(NULL, texOut,
                                 data.dev_outSrc,
                                 bitmap.image_size()));

    float *temp = (float*)malloc(bitmap.image_size());
    if(temp == NULL){
        // Without the staging buffer nothing can be uploaded; release the
        // device resources and stop instead of dereferencing a null pointer.
        printf("Failed to allocate host staging buffer\n");
        anim_exit(&data);
        return 1;
    }
    // Heater layout: positions and temperatures of the constant-temperature cells.
    for(int i = 0; i < DIM * DIM; i++){
        temp[i] = 0;
        int x = i % DIM;
        int y = i / DIM;
        if((x > 300) && (x < 600) && (y >310) && (y < 601))
            temp[i] = MAX_TEMP;
    }
    temp[DIM * 100 + 100] = (MIN_TEMP + MAX_TEMP) / 2;
    temp[DIM * 700 + 100] = MIN_TEMP;
    temp[DIM * 300 + 300] = MIN_TEMP;
    temp[DIM * 200 + 700] = MIN_TEMP;

    for(int y = 800; y < 900; y++){
        for(int x = 400;x < 500; x++){
            temp[x + y * DIM] = MIN_TEMP;
        }
    }
    HANDLE_ERROR(cudaMemcpy(data.dev_constSrc, temp,
        bitmap.image_size(),
        cudaMemcpyHostToDevice));
    // Initial temperature grid: the heater layout plus a hot region at x < 200, y >= 800.
    for(int y = 800; y < DIM; y++){
        for(int x = 0; x < 200; x++)
            temp[x + y * DIM] = MAX_TEMP;
    }
    HANDLE_ERROR(cudaMemcpy(data.dev_inSrc, temp,
        bitmap.image_size(),
        cudaMemcpyHostToDevice));
    free(temp);
    bitmap.anim_and_exit( (void (*) (void*, int))anim_gpu,(void(*)(void*))anim_exit);
}

》》》如果是使用2D的texture,则声明时:

// Two-dimensional texture references: the second template argument is the dimensionality.
texture<float, 2> texConstSrc;                    // heater layout
texture<float, 2> texIn;
texture<float, 2> texOut;

读数时:使用tex2D(),并且不需要offset变量,直接使用x,y坐标。
绑定时,书上没有细说:

    cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();// channel format of one float per texel
    HANDLE_ERROR(cudaBindTexture2D(NULL, texConstSrc,
                                 data.dev_constSrc,
                                 desc, DIM, DIM,        // channel descriptor, width and height in texels
                                 sizeof(float) * DIM)); // pitch: size of one row in bytes
    HANDLE_ERROR(cudaBindTexture2D(NULL, texIn,
                                 data.dev_inSrc,
                                 desc, DIM, DIM,
                                 sizeof(float) * DIM));
    HANDLE_ERROR(cudaBindTexture2D(NULL, texOut,
                                 data.dev_outSrc,
                                 desc, DIM, DIM,
                                 sizeof(float) * DIM));

感觉用了textureM比没用要慢很多很多很多

CUDA_by_Examples_Chapter7

标签:

原文地址:http://blog.csdn.net/small_lwei/article/details/51356190

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!