标签:getchar id3 one sof cout void int const pre
没想到我 2010 年买的笔记本显卡 GT330M 竟然还能跑 CUDA,果断小试了一把。环境为 CUDA 6.5 + VS2012,用积分图实现了一个均值滤波,功能类似于 OpenCV 的 blur() 函数。
使用lena.jpg做测试,效果如下:
代码在此:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>

#include <iostream>
#include <string>

#include <opencv2/opencv.hpp>

using namespace std;
using namespace cv;

// Reports a failed CUDA runtime call and terminates the program.
// `what` names the operation that produced `err`. The getchar() keeps the
// console window open so the message can be read before the process exits.
static void checkCuda(cudaError_t err, const char* what)
{
    if (err != cudaSuccess)
    {
        cout << "err=" << err << " (" << cudaGetErrorString(err) << ") in " << what << endl;
        getchar();
        exit(EXIT_FAILURE);
    }
}

// In-place running sum down every column of both images.
//
// Despite the name, each thread owns one *column* (index tidx) and walks it
// from row 1 to row imgH-1, so that element (j, tidx) ends up holding the sum
// of rows 0..j of that column.
//
// Launch layout: 1D in x, at least imgW threads in total; y is unused.
// pIntegImgLena and pPtsImg are row-major imgW x imgH device buffers.
__global__ void rowAddKernel(float* pIntegImgLena, int* pPtsImg, int imgW, int imgH)
{
    const int tidx = blockDim.x * blockIdx.x + threadIdx.x;
    if (tidx >= imgW || imgH <= 0)
    {
        return;
    }

    // Carry the running totals in registers instead of re-reading the
    // previous row from global memory; the additions are the same ones.
    float sumAcc = pIntegImgLena[tidx];
    int cntAcc = pPtsImg[tidx];
    for (int j = 1; j < imgH; j++)
    {
        const int idx = j * imgW + tidx;
        sumAcc += pIntegImgLena[idx];
        cntAcc += pPtsImg[idx];
        pIntegImgLena[idx] = sumAcc;
        pPtsImg[idx] = cntAcc;
    }
}

// In-place running sum along every row of both images.
//
// Despite the name, each thread owns one *row* (index tidy) and walks it from
// column 1 to column imgW-1. Run together with rowAddKernel (in either order)
// this turns the buffers into inclusive integral images.
//
// Launch layout: 1D in y, at least imgH threads in total; x is unused.
// NOTE: neighbouring threads access addresses imgW elements apart here.
__global__ void colAddKernel(float* pIntegImgLena, int* pPtsImg, int imgW, int imgH)
{
    const int tidy = blockDim.y * blockIdx.y + threadIdx.y;
    if (tidy >= imgH || imgW <= 0)
    {
        return;
    }

    const int rowStart = tidy * imgW;
    float sumAcc = pIntegImgLena[rowStart];
    int cntAcc = pPtsImg[rowStart];
    for (int i = 1; i < imgW; i++)
    {
        sumAcc += pIntegImgLena[rowStart + i];
        cntAcc += pPtsImg[rowStart + i];
        pIntegImgLena[rowStart + i] = sumAcc;
        pPtsImg[rowStart + i] = cntAcc;
    }
}

// Box (mean) filter evaluated from inclusive integral images.
//
//   pImgLena      output, imgW x imgH 8-bit image (written only)
//   pIntegImgLena inclusive integral image of the pixel values
//   pPtsImg       inclusive integral image of an all-ones image, i.e. the
//                 number of pixels in the rectangle (0,0)..(x,y)
//   win           window radius; the window is (2*win+1) x (2*win+1),
//                 clipped at the image border. Negative values are treated as 0.
//
// Launch layout: 2D, one thread per output pixel. Both prefix-sum passes must
// have completed before this kernel runs.
//
// For an inclusive integral image I, the sum over rows top..bot and columns
// left..right is
//   I[bot][right] - I[top-1][right] - I[bot][left-1] + I[top-1][left-1]
// where any term with index -1 is zero. Using the window corners themselves
// (top/left instead of top-1/left-1) would drop the window's first row and
// column and yields a zero pixel count for 1-pixel-wide windows.
//
// NOTE: float holds integers exactly only up to 2^24, so for large images the
// integral values (and hence the means) are approximate.
__global__ void filterKernel(uchar* pImgLena, float* pIntegImgLena, int* pPtsImg, int imgW, int imgH, int win)
{
    const int tidx = blockDim.x * blockIdx.x + threadIdx.x;
    const int tidy = blockDim.y * blockIdx.y + threadIdx.y;
    if (tidx >= imgW || tidy >= imgH)
    {
        return;
    }

    win = max(win, 0);

    // Window bounds, clipped to the image.
    const int left = max(tidx - win, 0);
    const int right = min(tidx + win, imgW - 1);
    const int top = max(tidy - win, 0);
    const int bot = min(tidy + win, imgH - 1);

    const bool hasAbove = top > 0;   // is there a row top-1 to subtract?
    const bool hasLeft = left > 0;   // is there a column left-1 to subtract?
    const int rowBot = bot * imgW;
    const int rowAbove = (top - 1) * imgW;  // only dereferenced when hasAbove

    float sum = pIntegImgLena[rowBot + right];
    int cnt = pPtsImg[rowBot + right];
    if (hasAbove)
    {
        sum -= pIntegImgLena[rowAbove + right];
        cnt -= pPtsImg[rowAbove + right];
    }
    if (hasLeft)
    {
        sum -= pIntegImgLena[rowBot + left - 1];
        cnt -= pPtsImg[rowBot + left - 1];
    }
    if (hasAbove && hasLeft)
    {
        // This corner was subtracted twice above; add it back once.
        sum += pIntegImgLena[rowAbove + left - 1];
        cnt += pPtsImg[rowAbove + left - 1];
    }

    // cnt >= 1 here: the clipped window always contains pixel (tidx, tidy).
    const float value = sum / cnt;
    pImgLena[tidy * imgW + tidx] = (uchar)value;
}

// Loads data/lena.jpg as grayscale, mean-filters it on the GPU with a
// radius-3 window using integral images, and shows the input and the result.
int main()
{
    // Read the source image as single-channel 8-bit.
    string imgPath = "data/lena.jpg";
    Mat imgLena = imread(imgPath, 0);
    if (imgLena.empty())
    {
        cout << "failed to load image: " << imgPath << endl;
        getchar();
        return 1;
    }

    int imgH = imgLena.rows;
    int imgW = imgLena.cols;
    namedWindow("lena");
    imshow("lena", imgLena);
    waitKey(0);

    // Host buffer receiving the filtered image.
    Mat filterLena = Mat::zeros(imgLena.size(), CV_8UC1);

    // Seed data for the two integral images: the pixel values as float, and
    // an all-ones image whose integral gives per-rectangle pixel counts.
    Mat integImgLena;
    imgLena.convertTo(integImgLena, CV_32FC1);
    Mat ptsImg(imgLena.size(), CV_32SC1, Scalar(1));

    // Allocate device memory.
    const size_t numPixels = (size_t)imgH * (size_t)imgW;
    uchar* pImgLena = NULL;
    float* pIntegImgLena = NULL;
    int* pPtsImg = NULL;
    checkCuda(cudaMalloc(&pImgLena, numPixels * sizeof(uchar)), "cudaMalloc(pImgLena)");
    checkCuda(cudaMalloc(&pIntegImgLena, numPixels * sizeof(float)), "cudaMalloc(pIntegImgLena)");
    checkCuda(cudaMalloc(&pPtsImg, numPixels * sizeof(int)), "cudaMalloc(pPtsImg)");

    // Upload the seed data. pImgLena is output-only, so it needs no upload.
    checkCuda(cudaMemcpy(pIntegImgLena, integImgLena.data, numPixels * sizeof(float), cudaMemcpyHostToDevice),
              "cudaMemcpy(pIntegImgLena)");
    checkCuda(cudaMemcpy(pPtsImg, ptsImg.data, numPixels * sizeof(int), cudaMemcpyHostToDevice),
              "cudaMemcpy(pPtsImg)");

    // Prefix sum down the columns (one thread per column).
    dim3 block(128, 1);
    dim3 grid((imgW + block.x - 1) / block.x, 1);
    rowAddKernel<<<grid, block, 0>>>(pIntegImgLena, pPtsImg, imgW, imgH);
    checkCuda(cudaGetLastError(), "rowAddKernel launch");

    // Prefix sum along the rows (one thread per row).
    block = dim3(1, 128);
    grid = dim3(1, (imgH + block.y - 1) / block.y);
    colAddKernel<<<grid, block, 0>>>(pIntegImgLena, pPtsImg, imgW, imgH);
    checkCuda(cudaGetLastError(), "colAddKernel launch");

    // Mean filter, one thread per pixel.
    int win = 3;
    block = dim3(16, 16);
    grid = dim3((imgW + block.x - 1) / block.x, (imgH + block.y - 1) / block.y);
    filterKernel<<<grid, block, 0>>>(pImgLena, pIntegImgLena, pPtsImg, imgW, imgH, win);
    checkCuda(cudaGetLastError(), "filterKernel launch");

    // Blocking copy: waits for the kernels above and reports any error they
    // raised while executing.
    checkCuda(cudaMemcpy(filterLena.data, pImgLena, numPixels * sizeof(uchar), cudaMemcpyDeviceToHost),
              "cudaMemcpy(filterLena)");

    namedWindow("filterLena");
    imshow("filterLena", filterLena);
    waitKey(0);

    cudaFree(pImgLena);
    cudaFree(pIntegImgLena);
    cudaFree(pPtsImg);
    return 0;
}
标签:getchar id3 one sof cout void int const pre
原文地址:http://www.cnblogs.com/riddick/p/7577293.html