标签:gpu argv const emc start val after app bat
<1> Basic
#include <stdio.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

// Number of elements processed; also the number of threads launched.
#define NUM 15

// Writes din[i] * din[i] into dout[i] for i in [0, NUM).
//
// Launch layout: a single 1D block. Only threadIdx.x is used as the element
// index, so launching more than one block would make every block redo the
// same elements. Threads whose index is >= NUM do nothing, so the block may
// be larger than NUM (e.g. rounded up to a warp multiple).
// No shared memory is used. Both pointers must be device pointers to at
// least NUM floats.
__global__ void square(float *dout, float *din)
{
    int idx = threadIdx.x;
    if (idx < NUM)
    {
        float f = din[idx];
        dout[idx] = f * f;
    }
}

// Squares the values 0..NUM-1 on GPU 0 and prints the results to stdout.
// Returns 0 on success and 1 if any CUDA call fails. Device buffers are
// released on every path after they may have been allocated.
int main(int argc, char **argv)
{
    const int bytes = sizeof(float) * NUM;

    // Fill the host input with 0, 1, 2, ...
    float host_in[NUM];
    for (int i = 0; i < NUM; i++)
    {
        host_in[i] = static_cast<float>(i);
    }
    float host_out[NUM];

    float *device_in = 0;
    float *device_out = 0;
    bool succeeded = false;
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        return 1;
    }

    // Single-pass loop so that any failure can `break` straight to cleanup.
    do
    {
        // GPU memory allocation
        cudaStatus = cudaMalloc((void**)&device_in, bytes);
        if (cudaStatus != cudaSuccess)
        {
            fprintf(stderr, "cudaMalloc failed for device_in: %s\n", cudaGetErrorString(cudaStatus));
            break;
        }
        cudaStatus = cudaMalloc((void**)&device_out, bytes);
        if (cudaStatus != cudaSuccess)
        {
            fprintf(stderr, "cudaMalloc failed for device_out: %s\n", cudaGetErrorString(cudaStatus));
            break;
        }

        cudaStatus = cudaMemcpy(device_in, host_in, bytes, cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess)
        {
            fprintf(stderr, "cudaMemcpy (host to device) failed: %s\n", cudaGetErrorString(cudaStatus));
            break;
        }

        // GPU kernel: 1 block, NUM threads
        square<<<1, NUM>>>(device_out, device_in);

        // A launch does not return a status; an invalid configuration is
        // only visible through cudaGetLastError().
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess)
        {
            fprintf(stderr, "square launch failed: %s\n", cudaGetErrorString(cudaStatus));
            break;
        }

        // Errors raised while the kernel runs surface at the next sync.
        cudaStatus = cudaDeviceSynchronize();
        if (cudaStatus != cudaSuccess)
        {
            fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching square!\n", cudaStatus);
            break;
        }

        cudaStatus = cudaMemcpy(host_out, device_out, bytes, cudaMemcpyDeviceToHost);
        if (cudaStatus != cudaSuccess)
        {
            fprintf(stderr, "cudaMemcpy failed!");
            break;
        }

        succeeded = true;
    } while (false);

    // Free GPU memory (cudaFree accepts a null pointer).
    cudaFree(device_in);
    cudaFree(device_out);

    // Only print when host_out was actually filled by the copy back.
    if (succeeded)
    {
        for (int i = 0; i < NUM; i++)
        {
            fprintf(stdout, "%f \n", host_out[i]);
        }
    }

    // Keep the console window open until a key is pressed.
    getchar();
    return succeeded ? 0 : 1;
}
<2> Simple calculation:
I = (R+G+B)/3
I = R*0.299f + G*0.587f + 0.114f*B
CPU:
// Single-threaded CPU reference: converts an RGBA image to greyscale.
//
// rgbaImage  input pixels, row-major, numRows * numCols entries
// greyImage  output buffer, row-major, numRows * numCols bytes
// numRows    image height in pixels
// numCols    image width in pixels
//
// Each output byte is .299*R + .587*G + .114*B (x, y, z of the uchar4),
// converted from float to unsigned char. The w component is ignored.
void rgbaToGreyscaleCpu(const uchar4* const rgbaImage, unsigned char *const greyImage,
                        const size_t numRows, const size_t numCols)
{
    for (size_t row = 0; row < numRows; ++row)
    {
        // Walk one scanline at a time through row-local pointers.
        const size_t rowStart = row * numCols;
        const uchar4* const srcRow = rgbaImage + rowStart;
        unsigned char* const dstRow = greyImage + rowStart;

        for (size_t col = 0; col < numCols; ++col)
        {
            const uchar4 pixel = srcRow[col];
            const float luma = .299f * pixel.x + .587f * pixel.y + .114f * pixel.z;
            dstRow[col] = luma;
        }
    }
}
GPU:
// CUDA kernel: each thread converts at most one RGBA pixel to greyscale.
//
// Launch layout: 1D grid of 1D blocks; only the x dimensions are used. The
// grid must supply at least numRows * numCols threads. Surplus threads in
// the last block are masked off by the bounds check. No shared memory.
//
// rgbaImage  device pointer to numRows * numCols input pixels
// greyImage  device pointer to numRows * numCols output bytes
__global__ void rgbaToGreyscaleCudaKernel(const uchar4* const rgbaImage,
                                          unsigned char* const greyImage,
                                          const int numRows, const int numCols)
{
    // Non-positive dimensions mean there is nothing to convert.
    if (numRows <= 0 || numCols <= 0)
    {
        return;
    }

    // Do the arithmetic in size_t: numRows * numCols can exceed INT_MAX and
    // blockIdx.x * blockDim.x can exceed 32 bits for large images.
    const size_t pixelCount = static_cast<size_t>(numRows) * static_cast<size_t>(numCols);
    const size_t pointIndex = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    // Necessary because the grid is rounded up to whole blocks.
    if (pointIndex < pixelCount)
    {
        uchar4 const imagePoint = rgbaImage[pointIndex];
        greyImage[pointIndex] = .299f * imagePoint.x + .587f * imagePoint.y + .114f * imagePoint.z;
    }
}

// Launches the greyscale conversion on the GPU with 256 threads per block.
//
// h_rgbaImage  not used by this function; kept for interface compatibility
// d_rgbaImage  device pointer to the input pixels
// d_greyImage  device pointer to the output buffer
// numRows      image height in pixels
// numCols      image width in pixels
//
// An empty image (numRows * numCols == 0) returns without launching.
// The launch is asynchronous and this function does not report errors:
// callers should check cudaGetLastError() and synchronize before reading
// d_greyImage. The kernel takes int dimensions, so numRows and numCols
// must each fit in an int.
void rgbaToGreyscaleCuda(const uchar4 * const h_rgbaImage, uchar4 * const d_rgbaImage,
                         unsigned char* const d_greyImage,
                         const size_t numRows, const size_t numCols)
{
    (void)h_rgbaImage;

    const size_t pixelCount = numRows * numCols;
    if (pixelCount == 0)
    {
        // pixelCount - 1 would wrap around and yield a nonsensical grid.
        return;
    }

    const unsigned int blockThreadSize = 256;
    const size_t numberOfBlocks = 1 + ((pixelCount - 1) / blockThreadSize); // a/b rounded up

    const dim3 blockSize(blockThreadSize, 1, 1);
    const dim3 gridSize(static_cast<unsigned int>(numberOfBlocks), 1, 1);

    rgbaToGreyscaleCudaKernel<<<gridSize, blockSize>>>(d_rgbaImage, d_greyImage,
                                                       static_cast<int>(numRows),
                                                       static_cast<int>(numCols));
}
GPU openEXR image(RGBA) -> gray image
标签:gpu argv const emc start val after app bat
原文地址:http://www.cnblogs.com/gearslogy/p/7113009.html