标签:
实现矩阵相加
#include <stdlib.h>
#include <stdio.h>
#include <iostream>
#include <opencv/cv.h>
#include <opencv/highgui.h>
#include <opencv2/opencv.hpp>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
using namespace std;
using namespace cv;

// Abort with a readable message if a CUDA runtime call fails. Without this,
// an earlier failure only surfaces as confusing errors in later calls.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/**
 * Element-wise addition of two 2D arrays of int2: d_C = d_A + d_B.
 *
 * Expects a 2D launch in which the x dimension covers `width` and the
 * y dimension covers `height`; surplus threads are masked by the bounds check.
 *
 * @param d_A     first input, width * height int2 elements, row-major
 * @param d_B     second input, same layout as d_A
 * @param d_C     output, same layout as d_A
 * @param width   number of int2 elements per row
 * @param height  number of rows
 */
__global__ void Add_kernel(const int2* d_A, const int2* d_B, int2* d_C, int width, int height)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    if (x < width && y < height)
    {
        int offset = x + y * width;
        d_C[offset].x = d_A[offset].x + d_B[offset].x;
        d_C[offset].y = d_A[offset].y + d_B[offset].y;
    }
}

/**
 * Demo: fills a 3x4 CV_32S matrix with i + j, adds it to itself on the GPU
 * and prints the matrix before and after.
 */
int main()
{
    Mat img(3, 4, CV_32S, Scalar_<int>(0));

    cout << img << endl;
    cout << endl;

    for (int i = 0; i < img.rows; i++)
    {
        for (int j = 0; j < img.cols; j++)
        {
            img.at<int>(i, j) = i + j;
        }
    }
    cout << endl;

    cout << img << endl;

    // The kernel reinterprets the int buffer as int2, so every row must hold
    // an even number of ints and rows must be tightly packed.
    const int intsPerRow = img.cols * img.channels();
    if (!img.isContinuous() || intsPerRow % 2 != 0)
    {
        fprintf(stderr, "Matrix must be continuous with an even number of ints per row\n");
        return EXIT_FAILURE;
    }

    // Dimensions in the units the kernel works in: int2 elements per row, and rows.
    // Passing (img.rows, img.cols) here would make the kernel touch twice as
    // many bytes as were allocated.
    const int width = intsPerRow / 2;
    const int height = img.rows;

    size_t memSize = img.step * img.rows;
    int2* d_A = NULL;
    int2* d_B = NULL;
    int2* d_C = NULL;
    CUDA_CHECK(cudaMalloc((void**)&d_A, memSize));
    CUDA_CHECK(cudaMalloc((void**)&d_B, memSize));
    CUDA_CHECK(cudaMalloc((void**)&d_C, memSize));

    CUDA_CHECK(cudaMemcpy(d_A, img.data, memSize, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, img.data, memSize, cudaMemcpyHostToDevice));

    dim3 threads(16, 16);
    // Ceil-divide so the grid covers the data; x maps to width, y to height.
    dim3 grids((width + threads.x - 1) / threads.x, (height + threads.y - 1) / threads.y);
    Add_kernel<<<grids, threads>>>(d_A, d_B, d_C, width, height);
    CUDA_CHECK(cudaGetLastError());  // launch-configuration errors

    // Blocking copy; execution errors from the kernel are reported here.
    CUDA_CHECK(cudaMemcpy(img.data, d_C, memSize, cudaMemcpyDeviceToHost));
    cout << "GPU" << endl;
    cout << img << endl;
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));

    system("pause");
    return 0;
}
标签:
原文地址:http://www.cnblogs.com/LzKlyhPorter/p/4611281.html