cuda编程-矩阵乘法（2）

时间：2017-11-06 22:53:43 阅读：205 评论：0 收藏：0 [点我收藏+]

标签：void ons efi block emc oat nes gpu 代码

采用shared memory加速

技术分享

代码

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <algorithm>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include "functions.h"

#define TILE_SIZE 16

__global__ void matrixMulKernel(float *C, float *A, float *B, int width, int height){
    __shared__ float tile_A[TILE_SIZE][TILE_SIZE];
    __shared__ float tile_B[TILE_SIZE][TILE_SIZE];
    unsigned int tx = threadIdx.x;
    unsigned int ty = threadIdx.y;
    unsigned int gx = blockIdx.x * TILE_SIZE + tx;
    unsigned int gy = blockIdx.y * TILE_SIZE + ty;
    if (gx >= width || gy >= height)
        return;

    // Load shared memory
    int tile_num = (width + TILE_SIZE - 1) / TILE_SIZE;
    float sum = 0;
    for (int i = 0; i < tile_num ; ++i){
        int bound = min(width, TILE_SIZE);
        for (int j = tx; j < bound; j+=blockDim.x){
            tile_A[ty][j] = A[gy * width + i * bound + j];
        }
        for (int j = ty; j < bound; j += blockDim.y){
            tile_B[j][tx] = B[(i * bound + j) * width + gx];
        }
        __syncthreads();

        for (int j = 0; j < bound; ++j){
            sum += tile_A[ty][j] * tile_B[j][tx];
        }
    }
    C[gy*width + gx] = sum;

}

void constantInit(float *data, int size, float val){
    for (int i = 0; i < size; ++i){
        data[i] = val;
    }
}

void matrixMul(){
    int dev_id = 0;
    cudaSetDevice(dev_id);

    // Allocate host memory for matrices A and B
    int width = 128;
    int height = 128;
    unsigned int size = width * height;
    unsigned int mem_size = sizeof(float)* size;
    float *h_A = (float *)malloc(mem_size);
    float *h_B = (float *)malloc(mem_size);
    float *h_C = (float *)malloc(mem_size);


    // Initialize host memory
    const float valB = 0.01f;
    constantInit(h_A, size, 1.0f);
    constantInit(h_B, size, valB);

    // Allocate device memory
    float *d_A, *d_B, *d_C;
    cudaMalloc((void **)&d_A, mem_size);
    cudaMalloc((void **)&d_B, mem_size);
    cudaMalloc((void **)&d_C, mem_size);

    // Memcpy
    cudaMemcpy(d_A, h_A, mem_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, mem_size, cudaMemcpyHostToDevice);

    // Config dim
    dim3 block(TILE_SIZE, TILE_SIZE);
    dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y);
    matrixMulKernel <<<grid, block >>>(d_C, d_A, d_B, width, height);

    // Memcpy device to host
    cudaMemcpy(h_C, d_C, mem_size, cudaMemcpyDeviceToHost);

    // Check
    printf("Checking computed result for correctness: ");
    bool correct = true;

    // test relative error by the formula
    //     |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|>  < eps
    double eps = 1.e-6; // machine zero

    for (int i = 0; i < (int)(width * height); i++)
    {
        double abs_err = fabs(h_C[i] - (width * valB));
        double dot_length = width;
        double abs_val = fabs(h_C[i]);
        double rel_err = abs_err / abs_val / dot_length;

        if (abs_err > eps)
        {
            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], (float)(width*height), eps);
            correct = false;
        }
    }

    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

}

cuda编程-矩阵乘法（2）

标签：void ons efi block emc oat nes gpu 代码

原文地址：http://www.cnblogs.com/haiyang21/p/7795286.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行