您的位置:首页 > 其它

CUDA之矩阵乘法——TILE 与 shared memory

2016-05-25 10:12 447 查看

CUDA 矩阵乘法

将输入数据分成很多个 TILE,使用 shared memory 进行并行计算。



代码

#include "device_functions.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cmath>
#include <cstdlib>
#include <stdio.h>
#include <iostream>
// A dense matrix stored in row-major order: element (row, col) lives at
// elements[row * stride + col]. `stride` is the leading dimension, which
// allows a sub-matrix view (see GetSubMatrix) to alias its parent's storage.
typedef struct {
int width;      // number of columns
int height;     // number of rows
int stride;     // leading dimension (elements between consecutive rows)
float* elements;  // backing storage; sub-matrix views do not own this
} Matrix;

// Tile edge length: each thread block computes one BLOCK_SIZE x BLOCK_SIZE
// tile of C, and blockDim is (BLOCK_SIZE, BLOCK_SIZE).
#define BLOCK_SIZE  16
// Matrix dimension used by the host demo (N x N). Divisible by BLOCK_SIZE.
#define N           3072

// Read element (row, col) of A, honoring A's leading dimension (stride).
__device__ float GetElement(const Matrix A, int row, int col) {
    int offset = row * A.stride + col;
    return A.elements[offset];
}

// Write `value` into element (row, col) of A, honoring A's stride.
__device__ void SetElement(Matrix A, int row, int col, float value) {
    int offset = row * A.stride + col;
    A.elements[offset] = value;
}

// Return a BLOCK_SIZE x BLOCK_SIZE view of the (row, col) tile of A.
// The view aliases A's storage (same stride, offset pointer) — no copy.
__device__ Matrix GetSubMatrix(Matrix A, int row, int col) {
    Matrix tile;
    tile.width  = BLOCK_SIZE;
    tile.height = BLOCK_SIZE;
    tile.stride = A.stride;
    // Top-left element of the tile: BLOCK_SIZE rows down per `row`,
    // BLOCK_SIZE columns across per `col`.
    tile.elements = A.elements + BLOCK_SIZE * (A.stride * row + col);
    return tile;
}

// Tiled matrix multiply: C = A * B (row-major, leading dimension = stride).
//
// Launch config expected: blockDim = (BLOCK_SIZE, BLOCK_SIZE),
// gridDim = (ceil(C.width / BLOCK_SIZE), ceil(C.height / BLOCK_SIZE)).
// Each block computes one BLOCK_SIZE x BLOCK_SIZE tile of C, streaming
// matching tiles of A and B through shared memory.
//
// Generalized from the original: dimensions need NOT be multiples of
// BLOCK_SIZE — out-of-range tile elements are loaded as 0 and the final
// store is bounds-guarded (behavior is unchanged for divisible sizes).
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C) {
    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

    int row = threadIdx.y;
    int col = threadIdx.x;
    int globalRow = blockIdx.y * BLOCK_SIZE + row;  // row of C this thread owns
    int globalCol = blockIdx.x * BLOCK_SIZE + col;  // col of C this thread owns

    float acc = 0.0f;
    int numTiles = (A.width + BLOCK_SIZE - 1) / BLOCK_SIZE;  // ceil-div
    for (int m = 0; m < numTiles; ++m) {
        int aCol = m * BLOCK_SIZE + col;  // column of A loaded by this thread
        int bRow = m * BLOCK_SIZE + row;  // row of B loaded by this thread
        // Zero-fill out-of-range elements so the inner product is unaffected.
        As[row][col] = (globalRow < A.height && aCol < A.width)
                           ? A.elements[globalRow * A.stride + aCol]
                           : 0.0f;
        Bs[row][col] = (bRow < B.height && globalCol < B.width)
                           ? B.elements[bRow * B.stride + globalCol]
                           : 0.0f;
        // All threads reach this barrier (loads above are branch-free at the
        // statement level), so it is never divergent.
        __syncthreads();
        for (int e = 0; e < BLOCK_SIZE; ++e)
            acc += As[row][e] * Bs[e][col];
        // Prevent the next iteration from overwriting tiles still in use.
        __syncthreads();
    }
    if (globalRow < C.height && globalCol < C.width)
        C.elements[globalRow * C.stride + globalCol] = acc;
}

// Abort with a diagnostic if a CUDA runtime call failed. The original code
// checked no return values, so allocation/copy/launch failures were silent.
static void checkCuda(cudaError_t err, const char* what) {
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Host wrapper: compute C = A * B on the GPU.
// Copies A and B to the device, launches the tiled kernel, and copies the
// result back into C.elements (which the caller must have allocated).
void MatMul(const Matrix A, const Matrix B, Matrix C) {
    Matrix d_A;
    d_A.width = d_A.stride = A.width; d_A.height = A.height;
    size_t size = A.width * A.height * sizeof(float);
    checkCuda(cudaMalloc((void**)&d_A.elements, size), "cudaMalloc A");
    checkCuda(cudaMemcpy(d_A.elements, A.elements, size,
                         cudaMemcpyHostToDevice), "copy A to device");

    Matrix d_B;
    d_B.width = d_B.stride = B.width; d_B.height = B.height;
    size = B.width * B.height * sizeof(float);
    checkCuda(cudaMalloc((void**)&d_B.elements, size), "cudaMalloc B");
    checkCuda(cudaMemcpy(d_B.elements, B.elements, size,
                         cudaMemcpyHostToDevice), "copy B to device");

    Matrix d_C;
    d_C.width = d_C.stride = C.width; d_C.height = C.height;
    size = C.width * C.height * sizeof(float);
    checkCuda(cudaMalloc((void**)&d_C.elements, size), "cudaMalloc C");

    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    // Ceil-div so the grid covers dimensions that are not exact multiples of
    // BLOCK_SIZE (identical to the original grid when they are multiples).
    dim3 dimGrid((B.width + dimBlock.x - 1) / dimBlock.x,
                 (A.height + dimBlock.y - 1) / dimBlock.y);
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
    // Launch-configuration errors only surface via cudaGetLastError().
    checkCuda(cudaGetLastError(), "MatMulKernel launch");

    // Blocking copy: also synchronizes with the kernel before reading C.
    checkCuda(cudaMemcpy(C.elements, d_C.elements, size,
                         cudaMemcpyDeviceToHost), "copy C to host");

    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}

// Initialize *A as an N x N matrix with elements[i] = i * 0.1f.
// Aborts on allocation failure (the original dereferenced an unchecked
// calloc result — ~37 MB per matrix, so failure is plausible).
void init(Matrix *A) {
    A->width = A->height = A->stride = N;
    A->elements = (float*) calloc((size_t)N * N, sizeof(float));
    if (A->elements == NULL) {
        fprintf(stderr, "init: failed to allocate %d x %d matrix\n", N, N);
        exit(EXIT_FAILURE);
    }
    for (int i = 0; i < N * N; i++)
        A->elements[i] = i * 0.1f;
}

/*
 * CPU reference GEMM over N x N buffers.
 * Computes C[col*N + row] = sum_k A[k*N + row] * B[col*N + k], i.e. C = A*B
 * when the buffers are viewed column-major. NOTE(review): under the GPU
 * kernel's row-major view this is B*A, not A*B — the comparison in main()
 * only agrees because A and B are initialized identically there; verify
 * before reusing with distinct inputs.
 */
void simple_sgemm(const float *A, const float *B, float *C) {
    for (int col = 0; col < N; col++) {
        for (int row = 0; row < N; row++) {
            float acc = 0.0f;
            for (int k = 0; k < N; k++)
                acc += A[k * N + row] * B[col * N + k];
            C[col * N + row] = acc;
        }
    }
}

int main() {
    // A, B: inputs; C: GPU tiled/shared-memory result; C_ref: CPU reference.
    Matrix A, B, C, C_ref;
    init(&A);
    init(&B);
    init(&C);
    init(&C_ref);

    // Print only a small corner for inspection: the original dumped two full
    // 3072 x 3072 matrices (~18.9 million printf calls) to stdout.
    const int PREVIEW = 8;
    for (int i = 0; i < PREVIEW; i++) {
        for (int j = 0; j < PREVIEW; j++)
            printf("%f \t", B.elements[j * N + i]);
        printf("\n");
    }

    // GPU: C = A * B using the tiled shared-memory kernel.
    MatMul(A, B, C);
    for (int i = 0; i < PREVIEW; i++) {
        for (int j = 0; j < PREVIEW; j++)
            printf("%f \t", C.elements[j * N + i]);
        printf("\n");
    }

    // CPU reference result.
    simple_sgemm(A.elements, B.elements, C_ref.elements);

    // Compare with an absolute OR relative tolerance (float accumulation
    // error grows with N). fabs() on the reference term fixes the original's
    // sign-unsafe relative bound.
    for (int i = 0; i < N * N; i++) {
        float err = fabs(C_ref.elements[i] - C.elements[i]);
        if (err > 1e-5f && err > fabs(C.elements[i]) * 1e-5f) {
            printf("ERR!\n");
            exit(-1);
        }
    }
    printf("GOOD!\n");

    // Release host buffers (the original leaked all four).
    free(A.elements);
    free(B.elements);
    free(C.elements);
    free(C_ref.elements);
    return 0;
}


经与matlab对比,乘法结果正确!
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: