CUDA之旅:矩阵相加
2017-06-07 18:43
141 查看
矩阵相加CUDA实现
// CUDA implementation of matrix addition
// Author: Eric Lv
// Email: Eric2014_Lv@sjtu.edu.cn
// Date: 6/7/2017
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
//#include <cuda.h>

// Matrices are N x N ints, stored row-major in flat arrays of N * N elements.
#define N 32
// Edge length of the square thread block used for the launch. Kept independent
// of N so that changing N cannot push the block past the per-block thread limit.
#define BLOCK_DIM 16

// Element-wise matrix addition: c = a + b.
//
// a, b, c: flat arrays of N * N ints (row-major) passed as kernel arguments.
// Launch layout: a 2D grid of 2D blocks that together cover at least N x N
// threads. Threads that fall outside the matrix do nothing, so the grid does
// not have to divide N evenly. No shared memory is used.
__global__ void matrix_add(const int a[], const int b[], int c[])
{
    // threadIdx.x selects the column so that neighbouring threads in x access
    // neighbouring addresses.
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;

    if (row < N && col < N)
    {
        int idx = row * N + col;
        c[idx] = a[idx] + b[idx];
    }
}

// Fills two N x N host matrices, adds them with matrix_add on the device,
// copies the result back and checks every element against the host sum.
// Returns 0 when every element matches, -1 on any allocation, copy, launch
// or verification failure. All buffers are released on every path.
int main(void)
{
    const size_t bytes = sizeof(int) * N * N;
    int i;
    int status = -1;
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
    int *host_a = NULL, *host_b = NULL, *host_c = NULL;
    cudaError_t err = cudaSuccess;

    // Threads per block, and enough blocks (ceil-div) to cover the N x N matrix.
    dim3 threads_in_block(BLOCK_DIM, BLOCK_DIM);
    dim3 blocks_in_grid((N + BLOCK_DIM - 1) / BLOCK_DIM,
                        (N + BLOCK_DIM - 1) / BLOCK_DIM);

    host_a = (int *)malloc(bytes);
    host_b = (int *)malloc(bytes);
    host_c = (int *)malloc(bytes);
    if (host_a == NULL || host_b == NULL || host_c == NULL)
    {
        printf("Host malloc failed!\n");
        goto cleanup;
    }

    err = cudaMalloc((void **)&dev_a, bytes);
    if (err != cudaSuccess)
    {
        printf("cudaMalloc (a) is failed!\n");
        goto cleanup;
    }
    err = cudaMalloc((void **)&dev_b, bytes);
    if (err != cudaSuccess)
    {
        printf("cudaMalloc (b) is failed!\n");
        goto cleanup;
    }
    err = cudaMalloc((void **)&dev_c, bytes);
    if (err != cudaSuccess)
    {
        printf("cudaMalloc (c) is failed!\n");
        goto cleanup;
    }

    for (i = 0; i < N * N; i++)
    {
        host_a[i] = 2 * i + 1;
        host_b[i] = 3 * i - 1;
    }

    err = cudaMemcpy(dev_a, host_a, bytes, cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
    {
        printf("Host to device (a) is failed!\n");
        goto cleanup;
    }
    err = cudaMemcpy(dev_b, host_b, bytes, cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
    {
        printf("Host to device (b) is failed!\n");
        goto cleanup;
    }

    // Launch the kernel on the GPU.
    matrix_add<<<blocks_in_grid, threads_in_block>>>(dev_a, dev_b, dev_c);
    // A kernel launch returns nothing; a bad launch configuration only shows
    // up through cudaGetLastError().
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        printf("Kernel launch failed: %s\n", cudaGetErrorString(err));
        goto cleanup;
    }

    // This blocking copy also waits for the kernel and reports any error
    // raised while it was executing.
    err = cudaMemcpy(host_c, dev_c, bytes, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
    {
        printf("Device to host (c) is failed!\n");
        goto cleanup;
    }

    for (i = 0; i < N * N; i++)
    {
        if (host_a[i] + host_b[i] != host_c[i])
        {
            printf("a[%d]%d + b[%d]%d != c[%d]%d.\n", i, host_a[i], i, host_b[i], i, host_c[i]);
            goto cleanup;
        }
    }
    printf("Congratulations! All entris are correct! You have finished the CUDA code!\n");
    status = 0;

cleanup:
    // free(NULL) and cudaFree(NULL) are no-ops, so this is safe no matter how
    // far the allocations got.
    free(host_a);
    free(host_b);
    free(host_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    return status;
}
相关文章推荐
- cuda中矩阵相加编程
- cuda下调试矩阵相加程序
- CUDA任意矩阵相乘 TLP最终版
- cuda——数组相加(矢量和)
- Matlab矩阵行相加、列相加
- cuda开发矩阵乘法测试你的GPU效率
- CUDA二维矩阵加法
- 稀疏矩阵的运算(相乘,输出,转置,相加)
- 第九周项目三稀疏矩阵相加
- CUDA-GDB调试一般方法————矩阵乘的例子
- 【CUDA并行编程之四】矩阵相乘
- cuda初学(1):稀疏矩阵向量乘法(单精度)
- 第九周项目三(2)稀疏矩阵相加
- 第十四周项目4矩阵运算(1)矩阵相加
- 基于Cuda的几种并行稀疏矩阵乘法方法(一)
- (五)利用GPU计算整数相加 CUDA
- cuda矩阵之心得
- CUDA之矩阵转置
- cuda 任意维度的矩阵相乘
- 第一个cuda程序-基于VS2010+CUDA5.0 两个向量相加的GPU实现