您的位置:首页 > 编程语言

CUDA 中向量相加编程

2015-12-15 20:02 447 查看
#include<stdio.h>

#include<stdlib.h>

#include<unistd.h>

// Element-wise addition of two float arrays: C[i] = A[i] + B[i] for i < N.
//
// Each thread handles at most one element. The element index is computed
// from the x components of the block and thread indices only, so the
// kernel is meant for a one-dimensional launch that supplies at least N
// threads in total; elements beyond the number of launched threads are
// left unwritten. Threads whose index is >= N do nothing.
//
// A, B and C are dereferenced in device code and must therefore be
// addresses the device can access, each holding at least N floats.
__global__ void VecAdd(float*A, float*B, float*C, int N)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // Guard the grid tail: the last block may extend past the array end.
    if (idx >= N)
    {
        return;
    }

    C[idx] = A[idx] + B[idx];
}

// Terminates the program with a diagnostic on stderr when a CUDA runtime
// call did not return cudaSuccess. `what` names the failing step.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Host driver: fills two N-element vectors with 1.0 and 2.0, adds them on
// the device with VecAdd, copies the sums back and prints them.
// Returns 0 on success; exits with EXIT_FAILURE if a host allocation or
// any CUDA runtime call fails.
int main()
{
    int N = 100;
    int i = 0;
    size_t size = N * sizeof(float);

    // Allocate input vectors h_A and h_B and the result vector h_C in host memory
    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);
    if (h_A == NULL || h_B == NULL || h_C == NULL)
    {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)size);
        free(h_A); // free(NULL) is a no-op, so partial failure is handled too
        free(h_B);
        free(h_C);
        return EXIT_FAILURE;
    }

    // Initialize input vectors (float literals avoid a double -> float conversion)
    for (i = 0; i < N; i++)
    {
        h_A[i] = 1.0f;
        h_B[i] = 2.0f;
        h_C[i] = 0.0f;
    }

    // Allocate vectors in device memory
    float *d_A = NULL;
    float *d_B = NULL;
    float *d_C = NULL;
    checkCuda(cudaMalloc((void **)&d_A, size), "cudaMalloc d_A");
    checkCuda(cudaMalloc((void **)&d_B, size), "cudaMalloc d_B");
    checkCuda(cudaMalloc((void **)&d_C, size), "cudaMalloc d_C");

    // Copy vectors from host memory to device memory
    checkCuda(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), "copy h_A -> d_A");
    checkCuda(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice), "copy h_B -> d_B");

    // Invoke kernel: round the block count up so every element gets a thread
    int threadsPerBlock = 256;
    int blockPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    VecAdd<<<blockPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
    // A kernel launch returns nothing; configuration errors are only
    // visible through cudaGetLastError().
    checkCuda(cudaGetLastError(), "VecAdd launch");

    // Copy result from device memory to host memory.
    // h_C contains the result in host memory afterwards. This blocking copy
    // also reports errors raised while the kernel was executing.
    checkCuda(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost), "copy d_C -> h_C");

    // NOTE(review): values are printed back-to-back with no separator or
    // trailing newline; the format string is kept unchanged so the program's
    // output stays the same.
    for (i = 0; i < N; i++)
    {
        printf("%f",h_C[i]);
    }

    checkCuda(cudaFree(d_A), "cudaFree d_A");
    checkCuda(cudaFree(d_B), "cudaFree d_B");
    checkCuda(cudaFree(d_C), "cudaFree d_C");

    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: