CUDA 中的向量相加编程
2015-12-15 20:02
447 查看
#include<stdio.h>
#include<stdlib.h>
#include<unistd.h>
/*
 * VecAdd: element-wise sum of two float arrays, C[k] = A[k] + B[k].
 *
 * Each thread handles at most one element; its index is built from the
 * x-dimension block and thread coordinates. A thread whose index is not
 * below N returns without touching memory, so the launch may contain more
 * threads than there are elements.
 *
 * A, B - input arrays (only indices below N are read)
 * C    - output array (only indices below N are written)
 * N    - number of elements to process
 */
__global__ void VecAdd(float*A, float*B, float*C, int N)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= N)
        return;  // surplus thread in the last block
    C[idx] = A[idx] + B[idx];
}
/* Abort with a diagnostic if a CUDA runtime call does not return cudaSuccess. */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t status_ = (call);                                      \
        if (status_ != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                    __LINE__, cudaGetErrorString(status_));                \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Host driver for VecAdd.
 *
 * Fills two host arrays of N floats with 1.0 and 2.0, copies them to the
 * device, launches VecAdd to add them, copies the result back and prints
 * one value per line. Every host allocation and CUDA runtime call is
 * checked.
 *
 * Returns 0 on success and EXIT_FAILURE if a host allocation fails; a
 * failing CUDA call terminates the process through CUDA_CHECK.
 */
int main()
{
    const int N = 100;
    const size_t size = N * sizeof(float);
    int i = 0;

    // Allocate input vectors h_A and h_B and output vector h_C in host memory
    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);
    if (h_A == NULL || h_B == NULL || h_C == NULL)
    {
        fprintf(stderr, "host allocation of %lu bytes failed\n",
                (unsigned long)size);
        free(h_A);  // free(NULL) is a no-op, so this is safe for any subset
        free(h_B);
        free(h_C);
        return EXIT_FAILURE;
    }

    // Initialize input vectors (float literals avoid a double round-trip)
    for (i = 0; i < N; i++)
    {
        h_A[i] = 1.0f;
        h_B[i] = 2.0f;
        h_C[i] = 0.0f;
    }

    // Allocate vectors in device memory
    float *d_A = NULL;
    float *d_B = NULL;
    float *d_C = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_A, size));
    CUDA_CHECK(cudaMalloc((void **)&d_B, size));
    CUDA_CHECK(cudaMalloc((void **)&d_C, size));

    // Copy vectors from host memory to device memory
    CUDA_CHECK(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice));

    // Invoke kernel: round the block count up so all N elements are covered
    const int threadsPerBlock = 256;
    const int blockPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    VecAdd<<<blockPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
    // A kernel launch returns no status itself; query it explicitly.
    CUDA_CHECK(cudaGetLastError());

    // Copy result from device memory to host memory.
    // h_C contains the result in host memory; a failure during kernel
    // execution is reported by this blocking copy.
    CUDA_CHECK(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));

    // One value per line; the original "%f" ran all values together.
    for (i = 0; i < N; i++)
    {
        printf("%f\n", h_C[i]);
    }

    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));
    free(h_A);
    free(h_B);
    free(h_C);
    return 0;
}
#undef CUDA_CHECK
#include<stdlib.h>
#include<unistd.h>
/*
 * VecAdd: element-wise sum of two float arrays, C[k] = A[k] + B[k].
 *
 * Each thread handles at most one element; its index is built from the
 * x-dimension block and thread coordinates. A thread whose index is not
 * below N returns without touching memory, so the launch may contain more
 * threads than there are elements.
 *
 * A, B - input arrays (only indices below N are read)
 * C    - output array (only indices below N are written)
 * N    - number of elements to process
 */
__global__ void VecAdd(float*A, float*B, float*C, int N)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= N)
        return;  // surplus thread in the last block
    C[idx] = A[idx] + B[idx];
}
/* Abort with a diagnostic if a CUDA runtime call does not return cudaSuccess. */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t status_ = (call);                                      \
        if (status_ != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                    __LINE__, cudaGetErrorString(status_));                \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Host driver for VecAdd.
 *
 * Fills two host arrays of N floats with 1.0 and 2.0, copies them to the
 * device, launches VecAdd to add them, copies the result back and prints
 * one value per line. Every host allocation and CUDA runtime call is
 * checked.
 *
 * Returns 0 on success and EXIT_FAILURE if a host allocation fails; a
 * failing CUDA call terminates the process through CUDA_CHECK.
 */
int main()
{
    const int N = 100;
    const size_t size = N * sizeof(float);
    int i = 0;

    // Allocate input vectors h_A and h_B and output vector h_C in host memory
    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);
    if (h_A == NULL || h_B == NULL || h_C == NULL)
    {
        fprintf(stderr, "host allocation of %lu bytes failed\n",
                (unsigned long)size);
        free(h_A);  // free(NULL) is a no-op, so this is safe for any subset
        free(h_B);
        free(h_C);
        return EXIT_FAILURE;
    }

    // Initialize input vectors (float literals avoid a double round-trip)
    for (i = 0; i < N; i++)
    {
        h_A[i] = 1.0f;
        h_B[i] = 2.0f;
        h_C[i] = 0.0f;
    }

    // Allocate vectors in device memory
    float *d_A = NULL;
    float *d_B = NULL;
    float *d_C = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_A, size));
    CUDA_CHECK(cudaMalloc((void **)&d_B, size));
    CUDA_CHECK(cudaMalloc((void **)&d_C, size));

    // Copy vectors from host memory to device memory
    CUDA_CHECK(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice));

    // Invoke kernel: round the block count up so all N elements are covered
    const int threadsPerBlock = 256;
    const int blockPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    VecAdd<<<blockPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
    // A kernel launch returns no status itself; query it explicitly.
    CUDA_CHECK(cudaGetLastError());

    // Copy result from device memory to host memory.
    // h_C contains the result in host memory; a failure during kernel
    // execution is reported by this blocking copy.
    CUDA_CHECK(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));

    // One value per line; the original "%f" ran all values together.
    for (i = 0; i < N; i++)
    {
        printf("%f\n", h_C[i]);
    }

    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));
    free(h_A);
    free(h_B);
    free(h_C);
    return 0;
}
#undef CUDA_CHECK
相关文章推荐
- 图像和滚动 、 编程规范和Xcode(一)
- 07 java main方法
- Spring学习笔记<一>
- python基础教程共60课-第43课查天气1
- Java设计模式之策略模式
- 《spring技术内幕》学习(二)ioc容器实现
- spring-session源码解读-5
- Spring AOP实现方式
- JAVA设计模式之抽象工厂模式
- java设计模式之建造者模式
- Java中堆内存和栈内存详解
- JAVA设计模式之工厂模式(简单工厂模式+工厂方法模式)
- JAVA设计模式之原型模式
- JAVA设计模式之代理模式
- Eclipse无法启动报An internal error occurred during: "reload maven project". java.lang.NullPointerException
- Java内存管理:深入Java内存区域(深入理解Java虚拟机的第2章内容,加上个人浅显理解)
- windows php安装ImageMagick扩展
- C#中listview 选中一行并把对应的几列数据放到textbox中
- JAVA设计模式之享元模式
- 代码对比看:&& || %errorlevel%的区别