CUDA编程系列学习-从入门到放弃
2017-05-26 11:49
363 查看
本文主要从代码方面进行CUDA系列知识的学习,基础知识就不一一细讲了,毕竟很多大牛已经介绍得很详细了,,,,
bong bong bong 开始吧!
一、整数相加
由GPU计算两个整数求和,使用一个Block 一个thread就可以解决问题了。
代码如下:
二、一维数组相加
在上面的例子的基础上,将整数改为数组,并且一个Block 里面只设置一个线程(实现Block层次的粗粒度并行),所以kernel函数里面下标的索引就用blockIdx.x。
注意:由整数变为数组后函数参数的变化,是否取地址符
代码如下:
三、一维数组点乘求和-使用共享内存
该例子只用了一个Block,在该Block里面声明 了一块共享内存数组,通过线程索引将乘积保存到对应元素,利用栅栏同步函数等待所有线程计算完成后再用线程0来累加点乘之和。
kernel:
__global__ void dotProduct(int *a, int *b, int *c) {
__shared__ int shareArray
;
shareArray[threadIdx.x] = a[threadIdx.x] * b[threadIdx.x];
__syncthreads();
if (0 == threadIdx.x) {
int sum = 0;
for (int i = 0; i < N; i++) {
sum += shareArray[i];
}
*c = sum;
}
}
void thirdDemo_SharMem() {
//host value of a b c
int *hA, *hB, *hCone;
size_t size = N * sizeof(int);
hA = (int*)malloc(size);
hB = (int*)malloc(size);
hCone = (int*)malloc(sizeof(int));
//srand( (unsigned) time(NULL) );
for (int i = 0; i < N; i++) {
hA[i] = i+1;
hB[i] = 2;
}
printf("A:\n");
for (int i = 0; i < N; i++) {
printf("%5d", hA[i]);
}
printf("\nB:\n");
for (int i = 0; i < N; i++) {
printf("%5d", hB[i]);
}
printf("\n");
//device value of a b c
int *devA, *devB, *devC;
//allocate device memery of a b c (use void**)
cudaMalloc((void**)&devA, size);
cudaMalloc((void**)&devB, size);
cudaMalloc((void**)&devC, sizeof(int));
//copy host value to device (use hA address &hA)
cudaMemcpy(devA, hA, size, cudaMemcpyHostToDevice);
cudaMemcpy(devB, hB, size, cudaMemcpyHostToDevice);
//launch add() kernel on GPU use one Block
dotProduct << <1, N >> > (devA, devB, devC);
//return the answer from the device
cudaMemcpy(hCone, devC, sizeof(int), cudaMemcpyDeviceToHost);
printf("C:\n");
printf("%5d", *hCone);
//Free the device memory
cudaFree(devA);
cudaFree(devB);
cudaFree(devC);
free(hA);
free(hB);
free(hCone);
}
四:
bong bong bong 开始吧!
一、整数相加
由GPU计算两个整数求和,使用一个Block 一个thread就可以解决问题了。
代码如下:
__global__ void add(int *a, int *b, int *c) { *c = *a + *b; } void firstDemo(){ //host value of a b c int hA, hB, hC; hA = 10, hB = 20, hC = 0; //device value of a b c int *devA, *devB, *devC; //allocate device memery of a b c (use void**) cudaMalloc((void**)&devA, sizeof(int)); cudaMalloc((void**)&devB, sizeof(int)); cudaMalloc((void**)&devC, sizeof(int)); //copy host value to device (use hA address &hA) cudaMemcpy(devA, &hA, sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(devB, &hB, sizeof(int), cudaMemcpyHostToDevice); //launch add() kernel on GPU add << <1, 1 >> > (devA, devB, devC); //return the answer from the device cudaMemcpy(&hC, devC, sizeof(int), cudaMemcpyDeviceToHost); printf("C=%d\n", hC); //Free the device memory cudaFree(devA); cudaFree(devB); cudaFree(devC); }
二、一维数组相加
在上面的例子的基础上,将整数改为数组,并且一个Block 里面只设置一个线程(实现Block层次的粗粒度并行),所以kernel函数里面下标的索引就用blockIdx.x。
注意:由整数变为数组后函数参数的变化,是否取地址符
代码如下:
__global__ void vectorAdd(int *a, int *b, int *c) { //use blockidx.x to access block's index c[blockIdx.x] = a[blockIdx.x] + b[blockIdx.x]; } #define N 100 void secondDemo() { //host value of a b c int *hA, *hB, *hC; size_t size = N * sizeof(int); hA = (int*)malloc(size); hB = (int*)malloc(size); hC = (int*)malloc(size); //srand( (unsigned) time(NULL) ); for (int i = 0; i < N; i++) { hA[i] = rand()%20; hB[i] = rand() % 20; } printf("A:\n"); for (int i = 0; i < N; i++) { printf("%5d", hA[i]); } printf("\nB:\n"); for (int i = 0; i < N; i++) { printf("%5d", hB[i]); } printf("\n"); //device value of a b c int *devA, *devB, *devC; //allocate device memery of a b c (use void**) cudaMalloc((void**)&devA, size); cudaMalloc((void**)&devB, size); cudaMalloc((void**)&devC, size); //copy host value to device (use hA address &hA) cudaMemcpy(devA, hA, size, cudaMemcpyHostToDevice); cudaMemcpy(devB, hB, size, cudaMemcpyHostToDevice); //launch add() kernel on GPU vectorAdd << <N, 1 >> > (devA, devB, devC); //return the answer from the device cudaMemcpy(hC, devC, size, cudaMemcpyDeviceToHost); printf("C:\n"); for (int i = 0; i < N; i++) { printf("%5d", hC[i]); } printf("\n"); //Free the device memory cudaFree(devA); cudaFree(devB); cudaFree(devC); free(hA); free(hB); free(hC); }
三、一维数组点乘求和-使用共享内存
该例子只用了一个Block,在该Block里面声明 了一块共享内存数组,通过线程索引将乘积保存到对应元素,利用栅栏同步函数等待所有线程计算完成后再用线程0来累加点乘之和。
kernel:
__global__ void dotProduct(int *a, int *b, int *c) {
__shared__ int shareArray
;
shareArray[threadIdx.x] = a[threadIdx.x] * b[threadIdx.x];
__syncthreads();
if (0 == threadIdx.x) {
int sum = 0;
for (int i = 0; i < N; i++) {
sum += shareArray[i];
}
*c = sum;
}
}
void thirdDemo_SharMem() {
//host value of a b c
int *hA, *hB, *hCone;
size_t size = N * sizeof(int);
hA = (int*)malloc(size);
hB = (int*)malloc(size);
hCone = (int*)malloc(sizeof(int));
//srand( (unsigned) time(NULL) );
for (int i = 0; i < N; i++) {
hA[i] = i+1;
hB[i] = 2;
}
printf("A:\n");
for (int i = 0; i < N; i++) {
printf("%5d", hA[i]);
}
printf("\nB:\n");
for (int i = 0; i < N; i++) {
printf("%5d", hB[i]);
}
printf("\n");
//device value of a b c
int *devA, *devB, *devC;
//allocate device memery of a b c (use void**)
cudaMalloc((void**)&devA, size);
cudaMalloc((void**)&devB, size);
cudaMalloc((void**)&devC, sizeof(int));
//copy host value to device (use hA address &hA)
cudaMemcpy(devA, hA, size, cudaMemcpyHostToDevice);
cudaMemcpy(devB, hB, size, cudaMemcpyHostToDevice);
//launch add() kernel on GPU use one Block
dotProduct << <1, N >> > (devA, devB, devC);
//return the answer from the device
cudaMemcpy(hCone, devC, sizeof(int), cudaMemcpyDeviceToHost);
printf("C:\n");
printf("%5d", *hCone);
//Free the device memory
cudaFree(devA);
cudaFree(devB);
cudaFree(devC);
free(hA);
free(hB);
free(hCone);
}
四:
相关文章推荐
- Lua入门系列----pil学习笔记之 Type and Values (1)
- 李浩学习计算机系列笔记——ADO.NET基础入门
- Lua入门系列----pil学习笔记之 Type and Values (1)
- [新手入门]快速学习 ADO.NET Entity Framework系列文章 #1~#2
- ROR 学习笔记系列二 基于ROR的简单Hello World程序入门
- Struts入门文章系列(链接)及学习笔记
- Lua入门系列----pil学习笔记之Getting Start
- Lua入门系列----pil学习笔记之Type and Values (2)
- Lua入门系列----pil学习笔记之Getting Start
- [新手入门]快速学习 ADO.NET Entity Framework系列文章 #4 -- 数据新增、删除、修改(ObkectContext的 .SaveChange()方法)
- ERP入门系列学习
- Lua入门系列----pil学习笔记之Getting Start
- Lua入门系列----pil学习笔记之Type and Values (2)
- Lua入门系列----pil学习笔记之Getting Start
- Lua入门系列----pil学习笔记之 Type and Values (1)
- Lua入门系列----pil学习笔记之Type and Values (2)
- oracle 学习入门系列(1):基础,存储过程
- Lua入门系列----pil学习笔记之Type and Values (2)
- 天轰穿-c#趣味编程系列视频 - vs2005/2008 winform实例入门 第二集 学习技巧风暴
- HACMP 认证学习系列,第 1 部分:入门