您的位置：首页 > 其它

cuda下调试矩阵相加程序

2012-05-10 14:19 225 查看

这几天一直在调试CUDA下的矩阵相加程序，但运行结果一直不对，最后才发现是我的NVIDIA显卡不支持double类型（双精度运算需要计算能力1.3及以上的GPU），导致出错。

下面是我的代码：

#include"stdio.h"
#include"cuda_runtime.h"
#include"cutil_inline.h"

/**
 * Element-wise matrix addition on row-pitched device buffers: D_C = D_A + D_B.
 *
 * One thread handles one element. The column index comes from the x dimension
 * of the launch and the row index from the y dimension, so the grid must cover
 * at least weidth threads in x and height threads in y. Threads that fall
 * outside the matrix do nothing.
 *
 * @param D_A     first input matrix (device pointer)
 * @param D_B     second input matrix (device pointer)
 * @param D_C     output matrix (device pointer)
 * @param weidth  number of columns, i.e. elements per row
 * @param height  number of rows
 * @param pitch   row stride in BYTES; the same value is applied to all three
 *                buffers
 *
 * NOTE: double-precision device code needs a GPU of compute capability 1.3 or
 * newer; when compiling for older targets nvcc demotes double to float.
 */
__global__ void cal(double *D_A, double *D_B, double *D_C, int weidth, int height, size_t pitch)
{
    int col = threadIdx.x + blockDim.x * blockIdx.x;
    int row = threadIdx.y + blockDim.y * blockIdx.y;

    if (col < weidth && row < height) {
        /* The pitch is a byte count, so step to the start of the row through
           a char pointer before indexing by element. */
        const size_t rowOffset = (size_t)row * pitch;
        const double *rowA = (const double *)((const char *)D_A + rowOffset);
        const double *rowB = (const double *)((const char *)D_B + rowOffset);
        double *rowC = (double *)((char *)D_C + rowOffset);

        rowC[col] = rowA[col] + rowB[col];
    }
}
/**
 * Host driver for the double-precision matrix-add example.
 *
 * Builds two height x weidth matrices on the host (H_A[i][j] = i + j,
 * H_B[i][j] = 1), copies them into pitched device memory, launches cal(),
 * copies the sum back and prints it.
 *
 * Every CUDA runtime call goes through cutilSafeCall (from cutil_inline.h) so
 * that a failing call is reported instead of silently producing wrong output.
 *
 * @return 0 on success, 1 if a host allocation fails.
 */
int main()
{
    int height = 10;
    int weidth = 3;
    /* Bytes in one tightly packed host row; also the host-side pitch. */
    const size_t rowBytes = weidth * sizeof(double);

    /* Each host matrix is one contiguous row-major buffer. A double** table of
       separately allocated rows would not work here: cudaMemcpy2D expects a
       single buffer with a fixed row stride. */
    double *H_A = (double *)malloc(rowBytes * height);
    double *H_B = (double *)malloc(rowBytes * height);
    double *H_C = (double *)malloc(rowBytes * height);
    if (H_A == NULL || H_B == NULL || H_C == NULL) {
        fprintf(stderr, "host allocation failed\n");
        free(H_A);
        free(H_B);
        free(H_C);
        return 1;
    }

    for (int i = 0; i < height; i++)
        for (int j = 0; j < weidth; j++)
            H_C[i * weidth + j] = 0;

    printf("startinputH_A\n");
    for (int i = 0; i < height; i++) {
        for (int j = 0; j < weidth; j++) {
            H_A[i * weidth + j] = i + j;
            H_B[i * weidth + j] = 1;
            /* Separate the values so the printed row stays readable. */
            printf("%f %f ", H_B[i * weidth + j], H_A[i * weidth + j]);
        }
        printf("\n");
    }
    printf("finishinput\n");

    /* All three requests use the same width and height, so each call is
       expected to report the same pitch; the kernel takes a single value. */
    size_t pitch = 0;
    double *D_A = NULL, *D_B = NULL, *D_C = NULL;
    cutilSafeCall(cudaMallocPitch((void **)&D_A, &pitch, rowBytes, height));
    cutilSafeCall(cudaMallocPitch((void **)&D_B, &pitch, rowBytes, height));
    cutilSafeCall(cudaMallocPitch((void **)&D_C, &pitch, rowBytes, height));
    printf("pitchis%zu\n", pitch);   /* pitch is a size_t, not an int */

    cutilSafeCall(cudaMemcpy2D(D_A, pitch, H_A, rowBytes, rowBytes, height, cudaMemcpyHostToDevice));
    cutilSafeCall(cudaMemcpy2D(D_B, pitch, H_B, rowBytes, rowBytes, height, cudaMemcpyHostToDevice));

    /* Derive the grid from the matrix size (ceiling division) instead of
       hard-coding 3 x 10; the kernel's bounds check discards spare threads. */
    dim3 dimBlock(16, 16, 1);
    dim3 dimGrid((weidth + dimBlock.x - 1) / dimBlock.x,
                 (height + dimBlock.y - 1) / dimBlock.y,
                 1);
    cal<<<dimGrid, dimBlock>>>(D_A, D_B, D_C, weidth, height, pitch);
    /* A kernel launch returns nothing; launch errors must be fetched explicitly. */
    cutilSafeCall(cudaGetLastError());

    cutilSafeCall(cudaMemcpy2D(H_C, rowBytes, D_C, pitch, rowBytes, height, cudaMemcpyDeviceToHost));

    for (int i = 0; i < height; i++) {
        for (int j = 0; j < weidth; j++)
            printf("%f ", H_C[i * weidth + j]);
        printf("\n");
    }

    free(H_A);
    free(H_B);
    free(H_C);

    cutilSafeCall(cudaFree(D_A));
    cutilSafeCall(cudaFree(D_B));
    cutilSafeCall(cudaFree(D_C));

    return 0;
}

以上是我的源代码。主机端用malloc申请一块连续的存储空间来模拟二维数组，读写元素时自己按“行号×列数+列号”计算下标。

但是结果一直都不对，我开始一点点排除。首先排除了主机内存与设备存储之间传递数据出错的问题。这里需要注意两点：第一，用cudaMallocPitch分配的显存不能在kernel里直接用二维下标访问，必须自己用pitch计算每一行的起始位置；第二，要拷贝进GPU的二维数组不能定义成double** array的形式（即array指向一个double*[height]的指针数组，其中每个指针再各自指向一个一维数组）。这样各行数据在内存中并不连续，而且指针数组里保存的是主机内存地址，显然不是GPU内的地址，拷贝到GPU后会出现错误。

在排除完传递错误后，我开始怀疑是不是GPU端的显存分配有问题。

然后我从网上搜到了以下代码

/**************************************************************************
*矩阵相加的例子
***************************************************************************/
#include<stdio.h>
#include<stdlib.h>
#include<cutil_inline.h>

/************************************************************************
 * InitCUDA
 *
 * Selects the first CUDA device whose reported major compute capability is
 * at least 1 and makes it the current device.
 *
 * Returns true when a device was selected (always true in device-emulation
 * builds), false when no device is found or a runtime call fails.
 ************************************************************************/
#if __DEVICE_EMULATION__

bool InitCUDA(void) { return true; }

#else
bool InitCUDA(void)
{
    int count = 0;
    int i = 0;

    /* A failing query is treated the same as "no device present". */
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
        fprintf(stderr, "Thereisnodevice.\n");
        return false;
    }

    /* Stop at the first device whose properties can be read and that reports
       a major compute capability of 1 or higher. */
    for (i = 0; i < count; i++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess && prop.major >= 1) {
            break;
        }
    }

    /* Running off the end of the loop means no device qualified. */
    if (i == count) {
        fprintf(stderr, "ThereisnodevicesupportingCUDA.\n");
        return false;
    }

    if (cudaSetDevice(i) != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed.\n");
        return false;
    }

    printf("CUDAinitialized.\n");
    return true;
}
#endif

/*****************************************************************************************************
 * Kernel: element-wise matrix addition, c = a + b, on row-pitched device buffers.
 *
 * One thread handles one element: the row index i comes from the y dimension
 * of the launch and the column index j from the x dimension. The grid must
 * cover at least width threads in x and height threads in y; threads outside
 * the matrix do nothing.
 *
 * a, b    input matrices (device pointers)
 * c       output matrix (device pointer)
 * pitch   row stride in BYTES; the same value is applied to all three buffers
 * height  number of rows
 * width   number of columns
 ******************************************************************************************************/
__global__ void myKernel(const float *a, const float *b, float *c, size_t pitch, int height, int width)
{
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int j = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < height && j < width) {
        /* The pitch is a byte count, so step to the start of the row through
           a char pointer before indexing by element. */
        const size_t rowOffset = (size_t)i * pitch;
        const float *rowA = (const float *)((const char *)a + rowOffset);
        const float *rowB = (const float *)((const char *)b + rowOffset);
        float *rowC = (float *)((char *)c + rowOffset);

        rowC[j] = rowA[j] + rowB[j];
    }
}
/**
 * Host driver for the single-precision matrix-add example.
 *
 * Fills two height x width host matrices (cpu_A[r][c] = r * c,
 * cpu_B[r][c] = r + c), prints them, copies them into pitched device memory,
 * launches myKernel, copies the sum back into cpu_C and prints it.
 *
 * @return 0 on success; cutilSafeCall handles failing CUDA runtime calls.
 */
int main(int argc, char *argv[])
{
    //if(!InitCUDA())
    //return 0;

    /* The three matrices in device memory. */
    float *gpu_A;
    float *gpu_B;
    float *gpu_C;

    /* Compile-time constants so the host arrays below have legal fixed bounds. */
    const int width = 3;    /* matrix width (number of columns) */
    const int height = 10;  /* matrix height (number of rows) */
    size_t pitch;           /* row stride of the device buffers, in bytes */

    /* Host matrices: statically sized 2-D arrays, each one contiguous block. */
    float cpu_A[height][width];
    float cpu_B[height][width];
    float cpu_C[height][width];

    /* Allocate pitched device memory. All three requests use the same width
       and height, so each call is expected to report the same pitch. */
    cutilSafeCall(cudaMallocPitch((void **)&gpu_A, &pitch, sizeof(float) * width, height));
    cutilSafeCall(cudaMallocPitch((void **)&gpu_B, &pitch, sizeof(float) * width, height));
    cutilSafeCall(cudaMallocPitch((void **)&gpu_C, &pitch, sizeof(float) * width, height));
    /* Print the pitch (a size_t, hence %zu). */
    printf("Thepitchis:%zu\n", pitch);

    /* Initialise the host matrices. */
    for (int r = 0; r < height; r++) {
        for (int c = 0; c < width; c++) {
            cpu_A[r][c] = (float)(r * c);
            cpu_B[r][c] = (float)(r + c);
            cpu_C[r][c] = 0.0f;
        }
    }

    /* Print the host matrices. They are 2-D arrays, so they must be indexed
       as [r][c]; [r * width + c] would select a whole row, not an element. */
    printf("\nCPU_ADATA\n");
    for (int r = 0; r < height; ++r) {
        for (int c = 0; c < width; ++c) {
            printf("%f\t", cpu_A[r][c]);
        }
        printf("\n");
    }
    printf("\nCPU_BDATA\n");
    for (int r = 0; r < height; ++r) {
        for (int c = 0; c < width; ++c) {
            printf("%f\t", cpu_B[r][c]);
        }
        printf("\n");
    }
    printf("\nCPU_CDATA\n");
    for (int r = 0; r < height; ++r) {
        for (int c = 0; c < width; ++c) {
            printf("%f\t", cpu_C[r][c]);
        }
        printf("\n");
    }

    /* Copy cpu_A and cpu_B into gpu_A and gpu_B. */
    cutilSafeCall(cudaMemcpy2D(gpu_A, pitch, cpu_A, sizeof(float) * width, sizeof(float) * width, height, cudaMemcpyHostToDevice));
    cutilSafeCall(cudaMemcpy2D(gpu_B, pitch, cpu_B, sizeof(float) * width, sizeof(float) * width, height, cudaMemcpyHostToDevice));

    /* Derive the grid from the matrix size (ceiling division) instead of
       hard-coding it; the kernel's bounds check discards spare threads. */
    dim3 Db(16, 16, 1);                         /* threads per block */
    dim3 Dg((width + Db.x - 1) / Db.x,
            (height + Db.y - 1) / Db.y,
            1);                                 /* blocks per grid */
    myKernel<<<Dg, Db, 0>>>(gpu_A, gpu_B, gpu_C, pitch, height, width);
    /* A kernel launch returns nothing; launch errors must be fetched explicitly. */
    cutilSafeCall(cudaGetLastError());

    /* Copy the result from gpu_C back into cpu_C. */
    cutilSafeCall(cudaMemcpy2D(cpu_C, sizeof(float) * width, gpu_C, pitch, sizeof(float) * width, height, cudaMemcpyDeviceToHost));

    /* Print the result matrix. */
    printf("\nAfterchangeCPU_CDAT\n");
    for (int r = 0; r < height; ++r) {
        for (int c = 0; c < width; ++c) {
            printf("%f\t", cpu_C[r][c]);
        }
        printf("\n");
    }

    /* The host matrices live on the stack and must not be passed to free(). */

    /* Release device memory. */
    cutilSafeCall(cudaFree(gpu_A));
    cutilSafeCall(cudaFree(gpu_B));
    cutilSafeCall(cudaFree(gpu_C));

    //CUT_EXIT(argc,argv);

    return 0;
}

这种静态数组的分配方式是可行的，因为静态二维数组同样占用一块连续的空间。以上代码说明我的显存分配方式没有问题。最后我将double改成float，程序就运行正确了。这时才发现之前编译时一直有一条被我忽略的提示：“double is not supported”。查了一下才知道，是我这款NVIDIA显卡不支持双精度浮点运算。

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航