您的位置:首页 > 产品设计 > UI/UE

Cuda编程总结--cuda c programming Guide

2013-10-03 15:00 375 查看
Cuda编程总结2013-10-120:32:46
--------余家奎
 
参考书籍:NVIDIA CUDA C Programming Guide
OpenGL编程指南
 
学习cuda例子中的总结

1、__constant__和__device__,__shared__的使用说明...2

2、分配二维数组实现两个二维数组相加...2

3、用cudaMallocPitch和cudaMemcpy2D实现二维数组的分配和拷贝...5

4、cudaMalloc3D()和cudaMemcpy3D()函数的用法...7

5、不带共享存储器的矩阵的相乘...9

6、带shared memory的矩阵相乘...12

7、页锁定主机存储器Page-locked Host memory.16

8、纹理存储的使用texture memory.18

9、surface Memory的使用方法...20

10、opengl和cuda的交互...22

11、Formatted output---在device函数中使用printf函数,要求compute capability至少为2.0...27

12、Asserting在设备端的函数中,但是其要求其计算能力至少为2.0.28

13、Per Thread Allocation On heap每个线程在堆上分配...29

14、Per Thread Block Allocation每个线程块在堆上分配空间...29

15、Allocation Persisting Between Kernel Launches在堆上分配...30 

如有错误的地方还请指正。。。谢谢

 

1、__constant__和__device__,__shared__的使用说明

其对应的程序:
//
#include
"cuda_runtime.h"
#include
"device_launch_parameters.h"
#include
<stdio.h>
#include
<stdlib.h> 
// Device-side variable that main() writes with cudaMemcpyToSymbol and reads
// back with cudaMemcpyFromSymbol.
//
// Symbol copies only work on variables living in global (__device__) or
// constant (__constant__) memory. A __shared__ variable exists only while a
// thread block is running, so the former file-scope
// `__shared__ int device_global_var;` made both symbol copies fail.
//
// Equivalent alternative for the symbol-copy API:
// __constant__ int device_global_var=5;
__device__ int device_global_var=5;
// Empty kernel that only shows where a block-scope __shared__ declaration
// goes inside device code. It takes no arguments and touches no memory.
__global__ void kernel()
{
    __shared__ int xx;  // declared for illustration; never read or written
}
// Round-trips an int through device_global_var: copies host_var to the device
// symbol, prints it, copies the symbol back and prints it again.
// Returns 0 on success, 1 if the runtime rejects either symbol copy.
int main()
{
    int host_var=5;

    cudaError_t err=cudaMemcpyToSymbol(device_global_var,&host_var,sizeof(int));
    if (err!=cudaSuccess)
    {
        fprintf(stderr,"cudaMemcpyToSymbol failed: %s\n",cudaGetErrorString(err));
        return 1;
    }
    printf("value=%d\n",host_var);

    err=cudaMemcpyFromSymbol(&host_var,device_global_var,sizeof(int));
    if (err!=cudaSuccess)
    {
        fprintf(stderr,"cudaMemcpyFromSymbol failed: %s\n",cudaGetErrorString(err));
        return 1;
    }
    printf("device_value=%d\n",host_var);

    system("pause");
    return 0;
}

2、分配二维数组实现两个二维数组相加

#include
"cuda_runtime.h"
#include
"device_launch_parameters.h" 
#include
<stdio.h>
#include
<stdlib.h> 
#define  N 16 
// N x N integer matrices in device global memory. The [N][N] dimensions and
// the terminating semicolon were lost in the published listing; they are
// restored here to match the sizeof(int)*N*N symbol copies done in main().
// Only device_a is referenced by the host code in this example.
__device__ int device_a[N][N],device_b[N][N],device_c[N][N];

// Element-wise sum of two N x N integer matrices: c[y][x] = a[y][x] + b[y][x].
//
// Expects a 2D launch whose total thread extent covers at least N x N;
// threads whose global (x, y) falls outside the matrix do nothing.
// a, b, c must point to device memory holding N rows of N ints each.
__global__ void VecAdd(int a[N][N],int b[N][N],int c[N][N])
{
    int global_threadId_x=blockIdx.x*blockDim.x+threadIdx.x;
    int global_threadId_y=blockIdx.y*blockDim.y+threadIdx.y;
    if (global_threadId_x<N && global_threadId_y<N)
    {
        c[global_threadId_y][global_threadId_x]=a[global_threadId_y][global_threadId_x]+
            b[global_threadId_y][global_threadId_x];
    }
}
// Prints an N x N integer matrix to stdout: one row per line, each value
// followed by a single space.
void printfArray(int data[N][N])
{
    for (int i=0;i<N;i++)
    {
        for (int j=0;j<N;j++)
        {
            printf("%d ",data[i][j]);
        }
        printf("\n");
    }
}

// CPU reference for VecAdd: c[i][j] = a[i][j] + b[i][j] over N x N matrices.
void host_Add(int a[N][N],int b[N][N],int c[N][N])
{
    for (int i=0;i<N;i++)
    {
        for (int j=0;j<N;j++)
        {
            c[i][j]=a[i][j]+b[i][j];
        }
    }
}
// Fills an N x N host matrix with 0,1,2,..., copies it into the device symbol
// device_a, copies it back into tempA and prints the round-tripped values.
//
// The [N][N] dimensions were lost in the published listing and are restored.
// The unused locals (b, c) and the unused, never-freed cudaMalloc buffer
// (device_aa) from the original were removed.
// Returns 0 on success, 1 if a symbol copy fails.
int main()
{
    int a[N][N];
    int k=0;
    for (int i=0;i<N;i++)
    {
        for (int j=0;j<N;j++)
        {
            a[i][j]=k;
            k++;
        }
    }

    int tempA[N][N];
    cudaError_t err=cudaMemcpyToSymbol(device_a,a,sizeof(int)*N*N);
    if (err==cudaSuccess)
    {
        err=cudaMemcpyFromSymbol(tempA,device_a,sizeof(int)*N*N);
    }
    if (err!=cudaSuccess)
    {
        fprintf(stderr,"symbol copy failed: %s\n",cudaGetErrorString(err));
        return 1;
    }

    printf("tempA====\n");
    printfArray(tempA);
    system("pause");
    return 0;
}

3、用cudaMallocPitch和cudaMemcpy2D实现二维数组的分配和拷贝

代码:
#include
"cuda_runtime.h"
#include
"device_launch_parameters.h"
#include
<stdio.h>
#include
<stdlib.h>
#include
<iostream> 
// kernel which copies data from d_arrayto destinationArray 
// Sets every element of a pitched 2D device array to 123.0f and mirrors it
// into a densely packed row-major array.
//
//   d_array           pitched source array (rows are `pitch` bytes apart)
//   destinationArray  packed array of rowCount*columnCount floats
//   pitch             row stride of d_array in bytes (from cudaMallocPitch)
//   columnCount       number of floats per row
//   rowCount          number of rows
//
// Written as a 1D grid-stride loop: any 1D launch configuration produces the
// same result, and each element is written by exactly one thread. (The
// original had every launched thread rewrite the entire array.)
__global__ void CopyData(float* d_array,
                         float* destinationArray,
                         size_t pitch,
                         int columnCount,
                         int rowCount)
{
    const int total=rowCount*columnCount;
    const int stride=gridDim.x*blockDim.x;
    for (int idx=blockIdx.x*blockDim.x+threadIdx.x;idx<total;idx+=stride)
    {
        const int row=idx/columnCount;
        const int column=idx%columnCount;
        // Rows of a pitched allocation start `pitch` bytes apart, so the row
        // base must be computed with byte arithmetic.
        float* rowData=(float*)(((char*)d_array)+(row*pitch));
        rowData[column]=123.0f;
        destinationArray[idx]=rowData[column];
    }
}

// Allocates a pitched 2D device array and a packed device array, fills both
// with CopyData, copies them back (cudaMemcpy2D for the pitched array,
// cudaMemcpy for the packed one) and prints every element of both results.
int main(int argc,char**argv)
{
    int columnCount=15;
    int rowCount=10;
    float* d_array=NULL;             // pitched device array
    float* d_destinationArray=NULL;  // packed device array
    size_t pitch=0;                  // row stride in bytes chosen by cudaMallocPitch

    // Host buffers: h_array receives the packed copy, h_result the pitched one.
    float* h_array=new float[columnCount*rowCount];
    float* h_result=(float*)calloc(columnCount*rowCount,sizeof(float));

    cudaMallocPitch(&d_array,&pitch,columnCount*sizeof(float),rowCount);
    cudaMalloc(&d_destinationArray,columnCount*rowCount*sizeof(float));

    // One block is plenty for 150 elements.
    CopyData<<<1,256>>>(d_array,d_destinationArray,pitch,columnCount,rowCount);
    cudaError_t err=cudaGetLastError();
    if (err!=cudaSuccess)
    {
        fprintf(stderr,"CopyData launch failed: %s\n",cudaGetErrorString(err));
    }

    // The pitched source needs a 2D copy so the per-row padding is skipped.
    cudaMemcpy2D(h_result,columnCount*sizeof(float),d_array,pitch,
                 columnCount*sizeof(float),rowCount,cudaMemcpyDeviceToHost);
    cudaMemcpy(h_array,
               d_destinationArray,
               columnCount*rowCount*sizeof(float),
               cudaMemcpyDeviceToHost);

    for (int i=0;i<rowCount;i++)
    {
        for (int j=0;j<columnCount;j++)
        {
            std::cout << "h_result["<< (i*columnCount) + j <<"]="<< h_result[(i*columnCount) + j] << std::endl;
        }
    }
    system("pause");

    printf("h_array==\n");
    // All values are expected to be 123.0.
    for (int i=0;i<rowCount;i++)
    {
        for (int j=0;j<columnCount;j++)
        {
            std::cout<< "h_array[" <<(i*columnCount) + j <<"]="<< h_array[(i*columnCount) + j] << std::endl;
        }
    }
    system("pause");

    cudaFree(d_array);
    cudaFree(d_destinationArray);
    free(h_result);
    delete[] h_array;
    return 0;
}

4、cudaMalloc3D()和cudaMemcpy3D()函数的用法

代码:
#include
"cuda_runtime.h"
#include
"device_launch_parameters.h" 
#include
<stdlib.h>
#include
<stdio.h> 
// Device code
// Fills a pitched 3D allocation of float3 elements with (2, 3, 4).
//
//   devPitchedPtr  result of cudaMalloc3D (base pointer + row pitch in bytes)
//   extent         extent used for the allocation; the caller sets
//                  extent.width in BYTES (x*sizeof(float3)), while height and
//                  depth are plain counts
//
// The whole volume is walked serially, so a <<<1,1>>> launch is sufficient.
// Fix: the original used extent.width (bytes) directly as the element count
// of a row, writing sizeof(float3) times too many elements per row and
// running past the end of the allocation.
__global__ void MyKernel(cudaPitchedPtr devPitchedPtr,cudaExtent extent)
{
    char* devPtr=(char*)devPitchedPtr.ptr;
    size_t pitch=devPitchedPtr.pitch;
    size_t slicePitch=pitch*extent.height;
    size_t rowElems=extent.width/sizeof(float3);  // bytes -> float3 count
    for (size_t k=0;k<extent.depth;k++)
    {
        char* slice=devPtr+k*slicePitch;
        for (size_t j=0;j<extent.height;j++)
        {
            float3* row=(float3*)(slice+j*pitch);
            for (size_t i=0;i<rowElems;i++)
            {
                row[i].x=2;
                row[i].y=3;
                row[i].z=4;
            }
        }
    }
}

const int x=6;   // row width in float3 elements
const int y=60;  // rows per slice
const int z=66;  // number of slices

// Allocates an x*y*z float3 volume with cudaMalloc3D, fills it on the device
// with MyKernel, copies it back with cudaMemcpy3D and prints every element.
// Returns 0 on success, 1 if the device allocation fails.
int main(){
    cudaError_t status=cudaSuccess;

    //======== Mem Host
    float3 *mem_host=(float3*)malloc(sizeof(float3)*x*y*z);
    float3 *mem_host2=(float3*)malloc(sizeof(float3)*x*y*z);
    for(int i=0;i<x*y*z;i++){
        mem_host[i].x=10;
        mem_host[i].y=100;
        mem_host[i].z=1000;
    }

    //======== Mem Device
    cudaExtent extent;
    extent.width=x*sizeof(float3);  // width is given in bytes
    extent.height=y;
    extent.depth=z;
    cudaPitchedPtr mem_device;
    status=cudaMalloc3D(&mem_device,extent);
    if(status!=cudaSuccess){
        fprintf(stderr,"Malloc: %s\n",cudaGetErrorString(status));
        free(mem_host);
        free(mem_host2);
        return 1;
    }

    // Optional host-to-device upload of mem_host (disabled in the original;
    // the kernel overwrites the whole volume anyway):
    // cudaMemcpy3DParms p = { 0 };
    // p.srcPtr = make_cudaPitchedPtr((void*)mem_host, x*sizeof(float3),x,y);
    // p.dstPtr = mem_device;
    // p.extent = extent;
    // p.kind = cudaMemcpyHostToDevice;
    // status=cudaMemcpy3D(&p);
    // if(status!= cudaSuccess){fprintf(stderr, "MemcpyHtD: %s\n",cudaGetErrorString(status));}

    MyKernel<<<1,1>>>(mem_device,extent);

    //======== Cpy DeviceToHost
    cudaMemcpy3DParms q={0};
    q.srcPtr=mem_device;
    q.dstPtr=make_cudaPitchedPtr((void*)mem_host2,x*sizeof(float3),x,y);
    q.extent=extent;
    q.kind=cudaMemcpyDeviceToHost;
    status=cudaMemcpy3D(&q);
    if(status!=cudaSuccess){
        fprintf(stderr,"MemcpyDtoH: %s\n",cudaGetErrorString(status));
    }else{
        // Print only when the copy succeeded; otherwise mem_host2 is
        // uninitialized.
        for(int i=0;i<x*y*z;i++)
            printf("%f %f %f\n",mem_host2[i].x,mem_host2[i].y,mem_host2[i].z);
    }

    cudaFree(mem_device.ptr);
    free(mem_host);
    free(mem_host2);
    system("pause");
    return 0;
}

5、不带共享存储器的矩阵的相乘

代码:
#include
"cuda_runtime.h"
#include
"device_launch_parameters.h" 
#include
<stdio.h>
#include
<stdlib.h>
#include
<string.h> 
// Dense float matrix descriptor stored row-major:
// element (row, col) lives at element[row*width+col].
struct Matrix
{
    int width;       // number of columns
    int height;      // number of rows
    float *element;  // width*height values (host or device pointer)
};
#define BLOCK_SIZE 16 
// Forward declaration of the matrix-multiply kernel; the definition appears
// after main(). Arguments are (A, B, C) and the kernel writes into C.
__global__
void MatMulKernel(const Matrix,const Matrix,Matrix); 
// Prints a row-major matrix to stdout: one row per line, each value
// formatted with "%f " (value followed by a space).
void printMatrix(const Matrix &A)
{
    for (int i=0;i<A.height;i++)
    {
        for (int j=0;j<A.width;j++)
        {
            printf("%f ",A.element[i*A.width+j]);
        }
        printf("\n");
    }
}
// Host wrapper that computes C = A * B on the GPU with MatMulKernel.
//
//   A, B  host input matrices (printed before the computation)
//   C     host output matrix; C.element must already point to
//         C.width*C.height floats
//
// The grid is sized with integer division, so B.width and A.height are
// assumed to be multiples of BLOCK_SIZE; any remainder rows/columns of C
// would not be computed.
// Launch and copy-back errors are reported on stderr.
void MatMul(const Matrix &A,const Matrix &B,Matrix &C)
{
    printf("matrix A");
    printMatrix(A);
    printf("matrix B");
    printMatrix(B);
    system("pause");

    // Upload A.
    Matrix d_A;
    d_A.width=A.width;
    d_A.height=A.height;
    size_t size=A.width*A.height*sizeof(float);
    cudaMalloc(&d_A.element,size);
    cudaMemcpy(d_A.element,A.element,size,cudaMemcpyHostToDevice);

    // Upload B.
    Matrix d_B;
    d_B.width=B.width;
    d_B.height=B.height;
    size=B.width*B.height*sizeof(float);
    cudaMalloc(&d_B.element,size);
    cudaMemcpy(d_B.element,B.element,size,cudaMemcpyHostToDevice);

    // Allocate the device result; `size` keeps C's byte count for the copy back.
    Matrix d_C;
    d_C.width=C.width;
    d_C.height=C.height;
    size=C.width*C.height*sizeof(float);
    cudaMalloc(&d_C.element,size);

    dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
    dim3 dimGrid(B.width/dimBlock.x,A.height/dimBlock.y);
    MatMulKernel<<<dimGrid,dimBlock>>>(d_A,d_B,d_C);
    cudaError_t err=cudaGetLastError();
    if (err!=cudaSuccess)
    {
        fprintf(stderr,"MatMulKernel launch failed: %s\n",cudaGetErrorString(err));
    }

    // Blocking copy: also waits for the kernel to finish.
    err=cudaMemcpy(C.element,d_C.element,size,cudaMemcpyDeviceToHost);
    if (err!=cudaSuccess)
    {
        fprintf(stderr,"copy of C failed: %s\n",cudaGetErrorString(err));
    }

    cudaFree(d_A.element);
    cudaFree(d_B.element);
    cudaFree(d_C.element);

    for (int i=0;i<C.height;++i)
    {
        for (int j=0;j<C.width;++j)
        {
            printf("%f ",C.element[i*C.width+j]);
        }
        printf("\n");
    }
    system("pause");
}
// Sets every element of host matrix A to `value` (converted to float).
// A.element must already point to A.width*A.height floats.
void SetMatrixValue(Matrix &A,int value)
{
    for (int i=0;i<A.height;++i)
    {
        for (int j=0;j<A.width;++j)
        {
            A.element[i*A.width+j]=value;
        }
    }
}
// Multiplies two 128x128 matrices filled with 2 on the GPU and prints the
// product. Fixes over the published listing: standard `int main`, restored
// closing brace, and the host buffers are released.
int main()
{
    Matrix A,B,C;

    A.width=128;
    A.height=128;
    A.element=(float*)malloc(A.width*A.height*sizeof(float));
    SetMatrixValue(A,2);

    B.width=128;
    B.height=128;
    B.element=(float*)malloc(B.width*B.height*sizeof(float));
    SetMatrixValue(B,2);

    // C is fully overwritten by MatMul, so it needs no initialization.
    C.width=128;
    C.height=128;
    C.element=(float*)malloc(C.width*C.height*sizeof(float));

    MatMul(A,B,C);

    for (int i=0;i<C.height;++i)
    {
        for (int j=0;j<C.width;++j)
        {
            printf("%f ",C.element[i*C.width+j]);
        }
        printf("\n");
    }
    system("pause");

    free(A.element);
    free(B.element);
    free(C.element);
    return 0;
}

// Naive matrix multiply without shared memory: each thread computes one
// element of C as the dot product of a row of A and a column of B.
//
// Expects a 2D launch where global (x, y) maps to (column, row) of C.
// A.element, B.element and C.element must be device pointers; A.width is
// used as the shared inner dimension.
// Threads outside C return early, so a grid that over-covers C is safe.
__global__ void MatMulKernel(Matrix A,Matrix B,Matrix C)
{
    int row=blockIdx.y*blockDim.y+threadIdx.y;
    int col=blockIdx.x*blockDim.x+threadIdx.x;
    if (row>=C.height || col>=C.width)
    {
        return;
    }

    float CValue=0;
    for (int e=0;e<A.width;++e)
    {
        CValue+=A.element[row*A.width+e]*B.element[e*B.width+col];
    }
    C.element[row*C.width+col]=CValue;
}

6、带shared memory的矩阵相乘

代码:
#include
"cuda_runtime.h"
#include
"device_launch_parameters.h" 
#include
<stdio.h>
#include
<stdlib.h> 
#define BLOCK_SIZE 16 
// Row-major float matrix descriptor with an explicit row stride:
// element (row, col) lives at elements[row*stride+col]. The stride lets a
// sub-matrix share storage with the matrix it was cut from.
struct Matrix
{
    int width;        // number of columns
    int height;       // number of rows
    int stride;       // distance between consecutive rows, in elements
    float *elements;  // first element (host or device pointer)
};
// Returns element (row, col) of A, honoring A's row stride.
__device__ float GetElement(const Matrix A,int row,int col)
{
    return A.elements[row*A.stride+col];
}

// Stores `value` at element (row, col) of A, honoring A's row stride.
// A is passed by value, but it shares the element storage with the caller.
__device__ void SetElement(Matrix A,int row,int col,float value)
{
    A.elements[row*A.stride+col]=value;
}

// Returns a BLOCK_SIZE x BLOCK_SIZE view into A whose top-left corner is at
// tile coordinates (row, col), i.e. element (row*BLOCK_SIZE, col*BLOCK_SIZE).
// The view shares A's storage and keeps A's stride; nothing is copied.
__device__ Matrix GetSubMatrix(Matrix A,int row,int col)
{
    Matrix Asub;
    Asub.width=BLOCK_SIZE;
    Asub.height=BLOCK_SIZE;
    Asub.stride=A.stride;
    Asub.elements=&A.elements[A.stride*BLOCK_SIZE*row+BLOCK_SIZE*col];
    return Asub;
}

// Forward declaration of the tiled matrix-multiply kernel defined after
// MatMul(). Arguments are (A, B, C) and the kernel writes into C.
__global__
void MatMulKernel(const Matrix,const Matrix,Matrix); 
// Host wrapper that computes C = A * B on the GPU with the shared-memory
// MatMulKernel.
//
//   A, B  host input matrices (packed, so stride == width on the device)
//   C     host output matrix; C.elements must already point to
//         C.width*C.height floats
//
// All dimensions are assumed to be multiples of BLOCK_SIZE, because the
// kernel works on whole tiles without bounds checks.
// Fix: the grid's y dimension must cover the rows of C, which is A.height;
// the original used B.height, which is only correct for square inputs.
void MatMul(const Matrix &A,const Matrix &B,Matrix &C)
{
    // Upload A.
    Matrix d_A;
    d_A.width=A.width;
    d_A.height=A.height;
    d_A.stride=A.width;
    size_t size=d_A.width*d_A.height*sizeof(float);
    cudaMalloc(&d_A.elements,size);
    cudaMemcpy(d_A.elements,A.elements,size,cudaMemcpyHostToDevice);

    // Upload B.
    Matrix d_B;
    d_B.width=B.width;
    d_B.height=B.height;
    d_B.stride=B.width;
    size=B.width*B.height*sizeof(float);
    cudaMalloc(&d_B.elements,size);
    cudaMemcpy(d_B.elements,B.elements,size,cudaMemcpyHostToDevice);

    // Allocate the device result; `size` keeps C's byte count for the copy back.
    Matrix d_C;
    d_C.width=C.width;
    d_C.height=C.height;
    d_C.stride=C.width;
    size=C.width*C.height*sizeof(float);
    cudaMalloc(&d_C.elements,size);

    // One block per BLOCK_SIZE x BLOCK_SIZE tile of C.
    dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
    dim3 dimGrid(B.width/BLOCK_SIZE,A.height/BLOCK_SIZE);
    MatMulKernel<<<dimGrid,dimBlock>>>(d_A,d_B,d_C);

    // Blocking copy: also waits for the kernel to finish.
    cudaMemcpy(C.elements,d_C.elements,size,cudaMemcpyDeviceToHost);
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}

// Tiled matrix multiply using shared memory. Each thread block computes one
// BLOCK_SIZE x BLOCK_SIZE tile of C; each thread computes one element of it.
//
// Requirements (not checked here):
//   - blockDim is (BLOCK_SIZE, BLOCK_SIZE)
//   - the grid has one block per tile of C
//   - A.width, and the dimensions of C, are multiples of BLOCK_SIZE
// Static shared memory used: 2 * BLOCK_SIZE * BLOCK_SIZE floats.
__global__ void MatMulKernel(Matrix A,Matrix B,Matrix C)
{
    int blockRow=blockIdx.y;
    int blockCol=blockIdx.x;
    // Tile of C owned by this block.
    Matrix Csub=GetSubMatrix(C,blockRow,blockCol);
    float Cvalue=0;
    int row=threadIdx.y;
    int col=threadIdx.x;

    // Walk the tiles of A (left to right) and B (top to bottom) that
    // contribute to Csub.
    for (int m=0;m<A.width/BLOCK_SIZE;++m)
    {
        Matrix Asub=GetSubMatrix(A,blockRow,m);
        Matrix Bsub=GetSubMatrix(B,m,blockCol);

        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

        // Each thread loads one element of each tile.
        As[row][col]=GetElement(Asub,row,col);
        Bs[row][col]=GetElement(Bsub,row,col);

        // Both tiles must be fully loaded before any thread reads them.
        __syncthreads();

        for (int e=0;e<BLOCK_SIZE;++e)
        {
            Cvalue+=As[row][e]*Bs[e][col];
        }

        // All reads must finish before the next iteration overwrites the tiles.
        __syncthreads();
    }

    SetElement(Csub,row,col,Cvalue);
}
 void SetMatValue(Matrix A,int value)
{
    for (inti=0;i<A.height;++i)
    {
        for (intj=0;j<A.width;++j)
        {
            A.elements[i*A.width+j]=value;
        }
    }
}
// Prints a packed row-major host matrix to stdout: one row per line, each
// value formatted with "%f " (value followed by a space).
void PrintMat(const Matrix A)
{
    for (int i=0;i<A.height;++i)
    {
        for (int j=0;j<A.width;++j)
        {
            printf("%f ",A.elements[i*A.width+j]);
        }
        printf("\n");
    }
}
// Multiplies two 128x128 matrices filled with 2 using the shared-memory
// kernel and prints the product. Fixes over the published listing: standard
// `int main`, restored closing brace, and the host buffers are released.
int main()
{
    Matrix A,B,C;

    A.width=128;
    A.height=128;
    A.elements=(float*)malloc(A.width*A.height*sizeof(float));
    SetMatValue(A,2);

    B.width=128;
    B.height=128;
    B.elements=(float*)malloc(B.width*B.height*sizeof(float));
    SetMatValue(B,2);

    // C is fully overwritten by MatMul, so it needs no initialization.
    C.width=128;
    C.height=128;
    C.elements=(float *)malloc(C.width*C.height*sizeof(float));

    MatMul(A,B,C);
    PrintMat(C);
    system("pause");

    free(A.elements);
    free(B.elements);
    free(C.elements);
    return 0;
}

7、页锁定主机存储器Page-locked Host memory

代码:

#include
"cuda_runtime.h"
#include
"device_launch_parameters.h" 
#include
<stdio.h>
#include
<stdlib.h> 
#define N 256 
// Fills the first `size` elements of pData with `value`.
// Does nothing when size is zero or negative.
void SetArrayValue(float *pData,int size,float value)
{
    for (int i=0;i<size;++i)
    {
        pData[i]=value;
    }
}

// Element-wise vector add: C[i] = A[i] + B[i] with i = threadIdx.x.
//
// Only threadIdx.x is used as the index, so this is meant for a single-block
// launch with one thread per element. There is no length parameter, so the
// caller must not launch more threads than the arrays have elements.
__global__ void VecAdd(float*A,float *B,float*C)
{
    int index=threadIdx.x;
    C[index]=A[index]+B[index];
}

// Prints `size` floats to stdout with "%f ". A newline is emitted BEFORE
// every element whose 1-based position is a multiple of 10, so the first
// line holds 9 values and the following lines hold 10 each.
void PrintArray(float *data,int size)
{
    for (int i=0;i<size;++i)
    {
        if ((i+1)%10==0)
        {
            printf("\n");
        }
        printf("%f ",data[i]);
    }
}

// Zero-copy example: allocates three mapped, page-locked host arrays, obtains
// their device aliases with cudaHostGetDevicePointer, adds A and B on the GPU
// directly through those aliases and prints C from the host pointer.
// Returns 0 after a successful run, 1 when the device check stops the demo.
int main()
{
    // cudaSetDeviceFlags(cudaDeviceMapHost);  // left disabled, as in the original (author: "optional")
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp,0);
    if (deviceProp.integrated)
    {
        printf("GPU is integrated\n");
        return 1;
    }
    if (!deviceProp.canMapHostMemory)
    {
        printf("can't map host memory\n");
        return 1;
    }

    float *hostPtrA;
    cudaHostAlloc(&hostPtrA,sizeof(float)*N,cudaHostAllocDefault| cudaHostAllocMapped);
    SetArrayValue(hostPtrA,N,29);
    PrintArray(hostPtrA,N);
    system("pause");

    float *hostPtrB;
    cudaHostAlloc(&hostPtrB,sizeof(float)*N,cudaHostAllocDefault| cudaHostAllocMapped);
    SetArrayValue(hostPtrB,N,31);

    float *hostPtrC;
    cudaHostAlloc(&hostPtrC,sizeof(float)*N,cudaHostAllocDefault| cudaHostAllocMapped);

    // Device-side aliases of the mapped host buffers.
    float *devPtrA,*devPtrB,*devPtrC;
    cudaHostGetDevicePointer(&devPtrA,hostPtrA,0);
    cudaHostGetDevicePointer(&devPtrB,hostPtrB,0);
    cudaHostGetDevicePointer(&devPtrC,hostPtrC,0);

    VecAdd<<<1,N>>>(devPtrA,devPtrB,devPtrC);
    // The kernel writes straight into host memory; wait before reading it.
    cudaDeviceSynchronize();

    for (int i=0;i<N;i++)
    {
        if ((i+1)%10==0)
        {
            printf("\n");
        }
        printf("%f ",hostPtrC[i]);
    }
    system("pause");

    // Pinned allocations must be released with cudaFreeHost.
    cudaFreeHost(hostPtrA);
    cudaFreeHost(hostPtrB);
    cudaFreeHost(hostPtrC);
    return 0;
}

8、纹理存储的使用texture memory

代码:

#include
"cuda_runtime.h"
#include
"device_launch_parameters.h"
#include
<stdio.h>
#include
<stdlib.h> 
#define size 256 
// File-scope 2D texture reference returning float elements unchanged
// (cudaReadModeElementType). main() configures its addressing/filtering and
// binds it to a CUDA array; transformKernel samples it with tex2D.
texture<float,cudaTextureType2D,cudaReadModeElementType>texRef; 
// Samples texRef at coordinates rotated by `theta` around the texture centre
// and writes one value per output pixel.
//
//   output  device array of width*height floats, row-major
//   width   output width in pixels
//   height  output height in pixels
//   theta   rotation angle, passed directly to cosf/sinf (radians)
//
// Expects a 2D launch covering at least width x height threads.
// Fix: the grid is rounded up to whole blocks, so threads beyond the image
// must return early; the original let them write past the end of `output`.
// Also uses sinf consistently instead of the double-precision sin.
__global__ void transformKernel(float *output,int width,int height,float theta)
{
    unsigned int x=blockIdx.x*blockDim.x+threadIdx.x;
    unsigned int y=blockIdx.y*blockDim.y+threadIdx.y;
    if (x>=(unsigned int)width || y>=(unsigned int)height)
    {
        return;
    }

    // Normalized coordinates, shifted so the rotation is about the centre.
    float u=x/(float)width;
    float v=y/(float)height;
    u-=0.5f;
    v-=0.5f;
    float tu=u*cosf(theta)-v*sinf(theta)+0.5f;
    float tv=v*cosf(theta)+u*sinf(theta)+0.5f;

    output[y*width+x]=tex2D(texRef,tu,tv);
}

// Texture example: uploads a 25x25 ramp image into a CUDA array, binds it to
// texRef (wrap addressing, linear filtering, normalized coordinates), runs
// transformKernel and prints the resulting image.
// Fixes over the published listing: standard `int main`, restored closing
// brace, and h_data is released.
int main()
{
    int width=25,height=25;

    // One 32-bit float channel per texel.
    cudaChannelFormatDesc channelDesc=cudaCreateChannelDesc(32,0,0,0,cudaChannelFormatKindFloat);
    cudaArray* cuArray;
    cudaMallocArray(&cuArray,&channelDesc,width,height);

    // Host image: each texel holds its own linear index.
    float *h_data=(float*)malloc(width*height*sizeof(float));
    for (int i=0;i<height;++i)
    {
        for (int j=0;j<width;++j)
        {
            h_data[i*width+j]=i*width+j;
        }
    }
    cudaMemcpyToArray(cuArray,0,0,h_data,width*height*sizeof(float),cudaMemcpyHostToDevice);

    texRef.addressMode[0]=cudaAddressModeWrap;
    texRef.addressMode[1]=cudaAddressModeWrap;
    texRef.filterMode=cudaFilterModeLinear;
    texRef.normalized=true;
    cudaBindTextureToArray(texRef,cuArray,channelDesc);

    float *output;
    cudaMalloc(&output,width*height*sizeof(float));

    // Round the grid up so partial blocks cover the image edge.
    dim3 dimBlock(16,16);
    dim3 dimGrid((width+dimBlock.x-1)/dimBlock.x,(height+dimBlock.y-1)/dimBlock.y);
    float angle=30;  // handed to cosf/sinf unchanged, i.e. treated as radians
    transformKernel<<<dimGrid,dimBlock>>>(output,width,height,angle);

    float *hostPtr=(float*)malloc(sizeof(float)*width*height);
    cudaMemcpy(hostPtr,output,sizeof(float)*width*height,cudaMemcpyDeviceToHost);
    for (int i=0;i<height;++i)
    {
        for (int j=0;j<width;++j)
        {
            printf("%f ",hostPtr[i*width+j]);
        }
        printf("\n");
    }

    free(hostPtr);
    free(h_data);
    cudaFreeArray(cuArray);
    cudaFree(output);
    system("pause");
    return 0;
}

9、surface Memory的使用方法

代码:

#include
"cuda_runtime.h"
#include
"device_launch_parameters.h"
#include
<stdio.h>
#include
<stdlib.h> 
// File-scope 2D surface references. main() binds them to CUDA arrays;
// copyKernel reads from inputSurfRef and writes to outputSurfRef.
surface<void,2> inputSurfRef;
surface<void,2>outputSurfRef; 
// Copies a width x height image of uchar4 texels from inputSurfRef to
// outputSurfRef, one texel per thread.
//
// Expects a 2D launch covering at least width x height threads; threads
// outside the image do nothing. Surface x coordinates are byte offsets,
// hence x*4 for 4-byte uchar4 texels.
__global__ void copyKernel(int width,int height)
{
    unsigned int x=blockIdx.x*blockDim.x+threadIdx.x;
    unsigned int y=blockIdx.y*blockDim.y+threadIdx.y;
    if (x<width && y<height)
    {
        uchar4 data;
        surf2Dread(&data,inputSurfRef,x*4,y);
        surf2Dwrite(data,outputSurfRef,x*4,y);
    }
}
// Surface example: uploads a 256x256 image of 32-bit values (all 3) into a
// surface-capable CUDA array, copies it to a second array with copyKernel
// through the two surface references, reads the result back and prints it.
int main()
{
    int width=256,height=256;
    int size=width*height*sizeof(unsigned int);

    unsigned int* h_data=(unsigned int*)malloc(width*height*sizeof(unsigned int));
    for (int i=0;i<height;++i)
    {
        for (int j=0;j<width;++j)
        {
            h_data[i*width+j]=3;
        }
    }

    // Four 8-bit unsigned channels: one texel is 4 bytes (uchar4).
    cudaChannelFormatDesc channelDesc=cudaCreateChannelDesc(8,8,8,8,cudaChannelFormatKindUnsigned);
    // cudaArraySurfaceLoadStore is required for arrays bound to surfaces.
    cudaArray* cuInputArray;
    cudaMallocArray(&cuInputArray,&channelDesc,width,height,cudaArraySurfaceLoadStore);
    cudaArray* cuOutputArray;
    cudaMallocArray(&cuOutputArray,&channelDesc,width,height,cudaArraySurfaceLoadStore);

    cudaMemcpyToArray(cuInputArray,0,0,h_data,size,cudaMemcpyHostToDevice);
    cudaBindSurfaceToArray(inputSurfRef,cuInputArray);
    cudaBindSurfaceToArray(outputSurfRef,cuOutputArray);

    dim3 dimBlock(16,16);
    dim3 dimGrid((width+dimBlock.x-1)/dimBlock.x,(height+dimBlock.y-1)/dimBlock.y);
    copyKernel<<<dimGrid,dimBlock>>>(width,height);

    unsigned int* host_output=(unsigned int*)malloc(sizeof(unsigned int)*width*height);
    cudaMemcpyFromArray(host_output,cuOutputArray,0,0,size,cudaMemcpyDeviceToHost);
    for (int i=0;i<height;++i)
    {
        for (int j=0;j<width;++j)
        {
            printf("%u ",host_output[i*width+j]);
        }
        printf("\n");
    }

    system("pause");
    free(host_output);
    free(h_data);
    cudaFreeArray(cuInputArray);
    cudaFreeArray(cuOutputArray);
    return 0;
}
 

10、opengl和cuda的交互

代码:https://devtalk.nvidia.com/default/topic/502692/how-to-use-open_gl/

http://stackoverflow.com/questions/12082357/errors-while-using-opengl-buffers-using-visual-studio-2010-in-windows7 

#include
<cuda.h>
#include
"cuda_runtime.h"
#include
"device_launch_parameters.h"
#include <gl/glew.h>//要放在下面这一句的前面
#include "cuda_gl_interop.h" 
#include
<stdio.h>
#include
<stdlib.h> 
//#include <gl/GL.h>
#include
<gl/glut.h> 
// OpenGL vertex buffer handle; the initial value is overwritten by
// glGenBuffers in main().
GLuint postionsVBO=1;
// CUDA-side handle for the registered vertex buffer (set in main()).
struct cudaGraphicsResource * postionsVBO_CUDA; 
// Dimensions of the vertex grid: width*height points are generated and drawn.
int width=256;
int height=256;
// Time value declared in device memory.
// NOTE(review): display() reads and increments this from host code, which is
// not a valid way to access a __device__ variable — confirm intended usage.
__device__
float dev_time=1;
// Host-side time value.
float host_time=1; 
// Forward declaration of the vertex-generation kernel defined after display().
// (Parameter types and names were fused together in the published listing.)
__global__ void createVertices(float4 *positions,float time,unsigned int width,unsigned int height);
// One-time GL setup: sets an opaque black clear colour and clears the colour
// and depth buffers.
void init()
{
    glClearColor(0.f,0.f,0.f,1.f);
    glClear(GL_DEPTH_BUFFER_BIT| GL_COLOR_BUFFER_BIT);
}

// GLUT reshape callback: rebuilds a 45-degree perspective projection for the
// new window size and resets the model-view matrix.
//
// Fix: glMatrixMode(GL_MATRIX_MODE) is not a valid call (GL_MATRIX_MODE is a
// glGet query name), so the following glLoadIdentity kept acting on the
// projection matrix and wiped the projection just set. GL_MODELVIEW is what
// belongs here.
void reshape(int width,int height)
{
    glMatrixMode(GL_PROJECTION);
    glLoadIdentity();
    if (width>height)
    {
        gluPerspective(45,(GLfloat)width/height,0.001,1000);
    }else
    {
        gluPerspective(45,(GLfloat)height/width,0.001,1000);
    }
    glMatrixMode(GL_MODELVIEW);
    glLoadIdentity();
}
// GLUT display callback: maps the vertex buffer into CUDA, regenerates the
// vertex positions for the current time with createVertices, unmaps the
// buffer, draws it as points and requests the next frame.
//
// Fix: the original passed and incremented `dev_time`, a __device__ variable,
// from host code and then issued a cudaMemcpy into `&time`, which is not a
// device buffer. Kernel arguments are passed by value, so the host-side
// `host_time` is used as the animation clock instead.
void display()
{
    float4 *positions;
    cudaGraphicsMapResources(1,&postionsVBO_CUDA,0);
    size_t numb_bytes;
    cudaGraphicsResourceGetMappedPointer((void**)&positions,&numb_bytes,postionsVBO_CUDA);

    dim3 dimBlock(16,16,1);
    dim3 dimGrid(width/dimBlock.x,height/dimBlock.y,1);
    createVertices<<<dimGrid,dimBlock>>>(positions,host_time,width,height);
    host_time++;

    // Unmapping hands the buffer back to OpenGL before it is drawn.
    cudaGraphicsUnmapResources(1,&postionsVBO_CUDA,0);

    glClear(GL_COLOR_BUFFER_BIT| GL_DEPTH_BUFFER_BIT);
    glBindBuffer(GL_ARRAY_BUFFER,postionsVBO);
    glVertexPointer(4,GL_FLOAT,0,0);
    glEnableClientState(GL_VERTEX_ARRAY);
    glDrawArrays(GL_POINTS,0,width*height);
    glDisableClientState(GL_VERTEX_ARRAY);

    glutSwapBuffers();
    glutPostRedisplay();
}

// Writes one float4 vertex per grid point, forming an animated sine/cosine
// wave: (u, w, v, 1) with u, v in [-1, 1) and w the wave height.
//
//   positions  device array of width*height float4, row-major
//   time       phase offset of the wave
//   width      grid width in points
//   height     grid height in points
//
// Expects a 2D launch with global (x, y) mapped to the grid point; threads
// outside the grid return early.
__global__ void createVertices(float4 *positions,float time,unsigned int width,unsigned int height)
{
    unsigned int x=blockIdx.x*blockDim.x+threadIdx.x;
    unsigned int y=blockIdx.y*blockDim.y+threadIdx.y;
    if (x>=width || y>=height)
    {
        return;
    }

    // Map the grid point to [-1, 1).
    float u=x/(float)width;
    float v=y/(float)height;
    u=u*2.f-1.f;
    v=v*2.f-1.f;

    float freq=4.f;
    float w=sinf(u*freq+time)*cosf(v*freq+time)*0.5f;

    positions[y*width+x]=make_float4(u,w,v,1.f);
}

// Sets up GLUT/GLEW, creates a vertex buffer large enough for width*height
// float4 positions, registers it with CUDA for write-discard access and
// enters the GLUT main loop (display() does the per-frame work).
int main(int argc,char*argv[])
{
    cudaGLSetGLDevice(0);

    glutInit(&argc,argv);
    glutInitDisplayMode(GLUT_DOUBLE| GLUT_RGB);
    glutInitWindowPosition(0,0);
    glutInitWindowSize(100,100);
    glutCreateWindow("opengl-cuda");
    init();
    glutDisplayFunc(display);
    glutReshapeFunc(reshape);

    // GLEW must be initialized before the buffer-object entry points are used.
    // See http://stackoverflow.com/questions/12344612/unusual-error-using-opengl-buffers-with-cuda-interop-on-ms-visual-studio-2010
    glewInit();

    glGenBuffers(1,&postionsVBO);
    glBindBuffer(GL_ARRAY_BUFFER,postionsVBO);
    unsigned int size=width*height*4*sizeof(float);
    glBufferData(GL_ARRAY_BUFFER,size,0,GL_DYNAMIC_DRAW);
    glBindBuffer(GL_ARRAY_BUFFER,0);
    cudaGraphicsGLRegisterBuffer(&postionsVBO_CUDA,postionsVBO,cudaGraphicsMapFlagsWriteDiscard);

    glutMainLoop();
    return 0;
}

11、Formatted output---在device函数中使用printf函数,要求compute capability至少为2.0

代码:

#include
<cuda.h>
#include
<helper_cuda.h>
#include
"cuda_runtime.h"
#include
"device_launch_parameters.h" 
// Each thread prints its own threadIdx.x using device-side printf.
__global__ void bcast()
{
    printf("%d\n",threadIdx.x);
}

// Launches bcast with one block of 32 threads and waits for it. The
// synchronize call ensures the kernel, and its printf output, has completed
// before the program pauses.
int main()
{
    bcast<<<1,32>>>();
    cudaDeviceSynchronize();
    system("pause");
    return 0;
}

12、Asserting在设备端的函数中,但是其要求其计算能力至少为2.0

代码:

#include
<cuda.h>
#include
<helper_cuda.h> 
#include
"cuda_runtime.h"
#include
"device_launch_parameters.h"
#include <assert.h>
#include
<stdlib.h> 
// Demonstrates device-side assert: the first assertion passes, the second is
// deliberately false so that the kernel traps.
__global__ void testAssert(void)
{
    int is_one=1;
    int should_be_one=0;  // intentionally wrong, to trigger the assertion
    assert(is_one);
    assert(should_be_one);
}

// Launches testAssert with a single thread, waits for it to finish, then
// resets the device before pausing.
int main()
{
    testAssert<<<1,1>>>();
    cudaDeviceSynchronize();
    cudaDeviceReset();
    system("pause");
    return 0;
}

13、Per Thread Allocation On heap每个线程在堆上分配

代码:

#include
<stdio.h>
#include
<stdlib.h>
#include
<cuda.h>
#include
<helper_cuda.h>
#include
"cuda_runtime.h"
#include
"device_launch_parameters.h" 
// Every thread allocates 100 bytes with device-side malloc, prints the
// pointer it received and frees it again.
__global__ void mallocTestPerThread()
{
    char *ptr=(char *)malloc(100);
    printf("Thread %d got pointer:%p\n",threadIdx.x,ptr);
    free(ptr);
}

// Sets the device malloc heap limit to 128 MiB, runs mallocTestPerThread
// with one block of five threads and waits for it to finish.
int main()
{
    const size_t heapBytes=128*1024*1024;
    cudaDeviceSetLimit(cudaLimitMallocHeapSize,heapBytes);

    mallocTestPerThread<<<1,5>>>();
    cudaDeviceSynchronize();

    system("pause");
    return 0;
}

14、Per Thread Block Allocation每个线程块在堆上分配空间

代码:

#include
<stdio.h>
#include
<stdlib.h>
#include
<cuda.h>
#include
<helper_cuda.h>
#include
"cuda_runtime.h"
#include
"device_launch_parameters.h" 
// One allocation per thread block: thread 0 allocates a buffer on the device
// heap, shares the pointer through shared memory, every thread writes its 64
// slots, and thread 0 frees the buffer once all threads are done.
//
// Expects a 1D block (only threadIdx.x / blockDim.x are used).
// Fix: every thread writes 64 ints, so the buffer needs
// blockDim.x*64*sizeof(int) bytes. The original allocated only blockDim.x*64
// bytes and overflowed the allocation.
__global__ void mallocTestPerThreadBlock()
{
    __shared__ int *data;
    if (threadIdx.x==0)
    {
        data=(int*)malloc(blockDim.x*64*sizeof(int));
    }
    // Make the pointer visible to the whole block.
    __syncthreads();

    // `data` is shared, so on failure every thread of the block leaves here.
    if (data==NULL)
    {
        return;
    }

    // Neighbouring threads write neighbouring ints in each pass.
    int *ptr=data;
    for (int i=0;i<64;++i)
    {
        ptr[i*blockDim.x+threadIdx.x]=threadIdx.x;
    }

    // All writes must finish before the buffer is released.
    __syncthreads();
    if (threadIdx.x==0)
    {
        free(data);
    }
}

// Sets the device malloc heap limit to 128 MiB, runs mallocTestPerThreadBlock
// with 10 blocks of 128 threads and waits for it to finish.
int main()
{
    const size_t heapBytes=128*1024*1024;
    cudaDeviceSetLimit(cudaLimitMallocHeapSize,heapBytes);

    mallocTestPerThreadBlock<<<10,128>>>();
    cudaDeviceSynchronize();

    system("pause");
    return 0;
}

15、Allocation Persisting Between Kernel Launches在堆上分配

代码:

#include
<cuda.h>
#include
<helper_cuda.h>
#include
<stdio.h>
#include
<stdlib.h>
#include
"cuda_runtime.h"
#include
"device_launch_parameters.h" 
#define NUM_BLOCKS 20 
// One device-heap pointer per block, stored in device global memory so it
// survives between kernel launches: set by allocmem, read by usemem and
// freemem. Indexed by blockIdx.x.
__device__
int *dataptr[NUM_BLOCKS]; 
// Thread 0 of each block allocates blockDim.x*4 bytes on the device heap and
// stores the pointer in dataptr[blockIdx.x]; every thread then zeroes its own
// int in that buffer. Blocks whose allocation failed do nothing further.
//
// Expects a 1D launch with at most NUM_BLOCKS blocks.
__global__ void allocmem()
{
    if (threadIdx.x==0)
    {
        dataptr[blockIdx.x]=(int*)malloc(blockDim.x*4);
    }
    // The pointer must be published before the other threads read it.
    __syncthreads();

    if (dataptr[blockIdx.x]==NULL)
    {
        return;
    }
    dataptr[blockIdx.x][threadIdx.x]=0;
}

// Each thread adds its threadIdx.x to its own slot of the block's buffer
// from dataptr. Blocks without a buffer do nothing.
__global__ void usemem()
{
    int *ptr=dataptr[blockIdx.x];
    if (ptr!=NULL)
    {
        ptr[threadIdx.x]+=threadIdx.x;
    }
}

// Each thread prints the final value of its slot; thread 0 then frees the
// block's buffer.
//
// Fix: a block-wide barrier now separates the reads from the free. Without
// it, thread 0 could release the buffer while other threads of the block
// were still reading their slot.
__global__ void freemem()
{
    int *ptr=dataptr[blockIdx.x];
    if(ptr!=NULL)
        printf("Block %d,Thread=%d:final value=%d\n",blockIdx.x,threadIdx.x,ptr[threadIdx.x]);

    // Reached by every thread of the block, whether or not ptr is NULL.
    __syncthreads();
    if (threadIdx.x==0)
    {
        free(ptr);
    }
}

// Shows that device-heap allocations persist across kernel launches:
// allocates per-block buffers, updates them in three separate launches,
// then prints and frees them in a final launch.
int main()
{
    const size_t heapBytes=128*1024*1024;
    cudaDeviceSetLimit(cudaLimitMallocHeapSize,heapBytes);

    allocmem<<<NUM_BLOCKS,10>>>();

    // Three passes over the same buffers; launches run in issue order.
    for (int pass=0;pass<3;++pass)
    {
        usemem<<<NUM_BLOCKS,10>>>();
    }

    freemem<<<NUM_BLOCKS,10>>>();

    cudaDeviceSynchronize();
    system("pause");
    return 0;
}
 
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  cuda 编程