您的位置:首页 > 编程语言

CUDA编程总结

2013-10-08 14:52 363 查看
Cuda编程总结 2013-10-120:32:46 -------- 余家奎
参考书籍:NVIDIA CUDA C Programming Guide、OpenGL编程指南
学习cuda例子中的总结
1、__constant__和__device__,__shared__的使用说明...2
2、分配二维数组实现两个二维数组相加...2
3、用cudaMallocPitch和cudaMemcpy2D实现二维数组的分配和拷贝...5
4、cudaMalloc3D()和cudaMemcpy3D()函数的用法...7
5、不带共享存储器的矩阵的相乘...9
6、带shared memory的矩阵相乘...12
7、页锁定主机存储器Page-locked Host memory...16
8、纹理存储的使用texture memory...18
9、surface Memory的使用方法...20
10、opengl和cuda的交互...22
11、Formatted output——在device函数中使用printf,要求compute capability至少为2.0...27
12、Assertion——在设备端函数中使用assert,要求计算能力至少为2.0...28
13、Per Thread Allocation On heap每个线程在堆上分配...29
14、Per Thread Block Allocation每个线程块在堆上分配空间...29
15、Allocation Persisting Between Kernel Launches在堆上分配...30
如有错误的地方还请指正。。。谢谢

1、__constant__和__device__,__shared__的使用说明

// Section 1 example: copying a value to and from a device symbol with
// cudaMemcpyToSymbol / cudaMemcpyFromSymbol.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>

// Abort with a readable message when a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Symbol copied by main(). The symbol handed to cudaMemcpyToSymbol /
// cudaMemcpyFromSymbol has to live in global (__device__) or constant
// (__constant__) memory, so either of these declarations works:
//   __constant__ int device_global_var = 5;
//   __device__   int device_global_var = 5;
// A file-scope `__shared__ int device_global_var;` is not a valid target for
// the symbol copies below, which is why it is no longer the active variant.
__device__ int device_global_var = 5;

// Empty kernel that only shows where a __shared__ variable can be declared:
// inside device code, one instance per thread block.
__global__ void kernel()
{
    __shared__ int xx;
}

// Writes host_var into the device symbol, reads it back and prints both.
int main()
{
    int host_var = 5;

    CUDA_CHECK(cudaMemcpyToSymbol(device_global_var, &host_var, sizeof(int)));
    printf("value=%d\n", host_var);

    // Clear the host copy so the value printed next really comes from the device.
    host_var = 0;
    CUDA_CHECK(cudaMemcpyFromSymbol(&host_var, device_global_var, sizeof(int)));
    printf("device_value=%d\n", host_var);

    system("pause");
    return 0;
}

2、分配二维数组实现两个二维数组相加

// Section 2 example: statically allocated 2D device arrays and an element-wise
// addition kernel. The [N][N] extents were missing from the published listing
// and are restored here.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>

#define N 16

// Abort with a readable message when a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// N x N matrices living in device global memory.
__device__ int device_a[N][N], device_b[N][N], device_c[N][N];

// c = a + b, one thread per element. Expects a 2D launch; threads whose
// coordinates fall outside the N x N matrix do nothing. a, b and c must be
// device addresses.
__global__ void VecAdd(int a[N][N], int b[N][N], int c[N][N])
{
    int global_threadId_x = blockIdx.x * blockDim.x + threadIdx.x;
    int global_threadId_y = blockIdx.y * blockDim.y + threadIdx.y;
    if (global_threadId_x < N && global_threadId_y < N)
    {
        c[global_threadId_y][global_threadId_x] =
            a[global_threadId_y][global_threadId_x] +
            b[global_threadId_y][global_threadId_x];
    }
}

// Prints an N x N host matrix, one row per line.
void printfArray(int data[N][N])
{
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            printf("%d ", data[i][j]);
        }
        printf("\n");
    }
}

// Host reference implementation of c = a + b.
void host_Add(int a[N][N], int b[N][N], int c[N][N])
{
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            c[i][j] = a[i][j] + b[i][j];
        }
    }
}

int main()
{
    int i, j;
    int k = 0;
    int a[N][N], b[N][N];
    int c[N][N];
    for (i = 0; i < N; i++)
    {
        for (j = 0; j < N; j++)
        {
            a[i][j] = k;
            b[i][j] = k;
            k++;
        }
    }

    // Round trip through device_a. The unused cudaMalloc of `device_aa` from
    // the published listing was dropped: it was never used and never freed.
    int tempA[N][N];
    CUDA_CHECK(cudaMemcpyToSymbol(device_a, a, sizeof(int) * N * N));
    CUDA_CHECK(cudaMemcpyToSymbol(device_b, b, sizeof(int) * N * N));
    CUDA_CHECK(cudaMemcpyFromSymbol(tempA, device_a, sizeof(int) * N * N));
    printf("tempA====\n");
    printfArray(tempA);

    // A __device__ symbol cannot be passed to a kernel by name from host code;
    // ask the runtime for the device addresses first.
    int (*d_a)[N], (*d_b)[N], (*d_c)[N];
    CUDA_CHECK(cudaGetSymbolAddress((void **)&d_a, device_a));
    CUDA_CHECK(cudaGetSymbolAddress((void **)&d_b, device_b));
    CUDA_CHECK(cudaGetSymbolAddress((void **)&d_c, device_c));

    dim3 dimBlock(16, 16);
    dim3 dimGrid((N + dimBlock.x - 1) / dimBlock.x, (N + dimBlock.y - 1) / dimBlock.y);
    VecAdd<<<dimGrid, dimBlock>>>(d_a, d_b, d_c);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpyFromSymbol(c, device_c, sizeof(int) * N * N));
    printf("c====\n");
    printfArray(c);

    // Compare the device result against the host reference.
    int expected[N][N];
    int mismatches = 0;
    host_Add(a, b, expected);
    for (i = 0; i < N; i++)
    {
        for (j = 0; j < N; j++)
        {
            if (c[i][j] != expected[i][j])
            {
                mismatches++;
            }
        }
    }
    printf("mismatches=%d\n", mismatches);

    system("pause");
    return 0;
}

3、用cudaMallocPitch和cudaMemcpy2D实现二维数组的分配和拷贝

// Section 3 example: allocating a pitched 2D array with cudaMallocPitch and
// copying it back to the host with cudaMemcpy2D.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iostream>

// Abort with a readable message when a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Fills the pitched array d_array with 123.0 and mirrors every element into
// the densely packed destinationArray.
//   d_array          pitched device array, rowCount rows of columnCount floats
//   destinationArray dense device array of rowCount * columnCount floats
//   pitch            row stride of d_array in bytes, as returned by cudaMallocPitch
// Each element is handled by one thread. The loops advance by the total
// number of launched threads in x and y, so any 1D or 2D launch configuration
// covers the whole array (the published version had every thread rewrite
// every element).
__global__ void CopyData(float *d_array, float *destinationArray, size_t pitch,
                         int columnCount, int rowCount)
{
    const int rowStep = gridDim.y * blockDim.y;
    const int columnStep = gridDim.x * blockDim.x;
    for (int row = blockIdx.y * blockDim.y + threadIdx.y; row < rowCount; row += rowStep)
    {
        // Rows are `pitch` bytes apart, so step through the array as bytes.
        float *rowData = (float *)(((char *)d_array) + (row * pitch));
        for (int column = blockIdx.x * blockDim.x + threadIdx.x; column < columnCount;
             column += columnStep)
        {
            rowData[column] = 123.0f;
            destinationArray[(row * columnCount) + column] = rowData[column];
        }
    }
}

int main(int argc, char **argv)
{
    int columnCount = 15;
    int rowCount = 10;
    float *d_array = NULL;            // pitched device array
    float *d_destinationArray = NULL; // dense device array

    // Host buffers: h_array receives the dense copy, h_result the pitched one.
    float *h_array = new float[columnCount * rowCount];
    float *h_result = (float *)malloc(sizeof(float) * columnCount * rowCount);
    if (h_result == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        delete[] h_array;
        return 1;
    }
    memset(h_result, 0, sizeof(float) * columnCount * rowCount);

    // The pitch chosen by cudaMallocPitch keeps every row properly aligned.
    size_t pitch;
    CUDA_CHECK(cudaMallocPitch(&d_array, &pitch, columnCount * sizeof(float), rowCount));
    CUDA_CHECK(cudaMalloc(&d_destinationArray, columnCount * rowCount * sizeof(float)));

    dim3 dimBlock(16, 16);
    dim3 dimGrid((columnCount + dimBlock.x - 1) / dimBlock.x,
                 (rowCount + dimBlock.y - 1) / dimBlock.y);
    CopyData<<<dimGrid, dimBlock>>>(d_array, d_destinationArray, pitch, columnCount, rowCount);
    CUDA_CHECK(cudaGetLastError());

    // cudaMemcpy2D strips the row padding while copying back to the host.
    CUDA_CHECK(cudaMemcpy2D(h_result, columnCount * sizeof(float), d_array, pitch,
                            columnCount * sizeof(float), rowCount, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_array, d_destinationArray,
                          columnCount * rowCount * sizeof(float), cudaMemcpyDeviceToHost));

    for (int i = 0; i < rowCount; i++)
    {
        for (int j = 0; j < columnCount; j++)
        {
            std::cout << "h_result[" << (i * columnCount) + j << "]="
                      << h_result[(i * columnCount) + j] << std::endl;
        }
    }
    system("pause");

    printf("h_array==\n");
    // Every value is expected to be 123.0.
    for (int i = 0; i < rowCount; i++)
    {
        for (int j = 0; j < columnCount; j++)
        {
            std::cout << "h_array[" << (i * columnCount) + j << "]="
                      << h_array[(i * columnCount) + j] << std::endl;
        }
    }
    system("pause");

    CUDA_CHECK(cudaFree(d_array));
    CUDA_CHECK(cudaFree(d_destinationArray));
    free(h_result);
    delete[] h_array;
    return 0;
}

4、cudaMalloc3D()和cudaMemcpy3D()函数的用法

// Section 4 example: allocating a pitched 3D volume with cudaMalloc3D and
// moving it between host and device with cudaMemcpy3D.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdlib.h>
#include <stdio.h>

// Device code.
// Writes (2, 3, 4) into every float3 of the pitched volume.
//   devPitchedPtr  volume returned by cudaMalloc3D
//   extent         extent used for the allocation; extent.width is in BYTES
// The whole volume is walked by each thread that runs the kernel, so it is
// meant to be launched as <<<1,1>>>.
__global__ void MyKernel(cudaPitchedPtr devPitchedPtr, cudaExtent extent)
{
    char *devPtr = (char *)devPitchedPtr.ptr;
    size_t pitch = devPitchedPtr.pitch;
    size_t slicePitch = pitch * extent.height;
    // extent.width counts bytes; convert it to a float3 element count. Using
    // the byte count as the loop bound wrote far past the end of every row.
    size_t elementsPerRow = extent.width / sizeof(float3);
    for (size_t k = 0; k < extent.depth; k++)
    {
        char *slice = devPtr + k * slicePitch;
        for (size_t j = 0; j < extent.height; j++)
        {
            float3 *row = (float3 *)(slice + j * pitch);
            for (size_t i = 0; i < elementsPerRow; i++)
            {
                row[i].x = 2;
                row[i].y = 3;
                row[i].z = 4;
            }
        }
    }
}

// Volume dimensions in elements.
const int x = 6;
const int y = 60;
const int z = 66;

int main()
{
    cudaError_t status = cudaSuccess;

    //======== Mem Host
    float3 *mem_host = (float3 *)malloc(sizeof(float3) * x * y * z);
    float3 *mem_host2 = (float3 *)malloc(sizeof(float3) * x * y * z);
    if (mem_host == NULL || mem_host2 == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        free(mem_host);
        free(mem_host2);
        return 1;
    }
    for (int i = 0; i < x * y * z; i++)
    {
        mem_host[i].x = 10;
        mem_host[i].y = 100;
        mem_host[i].z = 1000;
    }

    //======== Mem Device
    cudaExtent extent;
    extent.width = x * sizeof(float3); // row width in bytes
    extent.height = y;
    extent.depth = z;
    cudaPitchedPtr mem_device;
    status = cudaMalloc3D(&mem_device, extent);
    if (status != cudaSuccess)
    {
        fprintf(stderr, "Malloc: %s\n", cudaGetErrorString(status));
        free(mem_host);
        free(mem_host2);
        return 1;
    }

    //======== Cpy HostToDevice
    cudaMemcpy3DParms p = {0};
    p.srcPtr = make_cudaPitchedPtr((void *)mem_host, x * sizeof(float3), x, y);
    p.dstPtr = mem_device;
    p.extent = extent;
    p.kind = cudaMemcpyHostToDevice;
    status = cudaMemcpy3D(&p);
    if (status != cudaSuccess)
    {
        fprintf(stderr, "MemcpyHtD: %s\n", cudaGetErrorString(status));
    }

    MyKernel<<<1, 1>>>(mem_device, extent);
    status = cudaGetLastError();
    if (status != cudaSuccess)
    {
        fprintf(stderr, "Kernel: %s\n", cudaGetErrorString(status));
    }

    //======== Cpy DeviceToHost
    cudaMemcpy3DParms q = {0};
    q.srcPtr = mem_device;
    q.dstPtr = make_cudaPitchedPtr((void *)mem_host2, x * sizeof(float3), x, y);
    q.extent = extent;
    q.kind = cudaMemcpyDeviceToHost;
    status = cudaMemcpy3D(&q);
    if (status != cudaSuccess)
    {
        fprintf(stderr, "MemcpyDtoH: %s\n", cudaGetErrorString(status));
    }

    for (int i = 0; i < x * y * z; i++)
        printf("%f %f %f\n", mem_host2[i].x, mem_host2[i].y, mem_host2[i].z);

    cudaFree(mem_device.ptr);
    free(mem_host);
    free(mem_host2);
    system("pause");
    return 0;
}

5、不带共享存储器的矩阵的相乘

// Section 5 example: matrix multiplication C = A * B without shared memory.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Row-major matrix: element (row, col) is element[row * width + col].
typedef struct
{
    int width;
    int height;
    float *element;
} Matrix;

#define BLOCK_SIZE 16

// Abort with a readable message when a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);

// Prints a host matrix, one row per line.
void printMatrix(const Matrix &A)
{
    for (int i = 0; i < A.height; i++)
    {
        for (int j = 0; j < A.width; j++)
        {
            printf("%f ", A.element[i * A.width + j]);
        }
        printf("\n");
    }
}

// Computes C = A * B on the device and prints A, B and the result.
// A, B and C are host matrices; C.element must already be allocated with
// C.width * C.height floats.
void MatMul(const Matrix &A, const Matrix &B, Matrix &C)
{
    printf("matrix A");
    printMatrix(A);
    printf("matrix B");
    printMatrix(B);
    system("pause");

    Matrix d_A;
    d_A.width = A.width;
    d_A.height = A.height;
    size_t size = A.width * A.height * sizeof(float);
    CUDA_CHECK(cudaMalloc(&d_A.element, size));
    CUDA_CHECK(cudaMemcpy(d_A.element, A.element, size, cudaMemcpyHostToDevice));

    Matrix d_B;
    d_B.width = B.width;
    d_B.height = B.height;
    size = B.width * B.height * sizeof(float);
    CUDA_CHECK(cudaMalloc(&d_B.element, size));
    CUDA_CHECK(cudaMemcpy(d_B.element, B.element, size, cudaMemcpyHostToDevice));

    Matrix d_C;
    d_C.width = C.width;
    d_C.height = C.height;
    size = C.width * C.height * sizeof(float);
    CUDA_CHECK(cudaMalloc(&d_C.element, size));

    // Round the grid up so sizes that are not multiples of BLOCK_SIZE are
    // still fully covered; the kernel discards the surplus threads.
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid((B.width + dimBlock.x - 1) / dimBlock.x,
                 (A.height + dimBlock.y - 1) / dimBlock.y);
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaMemcpy(C.element, d_C.element, size, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaFree(d_A.element));
    CUDA_CHECK(cudaFree(d_B.element));
    CUDA_CHECK(cudaFree(d_C.element));

    printMatrix(C);
    system("pause");
}

// Sets every element of the host matrix A to `value`.
void SetMatrixValue(Matrix &A, int value)
{
    for (int i = 0; i < A.height; ++i)
    {
        for (int j = 0; j < A.width; ++j)
        {
            A.element[i * A.width + j] = value;
        }
    }
}

int main()
{
    Matrix A, B, C;
    A.width = 128;
    A.height = 128;
    A.element = (float *)malloc(A.width * A.height * sizeof(float));
    B.width = 128;
    B.height = 128;
    B.element = (float *)malloc(B.width * B.height * sizeof(float));
    C.width = 128;
    C.height = 128;
    C.element = (float *)malloc(C.width * C.height * sizeof(float));
    if (A.element == NULL || B.element == NULL || C.element == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }
    // memset cannot be used for these fills: it writes bytes, not float values.
    SetMatrixValue(A, 2);
    SetMatrixValue(B, 2);

    MatMul(A, B, C);
    printMatrix(C);
    system("pause");

    free(A.element);
    free(B.element);
    free(C.element);
    return 0;
}

// One thread computes one element of C as the dot product of a row of A and a
// column of B. Expects a 2D launch covering at least C.width x C.height
// threads; threads outside C return without touching memory.
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= C.height || col >= C.width)
    {
        return;
    }
    float CValue = 0;
    for (int e = 0; e < A.width; ++e)
    {
        CValue += A.element[row * A.width + e] * B.element[e * B.width + col];
    }
    C.element[row * C.width + col] = CValue;
}

6、带shared memory的矩阵相乘

// Section 6 example: tiled matrix multiplication C = A * B using shared memory.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>

#define BLOCK_SIZE 16

// Abort with a readable message when a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Row-major matrix; `stride` is the distance in elements between two rows, so
// a sub-matrix can share the storage of its parent.
typedef struct
{
    int width;
    int height;
    int stride;
    float *elements;
} Matrix;

// Returns A(row, col).
__device__ float GetElement(const Matrix A, int row, int col)
{
    return A.elements[row * A.stride + col];
}

// Stores value into A(row, col).
__device__ void SetElement(Matrix A, int row, int col, float value)
{
    A.elements[row * A.stride + col] = value;
}

// Returns the BLOCK_SIZE x BLOCK_SIZE tile of A located `row` tiles down and
// `col` tiles to the right. The tile aliases A's storage.
__device__ Matrix GetSubMatrix(Matrix A, int row, int col)
{
    Matrix Asub;
    Asub.width = BLOCK_SIZE;
    Asub.height = BLOCK_SIZE;
    Asub.stride = A.stride;
    Asub.elements = &A.elements[A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col];
    return Asub;
}

__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);

// Computes C = A * B on the device. A, B and C are host matrices with
// C.elements already allocated. The kernel has no bounds checks, so all
// dimensions must be multiples of BLOCK_SIZE; other shapes are rejected.
void MatMul(const Matrix &A, const Matrix &B, Matrix &C)
{
    if (A.width != B.height || C.height != A.height || C.width != B.width ||
        A.width % BLOCK_SIZE != 0 || A.height % BLOCK_SIZE != 0 ||
        B.width % BLOCK_SIZE != 0)
    {
        fprintf(stderr, "MatMul: incompatible sizes or not a multiple of %d\n", BLOCK_SIZE);
        return;
    }

    Matrix d_A;
    d_A.width = A.width;
    d_A.height = A.height;
    d_A.stride = A.width;
    size_t size = d_A.width * d_A.height * sizeof(float);
    CUDA_CHECK(cudaMalloc(&d_A.elements, size));
    CUDA_CHECK(cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice));

    Matrix d_B;
    d_B.width = B.width;
    d_B.height = B.height;
    d_B.stride = B.width;
    size = B.width * B.height * sizeof(float);
    CUDA_CHECK(cudaMalloc(&d_B.elements, size));
    CUDA_CHECK(cudaMemcpy(d_B.elements, B.elements, size, cudaMemcpyHostToDevice));

    Matrix d_C;
    d_C.width = C.width;
    d_C.height = C.height;
    d_C.stride = C.width;
    size = C.width * C.height * sizeof(float);
    CUDA_CHECK(cudaMalloc(&d_C.elements, size));

    // One block per tile of C. C has B.width columns and A.height rows; the
    // published listing used B.height for the row count, which only works for
    // square matrices.
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(B.width / BLOCK_SIZE, A.height / BLOCK_SIZE);
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaMemcpy(C.elements, d_C.elements, size, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaFree(d_A.elements));
    CUDA_CHECK(cudaFree(d_B.elements));
    CUDA_CHECK(cudaFree(d_C.elements));
}

// Each block computes one BLOCK_SIZE x BLOCK_SIZE tile of C; each thread one
// element of that tile. Must be launched with BLOCK_SIZE x BLOCK_SIZE threads
// per block and one block per tile of C.
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    int blockRow = blockIdx.y;
    int blockCol = blockIdx.x;
    Matrix Csub = GetSubMatrix(C, blockRow, blockCol);
    float Cvalue = 0;
    int row = threadIdx.y;
    int col = threadIdx.x;
    for (int m = 0; m < A.width / BLOCK_SIZE; ++m)
    {
        Matrix Asub = GetSubMatrix(A, blockRow, m);
        Matrix Bsub = GetSubMatrix(B, m, blockCol);
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
        As[row][col] = GetElement(Asub, row, col);
        Bs[row][col] = GetElement(Bsub, row, col);
        // Both tiles must be fully loaded before any thread reads them.
        __syncthreads();
        for (int e = 0; e < BLOCK_SIZE; ++e)
        {
            Cvalue += As[row][e] * Bs[e][col];
        }
        // Nobody may overwrite the tiles while another thread still reads them.
        __syncthreads();
    }
    SetElement(Csub, row, col, Cvalue);
}

// Sets every element of the host matrix A to `value`.
void SetMatValue(Matrix A, int value)
{
    for (int i = 0; i < A.height; ++i)
    {
        for (int j = 0; j < A.width; ++j)
        {
            A.elements[i * A.width + j] = value;
        }
    }
}

// Prints a host matrix, one row per line.
void PrintMat(const Matrix A)
{
    for (int i = 0; i < A.height; ++i)
    {
        for (int j = 0; j < A.width; ++j)
        {
            printf("%f ", A.elements[i * A.width + j]);
        }
        printf("\n");
    }
}

int main()
{
    Matrix A, B, C;
    A.width = 128;
    A.height = 128;
    A.elements = (float *)malloc(A.width * A.height * sizeof(float));
    B.width = 128;
    B.height = 128;
    B.elements = (float *)malloc(B.width * B.height * sizeof(float));
    C.width = 128;
    C.height = 128;
    C.elements = (float *)malloc(C.width * C.height * sizeof(float));
    if (A.elements == NULL || B.elements == NULL || C.elements == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }
    SetMatValue(A, 2);
    SetMatValue(B, 2);

    MatMul(A, B, C);
    PrintMat(C);
    system("pause");

    free(A.elements);
    free(B.elements);
    free(C.elements);
    return 0;
}

7、页锁定主机存储器 Page-locked Host memory

// Section 7 example: page-locked host memory mapped into the device address
// space, so a kernel reads and writes host buffers directly.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>

#define N 256

// Abort with a readable message when a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Fills pData[0..size) with value.
void SetArrayValue(float *pData, int size, float value)
{
    for (int i = 0; i < size; ++i)
    {
        pData[i] = value;
    }
}

// C = A + B for N-element vectors, one thread per element. Indexes by
// threadIdx.x only, so it expects a single block; threads past N do nothing.
__global__ void VecAdd(float *A, float *B, float *C)
{
    int index = threadIdx.x;
    if (index < N)
    {
        C[index] = A[index] + B[index];
    }
}

// Prints `size` floats, emitting a line break before every tenth value.
void PrintArray(float *data, int size)
{
    for (int i = 0; i < size; ++i)
    {
        if ((i + 1) % 10 == 0)
        {
            printf("\n");
        }
        printf("%f ", data[i]);
    }
}

int main()
{
    // Ask for host-memory mapping before any other runtime call. The
    // published listing left this commented out as optional.
    CUDA_CHECK(cudaSetDeviceFlags(cudaDeviceMapHost));

    cudaDeviceProp deviceProp;
    CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, 0));
    if (deviceProp.integrated)
    {
        printf("GPU is integrated\n");
        return 0;
    }
    if (!deviceProp.canMapHostMemory)
    {
        printf("can't map host memory\n");
        return 1;
    }

    float *hostPtrA;
    CUDA_CHECK(cudaHostAlloc(&hostPtrA, sizeof(float) * N,
                             cudaHostAllocDefault | cudaHostAllocMapped));
    SetArrayValue(hostPtrA, N, 29);
    PrintArray(hostPtrA, N);
    system("pause");

    float *hostPtrB;
    CUDA_CHECK(cudaHostAlloc(&hostPtrB, sizeof(float) * N,
                             cudaHostAllocDefault | cudaHostAllocMapped));
    SetArrayValue(hostPtrB, N, 31);

    float *hostPtrC;
    CUDA_CHECK(cudaHostAlloc(&hostPtrC, sizeof(float) * N,
                             cudaHostAllocDefault | cudaHostAllocMapped));

    // Device-side aliases of the three pinned host buffers.
    float *devPtrA, *devPtrB, *devPtrC;
    CUDA_CHECK(cudaHostGetDevicePointer(&devPtrA, hostPtrA, 0));
    CUDA_CHECK(cudaHostGetDevicePointer(&devPtrB, hostPtrB, 0));
    CUDA_CHECK(cudaHostGetDevicePointer(&devPtrC, hostPtrC, 0));

    VecAdd<<<1, N>>>(devPtrA, devPtrB, devPtrC);
    CUDA_CHECK(cudaGetLastError());
    // No copy-back is needed, but the kernel must finish before the host reads.
    CUDA_CHECK(cudaDeviceSynchronize());

    PrintArray(hostPtrC, N);
    system("pause");

    // Pinned memory has to be released with cudaFreeHost.
    CUDA_CHECK(cudaFreeHost(hostPtrA));
    CUDA_CHECK(cudaFreeHost(hostPtrB));
    CUDA_CHECK(cudaFreeHost(hostPtrC));
    return 0;
}

8、纹理存储的使用 texture memory

// Section 8 example: sampling a rotated image through a 2D texture reference.
// NOTE(review): texture references (texture<>, cudaBindTextureToArray) are a
// legacy API; newer toolkits expect texture objects - confirm against the
// toolkit version in use.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>

// Abort with a readable message when a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// 2D float texture bound to the input image in main().
texture<float, cudaTextureType2D, cudaReadModeElementType> texRef;

// Writes to output[y * width + x] the texture value found after rotating the
// normalized coordinate of (x, y) by theta around the image centre.
// Expects a 2D launch; threads outside the width x height image return
// without writing (the launch grid is rounded up to whole blocks).
__global__ void transformKernel(float *output, int width, int height, float theta)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= (unsigned int)width || y >= (unsigned int)height)
    {
        return;
    }

    // Normalized coordinates, shifted so the rotation centre is the image centre.
    float u = x / (float)width;
    float v = y / (float)height;
    u -= 0.5f;
    v -= 0.5f;
    // sinf/cosf keep the arithmetic in single precision.
    float tu = u * cosf(theta) - v * sinf(theta) + 0.5f;
    float tv = v * cosf(theta) + u * sinf(theta) + 0.5f;
    output[y * width + x] = tex2D(texRef, tu, tv);
}

int main()
{
    int width = 25, height = 25;

    cudaChannelFormatDesc channelDesc =
        cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray *cuArray;
    CUDA_CHECK(cudaMallocArray(&cuArray, &channelDesc, width, height));

    // Host image whose pixel value equals its linear index.
    float *h_data = (float *)malloc(width * height * sizeof(float));
    if (h_data == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }
    for (int i = 0; i < height; ++i)
    {
        for (int j = 0; j < width; ++j)
        {
            h_data[i * width + j] = i * width + j;
        }
    }
    CUDA_CHECK(cudaMemcpyToArray(cuArray, 0, 0, h_data, width * height * sizeof(float),
                                 cudaMemcpyHostToDevice));

    // Wrap out-of-range coordinates, interpolate linearly, use [0,1) coordinates.
    texRef.addressMode[0] = cudaAddressModeWrap;
    texRef.addressMode[1] = cudaAddressModeWrap;
    texRef.filterMode = cudaFilterModeLinear;
    texRef.normalized = true;
    CUDA_CHECK(cudaBindTextureToArray(texRef, cuArray, channelDesc));

    float *output;
    CUDA_CHECK(cudaMalloc(&output, width * height * sizeof(float)));

    dim3 dimBlock(16, 16);
    dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x,
                 (height + dimBlock.y - 1) / dimBlock.y);
    // The angle goes straight into cosf/sinf, so it is interpreted in radians.
    float angle = 30;
    transformKernel<<<dimGrid, dimBlock>>>(output, width, height, angle);
    CUDA_CHECK(cudaGetLastError());

    float *hostPtr = (float *)malloc(sizeof(float) * width * height);
    if (hostPtr == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }
    CUDA_CHECK(cudaMemcpy(hostPtr, output, sizeof(float) * width * height,
                          cudaMemcpyDeviceToHost));
    for (int i = 0; i < height; ++i)
    {
        for (int j = 0; j < width; ++j)
        {
            printf("%f ", hostPtr[i * width + j]);
        }
        printf("\n");
    }

    free(hostPtr);
    free(h_data);
    CUDA_CHECK(cudaUnbindTexture(texRef));
    CUDA_CHECK(cudaFreeArray(cuArray));
    CUDA_CHECK(cudaFree(output));
    system("pause");
    return 0;
}

9、surface Memory的使用方法

// Section 9 example: copying one CUDA array into another through surface
// references.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>

// Surface references bound to the source and destination arrays in main().
surface<void, 2> inputSurfRef;
surface<void, 2> outputSurfRef;

// Copies the 4-byte texel at (x, y) from inputSurfRef to outputSurfRef.
// Expects a 2D launch; threads outside width x height do nothing.
__global__ void copyKernel(int width, int height)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height)
    {
        return;
    }

    // Surface x coordinates are byte offsets, hence the factor 4 per texel.
    uchar4 texel;
    surf2Dread(&texel, inputSurfRef, x * 4, y);
    surf2Dwrite(texel, outputSurfRef, x * 4, y);
}

void main()
{
    int width = 256, height = 256;
    const int texelCount = width * height;

    // Host input: every 32-bit texel holds the value 3.
    unsigned int *h_data = (unsigned int *)malloc(width * height * sizeof(unsigned int));
    for (int idx = 0; idx < texelCount; ++idx)
    {
        h_data[idx] = 3;
    }
    int size = width * height * sizeof(unsigned int);

    // Both arrays use four 8-bit unsigned channels and are created for
    // surface load/store access.
    cudaChannelFormatDesc channelDesc =
        cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);
    cudaArray *cuInputArray;
    cudaArray *cuOutputArray;
    cudaMallocArray(&cuInputArray, &channelDesc, width, height, cudaArraySurfaceLoadStore);
    cudaMallocArray(&cuOutputArray, &channelDesc, width, height, cudaArraySurfaceLoadStore);

    cudaMemcpyToArray(cuInputArray, 0, 0, h_data, size, cudaMemcpyHostToDevice);
    cudaBindSurfaceToArray(inputSurfRef, cuInputArray);
    cudaBindSurfaceToArray(outputSurfRef, cuOutputArray);

    dim3 dimBlock(16, 16);
    dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x,
                 (height + dimBlock.y - 1) / dimBlock.y);
    copyKernel<<<dimGrid, dimBlock>>>(width, height);

    unsigned int *host_output = (unsigned int *)malloc(sizeof(unsigned int) * width * height);
    cudaMemcpyFromArray(host_output, cuOutputArray, 0, 0, size, cudaMemcpyDeviceToHost);

    // Print the copied image row by row.
    for (int rowIdx = 0; rowIdx < height; ++rowIdx)
    {
        const unsigned int *rowPtr = host_output + rowIdx * width;
        for (int colIdx = 0; colIdx < width; ++colIdx)
        {
            printf("%u ", rowPtr[colIdx]);
        }
        printf("\n");
    }
    system("pause");

    free(host_output);
    free(h_data);
    cudaFreeArray(cuInputArray);
    cudaFreeArray(cuOutputArray);
}

10、opengl和cuda的交互

// Section 10 example: a CUDA kernel writes vertex positions straight into an
// OpenGL vertex buffer that is then drawn as points.
// References:
//   https://devtalk.nvidia.com/default/topic/502692/how-to-use-open_gl/
//   http://stackoverflow.com/questions/12082357/errors-while-using-opengl-buffers-using-visual-studio-2010-in-windows7
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <gl/glew.h> // must be included before cuda_gl_interop.h
#include "cuda_gl_interop.h"

#include <stdio.h>
#include <stdlib.h>

//#include <gl/GL.h>
#include <gl/glut.h>

// Abort with a readable message when a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

GLuint postionsVBO = 1;
struct cudaGraphicsResource *postionsVBO_CUDA;

// Size of the vertex grid (width * height points).
int width = 256;
int height = 256;
// Animation time, kept on the host and passed to the kernel by value. The
// published listing kept it in a __device__ variable that host code read and
// incremented directly, which host code cannot do.
float host_time = 1;

__global__ void createVertices(float4 *positions, float time, unsigned int width,
                               unsigned int height);

// One-time GL state: black clear colour.
void init()
{
    glClearColor(0.f, 0.f, 0.f, 1.f);
    glClear(GL_DEPTH_BUFFER_BIT | GL_COLOR_BUFFER_BIT);
}

// GLUT reshape callback: updates viewport and projection for the new size.
void reshape(int width, int height)
{
    // Avoid a division by zero while the window is collapsed.
    if (width <= 0 || height <= 0)
    {
        return;
    }
    glViewport(0, 0, width, height);
    glMatrixMode(GL_PROJECTION);
    glLoadIdentity();
    if (width > height)
    {
        gluPerspective(45, (GLfloat)width / height, 0.001, 1000);
    }
    else
    {
        gluPerspective(45, (GLfloat)height / width, 0.001, 1000);
    }
    // GL_MODELVIEW selects the model-view stack; GL_MATRIX_MODE is a query
    // token and is not a valid argument here.
    glMatrixMode(GL_MODELVIEW);
    glLoadIdentity();
}

// GLUT display callback: maps the VBO into CUDA, regenerates the vertices,
// unmaps it and draws the points.
void display()
{
    float4 *positions;
    CUDA_CHECK(cudaGraphicsMapResources(1, &postionsVBO_CUDA, 0));
    size_t numb_bytes;
    CUDA_CHECK(cudaGraphicsResourceGetMappedPointer((void **)&positions, &numb_bytes,
                                                    postionsVBO_CUDA));

    dim3 dimBlock(16, 16, 1);
    dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x,
                 (height + dimBlock.y - 1) / dimBlock.y, 1);
    createVertices<<<dimGrid, dimBlock>>>(positions, host_time, width, height);
    CUDA_CHECK(cudaGetLastError());
    host_time++;

    // The buffer must be unmapped before OpenGL may use it again.
    CUDA_CHECK(cudaGraphicsUnmapResources(1, &postionsVBO_CUDA, 0));

    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
    glBindBuffer(GL_ARRAY_BUFFER, postionsVBO);
    glVertexPointer(4, GL_FLOAT, 0, 0);
    glEnableClientState(GL_VERTEX_ARRAY);
    glDrawArrays(GL_POINTS, 0, width * height);
    glDisableClientState(GL_VERTEX_ARRAY);
    glutSwapBuffers();
    glutPostRedisplay();
}

// Writes one vertex per thread: a width x height grid in [-1, 1) x [-1, 1)
// displaced by a sine/cosine wave that depends on `time`. Expects a 2D
// launch; threads outside the grid return without writing.
__global__ void createVertices(float4 *positions, float time, unsigned int width,
                               unsigned int height)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height)
    {
        return;
    }
    float u = x / (float)width;
    float v = y / (float)height;
    u = u * 2.f - 1.f;
    v = v * 2.f - 1.f;
    float freq = 4.f;
    float w = sinf(u * freq + time) * cosf(v * freq + time) * 0.5f;
    positions[y * width + x] = make_float4(u, w, v, 1.f);
}

int main(int argc, char *argv[])
{
    CUDA_CHECK(cudaGLSetGLDevice(0));
    glutInit(&argc, argv);
    glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB);
    glutInitWindowPosition(0, 0);
    glutInitWindowSize(100, 100);
    glutCreateWindow("opengl-cuda");
    init();
    glutDisplayFunc(display);
    glutReshapeFunc(reshape);
    // GLEW must be initialised before the buffer functions are called, see
    // http://stackoverflow.com/questions/12344612/unusual-error-using-opengl-buffers-with-cuda-interop-on-ms-visual-studio-2010
    glewInit();

    // Create an empty VBO holding one float4 per vertex and register it with CUDA.
    glGenBuffers(1, &postionsVBO);
    glBindBuffer(GL_ARRAY_BUFFER, postionsVBO);
    unsigned int size = width * height * 4 * sizeof(float);
    glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW);
    glBindBuffer(GL_ARRAY_BUFFER, 0);
    CUDA_CHECK(cudaGraphicsGLRegisterBuffer(&postionsVBO_CUDA, postionsVBO,
                                            cudaGraphicsMapFlagsWriteDiscard));
    glutMainLoop();
    return 0;
}

11、Formatted output——在device函数中使用printf,要求设备的compute capability至少为2.0

// Section 11 example: formatted output from device code.
#include <cuda.h>
#include <helper_cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// Every thread prints its own index within the block.
__global__ void bcast()
{
    const unsigned int lane = threadIdx.x;
    printf("%d\n", lane);
}

void main()
{
    // One block of 32 threads.
    const dim3 gridShape(1);
    const dim3 blockShape(32);
    bcast<<<gridShape, blockShape>>>();

    // Wait for the kernel before pausing so its output has been produced.
    cudaDeviceSynchronize();
    system("pause");
}

12、Assertion——在设备端函数中使用assert,要求设备的计算能力至少为2.0

// Section 12 example: assert() inside device code.
#include <cuda.h>
#include <helper_cuda.h>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <assert.h>
#include <stdlib.h>

// The first assertion holds; the second one is false on purpose to show what
// a failing device-side assert looks like.
__global__ void testAssert(void)
{
    int is_one = 1;
    int should_be_one = 0;

    assert(is_one);
    assert(should_be_one);
}

void main()
{
    // A single thread is enough to trigger the assertion.
    const dim3 gridShape(1);
    const dim3 blockShape(1);
    testAssert<<<gridShape, blockShape>>>();

    cudaDeviceSynchronize();
    cudaDeviceReset();
    system("pause");
}

13、Per Thread Allocation On heap 每个线程在堆上分配

// Section 13 example: every thread allocates its own buffer on the device heap.
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <helper_cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// Each thread allocates 100 bytes, prints the pointer it received and frees
// the buffer again.
__global__ void mallocTestPerThread()
{
    const size_t byteCount = 100;
    char *buffer = (char *)malloc(byteCount);
    printf("Thread %d got pointer:%p\n", threadIdx.x, buffer);
    free(buffer);
}

int main()
{
    // Set the device malloc heap to 128 MB before the kernel is launched.
    const size_t heapBytes = 128 * 1024 * 1024;
    cudaDeviceSetLimit(cudaLimitMallocHeapSize, heapBytes);

    // One block of five threads.
    const dim3 gridShape(1);
    const dim3 blockShape(5);
    mallocTestPerThread<<<gridShape, blockShape>>>();

    cudaDeviceSynchronize();
    system("pause");
    return 0;
}

14、Per Thread Block Allocation 每个线程块在堆上分配空间

// Section 14 example: one device-heap allocation per thread block, shared by
// all threads of the block.
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <helper_cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// Abort with a readable message when a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Thread 0 of each block allocates a buffer of 64 ints per thread, every
// thread fills its own 64 slots, and thread 0 frees the buffer again.
// Indexes by threadIdx.x only, so it expects 1D blocks.
__global__ void mallocTestPerThreadBlock()
{
    // The pointer lives in shared memory so the whole block sees it.
    __shared__ int *data;

    if (threadIdx.x == 0)
    {
        // 64 ints per thread. The size is in bytes, so sizeof(int) is needed;
        // without it the loop below wrote four times past the allocation.
        size_t size = (size_t)blockDim.x * 64 * sizeof(int);
        data = (int *)malloc(size);
    }
    // Make the pointer written by thread 0 visible to the whole block.
    __syncthreads();

    // The whole block sees the same pointer, so on failure every thread leaves
    // here together and nobody waits at the barrier below.
    if (data == NULL)
    {
        return;
    }

    // Slot i of thread t is data[i * blockDim.x + t].
    int *ptr = data;
    for (int i = 0; i < 64; ++i)
    {
        ptr[i * blockDim.x + threadIdx.x] = threadIdx.x;
    }

    // All threads must be done with the buffer before it is released.
    __syncthreads();
    if (threadIdx.x == 0)
    {
        free(data);
    }
}

int main()
{
    // Set the device malloc heap to 128 MB before the kernel is launched.
    CUDA_CHECK(cudaDeviceSetLimit(cudaLimitMallocHeapSize, 128 * 1024 * 1024));
    mallocTestPerThreadBlock<<<10, 128>>>();
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());
    system("pause");
    return 0;
}

15、Allocation Persisting Between Kernel Launches 在堆上分配

// Section 15 example: device-heap allocations that outlive the kernel that
// created them and are used and freed by later kernels.
#include <cuda.h>
#include <helper_cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define NUM_BLOCKS 20

// Abort with a readable message when a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// One heap pointer per block, kept in global memory between kernel launches.
__device__ int *dataptr[NUM_BLOCKS];

// Thread 0 of each block allocates one int per thread; every thread then
// zeroes its own slot. Must be launched with at most NUM_BLOCKS blocks.
__global__ void allocmem()
{
    if (threadIdx.x == 0)
    {
        dataptr[blockIdx.x] = (int *)malloc(blockDim.x * sizeof(int));
    }
    // Make the pointer stored by thread 0 visible to the rest of the block.
    __syncthreads();

    if (dataptr[blockIdx.x] == NULL)
    {
        return;
    }
    dataptr[blockIdx.x][threadIdx.x] = 0;
}

// Adds the thread index to the thread's slot. Must use the same launch
// configuration as allocmem so that every slot exists.
__global__ void usemem()
{
    int *ptr = dataptr[blockIdx.x];
    if (ptr != NULL)
    {
        ptr[threadIdx.x] += threadIdx.x;
    }
}

// Prints the final value of every slot, then thread 0 releases the block's
// buffer. Must use the same launch configuration as allocmem.
__global__ void freemem()
{
    int *ptr = dataptr[blockIdx.x];
    if (ptr != NULL)
        printf("Block %d,Thread=%d:final value=%d\n", blockIdx.x, threadIdx.x,
               ptr[threadIdx.x]);

    // Every thread must have read its slot before the buffer goes away.
    __syncthreads();
    if (threadIdx.x == 0)
    {
        free(ptr);
    }
}

int main()
{
    // Set the device malloc heap to 128 MB before the first kernel is launched.
    CUDA_CHECK(cudaDeviceSetLimit(cudaLimitMallocHeapSize, 128 * 1024 * 1024));

    // Kernels issued to the same stream run in order, so no host
    // synchronization is needed between them.
    allocmem<<<NUM_BLOCKS, 10>>>();
    usemem<<<NUM_BLOCKS, 10>>>();
    usemem<<<NUM_BLOCKS, 10>>>();
    usemem<<<NUM_BLOCKS, 10>>>();
    freemem<<<NUM_BLOCKS, 10>>>();
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    system("pause");
    return 0;
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: