CUDA使用纹理内存
2013-07-01 14:17
471 查看
纹理内存位于设备端,global memory也位于设备端,但是texture memory的访问速度较global memory要快。
因为纹理内存有cache, 只有当cache没有命中的时候才会去访问device memory,否则访问texture cache具有很小的延迟。
另外,texture cache的2D定位已经进行了优化,对于同一线程束的线程访问位置临近的texture memory时效率非常高。
还有,texture memory的stream fetching 也进行了优化,所以即使cache没有命中,对texture memory的访问延迟也不会很高。
初始化纹理内存:
在上述代码中已经将h_volume中的数据拷贝到设备端的d_volumeArray中,然后又将其绑定到一个纹理内存。
下面在kernel函数中,对纹理进行访问,并将数据保存到PBO中,然后绘制。PBO的使用在前面已经写过了,可以参考之。
代码:
运行结果:
参考自CUDA SDK
因为纹理内存有cache, 只有当cache没有命中的时候才会去访问device memory,否则访问texture cache具有很小的延迟。
另外,texture cache的2D定位已经进行了优化,对于同一线程束的线程访问位置临近的texture memory时效率非常高。
还有,texture memory的stream fetching 也进行了优化,所以即使cache没有命中,对texture memory的访问延迟也不会很高。
初始化纹理内存:
texture<uchar, 3, cudaReadModeNormalizedFloat> tex; cudaArray *d_volumeArray = 0; extern "C" void initCudaTexture(const uchar *h_volume, cudaExtent volumeSize) { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar>(); cutilSafeCall(cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize)); cudaMemcpy3DParms copyParams = {0}; copyParams.srcPtr = make_cudaPitchedPtr((void*)h_volume, volumeSize.width*sizeof(uchar), volumeSize.width, volumeSize.height); copyParams.dstArray = d_volumeArray; copyParams.extent = volumeSize; copyParams.kind = cudaMemcpyHostToDevice; cutilSafeCall(cudaMemcpy3D(©Params)); tex.normalized = true; tex.filterMode = cudaFilterModeLinear; tex.addressMode[0] = cudaAddressModeWrap; tex.addressMode[1] = cudaAddressModeWrap; tex.addressMode[2] = cudaAddressModeWrap; cutilSafeCall(cudaBindTextureToArray(tex, d_volumeArray, channelDesc)); }
在上述代码中已经将h_volume中的数据拷贝到设备端的d_volumeArray中,然后又将其绑定到一个纹理内存。
下面在kernel函数中,对纹理进行访问,并将数据保存到PBO中,然后绘制。PBO的使用在前面已经写过了,可以参考之。
__global__ void kernel(uint *d_output, uint imageW, uint imageH, float w) { uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y; float u = x / (float)imageW; float v = y / (float)imageH; float voxel = tex3D(tex, u, v, w); if ((x < imageW) && (y < imageH)) { uint i = __umul24(y, imageW) + x; d_output[i] = voxel * 255; // CUPRINTF("%d ", d_output[i]); } }
代码:
//main.cpp #include <gl/glew.h> #include <cuda_runtime.h> #include <cutil_inline.h> #include <cutil_gl_inline.h> #include <cutil_gl_error.h> #include <rendercheck_gl.h> typedef unsigned int uint; typedef unsigned char uchar; unsigned int window_width = 512; unsigned int window_height = 512; unsigned int timer = 0; bool animFlag = true; float animTime = 0.0; float animInc = 0.1; float w = 0.5; GLuint pbo = NULL; struct cudaGraphicsResource *cuda_pbo_resource; cudaExtent volumeSize = make_cudaExtent(32, 32, 32); extern "C" void initCudaTexture(const uchar *h_volume, cudaExtent volumeSize); extern "C" void launch_kernel(uint *d_output, uint imageW, uint imageH, float w); void createPBO(GLuint *pbo) { if(pbo) { glGenBuffers(1, pbo); glBindBuffer(GL_PIXEL_UNPACK_BUFFER, *pbo); glBufferData(GL_PIXEL_UNPACK_BUFFER, window_width*window_height*sizeof(GLubyte)*4, 0, GL_STREAM_DRAW); glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); //cudaGLRegisterBufferObject(*pbo); cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, *pbo, cudaGraphicsMapFlagsWriteDiscard); } } void deletePBO(GLuint *pbo) { if (pbo) { // cudaGLUnregisterBufferObject(*pbo); cudaGraphicsUnregisterResource(cuda_pbo_resource); glBindBuffer(GL_ARRAY_BUFFER, *pbo); glDeleteBuffers(1, pbo); *pbo = NULL; } } void cleanupCuda() { if(pbo) deletePBO(&pbo); } void runCuda() { unsigned int *d_output = NULL; size_t num_bytes; // cudaGLMapBufferObject((void**)&d_output, pbo); cudaGraphicsMapResources(1, &cuda_pbo_resource, 0); cudaGraphicsResourceGetMappedPointer((void**)&d_output, &num_bytes, cuda_pbo_resource); launch_kernel(d_output, window_width, window_height, w); cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0); // cudaGLUnmapBufferObject(pbo); } uchar *loadVolumeData(const char *filename) { size_t size = volumeSize.width * volumeSize.height * volumeSize.depth; FILE *fp = fopen(filename, "rb"); if (!fp) { fprintf(stderr, "Error openging file '%s'\n", filename); return 0; } uchar *data = (uchar *)malloc(size); size_t read = fread(data, 1, size, fp); fclose(fp); printf("Read '%s', %lu bytes\n", filename, read); return data; } void initCuda(int argc, char **argv) { if(cutCheckCmdLineFlag(argc, (const char**)argv, "device")) cutilGLDeviceInit(argc, argv); else cudaGLSetGLDevice(cutGetMaxGflopsDeviceId()); createPBO(&pbo); uchar *data = loadVolumeData("Bucky.raw"); initCudaTexture(data, volumeSize); atexit(cleanupCuda); runCuda(); } void computeFPS() { static int fpsCount = 0; static int fpsLimit = 100; fpsCount++; if(fpsCount == fpsLimit) { char fps[256]; float ifps = 1.0f / (cutGetAverageTimerValue(timer) / 1000.0f); sprintf(fps, "Cuda GL Interop Wrapper: %3.1f fps", ifps); glutSetWindowTitle(fps); fpsCount = 0; cutilCheckError(cutResetTimer(timer)); } } void display() { runCuda(); glClear(GL_COLOR_BUFFER_BIT); glDisable(GL_DEPTH_TEST); glRasterPos2i(0, 0); glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo); glDrawPixels(window_width, window_height, GL_RGBA, GL_UNSIGNED_BYTE, 0); glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); glutSwapBuffers(); glutReportErrors(); /*if (animFlag) { animTime += animInc; }*/ // glutPostRedisplay(); } void fpsDisplay() { cutilCheckError(cutStartTimer(timer)); display(); cutilCheckError(cutStopTimer(timer)); computeFPS(); } void keyboard(unsigned char key, int x, int y) { } void idle() { if (animFlag) { // animTime += animInc; w += 0.01f; glutPostRedisplay(); } } void reshape(int x, int y) { glViewport(0, 0, x, y); glMatrixMode(GL_MODELVIEW); glLoadIdentity(); glMatrixMode(GL_PROJECTION); glLoadIdentity(); glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0); } CUTBoolean initGL(int argc, char **argv) { glutInit(&argc, argv); glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE); glutInitWindowSize(window_width, window_height); glutCreateWindow("Cuda GL Interop Demo (adapted from NVDIA's simpleGL)"); glutDisplayFunc(fpsDisplay); glutKeyboardFunc(keyboard); glutReshapeFunc(reshape); glutIdleFunc(idle); glewInit(); if(!glewIsSupported("GL_VERSION_2_0")) { fprintf(stderr, "ERROR: Support for necessary OpengGL extensions missing."); return CUTFalse; } glClearColor(0.0, 0.0, 0.0, 1.0); glDisable(GL_DEPTH_TEST); glViewport(0, 0, window_width, window_height); glMatrixMode(GL_PROJECTION); glLoadIdentity(); gluPerspective(60.0, (GLfloat)window_width / (GLfloat)window_height, 0.1, 10.0); return CUTTrue; } int main(int argc, char **argv) { cutilCheckError(cutCreateTimer(&timer)); if(CUTFalse == initGL(argc, argv)) return CUTFalse; initCuda(argc, argv); CUT_CHECK_ERROR_GL(); runCuda(); glutDisplayFunc(fpsDisplay); glutKeyboardFunc(keyboard); glutIdleFunc(idle); glutMainLoop(); cudaThreadExit(); /////////////////////////// cutilExit(argc, argv); ////////////////////////// }
//kernelTexture.cu
#include <stdio.h>
#include <cutil_inline.h>
#include <cutil_math.h>
#include "cuPrintf.cu"
//The macro CUPRINTF is defined for architectures
//with different compute capabilities.
#if __CUDA_ARCH__ < 200 //Compute capability 1.x architectures
#define CUPRINTF cuPrintf
#else //Compute capability 2.x architectures
#define CUPRINTF(fmt, ...) printf("[%d, %d]:\t" fmt, \
blockIdx.y*gridDim.x+blockIdx.x,\
threadIdx.z*blockDim.x*blockDim.y+threadIdx.y*blockDim.x+threadIdx.x,\
__VA_ARGS__)
#endif
typedef unsigned int uint;
typedef unsigned char uchar;
texture<uchar, 3, cudaReadModeNormalizedFloat> tex;
cudaArray *d_volumeArray = 0;
__global__ void kernel(uint *d_output, uint imageW, uint imageH, float w) { uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y; float u = x / (float)imageW; float v = y / (float)imageH; float voxel = tex3D(tex, u, v, w); if ((x < imageW) && (y < imageH)) { uint i = __umul24(y, imageW) + x; d_output[i] = voxel * 255; // CUPRINTF("%d ", d_output[i]); } }
extern "C"
void launch_kernel(uint *d_output, uint imageW, uint imageH, float w)
{
dim3 blockSize(16, 16, 1);
dim3 gridSize(imageW/blockSize.x, imageH/blockSize.y);
kernel<<<gridSize, blockSize>>>(d_output, imageW, imageH, w);
}
extern "C"
void initCudaTexture(const uchar *h_volume, cudaExtent volumeSize)
{
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar>();
cutilSafeCall(cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize));
cudaMemcpy3DParms copyParams = {0};
copyParams.srcPtr = make_cudaPitchedPtr((void*)h_volume, volumeSize.width*sizeof(uchar), volumeSize.width, volumeSize.height);
copyParams.dstArray = d_volumeArray;
copyParams.extent = volumeSize;
copyParams.kind = cudaMemcpyHostToDevice;
cutilSafeCall(cudaMemcpy3D(©Params));
tex.normalized = true;
tex.filterMode = cudaFilterModeLinear;
tex.addressMode[0] = cudaAddressModeWrap;
tex.addressMode[1] = cudaAddressModeWrap;
tex.addressMode[2] = cudaAddressModeWrap;
cutilSafeCall(cudaBindTextureToArray(tex, d_volumeArray, channelDesc));
}
运行结果:
参考自CUDA SDK
相关文章推荐
- CUDA内存--纹理内存的说明与使用
- CUDA使用纹理内存
- CUDA:纹理内存及其使用
- CUDA纹理内存的使用
- CUDA 图像处理使用纹理内存与不使用纹理内存对比
- cuda纹理内存使用
- cuda纹理内存使用例子 vs2013 cuda7.5
- cuda纹理内存的使用
- CUDA中多维数组以及多维纹理内存的使用
- CUDA使用CUDAArray的纹理
- CUDA 纹理内存
- GPU CUDA常量内存使用
- cuda 内存分类和使用
- cuda中内存申请使用void**的原因
- 【cuda学习笔记】2.纹理对象API的使用,实现sobel边缘检测
- CUDA纹理存储器的特性及其使用
- CUDA 纹理内存
- CUDA纹理存储器的特性及其使用
- CUDA二维纹理内存+OpenCV图像滤波
- CUDA 纹理的使用