您的位置:首页 > 其它

CUDA使用纹理内存

2013-07-01 14:17 471 查看
纹理内存位于设备端,global memory也位于设备端,但是texture memory的访问速度较global memory要快。

因为纹理内存有cache, 只有当cache没有命中的时候才会去访问device memory,否则访问texture cache具有很小的延迟。

另外,texture cache的2D定位已经进行了优化,对于同一线程束的线程访问位置临近的texture memory时效率非常高。

还有,texture memory的stream fetching 也进行了优化,所以即使cache没有命中,对texture memory的访问延迟也不会很高。

初始化纹理内存:

texture<uchar, 3, cudaReadModeNormalizedFloat> tex;
cudaArray *d_volumeArray = 0;

extern "C"
void initCudaTexture(const uchar *h_volume, cudaExtent volumeSize)
{
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar>();

cutilSafeCall(cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize));

cudaMemcpy3DParms copyParams = {0};
copyParams.srcPtr = make_cudaPitchedPtr((void*)h_volume, volumeSize.width*sizeof(uchar), volumeSize.width, volumeSize.height);
copyParams.dstArray = d_volumeArray;
copyParams.extent   = volumeSize;
copyParams.kind     = cudaMemcpyHostToDevice;
cutilSafeCall(cudaMemcpy3D(©Params));

tex.normalized = true;
tex.filterMode = cudaFilterModeLinear;
tex.addressMode[0] = cudaAddressModeWrap;
tex.addressMode[1] = cudaAddressModeWrap;
tex.addressMode[2] = cudaAddressModeWrap;

cutilSafeCall(cudaBindTextureToArray(tex, d_volumeArray, channelDesc));
}


在上述代码中已经将h_volume中的数据拷贝到设备端的d_volumeArray中,然后又将其绑定到一个纹理内存。

下面在kernel函数中,对纹理进行访问,并将数据保存到PBO中,然后绘制。PBO的使用在前面已经写过了,可以参考之。

__global__ void kernel(uint *d_output, uint imageW, uint imageH, float w)
{
uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;

float u = x / (float)imageW;
float v = y / (float)imageH;

float voxel = tex3D(tex, u, v, w);

if ((x < imageW) && (y < imageH))
{
uint i = __umul24(y, imageW) + x;
d_output[i] = voxel * 255;

//		CUPRINTF("%d  ", d_output[i]);
}
}


代码:

//main.cpp

#include <gl/glew.h>
#include <cuda_runtime.h>
#include <cutil_inline.h>
#include <cutil_gl_inline.h>
#include <cutil_gl_error.h>
#include <rendercheck_gl.h>

typedef unsigned int  uint;
typedef unsigned char uchar;

unsigned int window_width = 512;
unsigned int window_height = 512;

unsigned int timer = 0;

bool animFlag = true;
float animTime = 0.0;
float animInc  = 0.1;

float w = 0.5;

GLuint pbo = NULL;
struct cudaGraphicsResource *cuda_pbo_resource;

cudaExtent volumeSize = make_cudaExtent(32, 32, 32);

extern "C"
void initCudaTexture(const uchar *h_volume, cudaExtent volumeSize);

extern "C"
void launch_kernel(uint *d_output, uint imageW, uint imageH, float w);

void createPBO(GLuint *pbo)
{
if(pbo)
{
glGenBuffers(1, pbo);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, *pbo);
glBufferData(GL_PIXEL_UNPACK_BUFFER, window_width*window_height*sizeof(GLubyte)*4, 0, GL_STREAM_DRAW);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

//cudaGLRegisterBufferObject(*pbo);
cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, *pbo, cudaGraphicsMapFlagsWriteDiscard);
}

}

void deletePBO(GLuint *pbo)
{
if (pbo)
{
//	cudaGLUnregisterBufferObject(*pbo);
cudaGraphicsUnregisterResource(cuda_pbo_resource);

glBindBuffer(GL_ARRAY_BUFFER, *pbo);
glDeleteBuffers(1, pbo);

*pbo = NULL;
}
}

void cleanupCuda()
{
if(pbo) deletePBO(&pbo);
}

void runCuda()
{

unsigned int *d_output = NULL;
size_t num_bytes;

//	cudaGLMapBufferObject((void**)&d_output, pbo);
cudaGraphicsMapResources(1, &cuda_pbo_resource, 0);
cudaGraphicsResourceGetMappedPointer((void**)&d_output, &num_bytes, cuda_pbo_resource);

launch_kernel(d_output, window_width, window_height, w);

cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0);
//	cudaGLUnmapBufferObject(pbo);

}

uchar *loadVolumeData(const char *filename)
{
size_t size = volumeSize.width * volumeSize.height * volumeSize.depth;

FILE *fp = fopen(filename, "rb");
if (!fp)
{
fprintf(stderr, "Error openging file '%s'\n", filename);
return 0;
}

uchar *data = (uchar *)malloc(size);
size_t read = fread(data, 1, size, fp);
fclose(fp);

printf("Read '%s', %lu bytes\n", filename, read);

return data;

}

void initCuda(int argc, char **argv)
{
if(cutCheckCmdLineFlag(argc, (const char**)argv, "device"))
cutilGLDeviceInit(argc, argv);
else
cudaGLSetGLDevice(cutGetMaxGflopsDeviceId());

createPBO(&pbo);

uchar *data = loadVolumeData("Bucky.raw");

initCudaTexture(data, volumeSize);

atexit(cleanupCuda);

runCuda();
}

void computeFPS()
{
static int fpsCount = 0;
static int fpsLimit = 100;

fpsCount++;

if(fpsCount == fpsLimit)
{
char fps[256];
float ifps = 1.0f / (cutGetAverageTimerValue(timer) / 1000.0f);
sprintf(fps, "Cuda GL Interop Wrapper: %3.1f fps", ifps);

glutSetWindowTitle(fps);
fpsCount = 0;

cutilCheckError(cutResetTimer(timer));
}
}

void display()
{
runCuda();

glClear(GL_COLOR_BUFFER_BIT);

glDisable(GL_DEPTH_TEST);
glRasterPos2i(0, 0);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo);
glDrawPixels(window_width, window_height, GL_RGBA, GL_UNSIGNED_BYTE, 0);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

glutSwapBuffers();
glutReportErrors();
/*if (animFlag)
{

animTime += animInc;
}*/
//	glutPostRedisplay();
}

void fpsDisplay()
{
cutilCheckError(cutStartTimer(timer));

display();

cutilCheckError(cutStopTimer(timer));

computeFPS();
}

void keyboard(unsigned char key, int x, int y)
{

}

void idle()
{
if (animFlag)
{
//		animTime += animInc;
w += 0.01f;
glutPostRedisplay();
}
}

void reshape(int x, int y)
{
glViewport(0, 0, x, y);

glMatrixMode(GL_MODELVIEW);
glLoadIdentity();

glMatrixMode(GL_PROJECTION);
glLoadIdentity();
glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
}

CUTBoolean initGL(int argc, char **argv)
{
glutInit(&argc, argv);
glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE);
glutInitWindowSize(window_width, window_height);
glutCreateWindow("Cuda GL Interop Demo (adapted from NVDIA's simpleGL)");

glutDisplayFunc(fpsDisplay);
glutKeyboardFunc(keyboard);
glutReshapeFunc(reshape);
glutIdleFunc(idle);

glewInit();
if(!glewIsSupported("GL_VERSION_2_0"))
{
fprintf(stderr, "ERROR: Support for necessary OpengGL extensions missing.");
return CUTFalse;
}

glClearColor(0.0, 0.0, 0.0, 1.0);
glDisable(GL_DEPTH_TEST);

glViewport(0, 0, window_width, window_height);

glMatrixMode(GL_PROJECTION);
glLoadIdentity();
gluPerspective(60.0, (GLfloat)window_width / (GLfloat)window_height, 0.1, 10.0);

return CUTTrue;
}

int main(int argc, char **argv)
{

cutilCheckError(cutCreateTimer(&timer));

if(CUTFalse == initGL(argc, argv))
return CUTFalse;

initCuda(argc, argv);
CUT_CHECK_ERROR_GL();

runCuda();

glutDisplayFunc(fpsDisplay);
glutKeyboardFunc(keyboard);
glutIdleFunc(idle);

glutMainLoop();

cudaThreadExit();			///////////////////////////
cutilExit(argc, argv);		//////////////////////////

}


//kernelTexture.cu

#include <stdio.h>
#include <cutil_inline.h>
#include <cutil_math.h>
#include "cuPrintf.cu"

//The macro CUPRINTF is defined for architectures
//with different compute capabilities.
#if __CUDA_ARCH__ < 200 //Compute capability 1.x architectures
#define CUPRINTF cuPrintf
#else //Compute capability 2.x architectures
#define CUPRINTF(fmt, ...) printf("[%d, %d]:\t" fmt, \
blockIdx.y*gridDim.x+blockIdx.x,\
threadIdx.z*blockDim.x*blockDim.y+threadIdx.y*blockDim.x+threadIdx.x,\
__VA_ARGS__)
#endif

typedef unsigned int uint;
typedef unsigned char uchar;

texture<uchar, 3, cudaReadModeNormalizedFloat> tex;
cudaArray *d_volumeArray = 0;

__global__ void kernel(uint *d_output, uint imageW, uint imageH, float w) { uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y; float u = x / (float)imageW; float v = y / (float)imageH; float voxel = tex3D(tex, u, v, w); if ((x < imageW) && (y < imageH)) { uint i = __umul24(y, imageW) + x; d_output[i] = voxel * 255; // CUPRINTF("%d ", d_output[i]); } }

extern "C"
void launch_kernel(uint *d_output, uint imageW, uint imageH, float w)
{
dim3 blockSize(16, 16, 1);
dim3 gridSize(imageW/blockSize.x, imageH/blockSize.y);

kernel<<<gridSize, blockSize>>>(d_output, imageW, imageH, w);

}

extern "C"
void initCudaTexture(const uchar *h_volume, cudaExtent volumeSize)
{
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar>();

cutilSafeCall(cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize));

cudaMemcpy3DParms copyParams = {0};
copyParams.srcPtr = make_cudaPitchedPtr((void*)h_volume, volumeSize.width*sizeof(uchar), volumeSize.width, volumeSize.height);
copyParams.dstArray = d_volumeArray;
copyParams.extent = volumeSize;
copyParams.kind = cudaMemcpyHostToDevice;
cutilSafeCall(cudaMemcpy3D(©Params));

tex.normalized = true;
tex.filterMode = cudaFilterModeLinear;
tex.addressMode[0] = cudaAddressModeWrap;
tex.addressMode[1] = cudaAddressModeWrap;
tex.addressMode[2] = cudaAddressModeWrap;

cutilSafeCall(cudaBindTextureToArray(tex, d_volumeArray, channelDesc));
}



运行结果:



参考自CUDA SDK
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: