CUDA By Examples 8 - 纹理内存Texture Memory
2017-06-28 18:56
246 查看
1. 知识点
纹理内存是read-only.被cache.
spatial locality.
texture ref需要和buffer bind. 使用完还要unbind.
2. 热传导 不用纹理内存
在二维grid内计算热量的传导, 类似于对图像做高通(低通)滤波.当1−4k>0时, 相当于低通滤波;
当1−4k<0时, 相当于高通滤波.
#include "cuda.h" #include "../common/book.h" #include "../common/cpu_bitmap.h" #include "../common/cpu_anim.h" #define DIM 1024 #define PI 3.1415926535897932f #define MAX_TEMP 1.0f #define MIN_TEMP 0.0001f #define SPEED 0.25f struct DataBlock{ unsigned char *output_bitmap; float *dev_inSrc; float *dev_outSrc; float *dev_constSrc; CPUAnimBitmap *bitmap; cudaEvent_t start, stop; float totalTime; float frames; }; //将初始图中Heat源拷贝到更新后的图像中. __global__ void copy_const_kernel( float *iptr, const float *cptr) { int x = threadIdx.x + blockIdx.x * blockDim.x; int y = threadIdx.y + blockIdx.y * blockDim.y; int offset = x + y * blockDim.x * gridDim.x; if (cptr[offset] != 0) { iptr[offset] = cptr[offset]; } } //计算更新后的图像. __global__ void blend_kernel( float * outSrc, const float *inSrc) { int x = threadIdx.x + blockIdx.x * blockDim.x; int y = threadIdx.y + blockIdx.y * blockDim.y; int offset = x + y * blockDim.x * gridDim.x; int left = offset - 1; int right = offset + 1; //边界操作. if (x ==0 ) { left++; } if (x == DIM-1) { right--; } int top = offset -DIM; int bottom = offset + DIM; if (y == 0) { top += DIM; } if (y == DIM-1) { bottom -= DIM; } outSrc[offset] = inSrc[offset] + SPEED * ( inSrc[top] + inSrc[bottom] + inSrc[left] + inSrc[right] - inSrc[offset]*4); } //滤波90次算是一帧. void anim_gpu( DataBlock *d, int ticks ) { HANDLE_ERROR( cudaEventRecord( d->start, 0 ) ); dim3 blocks(DIM/16, DIM/16); dim3 threads(16, 16); CPUAnimBitmap *bitmap = d->bitmap; for (int i=0; i<90; i++) { copy_const_kernel<<<blocks, threads>>>(d->dev_inSrc, d->dev_constSrc); blend_kernel<<<blocks,threads>>>( d->dev_outSrc, d->dev_inSrc); swap(d->dev_inSrc, d->dev_outSrc); } float_to_color<<<blocks, threads>>>( d->output_bitmap, d->dev_inSrc ); HANDLE_ERROR( cudaMemcpy( bitmap->get_ptr(), d->output_bitmap, bitmap->image_size(), cudaMemcpyDeviceToHost ) ); HANDLE_ERROR( cudaEventRecord( d->stop, 0 ) ); HANDLE_ERROR( cudaEventSynchronize( d->stop ) ); float elapsedTime; HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime, d->start, d->stop ) ); d->totalTime += elapsedTime; ++d->frames; printf( "Average Time per frame: %3.1f ms\n", d->totalTime/d->frames); } void anim_exit( DataBlock *d ) { cudaFree( d->dev_inSrc ); cudaFree( d->dev_outSrc ); cudaFree( d->dev_constSrc ); HANDLE_ERROR( cudaEventDestroy( d->start ) ); HANDLE_ERROR( cudaEventDestroy( d->stop ) ); } int main(void) { DataBlock data; CPUAnimBitmap bitmap(DIM,DIM, &data); data.bitmap = &bitmap; data.totalTime = 0; data.frames = 0; HANDLE_ERROR( cudaEventCreate( &data.start ) ); HANDLE_ERROR( cudaEventCreate( &data.stop ) ); HANDLE_ERROR( cudaMalloc( (void**)&data.output_bitmap, bitmap.image_size() ) ); HANDLE_ERROR( cudaMalloc( (void**)&data.dev_inSrc, bitmap.image_size() ) ); HANDLE_ERROR( cudaMalloc( (void**)&data.dev_outSrc, bitmap.image_size() ) ); HANDLE_ERROR( cudaMalloc( (void**)&data.dev_constSrc, bitmap.image_size() ) ); float *temp = (float *)malloc( bitmap.image_size() ); for (int i=0; i<DIM*DIM; i++) { temp[i] = 0; int x = i % DIM; int y = i / DIM; if ((x>300) && (x<600) && (y>310) && (y<601)) { temp[i] = MAX_TEMP; } } temp[DIM*100+100] = (MAX_TEMP + MIN_TEMP)/2; temp[DIM*700+100] = MIN_TEMP; temp[DIM*300+300] = MIN_TEMP; temp[DIM*200+700] = MIN_TEMP; for (int y=800; y<900; y++) { for (int x=400; x<500; x++) { temp[x+y*DIM] = MIN_TEMP; } } HANDLE_ERROR( cudaMemcpy( data.dev_constSrc, temp, bitmap.image_size(), cudaMemcpyHostToDevice ) ); for (int y=800; y<DIM; y++) { for (int x=0; x<200; x++) { temp[x+y*DIM] = MAX_TEMP; } } HANDLE_ERROR( cudaMemcpy( data.dev_inSrc, temp, bitmap.image_size(), cudaMemcpyHostToDevice ) ); free( temp ); bitmap.anim_and_exit( (void (*)(void*,int))anim_gpu, (void (*)(void*))anim_exit ); }
3. 使用1-D texture memory
#include "cuda.h" #include "../common/book.h" #include "../common/cpu_bitmap.h" #include "../common/cpu_anim.h" #define DIM 1024 #define PI 3.1415926535897932f #define MAX_TEMP 1.0f #define MIN_TEMP 0.0001f #define SPEED 0.25f //声明texture references. texture<float> texConstSrc; texture<float> texIn; texture<float> texOut; struct DataBlock{ unsigned char *output_bitmap; float *dev_inSrc; float *dev_outSrc; float *dev_constSrc; CPUAnimBitmap *bitmap; cudaEvent_t start, stop; float totalTime; float frames; }; __global__ void copy_const_kernel( float *iptr) { int x = threadIdx.x + blockIdx.x * blockDim.x; int y = threadIdx.y + blockIdx.y * blockDim.y; int offset = x + y * blockDim.x * gridDim.x; float c = tex1Dfetch(texConstSrc, offset); if (c != 0) { iptr[offset] = c; } } __global__ void blend_kernel( float *dst, bool dstOut) { int x = threadIdx.x + blockIdx.x * blockDim.x; int y = threadIdx.y + blockIdx.y * blockDim.y; int offset = x + y * blockDim.x * gridDim.x; int left = offset - 1; int right = offset + 1; if (x ==0 ) { left++; } if (x == DIM-1) { right--; } int top = offset -DIM; int bottom = offset + DIM; if (y == 0) { top += DIM; } if (y == DIM-1) { bottom -= DIM; } float t, l, c, r, b; if(dstOut) { t = tex1Dfetch(texIn, top); l = tex1Dfetch(texIn, left); c = tex1Dfetch(texIn, offset); r = tex1Dfetch(texIn, right); b = tex1Dfetch(texIn, bottom); } else { t = tex1Dfetch(texOut, top); l = tex1Dfetch(texOut, left); c = tex1Dfetch(texOut, offset); r = tex1Dfetch(texOut, right); b = tex1Dfetch(texOut, bottom); } dst[offset] = c + SPEED * (t + b + r + l - 4 * c); } void anim_gpu( DataBlock *d, int ticks ) { HANDLE_ERROR( cudaEventRecord( d->start, 0 ) ); dim3 blocks(DIM/16, DIM/16); dim3 threads(16, 16); CPUAnimBitmap *bitmap = d->bitmap; volatile bool dstOut = true; for (int i=0; i<90; i++) { float *in, *out; if (dstOut) { in = d->dev_inSrc; out = d->dev_outSrc; } else { out = d->dev_inSrc; in = d->dev_outSrc; } copy_const_kernel<<<blocks, threads>>>( in ); blend_kernel<<<blocks,threads>>>( out, dstOut ); dstOut = !dstOut; } float_to_color<<<blocks, threads>>>( d->output_bitmap, d->dev_inSrc ); HANDLE_ERROR( cudaMemcpy( bitmap->get_ptr(), d->output_bitmap, bitmap->image_size(), cudaMemcpyDeviceToHost ) ); HANDLE_ERROR( cudaEventRecord( d->stop, 0 ) ); HANDLE_ERROR( cudaEventSynchronize( d->stop ) ); float elapsedTime; HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime, d->start, d->stop ) ); d->totalTime += elapsedTime; ++d->frames; printf( "Average Time per frame: %3.1f ms\n", d->totalTime/d->frames); } void anim_exit( DataBlock *d ) { //将buffer和ref解绑. cudaUnbindTexture( texIn ); cudaUnbindTexture( texOut ); cudaUnbindTexture( texConstSrc ); cudaFree( d->dev_inSrc ); cudaFree( d->dev_outSrc ); cudaFree( d->dev_constSrc ); HANDLE_ERROR( cudaEventDestroy( d->start ) ); HANDLE_ERROR( cudaEventDestroy( d->stop ) ); } int main(void) { DataBlock data; CPUAnimBitmap bitmap(DIM,DIM, &data); data.bitmap = &bitmap; data.totalTime = 0; data.frames = 0; HANDLE_ERROR( cudaEventCreate( &data.start ) ); HANDLE_ERROR( cudaEventCreate( &data.stop ) ); HANDLE_ERROR( cudaMalloc( (void**)&data.output_bitmap, bitmap.image_size() ) ); HANDLE_ERROR( cudaMalloc( (void**)&data.dev_inSrc, bitmap.image_size() ) ); HANDLE_ERROR( cudaMalloc( (void**)&data.dev_outSrc, bitmap.image_size() ) ); HANDLE_ERROR( cudaMalloc( (void**)&data.dev_constSrc, bitmap.image_size() ) ); HANDLE_ERROR( cudaBindTexture( NULL, texConstSrc, data.dev_constSrc, bitmap.image_size() ) ); HANDLE_ERROR( cudaBindTexture( NULL, texIn, data.dev_inSrc, bitmap.image_size() ) ); HANDLE_ERROR( cudaBindTexture( NULL, texOut, data.dev_outSrc, bitmap.image_size() ) ); float *temp = (float *)malloc( bitmap.image_size() ); for (int i=0; i<DIM*DIM; i++) { temp[i] = 0; int x = i % DIM; int y = i / DIM; if ((x>300) && (x<600) && (y>310) && (y<601)) { temp[i] = MAX_TEMP; } } temp[DIM*100+100] = (MAX_TEMP + MIN_TEMP)/2; temp[DIM*700+100] = MIN_TEMP; temp[DIM*300+300] = MIN_TEMP; temp[DIM*200+700] = MIN_TEMP; for (int y=800; y<900; y++) { for (int x=400; x<500; x++) { temp[x+y*DIM] = MIN_TEMP; } } HANDLE_ERROR( cudaMemcpy( data.dev_constSrc, temp, bitmap.image_size(), cudaMemcpyHostToDevice ) ); for (int y=800; y<DIM; y++) { for (int x=0; x<200; x++) { temp[x+y*DIM] = MAX_TEMP; } } HANDLE_ERROR( cudaMemcpy( data.dev_inSrc, temp, bitmap.image_size(), cudaMemcpyHostToDevice ) ); free( temp ); bitmap.anim_and_exit( (void (*)(void*,int))anim_gpu, (void (*)(void*))anim_exit ); }
4. 使用2-D纹理内存
使用声明texture<float,2> texIn;
使用
tex2D(texIn, x, y);读取数据
使用
cudaBindTexture2D(...);绑定纹理内存
使用
cudaUnbindTexture( texIn);解绑.
#include "cuda.h" #include "../common/book.h" #include "../common/cpu_anim.h" #define DIM 1024 #define PI 3.1415926535897932f #define MAX_TEMP 1.0f #define MIN_TEMP 0.0001f #define SPEED 0.25f // these exist on the GPU side texture<float, 2> texConstSrc; texture<float, 2> texIn; texture<float, 2> texOut; // this kernel takes in a 2-d array of floats // it updates the value-of-interest by a scaled value based // on itself and its nearest neighbors __global__ void blend_kernel( float *dst, bool dstOut ) { // map from threadIdx/BlockIdx to pixel position int x = threadIdx.x + blockIdx.x * blockDim.x; int y = threadIdx.y + blockIdx.y * blockDim.y; int offset = x + y * blockDim.x * gridDim.x; float t, l, c, r, b; if (dstOut) { t = tex2D(texIn,x,y-1); l = tex2D(texIn,x-1,y); c = tex2D(texIn,x,y); r = tex2D(texIn,x+1,y); b = tex2D(texIn,x,y+1); } else { t = tex2D(texOut,x,y-1); l = tex2D(texOut,x-1,y); c = tex2D(texOut,x,y); r = tex2D(texOut,x+1,y); b = tex2D(texOut,x,y+1); } dst[offset] = c + SPEED * (t + b + r + l - 4 * c); } // NOTE - texOffsetConstSrc could either be passed as a // parameter to this function, or passed in __constant__ memory // if we declared it as a global above, it would be // a parameter here: // __global__ void copy_const_kernel( float *iptr, // size_t texOffset ) __global__ void copy_const_kernel( float *iptr ) { // map from threadIdx/BlockIdx to pixel position int x = threadIdx.x + blockIdx.x * blockDim.x; int y = threadIdx.y + blockIdx.y * blockDim.y; int offset = x + y * blockDim.x * gridDim.x; float c = tex2D(texConstSrc,x,y); if (c != 0) iptr[offset] = c; } // globals needed by the update routine struct DataBlock { unsigned char *output_bitmap; float *dev_inSrc; float *dev_outSrc; float *dev_constSrc; CPUAnimBitmap *bitmap; cudaEvent_t start, stop; float totalTime; float frames; }; void anim_gpu( DataBlock *d, int ticks ) { HANDLE_ERROR( cudaEventRecord( d->start, 0 ) ); dim3 blocks(DIM/16,DIM/16); dim3 threads(16,16); CPUAnimBitmap *bitmap = d->bitmap; // since tex is global and bound, we have to use a flag to // select which is in/out per iteration volatile bool dstOut = true; for (int i=0; i<90; i++) { float *in, *out; if (dstOut) { in = d->dev_inSrc; out = d->dev_outSrc; } else { out = d->dev_inSrc; in = d->dev_outSrc; } copy_const_kernel<<<blocks,threads>>>( in ); blend_kernel<<<blocks,threads>>>( out, dstOut ); dstOut = !dstOut; } float_to_color<<<blocks,threads>>>( d->output_bitmap, d->dev_inSrc ); HANDLE_ERROR( cudaMemcpy( bitmap->get_ptr(), d->output_bitmap, bitmap->image_size(), cudaMemcpyDeviceToHost ) ); HANDLE_ERROR( cudaEventRecord( d->stop, 0 ) ); HANDLE_ERROR( cudaEventSynchronize( d->stop ) ); float elapsedTime; HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime, d->start, d->stop ) ); d->totalTime += elapsedTime; ++d->frames; printf( "Average Time per frame: %3.1f ms\n", d->totalTime/d->frames ); } // clean up memory allocated on the GPU void anim_exit( DataBlock *d ) { cudaUnbindTexture( texIn ); cudaUnbindTexture( texOut ); cudaUnbindTexture( texConstSrc ); HANDLE_ERROR( cudaFree( d->dev_inSrc ) ); HANDLE_ERROR( cudaFree( d->dev_outSrc ) ); HANDLE_ERROR( cudaFree( d->dev_constSrc ) ); HANDLE_ERROR( cudaEventDestroy( d->start ) ); HANDLE_ERROR( cudaEventDestroy( d->stop ) ); } int main( void ) { DataBlock data; CPUAnimBitmap bitmap( DIM, DIM, &data ); data.bitmap = &bitmap; data.totalTime = 0; data.frames = 0; HANDLE_ERROR( cudaEventCreate( &data.start ) ); HANDLE_ERROR( cudaEventCreate( &data.stop ) ); int imageSize = bitmap.image_size(); HANDLE_ERROR( cudaMalloc( (void**)&data.output_bitmap, imageSize ) ); // assume float == 4 chars in size (ie rgba) HANDLE_ERROR( cudaMalloc( (void**)&data.dev_inSrc, imageSize ) ); HANDLE_ERROR( cudaMalloc( (void**)&data.dev_outSrc, imageSize ) ); HANDLE_ERROR( cudaMalloc( (void**)&data.dev_constSrc, imageSize ) ); cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>(); HANDLE_ERROR( cudaBindTexture2D( NULL, texConstSrc, data.dev_constSrc, desc, DIM, DIM, sizeof(float) * DIM ) ); HANDLE_ERROR( cudaBindTexture2D( NULL, texIn, data.dev_inSrc, desc, DIM, DIM, sizeof(float) * DIM ) ); HANDLE_ERROR( cudaBindTexture2D( NULL, texOut, data.dev_outSrc, desc, DIM, DIM, sizeof(float) * DIM ) ); // intialize the constant data float *temp = (float*)malloc( imageSize ); for (int i=0; i<DIM*DIM; i++) { temp[i] = 0; int x = i % DIM; int y = i / DIM; if ((x>300) && (x<600) && (y>310) && (y<601)) temp[i] = MAX_TEMP; } temp[DIM*100+100] = (MAX_TEMP + MIN_TEMP)/2; temp[DIM*700+100] = MIN_TEMP; temp[DIM*300+300] = MIN_TEMP; temp[DIM*200+700] = MIN_TEMP; for (int y=800; y<900; y++) { for (int x=400; x<500; x++) { temp[x+y*DIM] = MIN_TEMP; } } HANDLE_ERROR( cudaMemcpy( data.dev_constSrc, temp, imageSize, cudaMemcpyHostToDevice ) ); // initialize the input data for (int y=800; y<DIM; y++) { for (int x=0; x<200; x++) { temp[x+y*DIM] = MAX_TEMP; } } HANDLE_ERROR( cudaMemcpy( data.dev_inSrc, temp, imageSize, cudaMemcpyHostToDevice ) ); free( temp ); bitmap.anim_and_exit( (void (*)(void*,int))anim_gpu, (void (*)(void*))anim_exit ); }
相关文章推荐
- CUDA By Examples 5 - 共享内存 Shared Memory
- CUDA By Examples 6 - 使用常量内存 Constant Memory
- CUDA优化实例(五)纹理内存与常量内存
- CUDA学习日志:常量内存和纹理内存
- CUDA 纹理内存
- CUDA中多维数组以及多维纹理内存的使用
- CUDA 纹理内存
- CUDA BY EXAMPLES第4章代码错误解决方法
- cuda的julia集运行错误解决办法- 《cuda by examples》第四章例子
- CUDA 纹理内存
- cuda纹理内存的使用
- CUDA纹理内存相关参数解释
- cuda纹理内存
- CUDA 纹理内存
- cuda中的纹理内存 二维纹理绑定错误
- CUDA 纹理内存 Error:无法识别texture
- 基于纹理内存的CUDA热传导模拟
- CUDA 学习(十四)、纹理内存
- 绑定CUDA三维数组与纹理内存
- CUDA By Examples 2 - 并行计算向量相加