双边滤波CUDA优化——BilateralFilter CUDA
2015-06-23 16:22
369 查看
转自:http://sangni007.blog.163.com/blog/static/174728148201481305957863/=======双边滤波概述=======双边滤波(Bilateral filter)是一种可以保边去噪的滤波器。之所以可以达到此去噪效果,是因为滤波器是由两个函数构成。一个函数是由几何空间距离决定滤波器系数。另一个由像素差值决定滤波器系数。可以与其相比较的两个filter:高斯低通滤波器(http://en.wikipedia.org/wiki/Gaussian_filter)和α-截尾均值滤波器(去掉百分率为α的最小值和最大之后剩下像素的均值作为滤波器)。=======双边滤波公式======= =======双边滤波代码(CPU)=======OpenCV源码:/****************************************************************************************\ Bilateral Filtering\****************************************************************************************/namespace cv{static voidbilateralFilter_8u( const Mat& src, Mat& dst, int d, double sigma_color, double sigma_space, int borderType ){ int cn = src.channels(); int i, j, k, maxk, radius; Size size = src.size(); CV_Assert( (src.type() == CV_8UC1 || src.type() == CV_8UC3) && src.type() == dst.type() && src.size() == dst.size() && src.data != dst.data ); if( sigma_color <= 0 ) sigma_color = 1; if( sigma_space <= 0 ) sigma_space = 1; double gauss_color_coeff = -0.5/(sigma_color*sigma_color); double gauss_space_coeff = -0.5/(sigma_space*sigma_space); if( d <= 0 ) radius = cvRound(sigma_space*1.5); else radius = d/2; radius = MAX(radius, 1); d = radius*2 + 1; Mat temp; copyMakeBorder( src, temp, radius, radius, radius, radius, borderType ); vector<float> _color_weight(cn*256); vector<float> _space_weight(d*d); vector<int> _space_ofs(d*d); float* color_weight = &_color_weight[0]; float* space_weight = &_space_weight[0]; int* space_ofs = &_space_ofs[0]; // initialize color-related bilateral filter coefficients for( i = 0; i < 256*cn; i++ ) color_weight[i] = (float)std::exp(i*i*gauss_color_coeff); // initialize space-related bilateral filter coefficients for( i = -radius, maxk = 0; i <= radius; i++ ) for( j = -radius; j <= radius; j++ ) { double r = std::sqrt((double)i*i + (double)j*j); if( r > radius ) continue; space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff); space_ofs[maxk++] = (int)(i*temp.step + j*cn); } for( i = 0; i < size.height; i++ ) { const uchar* sptr = temp.data + (i+radius)*temp.step + radius*cn; uchar* dptr = dst.data + i*dst.step; if( cn == 1 ) { for( j = 0; j < size.width; j++ ) { float sum = 0, wsum = 0; int val0 = sptr[j]; for( k = 0; k < maxk; k++ ) { int val = sptr[j + space_ofs[k]]; float w = space_weight[k]*color_weight[std::abs(val - val0)]; sum += val*w; wsum += w; } // overflow is not possible here => there is no need to use CV_CAST_8U dptr[j] = (uchar)cvRound(sum/wsum); } } else { assert( cn == 3 ); for( j = 0; j < size.width*3; j += 3 ) { float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0; int b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2]; for( k = 0; k < maxk; k++ ) { const uchar* sptr_k = sptr + j + space_ofs[k]; int b = sptr_k[0], g = sptr_k[1], r = sptr_k[2]; float w = space_weight[k]*color_weight[std::abs(b - b0) + std::abs(g - g0) + std::abs(r - r0)]; sum_b += b*w; sum_g += g*w; sum_r += r*w; wsum += w; } wsum = 1.f/wsum; b0 = cvRound(sum_b*wsum); g0 = cvRound(sum_g*wsum); r0 = cvRound(sum_r*wsum); dptr[j] = (uchar)b0; dptr[j+1] = (uchar)g0; dptr[j+2] = (uchar)r0; } } }}OpenCV 双边滤波调用
bilateralFilter(InputArray src, OutputArray dst, int d, double sigmaColor, double sigmaSpace, int borderType=BORDER_DEFAULT );d 表示滤波时像素邻域直径,d为负时由 sigaColor计算得到;d>5时不能实时处理。sigmaColor、sigmaSpace非别表示颜色空间和坐标空间的滤波系数sigma。可以简单的赋值为相同的值。<10时几乎没有效果;>150时为油画的效果。borderType可以不指定。
OpenCV 双边滤波实验
用sigma为10,150,240,480 时效果如下: =======双边滤波优化(CUDA)=======在进行图像处理时,由于计算量大,常常无法到达实时的效果,因此需利用GPU处理,使用CUDA进行优化。尤其是图像滤波这种,(1) 并行度高,线程间耦合度低,每个像素的处理并不相互影响;(2) 像素传输量小,计算量大;特别适合CUDA进行计算。CUDA BilateralFilter流程(可扩展至CUDA图像处理领域)复制数据 Copy Data to Device在Device上开辟2维数据空间作为输入数据: UINT *dImage = NULL; //original imagesize_t pitch;checkCudaErrors( cudaMallocPitch(&dImage, &pitch, sizeof(UINT)*width, height) );复制数据到显卡 Copy Data from Host to Device:checkCudaErrors( cudaMemcpy2D(dImage, pitch, hImage, sizeof(UINT)*width, sizeof(UINT)*width, height, cudaMemcpyHostToDevice));在Device上开辟2维数据空间保存输出数据:unsigned int *dResult;size_t pitch;checkCudaErrors( cudaMallocPitch((void **)&dResult, &pitch, width*sizeof(UINT), height) );使用纹理储存器声明CUDA数组前,以结构体channelDesc描述CUDA数组中组件的数量和数据类型cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();声明纹理参照系:texture<Type,Dim,ReadMode> texRef;texture<uchar4, 2, cudaReadModeElementType> rgbaTex; //以全局变量形式出现将纹理参照系绑定到一个CUDA数组checkCudaErrors( cudaBindTexture2D(0, rgbaTex, dImage, desc, width, height, pitch));纹理拾取 // 书P65tex1Dfetch()tex1D(); tex2D(); tex3D();使用常量储存器位于显存,但拥有缓存加速。空间较小(64K),保存频繁访问的只读数据,两种使用方法直接赋值:__constant__ float c_cuda[4] = {0,1,2,3}; __constant__ float c_num = 1;使用函数:__constant__ float color_weight[4*256]; checkCudaErrors(cudaMemcpyToSymbol(color_weight, color_gaussian, sizeof(float)*(4*256)));本程序__constant__ float color_weight[4*256];__constant__ float space_weight[1024];函数声明局部变量数组 float color_gaussian[4*256]; float space_gaussian[1024]; 使用定义域核和值域核赋值函数赋值:checkCudaErrors(cudaMemcpyToSymbol(color_weight, color_gaussian, sizeof(float)*(4*256)));checkCudaErrors(cudaMemcpyToSymbol(space_weight, space_gaussian, sizeof(float)*(1024)));CUDA处理Kernel并行设计:dim3 blockSize(16, 16);dim3 gridSize((width + 16 - 1) / 16, (height + 16 - 1) / 16);d_bilateral_filter<<< gridSize, blockSize>>>(dDest, width, height, radius);纹理拾取 center = tex2D(rgbaTex, x, y);计算定义域和值域核的值复制数据Copy Data to HostcheckCudaErrors( cudaMemcpy(hImage,dResult,(width * height)*sizeof(UINT),cudaMemcpyDeviceToHost));CUDA源码:#include <helper_math.h>#include <helper_functions.h>#include <helper_cuda.h> // CUDA device initialization helper functions#include "cuda.h"#include "cuda_runtime_api.h"#include "device_launch_parameters.h"#include "device_functions.h"__constant__ float color_weight[4*256];__constant__ float space_weight[1024];UINT *dImage = NULL; //original imageUINT *dTemp = NULL; //temp array for iterationssize_t pitch;texture<uchar4, 2, cudaReadModeElementType> rgbaTex;//声明纹理参照系__device__ float colorLenGaussian(uchar4 a, uchar4 b){ //若想达到漫画效果,就注释掉sqrt,使颜色距离变大 uint mod = (uint)sqrt(((float)b.x - (float)a.x) * ((float)b.x - (float)a.x) + ((float)b.y - (float)a.y) * ((float)b.y - (float)a.y) + ((float)b.z - (float)a.z) * ((float)b.z - (float)a.z) + ((float)b.w - (float)a.w) * ((float)b.w - (float)a.w)); return color_weight[mod];}__device__ uint rgbaFloatToInt(float4 rgba){ rgba.x = __saturatef(fabs(rgba.x)); // clamp to [0.0, 1.0] rgba.y = __saturatef(fabs(rgba.y)); rgba.z = __saturatef(fabs(rgba.z)); rgba.w = __saturatef(fabs(rgba.w)); return (uint(rgba.w * 255.0f) << 24) | (uint(rgba.z * 255.0f) << 16) | (uint(rgba.y * 255.0f) << 8) | uint(rgba.x * 255.0f);}__device__ float4 rgbaIntToFloat(uint c){ float4 rgba; rgba.x = (c & 0xff) * 0.003921568627f; // /255.0f; rgba.y = ((c>>8) & 0xff) * 0.003921568627f; // /255.0f; rgba.z = ((c>>16) & 0xff) * 0.003921568627f; // /255.0f; rgba.w = ((c>>24) & 0xff) * 0.003921568627f; // /255.0f; return rgba;}//column pass using coalesced global memory reads__global__ voidd_bilateral_filter(uint *od, int w, int h, int r){ int x = blockIdx.x*blockDim.x + threadIdx.x; int y = blockIdx.y*blockDim.y + threadIdx.y; if (x >= w || y >= h) { return; } float sum = 0.0f; float factor = 0.0f;; uchar4 t = {0, 0, 0, 0}; float tw = 0.f,tx = 0.f, ty = 0.f, tz = 0.f; uchar4 center = tex2D(rgbaTex, x, y); //t = center; int posIndex = 0; for (int i = -r; i <= r; i++) { for (int j = -r; j <= r; j++) { uchar4 curPix = {0, 0, 0, 0}; UINT d = (UINT) sqrt((double)i*i + (double)j*j); if(d>r) continue; if(x + j<0||y + i<0||x + j>w-1||y + i>h-1) { factor = 0; } else { curPix = tex2D(rgbaTex, x + j, y + i); factor =space_weight[d] * //domain factor colorLenGaussian(curPix, center); //range factor } tw += factor * (float)curPix.w; tx += factor * (float)curPix.x; ty += factor * (float)curPix.y; tz += factor * (float)curPix.z; sum += factor; } } t.w = (UCHAR)(tw/sum); t.x = (UCHAR)(tx/sum); t.y = (UCHAR)(ty/sum); t.z = (UCHAR)(tz/sum); od[y * w + x] = (((UINT)t.w)<<24|((UINT)t.z)<<16|((UINT)t.y)<<8|((UINT)t.x));}extern "C"void updateGaussian(float sigma_color,float sigma_space, int radius){ if( sigma_color <= 0 ) sigma_color = 1; if( sigma_space <= 0 ) sigma_space = 1; double gauss_color_coeff = -0.5/(sigma_color*sigma_color); double gauss_space_coeff = -0.5/(sigma_space*sigma_space); float color_gaussian[4*256]; float space_gaussian[1024]; for(int i=0;i<256*4;i++) { color_gaussian[i] = (float)std::exp(i*i*gauss_color_coeff); space_gaussian[i] = (float)std::exp(i*i*gauss_space_coeff); //if(i>100) color_gaussian[i] = 0.0f; //漫画效果 }// for(int i = -radius,int maxk=0;i<radius;i++)// for(int j=-radius;j<radius;j++)// {// double r = sqrt((double)i*i + (double)j*j);// if( r > radius )// continue; // space_gaussian[maxk++] = (float)std::exp(r*r*gauss_space_coeff); // //space_ofs[maxk++] = (int)(i*temp.step + j*4); // } checkCudaErrors(cudaMemcpyToSymbol(color_weight, color_gaussian, sizeof(float)*(4*256))); checkCudaErrors(cudaMemcpyToSymbol(space_weight, space_gaussian, sizeof(float)*(1024)));}extern "C"void initTexture(int width, int height, uint *hImage){ // copy image data to array checkCudaErrors(cudaMallocPitch(&dImage, &pitch, sizeof(UINT)*width, height)); checkCudaErrors(cudaMallocPitch(&dTemp, &pitch, sizeof(UINT)*width, height)); checkCudaErrors(cudaMemcpy2D(dImage, pitch, hImage, sizeof(UINT)*width, sizeof(UINT)*width, height, cudaMemcpyHostToDevice));}extern "C"void freeTextures(){ checkCudaErrors(cudaFree(dImage)); checkCudaErrors(cudaFree(dTemp));}// RGBA versionextern "C"double bilateralFilterRGBA(uint *dDest, int width, int height, int radius, int iterations, StopWatchInterface *timer){ // var for kernel computation timing double dKernelTime; // Bind the array to the texture cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>(); checkCudaErrors(cudaBindTexture2D(0, rgbaTex, dImage, desc, width, height, pitch)); for (int i=0; i<iterations; i++) { // sync host and start kernel computation timer dKernelTime = 0.0; checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&timer); dim3 blockSize(16, 16); dim3 gridSize((width + 16 - 1) / 16, (height + 16 - 1) / 16); d_bilateral_filter<<< gridSize, blockSize>>>(dDest, width, height, radius); // sync host and stop computation timer checkCudaErrors(cudaDeviceSynchronize()); dKernelTime += sdkGetTimerValue(&timer); } return ((dKernelTime/1000.)/(double)iterations);}CUDA运行结果:pace_sigma 和 color_sigma正向影响平滑力度,取值越大,平滑越明显。pace_sigma = 10 ;color_sigma=10; pace_sigma = 150 ;color_sigma=150; 漫画效果在进行双边滤波时,发现有时会出现类似漫画的效果,物体的边缘有黑边 原理:这是由于当颜色差距过大时,值域核为0,颜色和空间高斯值差距过大时,定义域核和值域核为0(定义域核一般不会为0) 修改代码实现漫画效果:计算高斯核数组时,void updateGaussian( ) 加入:if(i>100) color_gaussian[i] = 0.0f; //漫画效果计算颜色距离时,colorLenGaussian(uchar4 a, uchar4 b),去掉sqrt
相关文章推荐
- 一个人是否能成为程序员是上天注定的
- 低版本浏览器跳转
- 双边滤波CUDA优化——BilateralFilter CUDA
- Behavioral模式之Command模式
- K2新网站(官网和BPM社区)正式上线了
- 期待,下一次长途旅行
- 设置共享实验准备步骤
- 设置过滤器过滤一个月以外的内容
- SF2.0_责任-体谅-和谐-专注-适应_2015.06.20
- Delphi FireDAC 连接mysql
- Java设计模式——六大原则之最少知道
- 微信sdk分享功能详解
- 基于 ANSIBLE 自动化运维实践
- wpf,ListBox,ScrollViewer内容向左向右偏移指定位置
- VC获得window操作系统版本号, 获取操作系统位数
- IP地址在数据库里面的存储方式
- PhpStorm8.0.3配置Xdebug调试PHP程序
- java上传图片
- HADOOP 格式化 namenode节点 准备脚本
- Android BroadcastReceiver详细介绍