您的位置:首页 > 运维架构

CUDA总结:opencv图像金字塔函数分析

2016-12-26 17:46 363 查看
基于 opencv 3.1.0

相关源码

..\sources\modules\cudalegacy\src\image_pyramid.cpp

..\sources\modules\cudalegacy\src\cuda\NCVPyramid.cu



cv::cuda::ImagePyramidImpl

金字塔分解,调用层次:

cv::cuda::ImagePyramidImpl::ImagePyramidImpl ->

cv::cuda::device::pyramid::downsampleX2 ->

kernelDownsampleX2_gpu ->

kernelDownsampleX2 (cuda kernel)

/**
 * Produces one level of 2x downsampling: each destination pixel is built from
 * the 2x2 block of source pixels at rows (2*row, 2*row+1) and columns
 * (2*col, 2*col+1), combined by _average4.
 *
 * Launch layout: 2D grid of 2D blocks, one thread per destination pixel
 * (x -> column, y -> row). Threads that fall outside dstRoi do nothing.
 *
 * @param d_src     device pointer to the source image
 * @param srcPitch  source row stride in bytes (applied to a byte pointer)
 * @param d_dst     device pointer to the destination image
 * @param dstPitch  destination row stride in bytes
 * @param dstRoi    destination size in pixels
 *
 * NOTE(review): the kernel reads source row 2*row+1 and column 2*col+1 without
 * a bounds check, so it assumes the source is at least twice dstRoi in each
 * dimension - confirm against the caller.
 */
template<typename T>
__global__ void kernelDownsampleX2(T *d_src,    //d for device
                                   Ncv32u srcPitch,
                                   T *d_dst,
                                   Ncv32u dstPitch,
                                   NcvSize32u dstRoi)
{
    const Ncv32u row = blockIdx.y * blockDim.y + threadIdx.y;
    const Ncv32u col = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads mapped past the destination ROI have nothing to write.
    if (row >= dstRoi.height || col >= dstRoi.width)
    {
        return;
    }

    // Row addressing goes through byte pointers because the pitches are in bytes.
    Ncv8u *srcBytes = (Ncv8u *)d_src;
    Ncv8u *dstBytes = (Ncv8u *)d_dst;

    // Two consecutive source rows collapse into one destination row.
    T *upperRow = (T *)(srcBytes + (2 * row) * srcPitch);
    T *lowerRow = (T *)(srcBytes + (2 * row + 1) * srcPitch);
    T *outRow = (T *)(dstBytes + row * dstPitch);

    const Ncv32u leftCol = 2 * col;
    const Ncv32u rightCol = 2 * col + 1;

    T upperLeft = upperRow[leftCol];
    T upperRight = upperRow[rightCol];
    T lowerLeft = lowerRow[leftCol];
    T lowerRight = lowerRow[rightCol];

    // 2x decimation: the four neighbouring pixels are merged into one.
    // (A Gaussian filter would be an alternative to this plain average.)
    outRow[col] = _average4(upperLeft, upperRight, lowerLeft, lowerRight);
}


获得金字塔第n层图像,调用层次:

cv::cuda::ImagePyramidImpl::getLayer->

cv::cuda::device::pyramid::interpolateFrom1->

kernelInterpolateFrom1_gpu->

kernelInterpolateFrom1 (cuda kernel)

/**
 * Resamples the source image d_srcTop (size szTopRoi) to the destination size
 * dstRoi by interpolating between the four nearest source pixels: first along
 * x on two adjacent rows, then along y between those two results.
 *
 * Launch layout: 2D grid of 2D blocks, one thread per destination pixel
 * (x -> column, y -> row). Threads that fall outside dstRoi do nothing.
 *
 * Destination pixel (j, i) maps to source coordinate
 *   x = j * (szTopRoi.width  - 1) / (dstRoi.width  - 1)
 *   y = i * (szTopRoi.height - 1) / (dstRoi.height - 1)
 * so the first and last destination pixels land exactly on the first and last
 * source pixels. When the destination is only one pixel wide (or high) that
 * divisor is zero; the coordinate is then pinned to 0 instead of evaluating
 * 0/0, which would yield NaN and poison the interpolated pixel.
 *
 * @param d_srcTop     device pointer to the source image
 * @param srcTopPitch  source row stride in bytes
 * @param szTopRoi     source size in pixels
 * @param d_dst        device pointer to the destination image
 * @param dstPitch     destination row stride in bytes
 * @param dstRoi       destination size in pixels
 *
 * NOTE(review): assumes szTopRoi is non-empty (pixel [0][0] is always read) -
 * confirm against the caller.
 */
template<typename T>
__global__ void kernelInterpolateFrom1(T *d_srcTop,
                                       Ncv32u srcTopPitch,
                                       NcvSize32u szTopRoi,
                                       T *d_dst,
                                       Ncv32u dstPitch,
                                       NcvSize32u dstRoi)
{
    Ncv32u i = blockIdx.y * blockDim.y + threadIdx.y;
    Ncv32u j = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < dstRoi.height && j < dstRoi.width)
    {
        // Map the destination pixel to a fractional source coordinate.
        // Guard the divisor: a 1-pixel-wide/high destination would otherwise
        // compute 0.0f / 0.0f = NaN.
        Ncv32f ptTopX = dstRoi.width > 1
            ? 1.0f * (szTopRoi.width - 1) * j / (dstRoi.width - 1)
            : 0.0f;
        Ncv32f ptTopY = dstRoi.height > 1
            ? 1.0f * (szTopRoi.height - 1) * i / (dstRoi.height - 1)
            : 0.0f;

        // Integer neighbours (low/high) and the fractional offsets between them.
        Ncv32u xl = (Ncv32u)ptTopX;
        Ncv32u xh = xl + 1;
        Ncv32f dx = ptTopX - xl;
        Ncv32u yl = (Ncv32u)ptTopY;
        Ncv32u yh = yl + 1;
        Ncv32f dy = ptTopY - yl;

        // Interpolate between two adjacent source rows to obtain the new row.
        // Pitches are byte strides, hence the byte-pointer arithmetic.
        T *d_src_line1 = (T *)((Ncv8u *)d_srcTop + yl * srcTopPitch);
        T *d_src_line2 = (T *)((Ncv8u *)d_srcTop + yh * srcTopPitch);
        T *d_dst_line = (T *)((Ncv8u *)d_dst + i * dstPitch);

        // Neighbours past the right/bottom edge are not read; they fall back
        // to p00 instead.
        T p00, p01, p10, p11;
        p00 = d_src_line1[xl];
        p01 = xh < szTopRoi.width ? d_src_line1[xh] : p00;
        p10 = yh < szTopRoi.height ? d_src_line2[xl] : p00;
        p11 = (xh < szTopRoi.width && yh < szTopRoi.height) ? d_src_line2[xh] : p00;

        // Interpolate in a float vector type with the same channel count as T.
        typedef typename TConvBase2Vec<Ncv32f, NC(T)>::TVec TVFlt;
        TVFlt m_00_01 = _lerp<T, TVFlt>(p00, p01, dx);   // linear interpolation along x, upper row
        TVFlt m_10_11 = _lerp<T, TVFlt>(p10, p11, dx);   // linear interpolation along x, lower row
        TVFlt mixture = _lerp<TVFlt, TVFlt>(m_00_01, m_10_11, dy);   // then along y
        T outPix = _pixDemoteClampZ<TVFlt, T>(mixture);   // convert back to T with saturation

        d_dst_line[j] = outPix;
    }
}


线性插值

_lerp函数,支持多通道数据,下面是单通道的定义:

/**
 * Single-channel specialization of the linear-interpolation helper.
 *
 * Returns b*d + a*(1-d) evaluated on the .x component, converted to the scalar
 * base type of Tout and wrapped back into a pixel by _pixMake. Algebraically
 * this equals (b-a)*d + a.
 *
 * @param a  pixel returned when d == 0
 * @param b  pixel returned when d == 1
 * @param d  interpolation coefficient, intended to lie in [0, 1]: the larger
 *           it is, the closer the result is to b; the smaller, the closer to a
 */
template<typename Tin, typename Tout> struct __lerp_CN<Tin, Tout, 1> {
    static __host__ __device__ Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d)
    {
        // Scalar element type underlying the output pixel type.
        typedef typename TConvVec2Base<Tout>::TBase OutBase;

        // Weighted sum of the two samples, converted to the output scalar type.
        const OutBase blended = OutBase(b.x * d + a.x * (1 - d));
        return _pixMake(blended);
    }
};


能否利用纹理内存的硬件插值特性加速下采样和上采样处理?

金字塔的cuda实现在下采样时对2×2邻域的四个像素取平均,在获取某一层图像(getLayer)时采用双线性插值;而cpu实现则采用高斯滤波。因此两者的计算结果是不一样的:前者的实时性更好,但精度更差。

相比cpu版本的代码,cuda版本的代码并没有实现拉普拉斯金字塔,也没有金字塔重构。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  cuda opencv