您的位置:首页 > 其它

图像均值滤波的CUDA并行化优化

2017-08-18 16:26 471 查看

1、算法原理

均值滤波也是线性滤波,目标点的像素为周围(模板覆盖)像素的平均值。对图像进行均值滤波处理时,每一个像素点的处理与其它像素点无关,所以,可以把对于每一个像素的处理映射到每个线程中,从而实现并行化。

2、并行思路

将像素映射到二维坐标空间,然后使用i*WIDTH+j的方式索引像素。本实例使用3×3模板,即取目标像素自身及其周围8个像素共9个点的值。每一个线程负责一个像素:读取它的邻域像素,然后对其求平均值。为了简化计算,图像最外圈的边界像素不做滤波,直接复制原图的像素值。代码中采用的block size为256×1。

3、CPU实现代码

C++读取BMP图片的代码如下,这里不再细讲,需要深入了解的可以去网上查阅相关博客。

readImage.h

#pragma once
#include <stdint.h>

// Fixed-width aliases for the field types used by the BMP header structs below.
// The headers are read from and written to disk as raw bytes, so every alias must
// have the exact width of the corresponding file field on every platform. In
// particular `long` is 8 bytes on LP64 systems (64-bit Linux/macOS), which would
// silently change the size of BITMAPINFOHEADER; int32_t is always 4 bytes.
typedef uint8_t BYTE;    // 1-byte unsigned field
typedef uint16_t WORD;   // 2-byte unsigned field
typedef uint32_t DWORD;  // 4-byte unsigned field
typedef int32_t LONG;    // 4-byte signed field

// Bitmap file header.
// The file-type field is intentionally NOT part of this struct: per the original
// author's note, the struct's in-memory layout would otherwise prevent the header
// from being read correctly as one block.
typedef struct  tagBITMAPFILEHEADER {
//WORD bfType;// file type, must be 0x424D, i.e. the characters "BM"
DWORD bfSize;// file size
WORD bfReserved1;// reserved word
WORD bfReserved2;// reserved word
DWORD bfOffBits;// byte offset from the file header to the actual bitmap data
}BITMAPFILEHEADER;

// Bitmap information header; field order and widths mirror the on-disk layout.
typedef struct tagBITMAPINFOHEADER {
DWORD biSize;// size of this information header
LONG biWidth;// image width
LONG biHeight;// image height
WORD biPlanes;// number of bit planes, must be 1
WORD biBitCount;// bits per pixel
DWORD  biCompression; // compression type
DWORD  biSizeImage; // size of the compressed image in bytes
LONG  biXPelsPerMeter; // horizontal resolution
LONG  biYPelsPerMeter; // vertical resolution
DWORD  biClrUsed; // number of colors actually used by the bitmap
DWORD  biClrImportant; // number of important colors in the bitmap
}BITMAPINFOHEADER; // bitmap information header definition

// One palette entry: four single-byte fields in blue, green, red, reserved order.
typedef struct tagRGBQUAD {
BYTE rgbBlue; // blue component of this color
BYTE rgbGreen; // green component of this color
BYTE rgbRed; // red component of this color
BYTE rgbReserved; // reserved value
}RGBQUAD;// palette entry definition

// Pixel information: a single one-byte channel per pixel.
// NOTE(review): not referenced by any other code visible in this listing.
typedef struct tagIMAGEDATA
{
BYTE blue;
}IMAGEDATA;

// Reads the BMP file at `path` and returns its pixel bytes in a malloc'ed buffer
// (the caller releases it with free()), or NULL on failure. On success `width`
// receives the row length in bytes rounded up to a multiple of 4 and `height`
// receives the number of rows.
unsigned char* readImageData(const char* path, int& width, int& height);

// Writes `imagedata` (`width` * `height` bytes, row by row) to `path` as a BMP,
// reusing the headers and palette captured by the last readImageData() call.
void saveImageData(const char* path, int width, int height, unsigned char* imagedata);

// Print the most recently read file header / information header to stdout.
void showBmpHead();
void showBmpInforHead();
readImage.cpp

#include <stdio.h>
#include "readImage.h"
#include "stdlib.h"
#include "math.h"
#include <iostream>

#define PI 3.14159// pi (macro definition)
#define LENGTH_NAME_BMP 30// maximum length of a bmp image file name

using namespace std;

// Variable definitions: file-scope state filled by readImageData() and reused by
// saveImageData() and the show*() helpers.
BITMAPFILEHEADER strHead;
RGBQUAD strPla[256];// 256-color palette
BITMAPINFOHEADER strInfo;

//显示位图文件头信息
void showBmpHead() {
cout << "位图文件头:" << endl;
cout << "文件大小:" << strHead.bfSize << endl;
cout << "保留字_1:" << strHead.bfReserved1 << endl;
cout << "保留字_2:" << strHead.bfReserved2 << endl;
cout << "实际位图数据的偏移字节数:" << strHead.bfOffBits << endl << endl;
}

void showBmpInforHead() {
cout << "位图信息头:" << endl;
cout << "结构体的长度:" << strInfo.biSize << endl;
cout << "位图宽:" << strInfo.biWidth << endl;
cout << "位图高:" << strInfo.biHeight << endl;
cout << "biPlanes平面数:" << strInfo.biPlanes << endl;
cout << "biBitCount采用颜色位数:" << strInfo.biBitCount << endl;
cout << "压缩方式:" << strInfo.biCompression << endl;
cout << "biSizeImage实际位图数据占用的字节数:" << strInfo.biSizeImage << endl;
cout << "X方向分辨率:" << strInfo.biXPelsPerMeter << endl;
cout << "Y方向分辨率:" << strInfo.biYPelsPerMeter << endl;
cout << "使用的颜色数:" << strInfo.biClrUsed << endl;
cout << "重要颜色数:" << strInfo.biClrImportant << endl;
}

unsigned char* readImageData(const char* path, int& width, int& height) {
unsigned char* imagedata = NULL;//动态分配存储原图片的像素信息的二维数组
FILE *fpi;
fpi = fopen(path, "rb");
if (!fpi) {
cout << "file open error!" << endl;
return NULL;
}
else
{
//先读取文件类型
WORD bfType;
fread(&bfType, 1, sizeof(WORD), fpi);
if (0x4d42 != bfType)
{
cout << "the file is not a bmp file!" << endl;
return NULL;
}
//读取bmp文件的文件头和信息头
fread(&strHead, sizeof(tagBITMAPFILEHEADER), 1, fpi);
fread(&strInfo, sizeof(tagBITMAPINFOHEADER), 1, fpi);
//showBmpInforHead(strInfo);//显示文件信息头

//读取调色板
for (unsigned int nCounti = 0; nCounti<strInfo.biClrUsed; nCounti++)
{
fread((char *)&(strPla[nCounti].rgbBlue), sizeof(BYTE), 1, fpi);
fread((char *)&(strPla[nCounti].rgbGreen), sizeof(BYTE), 1, fpi);
fread((char *)&(strPla[nCounti].rgbRed), sizeof(BYTE), 1, fpi);
fread((char *)&(strPla[nCounti].rgbReserved), sizeof(BYTE), 1, fpi);
}

width = strInfo.biWidth;
height = strInfo.biHeight;
//图像每一行的字节数必须是4的整数倍
width = (width * sizeof(unsigned char) + 3) / 4 * 4;
imagedata = (unsigned char*)malloc(width * height * sizeof(unsigned char));

//初始化原始图片的像素数组
for (int i = 0; i < height; ++i)
{
for (int j = 0; j < width; ++j)
{
(*(imagedata + i * width + j)) = 0;
}
}
//读出图片的像素数据
fread(imagedata, sizeof(unsigned char) * width, height, fpi);
fclose(fpi);
return imagedata;
}
}

void saveImageData(const char* path, int width, int height, unsigned char* imagedata) {
FILE *fpw;
//保存bmp图片
if ((fpw = fopen(path, "wb")) == NULL)
{
cout << "create the bmp file error!" << endl;
return;
}
WORD bfType_w = 0x4d42;
fwrite(&bfType_w, sizeof(WORD), 1, fpw);
//保存文件头
fwrite(&strHead, sizeof(tagBITMAPFILEHEADER), 1, fpw);
strInfo.biWidth = width;
strInfo.biHeight = height;
fwrite(&strInfo, sizeof(tagBITMAPINFOHEADER), 1, fpw);
//保存调色板数据
for (unsigned int nCounti = 0; nCounti<strInfo.biClrUsed; nCounti++)
{
fwrite(&strPla[nCounti].rgbBlue, sizeof(BYTE), 1, fpw);
fwrite(&strPla[nCounti].rgbGreen, sizeof(BYTE), 1, fpw);
fwrite(&strPla[nCounti].rgbRed, sizeof(BYTE), 1, fpw);
fwrite(&strPla[nCounti].rgbReserved, sizeof(BYTE), 1, fpw);
}
//保存像素数据
for (int i = 0; i < height; ++i)
{
for (int j = 0; j < width; ++j)
{
fwrite(&((*(imagedata + i * width + j))), 1, sizeof(unsigned char), fpw);
}
}
fclose(fpw);
}

CPU 实现均值滤波代码:

kernel.cu

// CPU reference implementation of the 3x3 mean filter.
//
// srcData : input image, width*height bytes, row-major.
// dstData : output image of the same size (must not be NULL).
// width   : bytes per row.
// height  : number of rows.
//
// Every pixel that has all eight neighbours is replaced by the truncated average of
// its 3x3 neighbourhood; the outermost rows and columns are copied unchanged. The
// loop is timed with CUDA events and the elapsed time is printed to stdout. Invalid
// arguments make the function return without touching dstData.
extern "C" void cpuSmoothImage(unsigned char* srcData, unsigned char* dstData, int width, int height) {
    if (srcData == NULL || dstData == NULL || width <= 0 || height <= 0) {
        return;
    }

    // Start from a copy of the source so the border keeps its original values.
    // size_t arithmetic avoids int overflow for large images.
    memcpy(dstData, srcData, (size_t)width * (size_t)height * sizeof(unsigned char));

    cudaEvent_t d_begin = NULL;
    cudaEvent_t d_end = NULL;
    bool timed = cudaEventCreate(&d_begin) == cudaSuccess
              && cudaEventCreate(&d_end) == cudaSuccess
              && cudaEventRecord(d_begin, 0) == cudaSuccess;

    for (int i = 1; i < height - 1; i++) {
        const unsigned char* above = srcData + (size_t)(i - 1) * width;
        const unsigned char* row = above + width;
        const unsigned char* below = row + width;
        unsigned char* out = dstData + (size_t)i * width;
        for (int j = 1; j < width - 1; j++) {
            // Nine byte values sum to at most 2295, which float represents exactly,
            // so the summation order does not affect the result.
            float temp = 0.0f;
            for (int k = -1; k <= 1; k++) {
                temp += above[j + k];
                temp += row[j + k];
                temp += below[j + k];
            }
            out[j] = (unsigned char)(temp / 9);  // truncates, like the GPU kernels
        }
    }

    float cpuTime = 0.0f;
    timed = timed
         && cudaEventRecord(d_end, 0) == cudaSuccess
         && cudaEventSynchronize(d_end) == cudaSuccess
         && cudaEventElapsedTime(&cpuTime, d_begin, d_end) == cudaSuccess;
    if (timed) {
        printf(">>>CPU Time is : %f ms\n", cpuTime);
    } else {
        // The filtered image is still valid; only the measurement is missing.
        fprintf(stderr, "cpuSmoothImage: timing unavailable: %s\n",
                cudaGetErrorString(cudaGetLastError()));
    }

    // Events are runtime resources and were previously never released.
    if (d_begin != NULL) cudaEventDestroy(d_begin);
    if (d_end != NULL) cudaEventDestroy(d_end);
}


4、GPU全局内存实现

kernel.cu

// 3x3 mean filter reading from global memory; one thread per pixel.
//
// Launch layout: the column comes from the x dimension (threadIdx.x, blockIdx.x)
// and the row from blockIdx.y alone, so the grid needs one block row per image row
// (gridDim.y >= height) and blockDim.y == 1. gridDim.x * blockDim.x may exceed
// width; surplus threads exit immediately. No shared memory is used.
//
// width, height : image size in pixels (one byte per pixel, row-major).
// srcData       : device pointer to the input image.
// dstData       : device pointer to the output image; must not alias srcData.
//
// Pixels with all eight neighbours receive the truncated neighbourhood average;
// the outermost rows and columns are copied from the source.
__global__ void kernelGPU(int width, int height, unsigned char* srcData, unsigned char* dstData)
{
    const int i = threadIdx.x + blockDim.x * blockIdx.x;  // column
    const int j = blockIdx.y;                             // row

    // Threads past the right or bottom edge must not touch memory. Without this
    // guard their flat index lands in the following row (racing with the thread
    // that owns that pixel) or beyond the end of the buffers.
    if (i >= width || j >= height) {
        return;
    }

    const int pos = j * width + i;

    if (i > 0 && (i < width - 1) && j > 0 && (j < height - 1)) {
        // Nine byte values sum to at most 2295, exactly representable in float.
        float temp = 0.0f;
        for (int dy = -1; dy <= 1; dy++) {
            const int base = pos + dy * width;
            temp += srcData[base - 1];
            temp += srcData[base];
            temp += srcData[base + 1];
        }
        dstData[pos] = (unsigned char)(temp / 9);
    }
    else {
        // Border pixels are kept as they are.
        dstData[pos] = srcData[pos];
    }
}

// Host wrapper: runs the global-memory 3x3 mean filter (kernelGPU) on an image.
//
// width, height : image size in pixels (one byte per pixel, row-major).
// srcData       : host pointer to the input image (width*height bytes).
// dstData       : host pointer receiving the filtered image (same size).
//
// The measured interval covers device allocation, both copies and the kernel, and
// is printed to stdout on success. Everything runs on the default stream with
// blocking copies. If any CUDA call fails, the failing step is reported on stderr,
// the remaining steps are skipped and dstData may be left unmodified. Device
// buffers and events are released on every path.
extern "C" void gpuSmoothImage(int width, int height, unsigned char* srcData, unsigned char* dstData) {
    if (srcData == NULL || dstData == NULL || width <= 0 || height <= 0) {
        fprintf(stderr, "gpuSmoothImage: invalid arguments\n");
        return;
    }

    // size_t arithmetic avoids int overflow for large images.
    const size_t size = (size_t)width * (size_t)height * sizeof(unsigned char);

    // 256x1 blocks: x covers the columns (rounded up), y supplies one block per row.
    const dim3 blockSize(256, 1);
    const dim3 gridSize((width + blockSize.x - 1) / blockSize.x, height);

    cudaEvent_t d_begin = NULL;
    cudaEvent_t d_end = NULL;
    unsigned char* d_srcData = NULL;
    unsigned char* d_dstData = NULL;
    float gpuTime = 0.0f;

    // Each step runs only if all previous ones succeeded; `step` names the call
    // that is currently being attempted for the error message.
    const char* step = "cudaEventCreate";
    cudaError_t err = cudaEventCreate(&d_begin);
    if (err == cudaSuccess) {
        err = cudaEventCreate(&d_end);
    }
    if (err == cudaSuccess) {
        step = "cudaEventRecord(begin)";
        err = cudaEventRecord(d_begin, 0);
    }
    if (err == cudaSuccess) {
        step = "cudaMalloc(src)";
        err = cudaMalloc((void**)&d_srcData, size);
    }
    if (err == cudaSuccess) {
        step = "cudaMemcpy(host to device)";
        err = cudaMemcpy(d_srcData, srcData, size, cudaMemcpyHostToDevice);
    }
    if (err == cudaSuccess) {
        step = "cudaMalloc(dst)";
        err = cudaMalloc((void**)&d_dstData, size);
    }
    if (err == cudaSuccess) {
        step = "kernelGPU launch";
        kernelGPU<<<gridSize, blockSize>>>(width, height, d_srcData, d_dstData);
        // A launch does not return a status; invalid configurations (for example a
        // grid that is too tall) only show up through cudaGetLastError().
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) {
        step = "cudaMemcpy(device to host)";
        // Blocking copy on the default stream: also waits for the kernel and
        // surfaces any error raised while it ran.
        err = cudaMemcpy(dstData, d_dstData, size, cudaMemcpyDeviceToHost);
    }
    if (err == cudaSuccess) {
        step = "cudaEventRecord(end)";
        err = cudaEventRecord(d_end, 0);
    }
    if (err == cudaSuccess) {
        step = "cudaEventSynchronize";
        err = cudaEventSynchronize(d_end);
    }
    if (err == cudaSuccess) {
        step = "cudaEventElapsedTime";
        err = cudaEventElapsedTime(&gpuTime, d_begin, d_end);
    }

    if (err == cudaSuccess) {
        printf(">>>GPU Time is : %f ms\n", gpuTime);
    } else {
        fprintf(stderr, "gpuSmoothImage: %s failed: %s\n", step, cudaGetErrorString(err));
    }

    // cudaFree accepts a null pointer, so no guards are needed here.
    cudaFree(d_srcData);
    cudaFree(d_dstData);
    if (d_begin != NULL) cudaEventDestroy(d_begin);
    if (d_end != NULL) cudaEventDestroy(d_end);
}

调用方式  main.cpp

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "readImage.h"

// Filter entry points with C linkage, declared here for main().
// Note the parameter order: the GPU variants take (width, height, src, dst) while
// the CPU variant takes (src, dst, width, height).
extern "C" void gpuSmoothImage(int width, int height, unsigned char* srcData, unsigned char* dstData);
extern "C" void cpuSmoothImage(unsigned char* srcData, unsigned char* dstData, int width, int height);
extern "C" void gpuSmoothImageTexture(int width, int height, unsigned char* srcData, unsigned char* dstData);

// Demo driver: loads lena.bmp, smooths it once on the CPU and once on the GPU, and
// writes the results to saveCPU.bmp and saveGPU.bmp.
// Returns 0 on success and 1 if the input cannot be read or memory runs out.
int main() {
    const char* path = "lena.bmp";
    int Width = 0, Height = 0;
    unsigned char* readData = readImageData(path, Width, Height);
    if (readData == NULL) {
        // readImageData has already reported the reason; Width/Height are not valid.
        fprintf(stderr, "failed to load %s\n", path);
        return 1;
    }
    printf("w:%d h:%d\n", Width, Height);
    //showBmpHead();
    //showBmpInforHead();

    // Both output buffers are zero-filled so that a failed filter run never causes
    // uninitialized memory to be written to disk.
    const size_t pixelCount = (size_t)Width * (size_t)Height;
    unsigned char* smoothData = (unsigned char*)calloc(pixelCount, sizeof(unsigned char));
    unsigned char* result = (unsigned char*)calloc(pixelCount, sizeof(unsigned char));
    if (smoothData == NULL || result == NULL) {
        fprintf(stderr, "out of memory\n");
        free(result);
        free(smoothData);
        free(readData);
        return 1;
    }

    //CPU smooth Image
    cpuSmoothImage(readData, smoothData, Width, Height);
    const char* savePath = "saveCPU.bmp";
    saveImageData(savePath, Width, Height, smoothData);

    //GPU smooth Image
    const char* savePath1 = "saveGPU.bmp";
    gpuSmoothImage(Width, Height, readData, result);
    //gpuSmoothImageTexture(Width, Height, readData, result);
    saveImageData(savePath1, Width, Height, result);

    // To compare the two results by eye, print the first few values of
    // smoothData[i*Width + j] and result[i*Width + j] here.

    free(result);
    free(smoothData);
    free(readData);
    return 0;
}
实验结果:









可以看到GPU和CPU的平滑结果完全一样,我们再看下并行加速的效果,图片在1024的时候加速了14.67倍,效果还是很好的。



5、纹理内存

因为我们是按照行存储的方式读取数据,所以使用一维纹理内存。对于纹理内存主要是三个步骤:纹理参考声明,纹理数据绑定,纹理拾取。

可以参考本人以前博客:http://blog.csdn.net/zhangfuliang123/article/details/76571498

代码如下:

// File-scope 1D texture reference over unsigned char elements, read back as raw
// element values (cudaReadModeElementType). It is bound to the source image by
// gpuSmoothImageTexture and fetched with tex1Dfetch in kernelGPUTexture.
// NOTE(review): this is the legacy texture-reference API; newer CUDA toolkits
// replace it with texture objects - confirm against the toolkit in use.
texture<unsigned char, 1, cudaReadModeElementType> texRef;

// 3x3 mean filter reading the source image through the 1D texture `texRef`; one
// thread per pixel.
//
// Launch layout: the column comes from the x dimension (threadIdx.x, blockIdx.x)
// and the row from blockIdx.y alone, so the grid needs one block row per image row
// (gridDim.y >= height) and blockDim.y == 1. gridDim.x * blockDim.x may exceed
// width; surplus threads exit immediately. No shared memory is used.
//
// Precondition: texRef is bound to the width*height-byte source image.
//
// width, height : image size in pixels (one byte per pixel, row-major).
// dstData       : device pointer to the output image.
//
// Pixels with all eight neighbours receive the truncated neighbourhood average;
// the outermost rows and columns are copied from the source.
__global__ void kernelGPUTexture(int width, int height, unsigned char* dstData)
{
    const int i = threadIdx.x + blockDim.x * blockIdx.x;  // column
    const int j = blockIdx.y;                             // row

    // Threads past the right or bottom edge must not write. Without this guard
    // their flat index lands in the following row (racing with the thread that
    // owns that pixel) or beyond the end of dstData.
    if (i >= width || j >= height) {
        return;
    }

    const int pos = j * width + i;

    if (i > 0 && (i < width - 1) && j > 0 && (j < height - 1)) {
        // Nine byte values sum to at most 2295, exactly representable in float.
        float temp = 0.0f;
        for (int dy = -1; dy <= 1; dy++) {
            const int base = pos + dy * width;
            temp += tex1Dfetch(texRef, base - 1);
            temp += tex1Dfetch(texRef, base);
            temp += tex1Dfetch(texRef, base + 1);
        }
        dstData[pos] = (unsigned char)(temp / 9);
    }
    else {
        // Border pixels are kept as they are.
        dstData[pos] = tex1Dfetch(texRef, pos);
    }
}

// Host wrapper: runs the texture-based 3x3 mean filter (kernelGPUTexture).
//
// width, height : image size in pixels (one byte per pixel, row-major).
// srcData       : host pointer to the input image (width*height bytes).
// dstData       : host pointer receiving the filtered image (same size).
//
// The source image is copied to the device and bound to the file-scope texture
// `texRef` for the duration of the call. The measured interval covers allocation,
// both copies, the bind and the kernel, and is printed to stdout on success.
// Everything runs on the default stream with blocking copies. If any CUDA call
// fails, the failing step is reported on stderr, the remaining steps are skipped
// and dstData may be left unmodified. The texture is unbound and all device
// buffers and events are released on every path.
extern "C" void gpuSmoothImageTexture(int width, int height, unsigned char* srcData, unsigned char* dstData) {
    if (srcData == NULL || dstData == NULL || width <= 0 || height <= 0) {
        fprintf(stderr, "gpuSmoothImageTexture: invalid arguments\n");
        return;
    }

    // size_t arithmetic avoids int overflow for large images.
    const size_t size = (size_t)width * (size_t)height * sizeof(unsigned char);

    // 256x1 blocks: x covers the columns (rounded up), y supplies one block per row.
    const dim3 blockSize(256, 1);
    const dim3 gridSize((width + blockSize.x - 1) / blockSize.x, height);

    cudaEvent_t d_begin = NULL;
    cudaEvent_t d_end = NULL;
    unsigned char* d_srcData = NULL;
    unsigned char* d_dstData = NULL;
    bool textureBound = false;
    float gpuTime = 0.0f;

    // Each step runs only if all previous ones succeeded; `step` names the call
    // that is currently being attempted for the error message.
    const char* step = "cudaEventCreate";
    cudaError_t err = cudaEventCreate(&d_begin);
    if (err == cudaSuccess) {
        err = cudaEventCreate(&d_end);
    }
    if (err == cudaSuccess) {
        step = "cudaEventRecord(begin)";
        err = cudaEventRecord(d_begin, 0);
    }
    if (err == cudaSuccess) {
        step = "cudaMalloc(src)";
        err = cudaMalloc((void**)&d_srcData, size);
    }
    if (err == cudaSuccess) {
        step = "cudaMemcpy(host to device)";
        err = cudaMemcpy(d_srcData, srcData, size, cudaMemcpyHostToDevice);
    }
    if (err == cudaSuccess) {
        step = "cudaMalloc(dst)";
        err = cudaMalloc((void**)&d_dstData, size);
    }
    if (err == cudaSuccess) {
        //bind texture; the explicit size limits the texture to the image buffer
        step = "cudaBindTexture";
        err = cudaBindTexture(0, texRef, d_srcData, size);
        textureBound = (err == cudaSuccess);
    }
    if (err == cudaSuccess) {
        step = "kernelGPUTexture launch";
        kernelGPUTexture<<<gridSize, blockSize>>>(width, height, d_dstData);
        // A launch does not return a status; invalid configurations only show up
        // through cudaGetLastError().
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) {
        step = "cudaMemcpy(device to host)";
        // Blocking copy on the default stream: also waits for the kernel and
        // surfaces any error raised while it ran.
        err = cudaMemcpy(dstData, d_dstData, size, cudaMemcpyDeviceToHost);
    }
    if (err == cudaSuccess) {
        step = "cudaEventRecord(end)";
        err = cudaEventRecord(d_end, 0);
    }
    if (err == cudaSuccess) {
        step = "cudaEventSynchronize";
        err = cudaEventSynchronize(d_end);
    }
    if (err == cudaSuccess) {
        step = "cudaEventElapsedTime";
        err = cudaEventElapsedTime(&gpuTime, d_begin, d_end);
    }

    if (err == cudaSuccess) {
        printf(">>>GPU Time is : %f ms\n", gpuTime);
    } else {
        fprintf(stderr, "gpuSmoothImageTexture: %s failed: %s\n", step, cudaGetErrorString(err));
    }

    // Unbind BEFORE freeing, so the texture never refers to released memory.
    if (textureBound) {
        cudaUnbindTexture(&texRef);
    }
    // cudaFree accepts a null pointer, so no guards are needed here.
    cudaFree(d_srcData);
    cudaFree(d_dstData);
    if (d_begin != NULL) cudaEventDestroy(d_begin);
    if (d_end != NULL) cudaEventDestroy(d_end);
}
使用纹理内存加速效果如下:



可以看出,使用纹理内存,加速效果并不是很理想,应该是使用一维纹理内存拾取不能达到很好的效果(不是非常确定)。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: