您的位置:首页 > 编程语言

OpenCL 优化后的卷积代码

2016-01-07 19:49 197 查看
自己修改过的卷积模板,可以直接运行。

/*
 * Load a 10x6 tile of image samples into local memory.
 *
 * The tile is indexed rgb[x + 1][y + 1]: the sample stored there is read from
 * image coordinate gid + (x, y), for x in [-1, 8] and y in [-1, 4]. That is an
 * 8x4 block of pixels plus a one-pixel border on every side.
 *
 * src - image the samples are read from
 * s   - sampler handed to read_imagef
 * rgb - local-memory tile that receives the 60 samples
 * gid - base image coordinate of the tile (sample (0, 0) of the block)
 * lid - local id of the calling work-item
 *
 * The function ends with a local-memory barrier, so every work-item of the
 * work-group has to call it.
 */
void prefetch_texture_samples_8x4(image2d_t src, sampler_t s, __local float4 rgb [10][6], int2 gid, int2 lid)
{
    // Exactly one work-item, local id (0, 0), fetches all 60 samples.
    // Testing lid.x alone would let one work-item per row repeat the whole
    // fetch and write the same local tile concurrently.
    if (lid.x == 0 && lid.y == 0) {
        for (int i = -1; i < 9; i++) {
            for (int j = -1; j < 5; j++) {
                rgb[i + 1][j + 1] = read_imagef(src, s, gid + (int2)(i, j));
            }
        }
    }

    // Make the tile visible to the whole work-group before anyone reads it.
    barrier(CLK_LOCAL_MEM_FENCE);
}

/*
 * Cooperative version of the 10x6 tile prefetch: the first 30 work-items of
 * the group each load two samples instead of one work-item loading all 60.
 *
 * The tile is indexed rgb[x][y] with x in [0, 9] and y in [0, 5]; rgb[x][y]
 * receives the sample at image coordinate
 * (group_id(0) * 8 + x - 1, group_id(1) * 4 + y - 1).
 *
 * src - image the samples are read from
 * s   - sampler handed to read_imagef
 * rgb - local-memory tile that receives the 60 samples
 *
 * The local id is flattened as local_id(1) * 8 + local_id(0), so this assumes
 * a work-group that is 8 work-items wide with at least 30 work-items in total.
 * The function ends with a local-memory barrier, so every work-item of the
 * work-group has to call it.
 */
void prefetch_8x4_optimized(image2d_t src, sampler_t s, __local float4 rgb[10][6])
{
    // Image coordinate covered by work-item (0, 0) of this group in the NDRange
    int2 wi0Coord = (int2)(get_group_id(0)*8, get_group_id(1)*4);

    // 2D to 1D address (from 8x4 to 32x1)
    int flatLocal = get_local_id(1)*8 + get_local_id(0);

    // Only the first 30 work-items load; each loads 2 values in sequence
    if (flatLocal < 30)
    {
        /* Convert the flat 1D id to a 2D position inside a 10x3 half-tile */
        int i = flatLocal % 10; // x position in the tile, 0..9
        int j = flatLocal / 10; // y position in the half-tile, 0..2

        /* 30 work-items read the top half of the tile: rows y = 0..2.
         * The first index is x (size 10) and the second is y (size 6).
         */
        rgb[i][j] = read_imagef(src, s, (int2)(wi0Coord.x + i - 1, wi0Coord.y + j - 1));

        /* The same 30 work-items read the bottom half: rows y = 3..5 */
        rgb[i][j + 3] = read_imagef(src, s, (int2)(wi0Coord.x + i - 1, wi0Coord.y + j + 3 - 1));
    }

    // Make the tile visible to the whole work-group before anyone reads it.
    barrier(CLK_LOCAL_MEM_FENCE);
}

/*
 * 3x3 weighted convolution of src into dst, one output pixel per work-item.
 *
 * Each 8x4 work-group first stages its 10x6 neighbourhood (the 8x4 block plus
 * a one-pixel border) in local memory, then every work-item accumulates its
 * nine neighbours from that tile.
 *
 * src    - input image
 * dst    - output image; pixel (global_id(0), global_id(1)) is written
 * s      - sampler used for all reads from src
 * weight - 9 coefficients in row-major order: weight[(j + 1) * 3 + (i + 1)]
 *          multiplies the neighbour at offset (i, j), with i, j in [-1, 1]
 *
 * The weighted sum is divided by 9 before it is written.
 */
__attribute__((reqd_work_group_size(8, 4, 1)))
__kernel void blur (__read_only image2d_t src, __write_only image2d_t dst, sampler_t s, __global const float *weight)
{
    // Image coordinate of the first pixel handled by this work-group.
    int2 gid = (int2)(get_group_id(0)*8, get_group_id(1)*4);
    int2 lid = (int2)(get_local_id(0),   get_local_id(1));
    float4 pixel = 0.0f;

    __local float4 rgb[10][6];
    prefetch_texture_samples_8x4(src, s, rgb, gid, lid);

    // The tile is indexed [x + 1][y + 1], so the neighbour at offset (i, j)
    // of this work-item is rgb[lid.x + 1 + i][lid.y + 1 + j].
    for (int j = -1; j <= 1; j++) {
        for (int i = -1; i <= 1; i++) {
            pixel += rgb[lid.x + 1 + i][lid.y + 1 + j] * weight[(j + 1) * 3 + (i + 1)];
        }
    }

    int x = get_global_id(0);
    int y = get_global_id(1);
    write_imagef(dst, (int2)(x, y), pixel/9.f);
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: