矩阵转置的SSE汇编优化艺术以及ARM Cortex汇编优化
2013-01-03 14:05
288 查看
原文地址:/article/1645380.html
平时我们做图像处理或者视频处理, 很多地方会用到矩阵转置:
比如: DCT变换, 图像旋转, 图像滤波, 以及一些数据的内存行和列的交换等, 会大量使用转置这个动作.
然而由于数据量很大,处理速度很慢!如何来提高处理速度呢?
下面看看分析:
HEVC中有个地方是如下这样实现(直接行和列对应的位置交换):
[cpp]
view plaincopyprint?
Pel tmp;
for (k=0;k<blkSize-1;k++)
{
for (l=k+1;l<blkSize;l++)
{
tmp = pDst[k*dstStride+l];
pDst[k*dstStride+l] = pDst[l*dstStride+k];
pDst[l*dstStride+k] = tmp;
}
}
如何用汇编来实现呢?
我们先用SSE(这里用的是SSE2 intrinsics)来实现一个8X8的矩阵转置吧(每个元素16位, 一行8个元素正好占一个128位寄存器): 这里输入地址pSrc_128[i] 和输出地址pDst_128[i]可以相同也可以不同:
相同的话就是原地转置, 不同的话就是非原地转置.
[cpp]
view plaincopyprint?
// Transpose one 8x8 tile of 16-bit elements with SSE2 intrinsics.
// Eight 128-bit rows are read starting at pSrc_128[i] and eight transposed
// rows are written starting at pDst_128[i]. All eight loads are issued
// before the first store.
__m128i* m_pSrc_tmp = pSrc_128[i];
__m128i* m_pDst_tmp = pDst_128[i];
// Org_8_*: four source rows at a time; tttt*: interleave temporaries.
__m128i Org_8_0,Org_8_1, Org_8_2, Org_8_3;
__m128i tttt1,tttt2,tttt3,tttt4,tttt33,tttt44;
__m128i tttt5,tttt6, tttt7, tttt8;
// Destination row step in __m128i units (one __m128i holds 8 x 16-bit).
// NOTE(review): assumes dstStride is counted in 16-bit elements and is a
// multiple of 8 - confirm against the caller.
int stride_ii = dstStride>>3;
// Part one: load source rows 0-3.
// _mm_load_si128 is the aligned load, so every row address must be 16-byte
// aligned. The pointer type is __m128i*, so "+= 8" advances 8 * 16 = 128
// bytes, i.e. a fixed source row pitch of 64 16-bit elements.
Org_8_0 = _mm_load_si128(m_pSrc_tmp);
m_pSrc_tmp+=8;
Org_8_1 = _mm_load_si128(m_pSrc_tmp);
m_pSrc_tmp+=8;
Org_8_2 = _mm_load_si128(m_pSrc_tmp);
m_pSrc_tmp+=8;
Org_8_3 = _mm_load_si128(m_pSrc_tmp);
m_pSrc_tmp+=8;
// Interleave 16-bit elements of row pairs (0,1) and (2,3):
// tttt1/tttt2 take columns 0-3, tttt3/tttt4 take columns 4-7 (kept for part two).
tttt1 = _mm_unpacklo_epi16(Org_8_0, Org_8_1);
tttt2 = _mm_unpacklo_epi16(Org_8_2, Org_8_3);
tttt3 = _mm_unpackhi_epi16(Org_8_0, Org_8_1);
tttt4 = _mm_unpackhi_epi16(Org_8_2, Org_8_3);
// Interleave 32-bit pairs: tttt5 = columns 0-1 of rows 0-3,
// tttt6 = columns 2-3 of rows 0-3.
tttt5 = _mm_unpacklo_epi32(tttt1, tttt2);
tttt6 = _mm_unpackhi_epi32(tttt1, tttt2);
// Load source rows 4-7, reusing the Org_8_* variables.
Org_8_0 = _mm_load_si128(m_pSrc_tmp);
m_pSrc_tmp+=8;;
Org_8_1 = _mm_load_si128(m_pSrc_tmp);
m_pSrc_tmp+=8;
Org_8_2 = _mm_load_si128(m_pSrc_tmp);
m_pSrc_tmp+=8;
Org_8_3 = _mm_load_si128(m_pSrc_tmp);
// No pointer advance after the eighth row (left disabled by the author):
//m_pSrc_tmp+=8;
// Same interleave for rows 4-7; tttt1/tttt2 are overwritten (their old
// contents already live in tttt5/tttt6), tttt33/tttt44 keep columns 4-7.
tttt1 = _mm_unpacklo_epi16(Org_8_0, Org_8_1);
tttt2 = _mm_unpacklo_epi16(Org_8_2, Org_8_3);
tttt33 = _mm_unpackhi_epi16(Org_8_0, Org_8_1);
tttt44 = _mm_unpackhi_epi16(Org_8_2, Org_8_3);
// tttt7 = columns 0-1 of rows 4-7, tttt8 = columns 2-3 of rows 4-7.
tttt7 = _mm_unpacklo_epi32(tttt1, tttt2);
tttt8 = _mm_unpackhi_epi32(tttt1, tttt2);
// Join 64-bit halves: tttt1 = source column 0 over all eight rows,
// tttt2 = source column 1. They become output rows 0 and 1.
tttt1 = _mm_unpacklo_epi64(tttt5, tttt7);
tttt2 = _mm_unpackhi_epi64(tttt5, tttt7);
// _mm_storeu_si128 is the unaligned store, so the destination rows do not
// need 16-byte alignment.
_mm_storeu_si128(m_pDst_tmp, tttt1);
m_pDst_tmp+=stride_ii;
_mm_storeu_si128(m_pDst_tmp, tttt2);
m_pDst_tmp+=stride_ii;
// tttt5 = source column 2, tttt7 = source column 3 -> output rows 2 and 3.
tttt5 = _mm_unpacklo_epi64(tttt6, tttt8);
tttt7 = _mm_unpackhi_epi64(tttt6, tttt8);
_mm_storeu_si128(m_pDst_tmp, tttt5);
m_pDst_tmp+=stride_ii;
_mm_storeu_si128(m_pDst_tmp, tttt7);
m_pDst_tmp+=stride_ii;
// Part two: same steps for source columns 4-7, using the high halves saved
// in tttt3/tttt4 (rows 0-3) and tttt33/tttt44 (rows 4-7).
tttt5 = _mm_unpacklo_epi32(tttt3, tttt4);
tttt6 = _mm_unpackhi_epi32(tttt3, tttt4);
tttt7 = _mm_unpacklo_epi32(tttt33, tttt44);
tttt8 = _mm_unpackhi_epi32(tttt33, tttt44);
// tttt1 = source column 4, tttt2 = source column 5 -> output rows 4 and 5.
tttt1 = _mm_unpacklo_epi64(tttt5, tttt7);
tttt2 = _mm_unpackhi_epi64(tttt5, tttt7);
_mm_storeu_si128(m_pDst_tmp, tttt1);
m_pDst_tmp+=stride_ii;
_mm_storeu_si128(m_pDst_tmp, tttt2);
m_pDst_tmp+=stride_ii;
// tttt5 = source column 6, tttt7 = source column 7 -> output rows 6 and 7.
tttt5 = _mm_unpacklo_epi64(tttt6, tttt8);
tttt7 = _mm_unpackhi_epi64(tttt6, tttt8);
_mm_storeu_si128(m_pDst_tmp, tttt5);
m_pDst_tmp+=stride_ii;
_mm_storeu_si128(m_pDst_tmp, tttt7);
要实现的是NXN的转置,如何实现呢:
基于8X8来实现NXN的块或者图像的转置:
这里先把NXN划分为size_case×size_case个8X8的块(size_case = N/8), 然后循环调用8X8的转置!
[cpp]
view plaincopyprint?
// Build per-tile source/destination pointers for an NxN transpose that is
// carried out as a series of 8x8 tile transposes.
// Each table has 64 entries, so at most an 8x8 grid of tiles fits
// (blkSize must not exceed 64).
__m128i* pDst_128[64];
__m128i* pSrc_128[64];
// Number of 8x8 tiles along one side of the block.
int size_case = (blkSize>>3);
dstStride = dstStride_tmp;
for(int y = 0; y<size_case; y++)// map the addresses of all 8x8 tiles
for(int x = 0; x<size_case; x++)
{
// Source tile at tile-row y, tile-column x. The offset is in units of
// pDst's element type and uses a hard-coded source row pitch of 64.
// NOTE(review): this matches the "+= 8" (__m128i) row step of the 8x8
// kernel only for 16-bit elements - confirm pDst's element type.
pSrc_128[y*size_case + x] = (__m128i*)(pDst + 8*x + y*8*64);
// Destination tile at tile-row x, tile-column y (mirrored position),
// addressed with the runtime pitch dstStride.
pDst_128[y*size_case + x] = (__m128i*)(rpDst + 8*y + x*8*dstStride);
}
// From here on size_case is the total number of tiles, not tiles per side.
size_case = size_case*size_case;
for(int i = 0;i <size_case; i++)// run the transpose, one tile per iteration
{
// Placeholder left by the author: the 8x8 transpose code shown earlier
// in the article goes here (it reads pSrc_128[i] and pDst_128[i]).
8x8转置的代码:
}
通过比较, 用SSE汇编优化实现转置比用纯 C代码实现的转置速度快5倍左右!
同样在ARM Cortex(NEON)上的汇编优化也是基于这个原理:
主要循环体代码如下:
[cpp]
view plaincopyprint?
@ NEON register-only core of an 8x8 transpose of 16-bit elements
@ (comments use the GNU assembler '@' syntax).
@ NOTE(review): the loads/stores are not shown; presumably q8-q11 hold
@ rows 0-3 and q4-q7 hold rows 4-7 - confirm against the full routine.
@ Step 1: VTRN.16 transposes 2x2 groups of 16-bit elements between each register pair.
VTRN.16 q8, q9
VTRN.16 q10, q11
VTRN.16 q4, q5
VTRN.16 q6, q7
@ Step 2: VTRN.32 transposes 2x2 groups of 32-bit pairs, which completes
@ a 4x4 transpose inside every 64-bit half of the two register groups.
VTRN.32 q8, q10
VTRN.32 q9, q11
VTRN.32 q4, q6
VTRN.32 q5, q7
@ Step 3: exchange the off-diagonal 4x4 quadrants. d17/d19/d21/d23 are the
@ high halves of q8-q11; d8/d10/d12/d14 are the low halves of q4-q7.
VSWP d17, d8
VSWP d19, d10
VSWP d21, d12
VSWP d23, d14
感兴趣的可以自己调试下!
当然DSP上也是同样的方法, 只是涉及到的指令不同而已!
平时我们做图像处理或者视频处理, 很多地方会用到矩阵转置:
比如: DCT变换, 图像旋转, 图像滤波, 以及一些数据的内存行和列的交换等, 会大量使用转置这个动作.
然而由于数据量很大,处理速度很慢!如何来提高处理速度呢?
下面看看分析:
HEVC中有个地方是如下这样实现(直接行和列对应的位置交换):
[cpp]
view plaincopyprint?
Pel tmp;
for (k=0;k<blkSize-1;k++)
{
for (l=k+1;l<blkSize;l++)
{
tmp = pDst[k*dstStride+l];
pDst[k*dstStride+l] = pDst[l*dstStride+k];
pDst[l*dstStride+k] = tmp;
}
}
Pel tmp; for (k=0;k<blkSize-1;k++) { for (l=k+1;l<blkSize;l++) { tmp = pDst[k*dstStride+l]; pDst[k*dstStride+l] = pDst[l*dstStride+k]; pDst[l*dstStride+k] = tmp; } }
如何用汇编来实现呢?
我们先用SSE(这里用的是SSE2 intrinsics)来实现一个8X8的矩阵转置吧(每个元素16位, 一行8个元素正好占一个128位寄存器): 这里输入地址pSrc_128[i] 和输出地址pDst_128[i]可以相同也可以不同:
相同的话就是原地转置, 不同的话就是非原地转置.
[cpp]
view plaincopyprint?
// Transpose one 8x8 tile of 16-bit elements with SSE2 intrinsics.
// Eight 128-bit rows are read starting at pSrc_128[i] and eight transposed
// rows are written starting at pDst_128[i]. All eight loads are issued
// before the first store.
__m128i* m_pSrc_tmp = pSrc_128[i];
__m128i* m_pDst_tmp = pDst_128[i];
// Org_8_*: four source rows at a time; tttt*: interleave temporaries.
__m128i Org_8_0,Org_8_1, Org_8_2, Org_8_3;
__m128i tttt1,tttt2,tttt3,tttt4,tttt33,tttt44;
__m128i tttt5,tttt6, tttt7, tttt8;
// Destination row step in __m128i units (one __m128i holds 8 x 16-bit).
// NOTE(review): assumes dstStride is counted in 16-bit elements and is a
// multiple of 8 - confirm against the caller.
int stride_ii = dstStride>>3;
// Part one: load source rows 0-3.
// _mm_load_si128 is the aligned load, so every row address must be 16-byte
// aligned. The pointer type is __m128i*, so "+= 8" advances 8 * 16 = 128
// bytes, i.e. a fixed source row pitch of 64 16-bit elements.
Org_8_0 = _mm_load_si128(m_pSrc_tmp);
m_pSrc_tmp+=8;
Org_8_1 = _mm_load_si128(m_pSrc_tmp);
m_pSrc_tmp+=8;
Org_8_2 = _mm_load_si128(m_pSrc_tmp);
m_pSrc_tmp+=8;
Org_8_3 = _mm_load_si128(m_pSrc_tmp);
m_pSrc_tmp+=8;
// Interleave 16-bit elements of row pairs (0,1) and (2,3):
// tttt1/tttt2 take columns 0-3, tttt3/tttt4 take columns 4-7 (kept for part two).
tttt1 = _mm_unpacklo_epi16(Org_8_0, Org_8_1);
tttt2 = _mm_unpacklo_epi16(Org_8_2, Org_8_3);
tttt3 = _mm_unpackhi_epi16(Org_8_0, Org_8_1);
tttt4 = _mm_unpackhi_epi16(Org_8_2, Org_8_3);
// Interleave 32-bit pairs: tttt5 = columns 0-1 of rows 0-3,
// tttt6 = columns 2-3 of rows 0-3.
tttt5 = _mm_unpacklo_epi32(tttt1, tttt2);
tttt6 = _mm_unpackhi_epi32(tttt1, tttt2);
// Load source rows 4-7, reusing the Org_8_* variables.
Org_8_0 = _mm_load_si128(m_pSrc_tmp);
m_pSrc_tmp+=8;;
Org_8_1 = _mm_load_si128(m_pSrc_tmp);
m_pSrc_tmp+=8;
Org_8_2 = _mm_load_si128(m_pSrc_tmp);
m_pSrc_tmp+=8;
Org_8_3 = _mm_load_si128(m_pSrc_tmp);
// No pointer advance after the eighth row (left disabled by the author):
//m_pSrc_tmp+=8;
// Same interleave for rows 4-7; tttt1/tttt2 are overwritten (their old
// contents already live in tttt5/tttt6), tttt33/tttt44 keep columns 4-7.
tttt1 = _mm_unpacklo_epi16(Org_8_0, Org_8_1);
tttt2 = _mm_unpacklo_epi16(Org_8_2, Org_8_3);
tttt33 = _mm_unpackhi_epi16(Org_8_0, Org_8_1);
tttt44 = _mm_unpackhi_epi16(Org_8_2, Org_8_3);
// tttt7 = columns 0-1 of rows 4-7, tttt8 = columns 2-3 of rows 4-7.
tttt7 = _mm_unpacklo_epi32(tttt1, tttt2);
tttt8 = _mm_unpackhi_epi32(tttt1, tttt2);
// Join 64-bit halves: tttt1 = source column 0 over all eight rows,
// tttt2 = source column 1. They become output rows 0 and 1.
tttt1 = _mm_unpacklo_epi64(tttt5, tttt7);
tttt2 = _mm_unpackhi_epi64(tttt5, tttt7);
// _mm_storeu_si128 is the unaligned store, so the destination rows do not
// need 16-byte alignment.
_mm_storeu_si128(m_pDst_tmp, tttt1);
m_pDst_tmp+=stride_ii;
_mm_storeu_si128(m_pDst_tmp, tttt2);
m_pDst_tmp+=stride_ii;
// tttt5 = source column 2, tttt7 = source column 3 -> output rows 2 and 3.
tttt5 = _mm_unpacklo_epi64(tttt6, tttt8);
tttt7 = _mm_unpackhi_epi64(tttt6, tttt8);
_mm_storeu_si128(m_pDst_tmp, tttt5);
m_pDst_tmp+=stride_ii;
_mm_storeu_si128(m_pDst_tmp, tttt7);
m_pDst_tmp+=stride_ii;
// Part two: same steps for source columns 4-7, using the high halves saved
// in tttt3/tttt4 (rows 0-3) and tttt33/tttt44 (rows 4-7).
tttt5 = _mm_unpacklo_epi32(tttt3, tttt4);
tttt6 = _mm_unpackhi_epi32(tttt3, tttt4);
tttt7 = _mm_unpacklo_epi32(tttt33, tttt44);
tttt8 = _mm_unpackhi_epi32(tttt33, tttt44);
// tttt1 = source column 4, tttt2 = source column 5 -> output rows 4 and 5.
tttt1 = _mm_unpacklo_epi64(tttt5, tttt7);
tttt2 = _mm_unpackhi_epi64(tttt5, tttt7);
_mm_storeu_si128(m_pDst_tmp, tttt1);
m_pDst_tmp+=stride_ii;
_mm_storeu_si128(m_pDst_tmp, tttt2);
m_pDst_tmp+=stride_ii;
// tttt5 = source column 6, tttt7 = source column 7 -> output rows 6 and 7.
tttt5 = _mm_unpacklo_epi64(tttt6, tttt8);
tttt7 = _mm_unpackhi_epi64(tttt6, tttt8);
_mm_storeu_si128(m_pDst_tmp, tttt5);
m_pDst_tmp+=stride_ii;
_mm_storeu_si128(m_pDst_tmp, tttt7);
__m128i* m_pSrc_tmp = pSrc_128[i]; __m128i* m_pDst_tmp = pDst_128[i]; __m128i Org_8_0,Org_8_1, Org_8_2, Org_8_3; __m128i tttt1,tttt2,tttt3,tttt4,tttt33,tttt44; __m128i tttt5,tttt6, tttt7, tttt8; int stride_ii = dstStride>>3; //one Org_8_0 = _mm_load_si128(m_pSrc_tmp); m_pSrc_tmp+=8; Org_8_1 = _mm_load_si128(m_pSrc_tmp); m_pSrc_tmp+=8; Org_8_2 = _mm_load_si128(m_pSrc_tmp); m_pSrc_tmp+=8; Org_8_3 = _mm_load_si128(m_pSrc_tmp); m_pSrc_tmp+=8; tttt1 = _mm_unpacklo_epi16(Org_8_0, Org_8_1); tttt2 = _mm_unpacklo_epi16(Org_8_2, Org_8_3); tttt3 = _mm_unpackhi_epi16(Org_8_0, Org_8_1); tttt4 = _mm_unpackhi_epi16(Org_8_2, Org_8_3); tttt5 = _mm_unpacklo_epi32(tttt1, tttt2); tttt6 = _mm_unpackhi_epi32(tttt1, tttt2); Org_8_0 = _mm_load_si128(m_pSrc_tmp); m_pSrc_tmp+=8;; Org_8_1 = _mm_load_si128(m_pSrc_tmp); m_pSrc_tmp+=8; Org_8_2 = _mm_load_si128(m_pSrc_tmp); m_pSrc_tmp+=8; Org_8_3 = _mm_load_si128(m_pSrc_tmp); //m_pSrc_tmp+=8; tttt1 = _mm_unpacklo_epi16(Org_8_0, Org_8_1); tttt2 = _mm_unpacklo_epi16(Org_8_2, Org_8_3); tttt33 = _mm_unpackhi_epi16(Org_8_0, Org_8_1); tttt44 = _mm_unpackhi_epi16(Org_8_2, Org_8_3); tttt7 = _mm_unpacklo_epi32(tttt1, tttt2); tttt8 = _mm_unpackhi_epi32(tttt1, tttt2); tttt1 = _mm_unpacklo_epi64(tttt5, tttt7); tttt2 = _mm_unpackhi_epi64(tttt5, tttt7); _mm_storeu_si128(m_pDst_tmp, tttt1); m_pDst_tmp+=stride_ii; _mm_storeu_si128(m_pDst_tmp, tttt2); m_pDst_tmp+=stride_ii; tttt5 = _mm_unpacklo_epi64(tttt6, tttt8); tttt7 = _mm_unpackhi_epi64(tttt6, tttt8); _mm_storeu_si128(m_pDst_tmp, tttt5); m_pDst_tmp+=stride_ii; _mm_storeu_si128(m_pDst_tmp, tttt7); m_pDst_tmp+=stride_ii; //tow tttt5 = _mm_unpacklo_epi32(tttt3, tttt4); tttt6 = _mm_unpackhi_epi32(tttt3, tttt4); tttt7 = _mm_unpacklo_epi32(tttt33, tttt44); tttt8 = _mm_unpackhi_epi32(tttt33, tttt44); tttt1 = _mm_unpacklo_epi64(tttt5, tttt7); tttt2 = _mm_unpackhi_epi64(tttt5, tttt7); _mm_storeu_si128(m_pDst_tmp, tttt1); m_pDst_tmp+=stride_ii; _mm_storeu_si128(m_pDst_tmp, tttt2); m_pDst_tmp+=stride_ii; tttt5 = 
_mm_unpacklo_epi64(tttt6, tttt8); tttt7 = _mm_unpackhi_epi64(tttt6, tttt8); _mm_storeu_si128(m_pDst_tmp, tttt5); m_pDst_tmp+=stride_ii; _mm_storeu_si128(m_pDst_tmp, tttt7);
要实现的是NXN的转置,如何实现呢:
基于8X8来实现NXN的块或者图像的转置:
这里先把NXN划分为size_case×size_case个8X8的块(size_case = N/8), 然后循环调用8X8的转置!
[cpp]
view plaincopyprint?
// Build per-tile source/destination pointers for an NxN transpose that is
// carried out as a series of 8x8 tile transposes.
// Each table has 64 entries, so at most an 8x8 grid of tiles fits
// (blkSize must not exceed 64).
__m128i* pDst_128[64];
__m128i* pSrc_128[64];
// Number of 8x8 tiles along one side of the block.
int size_case = (blkSize>>3);
dstStride = dstStride_tmp;
for(int y = 0; y<size_case; y++)// map the addresses of all 8x8 tiles
for(int x = 0; x<size_case; x++)
{
// Source tile at tile-row y, tile-column x. The offset is in units of
// pDst's element type and uses a hard-coded source row pitch of 64.
// NOTE(review): this matches the "+= 8" (__m128i) row step of the 8x8
// kernel only for 16-bit elements - confirm pDst's element type.
pSrc_128[y*size_case + x] = (__m128i*)(pDst + 8*x + y*8*64);
// Destination tile at tile-row x, tile-column y (mirrored position),
// addressed with the runtime pitch dstStride.
pDst_128[y*size_case + x] = (__m128i*)(rpDst + 8*y + x*8*dstStride);
}
// From here on size_case is the total number of tiles, not tiles per side.
size_case = size_case*size_case;
for(int i = 0;i <size_case; i++)// run the transpose, one tile per iteration
{
// Placeholder left by the author: the 8x8 transpose code shown earlier
// in the article goes here (it reads pSrc_128[i] and pDst_128[i]).
8x8转置的代码:
}
__m128i* pDst_128[64]; __m128i* pSrc_128[64]; int size_case = (blkSize>>3); dstStride = dstStride_tmp; for(int y = 0; y<size_case; y++)//对所有8x8的块进行地址映射 for(int x = 0; x<size_case; x++) { pSrc_128[y*size_case + x] = (__m128i*)(pDst + 8*x + y*8*64); pDst_128[y*size_case + x] = (__m128i*)(rpDst + 8*y + x*8*dstStride); } size_case = size_case*size_case; for(int i = 0;i <size_case; i++)//开始转置 { 8x8转置的代码: }
通过比较, 用SSE汇编优化实现转置比用纯 C代码实现的转置速度快5倍左右!
同样在ARM Cortex(NEON)上的汇编优化也是基于这个原理:
主要循环体代码如下:
[cpp]
view plaincopyprint?
@ NEON register-only core of an 8x8 transpose of 16-bit elements
@ (comments use the GNU assembler '@' syntax).
@ NOTE(review): the loads/stores are not shown; presumably q8-q11 hold
@ rows 0-3 and q4-q7 hold rows 4-7 - confirm against the full routine.
@ Step 1: VTRN.16 transposes 2x2 groups of 16-bit elements between each register pair.
VTRN.16 q8, q9
VTRN.16 q10, q11
VTRN.16 q4, q5
VTRN.16 q6, q7
@ Step 2: VTRN.32 transposes 2x2 groups of 32-bit pairs, which completes
@ a 4x4 transpose inside every 64-bit half of the two register groups.
VTRN.32 q8, q10
VTRN.32 q9, q11
VTRN.32 q4, q6
VTRN.32 q5, q7
@ Step 3: exchange the off-diagonal 4x4 quadrants. d17/d19/d21/d23 are the
@ high halves of q8-q11; d8/d10/d12/d14 are the low halves of q4-q7.
VSWP d17, d8
VSWP d19, d10
VSWP d21, d12
VSWP d23, d14
VTRN.16 q8, q9 VTRN.16 q10, q11 VTRN.16 q4, q5 VTRN.16 q6, q7 VTRN.32 q8, q10 VTRN.32 q9, q11 VTRN.32 q4, q6 VTRN.32 q5, q7 VSWP d17, d8 VSWP d19, d10 VSWP d21, d12 VSWP d23, d14
感兴趣的可以自己调试下!
当然DSP上也是同样的方法, 只是涉及到的指令不同而已!
相关文章推荐
- 矩阵转置的SSE汇编优化艺术以及ARM cortext 汇编优化
- 矩阵转置的SSE汇编优化艺术以及ARM cortext 汇编优化
- 逆向知识第七讲,三目运算符在汇编中的表现形式,以及编译器优化方式
- 汇编3-返回以及优化
- C6000的线性汇编--格式,用法以及优化策略
- 关于适配ios8、iPhone6、iphone plus以及资源优化相关问题
- 聚簇索引与非聚簇索引的区别以及SQL Server查询优化技术
- [Intel汇编-MASM]内存访问方式以及循环程序的调试
- Tomcat的URL中文乱码解决以及传输优化
- myecplise8.5个人喜爱的配置以及优化
- 关于性能优化的以及int Number uint的比较
- <<Oracle数据库性能优化艺术(第五期)>> 第1周 性能优化综述
- kylin官方给出的优化 以及各个步骤容易出现的问题
- Android内存泄露分析以及优化方案
- MySQL大数据优化以及分解(下篇)
- hive的查询注意事项以及优化总结 .
- J2EE运行环境性能大优化艺术之一
- BaseAdapter以及对ListView的优化(转)
- 聚簇索引与非聚簇索引的区别以及SQL Server查询优化技术
- 经验与交流:关于系统优化软件的评判标准、使用原则以及“诸多版本中哪个才是最好的”?