您的位置:首页 > 运维架构

Assembly x64 Intro - Dct.asm of OpenH264 Decode

2015-12-10 15:08 375 查看
;*!

;* \copy

;* Copyright (c) 2009-2013, Cisco Systems

;* All rights reserved.

;*

;* Redistribution and use in source and binary forms, with or without

;* modification, are permitted provided that the following conditions

;* are met:

;*

;* * Redistributions of source code must retain the above copyright

;* notice, this list of conditions and the following disclaimer.

;*

;* * Redistributions in binary form must reproduce the above copyright

;* notice, this list of conditions and the following disclaimer in

;* the documentation and/or other materials provided with the

;* distribution.

;*

;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS

;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE

;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,

;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,

;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER

;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT

;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN

;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

;* POSSIBILITY OF SUCH DAMAGE.

;*

;*

;* dct.asm

;*

;* Abstract

;* WelsDctFourT4_sse2

;*

;* History

;* 8/4/2009 Created

;*

;*

;*************************************************************************/

%include "asm_inc.asm"

;*******************************************************************************

; Macros and other preprocessor constants

;*******************************************************************************

;-----------------------------------------------------------------------
; MMX_SumSubDiv2 a, b, tmp        (macro taking three register parameters)
;
; Per packed signed 16-bit word:
;   tmp <- a + (b >> 1)
;   a   <- (a >> 1) - b
; b is left unchanged; the previous contents of tmp are overwritten.
;
; Reference notes on the instructions used ("<-" denotes assignment):
;
;   MOVQ, operating on MMX registers and memory locations:
;       DEST <- SRC
;   MOVQ, source and destination operands are XMM registers:
;       DEST[63-0] <- SRC[63-0]
;   MOVQ, source operand is an XMM register, destination is memory:
;       DEST <- SRC[63-0]
;   MOVQ, source operand is memory, destination is an XMM register:
;       DEST[63-0]   <- SRC
;       DEST[127-64] <- 0000000000000000H
;
;   PSRAW: packed shift right arithmetic on words.
;
;   PADDB adds packed byte integers. When an individual result is too
;   large to be represented in 8 bits (overflow), the result wraps and
;   the low 8 bits are written to the destination (the carry is ignored).
;   PADDW adds packed word integers; on overflow the low 16 bits are
;   written to the destination.
;   PADDD adds packed doubleword integers; on overflow the low 32 bits
;   are written to the destination.
;
;   PSUBB subtracts packed byte integers. When an individual result is
;   too large or too small to be represented in a byte, the result wraps
;   and the low 8 bits are written to the destination element.
;   PSUBW subtracts packed word integers; on overflow/underflow the low
;   16 bits are written to the destination element.
;   PSUBD subtracts packed doubleword integers; on overflow/underflow the
;   low 32 bits are written to the destination element.
;-----------------------------------------------------------------------
%macro MMX_SumSubDiv2 3
    movq    %3, %2              ; tmp = b
    psraw   %3, $01             ; tmp = b >> 1
    paddw   %3, %1              ; tmp = a + (b >> 1)
    psraw   %1, $01             ; a   = a >> 1
    psubw   %1, %2              ; a   = (a >> 1) - b
%endmacro

; MMX_SumSub a, b, tmp
;
; Butterfly on packed 16-bit words (wrapping arithmetic):
;   a   <- a + b
;   b   <- b - a      (using the value of a from before the macro)
;   tmp <- b          (the value of b from before the macro)
; The instruction order matters: b must be rewritten while a still holds
; its original value, and tmp preserves the original b for the final add.
%macro MMX_SumSub 3

movq %3, %2 ; tmp = b (saved for the sum)

psubw %2, %1 ; b = b - a

paddw %1, %3 ; a = a + original b

%endmacro

; MMX_IDCT p1, p2, p3, p4, p5, p6
;
; One 1-D transform pass built from the two butterfly macros, applied to
; four registers of packed 16-bit words.
;   Inputs : p2, p3, p4, p5
;   Scratch: p6 (overwritten by every MMX_SumSub step)
;   p1 is written by the MMX_SumSubDiv2 step before it is read.
; Data flow, step by step:
;   1. p4 <- p4 + p5          p5 <- p5 - p4(old)
;   2. p1 <- p3 + (p2 >> 1)   p3 <- (p3 >> 1) - p2
;   3. p1 <- p1 + p4          p4 <- p4 - p1(old)
;   4. p3 <- p3 + p5          p5 <- p5 - p3(old)
; The four results are therefore left in p1, p3, p5 and p4; p2 is not
; modified.
%macro MMX_IDCT 6

MMX_SumSub %4, %5, %6

MMX_SumSubDiv2 %3, %2, %1

MMX_SumSub %1, %4, %6

MMX_SumSub %3, %5, %6

%endmacro

;-----------------------------------------------------------------------
; MMX_StoreDiff4P res, tmp, round, hi, mem
;
;   res   (%1)  register of packed 16-bit words; modified and used as the
;               source of the final store
;   tmp   (%2)  scratch register; receives the 4 bytes loaded from mem
;   round (%3)  register added to res before the shift
;   hi    (%4)  register whose low bytes are interleaved above the loaded
;               bytes (the callers in this file pass mm7 right after
;               `WELS_Zero mm7`, presumably an all-zero register, which
;               would make the interleave a byte -> word zero-extension)
;   mem   (%5)  memory operand; 32 bits are read from it and 32 bits are
;               written back to it
;
; Sequence: load 4 bytes, widen them, compute (res + round) >> 6, add the
; widened bytes with signed saturation, pack to unsigned bytes with
; saturation and store the low 32 bits back to mem.
;
; Reference notes on the instructions used ("<-" denotes assignment):
;
;   MOVD, destination operand is an MMX register:
;       DEST[31-0]   <- SRC
;       DEST[63-32]  <- 00000000H
;   MOVD, destination operand is an XMM register:
;       DEST[31-0]   <- SRC
;       DEST[127-32] <- 000000000000000000000000H
;   MOVD, source operand is an MMX or XMM register:
;       DEST <- SRC[31-0]
;
;   PUNPCKLBW, 64-bit operands:
;       DEST[63..56] <- SRC[31..24]
;       DEST[55..48] <- DEST[31..24]
;       DEST[47..40] <- SRC[23..16]
;       DEST[39..32] <- DEST[23..16]
;       DEST[31..24] <- SRC[15..8]
;       DEST[23..16] <- DEST[15..8]
;       DEST[15..8]  <- SRC[7..0]
;       DEST[7..0]   <- DEST[7..0]
;   PUNPCKLWD, 64-bit operands:
;       DEST[63..48] <- SRC[31..16]
;       DEST[47..32] <- DEST[31..16]
;       DEST[31..16] <- SRC[15..0]
;       DEST[15..0]  <- DEST[15..0]
;   PUNPCKLDQ, 64-bit operands:
;       DEST[63..32] <- SRC[31..0]
;       DEST[31..0]  <- DEST[31..0]
;
;   PUNPCKLBW, 128-bit operands:
;       DEST[7-0]     <- DEST[7-0]
;       DEST[15-8]    <- SRC[7-0]
;       DEST[23-16]   <- DEST[15-8]
;       DEST[31-24]   <- SRC[15-8]
;       DEST[39-32]   <- DEST[23-16]
;       DEST[47-40]   <- SRC[23-16]
;       DEST[55-48]   <- DEST[31-24]
;       DEST[63-56]   <- SRC[31-24]
;       DEST[71-64]   <- DEST[39-32]
;       DEST[79-72]   <- SRC[39-32]
;       DEST[87-80]   <- DEST[47-40]
;       DEST[95-88]   <- SRC[47-40]
;       DEST[103-96]  <- DEST[55-48]
;       DEST[111-104] <- SRC[55-48]
;       DEST[119-112] <- DEST[63-56]
;       DEST[127-120] <- SRC[63-56]
;   PUNPCKLWD, 128-bit operands:
;       DEST[15-0]    <- DEST[15-0]
;       DEST[31-16]   <- SRC[15-0]
;       DEST[47-32]   <- DEST[31-16]
;       DEST[63-48]   <- SRC[31-16]
;       DEST[79-64]   <- DEST[47-32]
;       DEST[95-80]   <- SRC[47-32]
;       DEST[111-96]  <- DEST[63-48]
;       DEST[127-112] <- SRC[63-48]
;   PUNPCKLDQ, 128-bit operands:
;       DEST[31-0]    <- DEST[31-0]
;       DEST[63-32]   <- SRC[31-0]
;       DEST[95-64]   <- DEST[63-32]
;       DEST[127-96]  <- SRC[63-32]
;   PUNPCKLQDQ:
;       DEST[63-0]    <- DEST[63-0]
;       DEST[127-64]  <- SRC[63-0]
;
;   PADDSB adds packed signed byte integers. When an individual byte
;   result is outside the signed byte range (greater than 7FH or less
;   than 80H), the saturated value 7FH or 80H is written instead.
;   PADDSW adds packed signed word integers. When an individual word
;   result is outside the signed word range (greater than 7FFFH or less
;   than 8000H), the saturated value 7FFFH or 8000H is written instead.
;
;   PACKUSWB (MMX form): packs the 4 signed words of mm and the 4 signed
;   words of mm/m64 into 8 unsigned bytes using saturation; result in mm.
;   PACKUSWB (SSE2 form): packs the signed words of xmm1 and xmm2/m128
;   into unsigned bytes using saturation; result in xmm1.
;-----------------------------------------------------------------------
%macro MMX_StoreDiff4P 5
    movd        %2, %5          ; tmp = 32 bits loaded from mem
    punpcklbw   %2, %4          ; interleave low bytes of tmp with those of hi
    paddw       %1, %3          ; res += round
    psraw       %1, $06         ; res >>= 6 (arithmetic, per word)
    paddsw      %1, %2          ; res += widened bytes, signed saturation
    packuswb    %1, %2          ; pack words to unsigned bytes, saturating
    movd        %5, %1          ; store the low 32 bits of res to mem
%endmacro

; WELS_EXTERN name
;
; Opens an exported routine: aligns the location counter to 16 bytes,
; declares the symbol global and emits its label. When PREFIX is defined,
; the exported symbol carries a leading underscore and `name` is redefined
; to that underscored spelling, so the label emitted below (and later uses
; of `name`) resolve to `_name`.
%macro WELS_EXTERN 1
    ALIGN 16
%ifndef PREFIX
    global %1
%else
    global _%1
    %define %1 _%1
%endif
    %1:
%endmacro

; SIGN_EXTENSION dst, src
;
; Unless X86_32 is defined, sign-extends the 32-bit operand `src` into
; `dst` with movsxd. When X86_32 is defined the macro emits nothing.
%macro SIGN_EXTENSION 2
    %ifndef X86_32
        movsxd  %1, %2          ; dst = (sign-extended) src
    %endif
%endmacro

; MMX_Trans4x4W p1, p2, p3, p4, p5
;
; Transposes a 4x4 block of 16-bit words held in four MMX registers.
;   Inputs : p1, p2, p3, p4 (one row of four words each)
;   Scratch: p5 (its previous contents are overwritten)
;   Outputs: the transposed rows are left, in order, in parameters
;            p1, p4, p5, p3; p2 ends up holding an intermediate value.
; Two word-level interleaves are followed by two doubleword-level
; interleaves; each MMX_XSwap reuses a register freed by an earlier step,
; so the order of the four steps must not change.
%macro MMX_Trans4x4W 5

MMX_XSwap wd, %1, %2, %5

MMX_XSwap wd, %3, %4, %2

MMX_XSwap dq, %1, %3, %4

MMX_XSwap dq, %5, %2, %3

%endmacro

; MMX_XSwap sfx, a, b, tmp
;
; Interleaves registers a and b at the granularity selected by the
; instruction suffix sfx (this file uses `wd` and `dq`, giving
; punpck{h,l}wd and punpck{h,l}dq):
;   tmp <- high-half interleave of (a, b)
;   a   <- low-half interleave of (a, b)
; b is only read.
%macro MMX_XSwap 4

movq %4, %2 ; tmp = a, copied before a is overwritten

punpckh%1 %4, %3 ; tmp = interleave of the high halves of a and b

punpckl%1 %2, %3 ; a = interleave of the low halves of a and b

%endmacro

;*******************************************************************************

; Code

;*******************************************************************************

SECTION .text

;*******************************************************************************

; void IdctResAddPred_mmx( uint8_t *pPred, const int32_t kiStride, int16_t *pRs )

;*******************************************************************************

WELS_EXTERN IdctResAddPred_mmx
    ; Defines the IdctResAddPred_mmx function.
    ;
    ; C prototype (from the banner above):
    ;   void IdctResAddPred_mmx(uint8_t *pPred, const int32_t kiStride,
    ;                           int16_t *pRs)
    ; Register roles as used below (LOAD_3_PARA comes from asm_inc.asm and
    ; presumably loads the three arguments into r0, r1, r2 -- confirm):
    ;   r0 = pPred    4 rows of 4 bytes are read and rewritten in place
    ;   r1 = kiStride row pitch, sign-extended to pointer width
    ;   r2 = pRs      32 bytes (four quadwords) of 16-bit words are read
    ; Uses mm0-mm7; emms is executed before returning.

    %assign push_num 0
    LOAD_3_PARA
    SIGN_EXTENSION r1, r1d

    ; Load the four rows of the 4x4 block of 16-bit words.
    movq    mm0, [r2+ 0]
    movq    mm1, [r2+ 8]
    movq    mm2, [r2+16]
    movq    mm3, [r2+24]

    ; First pass: transpose, then 1-D transform.
    MMX_Trans4x4W   mm0, mm1, mm2, mm3, mm4
    ; For reference only, the invocation above expands to:
    ;     movq      mm4, mm0
    ;     punpckhwd mm4, mm1      ; interleave high halves
    ;     punpcklwd mm0, mm1      ; interleave low halves
    ;     movq      mm1, mm2
    ;     punpckhwd mm1, mm3
    ;     punpcklwd mm2, mm3
    ;     movq      mm3, mm0
    ;     punpckhdq mm3, mm2
    ;     punpckldq mm0, mm2
    ;     movq      mm2, mm4
    ;     punpckhdq mm2, mm1
    ;     punpckldq mm4, mm1
    ; (The expansion must not be assembled in addition to the macro call,
    ; otherwise the transpose would be applied twice.)
    MMX_IDCT        mm1, mm2, mm3, mm4, mm0, mm6
    ; MMX_IDCT and its helpers MMX_SumSub / MMX_SumSubDiv2 are defined once
    ; near the top of this file; they are not redefined here.

    ; Second pass: transpose back, then transform the other dimension.
    MMX_Trans4x4W   mm1, mm3, mm0, mm4, mm2
    MMX_IDCT        mm3, mm0, mm4, mm2, mm1, mm6

    ; WELS_Zero / WELS_DW32 come from asm_inc.asm; presumably mm7 becomes
    ; all zeros and mm6 holds the word constant 32 used as the rounding
    ; term for the >> 6 inside MMX_StoreDiff4P -- confirm against asm_inc.
    WELS_Zero       mm7
    WELS_DW32       mm6

    ; Add each result row to 4 prediction bytes and store back in place.
    MMX_StoreDiff4P mm3, mm0, mm6, mm7, [r0]
    MMX_StoreDiff4P mm4, mm0, mm6, mm7, [r0+r1]
    lea     r0, [r0+2*r1]               ; advance two rows
    MMX_StoreDiff4P mm1, mm0, mm6, mm7, [r0]
    MMX_StoreDiff4P mm2, mm0, mm6, mm7, [r0+r1]

    emms                                ; leave MMX state before returning
    ret

;-----------------------------------------------------------------------
; void WelsBlockZero16x16_sse2(int16_t * block, int32_t stride);
;
; Writes zeros to 16 rows of 32 bytes each, starting at `block`.
;   r0 = block   row pointer, advanced after every row
;   r1 = stride  sign-extended, then doubled before being used as the
;                byte step between rows (presumably converting a count
;                of int16_t elements into bytes)
; The stores use movdqa, so every row address must be 16-byte aligned.
; Clobbers xmm0, r0, r1 and flags.
;-----------------------------------------------------------------------
WELS_EXTERN WelsBlockZero16x16_sse2
    %assign push_num 0
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d

    pxor    xmm0, xmm0                  ; xmm0 = 0, the value stored
    shl     r1, 1                       ; r1 = 2 * stride

%rep 16
    movdqa  [r0], xmm0                  ; first 16 bytes of the row
    movdqa  [r0+16], xmm0               ; second 16 bytes of the row
    add     r0, r1                      ; next row
%endrep

    ret

;-----------------------------------------------------------------------
; void WelsBlockZero8x8_sse2(int16_t * block, int32_t stride);
;
; Writes zeros to 8 rows of 16 bytes each, starting at `block`.
;   r0 = block   row pointer, advanced after every row
;   r1 = stride  sign-extended, then doubled before being used as the
;                byte step between rows (presumably converting a count
;                of int16_t elements into bytes)
; The stores use movdqa, so every row address must be 16-byte aligned.
; Clobbers xmm0, r0, r1 and flags.
;-----------------------------------------------------------------------
WELS_EXTERN WelsBlockZero8x8_sse2
    %assign push_num 0
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d

    pxor    xmm0, xmm0                  ; xmm0 = 0, the value stored
    shl     r1, 1                       ; r1 = 2 * stride

%rep 8
    movdqa  [r0], xmm0                  ; one 16-byte row
    add     r0, r1                      ; next row
%endrep

    ret




内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: