Assembly x64 Intro - Dct.asm of OpenH264 Decode
2015-12-10 15:08
375 查看
;*!
;* \copy
;* Copyright (c) 2009-2013, Cisco Systems
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* * Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;*
;* * Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in
;* the documentation and/or other materials provided with the
;* distribution.
;*
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
;* dct.asm
;*
;* Abstract
;* WelsDctFourT4_sse2
;*
;* History
;* 8/4/2009 Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"
;*******************************************************************************
; Macros and other preprocessor constants
;*******************************************************************************
;-----------------------------------------------------------------------
; MMX_SumSubDiv2  a, b, out        (macro taking three operands)
;   In:   %1 = a, %2 = b          packed signed 16-bit words
;   Out:  %3 = a + (b >> 1)
;         %1 = (a >> 1) - b
;   %2 is not modified.
;
; Instruction notes:
;   MOVQ  - between MMX registers / memory: DEST <- SRC (64 bits).
;           XMM to XMM: DEST[63-0] <- SRC[63-0].
;           XMM to memory: DEST <- SRC[63-0].
;           memory to XMM: DEST[63-0] <- SRC, DEST[127-64] <- 0.
;   PSRAW - packed shift right arithmetic on words.
;   PADDB/PADDW/PADDD - packed add of bytes/words/doublewords; a result
;           that overflows is truncated to its low 8/16/32 bits (wraps).
;   PSUBB/PSUBW/PSUBD - packed subtract of bytes/words/doublewords; a
;           result out of range is truncated to its low 8/16/32 bits.
;-----------------------------------------------------------------------
%macro MMX_SumSubDiv2 3
    movq    %3, %2              ; out = b
    psraw   %3, $01             ; out = b >> 1
    paddw   %3, %1              ; out = a + (b >> 1)
    psraw   %1, $01             ; a   = a >> 1
    psubw   %1, %2              ; a   = (a >> 1) - b
%endmacro
; MMX_SumSub a, b, tmp - in-place sum/difference on packed 16-bit words:
;   %1 <- %1 + %2
;   %2 <- %2 - %1 (using the original %1)
;   %3 <- copy of the original %2 (scratch)
%macro MMX_SumSub 3
movq %3, %2 ; tmp = b, saved because b is overwritten next
psubw %2, %1 ; b = b - a
paddw %1, %3 ; a = a + original b (taken from tmp)
%endmacro
; MMX_IDCT - chains three MMX_SumSub butterflies and one MMX_SumSubDiv2
; over registers %1..%5; %6 is the scratch register for MMX_SumSub.
; %1 is written by MMX_SumSubDiv2 before anything reads it, so its
; incoming value is not used.
; NOTE(review): going by the name this is presumably one 1-D pass of the
; 4x4 inverse transform - confirm against the codec reference.
%macro MMX_IDCT 6
MMX_SumSub %4, %5, %6 ; %4 = %4 + %5, %5 = %5 - old %4
MMX_SumSubDiv2 %3, %2, %1 ; %1 = %3 + (%2 >> 1), %3 = (%3 >> 1) - %2
MMX_SumSub %1, %4, %6 ; %1 = %1 + %4, %4 = %4 - old %1
MMX_SumSub %3, %5, %6 ; %3 = %3 + %5, %5 = %5 - old %3
%endmacro
;-----------------------------------------------------------------------
; MMX_StoreDiff4P  res, tmp, round, widen, mem
;   %1 = four signed 16-bit words to be rounded, shifted and added to
;        the bytes at %5 (modified)
;   %2 = scratch register (modified)
;   %3 = per-word term added to %1 before the shift
;   %4 = register whose low bytes become the high byte of each widened
;        word. NOTE(review): callers in this file pass a register set by
;        WELS_Zero, presumably all zero, giving zero-extension - confirm.
;   %5 = 32-bit memory operand: four bytes are read, then overwritten
;
; Effect per element i = 0..3 (when %4 is zero):
;   mem[i] = clamp_u8( sat16( ((%1[i] + %3[i]) >> 6) + mem[i] ) )
;
; Instruction notes:
;   MOVD      - to an MMX register: DEST[31-0] <- SRC, DEST[63-32] <- 0.
;               to an XMM register: DEST[31-0] <- SRC, DEST[127-32] <- 0.
;               from an MMX/XMM register: DEST <- SRC[31-0].
;   PUNPCKLBW / PUNPCKLWD / PUNPCKLDQ / PUNPCKLQDQ - interleave the
;               low-half elements (bytes / words / doublewords /
;               quadwords) of DEST and SRC, DEST element first:
;               DEST[elem 0] <- DEST[elem 0], DEST[elem 1] <- SRC[elem 0],
;               DEST[elem 2] <- DEST[elem 1], DEST[elem 3] <- SRC[elem 1],
;               and so on. With 64-bit operands the low 32 bits of each
;               operand are consumed; with 128-bit operands the low
;               64 bits.
;   PADDSB/PADDSW - packed signed add with saturation: a byte result
;               outside 80H..7FH (word: 8000H..7FFFH) is replaced by the
;               nearest limit.
;   PACKUSWB  - packs the signed words of DEST and SRC into unsigned
;               bytes with saturation; the result is written to DEST
;               (mm with mm/m64, or xmm1 with xmm2/m128).
;-----------------------------------------------------------------------
%macro MMX_StoreDiff4P 5
    movd        %2, %5          ; tmp = 4 bytes from memory, upper bits cleared
    punpcklbw   %2, %4          ; widen the 4 bytes to 4 words (high bytes come from %4)
    paddw       %1, %3          ; res += rounding term
    psraw       %1, $06         ; res >>= 6 (arithmetic)
    paddsw      %1, %2          ; res += widened bytes, signed saturation
    packuswb    %1, %2          ; pack to bytes with unsigned saturation; res words land in the low 4 bytes
    movd        %5, %1          ; write the low 4 bytes back to memory
%endmacro
; WELS_EXTERN name - emits a 16-byte-aligned global label for `name`.
; When PREFIX is defined the exported symbol is `_name`, and `name` is
; %define'd to `_name` so that the label line below (and later uses of
; `name`) resolve to the underscored symbol.
%macro WELS_EXTERN 1
    ALIGN 16
%ifndef PREFIX
    global %1
%else
    global _%1
    %define %1 _%1
%endif
%1:
%endmacro
; SIGN_EXTENSION dst, src - sign-extends the 32-bit operand %2 into %1
; with movsxd. Expands to nothing when X86_32 is defined.
%macro SIGN_EXTENSION 2
%ifdef X86_32
    ; 32-bit build: no instruction is emitted
%else
    movsxd  %1, %2
%endif
%endmacro
; MMX_Trans4x4W r0, r1, r2, r3, tmp - transposes a 4x4 matrix of 16-bit
; words held one row per MMX register in %1..%4, using %5 as an extra
; register.
; The transposed rows 0..3 end up in %1, %4, %5, %3 (in that order);
; %2 is left holding an intermediate value.
; pOut mm1, mm4, mm5, mm3
; NOTE(review): the "pOut" line above appears to name the output
; positions %1, %4, %5, %3 for a call made with mm1..mm5 - confirm.
%macro MMX_Trans4x4W 5
MMX_XSwap wd, %1, %2, %5 ; %1 = low word interleave of rows 0/1, %5 = high
MMX_XSwap wd, %3, %4, %2 ; %3 = low word interleave of rows 2/3, %2 = high
MMX_XSwap dq, %1, %3, %4 ; %1 = column 0, %4 = column 1
MMX_XSwap dq, %5, %2, %3 ; %5 = column 2, %3 = column 3
%endmacro
; MMX_XSwap sfx, a, b, hi - interleaves registers a (%2) and b (%3) at
; the granularity named by the suffix %1 (this file uses wd and dq):
;   %4 <- punpckh<sfx>(a, b)   high-half interleave
;   %2 <- punpckl<sfx>(a, b)   low-half interleave (overwrites a)
; %3 is only read.
%macro MMX_XSwap 4
movq %4, %2 ; keep a copy of a, since the low unpack overwrites %2
punpckh%1 %4, %3 ; hi = interleave of the high halves of a and b
punpckl%1 %2, %3 ; a = interleave of the low halves of a and b
%endmacro
;*******************************************************************************
; Code
;*******************************************************************************
SECTION .text
;*******************************************************************************
; void IdctResAddPred_mmx( uint8_t *pPred, const int32_t kiStride, int16_t *pRs )
;*******************************************************************************
; Defines the IdctResAddPred_mmx function (C prototype in the banner above).
; Reads 16 int16 values (four 8-byte rows) from [r2], runs two
; transpose + MMX_IDCT passes over them, then for each of four rows adds
; the rounded, >>6-shifted result to the 4 bytes at [r0 + row*r1] and
; writes the clamped bytes back in place.
; NOTE(review): LOAD_3_PARA, WELS_Zero and WELS_DW32 are not defined in
; this listing (presumably they come from asm_inc.asm). The code below
; assumes LOAD_3_PARA leaves pPred in r0, kiStride in r1 and pRs in r2,
; that WELS_Zero clears its register and that WELS_DW32 fills its
; register with words equal to 32 - confirm.
WELS_EXTERN IdctResAddPred_mmx
    %assign push_num 0
    LOAD_3_PARA
    SIGN_EXTENSION r1, r1d                  ; widen the 32-bit stride for 64-bit addressing

    movq    mm0, [r2+ 0]                    ; row 0 (4 words)
    movq    mm1, [r2+ 8]                    ; row 1
    movq    mm2, [r2+16]                    ; row 2
    movq    mm3, [r2+24]                    ; row 3

    ; First pass: transpose, then butterfly.
    MMX_Trans4x4W   mm0, mm1, mm2, mm3, mm4
    ; For reference only, the line above expands to the sequence below.
    ; It must stay commented out: emitting it as well would transpose
    ; the block a second time.
    ;     movq      mm4, mm0
    ;     punpckhwd mm4, mm1    ; interleave high words
    ;     punpcklwd mm0, mm1    ; interleave low words
    ;     movq      mm1, mm2
    ;     punpckhwd mm1, mm3
    ;     punpcklwd mm2, mm3
    ;     movq      mm3, mm0
    ;     punpckhdq mm3, mm2
    ;     punpckldq mm0, mm2
    ;     movq      mm2, mm4
    ;     punpckhdq mm2, mm1
    ;     punpckldq mm4, mm1
    MMX_IDCT        mm1, mm2, mm3, mm4, mm0, mm6

    ; Second pass on the other dimension.
    MMX_Trans4x4W   mm1, mm3, mm0, mm4, mm2
    MMX_IDCT        mm3, mm0, mm4, mm2, mm1, mm6

    WELS_Zero       mm7                     ; widening register for MMX_StoreDiff4P
    WELS_DW32       mm6                     ; rounding term added before the >> 6

    ; Add each result row to the bytes at the destination and store.
    ; mm0 is the scratch register for every store.
    MMX_StoreDiff4P mm3, mm0, mm6, mm7, [r0]
    MMX_StoreDiff4P mm4, mm0, mm6, mm7, [r0+r1]
    lea     r0, [r0+2*r1]                   ; advance two rows
    MMX_StoreDiff4P mm1, mm0, mm6, mm7, [r0]
    MMX_StoreDiff4P mm2, mm0, mm6, mm7, [r0+r1]

    emms                                    ; clear MMX state before returning
    ret
;void WelsBlockZero16x16_sse2(int16_t * block, int32_t stride);
; Writes zeros to 16 rows of 32 bytes each, starting at [r0]; r0 advances
; by 2*stride bytes per row.
; movdqa is used, so every row address must be 16-byte aligned.
; NOTE(review): assumes LOAD_2_PARA (not defined in this listing) puts
; block in r0 and stride in r1, and that stride counts int16_t elements
; (hence the doubling) - confirm.
WELS_EXTERN WelsBlockZero16x16_sse2
    %assign push_num 0
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d
    pxor    xmm0, xmm0              ; xmm0 = 0
    shl     r1, 1                   ; stride * 2
%rep 16
    movdqa  [r0+16], xmm0           ; upper 16 bytes of the row
    movdqa  [r0], xmm0              ; lower 16 bytes of the row
    add     r0, r1                  ; next row
%endrep
    ret
;void WelsBlockZero8x8_sse2(int16_t * block, int32_t stride);
; Writes zeros to 8 rows of 16 bytes each, starting at [r0]; r0 advances
; by 2*stride bytes per row.
; movdqa is used, so every row address must be 16-byte aligned.
; NOTE(review): assumes LOAD_2_PARA (not defined in this listing) puts
; block in r0 and stride in r1, and that stride counts int16_t elements
; (hence the doubling) - confirm.
WELS_EXTERN WelsBlockZero8x8_sse2
    %assign push_num 0
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d
    pxor    xmm0, xmm0              ; xmm0 = 0
    shl     r1, 1                   ; stride * 2
%rep 8
    movdqa  [r0], xmm0              ; clear one 16-byte row
    add     r0, r1                  ; next row
%endrep
    ret
;* \copy
;* Copyright (c) 2009-2013, Cisco Systems
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* * Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;*
;* * Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in
;* the documentation and/or other materials provided with the
;* distribution.
;*
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
;* dct.asm
;*
;* Abstract
;* WelsDctFourT4_sse2
;*
;* History
;* 8/4/2009 Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"
;*******************************************************************************
; Macros and other preprocessor constants
;*******************************************************************************
;-----------------------------------------------------------------------
; MMX_SumSubDiv2  a, b, out        (macro taking three operands)
;   In:   %1 = a, %2 = b          packed signed 16-bit words
;   Out:  %3 = a + (b >> 1)
;         %1 = (a >> 1) - b
;   %2 is not modified.
;
; Instruction notes:
;   MOVQ  - between MMX registers / memory: DEST <- SRC (64 bits).
;           XMM to XMM: DEST[63-0] <- SRC[63-0].
;           XMM to memory: DEST <- SRC[63-0].
;           memory to XMM: DEST[63-0] <- SRC, DEST[127-64] <- 0.
;   PSRAW - packed shift right arithmetic on words.
;   PADDB/PADDW/PADDD - packed add of bytes/words/doublewords; a result
;           that overflows is truncated to its low 8/16/32 bits (wraps).
;   PSUBB/PSUBW/PSUBD - packed subtract of bytes/words/doublewords; a
;           result out of range is truncated to its low 8/16/32 bits.
;-----------------------------------------------------------------------
%macro MMX_SumSubDiv2 3
    movq    %3, %2              ; out = b
    psraw   %3, $01             ; out = b >> 1
    paddw   %3, %1              ; out = a + (b >> 1)
    psraw   %1, $01             ; a   = a >> 1
    psubw   %1, %2              ; a   = (a >> 1) - b
%endmacro
; MMX_SumSub a, b, tmp - in-place sum/difference on packed 16-bit words:
;   %1 <- %1 + %2
;   %2 <- %2 - %1 (using the original %1)
;   %3 <- copy of the original %2 (scratch)
%macro MMX_SumSub 3
movq %3, %2 ; tmp = b, saved because b is overwritten next
psubw %2, %1 ; b = b - a
paddw %1, %3 ; a = a + original b (taken from tmp)
%endmacro
; MMX_IDCT - chains three MMX_SumSub butterflies and one MMX_SumSubDiv2
; over registers %1..%5; %6 is the scratch register for MMX_SumSub.
; %1 is written by MMX_SumSubDiv2 before anything reads it, so its
; incoming value is not used.
; NOTE(review): going by the name this is presumably one 1-D pass of the
; 4x4 inverse transform - confirm against the codec reference.
%macro MMX_IDCT 6
MMX_SumSub %4, %5, %6 ; %4 = %4 + %5, %5 = %5 - old %4
MMX_SumSubDiv2 %3, %2, %1 ; %1 = %3 + (%2 >> 1), %3 = (%3 >> 1) - %2
MMX_SumSub %1, %4, %6 ; %1 = %1 + %4, %4 = %4 - old %1
MMX_SumSub %3, %5, %6 ; %3 = %3 + %5, %5 = %5 - old %3
%endmacro
;-----------------------------------------------------------------------
; MMX_StoreDiff4P  res, tmp, round, widen, mem
;   %1 = four signed 16-bit words to be rounded, shifted and added to
;        the bytes at %5 (modified)
;   %2 = scratch register (modified)
;   %3 = per-word term added to %1 before the shift
;   %4 = register whose low bytes become the high byte of each widened
;        word. NOTE(review): callers in this file pass a register set by
;        WELS_Zero, presumably all zero, giving zero-extension - confirm.
;   %5 = 32-bit memory operand: four bytes are read, then overwritten
;
; Effect per element i = 0..3 (when %4 is zero):
;   mem[i] = clamp_u8( sat16( ((%1[i] + %3[i]) >> 6) + mem[i] ) )
;
; Instruction notes:
;   MOVD      - to an MMX register: DEST[31-0] <- SRC, DEST[63-32] <- 0.
;               to an XMM register: DEST[31-0] <- SRC, DEST[127-32] <- 0.
;               from an MMX/XMM register: DEST <- SRC[31-0].
;   PUNPCKLBW / PUNPCKLWD / PUNPCKLDQ / PUNPCKLQDQ - interleave the
;               low-half elements (bytes / words / doublewords /
;               quadwords) of DEST and SRC, DEST element first:
;               DEST[elem 0] <- DEST[elem 0], DEST[elem 1] <- SRC[elem 0],
;               DEST[elem 2] <- DEST[elem 1], DEST[elem 3] <- SRC[elem 1],
;               and so on. With 64-bit operands the low 32 bits of each
;               operand are consumed; with 128-bit operands the low
;               64 bits.
;   PADDSB/PADDSW - packed signed add with saturation: a byte result
;               outside 80H..7FH (word: 8000H..7FFFH) is replaced by the
;               nearest limit.
;   PACKUSWB  - packs the signed words of DEST and SRC into unsigned
;               bytes with saturation; the result is written to DEST
;               (mm with mm/m64, or xmm1 with xmm2/m128).
;-----------------------------------------------------------------------
%macro MMX_StoreDiff4P 5
    movd        %2, %5          ; tmp = 4 bytes from memory, upper bits cleared
    punpcklbw   %2, %4          ; widen the 4 bytes to 4 words (high bytes come from %4)
    paddw       %1, %3          ; res += rounding term
    psraw       %1, $06         ; res >>= 6 (arithmetic)
    paddsw      %1, %2          ; res += widened bytes, signed saturation
    packuswb    %1, %2          ; pack to bytes with unsigned saturation; res words land in the low 4 bytes
    movd        %5, %1          ; write the low 4 bytes back to memory
%endmacro
; WELS_EXTERN name - emits a 16-byte-aligned global label for `name`.
; When PREFIX is defined the exported symbol is `_name`, and `name` is
; %define'd to `_name` so that the label line below (and later uses of
; `name`) resolve to the underscored symbol.
%macro WELS_EXTERN 1
    ALIGN 16
%ifndef PREFIX
    global %1
%else
    global _%1
    %define %1 _%1
%endif
%1:
%endmacro
; SIGN_EXTENSION dst, src - sign-extends the 32-bit operand %2 into %1
; with movsxd. Expands to nothing when X86_32 is defined.
%macro SIGN_EXTENSION 2
%ifdef X86_32
    ; 32-bit build: no instruction is emitted
%else
    movsxd  %1, %2
%endif
%endmacro
; MMX_Trans4x4W r0, r1, r2, r3, tmp - transposes a 4x4 matrix of 16-bit
; words held one row per MMX register in %1..%4, using %5 as an extra
; register.
; The transposed rows 0..3 end up in %1, %4, %5, %3 (in that order);
; %2 is left holding an intermediate value.
; pOut mm1, mm4, mm5, mm3
; NOTE(review): the "pOut" line above appears to name the output
; positions %1, %4, %5, %3 for a call made with mm1..mm5 - confirm.
%macro MMX_Trans4x4W 5
MMX_XSwap wd, %1, %2, %5 ; %1 = low word interleave of rows 0/1, %5 = high
MMX_XSwap wd, %3, %4, %2 ; %3 = low word interleave of rows 2/3, %2 = high
MMX_XSwap dq, %1, %3, %4 ; %1 = column 0, %4 = column 1
MMX_XSwap dq, %5, %2, %3 ; %5 = column 2, %3 = column 3
%endmacro
; MMX_XSwap sfx, a, b, hi - interleaves registers a (%2) and b (%3) at
; the granularity named by the suffix %1 (this file uses wd and dq):
;   %4 <- punpckh<sfx>(a, b)   high-half interleave
;   %2 <- punpckl<sfx>(a, b)   low-half interleave (overwrites a)
; %3 is only read.
%macro MMX_XSwap 4
movq %4, %2 ; keep a copy of a, since the low unpack overwrites %2
punpckh%1 %4, %3 ; hi = interleave of the high halves of a and b
punpckl%1 %2, %3 ; a = interleave of the low halves of a and b
%endmacro
;*******************************************************************************
; Code
;*******************************************************************************
SECTION .text
;*******************************************************************************
; void IdctResAddPred_mmx( uint8_t *pPred, const int32_t kiStride, int16_t *pRs )
;*******************************************************************************
; Defines the IdctResAddPred_mmx function (C prototype in the banner above).
; Reads 16 int16 values (four 8-byte rows) from [r2], runs two
; transpose + MMX_IDCT passes over them, then for each of four rows adds
; the rounded, >>6-shifted result to the 4 bytes at [r0 + row*r1] and
; writes the clamped bytes back in place.
; NOTE(review): LOAD_3_PARA, WELS_Zero and WELS_DW32 are not defined in
; this listing (presumably they come from asm_inc.asm). The code below
; assumes LOAD_3_PARA leaves pPred in r0, kiStride in r1 and pRs in r2,
; that WELS_Zero clears its register and that WELS_DW32 fills its
; register with words equal to 32 - confirm.
; NOTE(review): an identical definition of this symbol appears earlier
; in this file, so the assembler sees the label twice; one of the two
; copies of the listing should be dropped.
WELS_EXTERN IdctResAddPred_mmx
    %assign push_num 0
    LOAD_3_PARA
    SIGN_EXTENSION r1, r1d                  ; widen the 32-bit stride for 64-bit addressing

    movq    mm0, [r2+ 0]                    ; row 0 (4 words)
    movq    mm1, [r2+ 8]                    ; row 1
    movq    mm2, [r2+16]                    ; row 2
    movq    mm3, [r2+24]                    ; row 3

    ; First pass: transpose, then butterfly.
    MMX_Trans4x4W   mm0, mm1, mm2, mm3, mm4
    ; For reference only, the line above expands to the sequence below.
    ; It must stay commented out: emitting it as well would transpose
    ; the block a second time.
    ;     movq      mm4, mm0
    ;     punpckhwd mm4, mm1    ; interleave high words
    ;     punpcklwd mm0, mm1    ; interleave low words
    ;     movq      mm1, mm2
    ;     punpckhwd mm1, mm3
    ;     punpcklwd mm2, mm3
    ;     movq      mm3, mm0
    ;     punpckhdq mm3, mm2
    ;     punpckldq mm0, mm2
    ;     movq      mm2, mm4
    ;     punpckhdq mm2, mm1
    ;     punpckldq mm4, mm1
    MMX_IDCT        mm1, mm2, mm3, mm4, mm0, mm6

    ; Second pass on the other dimension.
    MMX_Trans4x4W   mm1, mm3, mm0, mm4, mm2
    MMX_IDCT        mm3, mm0, mm4, mm2, mm1, mm6

    WELS_Zero       mm7                     ; widening register for MMX_StoreDiff4P
    WELS_DW32       mm6                     ; rounding term added before the >> 6

    ; Add each result row to the bytes at the destination and store.
    ; mm0 is the scratch register for every store.
    MMX_StoreDiff4P mm3, mm0, mm6, mm7, [r0]
    MMX_StoreDiff4P mm4, mm0, mm6, mm7, [r0+r1]
    lea     r0, [r0+2*r1]                   ; advance two rows
    MMX_StoreDiff4P mm1, mm0, mm6, mm7, [r0]
    MMX_StoreDiff4P mm2, mm0, mm6, mm7, [r0+r1]

    emms                                    ; clear MMX state before returning
    ret
;void WelsBlockZero16x16_sse2(int16_t * block, int32_t stride);
; Writes zeros to 16 rows of 32 bytes each, starting at [r0]; r0 advances
; by 2*stride bytes per row.
; movdqa is used, so every row address must be 16-byte aligned.
; NOTE(review): assumes LOAD_2_PARA (not defined in this listing) puts
; block in r0 and stride in r1, and that stride counts int16_t elements
; (hence the doubling) - confirm.
WELS_EXTERN WelsBlockZero16x16_sse2
    %assign push_num 0
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d
    pxor    xmm0, xmm0              ; xmm0 = 0
    shl     r1, 1                   ; stride * 2
%rep 16
    movdqa  [r0+16], xmm0           ; upper 16 bytes of the row
    movdqa  [r0], xmm0              ; lower 16 bytes of the row
    add     r0, r1                  ; next row
%endrep
    ret
;void WelsBlockZero8x8_sse2(int16_t * block, int32_t stride);
; Writes zeros to 8 rows of 16 bytes each, starting at [r0]; r0 advances
; by 2*stride bytes per row.
; movdqa is used, so every row address must be 16-byte aligned.
; NOTE(review): assumes LOAD_2_PARA (not defined in this listing) puts
; block in r0 and stride in r1, and that stride counts int16_t elements
; (hence the doubling) - confirm.
WELS_EXTERN WelsBlockZero8x8_sse2
    %assign push_num 0
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d
    pxor    xmm0, xmm0              ; xmm0 = 0
    shl     r1, 1                   ; stride * 2
%rep 8
    movdqa  [r0], xmm0              ; clear one 16-byte row
    add     r0, r1                  ; next row
%endrep
    ret
相关文章推荐
- Corosync+Pacemaker+DRBD实现Mysql服务的高可用 推荐
- Assembly x64 Intro - Dct.asm of OpenH264 Encode
- 使用shell和awk批量处理二进制数据
- 监控
- nginx配置location总结及rewrite规则写法
- Linux awk sed
- centos yum 错误处理
- Linux内核哈希表分析与应用
- shell记录
- eclipse 启动tomcat一直处在starting状态
- shell 常用命令之三 grep
- 深入浅出linux内核源代码之双向链表list_head(下)
- 深入浅出linux内核源代码之双向链表list_head(上)
- 查看Linux版本系统信息方法汇总
- imeOptions使用注意
- linux screen 命令行终端切换
- Corosync+Pacemaker实现web集群高可用
- 《Linux内核Makefile分析》之 if_changed_rule/cc_o_c/any-prereq/arg-check
- 需要交互的shell编程——EOF(转载)
- meizu手机的虚拟键盘被popwindow覆盖的解决办法