您的位置:首页 > 编程语言

国密SM2素域椭圆曲线快速约减算法x64编程研究(下)

2015-02-10 00:00 441 查看
摘要:本文给出完整的汇编程序代码;该代码未经严格测试和数学验证,请勿轻易用于产品代码中。

.file   "mp_mod_sm2.c"
.text
.p2align 4,,15
.globl mp_mod_sm2
.type   mp_mod_sm2, @function
mp_mod_sm2:
.LFB0:
.cfi_startproc
#       uint64_t mp_mod_sm2(uint64_t r[4], uint64_t a[8])
#
#       Fast reduction of a 512-bit integer modulo the SM2 prime
#       p = 2^256 - 2^224 - 2^96 + 2^64 - 1   (GB/T 32918 curve field).
#       System V AMD64 ABI, leaf function (no calls, no stack frame).
#
#       In:     rdi = r   output, 4 x uint64_t  -- MUST be 16-byte aligned
#               rsi = a   input,  8 x uint64_t  -- MUST be 16-byte aligned
#                         (both accessed with movdqa; unaligned pointers fault)
#                         a is treated below as sixteen 32-bit words a00..a15
#       Out:    r[0..3] = a reduced mod p (see NOTE), rax = rdx = 0
#       Clobb:  rax, rdx, rsi, r8-r11, xmm0, xmm1, xmm10-xmm15, flags
#               r12-r15 are used but saved/restored in xmm14/xmm15 below;
#               this is legal because all xmm registers are caller-saved in
#               the SysV ABI and this function makes no calls.
#
#       NOTE(review): the final carry fold is a single pass, so the stored
#       result may still be >= p in edge cases -- the original author states
#       the tail is "not mathematically rigorous".  Callers must perform a
#       final conditional subtraction of p if a canonical result is needed.
#       ---------------------------------------------------------
#       backup (r12, r13, r14, r15) in xmm scratch, avoiding the stack
#       ---------------------------------------------------------
movq    %r12, %xmm14
movq    %r14, %xmm15
pinsrq  $1, %r13, %xmm14
pinsrq  $1, %r15, %xmm15
#       ------------------------------------------------------
#       load (xmm10 ~ xmm13) = (a00 ~ a15), four 32-bit words each
#       ------------------------------------------------------
movdqa  (%rsi), %xmm10
movdqa  16(%rsi), %xmm11
movdqa  32(%rsi), %xmm12
movdqa  48(%rsi), %xmm13
#       ---------------------------------------------------------------
#       Precompute the suffix sums of the top four words (zero-extended
#       to 64 bits, so no intermediate sum can overflow):
#       ---------------------------------------------------------------
#       r15 = a15
pextrd  $3, %xmm13, %r15d
#       r14 = a14 + a15
pextrd  $2, %xmm13, %r14d
addq    %r15, %r14
#       r13 = a13 + a14 + a15
pextrd  $1, %xmm13, %r13d
addq    %r14, %r13
#       r12 = a12 + a13 + a14 + a15
movd    %xmm13, %r12d
addq    %r13, %r12
#       -------------------------------------------------
#       r8..r11 = a08..a11 (individually, zero-extended)
#       -------------------------------------------------
movd    %xmm12, %r8d
pextrd  $1, %xmm12, %r9d
pextrd  $2, %xmm12, %r10d
pextrd  $3, %xmm12, %r11d
#       ---------------------------
#       rsi = a08 + a09 + a10 + a11
#       ---------------------------
movq    %r8, %rsi
addq    %r9, %rsi
addq    %r10, %rsi
addq    %r11, %rsi
#       ------------------------------------------------------------------
#       Per-word reduction: rax accumulates each 32-bit result word plus
#       carry; the low 32 bits go into xmm0/xmm1, the high bits ("up")
#       carry into the next word via shr $32.
#
#       word0 = a00 + (a08 + a09 + a10 + a11) + (a12 + a13 + a14 + a15)
#                   + (a13 + a14 + a15)
#       ------------------------------------------------------------------
movd    %xmm10, %eax
addq    %rsi, %rax
addq    %r12, %rax
addq    %r13, %rax
movd    %eax, %xmm0
#       --------------------------------------------------------------------
#       word1 = up + a01 + (a08 + a09 + a10 + a11) + (a12 + a13 + a14 + a15)
#                  + (a14 + a15) - a08
#       --------------------------------------------------------------------
shr     $32, %rax
pextrd  $1, %xmm10, %edx
addq    %rdx, %rax
addq    %rsi, %rax
addq    %r12, %rax
addq    %r14, %rax
subq    %r8, %rax
pinsrd  $1, %eax, %xmm0
#       --------------------------------------------------------------
#       word2 = up + a02 + 2^34 - a08 - a09 - a13 - a14
#       NOTE(review): the 2^34 term is presumably a borrow-avoidance
#       bias (keeps rax non-negative through the four subtractions),
#       compensated by the -4 / -1 corrections in later words -- part
#       of a precomputed multiple of p; not re-derived here, confirm
#       against the reduction derivation in part 1 of the article.
#       --------------------------------------------------------------
shr     $32, %rax
pextrd  $2, %xmm10, %edx
addq    %rdx, %rax
movq    $1, %rdx
shl     $34, %rdx
addq    %rdx, %rax
subq    %r8, %rax
subq    %r9, %rax
pextrd  $1, %xmm13, %edx
subq    %rdx, %rax
pextrd  $2, %xmm13, %edx
subq    %rdx, %rax
pinsrd  $2, %eax, %xmm0
#       ----------------------------------------------------------------
#       word3 = up + a03 + 2^32 + (a12 + a13 + a14 + a15)
#                  + a08 + a11 + a13 - 4
#       ----------------------------------------------------------------
shr     $32, %rax
pextrd  $3, %xmm10, %edx
addq    %rdx, %rax
movq    $1, %rdx
shl     $32, %rdx
addq    %rdx, %rax
addq    %r12, %rax
addq    %r8, %rax
addq    %r11, %rax
pextrd  $1, %xmm13, %edx
addq    %rdx, %rax
subq    $4, %rax
pinsrd  $3, %eax, %xmm0
#       ----------------------------------------------------------
#       word4 = up + a04 + (a12 + a13 + a14 + a15) + a09 + a14 - 1
#       ----------------------------------------------------------
shr     $32, %rax
movd    %xmm11, %edx
addq    %rdx, %rax
addq    %r12, %rax
addq    %r9, %rax
pextrd  $2, %xmm13, %edx
addq    %rdx, %rax
decq    %rax
movd    %eax, %xmm1
#       ------------------------------------------------
#       word5 = up + a05 + (a13 + a14 + a15) + a10 + a15
#       ------------------------------------------------
shr     $32, %rax
pextrd  $1, %xmm11, %edx
addq    %rdx, %rax
addq    %r13, %rax
addq    %r10, %rax
addq    %r15, %rax
pinsrd  $1, %eax, %xmm1
#       ------------------------------------
#       word6 = up + a06 + (a14 + a15) + a11
#       ------------------------------------
shr     $32, %rax
pextrd  $2, %xmm11, %edx
addq    %rdx, %rax
addq    %r14, %rax
addq    %r11, %rax
pinsrd  $2, %eax, %xmm1
#       --------------------------------------------------------------------
#       word7 = up + a07 + (a08 + a09 + a10 + a11)
#                  + 2*(a12 + a13 + a14 + a15) + a15
#       --------------------------------------------------------------------
shr     $32, %rax
pextrd  $3, %xmm11, %edx
addq    %rdx, %rax
addq    %rsi, %rax
addq    %r12, %rax
addq    %r12, %rax
addq    %r15, %rax
pinsrd  $3, %eax, %xmm1
#       ------------------------------------------------------------------
#       Final carry fold: rax holds the overflow c above 2^256.  Since
#       2^256 = 2^224 + 2^96 - 2^64 + 1 (mod p), fold c back in as
#       c + c*(2^96 - 2^64) + c*2^224 across the four 64-bit limbs:
#         r12 (limb0) += c
#         r13 (limb1) += c<<32 - c      (= c * (2^96 - 2^64) / 2^64)
#         r14 (limb2) += 0
#         r15 (limb3) += c<<32          (= c * 2^224 / 2^192)
#       NOTE(review): single pass -- a carry out of limb3 here is
#       dropped, hence the "not fully reduced" caveat in the header.
#       ------------------------------------------------------------------
movq    %xmm0, %r12
movq    %xmm1, %r14
pextrq  $1, %xmm0, %r13
pextrq  $1, %xmm1, %r15
shr     $32, %rax
movq    %rax, %rdx
shl     $32, %rdx
movq    %rdx, %rsi
subq    %rax, %rsi
addq    %rax, %r12
adcq    %rsi, %r13
adcq    $0, %r14
adcq    %rdx, %r15
movq    %r12, %xmm0
movq    %r14, %xmm1
pinsrq  $1, %r13, %xmm0
pinsrq  $1, %r15, %xmm1
#       ----------------------------
#       restore (r12, r13, r14, r15)
#       ----------------------------
movq    %xmm14, %r12
movq    %xmm15, %r14
pextrq  $1, %xmm14, %r13
pextrq  $1, %xmm15, %r15
#       -----------------------------------------
#       store result: r[0..3] = (xmm0, xmm1)
#       -----------------------------------------
movdqa  %xmm0, (%rdi)
movdqa  %xmm1, 16(%rdi)
#       return 0 (rax:rdx cleared)
#       NOTE(review): emms looks unnecessary -- only XMM (SSE) registers
#       are used above, and emms affects MMX/x87 state only; harmless
#       but slow, candidate for removal after testing.
emms
xorq    %rax, %rax
xorq    %rdx, %rdx
ret
.cfi_endproc
.LFE0:
.size   mp_mod_sm2, .-mp_mod_sm2
.ident  "GCC: (GNU) 4.4.7 20120313 (Red Hat 4.4.7-4)"
.section        .note.GNU-stack,"",@progbits

最后的收尾代码从数学上讲有些不够严谨(进位折叠只做了一轮,结果可能未完全约减到小于 p),不过要改也很容易啦。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
相关文章推荐