156 lines
3.7 KiB
NASM
156 lines
3.7 KiB
NASM
; AMD64 mpn_addmul_1 optimised for Intel Broadwell.
|
|
;
|
|
; Copyright 2015 Free Software Foundation, Inc.
|
|
;
|
|
; This file is part of the GNU MP Library.
|
|
;
|
|
; The GNU MP Library is free software; you can redistribute it and/or modify
|
|
; it under the terms of either:
|
|
;
|
|
; * the GNU Lesser General Public License as published by the Free
|
|
; Software Foundation; either version 3 of the License, or (at your
|
|
; option) any later version.
|
|
;
|
|
; or
|
|
;
|
|
; * the GNU General Public License as published by the Free Software
|
|
; Foundation; either version 2 of the License, or (at your option) any
|
|
; later version.
|
|
;
|
|
; or both in parallel, as here.
|
|
;
|
|
; The GNU MP Library is distributed in the hope that it will be useful, but
|
|
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
; for more details.
|
|
;
|
|
; You should have received copies of the GNU General Public License and the
|
|
; GNU Lesser General Public License along with the GNU MP Library. If not,
|
|
; see https://www.gnu.org/licenses/.
|
|
;
|
|
; mp_limb_t mpn_addmul_1(mp_ptr, mp_ptr, mp_size_t, mp_limb_t)
;
;                   return   arg 1   arg 2   arg 3   arg 4
; used internally:  rax      rdi     rsi     rdx     rcx
; on entry (Win64): rax      rcx     rdx     r8      r9
|
|
|
|
%include "yasm_mac.inc"
|
|
|
|
%define reg_save_list rsi, rdi
|
|
|
|
TEXT
|
|
align 32
|
|
;-----------------------------------------------------------------------
; mp_limb_t mpn_addmul_1(mp_ptr dst, mp_ptr src, mp_size_t n, mp_limb_t v)
; (argument names are descriptive only; the prototype gives just the types)
;
; Multiplies the n limbs at src by v, adds the products into the n limbs
; at dst in place, and returns the final carry limb in rax.
;
; ABI:   entered with the Microsoft x64 argument registers rcx, rdx, r8,
;        r9; they are first re-homed into rdi, rsi, rdx, rcx.  rsi and
;        rdi are saved and restored through reg_save_list.
; Regs:  rdi = dst cursor           rsi = src cursor
;        rdx = v (implicit MULX source operand)
;        rcx = n / 8, counted down once per pass through .12
;        r8:r10 and rax:r9 = hi:lo halves of two consecutive products
; Flags: two independent carry chains are kept live across the loop:
;          CF (ADCX) - previous product's high half into the next low half
;          OF (ADOX) - low half into the destination limb
;        Both are cleared by "and r8, 7" below and are consumed at .8/.9.
;        Everything between two chain steps must leave the flags alone
;        (MOV, LEA, MULX, JRCXZ and JMP do), so do not replace LEA with
;        ADD/SUB/INC/DEC anywhere in this routine.
; Note:  n = 0 is not handled: it dispatches to .2 and accesses memory.
; NOTE(review): END_PROC appears before the main loop.  If that macro also
;        closes the unwind frame, .10-.19 lie outside it - confirm against
;        yasm_mac.inc.
;-----------------------------------------------------------------------
        FRAME_PROC mpn_addmul_1, 0, reg_save_list
        mov     rdi, rcx                ; rdi = dst
        mov     rsi, rdx                ; rsi = src
        mov     rdx, r8                 ; rdx = n
        mov     rcx, r9                 ; rcx = v

        mov     r10, rcx                ; park v; rcx and rdx are reused below
        mov     rcx, rdx
        mov     r8, rdx
        shr     rcx, 3                  ; rcx = n / 8
        and     r8, 7                   ; r8 = n mod 8; also clears CF and OF
        mov     rdx, r10                ; rdx = v for every MULX that follows
        lea     r10, [rel .1]           ; r10 = dispatch table base
        movsxd  r8, dword [r10+r8*4]    ; signed 32-bit offset of the entry stub
        lea     r10, [r8+r10]           ; absolute stub address, flags untouched
        jmp     r10

; Dispatch table: entry-stub offsets relative to .1, indexed by n mod 8.
        align   8
.1:     dd      .2 - .1                 ; n mod 8 = 0
        dd      .7 - .1                 ; n mod 8 = 1
        dd      .10 - .1                ; n mod 8 = 2
        dd      .3 - .1                 ; n mod 8 = 3
        dd      .4 - .1                 ; n mod 8 = 4
        dd      .5 - .1                 ; n mod 8 = 5
        dd      .6 - .1                 ; n mod 8 = 6
        dd      .19 - .1                ; n mod 8 = 7

; Entry stubs.  Each one forms the product for limb 0 and then biases
; rsi/rdi so that the fixed displacements used at its join label inside
; the unrolled loop address the right limbs.

; n mod 8 = 0: the partial first pass (.13 to .11) plus the final limb
; stored at .9 already cover 8 limbs, so one pass is deducted from rcx.
.2:     mulx    r8, r10, [rsi]          ; r8:r10 = v * src[0]
        lea     rsi, [rsi-8]
        lea     rdi, [rdi-8]
        lea     rcx, [rcx-1]
        jmp     .13

; n mod 8 = 3
.3:     mulx    rax, r9, [rsi]          ; rax:r9 = v * src[0]
        lea     rsi, [rsi+16]
        lea     rdi, [rdi-48]
        jmp     .18

; n mod 8 = 4
.4:     mulx    r8, r10, [rsi]          ; r8:r10 = v * src[0]
        lea     rsi, [rsi+24]
        lea     rdi, [rdi-40]
        jmp     .17

; n mod 8 = 5
.5:     mulx    rax, r9, [rsi]          ; rax:r9 = v * src[0]
        lea     rsi, [rsi+32]
        lea     rdi, [rdi-32]
        jmp     .16

; n mod 8 = 6
.6:     mulx    r8, r10, [rsi]          ; r8:r10 = v * src[0]
        lea     rsi, [rsi+40]
        lea     rdi, [rdi-24]
        jmp     .15

; n mod 8 = 1: a single limb (rcx = 0) is finished right here at .8,
; anything longer joins the loop at .12.
.7:     mulx    rax, r9, [rsi]          ; rax:r9 = v * src[0]
        jrcxz   .8
        jmp     .12

; n = 1.  Reached only through JRCXZ, so rcx = 0 and "adc rax, rcx"
; simply adds the carry of the limb addition into the high half.
.8:     add     r9, [rdi]
        mov     [rdi], r9
        adc     rax, rcx                ; rax = high half + CF
        EXIT_PROC reg_save_list

; Loop exit.  Reached only through JRCXZ at .11, so rcx = 0: store the
; last limb, then fold both outstanding carries into the returned limb.
.9:     adox    r9, [rdi]
        mov     [rdi], r9
        adox    rax, rcx                ; rax += OF (destination-add chain)
        adc     rax, rcx                ; rax += CF (product chain)
        END_PROC reg_save_list

; Four bytes of padding, kept exactly as in the original; presumably
; placement tuning for the code that follows - TODO confirm.
        nop
        nop
        nop
        nop

; n mod 8 = 2: forms the first two products and falls through into the
; top of the loop.
.10:    mulx    r8, r10, [rsi]          ; r8:r10 = v * src[0]
        lea     rsi, [rsi+8]
        lea     rdi, [rdi+8]
        mulx    rax, r9, [rsi]          ; rax:r9 = v * src[1]

; Main loop, unrolled 8 limbs per pass.  The product halves alternate
; between r8:r10 and rax:r9.  Invariant at .11: r8:r10 is the product
; still to be added at [rdi-8] and rax:r9 the one for [rdi].
        align   32
.11:    adox    r10, [rdi-8]
        adcx    r9, r8
        mov     [rdi-8], r10
        jrcxz   .9                      ; no full pass left: finish at .9
.12:    mulx    r8, r10, [rsi+8]
        adox    r9, [rdi]
        lea     rcx, [rcx-1]            ; one pass consumed, flags untouched
        mov     [rdi], r9
        adcx    r10, rax
.13:    mulx    rax, r9, [rsi+16]
        adcx    r9, r8
        adox    r10, [rdi+8]
        mov     [rdi+8], r10
.14:    mulx    r8, r10, [rsi+24]
        lea     rsi, [rsi+64]           ; advance src; later loads use negative offsets
        adcx    r10, rax
        adox    r9, [rdi+16]
        mov     [rdi+16], r9
.15:    mulx    rax, r9, [rsi-32]
        adox    r10, [rdi+24]
        adcx    r9, r8
        mov     [rdi+24], r10
.16:    mulx    r8, r10, [rsi-24]
        adcx    r10, rax
        adox    r9, [rdi+32]
        mov     [rdi+32], r9
.17:    mulx    rax, r9, [rsi-16]
        adox    r10, [rdi+40]
        adcx    r9, r8
        mov     [rdi+40], r10
.18:    adox    r9, [rdi+48]
        mulx    r8, r10, [rsi-8]
        mov     [rdi+48], r9
        lea     rdi, [rdi+64]           ; advance dst for the next pass
        adcx    r10, rax
        mulx    rax, r9, [rsi]
        jmp     .11

; n mod 8 = 7
.19:    mulx    rax, r9, [rsi]          ; rax:r9 = v * src[0]
        lea     rsi, [rsi-16]
        lea     rdi, [rdi-16]
        jmp     .14
|
|
|
|
end
|