Add latest assembler code to the Windows build

This commit is contained in:
Brian Gladman 2016-12-22 16:53:24 +00:00
parent 5167ce8705
commit ac5ed04440
9 changed files with 1164 additions and 170 deletions

View File

@ -58,6 +58,7 @@ r_x = r'(?:x?mm1[0-5]|x?mm\d)|(?:mmx1[0-5]|mmx\d)|(?:st\([0-7]\))'
p_r1 = r'(?:\s*%(?P<reg1>' + r_b + r_w + r_d + r_q + r_x + r'))'
p_r2 = r'(?:\s*%(?P<reg2>' + r_b + r_w + r_d + r_q + r_x + r'))'
p_r3 = r'(?:\s*%(?P<reg3>' + r_b + r_w + r_d + r_q + r_x + r'))'
p_r4 = r'(?:\s*%(?P<reg4>' + r_b + r_w + r_d + r_q + r_x + r'))'
# regular expression for immediate (numeric, not symbolic)
@ -99,6 +100,7 @@ m_g7 = re.compile(p_in + r'\s+\*' + p_r1)
m_f8 = re.compile(p_in + p_im)
m_f9 = re.compile(p_in + p_lr)
m_fa = re.compile(p_in + '(?:' + p_im + '|' + p_r1 + r')\s*,' + p_r2 + r'\s*,' + p_r3)
m_fb = re.compile(p_in + p_t1 + r'\s*,' + p_r3 + r'\s*,' + p_r4)
m_la = re.compile(p_la)
m_jt = re.compile(p_jt)
@ -266,6 +268,15 @@ def pass_three(code, labels, macros, level):
e = d['imm'] if d['imm'] else d['reg1']
lo += [lp + '\t{0[ins]:7s} {0[reg3]}, {0[reg2]}, {1}'.format(d, e)]
continue
m = m_fb.search(l)
if m:
d = m.groupdict()
if debug:
print(l, end = '')
s = addr(d, labels, macros, mac_name)
lo += [lp + '\t{0[ins]:7s} {0[reg4]}, {0[reg3]}, {1}'.format(d, s)]
continue
# ins reg, dis(reg, reg, off)
m = m_f1.search(l)
@ -477,6 +488,11 @@ def pass_three(code, labels, macros, level):
lo += [lp + '{0}'.format(l.rstrip(string.whitespace))]
continue
m = re.search(r'\s*\.(align\s+[0-9]*)', l)
if m:
lo += [lp + '\t{0}'.format(m.group(1))]
continue
m = re.search(r'\s*(\S+)', l)
if m:
if len(l):

View File

@ -623,8 +623,7 @@ postbuild "$(TargetPath)" 14
<YASM Include="..\..\mpn\x86_64w\haswell\rsh_divrem_hensel_qr_1_2.asm" />
<YASM Include="..\..\mpn\x86_64w\haswell\rshift.asm" />
<YASM Include="..\..\mpn\x86_64w\haswell\rshift1.asm" />
<YASM Include="..\..\mpn\x86_64w\sqr_basecase.asm" />
<YASM Include="..\..\mpn\x86_64w\haswell\sqr_basecase..asm" />
<YASM Include="..\..\mpn\x86_64w\haswell\sqr_basecase.asm" />
<YASM Include="..\..\mpn\x86_64w\haswell\store.asm" />
<YASM Include="..\..\mpn\x86_64w\sub_err1_n.asm" />
<YASM Include="..\..\mpn\x86_64w\sub_err2_n.asm" />

View File

@ -1581,10 +1581,7 @@
<YASM Include="..\..\mpn\x86_64w\haswell\rshift1.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
<YASM Include="..\..\mpn\x86_64w\sqr_basecase.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
<YASM Include="..\..\mpn\x86_64w\haswell\sqr_basecase..asm">
<YASM Include="..\..\mpn\x86_64w\haswell\sqr_basecase.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
<YASM Include="..\..\mpn\x86_64w\haswell\store.asm">

View File

@ -608,8 +608,7 @@ postbuild "$(TargetPath)" 14
<YASM Include="..\..\mpn\x86_64w\haswell\rsh_divrem_hensel_qr_1_2.asm" />
<YASM Include="..\..\mpn\x86_64w\haswell\rshift.asm" />
<YASM Include="..\..\mpn\x86_64w\haswell\rshift1.asm" />
<YASM Include="..\..\mpn\x86_64w\sqr_basecase.asm" />
<YASM Include="..\..\mpn\x86_64w\haswell\sqr_basecase..asm" />
<YASM Include="..\..\mpn\x86_64w\haswell\sqr_basecase.asm" />
<YASM Include="..\..\mpn\x86_64w\haswell\store.asm" />
<YASM Include="..\..\mpn\x86_64w\sub_err1_n.asm" />
<YASM Include="..\..\mpn\x86_64w\sub_err2_n.asm" />

View File

@ -1547,10 +1547,7 @@
<YASM Include="..\..\mpn\x86_64w\haswell\rshift1.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
<YASM Include="..\..\mpn\x86_64w\sqr_basecase.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
<YASM Include="..\..\mpn\x86_64w\haswell\sqr_basecase..asm">
<YASM Include="..\..\mpn\x86_64w\haswell\sqr_basecase.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
<YASM Include="..\..\mpn\x86_64w\haswell\store.asm">

View File

@ -1,6 +1,6 @@
; AMD64 mpn_add_n
; Copyright 2008, 2016 Jason Moxham and Alexander Kruppa
; Copyright 2016 Alexander Kruppa
; This file is part of the MPIR Library.
; The MPIR Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published
@ -27,100 +27,134 @@
%define Inp1P rdx
%define Inp2P r8
%define Size r9
%define SizeRest r11
%define LIMB1 rax
%define LIMB2 r10
%define SizeRest r11
%else
%define SumP rdi
%define Inp1P rsi
%define Inp2P rdx
%define Size rcx
%define SizeRest r11
%define LIMB1 rax
%define LIMB2 r9
%define SizeRest r10
%define LIMB2 r8
%endif
%define ADDSUB add
%define ADCSBB adc
; Skylake has problems sustaining 2 read and 1 write per clock cycle.
; It sometimes gets into a "mode" (for the lack of a better word) where
; it does not fully utilize port 7, causing store uops to compete with
; the reads for ports 2,3. We try to alleviate the problem by turning
; some of the 64-bit writes into 128-bit writes, reducing the number of
; write instructions. Unfortunately, SSE2/AVX2 do not have particularly
; good instructions for assembling an SSE2 128-bit word from two GPR
; 64-bit words, so the instruction count is greatly inflated.
%macro STORE 1
mov [SumP %1], LIMB1
mov [SumP %1 + 8], LIMB2
%endmacro
%macro SSESTORE 1
movq xmm0, LIMB1
movq xmm1, LIMB2
vpermilpd xmm1, xmm1, 0
pblendw xmm0, xmm1, 0xf0
movaps [SumP %1], xmm0
%endmacro
BITS 64
xalign 8
LEAF_PROC mpn_add_nc
mov r10,[rsp+40]
jmp entry
xalign 8
LEAF_PROC mpn_add_n
xor r10, r10
entry:
mov SizeRest, Size
and SizeRest, 7
shr Size, 3
lea Size, [r10 + 2*Size]
sar Size, 1
jnz .loop1
jmp .rest
; Make dest 16-bytes aligned
test SumP, 8
jz .aligned
dec Size
mov SizeRest, Size
and SizeRest, 7
shr Size, 3
; Unaligned and Size > 8: do one limb separately, then the normal loop
jnz .unaligned
; Unaligned and Size <= 8: do all with .rest loop
inc SizeRest
clc
jmp .rest ;ajs:notshortform
.aligned:
mov SizeRest, Size
and SizeRest, 7
shr Size, 3
clc
jz .rest ;ajs:notshortform
jmp .loop1
.unaligned:
mov LIMB1, [Inp1P]
ADDSUB LIMB1, [Inp2P]
mov [SumP], LIMB1
lea Inp1P, [Inp1P+8]
lea Inp2P, [Inp2P+8]
lea SumP, [SumP+8]
align 16
.loop1:
mov LIMB1, [Inp1P]
mov LIMB2, [Inp1P+8]
ADCSBB LIMB1, [Inp2P]
mov [SumP], LIMB1
ADCSBB LIMB2, [Inp2P+8]
mov LIMB1, [Inp1P+16]
mov [SumP+8], LIMB2
ADCSBB LIMB1, [Inp2P+16]
mov LIMB2, [Inp1P+24]
mov [SumP+16], LIMB1
mov LIMB1, [Inp1P+32]
ADCSBB LIMB2, [Inp2P+24]
mov [SumP+24], LIMB2
ADCSBB LIMB1, [Inp2P+32]
mov [SumP+32], LIMB1
mov LIMB2, [Inp1P+40]
ADCSBB LIMB2, [Inp2P+40]
mov [SumP+40], LIMB2
mov LIMB1, [Inp1P+48]
mov LIMB2, [Inp1P+56]
lea Inp1P, [Inp1P+64]
ADCSBB LIMB1, [Inp2P+48]
ADCSBB LIMB2, [Inp2P+56]
lea Inp2P, [Inp2P+64]
mov [SumP+48], LIMB1
mov [SumP+56], LIMB2
lea SumP, [SumP+64]
dec Size
jnz .loop1
inc SizeRest
dec SizeRest
jz .end
mov LIMB1, [Inp1P]
mov LIMB2, [Inp1P+8]
ADCSBB LIMB1, [Inp2P]
ADCSBB LIMB2, [Inp2P+8]
SSESTORE +0
mov LIMB1, [Inp1P+16]
mov LIMB2, [Inp1P+24]
ADCSBB LIMB1, [Inp2P+16]
ADCSBB LIMB2, [Inp2P+24]
STORE +16
mov LIMB1, [Inp1P+32]
mov LIMB2, [Inp1P+40]
ADCSBB LIMB1, [Inp2P+32]
ADCSBB LIMB2, [Inp2P+40]
STORE +32
mov LIMB1, [Inp1P+48]
mov LIMB2, [Inp1P+56]
ADCSBB LIMB1, [Inp2P+48]
ADCSBB LIMB2, [Inp2P+56]
STORE +48
lea Inp1P, [Inp1P+64]
lea Inp2P, [Inp2P+64]
lea SumP, [SumP+64]
dec Size
jnz .loop1
inc SizeRest
dec SizeRest
jz .end
.rest:
mov LIMB1, [Inp1P]
ADCSBB LIMB1, [Inp2P]
mov [SumP], LIMB1
dec SizeRest
jz .end
mov LIMB1, [Inp1P+8]
ADCSBB LIMB1, [Inp2P+8]
mov [SumP+8], LIMB1
dec SizeRest
jz .end
mov LIMB1, [Inp1P+16]
ADCSBB LIMB1, [Inp2P+16]
mov [SumP+16], LIMB1
dec SizeRest
jz .end
mov LIMB1, [Inp1P+24]
ADCSBB LIMB1, [Inp2P+24]
mov [SumP+24], LIMB1
dec SizeRest
jz .end
lea Inp1P, [Inp1P+32]
lea Inp2P, [Inp2P+32]
lea SumP, [SumP+32]
jmp .rest
mov LIMB1, [Inp1P]
ADCSBB LIMB1, [Inp2P]
mov [SumP], LIMB1
dec SizeRest
jz .end
mov LIMB1, [Inp1P+8]
ADCSBB LIMB1, [Inp2P+8]
mov [SumP+8], LIMB1
dec SizeRest
jz .end
mov LIMB1, [Inp1P+16]
ADCSBB LIMB1, [Inp2P+16]
mov [SumP+16], LIMB1
dec SizeRest
jz .end
mov LIMB1, [Inp1P+24]
ADCSBB LIMB1, [Inp2P+24]
mov [SumP+24], LIMB1
dec SizeRest
jz .end
lea Inp1P, [Inp1P+32]
lea Inp2P, [Inp2P+32]
lea SumP, [SumP+32]
jmp .rest
.end:
mov eax, 0
adc eax, eax
mov eax, 0
adc eax, eax
ret

View File

@ -0,0 +1,244 @@
; mp_limb_t mpn_mul_basecase(mp_ptr, mp_ptr, mp_size_t, mp_ptr, mp_size_t)
; rax rdi rsi rdx rcx r8
; rax rcx rdx r8 r9 [rsp+40]
; with 1st size >= 2nd size
%include 'yasm_mac.inc'
%define reg_save_list rsi, rdi, rbx, rbp, r12, r14
BITS 64
align 16
LEAF_PROC mpn_mul_basecase
cmp r8, 2
ja .4
mov r8, rdx
mov rdx, [r9]
mulx r11, rax, [r8]
mov [rcx], rax
je .1
.0: mov [rcx+8], r11
ret
.1: cmp qword [rsp+40], 2
mulx r10, rax, [r8+8]
je .3
.2: add r11, rax
adc r10, 0
mov [rcx+8], r11
mov [rcx+16], r10
ret
.3: add r11, rax
adc r10, 0
mov rdx, [r9+8]
mov rax, r8
mulx r9, r8, [rax]
mulx rdx, rax, [rax+8]
add rax, r9
adc rdx, 0
add r11, r8
adc r10, rax
adc rdx, 0
mov [rcx+8], r11
mov [rcx+16], r10
mov [rcx+24], rdx
ret
align 16
.4:
FRAME_PROC mpn_mul_bc, 0, reg_save_list
mov rdi, rcx
mov rsi, rdx
mov rdx, r8
mov rcx, r9
mov r8, [rsp+stack_use+40]
mov r14, rcx
lea rbx, [rdx+1]
mov rbp, rdx
mov eax, edx
and rbx, -8
shr rbp, 3
neg rbx
and eax, 7
mov rcx, rbp
mov rdx, [r14]
lea r14, [r14+8]
lea r10, [rel .41]
movsxd r11, dword [r10+rax*4]
lea r10, [r11+r10]
jmp r10
.5: mulx r11, r10, [rsi]
lea rsi, [rsi+56]
lea rdi, [rdi-8]
jmp .15
.6: mulx r9, r12, [rsi]
lea rsi, [rsi+16]
lea rdi, [rdi+16]
inc rcx
jmp .20
.7: mulx r11, r10, [rsi]
lea rsi, [rsi+24]
lea rdi, [rdi+24]
inc rcx
jmp .19
.8: mulx r9, r12, [rsi]
lea rsi, [rsi+32]
lea rdi, [rdi+32]
inc rcx
jmp .18
.9: mulx r11, r10, [rsi]
lea rsi, [rsi+40]
lea rdi, [rdi+40]
inc rcx
jmp .17
.10:mulx r9, r12, [rsi]
lea rsi, [rsi+48]
lea rdi, [rdi+48]
inc rcx
jmp .16
.11:mulx r9, r12, [rsi]
jmp .14
.12:mulx r11, r10, [rsi]
lea rsi, [rsi+8]
lea rdi, [rdi+8]
mulx r9, r12, [rsi]
align 16
.13:mov [rdi-8], r10
adc r12, r11
.14:mulx r11, r10, [rsi+8]
adc r10, r9
lea rsi, [rsi+64]
mov [rdi], r12
.15:mov [rdi+8], r10
mulx r9, r12, [rsi-48]
lea rdi, [rdi+64]
adc r12, r11
.16:mulx r11, r10, [rsi-40]
mov [rdi-48], r12
adc r10, r9
.17:mov [rdi-40], r10
mulx r9, r12, [rsi-32]
adc r12, r11
.18:mulx r11, r10, [rsi-24]
mov [rdi-32], r12
adc r10, r9
.19:mulx r9, r12, [rsi-16]
mov [rdi-24], r10
adc r12, r11
.20:mulx r11, r10, [rsi-8]
adc r10, r9
mov [rdi-16], r12
dec rcx
mulx r9, r12, [rsi]
jnz .13
.21:mov [rdi-8], r10
adc r12, r11
mov [rdi], r12
adc r9, rcx
mov [rdi+8], r9
dec r8
jz .31
lea r10, [rel .42]
movsxd rax, dword [r10+rax*4]
lea rax, [rax+r10]
.22:lea rsi, [rsi+rbx*8]
mov rcx, rbp
mov rdx, [r14]
lea r14, [r14+8]
jmp rax
.23:mulx r11, r10, [rsi+8]
lea rdi, [rdi+rbx*8+8]
lea rcx, [rcx-1]
jmp .35
.24:mulx r9, r12, [rsi-16]
lea rdi, [rdi+rbx*8-56]
jmp .40
.25:mulx r11, r10, [rsi-24]
lea rdi, [rdi+rbx*8-56]
jmp .39
.26:mulx r9, r12, [rsi-32]
lea rdi, [rdi+rbx*8-56]
jmp .38
.27:mulx r11, r10, [rsi-40]
lea rdi, [rdi+rbx*8-56]
jmp .37
.28:mulx r9, r12, [rsi+16]
lea rdi, [rdi+rbx*8+8]
jmp .36
.29:mulx r9, r12, [rsi]
lea rdi, [rdi+rbx*8+8]
jmp .34
.30:adox r12, [rdi]
adox r9, rcx
mov [rdi], r12
adc r9, rcx
mov [rdi+8], r9
dec r8
jnz .22
.31:
END_PROC reg_save_list
.32:mulx r11, r10, [rsi-8]
lea rdi, [rdi+rbx*8+8]
mulx r9, r12, [rsi]
align 16
.33:adox r10, [rdi-8]
adcx r12, r11
mov [rdi-8], r10
jrcxz .30
.34:mulx r11, r10, [rsi+8]
adox r12, [rdi]
lea rcx, [rcx-1]
mov [rdi], r12
adcx r10, r9
.35:mulx r9, r12, [rsi+16]
adcx r12, r11
adox r10, [rdi+8]
mov [rdi+8], r10
.36:mulx r11, r10, [rsi+24]
lea rsi, [rsi+64]
adcx r10, r9
adox r12, [rdi+16]
mov [rdi+16], r12
.37:mulx r9, r12, [rsi-32]
adox r10, [rdi+24]
adcx r12, r11
mov [rdi+24], r10
.38:mulx r11, r10, [rsi-24]
adcx r10, r9
adox r12, [rdi+32]
mov [rdi+32], r12
.39:mulx r9, r12, [rsi-16]
adox r10, [rdi+40]
adcx r12, r11
mov [rdi+40], r10
.40:adox r12, [rdi+48]
mulx r11, r10, [rsi-8]
mov [rdi+48], r12
lea rdi, [rdi+64]
adcx r10, r9
mulx r9, r12, [rsi]
jmp .33
align 8
.41:
dd .5 - .41
dd .11 - .41
dd .12 - .41
dd .6 - .41
dd .7 - .41
dd .8 - .41
dd .9 - .41
dd .10 - .41
.42:
dd .23 - .42
dd .29 - .42
dd .32 - .42
dd .24 - .42
dd .25 - .42
dd .26 - .42
dd .27 - .42
dd .28 - .42

View File

@ -0,0 +1,675 @@
; AMD64 mpn_sqr_basecase optimised for Intel Broadwell.
; Copyright 2015 Free Software Foundation, Inc.
; This file is part of the GNU MP Library.
;
; The GNU MP Library is free software; you can redistribute it and/or modify
; it under the terms of either:
;
; * the GNU Lesser General Public License as published by the Free
; Software Foundation; either version 3 of the License, or (at your
; option) any later version.
;
; or
;
; * the GNU General Public License as published by the Free Software
; Foundation; either version 2 of the License, or (at your option) any
; later version.
;
; or both in parallel, as here.
;
; The GNU MP Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
; for more details.
;
; You should have received copies of the GNU General Public License and the
; GNU Lesser General Public License along with the GNU MP Library. If not,
; see https://www.gnu.org/licenses/.
;
; void mpn_sqr_basecase(mp_ptr, mp_srcptr, mp_size_t)
; Linux rdi rsi rdx
; Win64 rcx rdx r8
%include 'yasm_mac.inc'
%define reg_save_list rsi, rdi, rbx
TEXT
align 16
LEAF_PROC mpn_sqr_basecase
cmp r8, 2
jae .1
mov rdx, [rdx]
mulx rdx, rax, rdx
mov [rcx], rax
mov [rcx+8], rdx
ret
.1: jne .2
mov r11, [rdx+8]
mov rdx, [rdx]
mulx r10, r9, r11
mulx r8, rax, rdx
mov rdx, r11
mulx rdx, r11, rdx
add r9, r9
adc r10, r10
adc rdx, 0
add r8, r9
adc r10, r11
adc rdx, 0
mov [rcx], rax
mov [rcx+8], r8
mov [rcx+16], r10
mov [rcx+24], rdx
ret
.2:
FRAME_PROC ?mpn_sqb, 0, reg_save_list
mov rdi, rcx
mov rsi, rdx
mov rdx, r8
cmp rdx, 4
jae .3
mov rdx, [rsi]
mulx r11, r10, [rsi+8]
mulx r9, r8, [rsi+16]
add r8, r11
mov rdx, [rsi+8]
mulx r11, rax, [rsi+16]
adc r9, rax
adc r11, 0
test ebx, ebx
mov rdx, [rsi]
mulx rcx, rbx, rdx
mov [rdi], rbx
mov rdx, [rsi+8]
mulx rbx, rax, rdx
mov rdx, [rsi+16]
mulx rdx, rsi, rdx
adcx r10, r10
adcx r8, r8
adcx r9, r9
adcx r11, r11
adox rcx, r10
adox rax, r8
adox rbx, r9
adox rsi, r11
mov r8d, 0
adox rdx, r8
adcx rdx, r8
mov [rdi+8], rcx
mov [rdi+16], rax
mov [rdi+24], rbx
mov [rdi+32], rsi
mov [rdi+40], rdx
EXIT_PROC reg_save_list
.3: mov [rsp+stack_use+8], rdi
mov [rsp+stack_use+16], rsi
mov [rsp+stack_use+24], rdx
lea ebx, [rdx-3]
lea rcx, [rdx+5]
mov eax, edx
and ebx, -8
shr ecx, 3
neg rbx
and eax, 7
mov rdx, [rsi]
lea r10, [rel .58]
movsxd r8, dword [r10+rax*4]
lea r10, [r8+r10]
jmp r10
.4: mulx r11, r10, [rsi+8]
lea rsi, [rsi+64]
jmp .14
.5: mulx r9, r8, [rsi+8]
lea rsi, [rsi+24]
lea rdi, [rdi+24]
jmp .19
.6: mulx r11, r10, [rsi+8]
lea rsi, [rsi+32]
lea rdi, [rdi+32]
jmp .18
.7: mulx r9, r8, [rsi+8]
lea rsi, [rsi+40]
lea rdi, [rdi+40]
jmp .17
.8: mulx r11, r10, [rsi+8]
lea rsi, [rsi+48]
lea rdi, [rdi+48]
jmp .16
.9: mulx r9, r8, [rsi+8]
lea rsi, [rsi+56]
lea rdi, [rdi+56]
jmp .15
.10:mulx r9, r8, [rsi+8]
lea rsi, [rsi+8]
lea rdi, [rdi+8]
jmp .13
.11:mulx r11, r10, [rsi+8]
lea rsi, [rsi+16]
lea rdi, [rdi+16]
dec ecx
mulx r9, r8, [rsi]
align 16
.12:mov [rdi-8], r10
adc r8, r11
.13:mulx r11, r10, [rsi+8]
adc r10, r9
lea rsi, [rsi+64]
mov [rdi], r8
.14:mov [rdi+8], r10
mulx r9, r8, [rsi-48]
lea rdi, [rdi+64]
adc r8, r11
.15:mulx r11, r10, [rsi-40]
mov [rdi-48], r8
adc r10, r9
.16:mov [rdi-40], r10
mulx r9, r8, [rsi-32]
adc r8, r11
.17:mulx r11, r10, [rsi-24]
mov [rdi-32], r8
adc r10, r9
.18:mulx r9, r8, [rsi-16]
mov [rdi-24], r10
adc r8, r11
.19:mulx r11, r10, [rsi-8]
adc r10, r9
mov [rdi-16], r8
dec ecx
mulx r9, r8, [rsi]
jnz .12
.20:mov [rdi-8], r10
adc r8, r11
mov [rdi], r8
adc r9, rcx
mov [rdi+8], r9
lea r10, [rel .59]
movsxd r11, dword [r10+rax*4]
lea r11, [r11+r10]
jmp r11
.21:adox r8, [rdi]
adox r9, rcx
mov [rdi], r8
adc r9, rcx
mov [rdi+8], r9
.22:lea rsi, [rsi+rbx*8-64]
or ecx, ebx
mov rdx, [rsi+8]
mulx r9, r8, [rsi+16]
lea rdi, [rdi+rbx*8-56]
jmp .51
align 16
.23:adox r10, [rdi-8]
adcx r8, r11
mov [rdi-8], r10
jrcxz .21
mulx r11, r10, [rsi+8]
adox r8, [rdi]
lea ecx, [rcx+8]
mov [rdi], r8
adcx r10, r9
.24:mulx r9, r8, [rsi+16]
adcx r8, r11
adox r10, [rdi+8]
mov [rdi+8], r10
mulx r11, r10, [rsi+24]
lea rsi, [rsi+64]
adcx r10, r9
adox r8, [rdi+16]
mov [rdi+16], r8
mulx r9, r8, [rsi-32]
adox r10, [rdi+24]
adcx r8, r11
mov [rdi+24], r10
mulx r11, r10, [rsi-24]
adcx r10, r9
adox r8, [rdi+32]
mov [rdi+32], r8
mulx r9, r8, [rsi-16]
adox r10, [rdi+40]
adcx r8, r11
mov [rdi+40], r10
adox r8, [rdi+48]
mulx r11, r10, [rsi-8]
mov [rdi+48], r8
lea rdi, [rdi+64]
adcx r10, r9
mulx r9, r8, [rsi]
jmp .23
.25:adox r8, [rdi]
adox r9, rcx
mov [rdi], r8
adc r9, rcx
mov [rdi+8], r9
.26:lea rsi, [rsi+rbx*8-64]
or ecx, ebx
mov rdx, [rsi]
mulx r11, r10, [rsi+8]
lea rdi, [rdi+rbx*8-56]
jmp .24
align 16
.27:adox r10, [rdi-8]
adcx r8, r11
mov [rdi-8], r10
jrcxz .25
.28:mulx r11, r10, [rsi+8]
adox r8, [rdi]
lea ecx, [rcx+8]
mov [rdi], r8
adcx r10, r9
mulx r9, r8, [rsi+16]
adcx r8, r11
adox r10, [rdi+8]
mov [rdi+8], r10
mulx r11, r10, [rsi+24]
lea rsi, [rsi+64]
adcx r10, r9
adox r8, [rdi+16]
mov [rdi+16], r8
mulx r9, r8, [rsi-32]
adox r10, [rdi+24]
adcx r8, r11
mov [rdi+24], r10
mulx r11, r10, [rsi-24]
adcx r10, r9
adox r8, [rdi+32]
mov [rdi+32], r8
mulx r9, r8, [rsi-16]
adox r10, [rdi+40]
adcx r8, r11
mov [rdi+40], r10
adox r8, [rdi+48]
mulx r11, r10, [rsi-8]
mov [rdi+48], r8
lea rdi, [rdi+64]
adcx r10, r9
mulx r9, r8, [rsi]
jmp .27
.29:adox r8, [rdi]
adox r9, rcx
mov [rdi], r8
adc r9, rcx
mov [rdi+8], r9
.30:lea rsi, [rsi+rbx*8]
or ecx, ebx
lea rbx, [rbx+8]
mov rdx, [rsi-8]
mulx r9, r8, [rsi]
lea rdi, [rdi+rbx*8-56]
jmp .28
align 16
.31:adox r10, [rdi-8]
adcx r8, r11
mov [rdi-8], r10
jrcxz .29
mulx r11, r10, [rsi+8]
adox r8, [rdi]
lea ecx, [rcx+8]
mov [rdi], r8
adcx r10, r9
mulx r9, r8, [rsi+16]
adcx r8, r11
adox r10, [rdi+8]
mov [rdi+8], r10
mulx r11, r10, [rsi+24]
lea rsi, [rsi+64]
adcx r10, r9
adox r8, [rdi+16]
mov [rdi+16], r8
mulx r9, r8, [rsi-32]
adox r10, [rdi+24]
adcx r8, r11
mov [rdi+24], r10
mulx r11, r10, [rsi-24]
adcx r10, r9
adox r8, [rdi+32]
mov [rdi+32], r8
mulx r9, r8, [rsi-16]
adox r10, [rdi+40]
adcx r8, r11
mov [rdi+40], r10
adox r8, [rdi+48]
mulx r11, r10, [rsi-8]
mov [rdi+48], r8
lea rdi, [rdi+64]
adcx r10, r9
mulx r9, r8, [rsi]
jmp .31
.32:adox r8, [rdi]
adox r9, rcx
mov [rdi], r8
adc r9, rcx
mov [rdi+8], r9
.33:lea rsi, [rsi+rbx*8]
or ecx, ebx
jz .53
mov rdx, [rsi-16]
mulx r11, r10, [rsi-8]
lea rdi, [rdi+rbx*8+8]
mulx r9, r8, [rsi]
jmp .31
align 16
.34:adox r10, [rdi-8]
adcx r8, r11
mov [rdi-8], r10
jrcxz .32
mulx r11, r10, [rsi+8]
adox r8, [rdi]
lea ecx, [rcx+8]
mov [rdi], r8
adcx r10, r9
mulx r9, r8, [rsi+16]
adcx r8, r11
adox r10, [rdi+8]
mov [rdi+8], r10
mulx r11, r10, [rsi+24]
lea rsi, [rsi+64]
adcx r10, r9
adox r8, [rdi+16]
mov [rdi+16], r8
mulx r9, r8, [rsi-32]
adox r10, [rdi+24]
adcx r8, r11
mov [rdi+24], r10
mulx r11, r10, [rsi-24]
adcx r10, r9
adox r8, [rdi+32]
mov [rdi+32], r8
mulx r9, r8, [rsi-16]
adox r10, [rdi+40]
adcx r8, r11
mov [rdi+40], r10
.35:adox r8, [rdi+48]
mulx r11, r10, [rsi-8]
mov [rdi+48], r8
lea rdi, [rdi+64]
adcx r10, r9
mulx r9, r8, [rsi]
jmp .34
.36:adox r8, [rdi]
adox r9, rcx
mov [rdi], r8
adc r9, rcx
mov [rdi+8], r9
.37:lea rsi, [rsi+rbx*8]
or ecx, ebx
jz .52
mov rdx, [rsi-24]
mulx r9, r8, [rsi-16]
lea rdi, [rdi+rbx*8-56]
jmp .35
align 16
.38:adox r10, [rdi-8]
adcx r8, r11
mov [rdi-8], r10
jrcxz .36
mulx r11, r10, [rsi+8]
adox r8, [rdi]
lea ecx, [rcx+8]
mov [rdi], r8
adcx r10, r9
mulx r9, r8, [rsi+16]
adcx r8, r11
adox r10, [rdi+8]
mov [rdi+8], r10
mulx r11, r10, [rsi+24]
lea rsi, [rsi+64]
adcx r10, r9
adox r8, [rdi+16]
mov [rdi+16], r8
mulx r9, r8, [rsi-32]
adox r10, [rdi+24]
adcx r8, r11
mov [rdi+24], r10
mulx r11, r10, [rsi-24]
adcx r10, r9
adox r8, [rdi+32]
mov [rdi+32], r8
.39:mulx r9, r8, [rsi-16]
adox r10, [rdi+40]
adcx r8, r11
mov [rdi+40], r10
adox r8, [rdi+48]
mulx r11, r10, [rsi-8]
mov [rdi+48], r8
lea rdi, [rdi+64]
adcx r10, r9
mulx r9, r8, [rsi]
jmp .38
.40:adox r8, [rdi]
adox r9, rcx
mov [rdi], r8
adc r9, rcx
mov [rdi+8], r9
.41:lea rsi, [rsi+rbx*8]
or ecx, ebx
mov rdx, [rsi-32]
mulx r11, r10, [rsi-24]
lea rdi, [rdi+rbx*8-56]
jmp .39
align 16
.42:adox r10, [rdi-8]
adcx r8, r11
mov [rdi-8], r10
jrcxz .40
mulx r11, r10, [rsi+8]
adox r8, [rdi]
lea ecx, [rcx+8]
mov [rdi], r8
adcx r10, r9
mulx r9, r8, [rsi+16]
adcx r8, r11
adox r10, [rdi+8]
mov [rdi+8], r10
mulx r11, r10, [rsi+24]
lea rsi, [rsi+64]
adcx r10, r9
adox r8, [rdi+16]
mov [rdi+16], r8
mulx r9, r8, [rsi-32]
adox r10, [rdi+24]
adcx r8, r11
mov [rdi+24], r10
.43:mulx r11, r10, [rsi-24]
adcx r10, r9
adox r8, [rdi+32]
mov [rdi+32], r8
mulx r9, r8, [rsi-16]
adox r10, [rdi+40]
adcx r8, r11
mov [rdi+40], r10
adox r8, [rdi+48]
mulx r11, r10, [rsi-8]
mov [rdi+48], r8
lea rdi, [rdi+64]
adcx r10, r9
mulx r9, r8, [rsi]
jmp .42
.44:adox r8, [rdi]
adox r9, rcx
mov [rdi], r8
adc r9, rcx
mov [rdi+8], r9
.45:lea rsi, [rsi+rbx*8]
or ecx, ebx
mov rdx, [rsi-40]
mulx r9, r8, [rsi-32]
lea rdi, [rdi+rbx*8-56]
jmp .43
align 16
.46:adox r10, [rdi-8]
adcx r8, r11
mov [rdi-8], r10
jrcxz .44
mulx r11, r10, [rsi+8]
adox r8, [rdi]
lea ecx, [rcx+8]
mov [rdi], r8
adcx r10, r9
mulx r9, r8, [rsi+16]
adcx r8, r11
adox r10, [rdi+8]
mov [rdi+8], r10
mulx r11, r10, [rsi+24]
lea rsi, [rsi+64]
adcx r10, r9
adox r8, [rdi+16]
mov [rdi+16], r8
.47:mulx r9, r8, [rsi-32]
adox r10, [rdi+24]
adcx r8, r11
mov [rdi+24], r10
mulx r11, r10, [rsi-24]
adcx r10, r9
adox r8, [rdi+32]
mov [rdi+32], r8
mulx r9, r8, [rsi-16]
adox r10, [rdi+40]
adcx r8, r11
mov [rdi+40], r10
adox r8, [rdi+48]
mulx r11, r10, [rsi-8]
mov [rdi+48], r8
lea rdi, [rdi+64]
adcx r10, r9
mulx r9, r8, [rsi]
jmp .46
.48:adox r8, [rdi]
adox r9, rcx
mov [rdi], r8
adc r9, rcx
mov [rdi+8], r9
.49:lea rsi, [rsi+rbx*8]
or ecx, ebx
mov rdx, [rsi-48]
mulx r11, r10, [rsi-40]
lea rdi, [rdi+rbx*8-56]
jmp .47
align 16
.50:adox r10, [rdi-8]
adcx r8, r11
mov [rdi-8], r10
jrcxz .48
mulx r11, r10, [rsi+8]
adox r8, [rdi]
lea ecx, [rcx+8]
mov [rdi], r8
adcx r10, r9
mulx r9, r8, [rsi+16]
adcx r8, r11
adox r10, [rdi+8]
mov [rdi+8], r10
.51:mulx r11, r10, [rsi+24]
lea rsi, [rsi+64]
adcx r10, r9
adox r8, [rdi+16]
mov [rdi+16], r8
mulx r9, r8, [rsi-32]
adox r10, [rdi+24]
adcx r8, r11
mov [rdi+24], r10
mulx r11, r10, [rsi-24]
adcx r10, r9
adox r8, [rdi+32]
mov [rdi+32], r8
mulx r9, r8, [rsi-16]
adox r10, [rdi+40]
adcx r8, r11
mov [rdi+40], r10
adox r8, [rdi+48]
mulx r11, r10, [rsi-8]
mov [rdi+48], r8
lea rdi, [rdi+64]
adcx r10, r9
mulx r9, r8, [rsi]
jmp .50
.52:
mov rdx, [rsi-24]
mulx r9, r8, [rsi-16]
adox r8, [rdi-8]
mulx r11, r10, [rsi-8]
mov [rdi-8], r8
lea rdi, [rdi+8]
adcx r10, r9
mulx r9, r8, [rsi]
adox r10, [rdi-8]
adcx r8, r11
mov [rdi-8], r10
adox r8, [rdi]
adox r9, rcx
adcx r9, rcx
.53:
mov rdx, [rsi-16]
mulx r11, r10, [rsi-8]
mulx rbx, rax, [rsi]
adox r10, r8
adcx rax, r11
mov [rdi], r10
adox rax, r9
adox rbx, rcx
mov [rdi+8], rax
adc rbx, rcx
mov rdx, [rsi-8]
mulx rdx, rax, [rsi]
add rax, rbx
mov [rdi+16], rax
adc rdx, rcx
mov [rdi+24], rdx
.54:
mov rdi, [rsp+stack_use+8]
mov rsi, [rsp+stack_use+16]
mov rcx, [rsp+stack_use+24]
dec ecx
mov rdx, [rsi]
xor ebx, ebx
mulx r10, rax, rdx
mov [rdi], rax
mov r8, [rdi+8]
mov r9, [rdi+16]
jmp .56
align 16
.55:mov r8, [rdi+24]
mov r9, [rdi+32]
lea rdi, [rdi+16]
lea r10, [rdx+rbx]
.56:adc r8, r8
adc r9, r9
setc bl
mov rdx, [rsi+8]
lea rsi, [rsi+8]
mulx rdx, rax, rdx
add r8, r10
adc r9, rax
mov [rdi+8], r8
mov [rdi+16], r9
dec ecx
jnz .55
.57:adc rdx, rbx
mov [rdi+24], rdx
END_PROC reg_save_list
align 8
.58:
dd .9 - .58
dd .4 - .58
dd .10 - .58
dd .11 - .58
dd .5 - .58
dd .6 - .58
dd .7 - .58
dd .8 - .58
.59:
dd .49 - .59
dd .22 - .59
dd .26 - .59
dd .30 - .59
dd .33 - .59
dd .37 - .59
dd .41 - .59
dd .45 - .59
end

View File

@ -1,6 +1,6 @@
; AMD64 mpn_sub_n
; Copyright 2008, 2016 Jason Moxham and Alexander Kruppa
; Copyright 2016 Alexander Kruppa
; This file is part of the MPIR Library.
; The MPIR Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published
@ -15,112 +15,145 @@
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
; Boston, MA 02110-1301, USA.
; (rdi,rcx) = (rsi,rcx)+(rdx,rcx)
; rax = carry
%define USE_WIN64
; (rdi,rcx) = (rsi,rcx)-(rdx,rcx)
; rax = borrow
%include 'yasm_mac.inc'
%define USE_WIN64
%ifdef USE_WIN64
%define SumP rcx
%define Inp1P rdx
%define Inp2P r8
%define Size r9
%define SizeRest r11
%define LIMB1 rax
%define LIMB2 r10
%define SizeRest r11
%else
%define SumP rdi
%define Inp1P rsi
%define Inp2P rdx
%define Size rcx
%define SizeRest r11
%define LIMB1 rax
%define LIMB2 r9
%define SizeRest r10
%define LIMB2 r8
%endif
%define ADDSUB sub
%define ADCSBB sbb
; Skylake has problems sustaining 2 read and 1 write per clock cycle.
; It sometimes gets into a "mode" (for the lack of a better word) where
; it does not fully utilize port 7, causing store uops to compete with
; the reads for ports 2,3. We try to alleviate the problem by turning
; some of the 64-bit writes into 128-bit writes, reducing the number of
; write instructions. Unfortunately, SSE2/AVX2 do not have particularly
; good instructions for assembling an SSE2 128-bit word from two GPR
; 64-bit words, so the instruction count is greatly inflated.
%macro STORE 1
mov [SumP %1], LIMB1
mov [SumP %1 + 8], LIMB2
%endmacro
%macro SSESTORE 1
movq xmm0, LIMB1
movq xmm1, LIMB2
vpermilpd xmm1, xmm1, 0
pblendw xmm0, xmm1, 0xf0
movaps [SumP %1], xmm0
%endmacro
BITS 64
xalign 8
LEAF_PROC mpn_sub_nc
mov r10,[rsp+40]
jmp entry
LEAF_PROC mpn_sub_n
; Make dest 16-bytes aligned
test SumP, 8
jz .aligned
dec Size
mov SizeRest, Size
and SizeRest, 7
shr Size, 3
; Unaligned and Size > 8: do one limb separately, then the normal loop
jnz .unaligned
; Unaligned and Size <= 8: do all with .rest loop
inc SizeRest
clc
jmp .rest ;ajs:notshortform
xalign 8
LEAF_PROC mpn_sub_n
xor r10, r10
entry:
mov SizeRest, Size
and SizeRest, 7
shr Size, 3
lea Size, [r10 + 2*Size]
sar Size, 1
jnz .loop1
jmp .rest
.aligned:
mov SizeRest, Size
and SizeRest, 7
shr Size, 3
clc
jz .rest ;ajs:notshortform
jmp .loop1
.unaligned:
mov LIMB1, [Inp1P]
ADDSUB LIMB1, [Inp2P]
mov [SumP], LIMB1
lea Inp1P, [Inp1P+8]
lea Inp2P, [Inp2P+8]
lea SumP, [SumP+8]
align 16
.loop1:
mov LIMB1, [Inp1P]
mov LIMB2, [Inp1P+8]
ADCSBB LIMB1, [Inp2P]
mov [SumP], LIMB1
ADCSBB LIMB2, [Inp2P+8]
mov LIMB1, [Inp1P+16]
mov [SumP+8], LIMB2
ADCSBB LIMB1, [Inp2P+16]
mov LIMB2, [Inp1P+24]
mov [SumP+16], LIMB1
mov LIMB1, [Inp1P+32]
ADCSBB LIMB2, [Inp2P+24]
mov [SumP+24], LIMB2
ADCSBB LIMB1, [Inp2P+32]
mov [SumP+32], LIMB1
mov LIMB2, [Inp1P+40]
ADCSBB LIMB2, [Inp2P+40]
mov [SumP+40], LIMB2
mov LIMB1, [Inp1P+48]
mov LIMB2, [Inp1P+56]
lea Inp1P, [Inp1P+64]
ADCSBB LIMB1, [Inp2P+48]
ADCSBB LIMB2, [Inp2P+56]
lea Inp2P, [Inp2P+64]
mov [SumP+48], LIMB1
mov [SumP+56], LIMB2
lea SumP, [SumP+64]
dec Size
jnz .loop1
inc SizeRest
dec SizeRest
jz .end
mov LIMB1, [Inp1P]
mov LIMB2, [Inp1P+8]
ADCSBB LIMB1, [Inp2P]
ADCSBB LIMB2, [Inp2P+8]
SSESTORE +0
mov LIMB1, [Inp1P+16]
mov LIMB2, [Inp1P+24]
ADCSBB LIMB1, [Inp2P+16]
ADCSBB LIMB2, [Inp2P+24]
STORE +16
mov LIMB1, [Inp1P+32]
mov LIMB2, [Inp1P+40]
ADCSBB LIMB1, [Inp2P+32]
ADCSBB LIMB2, [Inp2P+40]
STORE +32
mov LIMB1, [Inp1P+48]
mov LIMB2, [Inp1P+56]
ADCSBB LIMB1, [Inp2P+48]
ADCSBB LIMB2, [Inp2P+56]
STORE +48
lea Inp1P, [Inp1P+64]
lea Inp2P, [Inp2P+64]
lea SumP, [SumP+64]
dec Size
jnz .loop1
inc SizeRest
dec SizeRest
jz .end
.rest:
mov LIMB1, [Inp1P]
ADCSBB LIMB1, [Inp2P]
mov [SumP], LIMB1
dec SizeRest
jz .end
mov LIMB1, [Inp1P+8]
ADCSBB LIMB1, [Inp2P+8]
mov [SumP+8], LIMB1
dec SizeRest
jz .end
mov LIMB1, [Inp1P+16]
ADCSBB LIMB1, [Inp2P+16]
mov [SumP+16], LIMB1
dec SizeRest
jz .end
mov LIMB1, [Inp1P+24]
ADCSBB LIMB1, [Inp2P+24]
mov [SumP+24], LIMB1
dec SizeRest
jz .end
lea Inp1P, [Inp1P+32]
lea Inp2P, [Inp2P+32]
lea SumP, [SumP+32]
jmp .rest
mov LIMB1, [Inp1P]
ADCSBB LIMB1, [Inp2P]
mov [SumP], LIMB1
dec SizeRest
jz .end
mov LIMB1, [Inp1P+8]
ADCSBB LIMB1, [Inp2P+8]
mov [SumP+8], LIMB1
dec SizeRest
jz .end
mov LIMB1, [Inp1P+16]
ADCSBB LIMB1, [Inp2P+16]
mov [SumP+16], LIMB1
dec SizeRest
jz .end
mov LIMB1, [Inp1P+24]
ADCSBB LIMB1, [Inp2P+24]
mov [SumP+24], LIMB1
dec SizeRest
jz .end
lea Inp1P, [Inp1P+32]
lea Inp2P, [Inp2P+32]
lea SumP, [SumP+32]
jmp .rest
.end:
mov eax, 0
adc eax, eax
mov eax, 0
adc eax, eax
ret