add latest assembler code to Windows
This commit is contained in:
parent
5167ce8705
commit
ac5ed04440
@ -58,6 +58,7 @@ r_x = r'(?:x?mm1[0-5]|x?mm\d)|(?:mmx1[0-5]|mmx\d)|(?:st\([0-7]\))'
|
||||
p_r1 = r'(?:\s*%(?P<reg1>' + r_b + r_w + r_d + r_q + r_x + r'))'
|
||||
p_r2 = r'(?:\s*%(?P<reg2>' + r_b + r_w + r_d + r_q + r_x + r'))'
|
||||
p_r3 = r'(?:\s*%(?P<reg3>' + r_b + r_w + r_d + r_q + r_x + r'))'
|
||||
p_r4 = r'(?:\s*%(?P<reg4>' + r_b + r_w + r_d + r_q + r_x + r'))'
|
||||
|
||||
# regular expression for immediate (numeric, not symbolic)
|
||||
|
||||
@ -99,6 +100,7 @@ m_g7 = re.compile(p_in + r'\s+\*' + p_r1)
|
||||
m_f8 = re.compile(p_in + p_im)
|
||||
m_f9 = re.compile(p_in + p_lr)
|
||||
m_fa = re.compile(p_in + '(?:' + p_im + '|' + p_r1 + r')\s*,' + p_r2 + r'\s*,' + p_r3)
|
||||
m_fb = re.compile(p_in + p_t1 + r'\s*,' + p_r3 + r'\s*,' + p_r4)
|
||||
|
||||
m_la = re.compile(p_la)
|
||||
m_jt = re.compile(p_jt)
|
||||
@ -266,6 +268,15 @@ def pass_three(code, labels, macros, level):
|
||||
e = d['imm'] if d['imm'] else d['reg1']
|
||||
lo += [lp + '\t{0[ins]:7s} {0[reg3]}, {0[reg2]}, {1}'.format(d, e)]
|
||||
continue
|
||||
|
||||
m = m_fb.search(l)
|
||||
if m:
|
||||
d = m.groupdict()
|
||||
if debug:
|
||||
print(l, end = '')
|
||||
s = addr(d, labels, macros, mac_name)
|
||||
lo += [lp + '\t{0[ins]:7s} {0[reg4]}, {0[reg3]}, {1}'.format(d, s)]
|
||||
continue
|
||||
|
||||
# ins reg, dis(reg, reg, off)
|
||||
m = m_f1.search(l)
|
||||
@ -477,6 +488,11 @@ def pass_three(code, labels, macros, level):
|
||||
lo += [lp + '{0}'.format(l.rstrip(string.whitespace))]
|
||||
continue
|
||||
|
||||
m = re.search(r'\s*\.(align\s+[0-9]*)', l)
|
||||
if m:
|
||||
lo += [lp + '\t{0}'.format(m.group(1))]
|
||||
continue
|
||||
|
||||
m = re.search(r'\s*(\S+)', l)
|
||||
if m:
|
||||
if len(l):
|
||||
|
@ -623,8 +623,7 @@ postbuild "$(TargetPath)" 14
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\rsh_divrem_hensel_qr_1_2.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\rshift.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\rshift1.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\sqr_basecase.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\sqr_basecase..asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\sqr_basecase.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\store.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\sub_err1_n.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\sub_err2_n.asm" />
|
||||
|
@ -1581,10 +1581,7 @@
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\rshift1.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\sqr_basecase.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\sqr_basecase..asm">
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\sqr_basecase.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\store.asm">
|
||||
|
@ -608,8 +608,7 @@ postbuild "$(TargetPath)" 14
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\rsh_divrem_hensel_qr_1_2.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\rshift.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\rshift1.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\sqr_basecase.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\sqr_basecase..asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\sqr_basecase.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\store.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\sub_err1_n.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\sub_err2_n.asm" />
|
||||
|
@ -1547,10 +1547,7 @@
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\rshift1.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\sqr_basecase.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\sqr_basecase..asm">
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\sqr_basecase.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\store.asm">
|
||||
|
@ -1,6 +1,6 @@
|
||||
|
||||
; AMD64 mpn_add_n
|
||||
; Copyright 2008, 2016 Jason Moxham and Alexander Kruppa
|
||||
; Copyright 2016 Alexander Kruppa
|
||||
; This file is part of the MPIR Library.
|
||||
; The MPIR Library is free software; you can redistribute it and/or modify
|
||||
; it under the terms of the GNU Lesser General Public License as published
|
||||
@ -27,100 +27,134 @@
|
||||
%define Inp1P rdx
|
||||
%define Inp2P r8
|
||||
%define Size r9
|
||||
%define SizeRest r11
|
||||
%define LIMB1 rax
|
||||
%define LIMB2 r10
|
||||
%define SizeRest r11
|
||||
%else
|
||||
%define SumP rdi
|
||||
%define Inp1P rsi
|
||||
%define Inp2P rdx
|
||||
%define Size rcx
|
||||
%define SizeRest r11
|
||||
%define LIMB1 rax
|
||||
%define LIMB2 r9
|
||||
%define SizeRest r10
|
||||
%define LIMB2 r8
|
||||
%endif
|
||||
|
||||
%define ADDSUB add
|
||||
%define ADCSBB adc
|
||||
|
||||
; Skylake has problems sustaining 2 read and 1 write per clock cycle.
|
||||
; It sometimes gets into a "mode" (for the lack of a better word) where
|
||||
; it does not fully utilize port 7, causing store uops to compete with
|
||||
; the reads for ports 2,3. We try to alleviate the problem by turning
|
||||
; some of the 64-bit writes into 128-bit writes, reducing the number of
|
||||
; write instructions. Unfortunately, SSE2/AVX2 do not have particularly
|
||||
; good instructions for assembling an SSE2 128-bit word from two GPR
|
||||
; 64-bit words, so the instruction count is greatly inflated.
|
||||
|
||||
%macro STORE 1
|
||||
mov [SumP %1], LIMB1
|
||||
mov [SumP %1 + 8], LIMB2
|
||||
%endmacro
|
||||
|
||||
%macro SSESTORE 1
|
||||
movq xmm0, LIMB1
|
||||
movq xmm1, LIMB2
|
||||
vpermilpd xmm1, xmm1, 0
|
||||
pblendw xmm0, xmm1, 0xf0
|
||||
movaps [SumP %1], xmm0
|
||||
%endmacro
|
||||
|
||||
|
||||
BITS 64
|
||||
|
||||
xalign 8
|
||||
LEAF_PROC mpn_add_nc
|
||||
mov r10,[rsp+40]
|
||||
jmp entry
|
||||
|
||||
xalign 8
|
||||
LEAF_PROC mpn_add_n
|
||||
xor r10, r10
|
||||
entry:
|
||||
mov SizeRest, Size
|
||||
and SizeRest, 7
|
||||
shr Size, 3
|
||||
lea Size, [r10 + 2*Size]
|
||||
sar Size, 1
|
||||
jnz .loop1
|
||||
jmp .rest
|
||||
; Make dest 16-bytes aligned
|
||||
test SumP, 8
|
||||
jz .aligned
|
||||
dec Size
|
||||
mov SizeRest, Size
|
||||
and SizeRest, 7
|
||||
shr Size, 3
|
||||
; Unaligned and Size > 8: do one limb separately, then the normal loop
|
||||
jnz .unaligned
|
||||
; Unaligned and Size <= 8: do all with .rest loop
|
||||
inc SizeRest
|
||||
clc
|
||||
jmp .rest ;ajs:notshortform
|
||||
|
||||
.aligned:
|
||||
mov SizeRest, Size
|
||||
and SizeRest, 7
|
||||
shr Size, 3
|
||||
clc
|
||||
jz .rest ;ajs:notshortform
|
||||
jmp .loop1
|
||||
|
||||
.unaligned:
|
||||
mov LIMB1, [Inp1P]
|
||||
ADDSUB LIMB1, [Inp2P]
|
||||
mov [SumP], LIMB1
|
||||
lea Inp1P, [Inp1P+8]
|
||||
lea Inp2P, [Inp2P+8]
|
||||
lea SumP, [SumP+8]
|
||||
|
||||
align 16
|
||||
.loop1:
|
||||
mov LIMB1, [Inp1P]
|
||||
mov LIMB2, [Inp1P+8]
|
||||
ADCSBB LIMB1, [Inp2P]
|
||||
mov [SumP], LIMB1
|
||||
ADCSBB LIMB2, [Inp2P+8]
|
||||
mov LIMB1, [Inp1P+16]
|
||||
mov [SumP+8], LIMB2
|
||||
ADCSBB LIMB1, [Inp2P+16]
|
||||
mov LIMB2, [Inp1P+24]
|
||||
mov [SumP+16], LIMB1
|
||||
mov LIMB1, [Inp1P+32]
|
||||
ADCSBB LIMB2, [Inp2P+24]
|
||||
mov [SumP+24], LIMB2
|
||||
ADCSBB LIMB1, [Inp2P+32]
|
||||
mov [SumP+32], LIMB1
|
||||
mov LIMB2, [Inp1P+40]
|
||||
ADCSBB LIMB2, [Inp2P+40]
|
||||
mov [SumP+40], LIMB2
|
||||
mov LIMB1, [Inp1P+48]
|
||||
mov LIMB2, [Inp1P+56]
|
||||
lea Inp1P, [Inp1P+64]
|
||||
ADCSBB LIMB1, [Inp2P+48]
|
||||
ADCSBB LIMB2, [Inp2P+56]
|
||||
lea Inp2P, [Inp2P+64]
|
||||
mov [SumP+48], LIMB1
|
||||
mov [SumP+56], LIMB2
|
||||
lea SumP, [SumP+64]
|
||||
dec Size
|
||||
jnz .loop1
|
||||
inc SizeRest
|
||||
dec SizeRest
|
||||
jz .end
|
||||
mov LIMB1, [Inp1P]
|
||||
mov LIMB2, [Inp1P+8]
|
||||
ADCSBB LIMB1, [Inp2P]
|
||||
ADCSBB LIMB2, [Inp2P+8]
|
||||
SSESTORE +0
|
||||
mov LIMB1, [Inp1P+16]
|
||||
mov LIMB2, [Inp1P+24]
|
||||
ADCSBB LIMB1, [Inp2P+16]
|
||||
ADCSBB LIMB2, [Inp2P+24]
|
||||
STORE +16
|
||||
mov LIMB1, [Inp1P+32]
|
||||
mov LIMB2, [Inp1P+40]
|
||||
ADCSBB LIMB1, [Inp2P+32]
|
||||
ADCSBB LIMB2, [Inp2P+40]
|
||||
STORE +32
|
||||
mov LIMB1, [Inp1P+48]
|
||||
mov LIMB2, [Inp1P+56]
|
||||
ADCSBB LIMB1, [Inp2P+48]
|
||||
ADCSBB LIMB2, [Inp2P+56]
|
||||
STORE +48
|
||||
lea Inp1P, [Inp1P+64]
|
||||
lea Inp2P, [Inp2P+64]
|
||||
lea SumP, [SumP+64]
|
||||
dec Size
|
||||
jnz .loop1
|
||||
inc SizeRest
|
||||
dec SizeRest
|
||||
jz .end
|
||||
.rest:
|
||||
mov LIMB1, [Inp1P]
|
||||
ADCSBB LIMB1, [Inp2P]
|
||||
mov [SumP], LIMB1
|
||||
dec SizeRest
|
||||
jz .end
|
||||
mov LIMB1, [Inp1P+8]
|
||||
ADCSBB LIMB1, [Inp2P+8]
|
||||
mov [SumP+8], LIMB1
|
||||
dec SizeRest
|
||||
jz .end
|
||||
mov LIMB1, [Inp1P+16]
|
||||
ADCSBB LIMB1, [Inp2P+16]
|
||||
mov [SumP+16], LIMB1
|
||||
dec SizeRest
|
||||
jz .end
|
||||
mov LIMB1, [Inp1P+24]
|
||||
ADCSBB LIMB1, [Inp2P+24]
|
||||
mov [SumP+24], LIMB1
|
||||
dec SizeRest
|
||||
jz .end
|
||||
lea Inp1P, [Inp1P+32]
|
||||
lea Inp2P, [Inp2P+32]
|
||||
lea SumP, [SumP+32]
|
||||
jmp .rest
|
||||
mov LIMB1, [Inp1P]
|
||||
ADCSBB LIMB1, [Inp2P]
|
||||
mov [SumP], LIMB1
|
||||
dec SizeRest
|
||||
jz .end
|
||||
mov LIMB1, [Inp1P+8]
|
||||
ADCSBB LIMB1, [Inp2P+8]
|
||||
mov [SumP+8], LIMB1
|
||||
dec SizeRest
|
||||
jz .end
|
||||
mov LIMB1, [Inp1P+16]
|
||||
ADCSBB LIMB1, [Inp2P+16]
|
||||
mov [SumP+16], LIMB1
|
||||
dec SizeRest
|
||||
jz .end
|
||||
mov LIMB1, [Inp1P+24]
|
||||
ADCSBB LIMB1, [Inp2P+24]
|
||||
mov [SumP+24], LIMB1
|
||||
dec SizeRest
|
||||
jz .end
|
||||
lea Inp1P, [Inp1P+32]
|
||||
lea Inp2P, [Inp2P+32]
|
||||
lea SumP, [SumP+32]
|
||||
jmp .rest
|
||||
.end:
|
||||
mov eax, 0
|
||||
adc eax, eax
|
||||
mov eax, 0
|
||||
adc eax, eax
|
||||
ret
|
||||
|
244
mpn/x86_64w/skylake/avx/mul_basecase.asm
Normal file
244
mpn/x86_64w/skylake/avx/mul_basecase.asm
Normal file
@ -0,0 +1,244 @@
|
||||
; mp_limb_t mpn_mul_basecase(mp_ptr, mp_ptr, mp_size_t, mp_ptr, mp_size_t)
|
||||
; Linux rax rdi rsi rdx rcx r8
|
||||
; Win64 rax rcx rdx r8 r9 [rsp+40]
|
||||
; with 1st size >= 2nd size
|
||||
|
||||
%include 'yasm_mac.inc'
|
||||
|
||||
%define reg_save_list rsi, rdi, rbx, rbp, r12, r14
|
||||
|
||||
BITS 64
|
||||
align 16
|
||||
|
||||
; Entry point plus straight-line code for the smallest operand sizes.
; Win64 register use on entry (per the file header):
;   rcx = result pointer, rdx = first operand pointer, r8 = first size,
;   r9  = second operand pointer, [rsp+40] = second size.
; Sizes 1x1, 2x1 and 2x2 are handled here without touching any
; callee-saved register; a first size above 2 branches to .4.
LEAF_PROC mpn_mul_basecase
|
||||
; Flags set here are consumed twice: by the ja below and by the je after
; the first mulx (mov and mulx leave the flags untouched).
cmp r8, 2
|
||||
; first size > 2: take the general path
ja .4
|
||||
; r8 = first operand pointer; rdx is needed as mulx's implicit multiplicand
mov r8, rdx
|
||||
; rdx = limb 0 of the second operand
mov rdx, [r9]
|
||||
; r11:rax = first[0] * second[0]
mulx r11, rax, [r8]
|
||||
; result limb 0
mov [rcx], rax
|
||||
; first size == 2 (still the result of the cmp above)
je .1
|
||||
; 1x1 product: store the high limb and return
.0: mov [rcx+8], r11
|
||||
ret
|
||||
; First size is 2: check whether the second size is 2 as well.
.1: cmp qword [rsp+40], 2
|
||||
; r10:rax = first[1] * second[0] (does not disturb the flags from the cmp)
mulx r10, rax, [r8+8]
|
||||
je .3
|
||||
; 2x1 product: fold the second partial product into limbs 1 and 2
.2: add r11, rax
|
||||
adc r10, 0
|
||||
mov [rcx+8], r11
|
||||
mov [rcx+16], r10
|
||||
ret
|
||||
; 2x2 product: r10:r11 first receives limbs 2:1 of first * second[0]
.3: add r11, rax
|
||||
adc r10, 0
|
||||
; rdx = limb 1 of the second operand
mov rdx, [r9+8]
|
||||
; keep the first operand pointer in rax because r8 is overwritten next
mov rax, r8
|
||||
; r9:r8 = first[0] * second[1]
mulx r9, r8, [rax]
|
||||
; rdx:rax = first[1] * second[1]
mulx rdx, rax, [rax+8]
|
||||
; rdx:rax:r8 = first * second[1]
add rax, r9
|
||||
adc rdx, 0
|
||||
; accumulate that row into result limbs 1..3
add r11, r8
|
||||
adc r10, rax
|
||||
adc rdx, 0
|
||||
mov [rcx+8], r11
|
||||
mov [rcx+16], r10
|
||||
mov [rcx+24], rdx
|
||||
ret
|
||||
|
||||
align 16
|
||||
; General path (first size > 2): set up the frame, remap the Win64
; argument registers and dispatch into the first-pass multiply loop.
.4:
|
||||
; FRAME_PROC comes from yasm_mac.inc (not visible here); presumably it
; saves the registers named in reg_save_list and defines stack_use.
; NOTE(review): confirm against yasm_mac.inc.
FRAME_PROC mpn_mul_bc, 0, reg_save_list
|
||||
; Move the Win64 arguments into the rdi/rsi/rdx/rcx/r8 layout listed in
; the file header, which the rest of the routine uses.
mov rdi, rcx
|
||||
mov rsi, rdx
|
||||
mov rdx, r8
|
||||
mov rcx, r9
|
||||
; fifth argument (second size); stack_use presumably accounts for the
; bytes FRAME_PROC pushed -- TODO confirm
mov r8, [rsp+stack_use+40]
|
||||
|
||||
; r14 = second operand pointer
mov r14, rcx
|
||||
; rbx = -((first size + 1) & -8), completed by the and/neg below
lea rbx, [rdx+1]
|
||||
; rbp = first size >> 3, completed by the shr below
mov rbp, rdx
|
||||
; eax = first size & 7, completed by the and below; jump-table index
mov eax, edx
|
||||
and rbx, -8
|
||||
shr rbp, 3
|
||||
neg rbx
|
||||
and eax, 7
|
||||
; rcx = loop counter for the first pass
mov rcx, rbp
|
||||
; rdx = limb 0 of the second operand (mulx multiplicand), then advance r14
mov rdx, [r14]
|
||||
lea r14, [r14+8]
|
||||
; Dispatch through table .41: each entry is a 32-bit offset relative to
; the table itself, selected by first size & 7.
lea r10, [rel .41]
|
||||
movsxd r11, dword [r10+rax*4]
|
||||
lea r10, [r11+r10]
|
||||
jmp r10
|
||||
.5: mulx r11, r10, [rsi]
|
||||
lea rsi, [rsi+56]
|
||||
lea rdi, [rdi-8]
|
||||
jmp .15
|
||||
.6: mulx r9, r12, [rsi]
|
||||
lea rsi, [rsi+16]
|
||||
lea rdi, [rdi+16]
|
||||
inc rcx
|
||||
jmp .20
|
||||
.7: mulx r11, r10, [rsi]
|
||||
lea rsi, [rsi+24]
|
||||
lea rdi, [rdi+24]
|
||||
inc rcx
|
||||
jmp .19
|
||||
.8: mulx r9, r12, [rsi]
|
||||
lea rsi, [rsi+32]
|
||||
lea rdi, [rdi+32]
|
||||
inc rcx
|
||||
jmp .18
|
||||
.9: mulx r11, r10, [rsi]
|
||||
lea rsi, [rsi+40]
|
||||
lea rdi, [rdi+40]
|
||||
inc rcx
|
||||
jmp .17
|
||||
.10:mulx r9, r12, [rsi]
|
||||
lea rsi, [rsi+48]
|
||||
lea rdi, [rdi+48]
|
||||
inc rcx
|
||||
jmp .16
|
||||
.11:mulx r9, r12, [rsi]
|
||||
jmp .14
|
||||
.12:mulx r11, r10, [rsi]
|
||||
lea rsi, [rsi+8]
|
||||
lea rdi, [rdi+8]
|
||||
mulx r9, r12, [rsi]
|
||||
|
||||
align 16
|
||||
.13:mov [rdi-8], r10
|
||||
adc r12, r11
|
||||
.14:mulx r11, r10, [rsi+8]
|
||||
adc r10, r9
|
||||
lea rsi, [rsi+64]
|
||||
mov [rdi], r12
|
||||
.15:mov [rdi+8], r10
|
||||
mulx r9, r12, [rsi-48]
|
||||
lea rdi, [rdi+64]
|
||||
adc r12, r11
|
||||
.16:mulx r11, r10, [rsi-40]
|
||||
mov [rdi-48], r12
|
||||
adc r10, r9
|
||||
.17:mov [rdi-40], r10
|
||||
mulx r9, r12, [rsi-32]
|
||||
adc r12, r11
|
||||
.18:mulx r11, r10, [rsi-24]
|
||||
mov [rdi-32], r12
|
||||
adc r10, r9
|
||||
.19:mulx r9, r12, [rsi-16]
|
||||
mov [rdi-24], r10
|
||||
adc r12, r11
|
||||
.20:mulx r11, r10, [rsi-8]
|
||||
adc r10, r9
|
||||
mov [rdi-16], r12
|
||||
dec rcx
|
||||
mulx r9, r12, [rsi]
|
||||
jnz .13
|
||||
.21:mov [rdi-8], r10
|
||||
adc r12, r11
|
||||
mov [rdi], r12
|
||||
adc r9, rcx
|
||||
mov [rdi+8], r9
|
||||
dec r8
|
||||
jz .31
|
||||
lea r10, [rel .42]
|
||||
movsxd rax, dword [r10+rax*4]
|
||||
lea rax, [rax+r10]
|
||||
.22:lea rsi, [rsi+rbx*8]
|
||||
mov rcx, rbp
|
||||
mov rdx, [r14]
|
||||
lea r14, [r14+8]
|
||||
jmp rax
|
||||
.23:mulx r11, r10, [rsi+8]
|
||||
lea rdi, [rdi+rbx*8+8]
|
||||
lea rcx, [rcx-1]
|
||||
jmp .35
|
||||
.24:mulx r9, r12, [rsi-16]
|
||||
lea rdi, [rdi+rbx*8-56]
|
||||
jmp .40
|
||||
.25:mulx r11, r10, [rsi-24]
|
||||
lea rdi, [rdi+rbx*8-56]
|
||||
jmp .39
|
||||
.26:mulx r9, r12, [rsi-32]
|
||||
lea rdi, [rdi+rbx*8-56]
|
||||
jmp .38
|
||||
.27:mulx r11, r10, [rsi-40]
|
||||
lea rdi, [rdi+rbx*8-56]
|
||||
jmp .37
|
||||
.28:mulx r9, r12, [rsi+16]
|
||||
lea rdi, [rdi+rbx*8+8]
|
||||
jmp .36
|
||||
.29:mulx r9, r12, [rsi]
|
||||
lea rdi, [rdi+rbx*8+8]
|
||||
jmp .34
|
||||
.30:adox r12, [rdi]
|
||||
adox r9, rcx
|
||||
mov [rdi], r12
|
||||
adc r9, rcx
|
||||
mov [rdi+8], r9
|
||||
dec r8
|
||||
jnz .22
|
||||
.31:
|
||||
END_PROC reg_save_list
|
||||
|
||||
.32:mulx r11, r10, [rsi-8]
|
||||
lea rdi, [rdi+rbx*8+8]
|
||||
mulx r9, r12, [rsi]
|
||||
|
||||
align 16
|
||||
.33:adox r10, [rdi-8]
|
||||
adcx r12, r11
|
||||
mov [rdi-8], r10
|
||||
jrcxz .30
|
||||
.34:mulx r11, r10, [rsi+8]
|
||||
adox r12, [rdi]
|
||||
lea rcx, [rcx-1]
|
||||
mov [rdi], r12
|
||||
adcx r10, r9
|
||||
.35:mulx r9, r12, [rsi+16]
|
||||
adcx r12, r11
|
||||
adox r10, [rdi+8]
|
||||
mov [rdi+8], r10
|
||||
.36:mulx r11, r10, [rsi+24]
|
||||
lea rsi, [rsi+64]
|
||||
adcx r10, r9
|
||||
adox r12, [rdi+16]
|
||||
mov [rdi+16], r12
|
||||
.37:mulx r9, r12, [rsi-32]
|
||||
adox r10, [rdi+24]
|
||||
adcx r12, r11
|
||||
mov [rdi+24], r10
|
||||
.38:mulx r11, r10, [rsi-24]
|
||||
adcx r10, r9
|
||||
adox r12, [rdi+32]
|
||||
mov [rdi+32], r12
|
||||
.39:mulx r9, r12, [rsi-16]
|
||||
adox r10, [rdi+40]
|
||||
adcx r12, r11
|
||||
mov [rdi+40], r10
|
||||
.40:adox r12, [rdi+48]
|
||||
mulx r11, r10, [rsi-8]
|
||||
mov [rdi+48], r12
|
||||
lea rdi, [rdi+64]
|
||||
adcx r10, r9
|
||||
mulx r9, r12, [rsi]
|
||||
jmp .33
|
||||
|
||||
; Jump tables of 32-bit offsets, each entry relative to its own table label.
; Both tables are indexed with rax*4; rax is set to (first size & 7) before
; the first dispatch.
align 8
|
||||
; .41: entry points for the first pass (loaded right after FRAME_PROC setup)
.41:
|
||||
dd .5 - .41
|
||||
dd .11 - .41
|
||||
dd .12 - .41
|
||||
dd .6 - .41
|
||||
dd .7 - .41
|
||||
dd .8 - .41
|
||||
dd .9 - .41
|
||||
dd .10 - .41
|
||||
; .42: entry points for the later passes; the selected address is kept in
; rax and reached via "jmp rax" at .22
.42:
|
||||
dd .23 - .42
|
||||
dd .29 - .42
|
||||
dd .32 - .42
|
||||
dd .24 - .42
|
||||
dd .25 - .42
|
||||
dd .26 - .42
|
||||
dd .27 - .42
|
||||
dd .28 - .42
|
675
mpn/x86_64w/skylake/avx/sqr_basecase.asm
Normal file
675
mpn/x86_64w/skylake/avx/sqr_basecase.asm
Normal file
@ -0,0 +1,675 @@
|
||||
; AMD64 mpn_sqr_basecase optimised for Intel Broadwell.
|
||||
|
||||
; Copyright 2015 Free Software Foundation, Inc.
|
||||
|
||||
; This file is part of the GNU MP Library.
|
||||
;
|
||||
; The GNU MP Library is free software; you can redistribute it and/or modify
|
||||
; it under the terms of either:
|
||||
;
|
||||
; * the GNU Lesser General Public License as published by the Free
|
||||
; Software Foundation; either version 3 of the License, or (at your
|
||||
; option) any later version.
|
||||
;
|
||||
; or
|
||||
;
|
||||
; * the GNU General Public License as published by the Free Software
|
||||
; Foundation; either version 2 of the License, or (at your option) any
|
||||
; later version.
|
||||
;
|
||||
; or both in parallel, as here.
|
||||
;
|
||||
; The GNU MP Library is distributed in the hope that it will be useful, but
|
||||
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||||
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
; for more details.
|
||||
;
|
||||
; You should have received copies of the GNU General Public License and the
|
||||
; GNU Lesser General Public License along with the GNU MP Library. If not,
|
||||
; see https://www.gnu.org/licenses/.
|
||||
;
|
||||
; void mpn_sqr_basecase(mp_ptr, mp_srcptr, mp_size_t)
|
||||
; Linux rdi rsi rdx
|
||||
; Win64 rcx rdx r8
|
||||
|
||||
%include 'yasm_mac.inc'
|
||||
|
||||
%define reg_save_list rsi, rdi, rbx
|
||||
|
||||
TEXT
|
||||
align 16
|
||||
; Entry point plus straight-line code for sizes below 3.
; Win64 register use on entry (per the file header):
;   rcx = result pointer, rdx = source pointer, r8 = size in limbs.
LEAF_PROC mpn_sqr_basecase
|
||||
; Flags from this compare are used by the jae here and the jne at .1
cmp r8, 2
|
||||
jae .1
|
||||
; size below 2: square the single limb
mov rdx, [rdx]
|
||||
; rdx:rax = src[0]^2
mulx rdx, rax, rdx
|
||||
mov [rcx], rax
|
||||
mov [rcx+8], rdx
|
||||
ret
|
||||
; size != 2 (so size > 2 here): continue at .2
.1: jne .2
|
||||
; size == 2: r11 = src[1], rdx = src[0]
mov r11, [rdx+8]
|
||||
mov rdx, [rdx]
|
||||
; r10:r9 = src[0] * src[1] (cross product)
mulx r10, r9, r11
|
||||
; r8:rax = src[0]^2
mulx r8, rax, rdx
|
||||
mov rdx, r11
|
||||
; rdx:r11 = src[1]^2
mulx rdx, r11, rdx
|
||||
; double the cross product; its carry-out goes into the top limb
add r9, r9
|
||||
adc r10, r10
|
||||
adc rdx, 0
|
||||
; add the doubled cross product to the two squares
add r8, r9
|
||||
adc r10, r11
|
||||
adc rdx, 0
|
||||
mov [rcx], rax
|
||||
mov [rcx+8], r8
|
||||
mov [rcx+16], r10
|
||||
mov [rcx+24], rdx
|
||||
ret
|
||||
; Sizes above 2: set up the frame, remap the argument registers and
; handle size 3 inline; sizes of 4 and more branch to .3.
.2:
|
||||
; FRAME_PROC comes from yasm_mac.inc (not visible here); presumably it
; saves the registers in reg_save_list (rsi, rdi, rbx).
; NOTE(review): confirm against yasm_mac.inc.
FRAME_PROC ?mpn_sqb, 0, reg_save_list
|
||||
; Move the Win64 arguments into the Linux layout from the file header:
; rdi = result pointer, rsi = source pointer, rdx = size
mov rdi, rcx
|
||||
mov rsi, rdx
|
||||
mov rdx, r8
|
||||
|
||||
|
||||
cmp rdx, 4
|
||||
jae .3
|
||||
; size == 3. First form the off-diagonal products:
;   r11:r10 = src[0]*src[1], r9:r8 = src[0]*src[2], r11:rax = src[1]*src[2]
; summed into r11:r9:r8:r10 (the carry chain survives the mov/mulx
; instructions in between because they do not modify flags).
mov rdx, [rsi]
|
||||
mulx r11, r10, [rsi+8]
|
||||
mulx r9, r8, [rsi+16]
|
||||
add r8, r11
|
||||
mov rdx, [rsi+8]
|
||||
mulx r11, rax, [rsi+16]
|
||||
adc r9, rax
|
||||
adc r11, 0
|
||||
; test clears both CF and OF, so the adcx and adox chains below start
; with zero carries
test ebx, ebx
|
||||
; Diagonal squares: rcx:rbx = src[0]^2, rbx:rax = src[1]^2,
; rdx:rsi = src[2]^2 (rsi is no longer needed as a pointer afterwards)
mov rdx, [rsi]
|
||||
mulx rcx, rbx, rdx
|
||||
; result limb 0 = low limb of src[0]^2
mov [rdi], rbx
|
||||
mov rdx, [rsi+8]
|
||||
mulx rbx, rax, rdx
|
||||
mov rdx, [rsi+16]
|
||||
mulx rdx, rsi, rdx
|
||||
; CF chain (adcx): double the off-diagonal sum
adcx r10, r10
|
||||
adcx r8, r8
|
||||
adcx r9, r9
|
||||
adcx r11, r11
|
||||
; OF chain (adox): add the doubled limbs to the squares
adox rcx, r10
|
||||
adox rax, r8
|
||||
adox rbx, r9
|
||||
adox rsi, r11
|
||||
; zero via mov, which leaves CF and OF intact for the final two adds
mov r8d, 0
|
||||
; fold the last carry of each chain into the top limb
adox rdx, r8
|
||||
adcx rdx, r8
|
||||
mov [rdi+8], rcx
|
||||
mov [rdi+16], rax
|
||||
mov [rdi+24], rbx
|
||||
mov [rdi+32], rsi
|
||||
mov [rdi+40], rdx
|
||||
; EXIT_PROC is a yasm_mac.inc macro; presumably restores reg_save_list and
; returns -- TODO confirm
EXIT_PROC reg_save_list
|
||||
|
||||
.3: mov [rsp+stack_use+8], rdi
|
||||
mov [rsp+stack_use+16], rsi
|
||||
mov [rsp+stack_use+24], rdx
|
||||
lea ebx, [rdx-3]
|
||||
lea rcx, [rdx+5]
|
||||
mov eax, edx
|
||||
and ebx, -8
|
||||
shr ecx, 3
|
||||
neg rbx
|
||||
and eax, 7
|
||||
mov rdx, [rsi]
|
||||
lea r10, [rel .58]
|
||||
movsxd r8, dword [r10+rax*4]
|
||||
lea r10, [r8+r10]
|
||||
jmp r10
|
||||
.4: mulx r11, r10, [rsi+8]
|
||||
lea rsi, [rsi+64]
|
||||
jmp .14
|
||||
.5: mulx r9, r8, [rsi+8]
|
||||
lea rsi, [rsi+24]
|
||||
lea rdi, [rdi+24]
|
||||
jmp .19
|
||||
.6: mulx r11, r10, [rsi+8]
|
||||
lea rsi, [rsi+32]
|
||||
lea rdi, [rdi+32]
|
||||
jmp .18
|
||||
.7: mulx r9, r8, [rsi+8]
|
||||
lea rsi, [rsi+40]
|
||||
lea rdi, [rdi+40]
|
||||
jmp .17
|
||||
.8: mulx r11, r10, [rsi+8]
|
||||
lea rsi, [rsi+48]
|
||||
lea rdi, [rdi+48]
|
||||
jmp .16
|
||||
.9: mulx r9, r8, [rsi+8]
|
||||
lea rsi, [rsi+56]
|
||||
lea rdi, [rdi+56]
|
||||
jmp .15
|
||||
.10:mulx r9, r8, [rsi+8]
|
||||
lea rsi, [rsi+8]
|
||||
lea rdi, [rdi+8]
|
||||
jmp .13
|
||||
.11:mulx r11, r10, [rsi+8]
|
||||
lea rsi, [rsi+16]
|
||||
lea rdi, [rdi+16]
|
||||
dec ecx
|
||||
mulx r9, r8, [rsi]
|
||||
align 16
|
||||
.12:mov [rdi-8], r10
|
||||
adc r8, r11
|
||||
.13:mulx r11, r10, [rsi+8]
|
||||
adc r10, r9
|
||||
lea rsi, [rsi+64]
|
||||
mov [rdi], r8
|
||||
.14:mov [rdi+8], r10
|
||||
mulx r9, r8, [rsi-48]
|
||||
lea rdi, [rdi+64]
|
||||
adc r8, r11
|
||||
.15:mulx r11, r10, [rsi-40]
|
||||
mov [rdi-48], r8
|
||||
adc r10, r9
|
||||
.16:mov [rdi-40], r10
|
||||
mulx r9, r8, [rsi-32]
|
||||
adc r8, r11
|
||||
.17:mulx r11, r10, [rsi-24]
|
||||
mov [rdi-32], r8
|
||||
adc r10, r9
|
||||
.18:mulx r9, r8, [rsi-16]
|
||||
mov [rdi-24], r10
|
||||
adc r8, r11
|
||||
.19:mulx r11, r10, [rsi-8]
|
||||
adc r10, r9
|
||||
mov [rdi-16], r8
|
||||
dec ecx
|
||||
mulx r9, r8, [rsi]
|
||||
jnz .12
|
||||
.20:mov [rdi-8], r10
|
||||
adc r8, r11
|
||||
mov [rdi], r8
|
||||
adc r9, rcx
|
||||
mov [rdi+8], r9
|
||||
lea r10, [rel .59]
|
||||
movsxd r11, dword [r10+rax*4]
|
||||
lea r11, [r11+r10]
|
||||
jmp r11
|
||||
.21:adox r8, [rdi]
|
||||
adox r9, rcx
|
||||
mov [rdi], r8
|
||||
adc r9, rcx
|
||||
mov [rdi+8], r9
|
||||
.22:lea rsi, [rsi+rbx*8-64]
|
||||
or ecx, ebx
|
||||
mov rdx, [rsi+8]
|
||||
mulx r9, r8, [rsi+16]
|
||||
lea rdi, [rdi+rbx*8-56]
|
||||
jmp .51
|
||||
align 16
|
||||
.23:adox r10, [rdi-8]
|
||||
adcx r8, r11
|
||||
mov [rdi-8], r10
|
||||
jrcxz .21
|
||||
mulx r11, r10, [rsi+8]
|
||||
adox r8, [rdi]
|
||||
lea ecx, [rcx+8]
|
||||
mov [rdi], r8
|
||||
adcx r10, r9
|
||||
.24:mulx r9, r8, [rsi+16]
|
||||
adcx r8, r11
|
||||
adox r10, [rdi+8]
|
||||
mov [rdi+8], r10
|
||||
mulx r11, r10, [rsi+24]
|
||||
lea rsi, [rsi+64]
|
||||
adcx r10, r9
|
||||
adox r8, [rdi+16]
|
||||
mov [rdi+16], r8
|
||||
mulx r9, r8, [rsi-32]
|
||||
adox r10, [rdi+24]
|
||||
adcx r8, r11
|
||||
mov [rdi+24], r10
|
||||
mulx r11, r10, [rsi-24]
|
||||
adcx r10, r9
|
||||
adox r8, [rdi+32]
|
||||
mov [rdi+32], r8
|
||||
mulx r9, r8, [rsi-16]
|
||||
adox r10, [rdi+40]
|
||||
adcx r8, r11
|
||||
mov [rdi+40], r10
|
||||
adox r8, [rdi+48]
|
||||
mulx r11, r10, [rsi-8]
|
||||
mov [rdi+48], r8
|
||||
lea rdi, [rdi+64]
|
||||
adcx r10, r9
|
||||
mulx r9, r8, [rsi]
|
||||
jmp .23
|
||||
.25:adox r8, [rdi]
|
||||
adox r9, rcx
|
||||
mov [rdi], r8
|
||||
adc r9, rcx
|
||||
mov [rdi+8], r9
|
||||
.26:lea rsi, [rsi+rbx*8-64]
|
||||
or ecx, ebx
|
||||
mov rdx, [rsi]
|
||||
mulx r11, r10, [rsi+8]
|
||||
lea rdi, [rdi+rbx*8-56]
|
||||
jmp .24
|
||||
align 16
|
||||
.27:adox r10, [rdi-8]
|
||||
adcx r8, r11
|
||||
mov [rdi-8], r10
|
||||
jrcxz .25
|
||||
.28:mulx r11, r10, [rsi+8]
|
||||
adox r8, [rdi]
|
||||
lea ecx, [rcx+8]
|
||||
mov [rdi], r8
|
||||
adcx r10, r9
|
||||
mulx r9, r8, [rsi+16]
|
||||
adcx r8, r11
|
||||
adox r10, [rdi+8]
|
||||
mov [rdi+8], r10
|
||||
mulx r11, r10, [rsi+24]
|
||||
lea rsi, [rsi+64]
|
||||
adcx r10, r9
|
||||
adox r8, [rdi+16]
|
||||
mov [rdi+16], r8
|
||||
mulx r9, r8, [rsi-32]
|
||||
adox r10, [rdi+24]
|
||||
adcx r8, r11
|
||||
mov [rdi+24], r10
|
||||
mulx r11, r10, [rsi-24]
|
||||
adcx r10, r9
|
||||
adox r8, [rdi+32]
|
||||
mov [rdi+32], r8
|
||||
mulx r9, r8, [rsi-16]
|
||||
adox r10, [rdi+40]
|
||||
adcx r8, r11
|
||||
mov [rdi+40], r10
|
||||
adox r8, [rdi+48]
|
||||
mulx r11, r10, [rsi-8]
|
||||
mov [rdi+48], r8
|
||||
lea rdi, [rdi+64]
|
||||
adcx r10, r9
|
||||
mulx r9, r8, [rsi]
|
||||
jmp .27
|
||||
.29:adox r8, [rdi]
|
||||
adox r9, rcx
|
||||
mov [rdi], r8
|
||||
adc r9, rcx
|
||||
mov [rdi+8], r9
|
||||
.30:lea rsi, [rsi+rbx*8]
|
||||
or ecx, ebx
|
||||
lea rbx, [rbx+8]
|
||||
mov rdx, [rsi-8]
|
||||
mulx r9, r8, [rsi]
|
||||
lea rdi, [rdi+rbx*8-56]
|
||||
jmp .28
|
||||
align 16
|
||||
.31:adox r10, [rdi-8]
|
||||
adcx r8, r11
|
||||
mov [rdi-8], r10
|
||||
jrcxz .29
|
||||
mulx r11, r10, [rsi+8]
|
||||
adox r8, [rdi]
|
||||
lea ecx, [rcx+8]
|
||||
mov [rdi], r8
|
||||
adcx r10, r9
|
||||
mulx r9, r8, [rsi+16]
|
||||
adcx r8, r11
|
||||
adox r10, [rdi+8]
|
||||
mov [rdi+8], r10
|
||||
mulx r11, r10, [rsi+24]
|
||||
lea rsi, [rsi+64]
|
||||
adcx r10, r9
|
||||
adox r8, [rdi+16]
|
||||
mov [rdi+16], r8
|
||||
mulx r9, r8, [rsi-32]
|
||||
adox r10, [rdi+24]
|
||||
adcx r8, r11
|
||||
mov [rdi+24], r10
|
||||
mulx r11, r10, [rsi-24]
|
||||
adcx r10, r9
|
||||
adox r8, [rdi+32]
|
||||
mov [rdi+32], r8
|
||||
mulx r9, r8, [rsi-16]
|
||||
adox r10, [rdi+40]
|
||||
adcx r8, r11
|
||||
mov [rdi+40], r10
|
||||
adox r8, [rdi+48]
|
||||
mulx r11, r10, [rsi-8]
|
||||
mov [rdi+48], r8
|
||||
lea rdi, [rdi+64]
|
||||
adcx r10, r9
|
||||
mulx r9, r8, [rsi]
|
||||
jmp .31
|
||||
.32:adox r8, [rdi]
|
||||
adox r9, rcx
|
||||
mov [rdi], r8
|
||||
adc r9, rcx
|
||||
mov [rdi+8], r9
|
||||
.33:lea rsi, [rsi+rbx*8]
|
||||
or ecx, ebx
|
||||
jz .53
|
||||
mov rdx, [rsi-16]
|
||||
mulx r11, r10, [rsi-8]
|
||||
lea rdi, [rdi+rbx*8+8]
|
||||
mulx r9, r8, [rsi]
|
||||
jmp .31
|
||||
align 16
|
||||
.34:adox r10, [rdi-8]
|
||||
adcx r8, r11
|
||||
mov [rdi-8], r10
|
||||
jrcxz .32
|
||||
mulx r11, r10, [rsi+8]
|
||||
adox r8, [rdi]
|
||||
lea ecx, [rcx+8]
|
||||
mov [rdi], r8
|
||||
adcx r10, r9
|
||||
mulx r9, r8, [rsi+16]
|
||||
adcx r8, r11
|
||||
adox r10, [rdi+8]
|
||||
mov [rdi+8], r10
|
||||
mulx r11, r10, [rsi+24]
|
||||
lea rsi, [rsi+64]
|
||||
adcx r10, r9
|
||||
adox r8, [rdi+16]
|
||||
mov [rdi+16], r8
|
||||
mulx r9, r8, [rsi-32]
|
||||
adox r10, [rdi+24]
|
||||
adcx r8, r11
|
||||
mov [rdi+24], r10
|
||||
mulx r11, r10, [rsi-24]
|
||||
adcx r10, r9
|
||||
adox r8, [rdi+32]
|
||||
mov [rdi+32], r8
|
||||
mulx r9, r8, [rsi-16]
|
||||
adox r10, [rdi+40]
|
||||
adcx r8, r11
|
||||
mov [rdi+40], r10
|
||||
.35:adox r8, [rdi+48]
|
||||
mulx r11, r10, [rsi-8]
|
||||
mov [rdi+48], r8
|
||||
lea rdi, [rdi+64]
|
||||
adcx r10, r9
|
||||
mulx r9, r8, [rsi]
|
||||
jmp .34
|
||||
.36:adox r8, [rdi]
|
||||
adox r9, rcx
|
||||
mov [rdi], r8
|
||||
adc r9, rcx
|
||||
mov [rdi+8], r9
|
||||
.37:lea rsi, [rsi+rbx*8]
|
||||
or ecx, ebx
|
||||
jz .52
|
||||
mov rdx, [rsi-24]
|
||||
mulx r9, r8, [rsi-16]
|
||||
lea rdi, [rdi+rbx*8-56]
|
||||
jmp .35
|
||||
align 16
|
||||
.38:adox r10, [rdi-8]
|
||||
adcx r8, r11
|
||||
mov [rdi-8], r10
|
||||
jrcxz .36
|
||||
mulx r11, r10, [rsi+8]
|
||||
adox r8, [rdi]
|
||||
lea ecx, [rcx+8]
|
||||
mov [rdi], r8
|
||||
adcx r10, r9
|
||||
mulx r9, r8, [rsi+16]
|
||||
adcx r8, r11
|
||||
adox r10, [rdi+8]
|
||||
mov [rdi+8], r10
|
||||
mulx r11, r10, [rsi+24]
|
||||
lea rsi, [rsi+64]
|
||||
adcx r10, r9
|
||||
adox r8, [rdi+16]
|
||||
mov [rdi+16], r8
|
||||
mulx r9, r8, [rsi-32]
|
||||
adox r10, [rdi+24]
|
||||
adcx r8, r11
|
||||
mov [rdi+24], r10
|
||||
mulx r11, r10, [rsi-24]
|
||||
adcx r10, r9
|
||||
adox r8, [rdi+32]
|
||||
mov [rdi+32], r8
|
||||
.39:mulx r9, r8, [rsi-16]
|
||||
adox r10, [rdi+40]
|
||||
adcx r8, r11
|
||||
mov [rdi+40], r10
|
||||
adox r8, [rdi+48]
|
||||
mulx r11, r10, [rsi-8]
|
||||
mov [rdi+48], r8
|
||||
lea rdi, [rdi+64]
|
||||
adcx r10, r9
|
||||
mulx r9, r8, [rsi]
|
||||
jmp .38
|
||||
.40:adox r8, [rdi]
|
||||
adox r9, rcx
|
||||
mov [rdi], r8
|
||||
adc r9, rcx
|
||||
mov [rdi+8], r9
|
||||
.41:lea rsi, [rsi+rbx*8]
|
||||
or ecx, ebx
|
||||
mov rdx, [rsi-32]
|
||||
mulx r11, r10, [rsi-24]
|
||||
lea rdi, [rdi+rbx*8-56]
|
||||
jmp .39
|
||||
align 16
|
||||
.42:adox r10, [rdi-8]
|
||||
adcx r8, r11
|
||||
mov [rdi-8], r10
|
||||
jrcxz .40
|
||||
mulx r11, r10, [rsi+8]
|
||||
adox r8, [rdi]
|
||||
lea ecx, [rcx+8]
|
||||
mov [rdi], r8
|
||||
adcx r10, r9
|
||||
mulx r9, r8, [rsi+16]
|
||||
adcx r8, r11
|
||||
adox r10, [rdi+8]
|
||||
mov [rdi+8], r10
|
||||
mulx r11, r10, [rsi+24]
|
||||
lea rsi, [rsi+64]
|
||||
adcx r10, r9
|
||||
adox r8, [rdi+16]
|
||||
mov [rdi+16], r8
|
||||
mulx r9, r8, [rsi-32]
|
||||
adox r10, [rdi+24]
|
||||
adcx r8, r11
|
||||
mov [rdi+24], r10
|
||||
.43:mulx r11, r10, [rsi-24]
|
||||
adcx r10, r9
|
||||
adox r8, [rdi+32]
|
||||
mov [rdi+32], r8
|
||||
mulx r9, r8, [rsi-16]
|
||||
adox r10, [rdi+40]
|
||||
adcx r8, r11
|
||||
mov [rdi+40], r10
|
||||
adox r8, [rdi+48]
|
||||
mulx r11, r10, [rsi-8]
|
||||
mov [rdi+48], r8
|
||||
lea rdi, [rdi+64]
|
||||
adcx r10, r9
|
||||
mulx r9, r8, [rsi]
|
||||
jmp .42
|
||||
.44:adox r8, [rdi]
|
||||
adox r9, rcx
|
||||
mov [rdi], r8
|
||||
adc r9, rcx
|
||||
mov [rdi+8], r9
|
||||
.45:lea rsi, [rsi+rbx*8]
|
||||
or ecx, ebx
|
||||
mov rdx, [rsi-40]
|
||||
mulx r9, r8, [rsi-32]
|
||||
lea rdi, [rdi+rbx*8-56]
|
||||
jmp .43
|
||||
align 16
|
||||
.46:adox r10, [rdi-8]
|
||||
adcx r8, r11
|
||||
mov [rdi-8], r10
|
||||
jrcxz .44
|
||||
mulx r11, r10, [rsi+8]
|
||||
adox r8, [rdi]
|
||||
lea ecx, [rcx+8]
|
||||
mov [rdi], r8
|
||||
adcx r10, r9
|
||||
mulx r9, r8, [rsi+16]
|
||||
adcx r8, r11
|
||||
adox r10, [rdi+8]
|
||||
mov [rdi+8], r10
|
||||
mulx r11, r10, [rsi+24]
|
||||
lea rsi, [rsi+64]
|
||||
adcx r10, r9
|
||||
adox r8, [rdi+16]
|
||||
mov [rdi+16], r8
|
||||
.47:mulx r9, r8, [rsi-32]
|
||||
adox r10, [rdi+24]
|
||||
adcx r8, r11
|
||||
mov [rdi+24], r10
|
||||
mulx r11, r10, [rsi-24]
|
||||
adcx r10, r9
|
||||
adox r8, [rdi+32]
|
||||
mov [rdi+32], r8
|
||||
mulx r9, r8, [rsi-16]
|
||||
adox r10, [rdi+40]
|
||||
adcx r8, r11
|
||||
mov [rdi+40], r10
|
||||
adox r8, [rdi+48]
|
||||
mulx r11, r10, [rsi-8]
|
||||
mov [rdi+48], r8
|
||||
lea rdi, [rdi+64]
|
||||
adcx r10, r9
|
||||
mulx r9, r8, [rsi]
|
||||
jmp .46
|
||||
.48:adox r8, [rdi]
|
||||
adox r9, rcx
|
||||
mov [rdi], r8
|
||||
adc r9, rcx
|
||||
mov [rdi+8], r9
|
||||
.49:lea rsi, [rsi+rbx*8]
|
||||
or ecx, ebx
|
||||
mov rdx, [rsi-48]
|
||||
mulx r11, r10, [rsi-40]
|
||||
lea rdi, [rdi+rbx*8-56]
|
||||
jmp .47
|
||||
align 16
|
||||
.50:adox r10, [rdi-8]
|
||||
adcx r8, r11
|
||||
mov [rdi-8], r10
|
||||
jrcxz .48
|
||||
mulx r11, r10, [rsi+8]
|
||||
adox r8, [rdi]
|
||||
lea ecx, [rcx+8]
|
||||
mov [rdi], r8
|
||||
adcx r10, r9
|
||||
mulx r9, r8, [rsi+16]
|
||||
adcx r8, r11
|
||||
adox r10, [rdi+8]
|
||||
mov [rdi+8], r10
|
||||
.51:mulx r11, r10, [rsi+24]
|
||||
lea rsi, [rsi+64]
|
||||
adcx r10, r9
|
||||
adox r8, [rdi+16]
|
||||
mov [rdi+16], r8
|
||||
mulx r9, r8, [rsi-32]
|
||||
adox r10, [rdi+24]
|
||||
adcx r8, r11
|
||||
mov [rdi+24], r10
|
||||
mulx r11, r10, [rsi-24]
|
||||
adcx r10, r9
|
||||
adox r8, [rdi+32]
|
||||
mov [rdi+32], r8
|
||||
mulx r9, r8, [rsi-16]
|
||||
adox r10, [rdi+40]
|
||||
adcx r8, r11
|
||||
mov [rdi+40], r10
|
||||
adox r8, [rdi+48]
|
||||
mulx r11, r10, [rsi-8]
|
||||
mov [rdi+48], r8
|
||||
lea rdi, [rdi+64]
|
||||
adcx r10, r9
|
||||
mulx r9, r8, [rsi]
|
||||
jmp .50
|
||||
.52:
|
||||
mov rdx, [rsi-24]
|
||||
mulx r9, r8, [rsi-16]
|
||||
adox r8, [rdi-8]
|
||||
mulx r11, r10, [rsi-8]
|
||||
mov [rdi-8], r8
|
||||
lea rdi, [rdi+8]
|
||||
adcx r10, r9
|
||||
mulx r9, r8, [rsi]
|
||||
adox r10, [rdi-8]
|
||||
adcx r8, r11
|
||||
mov [rdi-8], r10
|
||||
adox r8, [rdi]
|
||||
adox r9, rcx
|
||||
adcx r9, rcx
|
||||
.53:
|
||||
mov rdx, [rsi-16]
|
||||
mulx r11, r10, [rsi-8]
|
||||
mulx rbx, rax, [rsi]
|
||||
adox r10, r8
|
||||
adcx rax, r11
|
||||
mov [rdi], r10
|
||||
adox rax, r9
|
||||
adox rbx, rcx
|
||||
mov [rdi+8], rax
|
||||
adc rbx, rcx
|
||||
mov rdx, [rsi-8]
|
||||
mulx rdx, rax, [rsi]
|
||||
add rax, rbx
|
||||
mov [rdi+16], rax
|
||||
adc rdx, rcx
|
||||
mov [rdi+24], rdx
|
||||
.54:
|
||||
mov rdi, [rsp+stack_use+8]
|
||||
mov rsi, [rsp+stack_use+16]
|
||||
mov rcx, [rsp+stack_use+24]
|
||||
dec ecx
|
||||
mov rdx, [rsi]
|
||||
xor ebx, ebx
|
||||
mulx r10, rax, rdx
|
||||
mov [rdi], rax
|
||||
mov r8, [rdi+8]
|
||||
mov r9, [rdi+16]
|
||||
jmp .56
|
||||
align 16
|
||||
.55:mov r8, [rdi+24]
|
||||
mov r9, [rdi+32]
|
||||
lea rdi, [rdi+16]
|
||||
lea r10, [rdx+rbx]
|
||||
.56:adc r8, r8
|
||||
adc r9, r9
|
||||
setc bl
|
||||
mov rdx, [rsi+8]
|
||||
lea rsi, [rsi+8]
|
||||
mulx rdx, rax, rdx
|
||||
add r8, r10
|
||||
adc r9, rax
|
||||
mov [rdi+8], r8
|
||||
mov [rdi+16], r9
|
||||
dec ecx
|
||||
jnz .55
|
||||
.57:adc rdx, rbx
|
||||
mov [rdi+24], rdx
|
||||
END_PROC reg_save_list
|
||||
|
||||
; Jump tables of 32-bit offsets, each entry relative to its own table label.
; Both tables are indexed with rax*4; eax is set to (size & 7) at .3.
align 8
|
||||
; .58: entry points into the first multiply loop (.12 - .19), dispatched at .3
.58:
|
||||
dd .9 - .58
|
||||
dd .4 - .58
|
||||
dd .10 - .58
|
||||
dd .11 - .58
|
||||
dd .5 - .58
|
||||
dd .6 - .58
|
||||
dd .7 - .58
|
||||
dd .8 - .58
|
||||
; .59: entry points for the following accumulate passes, dispatched at .20
.59:
|
||||
dd .49 - .59
|
||||
dd .22 - .59
|
||||
dd .26 - .59
|
||||
dd .30 - .59
|
||||
dd .33 - .59
|
||||
dd .37 - .59
|
||||
dd .41 - .59
|
||||
dd .45 - .59
|
||||
|
||||
end
|
@ -1,6 +1,6 @@
|
||||
|
||||
; AMD64 mpn_sub_n
|
||||
; Copyright 2008, 2016 Jason Moxham and Alexander Kruppa
|
||||
; Copyright 2016 Alexander Kruppa
|
||||
; This file is part of the MPIR Library.
|
||||
; The MPIR Library is free software; you can redistribute it and/or modify
|
||||
; it under the terms of the GNU Lesser General Public License as published
|
||||
@ -15,112 +15,145 @@
|
||||
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
|
||||
; Boston, MA 02110-1301, USA.
|
||||
|
||||
; (rdi,rcx) = (rsi,rcx)+(rdx,rcx)
|
||||
; rax = carry
|
||||
|
||||
%define USE_WIN64
|
||||
; (rdi,rcx) = (rsi,rcx)-(rdx,rcx)
|
||||
; rax = borrow
|
||||
|
||||
%include 'yasm_mac.inc'
|
||||
|
||||
%define USE_WIN64
|
||||
|
||||
%ifdef USE_WIN64
|
||||
%define SumP rcx
|
||||
%define Inp1P rdx
|
||||
%define Inp2P r8
|
||||
%define Size r9
|
||||
%define SizeRest r11
|
||||
%define LIMB1 rax
|
||||
%define LIMB2 r10
|
||||
%define SizeRest r11
|
||||
%else
|
||||
%define SumP rdi
|
||||
%define Inp1P rsi
|
||||
%define Inp2P rdx
|
||||
%define Size rcx
|
||||
%define SizeRest r11
|
||||
%define LIMB1 rax
|
||||
%define LIMB2 r9
|
||||
%define SizeRest r10
|
||||
%define LIMB2 r8
|
||||
%endif
|
||||
|
||||
%define ADDSUB sub
|
||||
%define ADCSBB sbb
|
||||
|
||||
; Skylake has problems sustaining 2 read and 1 write per clock cycle.
|
||||
; It sometimes gets into a "mode" (for lack of a better word) where
|
||||
; it does not fully utilize port 7, causing store uops to compete with
|
||||
; the reads for ports 2,3. We try to alleviate the problem by turning
|
||||
; some of the 64-bit writes into 128-bit writes, reducing the number of
|
||||
; write instructions. Unfortunately, SSE2/AVX2 do not have particularly
|
||||
; good instructions for assembling an SSE2 128-bit word from two GPR
|
||||
; 64-bit words, so the instruction count is greatly inflated.
|
||||
|
||||
; STORE off
;   Writes the current limb pair to the destination with two 64-bit stores:
;   LIMB1 goes to [SumP off] and LIMB2 to the next limb, 8 bytes higher.
;   The single macro argument is a signed byte offset that is pasted directly
;   after SumP inside the brackets, so it is written with its sign
;   (this file invokes it as STORE +16, STORE +32, STORE +48).
;   Only mov is used, so the carry/borrow flag carried between the
;   surrounding ADCSBB instructions is left intact.
%macro STORE 1
|
||||
mov [SumP %1], LIMB1 ; first limb of the pair at SumP + off
|
||||
mov [SumP %1 + 8], LIMB2 ; second limb of the pair at SumP + off + 8
|
||||
%endmacro
|
||||
|
||||
; SSESTORE off
;   Writes the limb pair LIMB1 (low 64 bits) / LIMB2 (high 64 bits) to
;   [SumP off] as ONE 128-bit store instead of two 64-bit stores.
;   The single macro argument is a signed byte offset pasted after SumP
;   (this file invokes it as SSESTORE +0).
;   Requirements: the final store is movaps, so SumP + off must be 16-byte
;   aligned or the instruction faults.
;   Clobbers: xmm0, xmm1.  None of the instructions used modifies the
;   arithmetic flags, so the carry/borrow chain of the surrounding ADCSBB
;   instructions is preserved.
;   NOTE(review): vpermilpd is a VEX-encoded (AVX) instruction while movq,
;   pblendw and movaps are assembled here in their legacy SSE forms -
;   confirm that mixing the two encodings is intended on the target CPUs.
%macro SSESTORE 1
|
||||
movq xmm0, LIMB1 ; xmm0 = { LIMB1, 0 }  (movq from a GPR zeroes the upper 64 bits)
|
||||
movq xmm1, LIMB2 ; xmm1 = { LIMB2, 0 }
|
||||
vpermilpd xmm1, xmm1, 0 ; selector 0 for both lanes: xmm1 = { LIMB2, LIMB2 }
|
||||
pblendw xmm0, xmm1, 0xf0 ; mask 0xf0 takes words 4-7 from xmm1: xmm0 = { LIMB1, LIMB2 }
|
||||
movaps [SumP %1], xmm0 ; aligned 128-bit store of both limbs
|
||||
%endmacro
|
||||
|
||||
BITS 64
|
||||
|
||||
xalign 8
|
||||
LEAF_PROC mpn_sub_nc
|
||||
mov r10,[rsp+40]
|
||||
jmp entry
|
||||
LEAF_PROC mpn_sub_n
|
||||
; Make dest 16-bytes aligned
|
||||
test SumP, 8
|
||||
jz .aligned
|
||||
dec Size
|
||||
mov SizeRest, Size
|
||||
and SizeRest, 7
|
||||
shr Size, 3
|
||||
; Unaligned and Size > 8: do one limb separately, then the normal loop
|
||||
jnz .unaligned
|
||||
; Unaligned and Size <= 8: do all with .rest loop
|
||||
inc SizeRest
|
||||
clc
|
||||
jmp .rest ;ajs:notshortform
|
||||
|
||||
xalign 8
|
||||
LEAF_PROC mpn_sub_n
|
||||
xor r10, r10
|
||||
entry:
|
||||
mov SizeRest, Size
|
||||
and SizeRest, 7
|
||||
shr Size, 3
|
||||
lea Size, [r10 + 2*Size]
|
||||
sar Size, 1
|
||||
jnz .loop1
|
||||
jmp .rest
|
||||
.aligned:
|
||||
mov SizeRest, Size
|
||||
and SizeRest, 7
|
||||
shr Size, 3
|
||||
clc
|
||||
jz .rest ;ajs:notshortform
|
||||
jmp .loop1
|
||||
|
||||
.unaligned:
|
||||
mov LIMB1, [Inp1P]
|
||||
ADDSUB LIMB1, [Inp2P]
|
||||
mov [SumP], LIMB1
|
||||
lea Inp1P, [Inp1P+8]
|
||||
lea Inp2P, [Inp2P+8]
|
||||
lea SumP, [SumP+8]
|
||||
|
||||
align 16
|
||||
.loop1:
|
||||
mov LIMB1, [Inp1P]
|
||||
mov LIMB2, [Inp1P+8]
|
||||
ADCSBB LIMB1, [Inp2P]
|
||||
mov [SumP], LIMB1
|
||||
ADCSBB LIMB2, [Inp2P+8]
|
||||
mov LIMB1, [Inp1P+16]
|
||||
mov [SumP+8], LIMB2
|
||||
ADCSBB LIMB1, [Inp2P+16]
|
||||
mov LIMB2, [Inp1P+24]
|
||||
mov [SumP+16], LIMB1
|
||||
mov LIMB1, [Inp1P+32]
|
||||
ADCSBB LIMB2, [Inp2P+24]
|
||||
mov [SumP+24], LIMB2
|
||||
ADCSBB LIMB1, [Inp2P+32]
|
||||
mov [SumP+32], LIMB1
|
||||
mov LIMB2, [Inp1P+40]
|
||||
ADCSBB LIMB2, [Inp2P+40]
|
||||
mov [SumP+40], LIMB2
|
||||
mov LIMB1, [Inp1P+48]
|
||||
mov LIMB2, [Inp1P+56]
|
||||
lea Inp1P, [Inp1P+64]
|
||||
ADCSBB LIMB1, [Inp2P+48]
|
||||
ADCSBB LIMB2, [Inp2P+56]
|
||||
lea Inp2P, [Inp2P+64]
|
||||
mov [SumP+48], LIMB1
|
||||
mov [SumP+56], LIMB2
|
||||
lea SumP, [SumP+64]
|
||||
dec Size
|
||||
jnz .loop1
|
||||
inc SizeRest
|
||||
dec SizeRest
|
||||
jz .end
|
||||
mov LIMB1, [Inp1P]
|
||||
mov LIMB2, [Inp1P+8]
|
||||
ADCSBB LIMB1, [Inp2P]
|
||||
ADCSBB LIMB2, [Inp2P+8]
|
||||
SSESTORE +0
|
||||
mov LIMB1, [Inp1P+16]
|
||||
mov LIMB2, [Inp1P+24]
|
||||
ADCSBB LIMB1, [Inp2P+16]
|
||||
ADCSBB LIMB2, [Inp2P+24]
|
||||
STORE +16
|
||||
mov LIMB1, [Inp1P+32]
|
||||
mov LIMB2, [Inp1P+40]
|
||||
ADCSBB LIMB1, [Inp2P+32]
|
||||
ADCSBB LIMB2, [Inp2P+40]
|
||||
STORE +32
|
||||
mov LIMB1, [Inp1P+48]
|
||||
mov LIMB2, [Inp1P+56]
|
||||
ADCSBB LIMB1, [Inp2P+48]
|
||||
ADCSBB LIMB2, [Inp2P+56]
|
||||
STORE +48
|
||||
lea Inp1P, [Inp1P+64]
|
||||
lea Inp2P, [Inp2P+64]
|
||||
lea SumP, [SumP+64]
|
||||
dec Size
|
||||
jnz .loop1
|
||||
inc SizeRest
|
||||
dec SizeRest
|
||||
jz .end
|
||||
.rest:
|
||||
mov LIMB1, [Inp1P]
|
||||
ADCSBB LIMB1, [Inp2P]
|
||||
mov [SumP], LIMB1
|
||||
dec SizeRest
|
||||
jz .end
|
||||
mov LIMB1, [Inp1P+8]
|
||||
ADCSBB LIMB1, [Inp2P+8]
|
||||
mov [SumP+8], LIMB1
|
||||
dec SizeRest
|
||||
jz .end
|
||||
mov LIMB1, [Inp1P+16]
|
||||
ADCSBB LIMB1, [Inp2P+16]
|
||||
mov [SumP+16], LIMB1
|
||||
dec SizeRest
|
||||
jz .end
|
||||
mov LIMB1, [Inp1P+24]
|
||||
ADCSBB LIMB1, [Inp2P+24]
|
||||
mov [SumP+24], LIMB1
|
||||
dec SizeRest
|
||||
jz .end
|
||||
lea Inp1P, [Inp1P+32]
|
||||
lea Inp2P, [Inp2P+32]
|
||||
lea SumP, [SumP+32]
|
||||
jmp .rest
|
||||
mov LIMB1, [Inp1P]
|
||||
ADCSBB LIMB1, [Inp2P]
|
||||
mov [SumP], LIMB1
|
||||
dec SizeRest
|
||||
jz .end
|
||||
mov LIMB1, [Inp1P+8]
|
||||
ADCSBB LIMB1, [Inp2P+8]
|
||||
mov [SumP+8], LIMB1
|
||||
dec SizeRest
|
||||
jz .end
|
||||
mov LIMB1, [Inp1P+16]
|
||||
ADCSBB LIMB1, [Inp2P+16]
|
||||
mov [SumP+16], LIMB1
|
||||
dec SizeRest
|
||||
jz .end
|
||||
mov LIMB1, [Inp1P+24]
|
||||
ADCSBB LIMB1, [Inp2P+24]
|
||||
mov [SumP+24], LIMB1
|
||||
dec SizeRest
|
||||
jz .end
|
||||
lea Inp1P, [Inp1P+32]
|
||||
lea Inp2P, [Inp2P+32]
|
||||
lea SumP, [SumP+32]
|
||||
jmp .rest
|
||||
.end:
|
||||
mov eax, 0
|
||||
adc eax, eax
|
||||
mov eax, 0
|
||||
adc eax, eax
|
||||
ret
|
||||
|
Loading…
Reference in New Issue
Block a user