dnl  NOTE: this is GMP m4/GAS (AT&T-order) assembly, not NASM; the viewer's
dnl  "NASM" tag and the "841 lines / 16 KiB" banner are extraction artifacts.
dnl  AMD64 mpn_sqr_basecase optimised for Intel Broadwell.

dnl  Copyright 2015 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')

C	     cycles/limb	mul_1		addmul_1
C AMD K8,K9	n/a		n/a
C AMD K10	n/a		n/a
C AMD bull	n/a		n/a
C AMD pile	n/a		n/a
C AMD steam	n/a		n/a
C AMD excavator	 ?		 ?
C AMD bobcat	n/a		n/a
C AMD jaguar	n/a		n/a
C Intel P4	n/a		n/a
C Intel core2	n/a		n/a
C Intel NHM	n/a		n/a
C Intel SBR	n/a		n/a
C Intel IBR	n/a		n/a
C Intel HWL	 1.68		n/a
C Intel BWL	 1.69		1.8-1.9
C Intel atom	n/a		n/a
C Intel SLM	n/a		n/a
C VIA nano	n/a		n/a

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjorn Granlund.

C TODO
C  * We have 8 addmul_1 loops which fall into each other.  The idea is to save
C    on switching code, since a circularly updated computed goto target will
C    hardly allow correct branch prediction.  On 2nd thought, we now might make
C    each of the 8 loop branches be poorly predicted since they will be
C    executed fewer times for each time.  With just one addmul_1 loop, the loop
C    count will change only once each 8th time!
C  * Replace sqr_diag_addlsh1 code (from haswell) with adx-aware code.  We have
C    3 variants below, but the haswell code turns out to be fastest.
C  * Do overlapped software pipelining.
C  * When changing this, make sure the code which falls into the inner loops
C    does not execute too many no-ops (for both PIC and non-PIC).
|
C Register roles.  SysV AMD64: args arrive in rdi/rsi/rdx.  mulx uses %rdx
C implicitly as one multiplicand, so the current multiplier limb u0 lives
C there; adcx/adox give two independent carry chains (CF and OF).
define(`rp', `%rdi')	C result pointer (arg 1), 2*un limbs written
define(`up', `%rsi')	C source pointer (arg 2)
define(`un_param',`%rdx')	C source size in limbs (arg 3); later recycled as u0

define(`n', `%rcx')	C inner-loop counter; counts upward toward 0 (jrcxz exits)
define(`un_save', `%rbx')	C negated rounded size: loop count and pointer offset
define(`u0', `%rdx')	C current multiplier limb (implicit mulx operand)

define(`w0', `%r8')	C scratch product/accumulation registers
define(`w1', `%r9')
define(`w2', `%r10')
define(`w3', `%r11')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)
|
ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_sqr_basecase)
	FUNC_ENTRY(3)

C Dispatch on operand size: un = 1 and un = 2 are handled with straight-line
C code; larger sizes fall through to L(gt2)/L(gt3).
	cmp	$2, un_param
	jae	L(gt1)

C un = 1:  {rp,2} = up[0]^2
	mov	(up), %rdx
	mulx(	%rdx, %rax, %rdx)
	mov	%rax, (rp)
	mov	%rdx, 8(rp)
	FUNC_EXIT()
	ret

C un = 2:  square of (v1,v0); cross product v0*v1 is doubled and added into
C the diagonal squares.  Column weights noted as "W k".
L(gt1):	jne	L(gt2)

	mov	(up), %rdx
	mov	8(up), %rcx
	mulx(	%rcx, %r9, %r10)	C v0 * v1	W 1 2
	mulx(	%rdx, %rax, %r8)	C v0 * v0	W 0 1
	mov	%rcx, %rdx
	mulx(	%rdx, %r11, %rdx)	C v1 * v1	W 2 3
	add	%r9, %r9		C W 1	(double the cross product)
	adc	%r10, %r10		C W 2
	adc	$0, %rdx		C W 3	(propagate carry into top)
	add	%r9, %r8		C W 1	(add into diagonal squares)
	adc	%r11, %r10		C W 2
	adc	$0, %rdx		C W 3
	mov	%rax, (rp)
	mov	%r8, 8(rp)
	mov	%r10, 16(rp)
	mov	%rdx, 24(rp)
	FUNC_EXIT()
	ret
|
C un = 3: compute the three off-diagonal products into w2,w0,w1,w3, then
C double them (adcx chain) while adding the three diagonal squares (adox
C chain).  CF is cleared by the `test` below so both chains start clean.
L(gt2):	cmp	$4, un_param
	jae	L(gt3)

	push	%rbx
	mov	(up), %rdx
	mulx(	8,(up), w2, w3)		C u0*u1
	mulx(	16,(up), w0, w1)	C u0*u2
	add	w3, w0
	mov	8(up), %rdx
	mulx(	16,(up), %rax, w3)	C u1*u2
	adc	%rax, w1
	adc	$0, w3
	test	R32(%rbx), R32(%rbx)	C clear CF and OF for adcx/adox chains
	mov	(up), %rdx
	mulx(	%rdx, %rbx, %rcx)	C u0^2
	mov	%rbx, (rp)
	mov	8(up), %rdx
	mulx(	%rdx, %rax, %rbx)	C u1^2
	mov	16(up), %rdx
	mulx(	%rdx, %rsi, %rdx)	C u2^2
	adcx(	w2, w2)			C double the off-diagonal terms ...
	adcx(	w0, w0)
	adcx(	w1, w1)
	adcx(	w3, w3)
	adox(	w2, %rcx)		C ... while adding diagonal squares
	adox(	w0, %rax)
	adox(	w1, %rbx)
	adox(	w3, %rsi)
	mov	$0, R32(%r8)
	adox(	%r8, %rdx)		C fold OF carry into top limb
	adcx(	%r8, %rdx)		C fold CF carry into top limb
	mov	%rcx, 8(rp)
	mov	%rax, 16(rp)
	mov	%rbx, 24(rp)
	mov	%rsi, 32(rp)
	mov	%rdx, 40(rp)
	pop	%rbx
	FUNC_EXIT()
	ret
|
C General case, un >= 4.  Save callee-saved rbx plus the three values needed
C again by the final sqr_diag_addlsh1 pass, then compute loop controls and
C dispatch into the mul_1 code via the mtab jump table, indexed by un mod 8.
L(gt3):	push	%rbx

	push	rp			C reloaded at L(sqr_diag_addlsh1)
	push	up
	push	un_param

	lea	-3(un_param), R32(un_save)
	lea	5(un_param), n
	mov	R32(un_param), R32(%rax)
	and	$-8, R32(un_save)	C round (un-3) down to a multiple of 8
	shr	$3, R32(n)		C count for mul_1 loop
	neg	un_save			C 8*count and offset for addmul_1 loops
	and	$7, R32(%rax)		C un mod 8; clears CF for adc as side-effect

	mov	(up), u0		C multiplier limb for the mul_1 pass

	lea	L(mtab)(%rip), %r10
ifdef(`PIC',
`	movslq	(%r10,%rax,4), %r8	C PIC: table holds 32-bit offsets
	lea	(%r8, %r10), %r10
	jmp	*%r10
',`
	jmp	*(%r10,%rax,8)		C non-PIC: table holds absolute targets
')
|
C mul_1 feed-in points.  L(mfK) is entered for un ≡ K (mod 8); each advances
C up/rp by K limbs (mod 64 bytes) and jumps to the matching L(mbK) slot
C inside the 8-way unrolled loop below.  w0/w2 parity alternates per slot.
L(mf0):	mulx(	8,(up), w2, w3)
	lea	64(up), up
C	lea	(rp), rp
	jmp	L(mb0)

L(mf3):	mulx(	8,(up), w0, w1)
	lea	24(up), up
	lea	24(rp), rp
	jmp	L(mb3)

L(mf4):	mulx(	8,(up), w2, w3)
	lea	32(up), up
	lea	32(rp), rp
	jmp	L(mb4)

L(mf5):	mulx(	8,(up), w0, w1)
	lea	40(up), up
	lea	40(rp), rp
	jmp	L(mb5)

L(mf6):	mulx(	8,(up), w2, w3)
	lea	48(up), up
	lea	48(rp), rp
	jmp	L(mb6)

L(mf7):	mulx(	8,(up), w0, w1)
	lea	56(up), up
	lea	56(rp), rp
	jmp	L(mb7)

L(mf1):	mulx(	8,(up), w0, w1)
	lea	8(up), up
	lea	8(rp), rp
	jmp	L(mb1)

C mf2 falls directly into the loop top, pre-issuing the first mulx.
L(mf2):	mulx(	8,(up), w2, w3)
	lea	16(up), up
	lea	16(rp), rp
	dec	R32(n)
	mulx(	(up), w0, w1)
|
C mul_1 main loop: rp[] = up[] * u0, 8 limbs per iteration.  The adc chain
C links partial products; stores trail the additions by one slot.
	ALIGN(16)
L(top):	mov	w2, -8(rp)
	adc	w3, w0
L(mb1):	mulx(	8,(up), w2, w3)
	adc	w1, w2
	lea	64(up), up
	mov	w0, (rp)
L(mb0):	mov	w2, 8(rp)
	mulx(	-48,(up), w0, w1)
	lea	64(rp), rp
	adc	w3, w0
L(mb7):	mulx(	-40,(up), w2, w3)
	mov	w0, -48(rp)
	adc	w1, w2
L(mb6):	mov	w2, -40(rp)
	mulx(	-32,(up), w0, w1)
	adc	w3, w0
L(mb5):	mulx(	-24,(up), w2, w3)
	mov	w0, -32(rp)
	adc	w1, w2
L(mb4):	mulx(	-16,(up), w0, w1)
	mov	w2, -24(rp)
	adc	w3, w0
L(mb3):	mulx(	-8,(up), w2, w3)
	adc	w1, w2
	mov	w0, -16(rp)
	dec	R32(n)
	mulx(	(up), w0, w1)
	jnz	L(top)

C Wind-down: store the last two limbs.  n (= rcx) is 0 here, so the second
C adc merely folds in the carry.
L(end):	mov	w2, -8(rp)
	adc	w3, w0
	mov	w0, (rp)
	adc	%rcx, w1		C rcx = 0 here; adds only CF
	mov	w1, 8(rp)

C Dispatch into the addmul_1 chain via atab, still indexed by un mod 8
C (rax was preserved through the mul_1 pass).
	lea	L(atab)(%rip), %r10
ifdef(`PIC',
`	movslq	(%r10,%rax,4), %r11
	lea	(%r11, %r10), %r11
	jmp	*%r11
',`
	jmp	*(%r10,%rax,8)
')
|
C L(edK): wind-down reached from L(tpK) when jrcxz fires (n = 0): fold the
C final high limbs into rp, then fall into L(fJ), which rewinds up/rp by the
C (negative) un_save offset and starts the next, one-limb-shorter, addmul_1
C pass.  rcx is guaranteed 0 at every edK entry.
L(ed0):	adox(	(rp), w0)
	adox(	%rcx, w1)		C relies on rcx = 0
	mov	w0, (rp)
	adc	%rcx, w1		C relies on rcx = 0
	mov	w1, 8(rp)
L(f7):	lea	-64(up,un_save,8), up	C rewind source pointer
	or	R32(un_save), R32(n)	C reload negative loop count (n counts up)
	mov	8(up), u0		C next multiplier limb
	mulx(	16,(up), w0, w1)
	lea	-56(rp,un_save,8), rp	C rewind result pointer, one limb further in
	jmp	L(b7)

C addmul_1 loop, 8 limbs/iteration: rp[] += up[] * u0.  Two carry chains:
C adox accumulates into the existing rp limbs (OF), adcx links the partial
C products (CF).  jrcxz exits without touching either flag.
	ALIGN(16)
L(tp0):	adox(	-8,(rp), w2)
	adcx(	w3, w0)
	mov	w2, -8(rp)
	jrcxz	L(ed0)			C n reached 0: finish this pass
	mulx(	8,(up), w2, w3)
	adox(	(rp), w0)
	lea	8(n), R32(n)		C advance counter toward 0
	mov	w0, (rp)
	adcx(	w1, w2)
L(b0):	mulx(	16,(up), w0, w1)
	adcx(	w3, w0)
	adox(	8,(rp), w2)
	mov	w2, 8(rp)
	mulx(	24,(up), w2, w3)
	lea	64(up), up
	adcx(	w1, w2)
	adox(	16,(rp), w0)
	mov	w0, 16(rp)
	mulx(	-32,(up), w0, w1)
	adox(	24,(rp), w2)
	adcx(	w3, w0)
	mov	w2, 24(rp)
	mulx(	-24,(up), w2, w3)
	adcx(	w1, w2)
	adox(	32,(rp), w0)
	mov	w0, 32(rp)
	mulx(	-16,(up), w0, w1)
	adox(	40,(rp), w2)
	adcx(	w3, w0)
	mov	w2, 40(rp)
	adox(	48,(rp), w0)
	mulx(	-8,(up), w2, w3)
	mov	w0, 48(rp)
	lea	64(rp), rp
	adcx(	w1, w2)
	mulx(	(up), w0, w1)
	jmp	L(tp0)
|
C Wind-down / feed-in / loop for the next residue class; same structure as
C L(ed0)/L(f7)/L(tp0) above, entering the unrolled body at L(b1).
L(ed1):	adox(	(rp), w0)
	adox(	%rcx, w1)		C relies on rcx = 0
	mov	w0, (rp)
	adc	%rcx, w1		C relies on rcx = 0
	mov	w1, 8(rp)
L(f0):	lea	-64(up,un_save,8), up
	or	R32(un_save), R32(n)
	mov	(up), u0
	mulx(	8,(up), w2, w3)
	lea	-56(rp,un_save,8), rp
	jmp	L(b0)

	ALIGN(16)
L(tp1):	adox(	-8,(rp), w2)
	adcx(	w3, w0)
	mov	w2, -8(rp)
	jrcxz	L(ed1)
L(b1):	mulx(	8,(up), w2, w3)
	adox(	(rp), w0)
	lea	8(n), R32(n)
	mov	w0, (rp)
	adcx(	w1, w2)
	mulx(	16,(up), w0, w1)
	adcx(	w3, w0)
	adox(	8,(rp), w2)
	mov	w2, 8(rp)
	mulx(	24,(up), w2, w3)
	lea	64(up), up
	adcx(	w1, w2)
	adox(	16,(rp), w0)
	mov	w0, 16(rp)
	mulx(	-32,(up), w0, w1)
	adox(	24,(rp), w2)
	adcx(	w3, w0)
	mov	w2, 24(rp)
	mulx(	-24,(up), w2, w3)
	adcx(	w1, w2)
	adox(	32,(rp), w0)
	mov	w0, 32(rp)
	mulx(	-16,(up), w0, w1)
	adox(	40,(rp), w2)
	adcx(	w3, w0)
	mov	w2, 40(rp)
	adox(	48,(rp), w0)
	mulx(	-8,(up), w2, w3)
	mov	w0, 48(rp)
	lea	64(rp), rp
	adcx(	w1, w2)
	mulx(	(up), w0, w1)
	jmp	L(tp1)
|
C Same structure as L(ed0)/L(f7)/L(tp0).  L(f1) additionally steps un_save
C forward by one 8-limb block before rewinding the pointers.
L(ed2):	adox(	(rp), w0)
	adox(	%rcx, w1)		C relies on rcx = 0
	mov	w0, (rp)
	adc	%rcx, w1		C relies on rcx = 0
	mov	w1, 8(rp)
L(f1):	lea	(up,un_save,8), up
	or	R32(un_save), R32(n)
	lea	8(un_save), un_save	C consumed one more 8-limb block
	mov	-8(up), u0
	mulx(	(up), w0, w1)
	lea	-56(rp,un_save,8), rp
	jmp	L(b1)

	ALIGN(16)
L(tp2):	adox(	-8,(rp), w2)
	adcx(	w3, w0)
	mov	w2, -8(rp)
	jrcxz	L(ed2)
	mulx(	8,(up), w2, w3)
	adox(	(rp), w0)
	lea	8(n), R32(n)
	mov	w0, (rp)
	adcx(	w1, w2)
	mulx(	16,(up), w0, w1)
	adcx(	w3, w0)
	adox(	8,(rp), w2)
	mov	w2, 8(rp)
	mulx(	24,(up), w2, w3)
	lea	64(up), up
	adcx(	w1, w2)
	adox(	16,(rp), w0)
	mov	w0, 16(rp)
	mulx(	-32,(up), w0, w1)
	adox(	24,(rp), w2)
	adcx(	w3, w0)
	mov	w2, 24(rp)
	mulx(	-24,(up), w2, w3)
	adcx(	w1, w2)
	adox(	32,(rp), w0)
	mov	w0, 32(rp)
	mulx(	-16,(up), w0, w1)
	adox(	40,(rp), w2)
	adcx(	w3, w0)
	mov	w2, 40(rp)
	adox(	48,(rp), w0)
	mulx(	-8,(up), w2, w3)
	mov	w0, 48(rp)
	lea	64(rp), rp
	adcx(	w1, w2)
	mulx(	(up), w0, w1)
	jmp	L(tp2)
|
C Same structure as L(ed0)/L(f7)/L(tp0).  L(f2) can detect the end of the
C triangular addmul chain (n|un_save == 0) and branch to the 2x2 corner.
L(ed3):	adox(	(rp), w0)
	adox(	%rcx, w1)		C relies on rcx = 0
	mov	w0, (rp)
	adc	%rcx, w1		C relies on rcx = 0
	mov	w1, 8(rp)
L(f2):	lea	(up,un_save,8), up
	or	R32(un_save), R32(n)
	jz	L(corner2)		C no full pass left: finish in the corner
	mov	-16(up), u0
	mulx(	-8,(up), w2, w3)
	lea	8(rp,un_save,8), rp
	mulx(	(up), w0, w1)
	jmp	L(tp2)

	ALIGN(16)
L(tp3):	adox(	-8,(rp), w2)
	adcx(	w3, w0)
	mov	w2, -8(rp)
	jrcxz	L(ed3)
	mulx(	8,(up), w2, w3)
	adox(	(rp), w0)
	lea	8(n), R32(n)
	mov	w0, (rp)
	adcx(	w1, w2)
	mulx(	16,(up), w0, w1)
	adcx(	w3, w0)
	adox(	8,(rp), w2)
	mov	w2, 8(rp)
	mulx(	24,(up), w2, w3)
	lea	64(up), up
	adcx(	w1, w2)
	adox(	16,(rp), w0)
	mov	w0, 16(rp)
	mulx(	-32,(up), w0, w1)
	adox(	24,(rp), w2)
	adcx(	w3, w0)
	mov	w2, 24(rp)
	mulx(	-24,(up), w2, w3)
	adcx(	w1, w2)
	adox(	32,(rp), w0)
	mov	w0, 32(rp)
	mulx(	-16,(up), w0, w1)
	adox(	40,(rp), w2)
	adcx(	w3, w0)
	mov	w2, 40(rp)
L(b3):	adox(	48,(rp), w0)
	mulx(	-8,(up), w2, w3)
	mov	w0, 48(rp)
	lea	64(rp), rp
	adcx(	w1, w2)
	mulx(	(up), w0, w1)
	jmp	L(tp3)
|
C Same structure as L(ed0)/L(f7)/L(tp0); L(f3), like L(f2), may branch to a
C corner (the 3x3 one) when the chain is exhausted.  Loop entry at L(b4).
L(ed4):	adox(	(rp), w0)
	adox(	%rcx, w1)		C relies on rcx = 0
	mov	w0, (rp)
	adc	%rcx, w1		C relies on rcx = 0
	mov	w1, 8(rp)
L(f3):	lea	(up,un_save,8), up
	or	R32(un_save), R32(n)
	jz	L(corner3)		C no full pass left: finish in the corner
	mov	-24(up), u0
	mulx(	-16,(up), w0, w1)
	lea	-56(rp,un_save,8), rp
	jmp	L(b3)

	ALIGN(16)
L(tp4):	adox(	-8,(rp), w2)
	adcx(	w3, w0)
	mov	w2, -8(rp)
	jrcxz	L(ed4)
	mulx(	8,(up), w2, w3)
	adox(	(rp), w0)
	lea	8(n), R32(n)
	mov	w0, (rp)
	adcx(	w1, w2)
	mulx(	16,(up), w0, w1)
	adcx(	w3, w0)
	adox(	8,(rp), w2)
	mov	w2, 8(rp)
	mulx(	24,(up), w2, w3)
	lea	64(up), up
	adcx(	w1, w2)
	adox(	16,(rp), w0)
	mov	w0, 16(rp)
	mulx(	-32,(up), w0, w1)
	adox(	24,(rp), w2)
	adcx(	w3, w0)
	mov	w2, 24(rp)
	mulx(	-24,(up), w2, w3)
	adcx(	w1, w2)
	adox(	32,(rp), w0)
	mov	w0, 32(rp)
L(b4):	mulx(	-16,(up), w0, w1)
	adox(	40,(rp), w2)
	adcx(	w3, w0)
	mov	w2, 40(rp)
	adox(	48,(rp), w0)
	mulx(	-8,(up), w2, w3)
	mov	w0, 48(rp)
	lea	64(rp), rp
	adcx(	w1, w2)
	mulx(	(up), w0, w1)
	jmp	L(tp4)
|
C Same structure as L(ed0)/L(f7)/L(tp0); loop entry at L(b5).
L(ed5):	adox(	(rp), w0)
	adox(	%rcx, w1)		C relies on rcx = 0
	mov	w0, (rp)
	adc	%rcx, w1		C relies on rcx = 0
	mov	w1, 8(rp)
L(f4):	lea	(up,un_save,8), up
	or	R32(un_save), R32(n)
	mov	-32(up), u0
	mulx(	-24,(up), w2, w3)
	lea	-56(rp,un_save,8), rp
	jmp	L(b4)

	ALIGN(16)
L(tp5):	adox(	-8,(rp), w2)
	adcx(	w3, w0)
	mov	w2, -8(rp)
	jrcxz	L(ed5)
	mulx(	8,(up), w2, w3)
	adox(	(rp), w0)
	lea	8(n), R32(n)
	mov	w0, (rp)
	adcx(	w1, w2)
	mulx(	16,(up), w0, w1)
	adcx(	w3, w0)
	adox(	8,(rp), w2)
	mov	w2, 8(rp)
	mulx(	24,(up), w2, w3)
	lea	64(up), up
	adcx(	w1, w2)
	adox(	16,(rp), w0)
	mov	w0, 16(rp)
	mulx(	-32,(up), w0, w1)
	adox(	24,(rp), w2)
	adcx(	w3, w0)
	mov	w2, 24(rp)
L(b5):	mulx(	-24,(up), w2, w3)
	adcx(	w1, w2)
	adox(	32,(rp), w0)
	mov	w0, 32(rp)
	mulx(	-16,(up), w0, w1)
	adox(	40,(rp), w2)
	adcx(	w3, w0)
	mov	w2, 40(rp)
	adox(	48,(rp), w0)
	mulx(	-8,(up), w2, w3)
	mov	w0, 48(rp)
	lea	64(rp), rp
	adcx(	w1, w2)
	mulx(	(up), w0, w1)
	jmp	L(tp5)
|
C Same structure as L(ed0)/L(f7)/L(tp0); loop entry at L(b6).
L(ed6):	adox(	(rp), w0)
	adox(	%rcx, w1)		C relies on rcx = 0
	mov	w0, (rp)
	adc	%rcx, w1		C relies on rcx = 0
	mov	w1, 8(rp)
L(f5):	lea	(up,un_save,8), up
	or	R32(un_save), R32(n)
	mov	-40(up), u0
	mulx(	-32,(up), w0, w1)
	lea	-56(rp,un_save,8), rp
	jmp	L(b5)

	ALIGN(16)
L(tp6):	adox(	-8,(rp), w2)
	adcx(	w3, w0)
	mov	w2, -8(rp)
	jrcxz	L(ed6)
	mulx(	8,(up), w2, w3)
	adox(	(rp), w0)
	lea	8(n), R32(n)
	mov	w0, (rp)
	adcx(	w1, w2)
	mulx(	16,(up), w0, w1)
	adcx(	w3, w0)
	adox(	8,(rp), w2)
	mov	w2, 8(rp)
	mulx(	24,(up), w2, w3)
	lea	64(up), up
	adcx(	w1, w2)
	adox(	16,(rp), w0)
	mov	w0, 16(rp)
L(b6):	mulx(	-32,(up), w0, w1)
	adox(	24,(rp), w2)
	adcx(	w3, w0)
	mov	w2, 24(rp)
	mulx(	-24,(up), w2, w3)
	adcx(	w1, w2)
	adox(	32,(rp), w0)
	mov	w0, 32(rp)
	mulx(	-16,(up), w0, w1)
	adox(	40,(rp), w2)
	adcx(	w3, w0)
	mov	w2, 40(rp)
	adox(	48,(rp), w0)
	mulx(	-8,(up), w2, w3)
	mov	w0, 48(rp)
	lea	64(rp), rp
	adcx(	w1, w2)
	mulx(	(up), w0, w1)
	jmp	L(tp6)
|
C Same structure as L(ed0)/L(f7)/L(tp0); loop entry at L(b7).
L(ed7):	adox(	(rp), w0)
	adox(	%rcx, w1)		C relies on rcx = 0
	mov	w0, (rp)
	adc	%rcx, w1		C relies on rcx = 0
	mov	w1, 8(rp)
L(f6):	lea	(up,un_save,8), up
	or	R32(un_save), R32(n)
	mov	-48(up), u0
	mulx(	-40,(up), w2, w3)
	lea	-56(rp,un_save,8), rp
	jmp	L(b6)

	ALIGN(16)
L(tp7):	adox(	-8,(rp), w2)
	adcx(	w3, w0)
	mov	w2, -8(rp)
	jrcxz	L(ed7)
	mulx(	8,(up), w2, w3)
	adox(	(rp), w0)
	lea	8(n), R32(n)
	mov	w0, (rp)
	adcx(	w1, w2)
	mulx(	16,(up), w0, w1)
	adcx(	w3, w0)
	adox(	8,(rp), w2)
	mov	w2, 8(rp)
L(b7):	mulx(	24,(up), w2, w3)
	lea	64(up), up
	adcx(	w1, w2)
	adox(	16,(rp), w0)
	mov	w0, 16(rp)
	mulx(	-32,(up), w0, w1)
	adox(	24,(rp), w2)
	adcx(	w3, w0)
	mov	w2, 24(rp)
	mulx(	-24,(up), w2, w3)
	adcx(	w1, w2)
	adox(	32,(rp), w0)
	mov	w0, 32(rp)
	mulx(	-16,(up), w0, w1)
	adox(	40,(rp), w2)
	adcx(	w3, w0)
	mov	w2, 40(rp)
	adox(	48,(rp), w0)
	mulx(	-8,(up), w2, w3)
	mov	w0, 48(rp)
	lea	64(rp), rp
	adcx(	w1, w2)
	mulx(	(up), w0, w1)
	jmp	L(tp7)
|
C Corner cases finishing the off-diagonal triangle: the last 3x3 and 2x2
C products, done straight-line.  corner3 falls through into corner2.
L(corner3):
	mov	-24(up), u0
	mulx(	-16,(up), w0, w1)
	adox(	-8,(rp), w0)
	mulx(	-8,(up), w2, w3)
	mov	w0, -8(rp)
	lea	8(rp), rp
	adcx(	w1, w2)
	mulx(	(up), w0, w1)
	adox(	-8,(rp), w2)
	adcx(	w3, w0)
	mov	w2, -8(rp)
	adox(	(rp), w0)
	adox(	%rcx, w1)		C relies on rcx = 0
	adcx(	%rcx, w1)		C relies on rcx = 0
L(corner2):
	mov	-16(up), u0
	mulx(	-8,(up), w2, w3)
	mulx(	(up), %rax, %rbx)
	adox(	w0, w2)
	adcx(	w3, %rax)
	mov	w2, (rp)
	adox(	w1, %rax)
	adox(	%rcx, %rbx)		C relies on rcx = 0
	mov	%rax, 8(rp)
	adc	%rcx, %rbx		C relies on rcx = 0
C Final 1x1 product of the two top limbs.
	mov	-8(up), %rdx
	mulx(	(up), %rax, %rdx)
	add	%rbx, %rax
	mov	%rax, 16(rp)
	adc	%rcx, %rdx		C relies on rcx = 0
	mov	%rdx, 24(rp)
|
C Final pass: rp[] = 2*rp[] + diagonal squares, i.e. double the accumulated
C off-diagonal products while adding up[i]^2 for each i.  Three selectable
C implementations; variant 2 (the Haswell-style adc code) is the default.
L(sqr_diag_addlsh1):
	pop	n			C restore original un, up, rp
	pop	up
	pop	rp

ifdef(`SDA_VARIANT',,`define(`SDA_VARIANT', 2)')

C Variant 1: adcx/adox dual-chain, one limb per iteration.
ifelse(SDA_VARIANT,1,`
	lea	(n,n), %rax
	movq	$0, -8(rp,%rax,8)	C FIXME
	test	R32(%rax), R32(%rax)	C clear CF and OF
	mov	(up), %rdx
	lea	8(up), up
	mulx(	%rdx, %r8, %rdx)
	jmp	L(dm)

	ALIGN(16)
L(dtop):mov	8(rp), %r9
	adcx(	%r9, %r9)		C double
	adox(	%rdx, %r9)		C add square high limb
	mov	%r9, 8(rp)
	lea	16(rp), rp
	jrcxz	L(dend)
	mov	(up), %rdx
	mulx(	%rdx, %rax, %rdx)
	lea	8(up), up
	mov	(rp), %r8
	adcx(	%r8, %r8)
	adox(	%rax, %r8)
L(dm):	mov	%r8, (rp)
	lea	-1(n), n
	jmp	L(dtop)
L(dend):
')

C Variant 2 (default): plain adc doubling; the doubling carry is parked in
C bl via setc so the adc chain can be reused for the square addition.
ifelse(SDA_VARIANT,2,`
	dec	R32(n)
	mov	(up), %rdx
	xor	R32(%rbx), R32(%rbx)	C clear CF as side effect
	mulx(	%rdx, %rax, %r10)	C up[0]^2
	mov	%rax, (rp)
	mov	8(rp), %r8
	mov	16(rp), %r9
	jmp	L(dm)

	ALIGN(16)
L(dtop):mov	24(rp), %r8
	mov	32(rp), %r9
	lea	16(rp), rp
	lea	(%rdx,%rbx), %r10	C square high limb + saved doubling carry
L(dm):	adc	%r8, %r8		C double two limbs
	adc	%r9, %r9
	setc	R8(%rbx)		C save doubling carry for next round
	mov	8(up), %rdx
	lea	8(up), up
	mulx(	%rdx, %rax, %rdx)	C next diagonal square
	add	%r10, %r8
	adc	%rax, %r9
	mov	%r8, 8(rp)
	mov	%r9, 16(rp)
	dec	R32(n)
	jnz	L(dtop)

L(dend):adc	%rbx, %rdx
	mov	%rdx, 24(rp)
')

C Variant 3: adcx/adox dual-chain, two limbs per iteration.
ifelse(SDA_VARIANT,3,`
	dec	R32(n)
	mov	(up), %rdx
	test	R32(%rbx), R32(%rbx)	C clear CF and OF
	mulx(	%rdx, %rax, %r10)
	mov	%rax, (rp)
	mov	8(rp), %r8
	mov	16(rp), %r9
	jmp	L(dm)

	ALIGN(16)
L(dtop):jrcxz	L(dend)
	mov	24(rp), %r8
	mov	32(rp), %r9
	lea	16(rp), rp
L(dm):	adcx(	%r8, %r8)		C double (CF chain)
	adcx(	%r9, %r9)
	mov	8(up), %rdx
	lea	8(up), up
	adox(	%r10, %r8)		C add squares (OF chain)
	mulx(	%rdx, %rax, %r10)
	adox(	%rax, %r9)
	mov	%r8, 8(rp)
	mov	%r9, 16(rp)
	lea	-1(n), R32(n)
	jmp	L(dtop)

L(dend):adcx(	%rcx, %r10)		C rcx = 0; fold both carries into top
	adox(	%rcx, %r10)
	mov	%r10, 24(rp)
')

	pop	%rbx
	FUNC_EXIT()
	ret
|
C Jump tables, indexed by un mod 8.  mtab dispatches into the mul_1 feed-ins
C (note the rotated order: index 0 -> mf7, etc.), atab into the addmul_1
C feed-ins.  JMPENT emits a PIC-safe 32-bit offset or an absolute pointer.
	JUMPTABSECT
	ALIGN(8)
L(mtab):JMPENT(	L(mf7), L(mtab))
	JMPENT(	L(mf0), L(mtab))
	JMPENT(	L(mf1), L(mtab))
	JMPENT(	L(mf2), L(mtab))
	JMPENT(	L(mf3), L(mtab))
	JMPENT(	L(mf4), L(mtab))
	JMPENT(	L(mf5), L(mtab))
	JMPENT(	L(mf6), L(mtab))
L(atab):JMPENT(	L(f6), L(atab))
	JMPENT(	L(f7), L(atab))
	JMPENT(	L(f0), L(atab))
	JMPENT(	L(f1), L(atab))
	JMPENT(	L(f2), L(atab))
	JMPENT(	L(f3), L(atab))
	JMPENT(	L(f4), L(atab))
	JMPENT(	L(f5), L(atab))
	TEXT
EPILOGUE()