dnl  mpir/mpn/x86_64/skylake/avx/sqr_basecase.asm

dnl AMD64 mpn_sqr_basecase optimised for Intel Broadwell.
dnl Copyright 2015 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of either:
dnl
dnl * the GNU Lesser General Public License as published by the Free
dnl Software Foundation; either version 3 of the License, or (at your
dnl option) any later version.
dnl
dnl or
dnl
dnl * the GNU General Public License as published by the Free Software
dnl Foundation; either version 2 of the License, or (at your option) any
dnl later version.
dnl
dnl or both in parallel, as here.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
dnl for more details.
dnl
dnl You should have received copies of the GNU General Public License and the
dnl GNU Lesser General Public License along with the GNU MP Library. If not,
dnl see https://www.gnu.org/licenses/.

include(`../config.m4')

C cycles/limb      mul_1      addmul_1
C AMD K8,K9         n/a         n/a
C AMD K10           n/a         n/a
C AMD bull          n/a         n/a
C AMD pile          n/a         n/a
C AMD steam         n/a         n/a
C AMD excavator      ?           ?
C AMD bobcat        n/a         n/a
C AMD jaguar        n/a         n/a
C Intel P4          n/a         n/a
C Intel core2       n/a         n/a
C Intel NHM         n/a         n/a
C Intel SBR         n/a         n/a
C Intel IBR         n/a         n/a
C Intel HWL        1.68         n/a
C Intel BWL        1.69       1.8-1.9
C Intel atom        n/a         n/a
C Intel SLM         n/a         n/a
C VIA nano          n/a         n/a

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjorn Granlund.

C TODO
C  * We have 8 addmul_1 loops which fall into each other.  The idea is to save
C    on switching code, since a circularly updated computed goto target will
C    hardly allow correct branch prediction.  On second thought, this might
C    instead make each of the 8 loop branches poorly predicted, since each is
C    executed fewer times between switches.  With just one addmul_1 loop, the
C    loop count would change only once every 8th pass!
C  * Replace the sqr_diag_addlsh1 code (from Haswell) with adx-aware code.  We
C    have 3 variants below, but the Haswell code turns out to be fastest.
C  * Do overlapped software pipelining.
C  * When changing this, make sure the code which falls into the inner loops
C    does not execute too many no-ops (for both PIC and non-PIC).
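
C Commentary: mpn_sqr_basecase computes the full square {rp,2un} = {up,un}^2
C in three phases.  A rough C-level sketch of the same schedule, in terms of
C GMP's public mpn primitives (a reference model, not the code below):
C
C   rp[un] = mpn_mul_1 (rp + 1, up + 1, un - 1, up[0]);       /* phase 1 */
C   for (i = 1; i < un - 1; i++)                              /* phase 2 */
C     rp[un+i] = mpn_addmul_1 (rp + 2*i + 1, up + i + 1, un - 1 - i, up[i]);
C   mpn_lshift (rp + 1, rp + 1, 2*un - 2, 1);                 /* phase 3 */
C   /* ...then add each square up[i]^2 at position 2*i (rp[0] is simply
C      set, since the cross-product sum has no limb there); the
C      sqr_diag_addlsh1 code below fuses the doubling and these diagonal
C      additions into a single pass. */
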
define(`rp', `%rdi')
define(`up', `%rsi')
define(`un_param',`%rdx')
define(`n', `%rcx')
define(`un_save', `%rbx')
define(`u0', `%rdx')
define(`w0', `%r8')
define(`w1', `%r9')
define(`w2', `%r10')
define(`w3', `%r11')
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_sqr_basecase)
FUNC_ENTRY(3)
cmp $2, un_param
jae L(gt1)
mov (up), %rdx
mulx( %rdx, %rax, %rdx)
mov %rax, (rp)
mov %rdx, 8(rp)
FUNC_EXIT()
ret
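
C un = 2: with u = u1*B + u0, u^2 = u0^2 + 2*u0*u1*B + u1^2*B^2.  The cross
C product is doubled with an add/adc/adc chain, then folded into the two
C squares; the W column comments track each word's weight (power of B).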
L(gt1): jne L(gt2)
mov (up), %rdx
mov 8(up), %rcx
mulx( %rcx, %r9, %r10) C v0 * v1 W 1 2
mulx( %rdx, %rax, %r8) C v0 * v0 W 0 1
mov %rcx, %rdx
mulx( %rdx, %r11, %rdx) C v1 * v1 W 2 3
add %r9, %r9 C W 1
adc %r10, %r10 C W 2
adc $0, %rdx C W 3
add %r9, %r8 C W 1
adc %r11, %r10 C W 2
adc $0, %rdx C W 3
mov %rax, (rp)
mov %r8, 8(rp)
mov %r10, 16(rp)
mov %rdx, 24(rp)
FUNC_EXIT()
ret
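
C un = 3: the three cross products are formed with mulx, doubled on the CF
C chain (adcx), and combined with the diagonal squares on the OF chain
C (adox); both final carries are then added into the top word.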
L(gt2): cmp $4, un_param
jae L(gt3)
push %rbx
mov (up), %rdx
mulx( 8,(up), w2, w3)
mulx( 16,(up), w0, w1)
add w3, w0
mov 8(up), %rdx
mulx( 16,(up), %rax, w3)
adc %rax, w1
adc $0, w3
test R32(%rbx), R32(%rbx) C clear CF and OF for the adcx/adox chains
mov (up), %rdx
mulx( %rdx, %rbx, %rcx)
mov %rbx, (rp)
mov 8(up), %rdx
mulx( %rdx, %rax, %rbx)
mov 16(up), %rdx
mulx( %rdx, %rsi, %rdx)
adcx( w2, w2)
adcx( w0, w0)
adcx( w1, w1)
adcx( w3, w3)
adox( w2, %rcx)
adox( w0, %rax)
adox( w1, %rbx)
adox( w3, %rsi)
mov $0, R32(%r8)
adox( %r8, %rdx)
adcx( %r8, %rdx)
mov %rcx, 8(rp)
mov %rax, 16(rp)
mov %rbx, 24(rp)
mov %rsi, 32(rp)
mov %rdx, 40(rp)
pop %rbx
FUNC_EXIT()
ret
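
C un >= 4: save rp/up/un, then dispatch on un mod 8 into an 8-way unrolled
C mul_1 loop.  un_save = -((un-3) & -8) serves both as the (negated) limb
C count and as a pointer offset for the addmul_1 passes that follow.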
L(gt3): push %rbx
push rp
push up
push un_param
lea -3(un_param), R32(un_save)
lea 5(un_param), n
mov R32(un_param), R32(%rax)
and $-8, R32(un_save)
shr $3, R32(n) C count for mul_1 loop
neg un_save C 8*count and offset for addmul_1 loops
and $7, R32(%rax) C clear CF for adc as side-effect
mov (up), u0
lea L(mtab)(%rip), %r10
ifdef(`PIC',
` movslq (%r10,%rax,4), %r8
lea (%r8, %r10), %r10
jmp *%r10
',`
jmp *(%r10,%rax,8)
')
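
C mul_1 feed-in blocks: L(mfK) is the entry used when the un-1 limbs to be
C multiplied by u[0] leave remainder K mod 8; each adjusts up/rp so the
C unrolled loop below can be entered mid-body.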
L(mf0): mulx( 8,(up), w2, w3)
lea 64(up), up
C lea (rp), rp
jmp L(mb0)
L(mf3): mulx( 8,(up), w0, w1)
lea 24(up), up
lea 24(rp), rp
jmp L(mb3)
L(mf4): mulx( 8,(up), w2, w3)
lea 32(up), up
lea 32(rp), rp
jmp L(mb4)
L(mf5): mulx( 8,(up), w0, w1)
lea 40(up), up
lea 40(rp), rp
jmp L(mb5)
L(mf6): mulx( 8,(up), w2, w3)
lea 48(up), up
lea 48(rp), rp
jmp L(mb6)
L(mf7): mulx( 8,(up), w0, w1)
lea 56(up), up
lea 56(rp), rp
jmp L(mb7)
L(mf1): mulx( 8,(up), w0, w1)
lea 8(up), up
lea 8(rp), rp
jmp L(mb1)
L(mf2): mulx( 8,(up), w2, w3)
lea 16(up), up
lea 16(rp), rp
dec R32(n)
mulx( (up), w0, w1)
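
C Main mul_1 loop: computes rp[1..un-1] plus a high limb from u[0]*u[1..un-1],
C eight limbs per iteration, with the partial products chained through CF
C by plain adc.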
ALIGN(16)
L(top): mov w2, -8(rp)
adc w3, w0
L(mb1): mulx( 8,(up), w2, w3)
adc w1, w2
lea 64(up), up
mov w0, (rp)
L(mb0): mov w2, 8(rp)
mulx( -48,(up), w0, w1)
lea 64(rp), rp
adc w3, w0
L(mb7): mulx( -40,(up), w2, w3)
mov w0, -48(rp)
adc w1, w2
L(mb6): mov w2, -40(rp)
mulx( -32,(up), w0, w1)
adc w3, w0
L(mb5): mulx( -24,(up), w2, w3)
mov w0, -32(rp)
adc w1, w2
L(mb4): mulx( -16,(up), w0, w1)
mov w2, -24(rp)
adc w3, w0
L(mb3): mulx( -8,(up), w2, w3)
adc w1, w2
mov w0, -16(rp)
dec R32(n)
mulx( (up), w0, w1)
jnz L(top)
L(end): mov w2, -8(rp)
adc w3, w0
mov w0, (rp)
adc %rcx, w1 C relies on rcx = 0 when the mul_1 loop exits
mov w1, 8(rp)
lea L(atab)(%rip), %r10
ifdef(`PIC',
` movslq (%r10,%rax,4), %r11
lea (%r11, %r10), %r11
jmp *%r11
',`
jmp *(%r10,%rax,8)
')
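
C Below are 8 copies of the addmul_1 loop, L(tp0)..L(tp7).  Exit L(edK)
C finishes one pass (multiplier u[i]) and falls into feed-in L(f(K-1 mod 8)),
C which starts the next, one-limb-shorter pass with u[i+1].  jrcxz is used
C to exit since, unlike dec/jnz, it leaves the CF and OF chains intact.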
L(ed0): adox( (rp), w0)
adox( %rcx, w1) C relies on rcx = 0
mov w0, (rp)
adc %rcx, w1 C relies on rcx = 0
mov w1, 8(rp)
L(f7): lea -64(up,un_save,8), up
or R32(un_save), R32(n)
mov 8(up), u0
mulx( 16,(up), w0, w1)
lea -56(rp,un_save,8), rp
jmp L(b7)
ALIGN(16)
L(tp0): adox( -8,(rp), w2)
adcx( w3, w0)
mov w2, -8(rp)
jrcxz L(ed0)
mulx( 8,(up), w2, w3)
adox( (rp), w0)
lea 8(n), R32(n)
mov w0, (rp)
adcx( w1, w2)
L(b0): mulx( 16,(up), w0, w1)
adcx( w3, w0)
adox( 8,(rp), w2)
mov w2, 8(rp)
mulx( 24,(up), w2, w3)
lea 64(up), up
adcx( w1, w2)
adox( 16,(rp), w0)
mov w0, 16(rp)
mulx( -32,(up), w0, w1)
adox( 24,(rp), w2)
adcx( w3, w0)
mov w2, 24(rp)
mulx( -24,(up), w2, w3)
adcx( w1, w2)
adox( 32,(rp), w0)
mov w0, 32(rp)
mulx( -16,(up), w0, w1)
adox( 40,(rp), w2)
adcx( w3, w0)
mov w2, 40(rp)
adox( 48,(rp), w0)
mulx( -8,(up), w2, w3)
mov w0, 48(rp)
lea 64(rp), rp
adcx( w1, w2)
mulx( (up), w0, w1)
jmp L(tp0)
L(ed1): adox( (rp), w0)
adox( %rcx, w1) C relies on rcx = 0
mov w0, (rp)
adc %rcx, w1 C relies on rcx = 0
mov w1, 8(rp)
L(f0): lea -64(up,un_save,8), up
or R32(un_save), R32(n)
mov (up), u0
mulx( 8,(up), w2, w3)
lea -56(rp,un_save,8), rp
jmp L(b0)
ALIGN(16)
L(tp1): adox( -8,(rp), w2)
adcx( w3, w0)
mov w2, -8(rp)
jrcxz L(ed1)
L(b1): mulx( 8,(up), w2, w3)
adox( (rp), w0)
lea 8(n), R32(n)
mov w0, (rp)
adcx( w1, w2)
mulx( 16,(up), w0, w1)
adcx( w3, w0)
adox( 8,(rp), w2)
mov w2, 8(rp)
mulx( 24,(up), w2, w3)
lea 64(up), up
adcx( w1, w2)
adox( 16,(rp), w0)
mov w0, 16(rp)
mulx( -32,(up), w0, w1)
adox( 24,(rp), w2)
adcx( w3, w0)
mov w2, 24(rp)
mulx( -24,(up), w2, w3)
adcx( w1, w2)
adox( 32,(rp), w0)
mov w0, 32(rp)
mulx( -16,(up), w0, w1)
adox( 40,(rp), w2)
adcx( w3, w0)
mov w2, 40(rp)
adox( 48,(rp), w0)
mulx( -8,(up), w2, w3)
mov w0, 48(rp)
lea 64(rp), rp
adcx( w1, w2)
mulx( (up), w0, w1)
jmp L(tp1)
L(ed2): adox( (rp), w0)
adox( %rcx, w1) C relies on rcx = 0
mov w0, (rp)
adc %rcx, w1 C relies on rcx = 0
mov w1, 8(rp)
L(f1): lea (up,un_save,8), up
or R32(un_save), R32(n)
lea 8(un_save), un_save
mov -8(up), u0
mulx( (up), w0, w1)
lea -56(rp,un_save,8), rp
jmp L(b1)
ALIGN(16)
L(tp2): adox( -8,(rp), w2)
adcx( w3, w0)
mov w2, -8(rp)
jrcxz L(ed2)
mulx( 8,(up), w2, w3)
adox( (rp), w0)
lea 8(n), R32(n)
mov w0, (rp)
adcx( w1, w2)
mulx( 16,(up), w0, w1)
adcx( w3, w0)
adox( 8,(rp), w2)
mov w2, 8(rp)
mulx( 24,(up), w2, w3)
lea 64(up), up
adcx( w1, w2)
adox( 16,(rp), w0)
mov w0, 16(rp)
mulx( -32,(up), w0, w1)
adox( 24,(rp), w2)
adcx( w3, w0)
mov w2, 24(rp)
mulx( -24,(up), w2, w3)
adcx( w1, w2)
adox( 32,(rp), w0)
mov w0, 32(rp)
mulx( -16,(up), w0, w1)
adox( 40,(rp), w2)
adcx( w3, w0)
mov w2, 40(rp)
adox( 48,(rp), w0)
mulx( -8,(up), w2, w3)
mov w0, 48(rp)
lea 64(rp), rp
adcx( w1, w2)
mulx( (up), w0, w1)
jmp L(tp2)
L(ed3): adox( (rp), w0)
adox( %rcx, w1) C relies on rcx = 0
mov w0, (rp)
adc %rcx, w1 C relies on rcx = 0
mov w1, 8(rp)
L(f2): lea (up,un_save,8), up
or R32(un_save), R32(n)
jz L(corner2)
mov -16(up), u0
mulx( -8,(up), w2, w3)
lea 8(rp,un_save,8), rp
mulx( (up), w0, w1)
jmp L(tp2)
ALIGN(16)
L(tp3): adox( -8,(rp), w2)
adcx( w3, w0)
mov w2, -8(rp)
jrcxz L(ed3)
mulx( 8,(up), w2, w3)
adox( (rp), w0)
lea 8(n), R32(n)
mov w0, (rp)
adcx( w1, w2)
mulx( 16,(up), w0, w1)
adcx( w3, w0)
adox( 8,(rp), w2)
mov w2, 8(rp)
mulx( 24,(up), w2, w3)
lea 64(up), up
adcx( w1, w2)
adox( 16,(rp), w0)
mov w0, 16(rp)
mulx( -32,(up), w0, w1)
adox( 24,(rp), w2)
adcx( w3, w0)
mov w2, 24(rp)
mulx( -24,(up), w2, w3)
adcx( w1, w2)
adox( 32,(rp), w0)
mov w0, 32(rp)
mulx( -16,(up), w0, w1)
adox( 40,(rp), w2)
adcx( w3, w0)
mov w2, 40(rp)
L(b3): adox( 48,(rp), w0)
mulx( -8,(up), w2, w3)
mov w0, 48(rp)
lea 64(rp), rp
adcx( w1, w2)
mulx( (up), w0, w1)
jmp L(tp3)
L(ed4): adox( (rp), w0)
adox( %rcx, w1) C relies on rcx = 0
mov w0, (rp)
adc %rcx, w1 C relies on rcx = 0
mov w1, 8(rp)
L(f3): lea (up,un_save,8), up
or R32(un_save), R32(n)
jz L(corner3)
mov -24(up), u0
mulx( -16,(up), w0, w1)
lea -56(rp,un_save,8), rp
jmp L(b3)
ALIGN(16)
L(tp4): adox( -8,(rp), w2)
adcx( w3, w0)
mov w2, -8(rp)
jrcxz L(ed4)
mulx( 8,(up), w2, w3)
adox( (rp), w0)
lea 8(n), R32(n)
mov w0, (rp)
adcx( w1, w2)
mulx( 16,(up), w0, w1)
adcx( w3, w0)
adox( 8,(rp), w2)
mov w2, 8(rp)
mulx( 24,(up), w2, w3)
lea 64(up), up
adcx( w1, w2)
adox( 16,(rp), w0)
mov w0, 16(rp)
mulx( -32,(up), w0, w1)
adox( 24,(rp), w2)
adcx( w3, w0)
mov w2, 24(rp)
mulx( -24,(up), w2, w3)
adcx( w1, w2)
adox( 32,(rp), w0)
mov w0, 32(rp)
L(b4): mulx( -16,(up), w0, w1)
adox( 40,(rp), w2)
adcx( w3, w0)
mov w2, 40(rp)
adox( 48,(rp), w0)
mulx( -8,(up), w2, w3)
mov w0, 48(rp)
lea 64(rp), rp
adcx( w1, w2)
mulx( (up), w0, w1)
jmp L(tp4)
L(ed5): adox( (rp), w0)
adox( %rcx, w1) C relies on rcx = 0
mov w0, (rp)
adc %rcx, w1 C relies on rcx = 0
mov w1, 8(rp)
L(f4): lea (up,un_save,8), up
or R32(un_save), R32(n)
mov -32(up), u0
mulx( -24,(up), w2, w3)
lea -56(rp,un_save,8), rp
jmp L(b4)
ALIGN(16)
L(tp5): adox( -8,(rp), w2)
adcx( w3, w0)
mov w2, -8(rp)
jrcxz L(ed5)
mulx( 8,(up), w2, w3)
adox( (rp), w0)
lea 8(n), R32(n)
mov w0, (rp)
adcx( w1, w2)
mulx( 16,(up), w0, w1)
adcx( w3, w0)
adox( 8,(rp), w2)
mov w2, 8(rp)
mulx( 24,(up), w2, w3)
lea 64(up), up
adcx( w1, w2)
adox( 16,(rp), w0)
mov w0, 16(rp)
mulx( -32,(up), w0, w1)
adox( 24,(rp), w2)
adcx( w3, w0)
mov w2, 24(rp)
L(b5): mulx( -24,(up), w2, w3)
adcx( w1, w2)
adox( 32,(rp), w0)
mov w0, 32(rp)
mulx( -16,(up), w0, w1)
adox( 40,(rp), w2)
adcx( w3, w0)
mov w2, 40(rp)
adox( 48,(rp), w0)
mulx( -8,(up), w2, w3)
mov w0, 48(rp)
lea 64(rp), rp
adcx( w1, w2)
mulx( (up), w0, w1)
jmp L(tp5)
L(ed6): adox( (rp), w0)
adox( %rcx, w1) C relies on rcx = 0
mov w0, (rp)
adc %rcx, w1 C relies on rcx = 0
mov w1, 8(rp)
L(f5): lea (up,un_save,8), up
or R32(un_save), R32(n)
mov -40(up), u0
mulx( -32,(up), w0, w1)
lea -56(rp,un_save,8), rp
jmp L(b5)
ALIGN(16)
L(tp6): adox( -8,(rp), w2)
adcx( w3, w0)
mov w2, -8(rp)
jrcxz L(ed6)
mulx( 8,(up), w2, w3)
adox( (rp), w0)
lea 8(n), R32(n)
mov w0, (rp)
adcx( w1, w2)
mulx( 16,(up), w0, w1)
adcx( w3, w0)
adox( 8,(rp), w2)
mov w2, 8(rp)
mulx( 24,(up), w2, w3)
lea 64(up), up
adcx( w1, w2)
adox( 16,(rp), w0)
mov w0, 16(rp)
L(b6): mulx( -32,(up), w0, w1)
adox( 24,(rp), w2)
adcx( w3, w0)
mov w2, 24(rp)
mulx( -24,(up), w2, w3)
adcx( w1, w2)
adox( 32,(rp), w0)
mov w0, 32(rp)
mulx( -16,(up), w0, w1)
adox( 40,(rp), w2)
adcx( w3, w0)
mov w2, 40(rp)
adox( 48,(rp), w0)
mulx( -8,(up), w2, w3)
mov w0, 48(rp)
lea 64(rp), rp
adcx( w1, w2)
mulx( (up), w0, w1)
jmp L(tp6)
L(ed7): adox( (rp), w0)
adox( %rcx, w1) C relies on rcx = 0
mov w0, (rp)
adc %rcx, w1 C relies on rcx = 0
mov w1, 8(rp)
L(f6): lea (up,un_save,8), up
or R32(un_save), R32(n)
mov -48(up), u0
mulx( -40,(up), w2, w3)
lea -56(rp,un_save,8), rp
jmp L(b6)
ALIGN(16)
L(tp7): adox( -8,(rp), w2)
adcx( w3, w0)
mov w2, -8(rp)
jrcxz L(ed7)
mulx( 8,(up), w2, w3)
adox( (rp), w0)
lea 8(n), R32(n)
mov w0, (rp)
adcx( w1, w2)
mulx( 16,(up), w0, w1)
adcx( w3, w0)
adox( 8,(rp), w2)
mov w2, 8(rp)
L(b7): mulx( 24,(up), w2, w3)
lea 64(up), up
adcx( w1, w2)
adox( 16,(rp), w0)
mov w0, 16(rp)
mulx( -32,(up), w0, w1)
adox( 24,(rp), w2)
adcx( w3, w0)
mov w2, 24(rp)
mulx( -24,(up), w2, w3)
adcx( w1, w2)
adox( 32,(rp), w0)
mov w0, 32(rp)
mulx( -16,(up), w0, w1)
adox( 40,(rp), w2)
adcx( w3, w0)
mov w2, 40(rp)
adox( 48,(rp), w0)
mulx( -8,(up), w2, w3)
mov w0, 48(rp)
lea 64(rp), rp
adcx( w1, w2)
mulx( (up), w0, w1)
jmp L(tp7)
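
C Corner code: when only two or three multiplier limbs remain, the tail of
C the cross-product triangle is finished straight-line; looping overhead
C would exceed the work left.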
L(corner3):
mov -24(up), u0
mulx( -16,(up), w0, w1)
adox( -8,(rp), w0)
mulx( -8,(up), w2, w3)
mov w0, -8(rp)
lea 8(rp), rp
adcx( w1, w2)
mulx( (up), w0, w1)
adox( -8,(rp), w2)
adcx( w3, w0)
mov w2, -8(rp)
adox( (rp), w0)
adox( %rcx, w1) C relies on rcx = 0
adcx( %rcx, w1) C relies on rcx = 0
L(corner2):
mov -16(up), u0
mulx( -8,(up), w2, w3)
mulx( (up), %rax, %rbx)
adox( w0, w2)
adcx( w3, %rax)
mov w2, (rp)
adox( w1, %rax)
adox( %rcx, %rbx) C relies on rcx = 0
mov %rax, 8(rp)
adc %rcx, %rbx C relies on rcx = 0
mov -8(up), %rdx
mulx( (up), %rax, %rdx)
add %rbx, %rax
mov %rax, 16(rp)
adc %rcx, %rdx C relies on rcx = 0
mov %rdx, 24(rp)
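
C sqr_diag_addlsh1: restore un (into n), up and rp.  rp[1..2un-2] now holds
C T = sum_{i<j} u[i]*u[j]*B^(i+j); this phase computes the final result
C {rp,2un} = 2*T + sum_i u[i]^2 * B^(2i) in a single pass.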
L(sqr_diag_addlsh1):
pop n
pop up
pop rp
ifdef(`SDA_VARIANT',,`define(`SDA_VARIANT', 2)')
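
C Three variants of the pass follow (cf. the TODO above); SDA_VARIANT
C defaults to 2.  Variant 1 doubles via adcx and adds squares via adox in a
C jrcxz-terminated loop; variant 2 uses plain adc, parking the doubling
C carry in %bl with setc; variant 3 keeps the doubling (CF) and the square
C additions (OF) on separate flag chains.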
ifelse(SDA_VARIANT,1,`
lea (n,n), %rax
movq $0, -8(rp,%rax,8) C FIXME
test R32(%rax), R32(%rax)
mov (up), %rdx
lea 8(up), up
mulx( %rdx, %r8, %rdx)
jmp L(dm)
ALIGN(16)
L(dtop):mov 8(rp), %r9
adcx( %r9, %r9)
adox( %rdx, %r9)
mov %r9, 8(rp)
lea 16(rp), rp
jrcxz L(dend)
mov (up), %rdx
mulx( %rdx, %rax, %rdx)
lea 8(up), up
mov (rp), %r8
adcx( %r8, %r8)
adox( %rax, %r8)
L(dm): mov %r8, (rp)
lea -1(n), n
jmp L(dtop)
L(dend):
')
ifelse(SDA_VARIANT,2,`
dec R32(n)
mov (up), %rdx
xor R32(%rbx), R32(%rbx) C clear CF as side effect
mulx( %rdx, %rax, %r10)
mov %rax, (rp)
mov 8(rp), %r8
mov 16(rp), %r9
jmp L(dm)
ALIGN(16)
L(dtop):mov 24(rp), %r8
mov 32(rp), %r9
lea 16(rp), rp
lea (%rdx,%rbx), %r10
L(dm): adc %r8, %r8
adc %r9, %r9
setc R8(%rbx)
mov 8(up), %rdx
lea 8(up), up
mulx( %rdx, %rax, %rdx)
add %r10, %r8
adc %rax, %r9
mov %r8, 8(rp)
mov %r9, 16(rp)
dec R32(n)
jnz L(dtop)
L(dend):adc %rbx, %rdx
mov %rdx, 24(rp)
')
ifelse(SDA_VARIANT,3,`
dec R32(n)
mov (up), %rdx
test R32(%rbx), R32(%rbx) C clear CF and OF
mulx( %rdx, %rax, %r10)
mov %rax, (rp)
mov 8(rp), %r8
mov 16(rp), %r9
jmp L(dm)
ALIGN(16)
L(dtop):jrcxz L(dend)
mov 24(rp), %r8
mov 32(rp), %r9
lea 16(rp), rp
L(dm): adcx( %r8, %r8)
adcx( %r9, %r9)
mov 8(up), %rdx
lea 8(up), up
adox( %r10, %r8)
mulx( %rdx, %rax, %r10)
adox( %rax, %r9)
mov %r8, 8(rp)
mov %r9, 16(rp)
lea -1(n), R32(n)
jmp L(dtop)
L(dend):adcx( %rcx, %r10)
adox( %rcx, %r10)
mov %r10, 24(rp)
')
pop %rbx
FUNC_EXIT()
ret
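
C Dispatch tables, both indexed by un mod 8.  L(mtab) is rotated by one
C entry so that index k selects the mul_1 feed-in for un-1 = k-1 (mod 8)
C remaining limbs; L(atab) is rotated by two for the first addmul_1 pass,
C which covers un-2 limbs.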
JUMPTABSECT
ALIGN(8)
L(mtab):JMPENT( L(mf7), L(mtab))
JMPENT( L(mf0), L(mtab))
JMPENT( L(mf1), L(mtab))
JMPENT( L(mf2), L(mtab))
JMPENT( L(mf3), L(mtab))
JMPENT( L(mf4), L(mtab))
JMPENT( L(mf5), L(mtab))
JMPENT( L(mf6), L(mtab))
L(atab):JMPENT( L(f6), L(atab))
JMPENT( L(f7), L(atab))
JMPENT( L(f0), L(atab))
JMPENT( L(f1), L(atab))
JMPENT( L(f2), L(atab))
JMPENT( L(f3), L(atab))
JMPENT( L(f4), L(atab))
JMPENT( L(f5), L(atab))
TEXT
EPILOGUE()