Convert divrem_euclidean_qr_1.asm to yasm format.
This commit is contained in:
parent
0f5bb75287
commit
a54d23d652
273
mpn/x86_64/divrem_euclidean_qr_1.as
Normal file
273
mpn/x86_64/divrem_euclidean_qr_1.as
Normal file
@ -0,0 +1,273 @@
|
||||
; x86-64 mpn_divrem_euclidean_qr_1 -- mpn by limb division.
|
||||
|
||||
; Copyright 2004, 2005, 2007, 2008, 2009 Free Software Foundation, Inc.
|
||||
|
||||
; Copyright 2010 Brian Gladman
|
||||
|
||||
; This file is part of the GNU MP Library.
|
||||
|
||||
; The GNU MP Library is free software; you can redistribute it and/or modify
|
||||
; it under the terms of the GNU Lesser General Public License as published
|
||||
; by the Free Software Foundation; either version 3 of the License, or (at
|
||||
; your option) any later version.
|
||||
|
||||
; The GNU MP Library is distributed in the hope that it will be useful, but
|
||||
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||||
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||||
; License for more details.
|
||||
|
||||
; You should have received a copy of the GNU Lesser General Public License
; along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
|
||||
|
||||
; mp_limb_t
|
||||
; mpn_divrem_euclidean_qr_1 (mp_ptr qp, mp_size_t fn,
|
||||
; mp_srcptr np, mp_size_t nn, mp_limb_t d)
|
||||
|
||||
; mp_limb_t
|
||||
; mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn,
|
||||
; mp_srcptr np, mp_size_t nn, mp_limb_t d,
|
||||
; mp_limb_t dinv, int shift)
|
||||
|
||||
; norm unorm frac
|
||||
; k8 13 13 12
|
||||
; netburst 44.2 44.2 42.3
|
||||
; core2 25 24.5 19.3
|
||||
; nehalem 21.5 20.7 18
|
||||
; atom 42 52 37
|
||||
|
||||
; INPUT PARAMETERS
|
||||
; qp %rdi
|
||||
; fn %rsi
|
||||
; np %rdx
|
||||
; nn %rcx
|
||||
; d %r8
|
||||
; dinv %r9 only for mpn_preinv_divrem_1
|
||||
; shift passed on stack only for mpn_preinv_divrem_1
|
||||
|
||||
%include 'yasm_mac.inc'
|
||||
|
||||
%define SPECIAL_CODE_FOR_NORMALIZED_DIVISOR
|
||||
|
||||
TEXT
|
||||
align 16
|
||||
GLOBAL_FUNC mpn_preinv_divrem_1
|
||||
|
||||
xor eax, eax
|
||||
push r13
|
||||
push r12
|
||||
push rbp
|
||||
push rbx
|
||||
|
||||
mov r12, rsi
|
||||
mov rbx, rcx
|
||||
add rcx, rsi
|
||||
mov rsi, rdx
|
||||
|
||||
lea rdi, [rdi+rcx*8-8]
|
||||
|
||||
test r8, r8
|
||||
js L_nent
|
||||
mov cl, [rsp+40]
|
||||
shl r8, cl
|
||||
jmp L_uent
|
||||
|
||||
align 16
|
||||
GLOBAL_FUNC mpn_divrem_euclidean_qr_1
|
||||
xor eax, eax
|
||||
push r13
|
||||
push r12
|
||||
push rbp
|
||||
push rbx
|
||||
|
||||
mov r12, rsi
|
||||
mov rbx, rcx
|
||||
add rcx, rsi
|
||||
mov rsi, rdx
|
||||
je L_ret
|
||||
|
||||
lea rdi, [rdi+rcx*8-8]
|
||||
xor ebp, ebp
|
||||
|
||||
%ifdef SPECIAL_CODE_FOR_NORMALIZED_DIVISOR
|
||||
|
||||
test r8, r8
|
||||
jns L_unnormalized
|
||||
|
||||
L_normalized:
|
||||
test rbx, rbx
|
||||
je L_8
|
||||
mov rbp, [rsi+rbx*8-8]
|
||||
dec rbx
|
||||
mov rax, rbp
|
||||
sub rbp, r8
|
||||
cmovb rbp, rax
|
||||
sbb eax, eax
|
||||
inc eax
|
||||
mov [rdi], rax
|
||||
lea rdi, [rdi-8]
|
||||
L_8:
|
||||
mov rdx, r8
|
||||
mov rax, -1
|
||||
not rdx
|
||||
div r8
|
||||
mov r9, rax
|
||||
mov rax, rbp
|
||||
jmp L_nent
|
||||
|
||||
align 16
|
||||
L_nloop:
|
||||
mov r10, [rsi+rbx*8]
|
||||
lea rbp, [rax+1]
|
||||
mul r9
|
||||
add rax, r10
|
||||
adc rdx, rbp
|
||||
mov rbp, rax
|
||||
mov r13, rdx
|
||||
imul rdx, r8
|
||||
sub r10, rdx
|
||||
mov rax, r8
|
||||
add rax, r10
|
||||
cmp r10, rbp
|
||||
cmovb rax, r10
|
||||
adc r13, -1
|
||||
cmp rax, r8
|
||||
jae L_nfx
|
||||
L_nok:
|
||||
mov [rdi], r13
|
||||
sub rdi, 8
|
||||
L_nent:
|
||||
dec rbx
|
||||
jns L_nloop
|
||||
|
||||
xor ecx, ecx
|
||||
jmp L_87
|
||||
|
||||
L_nfx:
|
||||
sub rax, r8
|
||||
inc r13
|
||||
jmp L_nok
|
||||
|
||||
%endif
|
||||
|
||||
L_unnormalized:
|
||||
test rbx, rbx
|
||||
je L_44
|
||||
mov rax, [rsi+rbx*8-8]
|
||||
cmp rax, r8
|
||||
jae L_44
|
||||
mov [rdi], rbp
|
||||
mov rbp, rax
|
||||
lea rdi, [rdi-8]
|
||||
je L_ret
|
||||
dec rbx
|
||||
L_44:
|
||||
bsr rcx, r8
|
||||
not ecx
|
||||
sal r8, cl
|
||||
sal rbp, cl
|
||||
mov rdx, r8
|
||||
mov rax, -1
|
||||
not rdx
|
||||
div r8
|
||||
test rbx, rbx
|
||||
mov r9, rax
|
||||
mov rax, rbp
|
||||
je L_87
|
||||
L_uent:
|
||||
mov rbp, [rsi+rbx*8-8]
|
||||
shr rax, cl
|
||||
shld rax, rbp, cl
|
||||
sub rbx, 2
|
||||
js L_ulast
|
||||
|
||||
align 16
|
||||
L_uloop:
|
||||
nop
|
||||
mov r10, [rsi+rbx*8]
|
||||
lea r11, [rax+1]
|
||||
shld rbp, r10, cl
|
||||
mul r9
|
||||
add rax, rbp
|
||||
adc rdx, r11
|
||||
mov r11, rax
|
||||
mov r13, rdx
|
||||
imul rdx, r8
|
||||
sub rbp, rdx
|
||||
mov rax, r8
|
||||
add rax, rbp
|
||||
cmp rbp, r11
|
||||
cmovb rax, rbp
|
||||
adc r13, -1
|
||||
cmp rax, r8
|
||||
jae L_ufx
|
||||
L_uok:
|
||||
mov [rdi], r13
|
||||
sub rdi, 8
|
||||
dec rbx
|
||||
mov rbp, r10
|
||||
jns L_uloop
|
||||
L_ulast:
|
||||
lea r11, [rax+1]
|
||||
sal rbp, cl
|
||||
mul r9
|
||||
add rax, rbp
|
||||
adc rdx, r11
|
||||
mov r11, rax
|
||||
mov r13, rdx
|
||||
imul rdx, r8
|
||||
sub rbp, rdx
|
||||
mov rax, r8
|
||||
add rax, rbp
|
||||
cmp rbp, r11
|
||||
cmovb rax, rbp
|
||||
adc r13, -1
|
||||
cmp rax, r8
|
||||
jae L_93
|
||||
L_69:
|
||||
mov [rdi], r13
|
||||
sub rdi, 8
|
||||
jmp L_87
|
||||
|
||||
L_ufx:
|
||||
sub rax, r8
|
||||
inc r13
|
||||
jmp L_uok
|
||||
|
||||
L_93:
|
||||
sub rax, r8
|
||||
inc r13
|
||||
jmp L_69
|
||||
|
||||
L_87:
|
||||
mov rbp, r8
|
||||
neg rbp
|
||||
jmp L_87b
|
||||
|
||||
align 16
|
||||
L_floop:
|
||||
lea r11, [rax+1]
|
||||
mul r9
|
||||
add rdx, r11
|
||||
mov r11, rax
|
||||
mov r13, rdx
|
||||
imul rdx, rbp
|
||||
mov rax, r8
|
||||
add rax, rdx
|
||||
cmp rdx, r11
|
||||
cmovb rax, rdx
|
||||
adc r13, -1
|
||||
mov [rdi], r13
|
||||
sub rdi, 8
|
||||
L_87b:
|
||||
dec r12
|
||||
jns L_floop
|
||||
|
||||
shr rax, cl
|
||||
L_ret:
|
||||
pop rbx
|
||||
pop rbp
|
||||
pop r12
|
||||
pop r13
|
||||
ret
|
||||
|
||||
end
|
||||
|
@ -1,281 +0,0 @@
|
||||
dnl x86-64 mpn_divrem_euclidean_qr_1 -- mpn by limb division.
|
||||
|
||||
dnl Copyright 2004, 2005, 2007, 2008, 2009 Free Software Foundation, Inc.
|
||||
|
||||
dnl This file is part of the GNU MP Library.
|
||||
|
||||
dnl The GNU MP Library is free software; you can redistribute it and/or modify
|
||||
dnl it under the terms of the GNU Lesser General Public License as published
|
||||
dnl by the Free Software Foundation; either version 3 of the License, or (at
|
||||
dnl your option) any later version.
|
||||
|
||||
dnl The GNU MP Library is distributed in the hope that it will be useful, but
|
||||
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||||
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||||
dnl License for more details.
|
||||
|
||||
dnl You should have received a copy of the GNU Lesser General Public License
|
||||
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
|
||||
|
||||
include(`../config.m4')
|
||||
|
||||
|
||||
C norm unorm frac
|
||||
C K8 13 13 12
|
||||
C P4 44.2 44.2 42.3
|
||||
C P6 core2 25 24.5 19.3
|
||||
C P6 corei7 21.5 20.7 18
|
||||
C P6 atom 42 52 37
|
||||
|
||||
C TODO
|
||||
C * Compute the inverse without relying on the div instruction.
|
||||
C Newton's method and mulq, or perhaps the faster fdiv.
|
||||
C * Tune prologue.
|
||||
C * Optimize for Core 2.
|
||||
|
||||
C The code for unnormalized divisors works also for normalized divisors, but
|
||||
C for some reason it runs really slowly (on K8) for that case. Use special
|
||||
C code until we can address this. The Intel Atom is also affected, but
|
||||
C understandably (shld slowness).
|
||||
define(`SPECIAL_CODE_FOR_NORMALIZED_DIVISOR',1)
|
||||
|
||||
C mp_limb_t
|
||||
C mpn_divrem_euclidean_qr_1 (mp_ptr qp, mp_size_t fn,
|
||||
C mp_srcptr np, mp_size_t nn, mp_limb_t d)
|
||||
|
||||
C mp_limb_t
|
||||
C mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn,
|
||||
C mp_srcptr np, mp_size_t nn, mp_limb_t d,
|
||||
C mp_limb_t dinv, int cnt)
|
||||
|
||||
C INPUT PARAMETERS
|
||||
define(`qp', `%rdi')
|
||||
define(`fn_param', `%rsi')
|
||||
define(`up_param', `%rdx')
|
||||
define(`un_param', `%rcx')
|
||||
define(`d', `%r8')
|
||||
define(`dinv', `%r9') C only for mpn_preinv_divrem_1
|
||||
C shift passed on stack C only for mpn_preinv_divrem_1
|
||||
|
||||
define(`cnt', `%rcx')
|
||||
define(`cnt8', `%cl')
|
||||
define(`up', `%rsi')
|
||||
define(`fn', `%r12')
|
||||
define(`un', `%rbx')
|
||||
|
||||
|
||||
C rax rbx rcx rdx rsi rdi rbp r8 r9 r10 r11 r12 r13 r14 r15
|
||||
C cnt qp d dinv
|
||||
|
||||
ASM_START()
|
||||
TEXT
|
||||
ALIGN(16)
|
||||
PROLOGUE(mpn_preinv_divrem_1)
|
||||
xor %eax, %eax
|
||||
push %r13
|
||||
push %r12
|
||||
push %rbp
|
||||
push %rbx
|
||||
|
||||
mov fn_param, fn
|
||||
mov un_param, un
|
||||
add fn_param, un_param
|
||||
mov up_param, up
|
||||
|
||||
lea -8(qp,un_param,8), qp
|
||||
|
||||
test d, d
|
||||
js L(nent)
|
||||
mov 40(%rsp), cnt8
|
||||
shl cnt8, d
|
||||
jmp L(uent)
|
||||
EPILOGUE()
|
||||
|
||||
ALIGN(16)
|
||||
PROLOGUE(mpn_divrem_euclidean_qr_1)
|
||||
xor %eax, %eax
|
||||
push %r13
|
||||
push %r12
|
||||
push %rbp
|
||||
push %rbx
|
||||
|
||||
mov fn_param, fn
|
||||
mov un_param, un
|
||||
add fn_param, un_param
|
||||
mov up_param, up
|
||||
je L(ret)
|
||||
|
||||
lea -8(qp,un_param,8), qp
|
||||
xor %ebp, %ebp
|
||||
|
||||
|
||||
ifdef(`SPECIAL_CODE_FOR_NORMALIZED_DIVISOR',`
|
||||
test d, d
|
||||
jns L(unnormalized)
|
||||
|
||||
L(normalized):
|
||||
test un, un
|
||||
je L(8) C un == 0
|
||||
mov -8(up,un,8), %rbp
|
||||
dec un
|
||||
mov %rbp, %rax
|
||||
sub d, %rbp
|
||||
cmovb %rax, %rbp
|
||||
sbb %eax, %eax
|
||||
inc %eax
|
||||
mov %rax, (qp)
|
||||
lea -8(qp), qp
|
||||
L(8):
|
||||
mov d, %rdx
|
||||
mov $-1, %rax
|
||||
not %rdx
|
||||
div d C FREE rax rdx rcx r9 r10 r11
|
||||
mov %rax, dinv
|
||||
mov %rbp, %rax
|
||||
jmp L(nent)
|
||||
|
||||
ALIGN(16)
|
||||
L(nloop): C cycK8 cycP6 cycP4
|
||||
mov (up,un,8), %r10 C
|
||||
lea 1(%rax), %rbp C
|
||||
mul dinv C 0,13 0,19 0,45
|
||||
add %r10, %rax C 4 8 12
|
||||
adc %rbp, %rdx C 5 9 13
|
||||
mov %rax, %rbp C 5 9 13
|
||||
mov %rdx, %r13 C 6 11 23
|
||||
imul d, %rdx C 6 11 23
|
||||
sub %rdx, %r10 C 10 16 33
|
||||
mov d, %rax C
|
||||
add %r10, %rax C 11 17 34
|
||||
cmp %rbp, %r10 C 11 17 34
|
||||
cmovb %r10, %rax C 12 18 35
|
||||
adc $-1, %r13 C
|
||||
cmp d, %rax C
|
||||
jae L(nfx) C
|
||||
L(nok): mov %r13, (qp) C
|
||||
sub $8, qp C
|
||||
L(nent):dec un C
|
||||
jns L(nloop) C
|
||||
|
||||
xor %ecx, %ecx
|
||||
jmp L(87)
|
||||
|
||||
L(nfx): sub d, %rax
|
||||
inc %r13
|
||||
jmp L(nok)
|
||||
')
|
||||
|
||||
L(unnormalized):
|
||||
test un, un
|
||||
je L(44)
|
||||
mov -8(up,un,8), %rax
|
||||
cmp d, %rax
|
||||
jae L(44)
|
||||
mov %rbp, (qp)
|
||||
mov %rax, %rbp
|
||||
lea -8(qp), qp
|
||||
je L(ret)
|
||||
dec un
|
||||
L(44):
|
||||
bsr d, %rcx
|
||||
not %ecx
|
||||
sal %cl, d
|
||||
sal %cl, %rbp
|
||||
mov d, %rdx
|
||||
mov $-1, %rax
|
||||
not %rdx
|
||||
div d C FREE rax rdx r9 r10 r11
|
||||
test un, un
|
||||
mov %rax, dinv
|
||||
mov %rbp, %rax
|
||||
je L(87)
|
||||
L(uent):
|
||||
mov -8(up,un,8), %rbp
|
||||
shr %cl, %rax
|
||||
shld %cl, %rbp, %rax
|
||||
sub $2, un
|
||||
js L(ulast)
|
||||
|
||||
ALIGN(16)
|
||||
L(uloop):
|
||||
nop
|
||||
mov (up,un,8), %r10
|
||||
lea 1(%rax), %r11
|
||||
shld %cl, %r10, %rbp
|
||||
mul dinv
|
||||
add %rbp, %rax
|
||||
adc %r11, %rdx
|
||||
mov %rax, %r11
|
||||
mov %rdx, %r13
|
||||
imul d, %rdx
|
||||
sub %rdx, %rbp
|
||||
mov d, %rax
|
||||
add %rbp, %rax
|
||||
cmp %r11, %rbp
|
||||
cmovb %rbp, %rax
|
||||
adc $-1, %r13
|
||||
cmp d, %rax
|
||||
jae L(ufx)
|
||||
L(uok): mov %r13, (qp)
|
||||
sub $8, qp
|
||||
dec un
|
||||
mov %r10, %rbp
|
||||
jns L(uloop)
|
||||
L(ulast):
|
||||
lea 1(%rax), %r11
|
||||
sal %cl, %rbp
|
||||
mul dinv
|
||||
add %rbp, %rax
|
||||
adc %r11, %rdx
|
||||
mov %rax, %r11
|
||||
mov %rdx, %r13
|
||||
imul d, %rdx
|
||||
sub %rdx, %rbp
|
||||
mov d, %rax
|
||||
add %rbp, %rax
|
||||
cmp %r11, %rbp
|
||||
cmovb %rbp, %rax
|
||||
adc $-1, %r13
|
||||
cmp d, %rax
|
||||
jae L(93)
|
||||
L(69): mov %r13, (qp)
|
||||
sub $8, qp
|
||||
jmp L(87)
|
||||
|
||||
L(ufx): sub d, %rax
|
||||
inc %r13
|
||||
jmp L(uok)
|
||||
|
||||
L(93): sub d, %rax
|
||||
inc %r13
|
||||
jmp L(69)
|
||||
|
||||
L(87): mov d, %rbp
|
||||
neg %rbp
|
||||
jmp L(87b)
|
||||
|
||||
ALIGN(16)
|
||||
L(floop): C cycK8 cycP6 cycP4
|
||||
lea 1(%rax), %r11 C
|
||||
mul dinv C 0,12
|
||||
add %r11, %rdx C 5
|
||||
mov %rax, %r11 C 4
|
||||
mov %rdx, %r13 C 6
|
||||
imul %rbp, %rdx C 6
|
||||
mov d, %rax C
|
||||
add %rdx, %rax C 10
|
||||
cmp %r11, %rdx C 10
|
||||
cmovb %rdx, %rax C 11
|
||||
adc $-1, %r13 C
|
||||
mov %r13, (qp) C
|
||||
sub $8, qp C
|
||||
L(87b): dec fn C
|
||||
jns L(floop) C
|
||||
|
||||
shr %cl, %rax
|
||||
L(ret): pop %rbx
|
||||
pop %rbp
|
||||
pop %r12
|
||||
pop %r13
|
||||
ret
|
||||
EPILOGUE()
|
Loading…
Reference in New Issue
Block a user