Convert divrem_euclidean_qr_1.asm to yasm format.

(no author) 2010-02-07 13:32:16 +00:00
parent 0f5bb75287
commit a54d23d652
2 changed files with 273 additions and 281 deletions


@@ -0,0 +1,273 @@
; x86-64 mpn_divrem_euclidean_qr_1 -- mpn by limb division.
; Copyright 2004, 2005, 2007, 2008, 2009 Free Software Foundation, Inc.
; Copyright 2010 Brian Gladman
; This file is part of the GNU MP Library.
; The GNU MP Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published
; by the Free Software Foundation; either version 3 of the License, or (at
; your option) any later version.
; The GNU MP Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
; License for more details.
; You should have received a copy of the GNU Lesser General Public License
; along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
; mp_limb_t
; mpn_divrem_euclidean_qr_1 (mp_ptr qp, mp_size_t fn,
; mp_srcptr np, mp_size_t nn, mp_limb_t d)
; mp_limb_t
; mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn,
; mp_srcptr np, mp_size_t nn, mp_limb_t d,
; mp_limb_t dinv, int shift)
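; Both functions divide the nn-limb number {np, nn} by d, developing fn
; additional fraction limbs; the nn+fn quotient limbs are written to
; {qp, nn+fn} (fraction limbs lowest) and the remainder is returned in rax.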
;             norm   unorm   frac
; k8            13      13     12
; netburst    44.2    44.2   42.3
; core2         25    24.5   19.3
; nehalem     21.5    20.7     18
; atom          42      52     37
; INPUT PARAMETERS
; qp %rdi
; fn %rsi
; np %rdx
; nn %rcx
; d %r8
; dinv %r9 only for mpn_preinv_divrem_1
; shift passed on stack only for mpn_preinv_divrem_1
%include 'yasm_mac.inc'
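; TODO
;  * Compute the inverse without relying on the div instruction.
;    Newton's method and mulq, or perhaps the faster fdiv.
;  * Tune prologue.
;  * Optimize for Core 2.
; The code for unnormalized divisors works also for normalized divisors, but
; for some reason it runs really slowly (on K8) for that case. Use special
; code until we can address this. The Intel Atom is also affected, but
; understandably (shld slowness).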
%define SPECIAL_CODE_FOR_NORMALIZED_DIVISOR
TEXT
align 16
GLOBAL_FUNC mpn_preinv_divrem_1
xor eax, eax
push r13
push r12
push rbp
push rbx
mov r12, rsi
mov rbx, rcx
add rcx, rsi
mov rsi, rdx
lea rdi, [rdi+rcx*8-8]
test r8, r8
js L_nent
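; shift (the 7th argument) is on the stack: [rsp+8] at entry, plus 32
; bytes for the four pushes above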
mov cl, [rsp+40]
shl r8, cl
jmp L_uent
align 16
GLOBAL_FUNC mpn_divrem_euclidean_qr_1
xor eax, eax
push r13
push r12
push rbp
push rbx
mov r12, rsi
mov rbx, rcx
add rcx, rsi
mov rsi, rdx
je L_ret
lea rdi, [rdi+rcx*8-8]
xor ebp, ebp
%ifdef SPECIAL_CODE_FOR_NORMALIZED_DIVISOR
test r8, r8
jns L_unnormalized
L_normalized:
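; With d normalized (top bit set), the high dividend limb yields a
; quotient limb of just 0 or 1: subtract d if it fits and store that
; bit as the most significant quotient limb.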
test rbx, rbx
je L_8
mov rbp, [rsi+rbx*8-8]
dec rbx
mov rax, rbp
sub rbp, r8
cmovb rbp, rax
sbb eax, eax
inc eax
mov [rdi], rax
lea rdi, [rdi-8]
L_8:
mov rdx, r8
mov rax, -1
not rdx
div r8
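; rax = floor((B^2-1)/d) - B (B = 2^64), the reciprocal of the normalized d
; FREE rax rdx rcx r9 r10 r11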
mov r9, rax
mov rax, rbp
jmp L_nent
align 16
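; Main loop, normalized divisor.  Each pass divides the two-limb value
; r*B + u (r = running remainder in rax, u = next dividend limb) by d:
; the quotient limb is estimated as q1 = hi(r*dinv) + r + 1, the
; candidate remainder as u - q1*d mod B, and both are corrected by at
; most one (cmovb/adc), with a rare extra fixup at L_nfx.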
L_nloop:
mov r10, [rsi+rbx*8]
lea rbp, [rax+1]
mul r9
add rax, r10
adc rdx, rbp
mov rbp, rax
mov r13, rdx
imul rdx, r8
sub r10, rdx
mov rax, r8
add rax, r10
cmp r10, rbp
cmovb rax, r10
adc r13, -1
cmp rax, r8
jae L_nfx
L_nok:
mov [rdi], r13
sub rdi, 8
L_nent:
dec rbx
jns L_nloop
xor ecx, ecx
jmp L_87
L_nfx:
sub rax, r8
inc r13
jmp L_nok
%endif
L_unnormalized:
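; If the high dividend limb is already less than d it becomes the
; initial remainder and a zero quotient limb is stored in its place.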
test rbx, rbx
je L_44
mov rax, [rsi+rbx*8-8]
cmp rax, r8
jae L_44
mov [rdi], rbp
mov rbp, rax
lea rdi, [rdi-8]
je L_ret
dec rbx
L_44:
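; cl = ~bsr(d) = 63 - (index of d's top set bit), i.e. the leading zero
; count of d; shift d and the partial remainder left to normalize.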
bsr rcx, r8
not ecx
sal r8, cl
sal rbp, cl
mov rdx, r8
mov rax, -1
not rdx
div r8
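; FREE rax rdx r9 r10 r11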
test rbx, rbx
mov r9, rax
mov rax, rbp
je L_87
L_uent:
mov rbp, [rsi+rbx*8-8]
shr rax, cl
shld rax, rbp, cl
sub rbx, 2
js L_ulast
align 16
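; Unnormalized loop.  The dividend limbs are fed through a cl-bit left
; shift (shld merges adjacent limbs on the fly) to match the pre-shifted
; d; the quotient/remainder step is otherwise as in the normalized loop.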
L_uloop:
nop
mov r10, [rsi+rbx*8]
lea r11, [rax+1]
shld rbp, r10, cl
mul r9
add rax, rbp
adc rdx, r11
mov r11, rax
mov r13, rdx
imul rdx, r8
sub rbp, rdx
mov rax, r8
add rax, rbp
cmp rbp, r11
cmovb rax, rbp
adc r13, -1
cmp rax, r8
jae L_ufx
L_uok:
mov [rdi], r13
sub rdi, 8
dec rbx
mov rbp, r10
jns L_uloop
L_ulast:
lea r11, [rax+1]
sal rbp, cl
mul r9
add rax, rbp
adc rdx, r11
mov r11, rax
mov r13, rdx
imul rdx, r8
sub rbp, rdx
mov rax, r8
add rax, rbp
cmp rbp, r11
cmovb rax, rbp
adc r13, -1
cmp rax, r8
jae L_93
L_69:
mov [rdi], r13
sub rdi, 8
jmp L_87
L_ufx:
sub rax, r8
inc r13
jmp L_uok
L_93:
sub rax, r8
inc r13
jmp L_69
L_87:
mov rbp, r8
neg rbp
jmp L_87b
align 16
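; Fraction loop: develops fn further quotient limbs by continuing the
; division with zero dividend limbs; rbp holds -d, so a single imul
; forms the candidate remainder -q1*d mod B.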
L_floop:
lea r11, [rax+1]
mul r9
add rdx, r11
mov r11, rax
mov r13, rdx
imul rdx, rbp
mov rax, r8
add rax, rdx
cmp rdx, r11
cmovb rax, rdx
adc r13, -1
mov [rdi], r13
sub rdi, 8
L_87b:
dec r12
jns L_floop
shr rax, cl
L_ret:
pop rbx
pop rbp
pop r12
pop r13
ret
end


@@ -1,281 +0,0 @@
dnl x86-64 mpn_divrem_euclidean_qr_1 -- mpn by limb division.
dnl Copyright 2004, 2005, 2007, 2008, 2009 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C            norm   unorm   frac
C K8           13      13     12
C P4         44.2    44.2   42.3
C P6 core2     25    24.5   19.3
C P6 corei7  21.5    20.7     18
C P6 atom      42      52     37
C TODO
C * Compute the inverse without relying on the div instruction.
C Newton's method and mulq, or perhaps the faster fdiv.
C * Tune prologue.
C * Optimize for Core 2.
C The code for unnormalized divisors works also for normalized divisors, but
C for some reason it runs really slowly (on K8) for that case. Use special
C code until we can address this. The Intel Atom is also affected, but
C understandably (shld slowness).
define(`SPECIAL_CODE_FOR_NORMALIZED_DIVISOR',1)
C mp_limb_t
C mpn_divrem_euclidean_qr_1 (mp_ptr qp, mp_size_t fn,
C mp_srcptr np, mp_size_t nn, mp_limb_t d)
C mp_limb_t
C mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn,
C mp_srcptr np, mp_size_t nn, mp_limb_t d,
C mp_limb_t dinv, int cnt)
C INPUT PARAMETERS
define(`qp', `%rdi')
define(`fn_param', `%rsi')
define(`up_param', `%rdx')
define(`un_param', `%rcx')
define(`d', `%r8')
define(`dinv', `%r9') C only for mpn_preinv_divrem_1
C shift passed on stack C only for mpn_preinv_divrem_1
define(`cnt', `%rcx')
define(`cnt8', `%cl')
define(`up', `%rsi')
define(`fn', `%r12')
define(`un', `%rbx')
C rax rbx rcx rdx rsi rdi rbp r8 r9  r10 r11 r12 r13 r14 r15
C         cnt         qp      d  dinv
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_preinv_divrem_1)
xor %eax, %eax
push %r13
push %r12
push %rbp
push %rbx
mov fn_param, fn
mov un_param, un
add fn_param, un_param
mov up_param, up
lea -8(qp,un_param,8), qp
test d, d
js L(nent)
mov 40(%rsp), cnt8
shl cnt8, d
jmp L(uent)
EPILOGUE()
ALIGN(16)
PROLOGUE(mpn_divrem_euclidean_qr_1)
xor %eax, %eax
push %r13
push %r12
push %rbp
push %rbx
mov fn_param, fn
mov un_param, un
add fn_param, un_param
mov up_param, up
je L(ret)
lea -8(qp,un_param,8), qp
xor %ebp, %ebp
ifdef(`SPECIAL_CODE_FOR_NORMALIZED_DIVISOR',`
test d, d
jns L(unnormalized)
L(normalized):
test un, un
je L(8) C un == 0
mov -8(up,un,8), %rbp
dec un
mov %rbp, %rax
sub d, %rbp
cmovb %rax, %rbp
sbb %eax, %eax
inc %eax
mov %rax, (qp)
lea -8(qp), qp
L(8):
mov d, %rdx
mov $-1, %rax
not %rdx
div d C FREE rax rdx rcx r9 r10 r11
mov %rax, dinv
mov %rbp, %rax
jmp L(nent)
ALIGN(16)
L(nloop): C cycK8 cycP6 cycP4
mov (up,un,8), %r10 C
lea 1(%rax), %rbp C
mul dinv C 0,13 0,19 0,45
add %r10, %rax C 4 8 12
adc %rbp, %rdx C 5 9 13
mov %rax, %rbp C 5 9 13
mov %rdx, %r13 C 6 11 23
imul d, %rdx C 6 11 23
sub %rdx, %r10 C 10 16 33
mov d, %rax C
add %r10, %rax C 11 17 34
cmp %rbp, %r10 C 11 17 34
cmovb %r10, %rax C 12 18 35
adc $-1, %r13 C
cmp d, %rax C
jae L(nfx) C
L(nok): mov %r13, (qp) C
sub $8, qp C
L(nent):dec un C
jns L(nloop) C
xor %ecx, %ecx
jmp L(87)
L(nfx): sub d, %rax
inc %r13
jmp L(nok)
')
L(unnormalized):
test un, un
je L(44)
mov -8(up,un,8), %rax
cmp d, %rax
jae L(44)
mov %rbp, (qp)
mov %rax, %rbp
lea -8(qp), qp
je L(ret)
dec un
L(44):
bsr d, %rcx
not %ecx
sal %cl, d
sal %cl, %rbp
mov d, %rdx
mov $-1, %rax
not %rdx
div d C FREE rax rdx r9 r10 r11
test un, un
mov %rax, dinv
mov %rbp, %rax
je L(87)
L(uent):
mov -8(up,un,8), %rbp
shr %cl, %rax
shld %cl, %rbp, %rax
sub $2, un
js L(ulast)
ALIGN(16)
L(uloop):
nop
mov (up,un,8), %r10
lea 1(%rax), %r11
shld %cl, %r10, %rbp
mul dinv
add %rbp, %rax
adc %r11, %rdx
mov %rax, %r11
mov %rdx, %r13
imul d, %rdx
sub %rdx, %rbp
mov d, %rax
add %rbp, %rax
cmp %r11, %rbp
cmovb %rbp, %rax
adc $-1, %r13
cmp d, %rax
jae L(ufx)
L(uok): mov %r13, (qp)
sub $8, qp
dec un
mov %r10, %rbp
jns L(uloop)
L(ulast):
lea 1(%rax), %r11
sal %cl, %rbp
mul dinv
add %rbp, %rax
adc %r11, %rdx
mov %rax, %r11
mov %rdx, %r13
imul d, %rdx
sub %rdx, %rbp
mov d, %rax
add %rbp, %rax
cmp %r11, %rbp
cmovb %rbp, %rax
adc $-1, %r13
cmp d, %rax
jae L(93)
L(69): mov %r13, (qp)
sub $8, qp
jmp L(87)
L(ufx): sub d, %rax
inc %r13
jmp L(uok)
L(93): sub d, %rax
inc %r13
jmp L(69)
L(87): mov d, %rbp
neg %rbp
jmp L(87b)
ALIGN(16)
L(floop): C cycK8 cycP6 cycP4
lea 1(%rax), %r11 C
mul dinv C 0,12
add %r11, %rdx C 5
mov %rax, %r11 C 4
mov %rdx, %r13 C 6
imul %rbp, %rdx C 6
mov d, %rax C
add %rdx, %rax C 10
cmp %r11, %rdx C 10
cmovb %rdx, %rax C 11
adc $-1, %r13 C
mov %r13, (qp) C
sub $8, qp C
L(87b): dec fn C
jns L(floop) C
shr %cl, %rax
L(ret): pop %rbx
pop %rbp
pop %r12
pop %r13
ret
EPILOGUE()