; mpir/mpn/x86_64/divrem_euclidean_qr_1.as
; x86-64 mpn_divrem_euclidean_qr_1 -- mpn by limb division.
; Copyright 2004, 2005, 2007, 2008, 2009 Free Software Foundation, Inc.
; Copyright 2010 Brian Gladman (Conversion to yasm format)
; This file is part of the GNU MP Library.
; The GNU MP Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published
; by the Free Software Foundation; either version 3 of the License, or (at
; your option) any later version.
; The GNU MP Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
; License for more details.
; You should have received a copy of the GNU Lesser General Public License
; along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

; mp_limb_t
; mpn_divrem_euclidean_qr_1 (mp_ptr qp, mp_size_t fn,
;                            mp_srcptr np, mp_size_t nn, mp_limb_t d)
;
; mp_limb_t
; mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn,
;                      mp_srcptr np, mp_size_t nn, mp_limb_t d,
;                      mp_limb_t dinv, int shift)
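;
; Both entry points write the nn integer quotient limbs to
; qp[fn..fn+nn-1], develop fn further "fraction" quotient limbs into
; qp[0..fn-1], and return the remainder.  As a rough C sketch (a
; simplified reference, not the algorithm used below; udiv_qrnnd is
; GMP's 2x1-limb division macro from longlong.h, whose generic form
; wants d normalized; the code below arranges exactly that):
;
;   mp_limb_t r = 0;
;   mp_size_t i;
;   for (i = nn - 1; i >= 0; i--)      /* integer limbs   */
;       udiv_qrnnd (qp[fn + i], r, r, np[i], d);
;   for (i = fn - 1; i >= 0; i--)      /* fraction limbs  */
;       udiv_qrnnd (qp[i], r, r, 0, d);
;   return r;
;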
; cycles/limb
;             norm   unorm   frac
; k8          13     13      12
; netburst    44.2   44.2    42.3
; core2       25     24.5    19.3
; nehalem     21.5   20.7    18
; atom        42     52      37
;
; INPUT PARAMETERS
; qp      rdi
; fn      rsi
; np      rdx
; nn      rcx
; d       r8
; dinv    r9                only for mpn_preinv_divrem_1
; shift   passed on stack   only for mpn_preinv_divrem_1
%include 'yasm_mac.inc'

%define SPECIAL_CODE_FOR_NORMALIZED_DIVISOR

        TEXT
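
; mpn_preinv_divrem_1: the caller supplies dinv and the shift count (the
; seventh argument, on the stack), so d is pre-shifted here and control
; jumps straight into the main loops below.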
        align   16
GLOBAL_FUNC mpn_preinv_divrem_1
        xor     eax, eax
        push    r13
        push    r12
        push    rbp
        push    rbx
        mov     r12, rsi
        mov     rbx, rcx
        add     rcx, rsi
        mov     rsi, rdx
        lea     rdi, [rdi+rcx*8-8]
        test    r8, r8
        js      L_nent
        mov     cl, [rsp+40]
        shl     r8, cl
        jmp     L_uent
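
; mpn_divrem_euclidean_qr_1: computes the shift count and dinv itself,
; then shares the division loops with mpn_preinv_divrem_1.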
        align   16
GLOBAL_FUNC mpn_divrem_euclidean_qr_1
        xor     eax, eax
        push    r13
        push    r12
        push    rbp
        push    rbx
        mov     r12, rsi
        mov     rbx, rcx
        add     rcx, rsi
        mov     rsi, rdx
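; the two movs above leave flags untouched, so this je tests
; nn + fn == 0 from the add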
        je      L_ret
        lea     rdi, [rdi+rcx*8-8]
        xor     ebp, ebp

%ifdef SPECIAL_CODE_FOR_NORMALIZED_DIVISOR
        test    r8, r8
        jns     L_unnormalized
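
; Normalized divisor: the top bit of d is already set, so the most
; significant numerator limb yields a quotient limb of 0 or 1, decided
; with a subtract and cmov instead of a multiply.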
L_normalized:
        test    rbx, rbx
        je      L_8
        mov     rbp, [rsi+rbx*8-8]
        dec     rbx
        mov     rax, rbp
        sub     rbp, r8
        cmovb   rbp, rax
        sbb     eax, eax
        inc     eax
        mov     [rdi], rax
        lea     rdi, [rdi-8]
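
; compute the 2/1 reciprocal dinv = floor((B^2 - 1) / d) - B, B = 2^64:
; rdx:rax is loaded with (B - 1 - d) * B + (B - 1) = B^2 - 1 - d * B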
L_8:
        mov     rdx, r8
        mov     rax, -1
        not     rdx
        div     r8
        mov     r9, rax
        mov     rax, rbp
        jmp     L_nent
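
; Main loop, normalized divisor.  rax carries the running remainder r.
; Each pass estimates the quotient limb as hi(r * dinv) + r + 1 while
; folding the next numerator limb into the low half, multiplies back
; with imul, and corrects the pair with cmovb/adc; the rare case of a
; remainder still >= d is patched out of line at L_nfx.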
        align   16
L_nloop:
        mov     r10, [rsi+rbx*8]
        lea     rbp, [rax+1]
        mul     r9
        add     rax, r10
        adc     rdx, rbp
        mov     rbp, rax
        mov     r13, rdx
        imul    rdx, r8
        sub     r10, rdx
        mov     rax, r8
        add     rax, r10
        cmp     r10, rbp
        cmovb   rax, r10
        adc     r13, -1
        cmp     rax, r8
        jae     L_nfx
L_nok:
        mov     [rdi], r13
        sub     rdi, 8
L_nent:
        dec     rbx
        jns     L_nloop
        xor     ecx, ecx
        jmp     L_87
L_nfx:
        sub     rax, r8
        inc     r13
        jmp     L_nok

%endif
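
; Unnormalized divisor: count the leading zeros of d via bsr/not, shift
; d (and the numerator limbs as they are consumed) left by cl so the
; top bit is set, then compute dinv exactly as at L_8.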
L_unnormalized:
        test    rbx, rbx
        je      L_44
        mov     rax, [rsi+rbx*8-8]
        cmp     rax, r8
        jae     L_44
        mov     [rdi], rbp
        mov     rbp, rax
        lea     rdi, [rdi-8]
        je      L_ret
        dec     rbx
L_44:
        bsr     rcx, r8
        not     ecx
        sal     r8, cl
        sal     rbp, cl
        mov     rdx, r8
        mov     rax, -1
        not     rdx
        div     r8
        test    rbx, rbx
        mov     r9, rax
        mov     rax, rbp
        je      L_87
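
; L_uent is also the entry point from mpn_preinv_divrem_1 for a shifted
; divisor: bring the top numerator limb into the running remainder
; under the normalization shift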
L_uent:
        mov     rbp, [rsi+rbx*8-8]
        shr     rax, cl
        shld    rax, rbp, cl
        sub     rbx, 2
        js      L_ulast
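
; Main loop, unnormalized divisor: the same estimate/adjust scheme as
; L_nloop, with shld feeding each new numerator limb in under the
; normalization shift.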
        align   16
L_uloop:
        nop
        mov     r10, [rsi+rbx*8]
        lea     r11, [rax+1]
        shld    rbp, r10, cl
        mul     r9
        add     rax, rbp
        adc     rdx, r11
        mov     r11, rax
        mov     r13, rdx
        imul    rdx, r8
        sub     rbp, rdx
        mov     rax, r8
        add     rax, rbp
        cmp     rbp, r11
        cmovb   rax, rbp
        adc     r13, -1
        cmp     rax, r8
        jae     L_ufx
L_uok:
        mov     [rdi], r13
        sub     rdi, 8
        dec     rbx
        mov     rbp, r10
        jns     L_uloop
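
; final (lowest) numerator limb: shift in zero bits rather than a
; further limb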
L_ulast:
        lea     r11, [rax+1]
        sal     rbp, cl
        mul     r9
        add     rax, rbp
        adc     rdx, r11
        mov     r11, rax
        mov     r13, rdx
        imul    rdx, r8
        sub     rbp, rdx
        mov     rax, r8
        add     rax, rbp
        cmp     rbp, r11
        cmovb   rax, rbp
        adc     r13, -1
        cmp     rax, r8
        jae     L_93
L_69:
        mov     [rdi], r13
        sub     rdi, 8
        jmp     L_87
L_ufx:
        sub     rax, r8
        inc     r13
        jmp     L_uok
L_93:
        sub     rax, r8
        inc     r13
        jmp     L_69
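
; Fraction loop: the numerator is exhausted, so each of the fn
; remaining quotient limbs divides r * B by d.  rbp is set to -d so
; that the candidate remainder -q * d can be formed with imul alone.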
L_87:
        mov     rbp, r8
        neg     rbp
        jmp     L_87b

        align   16
L_floop:
        lea     r11, [rax+1]
        mul     r9
        add     rdx, r11
        mov     r11, rax
        mov     r13, rdx
        imul    rdx, rbp
        mov     rax, r8
        add     rax, rdx
        cmp     rdx, r11
        cmovb   rax, rdx
        adc     r13, -1
        mov     [rdi], r13
        sub     rdi, 8
L_87b:
        dec     r12
        jns     L_floop
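; shift the remainder back down to undo the normalization (cl is zero
; when d was already normalized)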
        shr     rax, cl
L_ret:
        pop     rbx
        pop     rbp
        pop     r12
        pop     r13
        ret
        end