274 lines
4.7 KiB
ActionScript
274 lines
4.7 KiB
ActionScript
; x86-64 mpn_divrem_euclidean_qr_1 -- mpn by limb division.
|
|
|
|
; Copyright 2004, 2005, 2007, 2008, 2009 Free Software Foundation, Inc.
|
|
|
|
; Copyright 2010 Brian Gladman (Conversion to yasm format)
|
|
|
|
; This file is part of the GNU MP Library.
|
|
|
|
; The GNU MP Library is free software; you can redistribute it and/or modify
|
|
; it under the terms of the GNU Lesser General Public License as published
|
|
; by the Free Software Foundation; either version 3 of the License, or (at
|
|
; your option) any later version.
|
|
|
|
; The GNU MP Library is distributed in the hope that it will be useful, but
|
|
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
|
; License for more details.
|
|
|
|
; You should have received a copy of the GNU Lesser General Public License
; along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
|
|
|
|
; mp_limb_t
|
|
; mpn_divrem_euclidean_qr_1 (mp_ptr qp, mp_size_t fn,
|
|
; mp_srcptr np, mp_size_t nn, mp_limb_t d)
|
|
|
|
; mp_limb_t
|
|
; mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn,
|
|
; mp_srcptr np, mp_size_t nn, mp_limb_t d,
|
|
; mp_limb_t dinv, int shift)
|
|
|
|
; norm unorm frac
|
|
; k8 13 13 12
|
|
; netburst 44.2 44.2 42.3
|
|
; core2 25 24.5 19.3
|
|
; nehalem 21.5 20.7 18
|
|
; atom 42 52 37
|
|
|
|
; INPUT PARAMETERS
|
|
; qp %rdi
|
|
; fn %rsi
|
|
; np %rdx
|
|
; nn %rcx
|
|
; d %r8
|
|
; dinv %r9 only for mpn_preinv_divrem_1
|
|
; shift passed on stack only for mpn_preinv_divrem_1
|
|
|
|
%include 'yasm_mac.inc'
|
|
|
|
%define SPECIAL_CODE_FOR_NORMALIZED_DIVISOR
|
|
|
|
TEXT
|
|
align 16
|
|
;-----------------------------------------------------------------------
; mp_limb_t mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn,
;                                mp_srcptr np, mp_size_t nn, mp_limb_t d,
;                                mp_limb_t dinv, int shift)
;
; In:    rdi = qp, rsi = fn, rdx = np, rcx = nn, r8 = d, r9 = dinv,
;        shift = first stack argument
; Out:   rax = value left by the shared division code this stub jumps to
;
; This is only an entry stub.  It sets up the same register state that
; mpn_divrem_euclidean_qr_1 establishes and then jumps into that
; routine's loops (L_nent / L_uent), skipping the computation of the
; inverse because the caller supplied it in r9.  The four registers
; pushed here are popped by the shared epilogue at L_ret.
;-----------------------------------------------------------------------
        GLOBAL_FUNC mpn_preinv_divrem_1

        push    r13                     ; push order must mirror the pops at L_ret
        push    r12
        push    rbp
        push    rbx
        xor     eax, eax                ; running remainder starts at 0

        mov     rbx, rcx                ; rbx = nn (integer limbs still to divide)
        mov     r12, rsi                ; r12 = fn (fraction limbs to produce)
        add     rcx, rsi                ; rcx = nn + fn = quotient limb count
        lea     rdi, [rdi+rcx*8-8]      ; rdi -> highest quotient limb
        mov     rsi, rdx                ; rsi = np

        test    r8, r8
        js      L_nent                  ; top bit of d set: normalized-divisor loop

        ; Divisor not normalized: fetch the shift count and pre-shift d.
        mov     cl, byte [rsp+40]       ; 4 pushes (32) + return address (8) = 40
        shl     r8, cl
        jmp     L_uent
|
|
|
|
align 16
|
|
;-----------------------------------------------------------------------
; mp_limb_t mpn_divrem_euclidean_qr_1 (mp_ptr qp, mp_size_t fn,
;                                      mp_srcptr np, mp_size_t nn,
;                                      mp_limb_t d)
;
; In:    rdi = qp, rsi = fn, rdx = np, rcx = nn, r8 = d
; Out:   rax = remainder; nn + fn quotient limbs are stored through rdi,
;        starting at qp[nn+fn-1] and moving towards qp[0]
; Saved: rbx, rbp, r12, r13 (pushed on entry, popped at L_ret)
; Uses:  rcx, rdx, rsi, rdi, r8-r11 and the flags as scratch
;
; Register roles once the prologue has run:
;   rax  running remainder (kept shifted left by cl in the unnormalized
;        path and shifted back just before L_ret)
;   rbx  count of integer limbs of np still to be consumed
;   r12  count of fraction limbs still to be produced
;   rsi  np                      rdi  quotient store pointer (descending)
;   r8   divisor (shifted left by cl in the unnormalized path)
;   r9   inverse of the divisor as computed by the `div` below
;   cl   normalization shift count
;   r13  quotient limb being formed; rbp, r10, r11 temporaries
;
; mpn_preinv_divrem_1 jumps into this routine at L_nent and L_uent, so
; those labels, and the register state they expect, must be preserved.
; L_nent lives inside the SPECIAL_CODE_FOR_NORMALIZED_DIVISOR section.
;-----------------------------------------------------------------------
        GLOBAL_FUNC mpn_divrem_euclidean_qr_1
        xor     eax, eax                ; remainder = 0 (returned as is when nn+fn == 0)
        push    r13
        push    r12
        push    rbp
        push    rbx

        mov     r12, rsi                ; r12 = fn
        mov     rbx, rcx                ; rbx = nn
        add     rcx, rsi                ; rcx = nn + fn; ZF set when it is zero
        mov     rsi, rdx                ; rsi = np (mov leaves the flags of the add intact)
        je      L_ret                   ; nothing to divide

        lea     rdi, [rdi+rcx*8-8]      ; rdi -> highest quotient limb
        xor     ebp, ebp                ; rbp = 0: initial remainder / zero quotient limb

%ifdef SPECIAL_CODE_FOR_NORMALIZED_DIVISOR

        test    r8, r8
        jns     L_unnormalized          ; top bit of d clear

; ---- divisor already has its top bit set ------------------------------
L_normalized:
        test    rbx, rbx
        je      L_8                     ; no integer limbs, only fraction limbs
        mov     rbp, [rsi+rbx*8-8]      ; rbp = most significant dividend limb
        dec     rbx
        mov     rax, rbp
        sub     rbp, r8                 ; CF set when the limb is smaller than d
        cmovb   rbp, rax                ; on borrow keep the limb unchanged as remainder
        sbb     eax, eax                ; eax = -CF (cmov did not touch the flags)
        inc     eax                     ; rax = 1 when d was subtracted, else 0
        mov     [rdi], rax              ; store the top quotient limb
        lea     rdi, [rdi-8]
L_8:
        mov     rdx, r8
        mov     rax, -1
        not     rdx                     ; rdx:rax = (~d):(all ones)
        div     r8                      ; rax = quotient of that two-limb value by d
        mov     r9, rax                 ; r9 = inverse used by the loops below
        mov     rax, rbp                ; rax = running remainder
        jmp     L_nent

        align   16
L_nloop:
        mov     r10, [rsi+rbx*8]        ; r10 = next dividend limb
        lea     rbp, [rax+1]            ; rbp = remainder + 1
        mul     r9                      ; rdx:rax = remainder * inverse
        add     rax, r10
        adc     rdx, rbp                ; rdx:rax += (remainder + 1):limb
        mov     rbp, rax                ; rbp = low product limb
        mov     r13, rdx                ; r13 = candidate quotient limb
        imul    rdx, r8
        sub     r10, rdx                ; r10 = limb - candidate * d (mod 2^64)
        mov     rax, r8
        add     rax, r10                ; rax = r10 + d
        cmp     r10, rbp
        cmovb   rax, r10                ; r10 < low limb: take r10 as the remainder
        adc     r13, -1                 ; otherwise (CF = 0) the candidate drops by one
        cmp     rax, r8
        jae     L_nfx                   ; remainder still >= d: one more correction
L_nok:
        mov     [rdi], r13
        sub     rdi, 8
L_nent:
        dec     rbx
        jns     L_nloop

        xor     ecx, ecx                ; no normalization shift to undo at the end
        jmp     L_87

L_nfx:
        sub     rax, r8
        inc     r13
        jmp     L_nok

%endif

; ---- divisor needs normalizing -----------------------------------------
L_unnormalized:
        test    rbx, rbx
        je      L_44                    ; no integer limbs
        mov     rax, [rsi+rbx*8-8]      ; rax = most significant dividend limb
        cmp     rax, r8
        jae     L_44                    ; limb >= d: let the loop handle it
        mov     [rdi], rbp              ; limb < d: top quotient limb is 0 (rbp = 0)
        mov     rbp, rax                ; and the limb itself is the remainder so far
        lea     rdi, [rdi-8]
        ; The original tested the stale flags of `cmp rax, r8` here.  On this
        ; fall-through path CF = 1, hence ZF = 0, so the `je` could never be
        ; taken and the early exit was dead.  rcx still holds nn + fn, so
        ; compare it explicitly: when that one limb was the whole job, rax
        ; already holds the remainder.
        cmp     rcx, 1
        je      L_ret
        dec     rbx
L_44:
        bsr     rcx, r8                 ; index of the highest set bit of d
        not     ecx                     ; cl & 63 = 63 - index = leading zero count
        sal     r8, cl                  ; normalize d
        sal     rbp, cl                 ; keep the remainder in the same scale
        mov     rdx, r8
        mov     rax, -1
        not     rdx                     ; rdx:rax = (~d):(all ones)
        div     r8                      ; rax = inverse of the normalized d
        test    rbx, rbx                ; flags survive the two movs below
        mov     r9, rax
        mov     rax, rbp
        je      L_87                    ; no integer limbs left: fraction part only
L_uent:
        mov     rbp, [rsi+rbx*8-8]      ; rbp = highest remaining dividend limb
        shr     rax, cl                 ; drop the scaling of the remainder ...
        shld    rax, rbp, cl            ; ... and shift in the top cl bits of rbp
        sub     rbx, 2
        js      L_ulast                 ; that was the only limb

        align   16
L_uloop:
        nop
        mov     r10, [rsi+rbx*8]        ; r10 = next lower dividend limb
        lea     r11, [rax+1]            ; r11 = remainder + 1
        shld    rbp, r10, cl            ; rbp = current limb of the shifted dividend
        mul     r9                      ; rdx:rax = remainder * inverse
        add     rax, rbp
        adc     rdx, r11                ; rdx:rax += (remainder + 1):limb
        mov     r11, rax                ; r11 = low product limb
        mov     r13, rdx                ; r13 = candidate quotient limb
        imul    rdx, r8
        sub     rbp, rdx                ; rbp = limb - candidate * d (mod 2^64)
        mov     rax, r8
        add     rax, rbp                ; rax = rbp + d
        cmp     rbp, r11
        cmovb   rax, rbp                ; rbp < low limb: take rbp as the remainder
        adc     r13, -1                 ; otherwise the candidate drops by one
        cmp     rax, r8
        jae     L_ufx                   ; remainder still >= d: one more correction
L_uok:
        mov     [rdi], r13
        sub     rdi, 8
        dec     rbx
        mov     rbp, r10                ; mov keeps the flags of the dec for the jns
        jns     L_uloop
L_ulast:
        ; Same step for the lowest integer limb: zeros are shifted in below it.
        lea     r11, [rax+1]
        sal     rbp, cl
        mul     r9
        add     rax, rbp
        adc     rdx, r11
        mov     r11, rax
        mov     r13, rdx
        imul    rdx, r8
        sub     rbp, rdx
        mov     rax, r8
        add     rax, rbp
        cmp     rbp, r11
        cmovb   rax, rbp
        adc     r13, -1
        cmp     rax, r8
        jae     L_93
L_69:
        mov     [rdi], r13
        sub     rdi, 8
        jmp     L_87

L_ufx:
        sub     rax, r8
        inc     r13
        jmp     L_uok

L_93:
        sub     rax, r8
        inc     r13
        jmp     L_69

; ---- fraction limbs: the dividend limbs are all zero --------------------
L_87:
        mov     rbp, r8
        neg     rbp                     ; rbp = -d, so imul below yields 0 - candidate * d
        jmp     L_87b

        align   16
L_floop:
        lea     r11, [rax+1]            ; r11 = remainder + 1
        mul     r9                      ; rdx:rax = remainder * inverse
        add     rdx, r11                ; limb is zero: only the high half is adjusted
        mov     r11, rax                ; r11 = low product limb
        mov     r13, rdx                ; r13 = candidate quotient limb
        imul    rdx, rbp                ; rdx = -(candidate * d) (mod 2^64)
        mov     rax, r8
        add     rax, rdx                ; rax = rdx + d
        cmp     rdx, r11
        cmovb   rax, rdx                ; rdx < low limb: take rdx as the remainder
        adc     r13, -1                 ; otherwise the candidate drops by one
        mov     [rdi], r13
        sub     rdi, 8
L_87b:
        dec     r12
        jns     L_floop

        shr     rax, cl                 ; undo the normalization shift on the remainder
L_ret:
        pop     rbx
        pop     rbp
        pop     r12
        pop     r13
        ret
|
|
|
|
end
|
|
|