diff --git a/mpn/x86_64/divrem_euclidean_qr_1.as b/mpn/x86_64/divrem_euclidean_qr_1.as
new file mode 100644
index 00000000..be854422
--- /dev/null
+++ b/mpn/x86_64/divrem_euclidean_qr_1.as
@@ -0,0 +1,273 @@
+; x86-64 mpn_divrem_euclidean_qr_1 -- mpn by limb division.
+
+; Copyright 2004, 2005, 2007, 2008, 2009 Free Software Foundation, Inc.
+
+; Copyright 2010 Brian Gladman
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published
+; by the Free Software Foundation; either version 3 of the License, or (at
+; your option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+; mp_limb_t
+; mpn_divrem_euclidean_qr_1 (mp_ptr qp, mp_size_t fn,
+;                            mp_srcptr np, mp_size_t nn, mp_limb_t d)
+
+; mp_limb_t
+; mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn,
+;                      mp_srcptr np, mp_size_t nn, mp_limb_t d,
+;                      mp_limb_t dinv, int shift)
+
+;            norm   unorm   frac
+; k8         13     13      12
+; netburst   44.2   44.2    42.3
+; core2      25     24.5    19.3
+; nehalem    21.5   20.7    18
+; atom       42     52      37
+
+; INPUT PARAMETERS
+; qp     %rdi
+; fn     %rsi
+; np     %rdx
+; nn     %rcx
+; d      %r8
+; dinv   %r9               only for mpn_preinv_divrem_1
+; shift  passed on stack   only for mpn_preinv_divrem_1
+
+%include 'yasm_mac.inc'
+
+%define SPECIAL_CODE_FOR_NORMALIZED_DIVISOR
+
+        TEXT
+        align   16
+GLOBAL_FUNC mpn_preinv_divrem_1
+
+        xor     eax, eax
+        push    r13
+        push    r12
+        push    rbp
+        push    rbx
+
+        mov     r12, rsi
+        mov     rbx, rcx
+        add     rcx, rsi
+        mov     rsi, rdx
+
+        lea     rdi, [rdi+rcx*8-8]
+
+        test    r8, r8
+        js      L_nent
+        mov     cl, [rsp+40]
+        shl     r8, cl
+        jmp     L_uent
+
+        align   16
+GLOBAL_FUNC mpn_divrem_euclidean_qr_1
+        xor     eax, eax
+        push    r13
+        push    r12
+        push    rbp
+        push    rbx
+
+        mov     r12, rsi
+        mov     rbx, rcx
+        add     rcx, rsi
+        mov     rsi, rdx
+        je      L_ret
+
+        lea     rdi, [rdi+rcx*8-8]
+        xor     ebp, ebp
+
+%ifdef SPECIAL_CODE_FOR_NORMALIZED_DIVISOR
+
+        test    r8, r8
+        jns     L_unnormalized
+
+L_normalized:
+        test    rbx, rbx
+        je      L_8
+        mov     rbp, [rsi+rbx*8-8]
+        dec     rbx
+        mov     rax, rbp
+        sub     rbp, r8
+        cmovb   rbp, rax
+        sbb     eax, eax
+        inc     eax
+        mov     [rdi], rax
+        lea     rdi, [rdi-8]
+L_8:
+        mov     rdx, r8
+        mov     rax, -1
+        not     rdx
+        div     r8
+        mov     r9, rax
+        mov     rax, rbp
+        jmp     L_nent
+
+        align   16
+L_nloop:
+        mov     r10, [rsi+rbx*8]
+        lea     rbp, [rax+1]
+        mul     r9
+        add     rax, r10
+        adc     rdx, rbp
+        mov     rbp, rax
+        mov     r13, rdx
+        imul    rdx, r8
+        sub     r10, rdx
+        mov     rax, r8
+        add     rax, r10
+        cmp     r10, rbp
+        cmovb   rax, r10
+        adc     r13, -1
+        cmp     rax, r8
+        jae     L_nfx
+L_nok:
+        mov     [rdi], r13
+        sub     rdi, 8
+L_nent:
+        dec     rbx
+        jns     L_nloop
+
+        xor     ecx, ecx
+        jmp     L_87
+
+L_nfx:
+        sub     rax, r8
+        inc     r13
+        jmp     L_nok
+
+%endif
+
+L_unnormalized:
+        test    rbx, rbx
+        je      L_44
+        mov     rax, [rsi+rbx*8-8]
+        cmp     rax, r8
+        jae     L_44
+        mov     [rdi], rbp
+        mov     rbp, rax
+        lea     rdi, [rdi-8]
+        je      L_ret
+        dec     rbx
+L_44:
+        bsr     rcx, r8
+        not     ecx
+        sal     r8, cl
+        sal     rbp, cl
+        mov     rdx, r8
+        mov     rax, -1
+        not     rdx
+        div     r8
+        test    rbx, rbx
+        mov     r9, rax
+        mov     rax, rbp
+        je      L_87
+L_uent:
+        mov     rbp, [rsi+rbx*8-8]
+        shr     rax, cl
+        shld    rax, rbp, cl
+        sub     rbx, 2
+        js      L_ulast
+
+        align   16
+L_uloop:
+        nop
+        mov     r10, [rsi+rbx*8]
+        lea     r11, [rax+1]
+        shld    rbp, r10, cl
+        mul     r9
+        add     rax, rbp
+        adc     rdx, r11
+        mov     r11, rax
+        mov     r13, rdx
+        imul    rdx, r8
+        sub     rbp, rdx
+        mov     rax, r8
+        add     rax, rbp
+        cmp     rbp, r11
+        cmovb   rax, rbp
+        adc     r13, -1
+        cmp     rax, r8
+        jae     L_ufx
+L_uok:
+        mov     [rdi], r13
+        sub     rdi, 8
+        dec     rbx
+        mov     rbp, r10
+        jns     L_uloop
+L_ulast:
+        lea     r11, [rax+1]
+        sal     rbp, cl
+        mul     r9
+        add     rax, rbp
+        adc     rdx, r11
+        mov     r11, rax
+        mov     r13, rdx
+        imul    rdx, r8
+        sub     rbp, rdx
+        mov     rax, r8
+        add     rax, rbp
+        cmp     rbp, r11
+        cmovb   rax, rbp
+        adc     r13, -1
+        cmp     rax, r8
+        jae     L_93
+L_69:
+        mov     [rdi], r13
+        sub     rdi, 8
+        jmp     L_87
+
+L_ufx:
+        sub     rax, r8
+        inc     r13
+        jmp     L_uok
+
+L_93:
+        sub     rax, r8
+        inc     r13
+        jmp     L_69
+
+L_87:
+        mov     rbp, r8
+        neg     rbp
+        jmp     L_87b
+
+        align   16
+L_floop:
+        lea     r11, [rax+1]
+        mul     r9
+        add     rdx, r11
+        mov     r11, rax
+        mov     r13, rdx
+        imul    rdx, rbp
+        mov     rax, r8
+        add     rax, rdx
+        cmp     rdx, r11
+        cmovb   rax, rdx
+        adc     r13, -1
+        mov     [rdi], r13
+        sub     rdi, 8
+L_87b:
+        dec     r12
+        jns     L_floop
+
+        shr     rax, cl
+L_ret:
+        pop     rbx
+        pop     rbp
+        pop     r12
+        pop     r13
+        ret
+
+        end
+
diff --git a/mpn/x86_64/divrem_euclidean_qr_1.asm b/mpn/x86_64/divrem_euclidean_qr_1.asm
deleted file mode 100644
index 9bc99824..00000000
--- a/mpn/x86_64/divrem_euclidean_qr_1.asm
+++ /dev/null
@@ -1,281 +0,0 @@
-dnl  x86-64 mpn_divrem_euclidean_qr_1 -- mpn by limb division.
-
-dnl  Copyright 2004, 2005, 2007, 2008, 2009 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of the GNU Lesser General Public License as published
-dnl  by the Free Software Foundation; either version 3 of the License, or (at
-dnl  your option) any later version.
-
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
-dnl  License for more details.
-
-dnl  You should have received a copy of the GNU Lesser General Public License
-dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C             norm   unorm   frac
-C K8          13     13      12
-C P4          44.2   44.2    42.3
-C P6 core2    25     24.5    19.3
-C P6 corei7   21.5   20.7    18
-C P6 atom     42     52      37
-
-C TODO
-C  * Compute the inverse without relying on the div instruction.
-C    Newton's method and mulq, or perhaps the faster fdiv.
-C  * Tune prologue.
-C  * Optimize for Core 2.
-
-C The code for unnormalized divisors works also for normalized divisors, but
-C for some reason it runs really slowly (on K8) for that case.  Use special
-C code until we can address this.  The Intel Atom is also affected, but
-C understandably (shld slowness).
-
-define(`SPECIAL_CODE_FOR_NORMALIZED_DIVISOR',1)
-
-C mp_limb_t
-C mpn_divrem_euclidean_qr_1 (mp_ptr qp, mp_size_t fn,
-C                            mp_srcptr np, mp_size_t nn, mp_limb_t d)
-
-C mp_limb_t
-C mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn,
-C                      mp_srcptr np, mp_size_t nn, mp_limb_t d,
-C                      mp_limb_t dinv, int cnt)
-
-C INPUT PARAMETERS
-define(`qp',        `%rdi')
-define(`fn_param',  `%rsi')
-define(`up_param',  `%rdx')
-define(`un_param',  `%rcx')
-define(`d',         `%r8')
-define(`dinv',      `%r9')    C only for mpn_preinv_divrem_1
-C shift passed on stack       C only for mpn_preinv_divrem_1
-
-define(`cnt', `%rcx')
-define(`cnt8', `%cl')
-define(`up', `%rsi')
-define(`fn', `%r12')
-define(`un', `%rbx')
-
-
-C rax rbx rcx rdx rsi rdi rbp r8  r9  r10 r11 r12 r13 r14 r15
-C         cnt         qp      d   dinv
-
-ASM_START()
-        TEXT
-        ALIGN(16)
-PROLOGUE(mpn_preinv_divrem_1)
-        xor     %eax, %eax
-        push    %r13
-        push    %r12
-        push    %rbp
-        push    %rbx
-
-        mov     fn_param, fn
-        mov     un_param, un
-        add     fn_param, un_param
-        mov     up_param, up
-
-        lea     -8(qp,un_param,8), qp
-
-        test    d, d
-        js      L(nent)
-        mov     40(%rsp), cnt8
-        shl     cnt8, d
-        jmp     L(uent)
-EPILOGUE()
-
-        ALIGN(16)
-PROLOGUE(mpn_divrem_euclidean_qr_1)
-        xor     %eax, %eax
-        push    %r13
-        push    %r12
-        push    %rbp
-        push    %rbx
-
-        mov     fn_param, fn
-        mov     un_param, un
-        add     fn_param, un_param
-        mov     up_param, up
-        je      L(ret)
-
-        lea     -8(qp,un_param,8), qp
-        xor     %ebp, %ebp
-
-
-ifdef(`SPECIAL_CODE_FOR_NORMALIZED_DIVISOR',`
-        test    d, d
-        jns     L(unnormalized)
-
-L(normalized):
-        test    un, un
-        je      L(8)                    C un == 0
-        mov     -8(up,un,8), %rbp
-        dec     un
-        mov     %rbp, %rax
-        sub     d, %rbp
-        cmovb   %rax, %rbp
-        sbb     %eax, %eax
-        inc     %eax
-        mov     %rax, (qp)
-        lea     -8(qp), qp
-L(8):
-        mov     d, %rdx
-        mov     $-1, %rax
-        not     %rdx
-        div     d                       C FREE rax rdx rcx r9 r10 r11
-        mov     %rax, dinv
-        mov     %rbp, %rax
-        jmp     L(nent)
-
-        ALIGN(16)
-L(nloop):                               C       cycK8  cycP6  cycP4
-        mov     (up,un,8), %r10         C
-        lea     1(%rax), %rbp           C
-        mul     dinv                    C       0,13   0,19   0,45
-        add     %r10, %rax              C       4      8      12
-        adc     %rbp, %rdx              C       5      9      13
-        mov     %rax, %rbp              C       5      9      13
-        mov     %rdx, %r13              C       6      11     23
-        imul    d, %rdx                 C       6      11     23
-        sub     %rdx, %r10              C       10     16     33
-        mov     d, %rax                 C
-        add     %r10, %rax              C       11     17     34
-        cmp     %rbp, %r10              C       11     17     34
-        cmovb   %r10, %rax              C       12     18     35
-        adc     $-1, %r13               C
-        cmp     d, %rax                 C
-        jae     L(nfx)                  C
-L(nok): mov     %r13, (qp)              C
-        sub     $8, qp                  C
-L(nent):dec     un                      C
-        jns     L(nloop)                C
-
-        xor     %ecx, %ecx
-        jmp     L(87)
-
-L(nfx): sub     d, %rax
-        inc     %r13
-        jmp     L(nok)
-')
-
-L(unnormalized):
-        test    un, un
-        je      L(44)
-        mov     -8(up,un,8), %rax
-        cmp     d, %rax
-        jae     L(44)
-        mov     %rbp, (qp)
-        mov     %rax, %rbp
-        lea     -8(qp), qp
-        je      L(ret)
-        dec     un
-L(44):
-        bsr     d, %rcx
-        not     %ecx
-        sal     %cl, d
-        sal     %cl, %rbp
-        mov     d, %rdx
-        mov     $-1, %rax
-        not     %rdx
-        div     d                       C FREE rax rdx r9 r10 r11
-        test    un, un
-        mov     %rax, dinv
-        mov     %rbp, %rax
-        je      L(87)
-L(uent):
-        mov     -8(up,un,8), %rbp
-        shr     %cl, %rax
-        shld    %cl, %rbp, %rax
-        sub     $2, un
-        js      L(ulast)
-
-        ALIGN(16)
-L(uloop):
-        nop
-        mov     (up,un,8), %r10
-        lea     1(%rax), %r11
-        shld    %cl, %r10, %rbp
-        mul     dinv
-        add     %rbp, %rax
-        adc     %r11, %rdx
-        mov     %rax, %r11
-        mov     %rdx, %r13
-        imul    d, %rdx
-        sub     %rdx, %rbp
-        mov     d, %rax
-        add     %rbp, %rax
-        cmp     %r11, %rbp
-        cmovb   %rbp, %rax
-        adc     $-1, %r13
-        cmp     d, %rax
-        jae     L(ufx)
-L(uok): mov     %r13, (qp)
-        sub     $8, qp
-        dec     un
-        mov     %r10, %rbp
-        jns     L(uloop)
-L(ulast):
-        lea     1(%rax), %r11
-        sal     %cl, %rbp
-        mul     dinv
-        add     %rbp, %rax
-        adc     %r11, %rdx
-        mov     %rax, %r11
-        mov     %rdx, %r13
-        imul    d, %rdx
-        sub     %rdx, %rbp
-        mov     d, %rax
-        add     %rbp, %rax
-        cmp     %r11, %rbp
-        cmovb   %rbp, %rax
-        adc     $-1, %r13
-        cmp     d, %rax
-        jae     L(93)
-L(69):  mov     %r13, (qp)
-        sub     $8, qp
-        jmp     L(87)
-
-L(ufx): sub     d, %rax
-        inc     %r13
-        jmp     L(uok)
-
-L(93):  sub     d, %rax
-        inc     %r13
-        jmp     L(69)
-
-L(87):  mov     d, %rbp
-        neg     %rbp
-        jmp     L(87b)
-
-        ALIGN(16)
-L(floop):                               C       cycK8  cycP6  cycP4
-        lea     1(%rax), %r11           C
-        mul     dinv                    C       0,12
-        add     %r11, %rdx              C       5
-        mov     %rax, %r11              C       4
-        mov     %rdx, %r13              C       6
-        imul    %rbp, %rdx              C       6
-        mov     d, %rax                 C
-        add     %rdx, %rax              C       10
-        cmp     %r11, %rdx              C       10
-        cmovb   %rdx, %rax              C       11
-        adc     $-1, %r13               C
-        mov     %r13, (qp)              C
-        sub     $8, qp                  C
-L(87b): dec     fn                      C
-        jns     L(floop)                C
-
-        shr     %cl, %rax
-L(ret): pop     %rbx
-        pop     %rbp
-        pop     %r12
-        pop     %r13
-        ret
-EPILOGUE()
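
Note (not part of the diff): for readers who want the semantics rather than the register-level detail, the sketch below shows what mpn_divrem_euclidean_qr_1 computes: it divides the nn-limb operand {np, nn}, extended by fn zero "fraction" limbs, by the single limb d, writes the nn+fn quotient limbs to qp (least significant first), and returns the remainder. This is an illustrative schoolbook reference under assumed 64-bit limbs and the GCC/Clang __int128 extension, not the MPIR implementation and not its API; the assembly above instead multiplies by the precomputed reciprocal dinv and applies a correction step, so no per-limb div instruction is executed in the loops.

    /* divrem_qr_1_ref: hypothetical reference routine, names and types are
       assumptions for illustration only (B = 2^64). */
    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t limb_t;                 /* one 64-bit limb          */
    typedef unsigned __int128 dlimb_t;       /* double-limb scratch      */

    /* Divide {np, nn} * B^fn by d; write nn + fn quotient limbs to qp
       (least significant limb first) and return the remainder.          */
    static limb_t
    divrem_qr_1_ref(limb_t *qp, size_t fn,
                    const limb_t *np, size_t nn, limb_t d)
    {
        limb_t r = 0;
        size_t i;

        /* Integer part, most significant limb first. */
        for (i = nn; i-- > 0; ) {
            dlimb_t t = ((dlimb_t) r << 64) | np[i];
            qp[fn + i] = (limb_t) (t / d);
            r          = (limb_t) (t % d);
        }
        /* fn fraction limbs, developed below the integer quotient. */
        for (i = fn; i-- > 0; ) {
            dlimb_t t = (dlimb_t) r << 64;
            qp[i] = (limb_t) (t / d);
            r     = (limb_t) (t % d);
        }
        return r;                            /* 0 <= r < d */
    }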