New asm functions for AMD divrem_hensel_qr_1 divrem_hensel_r_1

2009-09-29 23:52:09 +00:00 · 2009-09-29 23:52:09 +00:00 · 56801786a7
commit 56801786a7
parent 1601f69b05
2 changed files with 174 additions and 0 deletions
--- a/mpn/x86_64/k8/divrem_hensel_qr_1.asm
+++ b/mpn/x86_64/k8/divrem_hensel_qr_1.asm
@ -0,0 +1,88 @@
+dnl  X86_64 mpn_divrem_hensel_qr_1
+
+dnl  Copyright 2009 Jason Moxham
+
+dnl  This file is part of the MPIR Library.
+
+dnl  The MPIR Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The MPIR Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the MPIR Library; see the file COPYING.LIB.  If not, write
+dnl  to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+dnl  Boston, MA 02110-1301, USA.
+
+include(`../config.m4')
+
+C	(rdi,rdx)=(rsi,rdx) / rcx
+C	rax=hensel remainder from div 
+
+C	This is the divrem_hensel_1 code that can shift the quotient on output,
+C	here with no shifting.  On K8/K10 the shifting comes for free, so there
+C	is no need for a different function for that.  On K8/K10 this runs at
+C	10c/l, which is optimal.
+C	This function "replaces" divexact_1 and modexact_1_odd.
+
+ASM_START()
+PROLOGUE(mpn_divrem_hensel_qr_1)
+	mov	%rdx, %r9		#// r9 = limb count
+	neg	%r9			#// r9 = -count, the index climbs to zero
+	lea	(%rsi,%rdx,8), %rsi	#// rsi = end of the source limbs
+	lea	(%rdi,%rdx,8), %rdi	#// rdi = end of the quotient limbs, rdx is free now
+C	Inverse of the divisor d = rcx by Newton steps x <- x*(2 - d*x).
+	mov	%rcx, %rdx		#// seed x = d, a 3 bit inverse when d is odd
+C	Step 1, 32 bit multiplies.
+	mov	%rdx, %rax
+	imul	%ecx, %edx
+	mov	$2, %r11
+	sub	%rdx, %r11
+	imul	%eax, %r11d		#// r11 good to at least 4 bits
+C	Step 2, 32 bit multiplies.
+	mov	%r11, %rax
+	imul	%ecx, %r11d
+	mov	$2, %rdx
+	sub	%r11, %rdx
+	imul	%eax, %edx		#// rdx good to at least 8 bits
+C	Step 3, 32 bit multiplies.
+	mov	%rdx, %rax
+	imul	%ecx, %edx
+	mov	$2, %r11
+	sub	%rdx, %r11
+	imul	%eax, %r11d		#// r11 good to at least 16 bits
+C	Step 4, 32 bit multiplies.
+	mov	%r11, %rax
+	imul	%ecx, %r11d
+	mov	$2, %rdx
+	sub	%r11, %rdx
+	imul	%eax, %edx		#// rdx good to 32 bits
+C	Step 5, 64 bit multiplies.
+	mov	%rdx, %rax
+	imul	%rcx, %rdx
+	mov	$2, %r11
+	sub	%rdx, %r11
+	imul	%rax, %r11		#// r11 good to 64 bits
+C	Loop state: rdx = limb to subtract next, carry flag = borrow to subtract
+C	next, r8 = scratch that carries the borrow across the multiplies.
+	xor	%edx, %edx		#// rdx = 0 and carry flag clear
+ALIGN(16)
+.Lloop:
+	mov	(%rsi,%r9,8), %rax	#// next source limb
+	sbb	%rdx, %rax		#// minus previous high limb and pending borrow
+	sbb	%r8, %r8		#// r8 = -borrow (0 or all ones)
+	imul	%r11, %rax		#// quotient limb = rax * inverse
+	mov	%rax, (%rdi,%r9,8)	#// store the quotient limb
+	mul	%rcx			#// rdx = high limb of quotient limb * d
+	add	$1, %r8			#// carry flag = the saved borrow again
+	inc	%r9			#// inc leaves the carry flag untouched
+	jnz	.Lloop
+	mov	$0, %eax		#// rax = 0, mov keeps the carry flag
+	adc	%rdx, %rax		#// return last high limb + last borrow
+	ret
+EPILOGUE()
--- a/mpn/x86_64/k8/divrem_hensel_r_1.asm
+++ b/mpn/x86_64/k8/divrem_hensel_r_1.asm
@ -0,0 +1,86 @@
+dnl  X86_64 mpn_divrem_hensel_r_1
+
+dnl  Copyright 2009 Jason Moxham
+
+dnl  This file is part of the MPIR Library.
+
+dnl  The MPIR Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The MPIR Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the MPIR Library; see the file COPYING.LIB.  If not, write
+dnl  to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+dnl  Boston, MA 02110-1301, USA.
+
+include(`../config.m4')
+
+C	 hensel divide (rdi,rsi) / rdx
+C	rax=hensel remainder from div 
+
+C	This is the divrem_hensel_1 code that can shift the quotient on output,
+C	here with no shifting and with no quotient stored; only the remainder is
+C	returned.  On K8/K10 the shifting comes for free, so there is no need for
+C	a different function for that.  On K8/K10 this runs at 10c/l, which is
+C	optimal.  This function "replaces" divexact_1 and modexact_1_odd.
+
+ASM_START()
+PROLOGUE(mpn_divrem_hensel_r_1)
+	mov	%rsi, %r9		#// r9 = limb count
+	neg	%r9			#// r9 = -count, the index climbs to zero
+	lea	(%rdi,%rsi,8), %rdi	#// rdi = end of the source limbs, rsi is free now
+C	Inverse of the divisor d by Newton steps x <- x*(2 - d*x).
+	mov	%rdx, %rcx		#// rcx = d; rdx = seed x = d, a 3 bit inverse when d is odd
+C	Step 1, 32 bit multiplies.
+	mov	%rdx, %rax
+	imul	%ecx, %edx
+	mov	$2, %r11
+	sub	%rdx, %r11
+	imul	%eax, %r11d		#// r11 good to at least 4 bits
+C	Step 2, 32 bit multiplies.
+	mov	%r11, %rax
+	imul	%ecx, %r11d
+	mov	$2, %rdx
+	sub	%r11, %rdx
+	imul	%eax, %edx		#// rdx good to at least 8 bits
+C	Step 3, 32 bit multiplies.
+	mov	%rdx, %rax
+	imul	%ecx, %edx
+	mov	$2, %r11
+	sub	%rdx, %r11
+	imul	%eax, %r11d		#// r11 good to at least 16 bits
+C	Step 4, 32 bit multiplies.
+	mov	%r11, %rax
+	imul	%ecx, %r11d
+	mov	$2, %rdx
+	sub	%r11, %rdx
+	imul	%eax, %edx		#// rdx good to 32 bits
+C	Step 5, 64 bit multiplies.
+	mov	%rdx, %rax
+	imul	%rcx, %rdx
+	mov	$2, %r11
+	sub	%rdx, %r11
+	imul	%rax, %r11		#// r11 good to 64 bits
+C	Loop state: rdx = limb to subtract next, carry flag = borrow to subtract
+C	next, r8 = scratch that carries the borrow across the multiplies.
+	xor	%edx, %edx		#// rdx = 0 and carry flag clear
+ALIGN(16)
+.Lloop:
+	mov	(%rdi,%r9,8), %rax	#// next source limb
+	sbb	%rdx, %rax		#// minus previous high limb and pending borrow
+	sbb	%r8, %r8		#// r8 = -borrow (0 or all ones)
+	imul	%r11, %rax		#// quotient limb = rax * inverse, not stored
+	mul	%rcx			#// rdx = high limb of quotient limb * d
+	add	$1, %r8			#// carry flag = the saved borrow again
+	inc	%r9			#// inc leaves the carry flag untouched
+	jnz	.Lloop
+	mov	$0, %eax		#// rax = 0, mov keeps the carry flag
+	adc	%rdx, %rax		#// return last high limb + last borrow
+	ret
+EPILOGUE()