add new function core2/penryn mpn_addlsh_n

This commit is contained in:
jasonmoxham 2009-08-23 15:58:03 +00:00
parent da44b12898
commit 211e597c89


@@ -0,0 +1,143 @@
dnl mpn_addlsh_n
dnl Copyright 2009 Jason Moxham
dnl This file is part of the MPIR Library.
dnl The MPIR Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 2.1 of the License, or (at
dnl your option) any later version.
dnl The MPIR Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the MPIR Library; see the file COPYING.LIB. If not, write
dnl to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
dnl Boston, MA 02110-1301, USA.
include(`../config.m4')
C ret  mpn_addlsh_n(mp_ptr,  mp_ptr,  mp_ptr,  mp_size_t,  shift)
C rax                 rdi,     rsi,     rdx,      rcx,       r8
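C (rdi,rcx) = (rsi,rcx) + ((rdx,rcx) << r8), returning the bits
C shifted out of the top limb plus the carry from the addition;
C requires 1 <= shift <= 63 so that cl = 64-shift is a valid
C shrd count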
ASM_START()
PROLOGUE(mpn_addlsh_n)
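C bias the operand pointers so that (reg,rbx,8) with the negative
C index rbx = 4-n addresses limb 0, reaching the last four limbs
C when rbx hits zero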
lea -32(%rdi,%rcx,8),%rdi
lea -32(%rsi,%rcx,8),%rsi
lea -32(%rdx,%rcx,8),%rdx
push %r12
push %rbx
mov $4,%rbx
sub %rcx,%rbx
mov $64,%rcx
sub %r8,%rcx
mov $0,%r12
mov $0,%rax
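C rbx = 4-n counts up towards zero, cl = 64-shift turns shrd into
C a left-shift merge, r12 holds the bits shifted up from the limb
C below, and AH (via rax) carries the addition carry between passes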
mov (%rdx,%rbx,8),%r8
cmp $0,%rbx
jge skiplp
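C main loop: four limbs per pass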
ALIGN(16)
lp:
mov 8(%rdx,%rbx,8),%r9
mov 16(%rdx,%rbx,8),%r10
mov 24(%rdx,%rbx,8),%r11
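C each output limb is (src[j] << shift) | (src[j-1] >> (64-shift)),
C src being the operand in rdx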
shrd %cl,%r8,%r12
shrd %cl,%r9,%r8
shrd %cl,%r10,%r9
shrd %cl,%r11,%r10
sahf
adc (%rsi,%rbx,8),%r12
mov %r12,(%rdi,%rbx,8)
adc 8(%rsi,%rbx,8),%r8
mov %r11,%r12
mov %r8,8(%rdi,%rbx,8)
adc 16(%rsi,%rbx,8),%r9
adc 24(%rsi,%rbx,8),%r10
mov %r10,24(%rdi,%rbx,8)
mov %r9,16(%rdi,%rbx,8)
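C park the carry in AH across the flag-clobbering index update;
C add $4 to the negative rbx sets CF exactly when rbx reaches zero
C or above, ending the loop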
lahf
mov 32(%rdx,%rbx,8),%r8
add $4,%rbx
jnc lp
ALIGN(16)
skiplp:
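C rbx is now 0..3 with 4-rbx limbs left: ja/je pick off rbx = 3/2,
C and jp splits rbx = 1 from rbx = 0 by the parity of the low byte
C of rbx-2 (0xff has even parity, 0xfe odd)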
cmp $2,%rbx
ja case0
je case1
jp case2
case3:
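C four limbs left (rbx = 0)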
shrd %cl,%r8,%r12
mov 8(%rdx,%rbx,8),%r9
mov 16(%rdx,%rbx,8),%r10
mov 24(%rdx,%rbx,8),%r11
shrd %cl,%r9,%r8
shrd %cl,%r10,%r9
shrd %cl,%r11,%r10
sahf
adc (%rsi,%rbx,8),%r12
mov %r12,(%rdi,%rbx,8)
adc 8(%rsi,%rbx,8),%r8
mov %r11,%r12
mov %r8,8(%rdi,%rbx,8)
adc 16(%rsi,%rbx,8),%r9
adc 24(%rsi,%rbx,8),%r10
mov %r10,24(%rdi,%rbx,8)
mov %r9,16(%rdi,%rbx,8)
lahf
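C return the top "shift" bits of the last limb plus the addition carry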
shr %cl,%r12
sahf
adc $0,%r12
mov %r12,%rax
pop %rbx
pop %r12
ret
case2:
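C three limbs left (rbx = 1)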
shrd %cl,%r8,%r12
mov 8(%rdx,%rbx,8),%r9
shrd %cl,%r9,%r8
mov 16(%rdx,%rbx,8),%r10
shrd %cl,%r10,%r9
shr %cl,%r10
sahf
adc (%rsi,%rbx,8),%r12
mov %r12,(%rdi,%rbx,8)
adc 8(%rsi,%rbx,8),%r8
mov $0,%rax
mov %r8,8(%rdi,%rbx,8)
adc 16(%rsi,%rbx,8),%r9
adc %r10,%rax
mov %r9,16(%rdi,%rbx,8)
pop %rbx
pop %r12
ret
case1:
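C two limbs left (rbx = 2)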
shrd %cl,%r8,%r12
mov 8(%rdx,%rbx,8),%r9
shrd %cl,%r9,%r8
shr %cl,%r9
sahf
adc (%rsi,%rbx,8),%r12
mov %r12,(%rdi,%rbx,8)
adc 8(%rsi,%rbx,8),%r8
mov $0,%rax
mov %r8,8(%rdi,%rbx,8)
adc %r9,%rax
pop %rbx
pop %r12
ret
case0:
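C one limb left (rbx = 3)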
shrd %cl,%r8,%r12
shr %cl,%r8
sahf
adc (%rsi,%rbx,8),%r12
mov %r12,(%rdi,%rbx,8)
adc $0,%r8
mov %r8,%rax
pop %rbx
pop %r12
ret
EPILOGUE()
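For reference, a minimal C sketch of what this routine computes, assuming
64-bit limbs and 1 <= shift <= 63. The name ref_addlsh_n and its signature
are illustrative only, not MPIR's internal API:

#include <stdint.h>
#include <stddef.h>

/* Illustrative reference only, not MPIR code:
   {rp,n} = {xp,n} + ({yp,n} << shift), returning the bits shifted
   out of the top limb plus the carry from the addition. */
uint64_t ref_addlsh_n(uint64_t *rp, const uint64_t *xp,
                      const uint64_t *yp, size_t n, unsigned shift)
{
    uint64_t carry = 0;  /* carry from the limb additions */
    uint64_t prev = 0;   /* bits shifted up from the limb below */
    for (size_t i = 0; i < n; i++) {
        uint64_t lsh = (yp[i] << shift) | prev;  /* the shrd step */
        prev = yp[i] >> (64 - shift);
        uint64_t sum = xp[i] + lsh;
        uint64_t c1 = sum < xp[i];               /* first overflow  */
        rp[i] = sum + carry;
        carry = c1 + (rp[i] < sum);              /* second overflow */
    }
    return prev + carry;  /* matches the shr/adc $0 in the tail cases */
}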