From 211e597c89700754ab1910f6067cf0990b8cfe75 Mon Sep 17 00:00:00 2001
From: jasonmoxham
Date: Sun, 23 Aug 2009 15:58:03 +0000
Subject: [PATCH] add new function core2/penryn mpn_addlsh_n

---
 mpn/x86_64/core2/addlsh_n.asm | 143 ++++++++++++++++++++++++++++++++++
 1 file changed, 143 insertions(+)
 create mode 100644 mpn/x86_64/core2/addlsh_n.asm

diff --git a/mpn/x86_64/core2/addlsh_n.asm b/mpn/x86_64/core2/addlsh_n.asm
new file mode 100644
index 00000000..a25ad60b
--- /dev/null
+++ b/mpn/x86_64/core2/addlsh_n.asm
@@ -0,0 +1,143 @@
+dnl mpn_addlsh_n
+
+dnl Copyright 2009 Jason Moxham
+
+dnl This file is part of the MPIR Library.
+
+dnl The MPIR Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The MPIR Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the MPIR Library; see the file COPYING.LIB. If not, write
+dnl to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+dnl Boston, MA 02110-1301, USA.
+
+include(`../config.m4')
+
+C ret mpn_addlsh_n(mp_ptr,mp_ptr,mp_ptr,mp_size_t,shift)
+C rax rdi, rsi, rdx, rcx r8
+
+ASM_START()
+PROLOGUE(mpn_addlsh_n)
+lea -32(%rdi,%rcx,8),%rdi
+lea -32(%rsi,%rcx,8),%rsi
+lea -32(%rdx,%rcx,8),%rdx
+push %r12
+push %rbx
+mov $4,%rbx
+sub %rcx,%rbx
+mov $64,%rcx
+sub %r8,%rcx
+mov $0,%r12
+mov $0,%rax
+mov (%rdx,%rbx,8),%r8
+cmp $0,%rbx
+jge skiplp
+ALIGN(16)
+lp:
+	mov 8(%rdx,%rbx,8),%r9
+	mov 16(%rdx,%rbx,8),%r10
+	mov 24(%rdx,%rbx,8),%r11
+	shrd %cl,%r8,%r12
+	shrd %cl,%r9,%r8
+	shrd %cl,%r10,%r9
+	shrd %cl,%r11,%r10
+	sahf
+	adc (%rsi,%rbx,8),%r12
+	mov %r12,(%rdi,%rbx,8)
+	adc 8(%rsi,%rbx,8),%r8
+	mov %r11,%r12
+	mov %r8,8(%rdi,%rbx,8)
+	adc 16(%rsi,%rbx,8),%r9
+	adc 24(%rsi,%rbx,8),%r10
+	mov %r10,24(%rdi,%rbx,8)
+	mov %r9,16(%rdi,%rbx,8)
+	lahf
+	mov 32(%rdx,%rbx,8),%r8
+	add $4,%rbx
+	jnc lp
+ALIGN(16)
+skiplp:
+cmp $2,%rbx
+ja case0
+je case1
+jp case2
+case3:
+	shrd %cl,%r8,%r12
+	mov 8(%rdx,%rbx,8),%r9
+	mov 16(%rdx,%rbx,8),%r10
+	mov 24(%rdx,%rbx,8),%r11
+	shrd %cl,%r9,%r8
+	shrd %cl,%r10,%r9
+	shrd %cl,%r11,%r10
+	sahf
+	adc (%rsi,%rbx,8),%r12
+	mov %r12,(%rdi,%rbx,8)
+	adc 8(%rsi,%rbx,8),%r8
+	mov %r11,%r12
+	mov %r8,8(%rdi,%rbx,8)
+	adc 16(%rsi,%rbx,8),%r9
+	adc 24(%rsi,%rbx,8),%r10
+	mov %r10,24(%rdi,%rbx,8)
+	mov %r9,16(%rdi,%rbx,8)
+	lahf
+	shr %cl,%r12
+	sahf
+	adc $0,%r12
+	mov %r12,%rax
+	pop %rbx
+	pop %r12
+	ret
+case2:
+	shrd %cl,%r8,%r12
+	mov 8(%rdx,%rbx,8),%r9
+	shrd %cl,%r9,%r8
+	mov 16(%rdx,%rbx,8),%r10
+	shrd %cl,%r10,%r9
+	shr %cl,%r10
+	sahf
+	adc (%rsi,%rbx,8),%r12
+	mov %r12,(%rdi,%rbx,8)
+	adc 8(%rsi,%rbx,8),%r8
+	mov $0,%rax
+	mov %r8,8(%rdi,%rbx,8)
+	adc 16(%rsi,%rbx,8),%r9
+	adc %r10,%rax
+	mov %r9,16(%rdi,%rbx,8)
+	pop %rbx
+	pop %r12
+	ret
+case1:
+	shrd %cl,%r8,%r12
+	mov 8(%rdx,%rbx,8),%r9
+	shrd %cl,%r9,%r8
+	shr %cl,%r9
+	sahf
+	adc (%rsi,%rbx,8),%r12
+	mov %r12,(%rdi,%rbx,8)
+	adc 8(%rsi,%rbx,8),%r8
+	mov $0,%rax
+	mov %r8,8(%rdi,%rbx,8)
+	adc %r9,%rax
+	pop %rbx
+	pop %r12
+	ret
+case0:
+	shrd %cl,%r8,%r12
+	shr %cl,%r8
+	sahf
+	adc (%rsi,%rbx,8),%r12
+	mov %r12,(%rdi,%rbx,8)
+	adc $0,%r8
+	mov %r8,%rax
+	pop %rbx
+	pop %r12
+	ret
+EPILOGUE()
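
For reference, the routine above computes {rdi,n} = {rsi,n} + ({rdx,n} << shift) and
returns the bits shifted out of the top limb plus the final add carry. The plain-C model
below is only an illustrative sketch, not part of the patch; the name ref_addlsh_n and
the use of uint64_t as the limb type are assumptions, and it requires 1 <= shift <= 63,
as the assembly does.

#include <stdint.h>
#include <stddef.h>

/* Sketch of the behaviour of mpn_addlsh_n: r[] = u[] + (v[] << shift).
   Returns the bits shifted out of v[n-1] plus the carry of the addition. */
static uint64_t ref_addlsh_n(uint64_t *r, const uint64_t *u,
                             const uint64_t *v, size_t n, unsigned shift)
{
    uint64_t carry = 0;   /* running carry of the addition        */
    uint64_t bits  = 0;   /* bits shifted up from the limb below  */

    for (size_t i = 0; i < n; i++) {
        uint64_t shifted = (v[i] << shift) | bits;
        bits = v[i] >> (64 - shift);          /* needs 1 <= shift <= 63       */

        uint64_t sum = u[i] + shifted;
        uint64_t c1  = (sum < shifted);       /* carry out of u[i] + shifted  */
        r[i] = sum + carry;
        carry = c1 + (r[i] < sum);            /* carry out of the whole limb  */
    }
    return bits + carry;                      /* bits < 2^63, so this cannot wrap */
}

The assembly runs the same recurrence unrolled four limbs per iteration. Because shrd
clobbers the carry flag, the running carry is parked in ah with lahf after each block
of adc instructions and restored with sahf once the next block of shifts has executed.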