From 9541395dae813911abc513e02c79856c9d1be529 Mon Sep 17 00:00:00 2001 From: wbhart Date: Fri, 30 May 2008 05:23:25 +0000 Subject: [PATCH] Unfortunately Yasm doesn't seem to export global symbols properly on Darwin. --- mpn/x86_64/core2/lshift.asm | 73 +++++++++++++++++++++++++++++++++++ mpn/x86_64/core2/rshift.asm | 76 +++++++++++++++++++++++++++++++++++++ 2 files changed, 149 insertions(+) create mode 100644 mpn/x86_64/core2/lshift.asm create mode 100644 mpn/x86_64/core2/rshift.asm diff --git a/mpn/x86_64/core2/lshift.asm b/mpn/x86_64/core2/lshift.asm new file mode 100644 index 00000000..1e79a061 --- /dev/null +++ b/mpn/x86_64/core2/lshift.asm @@ -0,0 +1,73 @@ +dnl AMD64 mpn_lshift + +dnl Copyright 2004, 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C AMD K8: 2.5 +C Intel P4: 4 +C Intel Core 2: 2.4 + + +C INPUT PARAMETERS +C rp rdi +C up rsi +C n rdx +C cnt rcx + +ASM_START() +PROLOGUE(mpn_lshift) + movq -8(%rsi,%rdx,8), %mm7 + movd %ecx, %mm1 + movl $64, %eax + subl %ecx, %eax + movd %eax, %mm0 + movq %mm7, %mm3 + psrlq %mm0, %mm7 + .byte 0x48, 0x0f, 0x7e, 0xf8 C movd %mm7, %rax + subq $2, %rdx + jl L(endo) + + ALIGN(4) C minimal alignment for claimed speed +L(loop): + movq (%rsi,%rdx,8), %mm6 + movq %mm6, %mm2 + psrlq %mm0, %mm6 + psllq %mm1, %mm3 + por %mm6, %mm3 + movq %mm3, 8(%rdi,%rdx,8) + je L(ende) + movq -8(%rsi,%rdx,8), %mm7 + movq %mm7, %mm3 + psrlq %mm0, %mm7 + psllq %mm1, %mm2 + por %mm7, %mm2 + movq %mm2, (%rdi,%rdx,8) + subq $2, %rdx + jge L(loop) + +L(endo): + movq %mm3, %mm2 +L(ende): + psllq %mm1, %mm2 + movq %mm2, (%rdi) + emms + ret +EPILOGUE() diff --git a/mpn/x86_64/core2/rshift.asm b/mpn/x86_64/core2/rshift.asm new file mode 100644 index 00000000..98b3321b --- /dev/null +++ b/mpn/x86_64/core2/rshift.asm @@ -0,0 +1,76 @@ +dnl AMD64 mpn_rshift + +dnl Copyright 2004, 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C AMD K8: 2.5 +C Intel P4: 4 +C Intel Core 2: 2.4 + + +C INPUT PARAMETERS +C rp rdi +C up rsi +C n rdx +C cnt rcx + +ASM_START() +PROLOGUE(mpn_rshift) + movq (%rsi), %mm7 + movd %ecx, %mm1 + movl $64, %eax + subl %ecx, %eax + movd %eax, %mm0 + movq %mm7, %mm3 + psllq %mm0, %mm7 + .byte 0x48, 0x0f, 0x7e, 0xf8 C movd %mm7, %rax + leaq (%rsi,%rdx,8), %rsi + leaq (%rdi,%rdx,8), %rdi + negq %rdx + addq $2, %rdx + jg L(endo) + + ALIGN(8) C minimal alignment for claimed speed +L(loop): + movq -8(%rsi,%rdx,8), %mm6 + movq %mm6, %mm2 + psllq %mm0, %mm6 + psrlq %mm1, %mm3 + por %mm6, %mm3 + movq %mm3, -16(%rdi,%rdx,8) + je L(ende) + movq (%rsi,%rdx,8), %mm7 + movq %mm7, %mm3 + psllq %mm0, %mm7 + psrlq %mm1, %mm2 + por %mm7, %mm2 + movq %mm2, -8(%rdi,%rdx,8) + addq $2, %rdx + jle L(loop) + +L(endo): + movq %mm3, %mm2 +L(ende): + psrlq %mm1, %mm2 + movq %mm2, -8(%rdi) + emms + ret +EPILOGUE()