dnl  core2 mpn_rshift

dnl  Copyright 2009 Jason Moxham

dnl  This file is part of the MPIR Library.

dnl  The MPIR Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 2.1 of the License, or (at
dnl  your option) any later version.

dnl  The MPIR Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the MPIR Library; see the file COPYING.LIB.  If not, write
dnl  to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
dnl  Boston, MA 02110-1301, USA.

include(`../config.m4')

C	(rdi,rdx) = (rsi,rdx) >> rcx
C	rax = carry

C  Decent assemblers understand what movq means, except Microsoft's/Apple's
C  masm (what a surprise there), so for that broken old masm assembler we
C  define MOVQ as movd.  This is needed only for movq reg64,mediareg and
C  movq mediareg,reg64, where mediareg is xmm or mm.
define(`MOVQ',`movd')

ASM_START()
PROLOGUE(mpn_rshift)
	cmp	$2,%rdx
	ja	threeormore
	jz	two

C  n = 1
one:
	mov	(%rsi),%rdx
	mov	%rdx,%rax
	shr	%cl,%rdx
	neg	%rcx
	shl	%cl,%rax
	mov	%rdx,(%rdi)
	ret

C  n = 2
two:
	mov	(%rsi),%r8
	mov	8(%rsi),%r9
	mov	%r8,%rax
	mov	%r9,%r11
	shr	%cl,%r8
	shr	%cl,%r9
	neg	%rcx
	shl	%cl,%r11
	shl	%cl,%rax
	or	%r11,%r8
	mov	%r8,(%rdi)
	mov	%r9,8(%rdi)
	ret

C  n >= 3 : shift with SSE2, keeping the 16-byte source loads aligned
threeormore:
	mov	$64,%eax
	lea	8(%rsi),%r9
	sub	%rcx,%rax
	and	$-16,%r9
	MOVQ	%rcx,%xmm0
	MOVQ	%rax,%xmm1
	movdqa	(%r9),%xmm5
	movdqa	%xmm5,%xmm3
	psllq	%xmm1,%xmm5
	MOVQ	%xmm5,%rax
	cmp	%r9,%rsi
	lea	-40(%rsi,%rdx,8),%rsi
	je	aligned
C  source is not 16-byte aligned : do the first limb on its own
	movq	-8(%r9),%xmm2
	movq	%xmm2,%xmm4
	psllq	%xmm1,%xmm2
	psrlq	%xmm0,%xmm4
	por	%xmm5,%xmm4
	movq	%xmm4,(%rdi)
	lea	8(%rdi),%rdi
	sub	$1,%rdx
	MOVQ	%xmm2,%rax

aligned:
	lea	-40(%rdi,%rdx,8),%rdi
	psrlq	%xmm0,%xmm3
	mov	$5,%r8d
	sub	%rdx,%r8
	jnc	skiploop

C  main loop : four limbs per iteration
	ALIGN(16)
loop:
	movdqa	16(%rsi,%r8,8),%xmm2
	movdqa	%xmm2,%xmm4
	psllq	%xmm1,%xmm2
	shufpd	$1,%xmm2,%xmm5
	por	%xmm5,%xmm3
	movq	%xmm3,(%rdi,%r8,8)
	movhpd	%xmm3,8(%rdi,%r8,8)
	psrlq	%xmm0,%xmm4
	movdqa	32(%rsi,%r8,8),%xmm5
	movdqa	%xmm5,%xmm3
	psllq	%xmm1,%xmm5
	shufpd	$1,%xmm5,%xmm2
	psrlq	%xmm0,%xmm3
	por	%xmm2,%xmm4
	movq	%xmm4,16(%rdi,%r8,8)
	movhpd	%xmm4,24(%rdi,%r8,8)
	add	$4,%r8
	jnc	loop

C  here r8 is 0, 1, 2 or 3 ; leftN still has N source limbs to load
skiploop:
	cmp	$2,%r8
	ja	left0
	jz	left1
	jp	left2
left3:
	movdqa	16(%rsi,%r8,8),%xmm2
	movdqa	%xmm2,%xmm4
	psllq	%xmm1,%xmm2
	shufpd	$1,%xmm2,%xmm5
	por	%xmm5,%xmm3
	movq	%xmm3,(%rdi,%r8,8)
	movhpd	%xmm3,8(%rdi,%r8,8)
	psrlq	%xmm0,%xmm4
	movq	32(%rsi,%r8,8),%xmm5
	movq	%xmm5,%xmm3
	psllq	%xmm1,%xmm5
	shufpd	$1,%xmm5,%xmm2
	psrlq	%xmm0,%xmm3
	por	%xmm2,%xmm4
	movq	%xmm4,16(%rdi,%r8,8)
	movhpd	%xmm4,24(%rdi,%r8,8)
	psrldq	$8,%xmm5
	por	%xmm5,%xmm3
	movq	%xmm3,32(%rdi,%r8,8)
	ret

	ALIGN(16)
left2:
	movdqa	16(%rsi,%r8,8),%xmm2
	movdqa	%xmm2,%xmm4
	psllq	%xmm1,%xmm2
	shufpd	$1,%xmm2,%xmm5
	por	%xmm5,%xmm3
	movq	%xmm3,(%rdi,%r8,8)
	movhpd	%xmm3,8(%rdi,%r8,8)
	psrlq	%xmm0,%xmm4
	psrldq	$8,%xmm2
	por	%xmm2,%xmm4
	movq	%xmm4,16(%rdi,%r8,8)
	movhpd	%xmm4,24(%rdi,%r8,8)
	ret

	ALIGN(16)
left1:
	movq	16(%rsi,%r8,8),%xmm2
	movq	%xmm2,%xmm4
	psllq	%xmm1,%xmm2
	shufpd	$1,%xmm2,%xmm5
	por	%xmm5,%xmm3
	movq	%xmm3,(%rdi,%r8,8)
	movhpd	%xmm3,8(%rdi,%r8,8)
	psrlq	%xmm0,%xmm4
	psrldq	$8,%xmm2
	por	%xmm2,%xmm4
	movq	%xmm4,16(%rdi,%r8,8)
	ret

	ALIGN(16)
left0:
	psrldq	$8,%xmm5
	por	%xmm5,%xmm3
	movq	%xmm3,(%rdi,%r8,8)
	movhpd	%xmm3,8(%rdi,%r8,8)
	ret
EPILOGUE()
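
C  Reference semantics, as a hedged C sketch for readers of this file
C  (comment only, not assembled; assumes 64-bit limbs and the usual
C  mpn_rshift contract of 1 <= cnt < 64 and n >= 1, with the bits shifted
C  out of the low limb returned in the high end of the result):
C
C      mp_limb_t mpn_rshift (mp_ptr rp, mp_srcptr up, mp_size_t n, unsigned int cnt)
C      {
C          mp_limb_t ret = up[0] << (64 - cnt);   /* carry, as in rax above */
C          for (mp_size_t i = 0; i + 1 < n; i++)
C              rp[i] = (up[i] >> cnt) | (up[i + 1] << (64 - cnt));
C          rp[n - 1] = up[n - 1] >> cnt;
C          return ret;
C      }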