mpir/mpn/x86_64/core2/rshift.asm

169 lines
3.3 KiB
NASM

dnl core2 mpn_rshift
dnl Copyright 2009 Jason Moxham
dnl This file is part of the MPIR Library.
dnl The MPIR Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 2.1 of the License, or (at
dnl your option) any later version.
dnl The MPIR Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the MPIR Library; see the file COPYING.LIB. If not, write
dnl to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
dnl Boston, MA 02110-1301, USA.
include(`../config.m4')
C (rdi,rdx)=(rsi,rdx)>>rcx
C rax=carry
ASM_START()
PROLOGUE(mpn_rshift)
# Version 1.0.4
cmp $2,%rdx
ja threeormore
jz two
one:
mov (%rsi),%rdx
mov %rdx,%rax
shr %cl,%rdx
neg %rcx
shl %cl,%rax
mov %rdx,(%rdi)
ret
two:
mov (%rsi),%r8
mov 8(%rsi),%r9
mov %r8,%rax
mov %r9,%r11
shr %cl,%r8
shr %cl,%r9
neg %rcx
shl %cl,%r11
shl %cl,%rax
or %r11,%r8
mov %r8,(%rdi)
mov %r9,8(%rdi)
ret
threeormore:
mov $64,%eax
lea 8(%rsi),%r9
sub %rcx,%rax
and $-16,%r9
movq %rcx,%xmm0
movq %rax,%xmm1
movdqa (%r9),%xmm5
movdqa %xmm5,%xmm3
psllq %xmm1,%xmm5
movq %xmm5,%rax
cmp %r9,%rsi
lea -40(%rsi,%rdx,8),%rsi
je aligned
movq -8(%r9),%xmm2
movq %xmm2,%xmm4
psllq %xmm1,%xmm2
psrlq %xmm0,%xmm4
por %xmm5,%xmm4
movq %xmm4,(%rdi)
lea 8(%rdi),%rdi
#dec %rdx
sub $1,%rdx
movq %xmm2,%rax
aligned:
lea -40(%rdi,%rdx,8),%rdi
psrlq %xmm0,%xmm3
mov $5,%r8d
sub %rdx,%r8
jnc skiploop
ALIGN(16)
loop:
movdqa 16(%rsi,%r8,8),%xmm2
movdqa %xmm2,%xmm4
psllq %xmm1,%xmm2
shufpd $1,%xmm2,%xmm5
por %xmm5,%xmm3
movq %xmm3,(%rdi,%r8,8)
movhpd %xmm3,8(%rdi,%r8,8)
psrlq %xmm0,%xmm4
movdqa 32(%rsi,%r8,8),%xmm5
movdqa %xmm5,%xmm3
psllq %xmm1,%xmm5
shufpd $1,%xmm5,%xmm2
psrlq %xmm0,%xmm3
por %xmm2,%xmm4
movq %xmm4,16(%rdi,%r8,8)
movhpd %xmm4,24(%rdi,%r8,8)
add $4,%r8
jnc loop
skiploop:
# have 3-r8 limbs left to do
cmp $2,%r8
ja left0
jz left1
jp left2
left3:
movdqa 16(%rsi,%r8,8),%xmm2
movdqa %xmm2,%xmm4
psllq %xmm1,%xmm2
shufpd $1,%xmm2,%xmm5
por %xmm5,%xmm3
movq %xmm3,(%rdi,%r8,8)
movhpd %xmm3,8(%rdi,%r8,8)
psrlq %xmm0,%xmm4
movq 32(%rsi,%r8,8),%xmm5
movq %xmm5,%xmm3
psllq %xmm1,%xmm5
shufpd $1,%xmm5,%xmm2
psrlq %xmm0,%xmm3
por %xmm2,%xmm4
movq %xmm4,16(%rdi,%r8,8)
movhpd %xmm4,24(%rdi,%r8,8)
psrldq $8,%xmm5
por %xmm5,%xmm3
movq %xmm3,32(%rdi,%r8,8)
ret
ALIGN(16)
left2:
movdqa 16(%rsi,%r8,8),%xmm2
movdqa %xmm2,%xmm4
psllq %xmm1,%xmm2
shufpd $1,%xmm2,%xmm5
por %xmm5,%xmm3
movq %xmm3,(%rdi,%r8,8)
movhpd %xmm3,8(%rdi,%r8,8)
psrlq %xmm0,%xmm4
psrldq $8,%xmm2
por %xmm2,%xmm4
movq %xmm4,16(%rdi,%r8,8)
movhpd %xmm4,24(%rdi,%r8,8)
ret
ALIGN(16)
left1:
movq 16(%rsi,%r8,8),%xmm2
movq %xmm2,%xmm4
psllq %xmm1,%xmm2
shufpd $1,%xmm2,%xmm5
por %xmm5,%xmm3
movq %xmm3,(%rdi,%r8,8)
movhpd %xmm3,8(%rdi,%r8,8)
psrlq %xmm0,%xmm4
psrldq $8,%xmm2
por %xmm2,%xmm4
movq %xmm4,16(%rdi,%r8,8)
ret
ALIGN(16)
left0:
psrldq $8,%xmm5
por %xmm5,%xmm3
movq %xmm3,(%rdi,%r8,8)
movhpd %xmm3,8(%rdi,%r8,8)
ret
EPILOGUE()