;  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
;
;  Copyright 2008 Brian Gladman, William Hart
;
;  This file is part of the MPIR Library.
;
;  The MPIR Library is free software; you can redistribute it and/or
;  modify it under the terms of the GNU Lesser General Public License as
;  published by the Free Software Foundation; either version 2.1 of the
;  License, or (at your option) any later version.
;
;  The MPIR Library is distributed in the hope that it will be useful,
;  but WITHOUT ANY WARRANTY; without even the implied warranty of
;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;  Lesser General Public License for more details.
;
;  You should have received a copy of the GNU Lesser General Public
;  License along with the MPIR Library; see the file COPYING.LIB.  If
;  not, write to the Free Software Foundation, Inc., 51 Franklin Street,
;  Fifth Floor, Boston, MA 02110-1301, USA.
;
;  Adapted by Brian Gladman for AMD64 using the Microsoft VC++ v8 64-bit
;  compiler and the YASM assembler.

;  AMD64 mpn_rshift -- shift a vector of 64-bit limbs right by a bit count
;
;  Syntax: Intel operand order (YASM), MMX registers used for the shifts.
;  ABI:    arguments arrive in rdi, rsi, rdx, rcx (System V AMD64 order).
;
;  mp_limb_t mpn_rshift(
;      mp_ptr     dst,      rdi   destination limbs
;      mp_srcptr  src,      rsi   source limbs
;      mp_size_t  size,     rdx   number of limbs
;      unsigned   shift     rcx   shift count in bits (only ecx is read)
;  )
;
;  Result:   dst[i] = (src[i] >> shift) | (src[i+1] << (64 - shift))
;            for i < size-1, and dst[size-1] = src[size-1] >> shift.
;  Returns:  rax = src[0] << (64 - shift), i.e. the bits shifted out of
;            the lowest limb, placed at the top of the returned limb.
;  Modifies: rax, rdx, rsi, rdi, flags, mm0-mm3, mm6, mm7; emms is
;            executed before returning.
;  Notes:    src[0] is always loaded and dst[size-1] is always stored, so
;            size must be at least 1.  Limbs are processed from the lowest
;            address upwards, two per loop iteration.
;            NOTE(review): shift is presumably expected to be in 1..63 --
;            confirm against callers.

%include '../yasm_mac.inc'

;  Symbolic names for the argument registers.
%define src     rsi
%define dst     rdi
%define r_tmpd  ecx
%define s_len   rdx

    BITS 64

;  Register roles inside the function:
;    mm1       = shift          (count for the right shifts)
;    mm0       = 64 - shift     (count for the left shifts)
;    mm3 / mm2 = unshifted copy of the limb whose result is still pending
;    mm6 / mm7 = following limb, shifted left to supply the incoming bits
;    s_len     = negative limb index relative to the end of both vectors

GLOBAL_FUNC mpn_rshift
    ; Set up the two shift counts.
    mov     eax, 64
    sub     eax, r_tmpd              ; eax = 64 - shift
    movd    mm1, r_tmpd              ; mm1 = shift
    movd    mm0, eax                 ; mm0 = 64 - shift

    ; Compute the return value from the lowest limb.
    movq    mm3, [src]               ; mm3 = src[0], kept unshifted
    movq    mm7, mm3
    psllq   mm7, mm0                 ; bits that fall off the bottom
    movd    rax, mm7                 ; rax is not touched again before ret

    ; Address both vectors from their ends with a rising negative index.
    lea     src, [src+s_len*8]       ; src = one past the last source limb
    lea     dst, [dst+s_len*8]       ; dst = one past the last result limb
    neg     s_len
    add     s_len, 2                 ; s_len = 2 - size; sets flags for jg/je
    jg      rshift_last_mm3          ; fewer than two limbs: only the tail

    align   8
rshift_loop:
    ; First half: pending limb is in mm3, its successor goes to mm6/mm2.
    movq    mm6, [src+s_len*8-8]     ; fetch the successor limb
    psrlq   mm3, mm1                 ; low part of the result
    movq    mm2, mm6                 ; successor becomes the next pending limb
    psllq   mm6, mm0                 ; high part of the result
    por     mm3, mm6
    movq    [dst+s_len*8-16], mm3    ; write the finished limb
    ; Flags still come from the last "add s_len, 2": the MMX and move
    ; instructions above leave them alone.  Zero means mm2 is the top limb.
    je      rshift_last

    ; Second half: pending limb is in mm2, its successor goes to mm7/mm3.
    psrlq   mm2, mm1                 ; low part of the result
    movq    mm3, [src+s_len*8]       ; fetch the successor limb, keep a copy
    movq    mm7, mm3
    psllq   mm7, mm0                 ; high part of the result
    por     mm2, mm7
    movq    [dst+s_len*8-8], mm2     ; write the finished limb
    add     s_len, 2                 ; advance by the two limbs just done
    jle     rshift_loop              ; more limbs have a successor

rshift_last_mm3:
    movq    mm2, mm3                 ; top limb is pending in mm3
rshift_last:
    psrlq   mm2, mm1                 ; top limb has no successor bits
    movq    [dst-8], mm2             ; dst[size-1]
    emms                             ; leave MMX state before returning
    ret