;  Copyright 2001 Free Software Foundation, Inc.
; 
;  This file is part of the GNU MP Library.
; 
;  The GNU MP Library is free software; you can redistribute it and/or
;  modify it under the terms of the GNU Lesser General Public License as
;  published by the Free Software Foundation; either version 2.1 of the
;  License, or (at your option) any later version.
; 
;  The GNU MP Library is distributed in the hope that it will be useful,
;  but WITHOUT ANY WARRANTY; without even the implied warranty of
;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;  Lesser General Public License for more details.
; 
;  You should have received a copy of the GNU Lesser General Public
;  License along with the GNU MP Library; see the file COPYING.LIB.  If
;  not, write to the Free Software Foundation, Inc., 59 Temple Place -
;  Suite 330, Boston, MA 02111-1307, USA.
;
; Translation of AT&T syntax code by Brian Gladman 

%include "..\..\x86i.inc" 

;  Stack-argument accessors. 
;
;  "frame" is the number of bytes the function prologue pushes (ebx and 
;  edi, 4 bytes each), so once both pushes are done [esp+frame] is the 
;  return address and the four arguments follow it at 4-byte intervals. 
;  These are single-line macros, expanded textually at each point of use. 
%define frame             8
%define PARAM_DST         esp+frame+4
%define PARAM_SRC         esp+frame+8
%define PARAM_SIZE        esp+frame+12
%define PARAM_SHIFT       esp+frame+16

;  Sizes at or above this value take the unrolled MMX path.  It must be 
;  at least 5, because the unrolled loop can't handle fewer limbs. 
%define UNROLL_THRESHOLD  5

	section .text
	
;-----------------------------------------------------------------------
;  ___gmpn_rshift 
;
;  Shift a vector of 32-bit limbs right by a bit count, writing the 
;  result to a destination vector and returning the bits shifted out. 
;
;  Stack arguments (read through the PARAM_* macros after the two pushes): 
;    PARAM_DST    destination pointer       -> edx 
;    PARAM_SRC    source pointer            -> ebx 
;    PARAM_SIZE   number of limbs           -> eax 
;    PARAM_SHIFT  shift count in bits       -> ecx 
;
;  Return: eax = the bits shifted out of src[0], placed at the top of 
;          the limb (i.e. the low dword of src[0]<<32 >> shift). 
;
;  Registers: ebx and edi are saved and restored; eax, ecx, edx are 
;  overwritten.  For size >= 2 the MMX registers mm0-mm3 and mm5-mm7 are 
;  used and emms is executed before returning. 
;
;  Three paths, chosen by size: 
;    size == 1                      plain shrd/shr, no MMX 
;    2 <= size < UNROLL_THRESHOLD   Lsimple, one limb per iteration 
;    size >= UNROLL_THRESHOLD       Lunroll, four limbs per iteration 
;
;  NOTE(review): presumably the C prototype is 
;    mp_limb_t mpn_rshift (dst, src, size, shift) 
;  with all arguments on the stack -- confirm against the GMP headers. 
;  NOTE(review): size == 0 is not handled (the decrement would wrap), and 
;  the code appears to assume 1 <= shift <= 31 (the "and ecx,32" 
;  alignment test and the 64-shift left count rely on it) -- confirm 
;  against callers. 
;-----------------------------------------------------------------------
	global	___gmpn_rshift
%ifdef	DLL
	export	___gmpn_rshift
%endif

	align   8
___gmpn_rshift:
    push    ebx
    push    edi             ;  two pushes = 8 bytes, the value of "frame" 
    mov     eax,[PARAM_SIZE]
    mov     edx,[PARAM_DST]
    mov     ebx,[PARAM_SRC]
    mov     ecx,[PARAM_SHIFT]
	cmp     eax,UNROLL_THRESHOLD
	jae     Lunroll
    dec     eax             ;  sets ZF when size == 1 
    mov     edi,[ebx]		;  src low limb (mov leaves the flags from dec intact) 
    jnz     Lsimple
	shrd	eax,edi,cl      ;  eax was decremented to zero; now holds the return value 
    shr     edi,cl
    mov     [edx],edi       ;  dst low limb 
    pop     edi             ;  risk of data cache bank clash 
    pop     ebx
    ret

;  Registers on entry to Lsimple: 
;
;  eax size-1 
;  ebx src 
;  ecx shift 
;  edx dst 
;  esi (not used) 
;  edi src low limb (loaded above, not used on this path) 
;  ebp (not used) 

	align   8
Lsimple: 
    movd    mm5,[ebx]       ;  src[0] 
    lea     ebx,[ebx+eax*4]  ;  &src[size-1] 
    movd    mm6,ecx         ;  rshift 
    lea     edx,[-4+edx+eax*4] ;  &dst[size-2] 
    psllq   mm5,32          ;  src[0] in the high dword, ready to form the return value 
    neg     eax             ;  -(size-1), counts up to zero 

;  This loop is 5 or 8 cycles,with every second load unaligned and a wasted 
;  cycle waiting for the mm0 result to be ready.  For comparison a shrd is 4 
;  cycles and would be 8 in a simple loop.  Using mmx helps the return value 
;  and last limb calculations too. 
;
;  Each iteration loads the qword src[i+1]:src[i], shifts it right and 
;  stores the low dword as dst[i].  The top limb is handled after the 
;  loop with a dword load, so no qword load runs past src[size-1]. 

;  eax counter,limbs,negative 
;  ebx &src[size-1] 
;  ecx shift (not used in the loop; the return value is carried in mm5) 
;  edx &dst[size-2] 
;
;  mm0 scratch 
;  mm5 return value 
;  mm6 shift 

Lsimple_top: 
    movq    mm0,[ebx+eax*4]
    inc     eax             ;  sets ZF for the jnz below; the mmx ops between leave flags alone 
    psrlq   mm0,mm6
    movd    [edx+eax*4],mm0
    jnz     Lsimple_top
    movd    mm0,[ebx]       ;  src[size-1], zero extended 
    psrlq   mm5,mm6         ;  return value 
    psrlq   mm0,mm6
    pop     edi
    movd    eax,mm5
    pop     ebx
    movd    [4+edx],mm0     ;  dst[size-1] 
    emms
    ret

;  Registers on entry to Lunroll: 
;
;  eax size 
;  ebx src 
;  ecx shift 
;  edx dst 
;  esi (not used) 
;  edi (set to 4 below, used as the alignment test mask) 
;  ebp (not used) 

	align   8
Lunroll: 
    movd    mm5,[ebx]       ;  src[0] 
    mov     edi,4
    movd    mm6,ecx         ;  rshift 
    test    ebx,edi         ;  is src at 4 mod 8 ? 
    psllq   mm5,32
    jz      Lstart_src_aligned

;  src isn't aligned,process low limb separately (marked xxx) and 
;  step src and dst by one limb,making src aligned. 
;
;  source                  ebx 
;  --+-------+-------+-------+ 
;            |          xxx  | 
;  --+-------+-------+-------+ 
;          4mod8   0mod8   4mod8 
;
;          dest            edx 
;          --+-------+-------+ 
;            |       |  xxx  |   
;          --+-------+-------+ 

    movq    mm0,[ebx]       ;  unaligned load 
    psrlq   mm0,mm6
    add     ebx,4
    dec     eax             ;  one limb fewer left to do 
    movd    [edx],mm0
    add     edx,4
Lstart_src_aligned: 
    movq    mm1,[ebx]
    test    edx,edi         ;  is dst at 4 mod 8 ? 
    psrlq   mm5,mm6         ;  retval 
    jz      Lstart_dst_aligned

;  dst isn't aligned,add 4 to make it so,and pretend the shift is 
;  32 bits extra.  Low limb of dst (marked xxx) handled here 
;  separately. 
;
;           source          ebx 
;           --+-------+-------+ 
;             |      mm1      | 
;           --+-------+-------+ 
;                   4mod8   0mod8 
;
;   dest                    edx 
;   --+-------+-------+-------+ 
;                     |  xxx  |         
;   --+-------+-------+-------+ 
;           4mod8   0mod8   4mod8 

    movq    mm0,mm1
    add     ecx,32         ;  new shift 
    psrlq   mm0,mm6        ;  still the original shift for this one limb 
    movd    mm6,ecx        ;  mm6 = shift+32 from here on 
    movd    [edx],mm0
    add     edx,4
Lstart_dst_aligned: 
    movq    mm3,[8+ebx]
    neg     ecx
    movq    mm2,mm3			;  mm2 src qword 
    add     ecx,64          ;  ecx = 64 - (effective shift) 
    movd    mm7,ecx         ;  lshift 
    psrlq   mm1,mm6
    lea     ebx,[-12+ebx+eax*4]  ;  bias src by the limb count, less 12 bytes 
    lea     edx,[-20+edx+eax*4]  ;  bias dst by the limb count, less 20 bytes 
    psllq   mm3,mm7
    sub     eax,7			;  size-7 
    por     mm3,mm1         ;  mm3 ready to store 
    neg     eax             ;  -(size-7) 
    jns     Lfinish         ;  too few limbs for even one pass of the loop 

;  This loop is the important bit,the rest is just support.  Careful 
;  instruction scheduling achieves the claimed 1.75 c/l.  The 
;  relevant parts of the pairing rules are: 
;
;  - mmx loads and stores execute only in the U pipe 
;  - only one mmx shift in a pair 
;  - wait one cycle before storing an mmx register result 
;  - the usual address generation interlock 
;
;  Two qword calculations are slightly interleaved.  The instructions 
;  commented "second qword" belong to the second qword,and the "prev" 
;  one is for the second qword from the previous iteration. 

;  eax counter,limbs,negative 
;  ebx src + 4*limbs - 12 bytes (see the lea above) 
;  ecx 64 - (effective shift), same value as mm7 
;  edx dst + 4*limbs - 20 bytes (see the lea above) 
;  esi (not used) 
;  edi 4 (no longer needed) 
;
;  mm0 scratch, first qword 
;  mm1 scratch, copy of first qword 
;  mm2 src qword from [ebx+eax*4-8] 
;  mm3 dst qword ready to store to [edx+eax*4-8] 
;
;  mm5 return value 
;  mm6 rshift 
;  mm7 lshift 

	align   8
Lunroll_loop: 
    movq    mm0,[ebx+eax*4]
    psrlq   mm2,mm6
    movq    mm1,mm0
    psllq   mm0,mm7
    movq    [-8+edx+eax*4],mm3  ;  prev 
    por     mm0,mm2

	movq    mm3,[ebx+eax*4+8]   ;  second qword 
	psrlq   mm1,mm6             ;  second qword 
    movq    [edx+eax*4],mm0
	movq    mm2,mm3             ;  second qword 
	psllq   mm3,mm7             ;  second qword 
    add     eax,4               ;  four limbs done; SF drives the js below 
	por     mm3,mm1             ;  second qword (por leaves the flags alone) 
    js      Lunroll_loop

Lfinish: 
;  eax 0 to 3 representing respectively 3 to 0 limbs remaining 

    test    al,2
    jnz     Lfinish_no_two

;  Two or three limbs remain: do one more qword, without preloading a 
;  following one. 

    movq    mm0,[ebx+eax*4]
    psrlq   mm2,mm6
    movq    mm1,mm0
    psllq   mm0,mm7
    movq    [-8+edx+eax*4],mm3  ;  prev 
    por     mm0,mm2
    movq    mm2,mm1
    movq    mm3,mm0
    add     eax,2
Lfinish_no_two: 

;  eax 2 or 3 representing respectively 1 or 0 limbs remaining 
;
;  mm2 src prev qword,from [ebx+eax*4-8] 
;  mm3 dst qword,for [edx+eax*4-8] 

    test    al,1
    pop     edi             ;  pop and movd leave the flags from test intact 
    movd    eax,mm5  ;  retval 
    jnz     Lfinish_zero

;  One extra limb,destination was aligned. 
;
;  source                ebx 
;  +-------+---------------+-- 
;  |       |      mm2      | 
;  +-------+---------------+-- 
;
;  dest                                  edx 
;  +-------+---------------+---------------+-- 
;  |       |               |      mm3      | 
;  +-------+---------------+---------------+-- 
;
;  mm6 = shift 
;  mm7 = ecx = 64-shift 

;  One extra limb,destination was unaligned. 
;
;  source                ebx 
;  +-------+---------------+-- 
;  |       |      mm2      | 
;  +-------+---------------+-- 
;
;  dest                          edx 
;  +---------------+---------------+-- 
;  |               |      mm3      | 
;  +---------------+---------------+-- 
;
;  mm6 = shift+32 
;  mm7 = ecx = 64-(shift+32) 

;  In both cases there's one extra limb of src to fetch and combine 
;  with mm2 to make a qword at [edx+8],and in the aligned case 
;  there's a further extra limb of dst to be formed. 


    movd    mm0,[8+ebx]     ;  last src limb, zero extended to a qword 
    psrlq   mm2,mm6
    movq    mm1,mm0
    psllq   mm0,mm7
    movq    [edx],mm3
    por     mm0,mm2
    psrlq   mm1,mm6
    and     ecx,32          ;  nonzero only when ecx = 64-shift, i.e. dst was aligned 
    pop     ebx             ;  pop leaves the flags from the and intact 
    jz      Lfinish_one_unaligned

    ;  dst was aligned,must store one extra limb 
    movd    [16+edx],mm1
Lfinish_one_unaligned: 

    movq    [8+edx],mm0
    emms
    ret
Lfinish_zero: 

;  No extra limbs,destination was aligned. 
;
;  source        ebx 
;  +---------------+-- 
;  |      mm2      | 
;  +---------------+-- 
;
;  dest                        edx+4 
;  +---------------+---------------+-- 
;  |               |      mm3      | 
;  +---------------+---------------+-- 
;
;  mm6 = shift 
;  mm7 = ecx = 64-shift 

;  No extra limbs,destination was unaligned. 
;
;  source        ebx 
;  +---------------+-- 
;  |      mm2      | 
;  +---------------+-- 
;
;  dest                edx+4 
;  +-------+---------------+-- 
;  |       |      mm3      | 
;  +-------+---------------+-- 
;
;  mm6 = shift+32 
;  mm7 = 64-(shift+32) 

;  The movd for the unaligned case is clearly the same data as the 
;  movq for the aligned case,it's just a choice between whether one 
;  or two limbs should be written. 

    movq    [4+edx],mm3
    psrlq   mm2,mm6
    movd    [12+edx],mm2    ;  one limb, always written 
    and     ecx,32          ;  nonzero only when ecx = 64-shift, i.e. dst was aligned 
    pop     ebx             ;  pop leaves the flags from the and intact 
    jz      Lfinish_zero_unaligned
    movq    [12+edx],mm2    ;  aligned: rewrite as a qword to add the top limb 
Lfinish_zero_unaligned: 
    emms
    ret

	end