; Copyright 2001 Free Software Foundation, Inc.
;
; This file is part of the GNU MP Library.
;
; The GNU MP Library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public License as
; published by the Free Software Foundation; either version 2.1 of the
; License, or (at your option) any later version.
;
; The GNU MP Library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with the GNU MP Library; see the file COPYING.LIB.  If
; not, write to the Free Software Foundation, Inc., 59 Temple Place -
; Suite 330, Boston, MA 02111-1307, USA.
;
; Translation of AT&T syntax code by Brian Gladman

%include "..\..\x86i.inc"

%define PARAM_SHIFT esp+frame+16
%define PARAM_SIZE  esp+frame+12
%define PARAM_SRC   esp+frame+8
%define PARAM_DST   esp+frame+4
%define frame       8

; minimum 5, because the unrolled loop can't handle less
%define UNROLL_THRESHOLD 5

        section .text

        global  ___gmpn_lshift
%ifdef  DLL
        export  ___gmpn_lshift
%endif

        align   8

___gmpn_lshift:
        push    ebx
        push    edi

        mov     eax,[PARAM_SIZE]
        mov     edx,[PARAM_DST]

        mov     ebx,[PARAM_SRC]
        mov     ecx,[PARAM_SHIFT]

        cmp     eax,UNROLL_THRESHOLD
        jae     Lunroll

        mov     edi,[-4+ebx+eax*4]      ; src high limb
        dec     eax
        jnz     Lsimple

        shld    eax,edi,cl
        shl     edi,cl

        mov     [edx],edi               ; dst low limb
        pop     edi                     ; risk of data cache bank clash

        pop     ebx
        ret

; eax   size-1
; ebx   src
; ecx   shift
; edx   dst
; esi
; edi
; ebp

Lsimple:
        movd    mm5,[ebx+eax*4]         ; src high limb
        movd    mm6,ecx                 ; lshift
        neg     ecx

        psllq   mm5,mm6
        add     ecx,32

        movd    mm7,ecx
        psrlq   mm5,32                  ; retval

; eax   counter, limbs, negative
; ebx   src
; ecx
; edx   dst
; esi
; edi
;
; mm0   scratch
; mm5   return value
; mm6   shift
; mm7   32-shift

Lsimple_top:
        movq    mm0,[ebx+eax*4-4]
        dec     eax

        psrlq   mm0,mm7

        movd    [4+edx+eax*4],mm0
        jnz     Lsimple_top

        movd    mm0,[ebx]
        movd    eax,mm5

        psllq   mm0,mm6

        pop     edi
        pop     ebx

        movd    [edx],mm0
        emms
        ret

; eax   size
; ebx   src
; ecx   shift
; edx   dst
; esi
; edi
; ebp

        align   8

Lunroll:
        movd    mm5,[ebx+eax*4-4]       ; src high limb
        lea     edi,[ebx+eax*4]

        movd    mm6,ecx                 ; lshift
        and     edi,4

        psllq   mm5,mm6
        jz      Lstart_src_aligned

; src isn't aligned, process high limb separately (marked xxx) to
; make it so.
;
;  source         -8(ebx,%eax,4)
;                  |
;  +-------+-------+-------+--
;  |               |
;  +-------+-------+-------+--
;        0mod8   4mod8   0mod8
;
;  dest
;     -4(edx,%eax,4)
;          |
;  +-------+-------+--
;  |  xxx  |       |
;  +-------+-------+--

        movq    mm0,[ebx+eax*4-8]       ; unaligned load

        psllq   mm0,mm6
        dec     eax

        psrlq   mm0,32

        movd    [edx+eax*4],mm0

Lstart_src_aligned:
        movq    mm1,[ebx+eax*4-8]       ; src high qword
        lea     edi,[edx+eax*4]

        and     edi,4
        psrlq   mm5,32                  ; return value

        movq    mm3,[ebx+eax*4-16]      ; src second highest qword
        jz      Lstart_dst_aligned

; dst isn't aligned, subtract 4 to make it so, and pretend the shift
; is 32 bits extra.  High limb of dst (marked xxx) handled here
; separately.
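;
; (Illustrative example, not from the original sources, with an assumed
; shift count of 5: the movd below stores the high dst limb
; low32((mm1 << 5) >> 32) = src[size-1]<<5 | src[size-2]>>27, and the
; rest of the routine then proceeds as if the shift were 5+32 = 37
; into the realigned dst.)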
;
;  source          -8(ebx,%eax,4)
;                        |
;  +-------+-------+--
;  |      mm1      |
;  +-------+-------+--
;        0mod8   4mod8
;
;  dest
;     -4(edx,%eax,4)
;          |
;  +-------+-------+-------+--
;  |  xxx  |               |
;  +-------+-------+-------+--
;        0mod8   4mod8   0mod8

        movq    mm0,mm1
        add     ecx,32                  ; new shift

        psllq   mm0,mm6

        movd    mm6,ecx
        psrlq   mm0,32                  ; wasted cycle here waiting for %mm0

        movd    [-4+edx+eax*4],mm0
        sub     edx,4

Lstart_dst_aligned:
        psllq   mm1,mm6
        neg     ecx                     ; -shift

        add     ecx,64                  ; 64-shift
        movq    mm2,mm3

        movd    mm7,ecx
        sub     eax,8                   ; size-8

        psrlq   mm3,mm7

        por     mm3,mm1                 ; mm3 ready to store
        jc      Lfinish

; The comments in mpn_rshift apply here too.

; eax   counter, limbs
; ebx   src
; ecx
; edx   dst
; esi
; edi
;
; mm0
; mm1
; mm2   src qword from 16(%ebx,%eax,4)
; mm3   dst qword ready to store to 24(%edx,%eax,4)
;
; mm5   return value
; mm6   lshift
; mm7   rshift

        align   8

Lunroll_loop:
        movq    mm0,[ebx+eax*4+8]
        psllq   mm2,mm6

        movq    mm1,mm0
        psrlq   mm0,mm7

        movq    [24+edx+eax*4],mm3
        por     mm0,mm2

        movq    mm3,[ebx+eax*4]
        psllq   mm1,mm6

        movq    [16+edx+eax*4],mm0
        movq    mm2,mm3

        psrlq   mm3,mm7
        sub     eax,4

        por     mm3,mm1
        jnc     Lunroll_loop

Lfinish:
        ; eax   -4 to -1 representing respectively 0 to 3 limbs remaining

        test    al,2
        jz      Lfinish_no_two

        movq    mm0,[ebx+eax*4+8]
        psllq   mm2,mm6

        movq    mm1,mm0
        psrlq   mm0,mm7

        movq    [24+edx+eax*4],mm3      ; prev
        por     mm0,mm2

        movq    mm2,mm1
        movq    mm3,mm0

        sub     eax,2

Lfinish_no_two:
        ; eax   -4 or -3 representing respectively 0 or 1 limbs remaining
        ;
        ; mm2   src prev qword, from 16(%ebx,%eax,4)
        ; mm3   dst qword, for 24(%edx,%eax,4)

        test    al,1
        movd    eax,mm5                 ; retval

        pop     edi
        jz      Lfinish_zero

; One extra src limb, destination was aligned.
;
;                 source                  ebx
;                 --+---------------+-------+
;                   |      mm2      |       |
;                 --+---------------+-------+
;
; dest         edx+12           edx+4     edx
; --+---------------+---------------+-------+
;   |      mm3      |               |       |
; --+---------------+---------------+-------+
;
; mm6 = shift
; mm7 = ecx = 64-shift
;
; One extra src limb, destination was unaligned.
;
;                 source                  ebx
;                 --+---------------+-------+
;                   |      mm2      |       |
;                 --+---------------+-------+
;
;         dest         edx+12           edx+4
;         --+---------------+---------------+
;           |      mm3      |               |
;         --+---------------+---------------+
;
; mm6 = shift+32
; mm7 = ecx = 64-(shift+32)
;
; In both cases there's one extra limb of src to fetch and combine
; with mm2 to make a qword at 4(%edx), and in the aligned case
; there's an extra limb of dst to be formed from that extra src limb
; left shifted.

        movd    mm0,[ebx]
        psllq   mm2,mm6

        movq    [12+edx],mm3
        psllq   mm0,32

        movq    mm1,mm0
        psrlq   mm0,mm7

        por     mm0,mm2
        psllq   mm1,mm6

        movq    [4+edx],mm0
        psrlq   mm1,32

        and     ecx,32
        pop     ebx

        jz      Lfinish_one_unaligned

        movd    [edx],mm1

Lfinish_one_unaligned:
        emms
        ret

Lfinish_zero:

; No extra src limbs, destination was aligned.
;
;                 source          ebx
;                 --+---------------+
;                   |      mm2      |
;                 --+---------------+
;
; dest          edx+8             edx
; --+---------------+---------------+
;   |      mm3      |               |
; --+---------------+---------------+
;
; mm6 = shift
; mm7 = ecx = 64-shift
;
; No extra src limbs, destination was unaligned.
;
;                 source          ebx
;                 --+---------------+
;                   |      mm2      |
;                 --+---------------+
;
;         dest      edx+8       edx+4
;         --+---------------+-------+
;           |      mm3      |       |
;         --+---------------+-------+
;
; mm6 = shift+32
; mm7 = ecx = 64-(shift+32)
;
; The movd for the unaligned case writes the same data to 4(%edx)
; that the movq does for the aligned case.

        movq    [8+edx],mm3
        and     ecx,32

        psllq   mm2,mm6
        jz      Lfinish_zero_unaligned

        movq    [edx],mm2

Lfinish_zero_unaligned:
        psrlq   mm2,32
        pop     ebx

        movd    eax,mm5                 ; retval
        movd    [4+edx],mm2

        emms
        ret

        end
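
; For reference, an illustrative C model of what ___gmpn_lshift
; computes (a sketch, not part of the GMP sources, kept in comments so
; the file still assembles; 'lshift_ref' and its parameter names are
; hypothetical, and 32-bit limbs with 1 <= shift <= 31 are assumed):
;
;   /* Shift {src,size} left by 'shift' bits, writing {dst,size};
;      return the bits shifted out of the top limb.  Works high to
;      low, matching the store order of the code above.  */
;   unsigned int
;   lshift_ref (unsigned int *dst, const unsigned int *src,
;               long size, unsigned shift)
;   {
;     unsigned int retval = src[size - 1] >> (32 - shift);
;     long i;
;     for (i = size - 1; i > 0; i--)
;       dst[i] = (src[i] << shift) | (src[i - 1] >> (32 - shift));
;     dst[0] = src[0] << shift;
;     return retval;
;   }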