; mpir/mpn/x86i/pentium4/mmx/lshift.asm


;  Copyright 2001 Free Software Foundation, Inc.
; 
;  This file is part of the GNU MP Library.
; 
;  The GNU MP Library is free software; you can redistribute it and/or
;  modify it under the terms of the GNU Lesser General Public License as
;  published by the Free Software Foundation; either version 2.1 of the
;  License, or (at your option) any later version.
; 
;  The GNU MP Library is distributed in the hope that it will be useful,
;  but WITHOUT ANY WARRANTY; without even the implied warranty of
;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;  Lesser General Public License for more details.
; 
;  You should have received a copy of the GNU Lesser General Public
;  License along with the GNU MP Library; see the file COPYING.LIB.  If
;  not, write to the Free Software Foundation, Inc., 59 Temple Place -
;  Suite 330, Boston, MA 02111-1307, USA.
;
; Translation of AT&T syntax code by Brian Gladman 

%include "..\..\x86i.inc" 

; Stack offsets of the four arguments, written for use after the two
; register pushes at function entry.  `frame` is the number of bytes those
; pushes place on top of the return address, so [esp+frame] is the return
; address and the first argument is at [esp+frame+4].
; NOTE(review): this layout assumes all arguments are passed on the stack
; (32-bit cdecl style) -- confirm against the C prototype.
;
;   PARAM_DST    pointer to the destination limbs
;   PARAM_SRC    pointer to the source limbs
;   PARAM_SIZE   number of 32-bit limbs
;   PARAM_SHIFT  left shift count in bits
%define	PARAM_SHIFT esp+frame+16 
%define PARAM_SIZE  esp+frame+12 
%define PARAM_SRC   esp+frame+8 
%define PARAM_DST   esp+frame+4 
%define	frame		8 

; Sizes of UNROLL_THRESHOLD limbs or more take the unrolled MMX path
; (Lunroll); smaller sizes use the one-limb-at-a-time code.  The value must
; be at least 5 because the unrolled code cannot handle fewer limbs.
%define       UNROLL_THRESHOLD  5 

	section .text

	global	___gmpn_lshift
%ifdef	DLL
	export	___gmpn_lshift
%endif

	align   8
;-----------------------------------------------------------------------
; ___gmpn_lshift
;
; Shift the PARAM_SIZE-limb number at PARAM_SRC left by PARAM_SHIFT bits,
; store the result at PARAM_DST and return in eax the bits shifted out of
; the most significant limb.  Limbs are 32 bits, least significant first.
;
; Register use after the argument loads below:
;   eax = size (limbs)   ebx = src   ecx = shift   edx = dst
; ebx and edi are saved on entry and restored on every exit path; eax, ecx,
; edx and (on the MMX paths) mm0-mm3 and mm5-mm7 are overwritten.
;
; Dispatch:
;   size >= UNROLL_THRESHOLD   -> Lunroll (MMX, 64 bits per store)
;   size == 1                  -> handled inline here with shld/shl
;   otherwise                  -> Lsimple (MMX, one limb per iteration)
;
; NOTE(review): size == 0 is not handled (the high-limb load below would
; read src[-1] and the count would wrap); presumably callers guarantee
; size >= 1 -- confirm.
; NOTE(review): the tail code tests `and ecx,32` on 64-shift to tell the
; two dst alignment cases apart, which only works for 1 <= shift <= 31 --
; confirm that callers guarantee this range.
;-----------------------------------------------------------------------
___gmpn_lshift:
    push    ebx                 ; ebx and edi are used as scratch below
    push    edi
    mov     eax,[PARAM_SIZE]    ; eax = size in limbs
    mov     edx,[PARAM_DST]     ; edx = dst
    mov     ebx,[PARAM_SRC]     ; ebx = src
    mov     ecx,[PARAM_SHIFT]   ; ecx = shift (cl is the count for shld/shl)
	cmp     eax,UNROLL_THRESHOLD
    jae     Lunroll             ; unsigned compare: large sizes go to the unrolled code
    mov     edi,[-4+ebx+eax*4]  ; edi = src[size-1], the high limb
    dec     eax                 ; eax = size-1; ZF set only when size == 1
    jnz     Lsimple             ; 2 .. UNROLL_THRESHOLD-1 limbs: simple loop
; size == 1: eax is zero here, so shld leaves in eax exactly the bits
; shifted out of the top of edi, which is the return value.
	shld	eax,edi,cl
    shl     edi,cl
    mov     [edx],edi           ; dst low (and only) limb
    pop     edi                 ; risk of data cache bank clash
    pop     ebx
    ret

;  eax size-1 
;  ebx src 
;  ecx shift 
;  edx dst 
;  esi 
;  edi 
;  ebp 

Lsimple: 
; Simple path for 2 <= size < UNROLL_THRESHOLD, one dst limb per iteration.
; On entry: eax = size-1, ebx = src, ecx = shift, edx = dst.
; First compute the return value and the two shift counts.
    movd    mm5,[ebx+eax*4] ; mm5 = src high limb, zero-extended to 64 bits
    movd    mm6,ecx         ; mm6 = shift (left-shift count)
    neg     ecx
    psllq   mm5,mm6         ; bits 32 and up now hold what is shifted out of the high limb
    add     ecx,32          ; ecx = 32-shift
    movd    mm7,ecx         ; mm7 = 32-shift (right-shift count)
    psrlq   mm5,32          ; mm5 = return value

;  eax  index of the dst limb produced next; counts down to 0 
;  ebx  src 
;  ecx  (no longer needed) 
;  edx  dst 
;  esi  (unused) 
;  edi  (unused) 
; 
;  mm0 scratch 
;  mm5 return value 
;  mm6 shift 
;  mm7 32-shift 

Lsimple_top: 
    movq    mm0,[ebx+eax*4-4]   ; mm0 = src[eax]:src[eax-1] (high dword : low dword)
    dec     eax                 ; sets ZF for the jnz below; the MMX instructions in between leave EFLAGS alone
    psrlq   mm0,mm7             ; low dword = (src[i] << shift) | (src[i-1] >> (32-shift))
    movd    [4+edx+eax*4],mm0   ; store dst[i]; eax was already decremented, hence the +4
    jnz     Lsimple_top
; Only the low limb is left: dst[0] = low 32 bits of src[0] << shift.
    movd    mm0,[ebx]
    movd    eax,mm5             ; return value
    psllq   mm0,mm6
    pop     edi
    pop     ebx
    movd    [edx],mm0
    emms                        ; clear the MMX state before returning
    ret

;  eax size 
;  ebx src 
;  ecx shift 
;  edx dst 
;  esi 
;  edi 
;  ebp 

	align   8
Lunroll: 
; Unrolled path for size >= UNROLL_THRESHOLD.  On entry: eax = size,
; ebx = src, ecx = shift, edx = dst.  Work proceeds from the high limbs
; downwards.  This prologue
;   1. peels off the top dst limb if the end of src is at 4 mod 8,
;   2. peels off one more dst limb, and adds 32 to the shift, if the end
;      of dst is then at 4 mod 8,
;   3. builds the first dst qword in mm3 and leaves the matching unshifted
;      src qword in mm2,
; so that the code after it can work on whole qwords.
; NOTE(review): the 8-byte alignment reasoning assumes src and dst are at
; least 4-byte aligned -- confirm.
    movd    mm5,[ebx+eax*4-4]   ; mm5 = src high limb (for the return value)
    lea     edi,[ebx+eax*4]     ; edi = address just past the end of src
    movd    mm6,ecx             ; mm6 = lshift
    and     edi,4               ; ZF set when the end of src is 8-byte aligned
    psllq   mm5,mm6             ; MMX op: does not disturb ZF from the and above
    jz      Lstart_src_aligned

;  src isn't aligned, process the high limb separately (marked xxx) to 
;  make it so. 
; 
;   source     [ebx+eax*4-8] 
;                   | 
;   +-------+-------+-------+-- 
;   |               |           
;   +-------+-------+-------+-- 
;         0mod8   4mod8   0mod8 
; 
;   dest 
;      [edx+eax*4-4] 
;           | 
;   +-------+-------+-- 
;   |  xxx  |       |   
;   +-------+-------+-- 

    movq    mm0,[ebx+eax*4-8]   ; unaligned load of the top two src limbs
    psllq   mm0,mm6
    dec     eax                 ; one limb fewer remains; src+eax*4 is now 8-byte aligned
    psrlq   mm0,32              ; low dword = top dst limb
    movd    [edx+eax*4],mm0     ; eax already decremented, so this is the old [edx+eax*4-4]
Lstart_src_aligned: 
    movq    mm1,[ebx+eax*4-8]   ; src high qword
    lea     edi,[edx+eax*4]     ; edi = address just past the remaining dst limbs
    and     edi,4               ; ZF set when that address is 8-byte aligned
    psrlq   mm5,32              ; return value (MMX op, ZF preserved)
    movq    mm3,[ebx+eax*4-16]  ; src second highest qword (MMX load, ZF preserved)
    jz      Lstart_dst_aligned

;  dst isn't aligned, subtract 4 to make it so, and pretend the shift 
;  is 32 bits extra.  High limb of dst (marked xxx) handled here 
;  separately. 
; 
;   source     [ebx+eax*4-8] 
;                   | 
;   +-------+-------+-- 
;   |      mm1      |   
;   +-------+-------+-- 
;                 0mod8   4mod8 
; 
;   dest 
;      [edx+eax*4-4] 
;           | 
;   +-------+-------+-------+-- 
;   |  xxx  |               | 
;   +-------+-------+-------+-- 
;         0mod8   4mod8   0mod8 

    movq    mm0,mm1
    add     ecx,32         ;  new shift = shift+32 
    psllq   mm0,mm6        ;  still the original shift: mm6 is replaced on the next line 
    movd    mm6,ecx        ;  mm6 = shift+32 from here on 
    psrlq   mm0,32         ;  low dword = high dst limb 

;  wasted cycle here waiting for mm0 

    movd    [-4+edx+eax*4],mm0
    sub     edx,4          ;  bias dst down one limb so the qword stores below line up 
Lstart_dst_aligned: 

; From here on "shift" means the value in mm6, i.e. the caller's shift or
; shift+32 depending on the path taken above.
    psllq   mm1,mm6         ;  high src qword << shift 
    neg     ecx             ;  -shift 
    add     ecx,64          ;  64-shift 
    movq    mm2,mm3         ;  keep an unshifted copy of the second qword 
    movd    mm7,ecx         ;  mm7 = 64-shift (right-shift count) 
    sub     eax,8           ;  size-8; CF set when fewer than 8 limbs remain 
    psrlq   mm3,mm7
    por     mm3,mm1         ;  mm3 ready to store 
    jc      Lfinish         ;  CF still comes from the sub above 

;  The comments in mpn_rshift apply here too. 

;  eax counter,limbs 
;  ebx src 
;  ecx 
;  edx dst 
;  esi 
;  edi 
; 
;  mm0 
;  mm1 
;  mm2 src qword from [ebx+eax*4+16] 
;  mm3 dst qword ready to store to [edx+eax*4+24] 
; 
;  mm5 return value 
;  mm6 lshift 
;  mm7 rshift 

	align   8
Lunroll_loop: 
; Main loop: each pass loads two src qwords and stores two dst qwords
; (four limbs).  State at the top of each pass:
;   eax  limb counter, reduced by 4 per pass; the loop ends when that borrows
;   mm2  unshifted src qword from [ebx+eax*4+16]
;   mm3  finished dst qword, to be stored at [edx+eax*4+24]
;   mm6  left-shift count        mm7  right-shift count (64 - mm6)
; NOTE(review): the work for neighbouring qwords is interleaved, presumably
; as instruction scheduling for the target CPU -- keep the order when editing.
    movq    mm0,[ebx+eax*4+8]   ; next lower src qword
    psllq   mm2,mm6             ; upper src qword << lshift
    movq    mm1,mm0             ; unshifted copy, used for the second half below
    psrlq   mm0,mm7             ; bits of the lower qword that carry up
    movq    [24+edx+eax*4],mm3  ; store the dst qword completed earlier
    por     mm0,mm2             ; mm0 = next dst qword
    movq    mm3,[ebx+eax*4]     ; src qword below that
    psllq   mm1,mm6
    movq    [16+edx+eax*4],mm0
    movq    mm2,mm3             ; unshifted copy for the next pass / Lfinish
	psrlq   mm3,mm7
    sub     eax,4               ; four limbs done; CF (borrow) ends the loop
	por     mm3,mm1             ; mm3 = dst qword to store next; MMX op, CF preserved
    jnc     Lunroll_loop
Lfinish: 
;  eax is -4 to -1, representing respectively 0 to 3 src limbs still to be 
;  loaded.  mm3 holds a finished dst qword not yet stored and mm2 the 
;  unshifted src qword below it.  Bit 1 of al says whether a further whole 
;  qword (two limbs) remains, bit 0 whether a single odd limb remains. 

    test    al,2
    jz      Lfinish_no_two
; Two more limbs: same steps as the first half of a main-loop pass.
    movq    mm0,[ebx+eax*4+8]
    psllq   mm2,mm6
    movq    mm1,mm0
    psrlq   mm0,mm7
    movq    [24+edx+eax*4],mm3  ;  store the previously finished qword 
    por     mm0,mm2
    movq    mm2,mm1             ;  mm2 = new unshifted src qword 
    movq    mm3,mm0             ;  mm3 = new pending dst qword 
    sub     eax,2
Lfinish_no_two: 

;  eax -4 or -3 representing respectively 0 or 1 limbs remaining 
;  mm2 src prev qword, from [ebx+eax*4+16] 
;  mm3 dst qword, for [edx+eax*4+24] 

    test    al,1
    movd    eax,mm5  ;  retval; movd and pop leave ZF from the test intact 
    pop     edi
    jz      Lfinish_zero

;  One extra src limb,destination was aligned. 
;
;                  source                  ebx 
;                  --+---------------+-------+ 
;                    |      mm2      |       | 
;                  --+---------------+-------+ 
;
;  dest         edx+12           edx+4     edx 
;  --+---------------+---------------+-------+ 
;    |      mm3      |               |       | 
;  --+---------------+---------------+-------+ 
;
;  mm6 = shift 
;  mm7 = ecx = 64-shift 

;  One extra src limb,destination was unaligned. 
;
;                  source                  ebx 
;                  --+---------------+-------+ 
;                    |      mm2      |       | 
;                  --+---------------+-------+ 
;
;          dest         edx+12           edx+4 
;          --+---------------+---------------+ 
;            |      mm3      |               | 
;          --+---------------+---------------+ 
;
;  mm6 = shift+32 
;  mm7 = ecx = 64-(shift+32) 


;  In both cases there's one extra limb of src to fetch and combine 
;  with mm2 to make a qword at [edx+4], and in the aligned case 
;  there's an extra limb of dst to be formed from that extra src limb 
;  left shifted. 

    movd    mm0,[ebx]           ; the extra (lowest) src limb
    psllq   mm2,mm6
    movq    [12+edx],mm3        ; store the pending dst qword
    psllq   mm0,32              ; put the limb in the high dword, like the top half of a src qword
    movq    mm1,mm0
    psrlq   mm0,mm7
    por     mm0,mm2             ; dst qword for [edx+4]
    psllq   mm1,mm6
    movq    [4+edx],mm0
    psrlq   mm1,32              ; aligned case: low dword = low 32 bits of src[0] << shift
    and     ecx,32              ; ecx = 64 - mm6: bit 5 is set for the aligned case (shift 1..31), clear when 32 was added
    pop     ebx                 ; pop leaves ZF from the and intact
    jz      Lfinish_one_unaligned
    movd    [edx],mm1           ; aligned case only: lowest dst limb
Lfinish_one_unaligned: 
    emms                        ; clear the MMX state before returning
    ret
Lfinish_zero: 

;  No extra src limbs,destination was aligned. 
;
;                  source          ebx 
;                  --+---------------+ 
;                    |      mm2      | 
;                  --+---------------+ 
;
;  dest          edx+8             edx 
;  --+---------------+---------------+ 
;    |      mm3      |               | 
;  --+---------------+---------------+ 
;
;  mm6 = shift 
;  mm7 = ecx = 64-shift 

;  No extra src limbs,destination was unaligned. 
;
;                source            ebx 
;                  --+---------------+ 
;                    |      mm2      | 
;                  --+---------------+ 
;
;          dest          edx+8   edx+4 
;          --+---------------+-------+ 
;            |      mm3      |       | 
;          --+---------------+-------+ 
;
;  mm6 = shift+32 
;  mm7 = ecx = 64-(shift+32) 

;  The final movd to [edx+4] is executed in both cases.  In the unaligned 
;  case it stores the one remaining dst limb; in the aligned case it 
;  rewrites the same data the movq to [edx] has already put there. 

    movq    [8+edx],mm3         ; store the pending dst qword
    and     ecx,32              ; ecx = 64 - mm6: bit 5 is set for the aligned case (shift 1..31), clear when 32 was added
    psllq   mm2,mm6             ; lowest src qword << shift; MMX op, ZF preserved
    jz      Lfinish_zero_unaligned
    movq    [edx],mm2           ; aligned case only: lowest dst qword
Lfinish_zero_unaligned: 
    psrlq   mm2,32              ; low dword = upper half of the shifted qword
    pop     ebx
    movd    eax,mm5  ;  retval (eax already holds this value from Lfinish_no_two; the reload is redundant but harmless) 
    movd    [4+edx],mm2
    emms                        ; clear the MMX state before returning
    ret

	end
1. longlong.h change to add MSVC intrinsics 2. longlong.h rearrangement for Intel compiler 3. MSVC additions in test code 4. GMP 4.2.1 bug fixes 5. Intel format assembly code 2008-05-18 18:20:43 -04:00
			`; Copyright 2001 Free Software Foundation, Inc.`
			`;`
			`; This file is part of the GNU MP Library.`
			`;`
			`; The GNU MP Library is free software; you can redistribute it and/or`
			`; modify it under the terms of the GNU Lesser General Public License as`
			`; published by the Free Software Foundation; either version 2.1 of the`
			`; License, or (at your option) any later version.`
			`;`
			`; The GNU MP Library is distributed in the hope that it will be useful,`
			`; but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`; Lesser General Public License for more details.`
			`;`
			`; You should have received a copy of the GNU Lesser General Public`
			`; License along with the GNU MP Library; see the file COPYING.LIB. If`
			`; not, write to the Free Software Foundation, Inc., 59 Temple Place -`
			`; Suite 330, Boston, MA 02111-1307, USA.`
			`;`
			`; Translation of AT&T syntax code by Brian Gladman`

			`%include "..\..\x86i.inc"`

			`%define PARAM_SHIFT esp+frame+16`
			`%define PARAM_SIZE esp+frame+12`
			`%define PARAM_SRC esp+frame+8`
			`%define PARAM_DST esp+frame+4`
			`%define frame 8`

			`; minimum 5,because the unrolled loop can't handle less`
			`%define UNROLL_THRESHOLD 5`

			`section .text`

			`global ___gmpn_lshift`
			`%ifdef DLL`
			`export ___gmpn_lshift`
			`%endif`

			`align 8`
			`___gmpn_lshift:`
			`push ebx`
			`push edi`
			`mov eax,[PARAM_SIZE]`
			`mov edx,[PARAM_DST]`
			`mov ebx,[PARAM_SRC]`
			`mov ecx,[PARAM_SHIFT]`
			`cmp eax,UNROLL_THRESHOLD`
			`jae Lunroll`
			`mov edi,[-4+ebx+eax*4] ; src high limb`
			`dec eax`
			`jnz Lsimple`
			`shld eax,edi,cl`
			`shl edi,cl`
			`mov [edx],edi ; dst low limb`
			`pop edi ; risk of data cache bank clash`
			`pop ebx`
			`ret`

			`; eax size-1`
			`; ebx src`
			`; ecx shift`
			`; edx dst`
			`; esi`
			`; edi`
			`; ebp`

			`Lsimple:`
			`movd mm5,[ebx+eax*4] ; src high limb`
			`movd mm6,ecx ; lshift`
			`neg ecx`
			`psllq mm5,mm6`
			`add ecx,32`
			`movd mm7,ecx`
			`psrlq mm5,32 ; retval`

			`; eax counter,limbs,negative`
			`; ebx src`
			`; ecx`
			`; edx dst`
			`; esi`
			`; edi`
			`;`
			`; mm0 scratch`
			`; mm5 return value`
			`; mm6 shift`
			`; mm7 32-shift`

			`Lsimple_top:`
			`movq mm0,[ebx+eax*4-4]`
			`dec eax`
			`psrlq mm0,mm7`
			`movd [4+edx+eax*4],mm0`
			`jnz Lsimple_top`
			`movd mm0,[ebx]`
			`movd eax,mm5`
			`psllq mm0,mm6`
			`pop edi`
			`pop ebx`
			`movd [edx],mm0`
			`emms`
			`ret`

			`; eax size`
			`; ebx src`
			`; ecx shift`
			`; edx dst`
			`; esi`
			`; edi`
			`; ebp`

			`align 8`
			`Lunroll:`
			`movd mm5,[ebx+eax*4-4] ; src high limb`
			`lea edi,[ebx+eax*4]`
			`movd mm6,ecx ; lshift`
			`and edi,4`
			`psllq mm5,mm6`
			`jz Lstart_src_aligned`

			`; src isn't aligned,process high limb separately (marked xxx) to`
			`; make it so.`
			`;`
			`; source -8(ebx,%eax,4)`
			`; \|`
			`; +-------+-------+-------+--`
			`; \| \|`
			`; +-------+-------+-------+--`
			`; 0mod8 4mod8 0mod8`
			`;`
			`; dest`
			`; -4(edx,%eax,4)`
			`; \|`
			`; +-------+-------+--`
			`; \| xxx \| \|`
			`; +-------+-------+--`

			`movq mm0,[ebx+eax*4-8] ; unaligned load`
			`psllq mm0,mm6`
			`dec eax`
			`psrlq mm0,32`
			`movd [edx+eax*4],mm0`
			`Lstart_src_aligned:`
			`movq mm1,[ebx+eax*4-8] ; src high qword`
			`lea edi,[edx+eax*4]`
			`and edi,4`
			`psrlq mm5,32 ; return value`
			`movq mm3,[ebx+eax*4-16] ; src second highest qword`
			`jz Lstart_dst_aligned`

			`; dst isn't aligned,subtract 4 to make it so,and pretend the shift`
			`; is 32 bits extra. High limb of dst (marked xxx) handled here`
			`; separately.`
			`;`
			`; source -8(ebx,%eax,4)`
			`; \|`
			`; +-------+-------+--`
			`; \| mm1 \|`
			`; +-------+-------+--`
			`; 0mod8 4mod8`
			`;`
			`; dest`
			`; -4(edx,%eax,4)`
			`; \|`
			`; +-------+-------+-------+--`
			`; \| xxx \| \|`
			`; +-------+-------+-------+--`
			`; 0mod8 4mod8 0mod8`

			`movq mm0,mm1`
			`add ecx,32 ; new shift`
			`psllq mm0,mm6`
			`movd mm6,ecx`
			`psrlq mm0,32`

			`; wasted cycle here waiting for %mm0`

			`movd [-4+edx+eax*4],mm0`
			`sub edx,4`
			`Lstart_dst_aligned:`

			`psllq mm1,mm6`
			`neg ecx ; -shift`
			`add ecx,64 ; 64-shift`
			`movq mm2,mm3`
			`movd mm7,ecx`
			`sub eax,8 ; size-8`
			`psrlq mm3,mm7`
			`por mm3,mm1 ; mm3 ready to store`
			`jc Lfinish`

			`; The comments in mpn_rshift apply here too.`

			`; eax counter,limbs`
			`; ebx src`
			`; ecx`
			`; edx dst`
			`; esi`
			`; edi`
			`;`
			`; mm0`
			`; mm1`
			`; mm2 src qword from 16(%ebx,%eax,4)`
			`; mm3 dst qword ready to store to 24(%edx,%eax,4)`
			`;`
			`; mm5 return value`
			`; mm6 lshift`
			`; mm7 rshift`

			`align 8`
			`Lunroll_loop:`
			`movq mm0,[ebx+eax*4+8]`
			`psllq mm2,mm6`
			`movq mm1,mm0`
			`psrlq mm0,mm7`
			`movq [24+edx+eax*4],mm3`
			`por mm0,mm2`
			`movq mm3,[ebx+eax*4]`
			`psllq mm1,mm6`
			`movq [16+edx+eax*4],mm0`
			`movq mm2,mm3`
			`psrlq mm3,mm7`
			`sub eax,4`
			`por mm3,mm1`
			`jnc Lunroll_loop`
			`Lfinish:`
			`; eax -4 to -1 representing respectively 0 to 3 limbs remaining`

			`test al,2`
			`jz Lfinish_no_two`
			`movq mm0,[ebx+eax*4+8]`
			`psllq mm2,mm6`
			`movq mm1,mm0`
			`psrlq mm0,mm7`
			`movq [24+edx+eax*4],mm3 ; prev`
			`por mm0,mm2`
			`movq mm2,mm1`
			`movq mm3,mm0`
			`sub eax,2`
			`Lfinish_no_two:`

			`; eax -4 or -3 representing respectively 0 or 1 limbs remaining`
			`; mm2 src prev qword,from 16(%ebx,%eax,4)`
			`; mm3 dst qword,for 24(%edx,%eax,4)`

			`test al,1`
			`movd eax,mm5 ; retval`
			`pop edi`
			`jz Lfinish_zero`

			`; One extra src limb,destination was aligned.`
			`;`
			`; source ebx`
			`; --+---------------+-------+`
			`; \| mm2 \| \|`
			`; --+---------------+-------+`
			`;`
			`; dest edx+12 edx+4 edx`
			`; --+---------------+---------------+-------+`
			`; \| mm3 \| \| \|`
			`; --+---------------+---------------+-------+`
			`;`
			`; mm6 = shift`
			`; mm7 = ecx = 64-shift`

			`; One extra src limb,destination was unaligned.`
			`;`
			`; source ebx`
			`; --+---------------+-------+`
			`; \| mm2 \| \|`
			`; --+---------------+-------+`
			`;`
			`; dest edx+12 edx+4`
			`; --+---------------+---------------+`
			`; \| mm3 \| \|`
			`; --+---------------+---------------+`
			`;`
			`; mm6 = shift+32`
			`; mm7 = ecx = 64-(shift+32)`


			`; In both cases there's one extra limb of src to fetch and combine`
			`; with mm2 to make a qword at 4(%edx),and in the aligned case`
			`; there's an extra limb of dst to be formed from that extra src limb`
			`; left shifted.`

			`movd mm0,[ebx]`
			`psllq mm2,mm6`
			`movq [12+edx],mm3`
			`psllq mm0,32`
			`movq mm1,mm0`
			`psrlq mm0,mm7`
			`por mm0,mm2`
			`psllq mm1,mm6`
			`movq [4+edx],mm0`
			`psrlq mm1,32`
			`and ecx,32`
			`pop ebx`
			`jz Lfinish_one_unaligned`
			`movd [edx],mm1`
			`Lfinish_one_unaligned:`
			`emms`
			`ret`
			`Lfinish_zero:`

			`; No extra src limbs,destination was aligned.`
			`;`
			`; source ebx`
			`; --+---------------+`
			`; \| mm2 \|`
			`; --+---------------+`
			`;`
			`; dest edx+8 edx`
			`; --+---------------+---------------+`
			`; \| mm3 \| \|`
			`; --+---------------+---------------+`
			`;`
			`; mm6 = shift`
			`; mm7 = ecx = 64-shift`

			`; No extra src limbs,destination was unaligned.`
			`;`
			`; source ebx`
			`; --+---------------+`
			`; \| mm2 \|`
			`; --+---------------+`
			`;`
			`; dest edx+8 edx+4`
			`; --+---------------+-------+`
			`; \| mm3 \| \|`
			`; --+---------------+-------+`
			`;`
			`; mm6 = shift+32`
			`; mm7 = ecx = 64-(shift+32)`

			`; The movd for the unaligned case writes the same data to 4(%edx)`
			`; that the movq does for the aligned case.`

			`movq [8+edx],mm3`
			`and ecx,32`
			`psllq mm2,mm6`
			`jz Lfinish_zero_unaligned`
			`movq [edx],mm2`
			`Lfinish_zero_unaligned:`
			`psrlq mm2,32`
			`pop ebx`
			`movd eax,mm5 ; retval`
			`movd [4+edx],mm2`
			`emms`
			`ret`

			`end`