; mpir/mpn/x86w/p3/lshift.asm

; Copyright 2001 Free Software Foundation, Inc.
;
; This file is part of the GNU MP Library.
;
; The GNU MP Library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public License as
; published by the Free Software Foundation; either version 2.1 of the
; License, or (at your option) any later version.
;
; The GNU MP Library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with the GNU MP Library; see the file COPYING.LIB. If
; not, write to the Free Software Foundation, Inc., 59 Temple Place -
; Suite 330, Boston, MA 02111-1307, USA.
;
; Translation of AT&T syntax code by Brian Gladman
%include "..\x86i.inc"
    global  ___gmpn_lshift
%ifdef DLL
    export  ___gmpn_lshift
%endif
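
; C-level contract (assumed here, matching the standard mpn_lshift
; interface):
;
;   mp_limb_t __gmpn_lshift (mp_ptr dst, mp_srcptr src,
;                            mp_size_t size, unsigned shift);
;
; Shift src,size left by shift bits (1 <= shift <= 31), store the result
; at dst and return the bits shifted out at the top.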
%define PARAM_SHIFT esp+frame+16
%define PARAM_SIZE esp+frame+12
%define PARAM_SRC esp+frame+8
%define PARAM_DST esp+frame+4
%define frame 8
; minimum 5, because the unrolled loop can't handle less
%define UNROLL_THRESHOLD 5
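
; Core identity, for orientation: with 1 <= shift <= 31 each destination
; limb is formed as dst[i] = (src[i] << shift) | (src[i-1] >> (32-shift)),
; with dst[0] = src[0] << shift; the return value is
; src[size-1] >> (32-shift).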
section .text
align 8
___gmpn_lshift:
    push    ebx
    push    edi

    mov     eax,[PARAM_SIZE]
    mov     edx,[PARAM_DST]
    mov     ebx,[PARAM_SRC]
    mov     ecx,[PARAM_SHIFT]

    cmp     eax,UNROLL_THRESHOLD
    jae     Lunroll

    mov     edi,[ebx+eax*4-4]       ; src high limb
    dec     eax
    jnz     Lsimple
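    ; size==1: eax is now zero, so shld deposits the top cl bits of edi
    ; into it, which is exactly the return value (the bits shifted out).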
    shld    eax,edi,cl
    shl     edi,cl
    mov     [edx],edi               ; dst low limb
    pop     edi                     ; risk of data cache bank clash
    pop     ebx
    ret
; eax size-1
; ebx src
; ecx shift
; edx dst
; esi
; edi
; ebp
Lsimple:
    movd    mm5,[ebx+eax*4]         ; src high limb
    movd    mm6,ecx                 ; lshift
    neg     ecx
    psllq   mm5,mm6
    add     ecx,32
    movd    mm7,ecx
    psrlq   mm5,32                  ; retval
; eax counter, limbs
; ebx src
; ecx
; edx dst
; esi
; edi
;
; mm0 scratch
; mm5 return value
; mm6 shift
; mm7 32-shift
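; Each pass reads the overlapping qword src[eax-1],src[eax]; shifting it
; right by 32-shift leaves (src[eax] << shift) | (src[eax-1] >> (32-shift))
; in the low dword, which is stored as dst[eax].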
Lsimple_top:
    movq    mm0,[ebx+eax*4-4]
    dec     eax
    psrlq   mm0,mm7
    movd    [edx+eax*4+4],mm0
    jnz     Lsimple_top

    movd    mm0,[ebx]
    movd    eax,mm5
    psllq   mm0,mm6
    pop     edi
    pop     ebx
    movd    [edx],mm0
    emms
    ret
; eax size
; ebx src
; ecx shift
; edx dst
; esi
; edi
; ebp
align 8
Lunroll:
    movd    mm5,[ebx+eax*4-4]       ; src high limb
    lea     edi,[ebx+eax*4]
    movd    mm6,ecx                 ; lshift
    and     edi,4
    psllq   mm5,mm6
    jz      Lstart_src_aligned
; src isn't aligned, process high limb separately (marked xxx) to
; make it so.
;
; source [ebx+eax*4-8]
;                 |
; +-------+-------+-------+--
; |               |
; +-------+-------+-------+--
;       0mod8   4mod8   0mod8
;
; dest
;    [edx+eax*4-4]
;         |
; +-------+-------+--
; |  xxx  |       |
; +-------+-------+--
    movq    mm0,[ebx+eax*4-8]       ; unaligned load
    psllq   mm0,mm6
    dec     eax
    psrlq   mm0,32
    movd    [edx+eax*4],mm0
Lstart_src_aligned:
    movq    mm1,[ebx+eax*4-8]       ; src high qword
    lea     edi,[edx+eax*4]
    and     edi,4
    psrlq   mm5,32                  ; return value
    movq    mm3,[ebx+eax*4-16]      ; src second highest qword
    jz      Lstart_dst_aligned
; dst isn't aligned, subtract 4 to make it so, and pretend the shift
; is 32 bits extra.  High limb of dst (marked xxx) handled here
; separately.
;
; source [ebx+eax*4-8]
;                 |
; +-------+-------+--
; |      mm1      |
; +-------+-------+--
;       0mod8   4mod8
;
; dest
;    [edx+eax*4-4]
;         |
; +-------+-------+-------+--
; |  xxx  |               |
; +-------+-------+-------+--
;       0mod8   4mod8   0mod8
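; For example, with shift=5 the effective shift becomes 37: every source
; bit lands 32 bits (one limb) higher within its stored qword, which
; cancels the 4-byte downward adjustment of edx below.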
    movq    mm0,mm1
    add     ecx,32                  ; new shift
    psllq   mm0,mm6
    movd    mm6,ecx
    psrlq   mm0,32
    ; wasted cycle here waiting for mm0
    movd    [edx+eax*4-4],mm0
    sub     edx,4
Lstart_dst_aligned:
    psllq   mm1,mm6
    neg     ecx                     ; -shift
    add     ecx,64                  ; 64-shift
    movq    mm2,mm3
    movd    mm7,ecx
    sub     eax,8                   ; size-8
    psrlq   mm3,mm7
    por     mm3,mm1                 ; mm3 ready to store
    jc      Lfinish
; The comments in mpn_rshift apply here too.
; eax counter,limbs
; ebx src
; ecx
; edx dst
; esi
; edi
;
; mm0
; mm1
; mm2 src qword from [ebx+eax*4+16]
; mm3 dst qword ready to store to [edx+eax*4+24]
;
; mm5 return value
; mm6 lshift
; mm7 rshift
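; The loop is software pipelined: mm3 carries the qword combined on the
; previous iteration and is stored at the top of each pass, hiding the
; MMX shift latency.  Per 64-bit chunk the computation amounts to
;
;   dst_qword = (src_qword << shift) | (next lower src qword >> (64-shift))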
align 8
Lunroll_loop:
    movq    mm0,[ebx+eax*4+8]
    psllq   mm2,mm6
    movq    mm1,mm0
    psrlq   mm0,mm7
    movq    [edx+eax*4+24],mm3
    por     mm0,mm2
    movq    mm3,[ebx+eax*4]
    psllq   mm1,mm6
    movq    [edx+eax*4+16],mm0
    movq    mm2,mm3
    psrlq   mm3,mm7
    sub     eax,4
    por     mm3,mm1
    jnc     Lunroll_loop
Lfinish:
; eax -4 to -1 representing respectively 0 to 3 limbs remaining
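; remaining limbs = eax+4, so bit 1 of al is set when 2 or 3 limbs remain
; (handled just below) and bit 0 is set when one odd limb remains
; (handled after Lfinish_no_two).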
    test    al,2
    jz      Lfinish_no_two

    movq    mm0,[ebx+eax*4+8]
    psllq   mm2,mm6
    movq    mm1,mm0
    psrlq   mm0,mm7
    movq    [edx+eax*4+24],mm3      ; prev
    por     mm0,mm2
    movq    mm2,mm1
    movq    mm3,mm0
    sub     eax,2
Lfinish_no_two:
; eax -4 or -3 representing respectively 0 or 1 limbs remaining
; mm2 src prev qword, from [ebx+eax*4+16]
; mm3 dst qword, for [edx+eax*4+24]
    test    al,1
    movd    eax,mm5                 ; retval
    pop     edi
    jz      Lfinish_zero
; One extra src limb, destination was aligned.
;
;  source                               ebx
;                  --+---------------+-------+
;                    |      mm2      |       |
;                  --+---------------+-------+
;
;  dest    edx+12          edx+4         edx
;  --+---------------+---------------+-------+
;    |      mm3      |               |       |
;  --+---------------+---------------+-------+
;
;  mm6 = shift
;  mm7 = ecx = 64-shift
;
; One extra src limb, destination was unaligned.
;
;  source                               ebx
;                  --+---------------+-------+
;                    |      mm2      |       |
;                  --+---------------+-------+
;
;  dest    edx+12          edx+4
;  --+---------------+---------------+
;    |      mm3      |               |
;  --+---------------+---------------+
;
;  mm6 = shift+32
;  mm7 = ecx = 64-(shift+32)
; In both cases there's one extra limb of src to fetch and combine
; with mm2 to make a qword at [edx+4], and in the aligned case
; there's an extra limb of dst to be formed from that extra src limb
; left shifted.
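; The and ecx,32 below separates the two cases: ecx holds 64-shift
; (bit 5 set) when the destination was aligned, or 32-shift (bit 5 clear)
; when it wasn't, in which case the extra dst limb store is skipped.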
    movd    mm0,[ebx]
    psllq   mm2,mm6
    movq    [edx+12],mm3
    psllq   mm0,32
    movq    mm1,mm0
    psrlq   mm0,mm7
    por     mm0,mm2
    psllq   mm1,mm6
    movq    [edx+4],mm0
    psrlq   mm1,32
    and     ecx,32
    pop     ebx
    jz      Lfinish_one_unaligned
    movd    [edx],mm1
Lfinish_one_unaligned:
    emms
    ret
Lfinish_zero:
; No extra src limbs, destination was aligned.
;
;  source               ebx
;                  --+---------------+
;                    |      mm2      |
;                  --+---------------+
;
;  dest    edx+8             edx
;  --+---------------+---------------+
;    |      mm3      |               |
;  --+---------------+---------------+
;
;  mm6 = shift
;  mm7 = ecx = 64-shift
;
; No extra src limbs, destination was unaligned.
;
;  source               ebx
;                  --+---------------+
;                    |      mm2      |
;                  --+---------------+
;
;  dest    edx+8       edx+4
;  --+---------------+-------+
;    |      mm3      |       |
;  --+---------------+-------+
;
;  mm6 = shift+32
;  mm7 = ecx = 64-(shift+32)
;
; The movd for the unaligned case writes the same data to [edx+4]
; that the movq does for the aligned case.
    movq    [edx+8],mm3
    and     ecx,32
    psllq   mm2,mm6
    jz      Lfinish_zero_unaligned
    movq    [edx],mm2
Lfinish_zero_unaligned:
    psrlq   mm2,32
    pop     ebx
    movd    eax,mm5                 ; retval
    movd    [edx+4],mm2
    emms
    ret
end
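
; For testing, a minimal C model of this routine's contract (names are
; illustrative, not part of MPIR):
;
;   typedef unsigned int mp_limb_t;   /* 32-bit limbs assumed */
;
;   mp_limb_t ref_lshift (mp_limb_t *dst, const mp_limb_t *src,
;                         long size, unsigned shift)
;   {
;       mp_limb_t retval = src[size-1] >> (32 - shift);
;       long i;
;       for (i = size - 1; i > 0; i--)
;           dst[i] = (src[i] << shift) | (src[i-1] >> (32 - shift));
;       dst[0] = src[0] << shift;
;       return retval;
;   }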