; Copyright 2001 Free Software Foundation, Inc.
;
; This file is part of the GNU MP Library.
;
; The GNU MP Library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public License as
; published by the Free Software Foundation; either version 2.1 of the
; License, or (at your option) any later version.
;
; The GNU MP Library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with the GNU MP Library; see the file COPYING.LIB.  If
; not, write to the Free Software Foundation, Inc., 59 Temple Place -
; Suite 330, Boston, MA 02111-1307, USA.
;
; Translation of AT&T syntax code by Brian Gladman

%include "..\..\x86i.inc"

%define PARAM_SHIFT esp+frame+16
%define PARAM_SIZE  esp+frame+12
%define PARAM_SRC   esp+frame+8
%define PARAM_DST   esp+frame+4
%define frame       8

; minimum 5, because the unrolled loop can't handle less
%define UNROLL_THRESHOLD 5

        section .text

        global  ___gmpn_lshift
%ifdef  DLL
        export  ___gmpn_lshift
%endif

        align   8

___gmpn_lshift:
        push    ebx
        push    edi

        mov     eax,[PARAM_SIZE]
        mov     edx,[PARAM_DST]

        mov     ebx,[PARAM_SRC]
        mov     ecx,[PARAM_SHIFT]

        cmp     eax,UNROLL_THRESHOLD
        jae     Lunroll

        mov     edi,[-4+ebx+eax*4]      ; src high limb
        dec     eax
        jnz     Lsimple

        shld    eax,edi,cl
        shl     edi,cl

        mov     [edx],edi               ; dst low limb
        pop     edi                     ; risk of data cache bank clash

        pop     ebx
        ret

; eax   size-1
; ebx   src
; ecx   shift
; edx   dst
; esi
; edi
; ebp

Lsimple:
        movd    mm5,[ebx+eax*4]         ; src high limb
        movd    mm6,ecx                 ; lshift
        neg     ecx

        psllq   mm5,mm6
        add     ecx,32

        movd    mm7,ecx
        psrlq   mm5,32                  ; retval

; eax   counter, limbs, negative
; ebx   src
; ecx
; edx   dst
; esi
; edi
;
; mm0   scratch
; mm5   return value
; mm6   shift
; mm7   32-shift

Lsimple_top:
        movq    mm0,[ebx+eax*4-4]
        dec     eax

        psrlq   mm0,mm7

        movd    [4+edx+eax*4],mm0
        jnz     Lsimple_top

        movd    mm0,[ebx]
        movd    eax,mm5

        psllq   mm0,mm6

        pop     edi
        pop     ebx

        movd    [edx],mm0
        emms
        ret

; eax   size
; ebx   src
; ecx   shift
; edx   dst
; esi
; edi
; ebp

        align   8

Lunroll:
        movd    mm5,[ebx+eax*4-4]       ; src high limb
        lea     edi,[ebx+eax*4]

        movd    mm6,ecx                 ; lshift
        and     edi,4

        psllq   mm5,mm6
        jz      Lstart_src_aligned

; src isn't aligned, process high limb separately (marked xxx) to
; make it so.
;
;  source         -8(ebx,%eax,4)
;                  |
;  +-------+-------+-------+--
;  |               |
;  +-------+-------+-------+--
;        0mod8   4mod8   0mod8
;
;  dest
;     -4(edx,%eax,4)
;          |
;  +-------+-------+--
;  |  xxx  |       |
;  +-------+-------+--

        movq    mm0,[ebx+eax*4-8]       ; unaligned load

        psllq   mm0,mm6
        dec     eax

        psrlq   mm0,32

        movd    [edx+eax*4],mm0

Lstart_src_aligned:
        movq    mm1,[ebx+eax*4-8]       ; src high qword
        lea     edi,[edx+eax*4]

        and     edi,4
        psrlq   mm5,32                  ; return value

        movq    mm3,[ebx+eax*4-16]      ; src second highest qword
        jz      Lstart_dst_aligned

; dst isn't aligned, subtract 4 to make it so, and pretend the shift
; is 32 bits extra.  High limb of dst (marked xxx) handled here
; separately.
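;
; (Illustrative example, not from the original sources, with an assumed
; shift count of 5: the movd below stores the high dst limb
; low32((mm1 << 5) >> 32) = src[size-1]<<5 | src[size-2]>>27, and the
; rest of the routine then proceeds as if the shift were 5+32 = 37
; into the realigned dst.)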
;
;  source          -8(ebx,%eax,4)
;                        |
;  +-------+-------+--
;  |      mm1      |
;  +-------+-------+--
;        0mod8   4mod8
;
;  dest
;     -4(edx,%eax,4)
;          |
;  +-------+-------+-------+--
;  |  xxx  |               |
;  +-------+-------+-------+--
;        0mod8   4mod8   0mod8

        movq    mm0,mm1
        add     ecx,32                  ; new shift

        psllq   mm0,mm6

        movd    mm6,ecx
        psrlq   mm0,32                  ; wasted cycle here waiting for %mm0

        movd    [-4+edx+eax*4],mm0
        sub     edx,4

Lstart_dst_aligned:
        psllq   mm1,mm6
        neg     ecx                     ; -shift

        add     ecx,64                  ; 64-shift
        movq    mm2,mm3

        movd    mm7,ecx
        sub     eax,8                   ; size-8

        psrlq   mm3,mm7

        por     mm3,mm1                 ; mm3 ready to store
        jc      Lfinish

; The comments in mpn_rshift apply here too.

; eax   counter, limbs
; ebx   src
; ecx
; edx   dst
; esi
; edi
;
; mm0
; mm1
; mm2   src qword from 16(%ebx,%eax,4)
; mm3   dst qword ready to store to 24(%edx,%eax,4)
;
; mm5   return value
; mm6   lshift
; mm7   rshift

        align   8

Lunroll_loop:
        movq    mm0,[ebx+eax*4+8]
        psllq   mm2,mm6

        movq    mm1,mm0
        psrlq   mm0,mm7

        movq    [24+edx+eax*4],mm3
        por     mm0,mm2

        movq    mm3,[ebx+eax*4]
        psllq   mm1,mm6

        movq    [16+edx+eax*4],mm0
        movq    mm2,mm3

        psrlq   mm3,mm7
        sub     eax,4

        por     mm3,mm1
        jnc     Lunroll_loop

Lfinish:
        ; eax   -4 to -1 representing respectively 0 to 3 limbs remaining

        test    al,2
        jz      Lfinish_no_two

        movq    mm0,[ebx+eax*4+8]
        psllq   mm2,mm6

        movq    mm1,mm0
        psrlq   mm0,mm7

        movq    [24+edx+eax*4],mm3      ; prev
        por     mm0,mm2

        movq    mm2,mm1
        movq    mm3,mm0

        sub     eax,2

Lfinish_no_two:
        ; eax   -4 or -3 representing respectively 0 or 1 limbs remaining
        ;
        ; mm2   src prev qword, from 16(%ebx,%eax,4)
        ; mm3   dst qword, for 24(%edx,%eax,4)

        test    al,1
        movd    eax,mm5                 ; retval

        pop     edi
        jz      Lfinish_zero

; One extra src limb, destination was aligned.
;
;                 source                  ebx
;                 --+---------------+-------+
;                   |      mm2      |       |
;                 --+---------------+-------+
;
; dest         edx+12           edx+4     edx
; --+---------------+---------------+-------+
;   |      mm3      |               |       |
; --+---------------+---------------+-------+
;
; mm6 = shift
; mm7 = ecx = 64-shift
;
; One extra src limb, destination was unaligned.
;
;                 source                  ebx
;                 --+---------------+-------+
;                   |      mm2      |       |
;                 --+---------------+-------+
;
;         dest         edx+12           edx+4
;         --+---------------+---------------+
;           |      mm3      |               |
;         --+---------------+---------------+
;
; mm6 = shift+32
; mm7 = ecx = 64-(shift+32)
;
; In both cases there's one extra limb of src to fetch and combine
; with mm2 to make a qword at 4(%edx), and in the aligned case
; there's an extra limb of dst to be formed from that extra src limb
; left shifted.

        movd    mm0,[ebx]
        psllq   mm2,mm6

        movq    [12+edx],mm3
        psllq   mm0,32

        movq    mm1,mm0
        psrlq   mm0,mm7

        por     mm0,mm2
        psllq   mm1,mm6

        movq    [4+edx],mm0
        psrlq   mm1,32

        and     ecx,32
        pop     ebx

        jz      Lfinish_one_unaligned

        movd    [edx],mm1

Lfinish_one_unaligned:
        emms
        ret

Lfinish_zero:

; No extra src limbs, destination was aligned.
;
;                 source          ebx
;                 --+---------------+
;                   |      mm2      |
;                 --+---------------+
;
; dest          edx+8             edx
; --+---------------+---------------+
;   |      mm3      |               |
; --+---------------+---------------+
;
; mm6 = shift
; mm7 = ecx = 64-shift
;
; No extra src limbs, destination was unaligned.
;
;                 source          ebx
;                 --+---------------+
;                   |      mm2      |
;                 --+---------------+
;
;         dest      edx+8       edx+4
;         --+---------------+-------+
;           |      mm3      |       |
;         --+---------------+-------+
;
; mm6 = shift+32
; mm7 = ecx = 64-(shift+32)
;
; The movd for the unaligned case writes the same data to 4(%edx)
; that the movq does for the aligned case.

        movq    [8+edx],mm3
        and     ecx,32

        psllq   mm2,mm6
        jz      Lfinish_zero_unaligned

        movq    [edx],mm2

Lfinish_zero_unaligned:
        psrlq   mm2,32
        pop     ebx

        movd    eax,mm5                 ; retval
        movd    [4+edx],mm2

        emms
        ret

        end
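
; For reference, an illustrative C model of what ___gmpn_lshift
; computes (a sketch, not part of the GMP sources, kept in comments so
; the file still assembles; 'lshift_ref' and its parameter names are
; hypothetical, and 32-bit limbs with 1 <= shift <= 31 are assumed):
;
;   /* Shift {src,size} left by 'shift' bits, writing {dst,size};
;      return the bits shifted out of the top limb.  Works high to
;      low, matching the store order of the code above.  */
;   unsigned int
;   lshift_ref (unsigned int *dst, const unsigned int *src,
;               long size, unsigned shift)
;   {
;     unsigned int retval = src[size - 1] >> (32 - shift);
;     long i;
;     for (i = size - 1; i > 0; i--)
;       dst[i] = (src[i] << shift) | (src[i - 1] >> (32 - shift));
;     dst[0] = src[0] << shift;
;     return retval;
;   }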