; mpir/mpn/x86i/pentium4/mmx/lshift.asm
; (NASM syntax)

; Copyright 2001 Free Software Foundation, Inc.
;
; This file is part of the GNU MP Library.
;
; The GNU MP Library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public License as
; published by the Free Software Foundation; either version 2.1 of the
; License, or (at your option) any later version.
;
; The GNU MP Library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with the GNU MP Library; see the file COPYING.LIB. If
; not, write to the Free Software Foundation, Inc., 59 Temple Place -
; Suite 330, Boston, MA 02111-1307, USA.
;
; Translation of AT&T syntax code by Brian Gladman
; Project-wide include for the x86i NASM sources.
%include "..\..\x86i.inc"

; Bytes pushed by the prologue of ___gmpn_lshift (ebx and edi) before the
; stack arguments are read; the PARAM_* offsets below are relative to esp
; after those pushes.
%define frame               8

; Stack arguments, lowest address first.
%define PARAM_DST           esp+frame+4
%define PARAM_SRC           esp+frame+8
%define PARAM_SIZE          esp+frame+12
%define PARAM_SHIFT         esp+frame+16

; Sizes at or above this limb count take the unrolled path.  The minimum is
; 5, because the unrolled loop can't handle less.
%define UNROLL_THRESHOLD    5

            section .text

            global  ___gmpn_lshift
%ifdef DLL
            export  ___gmpn_lshift
%endif
align 8
;-----------------------------------------------------------------------
; ___gmpn_lshift -- shift a multi-limb number left by a bit count.
;
; Stack arguments (read through the PARAM_* defines after the two pushes
; in the prologue):
;   PARAM_DST    pointer to the destination limbs
;   PARAM_SRC    pointer to the source limbs
;   PARAM_SIZE   number of 32-bit limbs
;   PARAM_SHIFT  shift count in bits
;
; Returns in eax the bits shifted out of the top of the high source limb.
;
; Limbs are processed from the most significant end downwards.  Three
; code paths are used:
;   size == 1                      integer shld/shl, no MMX
;   2 <= size < UNROLL_THRESHOLD   Lsimple, one limb per iteration
;   size >= UNROLL_THRESHOLD       Lunroll, four limbs per iteration after
;                                  bringing src and then dst to an 8-byte
;                                  boundary
;
; ebx and edi are saved and restored.  eax, ecx, edx and, on the MMX
; paths, mm0-mm3 and mm5-mm7 are overwritten.  esi and ebp are never
; touched.  Every path that uses MMX executes emms before returning.
;
; Several branches below consume flags that were set a few instructions
; earlier; the MMX instructions, movd and pop in between do not modify
; EFLAGS, so the instruction order must be kept as it is.
;
; NOTE(review): the code appears to assume size >= 1 and
; 1 <= shift <= 31 (the final `and ecx,32` tests depend on the latter) --
; confirm against the callers.
;-----------------------------------------------------------------------
___gmpn_lshift:
push ebx
push edi
mov eax,[PARAM_SIZE] ; eax = size in limbs
mov edx,[PARAM_DST] ; edx = dst
mov ebx,[PARAM_SRC] ; ebx = src
mov ecx,[PARAM_SHIFT] ; ecx = shift count
cmp eax,UNROLL_THRESHOLD
jae Lunroll ; large enough for the unrolled code
mov edi,[-4+ebx+eax*4] ; src high limb
dec eax ; eax = size-1
jnz Lsimple ; two or more limbs
; size == 1: integer-only path, no MMX state to clean up.
shld eax,edi,cl ; eax was decremented to zero, so it receives just the bits shifted out of edi
shl edi,cl
mov [edx],edi ; dst low limb
pop edi ; risk of data cache bank clash
pop ebx
ret
;-----------------------------------------------------------------------
; Simple loop, 2 <= size < UNROLL_THRESHOLD.  On entry:
;
; eax size-1
; ebx src
; ecx shift
; edx dst
; edi src high limb (not used again, restored on exit)
Lsimple:
movd mm5,[ebx+eax*4] ; src high limb
movd mm6,ecx ; lshift
neg ecx
psllq mm5,mm6
add ecx,32 ; ecx = 32-shift
movd mm7,ecx
psrlq mm5,32 ; retval
; Loop state:
;
; eax counter, index of the dst limb to produce next, counts down to zero
; ebx src
; ecx
; edx dst
;
; mm0 scratch
; mm5 return value
; mm6 shift
; mm7 32-shift
;
; Each pass loads src limbs eax-1 and eax as one qword and shifts it
; right by 32-shift, which leaves dst limb eax in the low dword.
Lsimple_top:
movq mm0,[ebx+eax*4-4]
dec eax
psrlq mm0,mm7
movd [4+edx+eax*4],mm0
jnz Lsimple_top ; flags from the dec above
movd mm0,[ebx] ; src low limb
movd eax,mm5 ; retval
psllq mm0,mm6
pop edi
pop ebx
movd [edx],mm0 ; dst low limb
emms
ret
;-----------------------------------------------------------------------
; Unrolled code, size >= UNROLL_THRESHOLD.  On entry:
;
; eax size
; ebx src
; ecx shift
; edx dst
align 8
Lunroll:
movd mm5,[ebx+eax*4-4] ; src high limb
lea edi,[ebx+eax*4] ; address just past the src high limb
movd mm6,ecx ; lshift
and edi,4 ; ZF set if that address has bit 2 clear
psllq mm5,mm6
jz Lstart_src_aligned ; flags from the and above
; src isn't aligned, process high limb separately (marked xxx) to
; make it so.
;
;  source     [ebx+eax*4-8]
;                  |
;  +-------+-------+-------+--
;  |               |
;  +-------+-------+-------+--
;        0mod8   4mod8   0mod8
;
;  dest
;     [edx+eax*4-4]
;          |
;  +-------+-------+--
;  |  xxx  |       |
;  +-------+-------+--
movq mm0,[ebx+eax*4-8] ; unaligned load
psllq mm0,mm6
dec eax ; one limb fewer to go
psrlq mm0,32 ; high dword of the shifted qword is the dst high limb
movd [edx+eax*4],mm0
Lstart_src_aligned:
movq mm1,[ebx+eax*4-8] ; src high qword
lea edi,[edx+eax*4] ; address just past the remaining dst limbs
and edi,4 ; ZF set if that address has bit 2 clear
psrlq mm5,32 ; return value
movq mm3,[ebx+eax*4-16] ; src second highest qword
jz Lstart_dst_aligned ; flags from the and above
; dst isn't aligned, subtract 4 to make it so, and pretend the shift
; is 32 bits extra.  High limb of dst (marked xxx) handled here
; separately.
;
;  source     [ebx+eax*4-8]
;                  |
;  +-------+-------+--
;  |      mm1      |
;  +-------+-------+--
;        0mod8   4mod8
;
;  dest
;     [edx+eax*4-4]
;          |
;  +-------+-------+-------+--
;  |  xxx  |               |
;  +-------+-------+-------+--
;        0mod8   4mod8   0mod8
movq mm0,mm1
add ecx,32 ; new shift
psllq mm0,mm6 ; still the original shift here
movd mm6,ecx ; mm6 = shift+32 from now on
psrlq mm0,32
; wasted cycle here waiting for mm0
movd [-4+edx+eax*4],mm0
sub edx,4
Lstart_dst_aligned:
psllq mm1,mm6
neg ecx ; -shift
add ecx,64 ; 64-shift
movq mm2,mm3
movd mm7,ecx
sub eax,8 ; size-8
psrlq mm3,mm7
por mm3,mm1 ; mm3 ready to store
jc Lfinish ; borrow from the sub above: eax was below 8
; The comments in mpn_rshift apply here too.
;
; eax counter, limbs
; ebx src
; ecx
; edx dst
;
; mm0
; mm1
; mm2 src qword from [ebx+eax*4+16]
; mm3 dst qword ready to store to [edx+eax*4+24]
;
; mm5 return value
; mm6 lshift
; mm7 rshift
align 8
Lunroll_loop:
movq mm0,[ebx+eax*4+8]
psllq mm2,mm6
movq mm1,mm0
psrlq mm0,mm7
movq [24+edx+eax*4],mm3 ; prev
por mm0,mm2
movq mm3,[ebx+eax*4]
psllq mm1,mm6
movq [16+edx+eax*4],mm0
movq mm2,mm3
psrlq mm3,mm7
sub eax,4
por mm3,mm1 ; mm3 ready to store
jnc Lunroll_loop ; loop until the sub above borrows
Lfinish:
; eax -4 to -1 representing respectively 0 to 3 limbs remaining
test al,2 ; bit 1 is set for -2 and -1, i.e. 2 or 3 limbs remaining
jz Lfinish_no_two
; Process one more qword; mm2/mm3 end up in the same roles as in the
; loop, relative to the decremented eax.
movq mm0,[ebx+eax*4+8]
psllq mm2,mm6
movq mm1,mm0
psrlq mm0,mm7
movq [24+edx+eax*4],mm3 ; prev
por mm0,mm2
movq mm2,mm1
movq mm3,mm0
sub eax,2
Lfinish_no_two:
; eax -4 or -3 representing respectively 0 or 1 limbs remaining
; mm2 src prev qword, from [ebx+eax*4+16]
; mm3 dst qword, for [edx+eax*4+24]
test al,1 ; bit 0 is set only for -3, i.e. 1 limb remaining
movd eax,mm5 ; retval
pop edi
jz Lfinish_zero ; flags from the test above
; One extra src limb, destination was aligned.
;
;                 source                  ebx
;                 --+---------------+-------+
;                   |      mm2      |       |
;                 --+---------------+-------+
;
; dest         edx+12           edx+4     edx
; --+---------------+---------------+-------+
;   |      mm3      |               |       |
; --+---------------+---------------+-------+
;
; mm6 = shift
; mm7 = ecx = 64-shift
;
;
; One extra src limb, destination was unaligned.
;
;                 source                  ebx
;                 --+---------------+-------+
;                   |      mm2      |       |
;                 --+---------------+-------+
;
;         dest         edx+12           edx+4
;         --+---------------+---------------+
;           |      mm3      |               |
;         --+---------------+---------------+
;
; mm6 = shift+32
; mm7 = ecx = 64-(shift+32)
;
;
; In both cases there's one extra limb of src to fetch and combine
; with mm2 to make a qword at [edx+4], and in the aligned case
; there's an extra limb of dst to be formed from that extra src limb
; left shifted.
movd mm0,[ebx] ; src low limb
psllq mm2,mm6
movq [12+edx],mm3
psllq mm0,32 ; move the limb to the high half of the qword
movq mm1,mm0
psrlq mm0,mm7
por mm0,mm2
psllq mm1,mm6
movq [4+edx],mm0
psrlq mm1,32
and ecx,32 ; bit 5 of ecx is tested to tell 64-shift from 64-(shift+32)
pop ebx
jz Lfinish_one_unaligned ; flags from the and above
movd [edx],mm1 ; aligned case only: dst low limb
Lfinish_one_unaligned:
emms
ret
Lfinish_zero:
; No extra src limbs, destination was aligned.
;
;                 source          ebx
;                 --+---------------+
;                   |      mm2      |
;                 --+---------------+
;
; dest          edx+8             edx
; --+---------------+---------------+
;   |      mm3      |               |
; --+---------------+---------------+
;
; mm6 = shift
; mm7 = ecx = 64-shift
;
;
; No extra src limbs, destination was unaligned.
;
;               source            ebx
;                 --+---------------+
;                   |      mm2      |
;                 --+---------------+
;
;         dest          edx+8   edx+4
;         --+---------------+-------+
;           |      mm3      |       |
;         --+---------------+-------+
;
; mm6 = shift+32
; mm7 = ecx = 64-(shift+32)
;
;
; The movd for the unaligned case writes the same data to [edx+4]
; that the movq does for the aligned case.
movq [8+edx],mm3
and ecx,32 ; bit 5 of ecx is tested to tell 64-shift from 64-(shift+32)
psllq mm2,mm6
jz Lfinish_zero_unaligned ; flags from the and above
movq [edx],mm2 ; aligned case only: low dst qword
Lfinish_zero_unaligned:
psrlq mm2,32
pop ebx
movd eax,mm5 ; retval (already loaded before the branch to Lfinish_zero)
movd [4+edx],mm2
emms
ret
end