mpir/mpn/x86i/pentium4/mmx/lshift.asm
brgladman 48248cda46 1. longlong.h change to add MSVC intrinsics
2. longlong.h rearrangement for Intel compiler
3. MSVC additions in test  code 
4. GMP 4.2.1 bug fixes
5. Intel format assembly code
2008-05-18 22:20:43 +00:00

354 lines
8.6 KiB
NASM

; Copyright 2001 Free Software Foundation, Inc.
;
; This file is part of the GNU MP Library.
;
; The GNU MP Library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public License as
; published by the Free Software Foundation; either version 2.1 of the
; License, or (at your option) any later version.
;
; The GNU MP Library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with the GNU MP Library; see the file COPYING.LIB. If
; not, write to the Free Software Foundation, Inc., 59 Temple Place -
; Suite 330, Boston, MA 02111-1307, USA.
;
; Translation of AT&T syntax code by Brian Gladman
%include "..\..\x86i.inc"
; Addresses of the stack arguments, expressed relative to esp. `frame` is
; the number of bytes this routine pushes before reading them (ebx and edi,
; 4 bytes each), so the first argument sits at esp+frame+4, just above the
; return address. `frame` is defined after the PARAM_ macros; that works
; because they are only expanded at their points of use further down.
%define PARAM_SHIFT esp+frame+16
%define PARAM_SIZE esp+frame+12
%define PARAM_SRC esp+frame+8
%define PARAM_DST esp+frame+4
%define frame 8
; Operand sizes of at least UNROLL_THRESHOLD limbs take the unrolled MMX
; path. The minimum is 5, because the unrolled loop can't handle less.
%define UNROLL_THRESHOLD 5
section .text
global ___gmpn_lshift
; When DLL is defined the symbol is also exported.
%ifdef DLL
export ___gmpn_lshift
%endif
; Align the function entry point to 8 bytes.
align 8
; ___gmpn_lshift(dst, src, size, shift)
;
; Shift the size-limb operand at src left by shift bits, store the
; size-limb result at dst, and return in eax the bits shifted out of the
; most significant limb. Limbs are 32 bits wide and are processed from the
; most significant end downwards.
;
; In:    arguments on the stack (see the PARAM_ macros)
; Out:   eax = bits shifted out of the high limb
; Saved: ebx and edi are pushed here and popped before every ret
; Uses:  eax, ecx, edx, flags; the multi-limb paths also use MMX
;        registers and execute emms before returning
;
; NOTE(review): the code appears to assume size >= 1 and 1 <= shift <= 31
; (size is never tested for zero, and the unrolled path tests bit 5 of
; 64-shift to tell its two cases apart) -- confirm against the callers.
___gmpn_lshift:
push ebx
push edi
mov eax,[PARAM_SIZE]
mov edx,[PARAM_DST]
mov ebx,[PARAM_SRC]
mov ecx,[PARAM_SHIFT]
; size >= UNROLL_THRESHOLD goes to the unrolled code
cmp eax,UNROLL_THRESHOLD
jae Lunroll
mov edi,[-4+ebx+eax*4] ; src high limb
dec eax
; size-1 != 0: more than one limb, use the simple loop
jnz Lsimple
; Single limb. eax was decremented to zero, so shld leaves in eax exactly
; the bits shifted out of the top of edi (the return value).
shld eax,edi,cl
shl edi,cl
mov [edx],edi ; dst low limb
pop edi ; risk of data cache bank clash
pop ebx
ret
; Simple one-limb-at-a-time loop, reached when size is below
; UNROLL_THRESHOLD and size-1 is non-zero.
;
; On entry:
; eax size-1
; ebx src
; ecx shift
; edx dst
; esi
; edi
; ebp
Lsimple:
movd mm5,[ebx+eax*4] ; src high limb
movd mm6,ecx ; lshift
; ecx = 32-shift, the right shift used inside the loop
neg ecx
psllq mm5,mm6
add ecx,32
movd mm7,ecx
; mm5 = (high limb << shift) >> 32, i.e. the bits shifted out
psrlq mm5,32 ; retval
; eax counter, limbs, counts down from size-1 to 0
; ebx src
; ecx
; edx dst
; esi
; edi
;
; mm0 scratch
; mm5 return value
; mm6 shift
; mm7 32-shift
;
; Each pass loads the qword holding src[eax-1] (low half) and src[eax]
; (high half) and shifts it right by 32-shift, which leaves
; (src[eax] << shift) | (src[eax-1] >> (32-shift)) in the low dword.
; That dword is stored to dst at the limb index eax had before the dec.
; The jnz tests the flags set by dec; the MMX instructions in between do
; not modify them.
Lsimple_top:
movq mm0,[ebx+eax*4-4]
dec eax
psrlq mm0,mm7
movd [4+edx+eax*4],mm0
jnz Lsimple_top
; Lowest limb: nothing comes in from below, so it is just src[0] << shift.
movd mm0,[ebx]
movd eax,mm5
psllq mm0,mm6
pop edi
pop ebx
movd [edx],mm0
emms
ret
; Set-up for the unrolled loop, reached when size >= UNROLL_THRESHOLD.
; It peels off high limbs as needed so that the qword loads from src and
; the qword stores to dst used from here on are at addresses that are
; multiples of 8, then prepares the first result qword.
;
; On entry:
; eax size
; ebx src
; ecx shift
; edx dst
; esi
; edi
; ebp
align 8
Lunroll:
movd mm5,[ebx+eax*4-4] ; src high limb
lea edi,[ebx+eax*4]
movd mm6,ecx ; lshift
; ZF is set when the end of src (src+size*4) is a multiple of 8; the jz
; below still sees it because psllq does not modify the flags
and edi,4
psllq mm5,mm6
jz Lstart_src_aligned
; src isn't aligned, process high limb separately (marked xxx) to
; make it so.
;
;  source       [ebx+eax*4-8]
;                    |
;    +-------+-------+-------+--
;    |               |
;    +-------+-------+-------+--
;          0mod8   4mod8   0mod8
;
;  dest
;               [edx+eax*4-4]
;                    |
;    +-------+-------+--
;    |  xxx  |       |
;    +-------+-------+--
;
; (eax in the addresses above is the value before the dec below.)
movq mm0,[ebx+eax*4-8] ; unaligned load
psllq mm0,mm6
dec eax
; after the two shifts the low dword of mm0 is the finished high dst limb
psrlq mm0,32
movd [edx+eax*4],mm0
Lstart_src_aligned:
; Here src+eax*4 is a multiple of 8.
movq mm1,[ebx+eax*4-8] ; src high qword
lea edi,[edx+eax*4]
; ZF is set when dst+eax*4 is a multiple of 8; it is tested by the jz
; below (psrlq and movq do not modify the flags)
and edi,4
psrlq mm5,32 ; return value
movq mm3,[ebx+eax*4-16] ; src second highest qword
jz Lstart_dst_aligned
; dst isn't aligned, subtract 4 to make it so, and pretend the shift
; is 32 bits extra. High limb of dst (marked xxx) handled here
; separately.
;
;  source       [ebx+eax*4-8]
;                    |
;    +-------+-------+--
;    |      mm1      |
;    +-------+-------+--
;  0mod8   4mod8   0mod8
;
;  dest
;               [edx+eax*4-4]
;                    |
;    +-------+-------+-------+--
;    |  xxx  |               |
;    +-------+-------+-------+--
;          0mod8   4mod8   0mod8
movq mm0,mm1
add ecx,32 ; new shift
; this psllq still uses the original shift; mm6 is replaced just after it
psllq mm0,mm6
movd mm6,ecx
psrlq mm0,32
; wasted cycle here waiting for mm0
movd [-4+edx+eax*4],mm0
sub edx,4
Lstart_dst_aligned:
; mm6 holds the effective left shift: shift if dst was aligned, shift+32
; if it was not. ecx and mm7 become 64 minus that value.
psllq mm1,mm6
neg ecx ; -shift
add ecx,64 ; 64-shift
movq mm2,mm3
movd mm7,ecx
; the sub borrows (CF set) when fewer than 8 limbs are left, in which
; case the unrolled loop is skipped; CF survives the MMX instructions
sub eax,8 ; size-8
psrlq mm3,mm7
por mm3,mm1 ; mm3 ready to store
jc Lfinish
; The comments in mpn_rshift apply here too.
;
; Main loop. Each pass consumes two src qwords and stores two dst qwords
; (four limbs), moving from high addresses to low.
;
; eax counter, limbs; reduced by 4 per pass, the loop ends when the
;     subtraction borrows
; ebx src
; ecx
; edx dst
; esi
; edi
;
; mm0
; mm1
; mm2 src qword from [ebx+eax*4+16]
; mm3 dst qword ready to store to [edx+eax*4+24]
;
; mm5 return value
; mm6 lshift
; mm7 rshift
;
; Every dst qword is (higher src qword << lshift) OR
; (next lower src qword >> rshift).
align 8
Lunroll_loop:
movq mm0,[ebx+eax*4+8]
psllq mm2,mm6
movq mm1,mm0
psrlq mm0,mm7
movq [24+edx+eax*4],mm3
por mm0,mm2
movq mm3,[ebx+eax*4]
psllq mm1,mm6
movq [16+edx+eax*4],mm0
movq mm2,mm3
psrlq mm3,mm7
; CF from this sub is what the jnc tests; the por in between leaves the
; flags alone
sub eax,4
por mm3,mm1
jnc Lunroll_loop
Lfinish:
; eax -4 to -1 representing respectively 0 to 3 limbs remaining
;
; Bit 1 of al is set for -2 and -1, i.e. when 2 or 3 limbs remain. In
; that case one more src qword is processed, exactly like the first half
; of a loop pass.
test al,2
jz Lfinish_no_two
movq mm0,[ebx+eax*4+8]
psllq mm2,mm6
movq mm1,mm0
psrlq mm0,mm7
movq [24+edx+eax*4],mm3 ; prev
por mm0,mm2
; re-establish the mm2/mm3 roles described at Lfinish_no_two
movq mm2,mm1
movq mm3,mm0
sub eax,2
Lfinish_no_two:
; eax -4 or -3 representing respectively 0 or 1 limbs remaining
; mm2 src prev qword, from [ebx+eax*4+16]
; mm3 dst qword, for [edx+eax*4+24]
;
; Bit 0 of al is set only for -3 (one limb remaining). The movd and pop
; that follow do not modify the flags, so the jz still tests this result.
test al,1
movd eax,mm5 ; retval
pop edi
jz Lfinish_zero
; One extra src limb, destination was aligned.
;
;                  source                  ebx
;                  --+---------------+-------+
;                    |      mm2      |       |
;                  --+---------------+-------+
;
;  dest         edx+12           edx+4     edx
;  --+---------------+---------------+-------+
;    |      mm3      |               |       |
;  --+---------------+---------------+-------+
;
; mm6 = shift
; mm7 = ecx = 64-shift
;
;
; One extra src limb, destination was unaligned.
;
;                  source                  ebx
;                  --+---------------+-------+
;                    |      mm2      |       |
;                  --+---------------+-------+
;
;          dest         edx+12           edx+4
;          --+---------------+---------------+
;            |      mm3      |               |
;          --+---------------+---------------+
;
; mm6 = shift+32
; mm7 = ecx = 64-(shift+32)
;
;
; In both cases there's one extra limb of src to fetch and combine
; with mm2 to make a qword at [edx+4], and in the aligned case
; there's an extra limb of dst to be formed from that extra src limb
; left shifted.
movd mm0,[ebx]
psllq mm2,mm6
movq [12+edx],mm3
; put the extra src limb in the high half of a qword
psllq mm0,32
movq mm1,mm0
psrlq mm0,mm7
por mm0,mm2
psllq mm1,mm6
movq [4+edx],mm0
psrlq mm1,32
; Bit 5 of ecx tells the two cases apart: it is set for 64-shift and
; clear for 64-(shift+32), assuming 1 <= shift <= 31. ZF from this and
; is tested by the jz after the pop.
and ecx,32
pop ebx
jz Lfinish_one_unaligned
movd [edx],mm1
Lfinish_one_unaligned:
emms
ret
Lfinish_zero:
; No extra src limbs, destination was aligned.
;
;                  source          ebx
;                  --+---------------+
;                    |      mm2      |
;                  --+---------------+
;
;  dest          edx+8             edx
;  --+---------------+---------------+
;    |      mm3      |               |
;  --+---------------+---------------+
;
; mm6 = shift
; mm7 = ecx = 64-shift
;
;
; No extra src limbs, destination was unaligned.
;
;                  source          ebx
;                  --+---------------+
;                    |      mm2      |
;                  --+---------------+
;
;          dest          edx+8   edx+4
;          --+---------------+-------+
;            |      mm3      |       |
;          --+---------------+-------+
;
; mm6 = shift+32
; mm7 = ecx = 64-(shift+32)
;
;
; The movd for the unaligned case writes the same data to [edx+4]
; that the movq does for the aligned case.
movq [8+edx],mm3
; Bit 5 of ecx is set in the aligned case and clear in the unaligned one
; (assuming 1 <= shift <= 31); psllq leaves ZF intact for the jz.
and ecx,32
psllq mm2,mm6
jz Lfinish_zero_unaligned
movq [edx],mm2
Lfinish_zero_unaligned:
psrlq mm2,32
pop ebx
; eax was already loaded from mm5 before the branch to Lfinish_zero and
; has not been changed since, so this reload repeats that value.
movd eax,mm5 ; retval
movd [4+edx],mm2
emms
ret
; NOTE(review): NASM has no `end` directive, so the line below is
; presumably assembled as a label named `end` (possibly with an
; orphan-label warning) -- confirm it is intentional.
end