; Copyright 2001 Free Software Foundation, Inc.
;
; This file is part of the GNU MP Library.
;
; The GNU MP Library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public License as
; published by the Free Software Foundation; either version 2.1 of the
; License, or (at your option) any later version.
;
; The GNU MP Library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with the GNU MP Library; see the file COPYING.LIB. If
; not, write to the Free Software Foundation, Inc., 59 Temple Place -
; Suite 330, Boston, MA 02111-1307, USA.
;
; Translation of AT&T syntax code by Brian Gladman
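
; For reference, the operation performed, expressed as a rough C
; equivalent (a sketch assuming 32-bit limbs; ref_lshift, dst and src
; are illustrative names only, not GMP identifiers):
;
;   mp_limb_t ref_lshift (mp_limb_t *dst, const mp_limb_t *src,
;                         mp_size_t size, unsigned shift)   /* 1..31 */
;   {
;       mp_limb_t retval = src[size-1] >> (32 - shift);
;       mp_size_t i;
;       for (i = size - 1; i > 0; i--)
;           dst[i] = (src[i] << shift) | (src[i-1] >> (32 - shift));
;       dst[0] = src[0] << shift;
;       return retval;   /* the bits shifted out of the top limb */
;   }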

%include "..\..\x86i.inc"

%define PARAM_SHIFT esp+frame+16
%define PARAM_SIZE  esp+frame+12
%define PARAM_SRC   esp+frame+8
%define PARAM_DST   esp+frame+4
%define frame       8
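
; frame is 8 because ebx and edi are pushed on entry, so the first
; stack argument sits at esp+frame+4, just above the return address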

; minimum 5, because the unrolled loop can't handle less
%define UNROLL_THRESHOLD 5

section .text

    global  ___gmpn_lshift
%ifdef DLL
    export  ___gmpn_lshift
%endif

    align   8
___gmpn_lshift:
    push    ebx
    push    edi
    mov     eax,[PARAM_SIZE]
    mov     edx,[PARAM_DST]
    mov     ebx,[PARAM_SRC]
    mov     ecx,[PARAM_SHIFT]
    cmp     eax,UNROLL_THRESHOLD
    jae     Lunroll
    mov     edi,[-4+ebx+eax*4]  ; src high limb
    dec     eax
    jnz     Lsimple
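
    ; size==1: eax is now zero, so shld deposits the top cl bits of
    ; the limb into eax as the return value, and shl forms the dst limb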
    shld    eax,edi,cl
    shl     edi,cl
    mov     [edx],edi           ; dst low limb
    pop     edi                 ; risk of data cache bank clash
    pop     ebx
    ret

; eax  size-1
; ebx  src
; ecx  shift
; edx  dst
; esi
; edi
; ebp

Lsimple:
    movd    mm5,[ebx+eax*4]     ; src high limb
    movd    mm6,ecx             ; lshift
    neg     ecx
    psllq   mm5,mm6
    add     ecx,32
    movd    mm7,ecx
    psrlq   mm5,32              ; retval

; eax  counter, limbs, negative
; ebx  src
; ecx
; edx  dst
; esi
; edi
;
; mm0  scratch
; mm5  return value
; mm6  shift
; mm7  32-shift
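
; Each pass loads the unaligned qword spanning src[eax-1] and src[eax],
; then shifts it right by 32-shift so the low word becomes
; dst[eax] = (src[eax] << shift) | (src[eax-1] >> (32-shift))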

Lsimple_top:
    movq    mm0,[ebx+eax*4-4]
    dec     eax
    psrlq   mm0,mm7
    movd    [4+edx+eax*4],mm0
    jnz     Lsimple_top
    movd    mm0,[ebx]
    movd    eax,mm5
    psllq   mm0,mm6
    pop     edi
    pop     ebx
    movd    [edx],mm0
    emms
    ret

; eax  size
; ebx  src
; ecx  shift
; edx  dst
; esi
; edi
; ebp

    align   8
Lunroll:
    movd    mm5,[ebx+eax*4-4]   ; src high limb
    lea     edi,[ebx+eax*4]
    movd    mm6,ecx             ; lshift
    and     edi,4
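    ; edi is nonzero when the top of src is only 4-byte aligned; the
    ; high limb is then handled separately so the movq loads that
    ; follow are 8-byte aligned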
    psllq   mm5,mm6
    jz      Lstart_src_aligned

; src isn't aligned, process high limb separately (marked xxx) to
; make it so.
;
;  source    [ebx+eax*4-8]
;                |
;  +-------+-------+-------+--
;  |               |
;  +-------+-------+-------+--
;  0mod8   4mod8   0mod8
;
;  dest
;     [edx+eax*4-4]
;         |
;  +-------+-------+--
;  |  xxx  |       |
;  +-------+-------+--

    movq    mm0,[ebx+eax*4-8]   ; unaligned load
    psllq   mm0,mm6
    dec     eax
    psrlq   mm0,32
    movd    [edx+eax*4],mm0
Lstart_src_aligned:
    movq    mm1,[ebx+eax*4-8]   ; src high qword
    lea     edi,[edx+eax*4]
    and     edi,4
    psrlq   mm5,32              ; return value
    movq    mm3,[ebx+eax*4-16]  ; src second highest qword
    jz      Lstart_dst_aligned

; dst isn't aligned, subtract 4 to make it so, and pretend the shift
; is 32 bits extra. High limb of dst (marked xxx) handled here
; separately.
;
;  source    [ebx+eax*4-8]
;                |
;  +-------+-------+--
;  |      mm1      |
;  +-------+-------+--
;  0mod8   4mod8
;
;  dest
;     [edx+eax*4-4]
;         |
;  +-------+-------+-------+--
;  |  xxx  |               |
;  +-------+-------+-------+--
;  0mod8   4mod8   0mod8

    movq    mm0,mm1
    add     ecx,32              ; new shift
    psllq   mm0,mm6
    movd    mm6,ecx
    psrlq   mm0,32

    ; wasted cycle here waiting for mm0

    movd    [-4+edx+eax*4],mm0
    sub     edx,4
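    ; in this branch mm6 now holds shift+32 and edx has moved down a
    ; limb, so the qword stores in the loop below are 8-byte aligned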
Lstart_dst_aligned:

    psllq   mm1,mm6
    neg     ecx                 ; -shift
    add     ecx,64              ; 64-shift
    movq    mm2,mm3
    movd    mm7,ecx
    sub     eax,8               ; size-8
    psrlq   mm3,mm7
    por     mm3,mm1             ; mm3 ready to store
    jc      Lfinish

; The comments in mpn_rshift apply here too.

; eax  counter, limbs
; ebx  src
; ecx
; edx  dst
; esi
; edi
;
; mm0
; mm1
; mm2  src qword from [ebx+eax*4+16]
; mm3  dst qword ready to store to [edx+eax*4+24]
;
; mm5  return value
; mm6  lshift
; mm7  rshift
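
; The loop below is software pipelined, producing two qwords (four
; limbs) per pass: each dst qword is (src qword << shift) or'ed with
; the next lower src qword >> (64-shift), while the store of the
; previous result overlaps the new computation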

    align   8
Lunroll_loop:
    movq    mm0,[ebx+eax*4+8]
    psllq   mm2,mm6
    movq    mm1,mm0
    psrlq   mm0,mm7
    movq    [24+edx+eax*4],mm3
    por     mm0,mm2
    movq    mm3,[ebx+eax*4]
    psllq   mm1,mm6
    movq    [16+edx+eax*4],mm0
    movq    mm2,mm3
    psrlq   mm3,mm7
    sub     eax,4
    por     mm3,mm1
    jnc     Lunroll_loop
Lfinish:
; eax  -4 to -1 representing respectively 0 to 3 limbs remaining

    test    al,2
    jz      Lfinish_no_two
    movq    mm0,[ebx+eax*4+8]
    psllq   mm2,mm6
    movq    mm1,mm0
    psrlq   mm0,mm7
    movq    [24+edx+eax*4],mm3  ; prev
    por     mm0,mm2
    movq    mm2,mm1
    movq    mm3,mm0
    sub     eax,2
Lfinish_no_two:

; eax  -4 or -3 representing respectively 0 or 1 limbs remaining
; mm2  src prev qword, from [ebx+eax*4+16]
; mm3  dst qword, for [edx+eax*4+24]

    test    al,1
    movd    eax,mm5             ; retval
    pop     edi
    jz      Lfinish_zero

; One extra src limb, destination was aligned.
;
;  source                          ebx
;  --+---------------+-------+
;    |      mm2      |       |
;  --+---------------+-------+
;
;  dest    edx+12          edx+4   edx
;  --+---------------+---------------+-------+
;    |      mm3      |               |       |
;  --+---------------+---------------+-------+
;
; mm6 = shift
; mm7 = ecx = 64-shift
;
; One extra src limb, destination was unaligned.
;
;  source                          ebx
;  --+---------------+-------+
;    |      mm2      |       |
;  --+---------------+-------+
;
;  dest    edx+12          edx+4
;  --+---------------+---------------+
;    |      mm3      |               |
;  --+---------------+---------------+
;
; mm6 = shift+32
; mm7 = ecx = 64-(shift+32)

; In both cases there's one extra limb of src to fetch and combine
; with mm2 to make a qword at [edx+4], and in the aligned case
; there's an extra limb of dst to be formed from that extra src limb
; left shifted.
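
; src[0] is positioned in the high word of mm0 (psllq by 32) so the
; usual mm6/mm7 shift pair yields both the qword for [edx+4] and, via
; mm1, the low dst limb needed in the aligned case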

    movd    mm0,[ebx]
    psllq   mm2,mm6
    movq    [12+edx],mm3
    psllq   mm0,32
    movq    mm1,mm0
    psrlq   mm0,mm7
    por     mm0,mm2
    psllq   mm1,mm6
    movq    [4+edx],mm0
    psrlq   mm1,32
    and     ecx,32
    pop     ebx
    jz      Lfinish_one_unaligned
    movd    [edx],mm1
Lfinish_one_unaligned:
    emms
    ret
Lfinish_zero:

; No extra src limbs, destination was aligned.
;
;  source          ebx
;  --+---------------+
;    |      mm2      |
;  --+---------------+
;
;  dest     edx+8           edx
;  --+---------------+---------------+
;    |      mm3      |               |
;  --+---------------+---------------+
;
; mm6 = shift
; mm7 = ecx = 64-shift
;
; No extra src limbs, destination was unaligned.
;
;  source          ebx
;  --+---------------+
;    |      mm2      |
;  --+---------------+
;
;  dest     edx+8   edx+4
;  --+---------------+-------+
;    |      mm3      |       |
;  --+---------------+-------+
;
; mm6 = shift+32
; mm7 = ecx = 64-(shift+32)

; The movd for the unaligned case writes the same data to [edx+4]
; that the movq does for the aligned case.

    movq    [8+edx],mm3
    and     ecx,32
    psllq   mm2,mm6
    jz      Lfinish_zero_unaligned
    movq    [edx],mm2
Lfinish_zero_unaligned:
    psrlq   mm2,32
    pop     ebx
    movd    eax,mm5             ; retval
    movd    [4+edx],mm2
    emms
    ret

    end