; Copyright 2001 Free Software Foundation, Inc.
;
; This file is part of the GNU MP Library.
;
; The GNU MP Library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public License as
; published by the Free Software Foundation; either version 2.1 of the
; License, or (at your option) any later version.
;
; The GNU MP Library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with the GNU MP Library; see the file COPYING.LIB.  If
; not, write to the Free Software Foundation, Inc., 59 Temple Place -
; Suite 330, Boston, MA 02111-1307, USA.
;
; Translation of AT&T syntax code by Brian Gladman
%include "..\..\x86i.inc"

; Stack-argument accessors.  `frame` is the number of bytes the routine
; pushes (ebx and edi, 4 bytes each) before it reads any argument, so
; [esp+frame+4] is the first argument above the return address.
; A %define body is expanded where the macro is used, not where it is
; defined, so `frame` may legitimately be defined after the PARAM_* macros.

%define PARAM_SHIFT esp+frame+16
%define PARAM_SIZE  esp+frame+12
%define PARAM_SRC   esp+frame+8
%define PARAM_DST   esp+frame+4
%define frame       8

; minimum 5, because the unrolled loop can't handle less
%define UNROLL_THRESHOLD 5

        section .text

        global  ___gmpn_lshift
%ifdef DLL
        export  ___gmpn_lshift
%endif
;-----------------------------------------------------------------------
; ___gmpn_lshift -- shift a vector of 32-bit limbs left by a bit count.
;
; Stack arguments, read through the PARAM_* macros:
;   PARAM_DST    destination limb pointer
;   PARAM_SRC    source limb pointer
;   PARAM_SIZE   number of limbs
;   PARAM_SHIFT  bit count
; Returns in eax the bits shifted out of the top of the high limb.
; ebx and edi are pushed on entry and popped before every ret.
;
; NOTE(review): nothing here validates the arguments; presumably callers
; guarantee size >= 1 and 1 <= shift <= 31 as documented for GMP's
; mpn_lshift -- confirm against the callers.
;
; Dispatch performed by this span:
;   size >= UNROLL_THRESHOLD          -> Lunroll
;   size == 1                         -> integer shifts, inline below
;   any other size below the threshold -> Lsimple
;-----------------------------------------------------------------------

        align   8
___gmpn_lshift:
        push    ebx
        push    edi                     ; two pushes = the 8 bytes of `frame`
        mov     eax, [PARAM_SIZE]
        mov     edx, [PARAM_DST]
        mov     ebx, [PARAM_SRC]
        mov     ecx, [PARAM_SHIFT]
        cmp     eax, UNROLL_THRESHOLD
        jae     Lunroll
        mov     edi, [ebx+eax*4-4]      ; src high limb
        dec     eax
        jnz     Lsimple                 ; size-1 != 0: more than one limb
        shld    eax, edi, cl            ; eax was decremented to zero, so it
                                        ; receives only the bits leaving edi
        shl     edi, cl
        mov     [edx], edi              ; dst low limb
        pop     edi                     ; risk of data cache bank clash
        pop     ebx
        ret
; Lsimple: one-limb-per-iteration MMX path.  Reached from the entry code
; when size is below UNROLL_THRESHOLD and size-1 is non-zero.
;
; On entry:
;   eax   size-1
;   ebx   src
;   ecx   shift
;   edx   dst
;   esi   (unused)
;   edi   (unused here; restored from the stack on exit)
;   ebp   (unused)

Lsimple:
        movd    mm5, [ebx+eax*4]        ; src high limb
        movd    mm6, ecx                ; lshift
        neg     ecx
        psllq   mm5, mm6
        add     ecx, 32                 ; ecx = 32-shift
        movd    mm7, ecx
        psrlq   mm5, 32                 ; retval

; Loop registers:
;   eax   limb index, counts down from size-1 to 0
;   ebx   src
;   ecx   (32-shift, already copied to mm7)
;   edx   dst
;   esi   (unused)
;   edi   (unused)
;
;   mm0   scratch
;   mm5   return value
;   mm6   shift
;   mm7   32-shift
;
; Each pass loads src[eax-1] and src[eax] as one qword, shifts it right by
; 32-shift, and stores the low dword of the result as dst[eax].

Lsimple_top:
        movq    mm0, [ebx+eax*4-4]
        dec     eax
        psrlq   mm0, mm7
        movd    [edx+eax*4+4], mm0
        jnz     Lsimple_top             ; tests ZF from the dec above

        movd    mm0, [ebx]              ; lowest src limb
        movd    eax, mm5                ; retval
        psllq   mm0, mm6
        pop     edi
        pop     ebx
        movd    [edx], mm0              ; dst low limb
        emms
        ret
; Lunroll: unrolled MMX path for size >= UNROLL_THRESHOLD, working from the
; high end of the operands towards the low end.  The code from Lunroll down
; to the ret after Lfinish_one_unaligned is a single fall-through chain;
; the only other exit is the jump to Lfinish_zero.
;
; On entry:
;   eax   size
;   ebx   src
;   ecx   shift
;   edx   dst
;   esi   (unused)
;   edi   scratch (the caller's value is on the stack)
;   ebp   (unused)

        align   8
Lunroll:
        movd    mm5, [ebx+eax*4-4]      ; src high limb
        lea     edi, [ebx+eax*4]        ; address just past the src high limb
        movd    mm6, ecx                ; lshift
        and     edi, 4                  ; is that address 8-byte aligned?
        psllq   mm5, mm6
        jz      Lstart_src_aligned

; src isn't aligned, process high limb separately (marked xxx) to
; make it so.
;
;  source     [ebx+eax*4-8]
;                  |
;  +-------+-------+-------+--
;  |               |
;  +-------+-------+-------+--
;        0mod8   4mod8   0mod8
;
;  dest
;     [edx+eax*4-4]
;          |
;  +-------+-------+--
;  |  xxx  |       |
;  +-------+-------+--

        movq    mm0, [ebx+eax*4-8]      ; unaligned load
        psllq   mm0, mm6
        dec     eax
        psrlq   mm0, 32
        movd    [edx+eax*4], mm0        ; dst high limb (eax already decremented)

Lstart_src_aligned:
        movq    mm1, [ebx+eax*4-8]      ; src high qword
        lea     edi, [edx+eax*4]
        and     edi, 4                  ; is the dst end 8-byte aligned?
        psrlq   mm5, 32                 ; return value
        movq    mm3, [ebx+eax*4-16]     ; src second highest qword
        jz      Lstart_dst_aligned

; dst isn't aligned, subtract 4 to make it so, and pretend the shift
; is 32 bits extra.  High limb of dst (marked xxx) handled here
; separately.
;
;  source     [ebx+eax*4-8]
;                  |
;  +-------+-------+--
;  |      mm1      |
;  +-------+-------+--
;                0mod8   4mod8
;
;  dest
;     [edx+eax*4-4]
;          |
;  +-------+-------+-------+--
;  |  xxx  |               |
;  +-------+-------+-------+--
;        0mod8   4mod8   0mod8

        movq    mm0, mm1
        add     ecx, 32                 ; new shift
        psllq   mm0, mm6
        movd    mm6, ecx
        psrlq   mm0, 32

        ; wasted cycle here waiting for mm0

        movd    [edx+eax*4-4], mm0
        sub     edx, 4

Lstart_dst_aligned:
        psllq   mm1, mm6
        neg     ecx                     ; -shift
        add     ecx, 64                 ; 64-shift
        movq    mm2, mm3
        movd    mm7, ecx
        sub     eax, 8                  ; size-8
        psrlq   mm3, mm7
        por     mm3, mm1                ; mm3 ready to store
        jc      Lfinish                 ; borrow from the sub: fewer than 8 limbs left

; The comments in mpn_rshift apply here too.
;
; Loop registers:
;   eax   counter, limbs
;   ebx   src
;   ecx   (64-shift, already copied to mm7)
;   edx   dst
;   esi   (unused)
;   edi   (unused)
;
;   mm0   scratch
;   mm1   scratch
;   mm2   src qword from [ebx+eax*4+16]
;   mm3   dst qword ready to store to [edx+eax*4+24]
;
;   mm5   return value
;   mm6   lshift
;   mm7   rshift

        align   8
Lunroll_loop:
        movq    mm0, [ebx+eax*4+8]
        psllq   mm2, mm6
        movq    mm1, mm0
        psrlq   mm0, mm7
        movq    [edx+eax*4+24], mm3     ; prev
        por     mm0, mm2
        movq    mm3, [ebx+eax*4]
        psllq   mm1, mm6
        movq    [edx+eax*4+16], mm0
        movq    mm2, mm3
        psrlq   mm3, mm7
        sub     eax, 4
        por     mm3, mm1                ; mm3 ready to store
        jnc     Lunroll_loop            ; loop until the sub borrows

Lfinish:
; eax   -4 to -1 representing respectively 0 to 3 limbs remaining

        test    al, 2
        jz      Lfinish_no_two
        movq    mm0, [ebx+eax*4+8]
        psllq   mm2, mm6
        movq    mm1, mm0
        psrlq   mm0, mm7
        movq    [edx+eax*4+24], mm3     ; prev
        por     mm0, mm2
        movq    mm2, mm1
        movq    mm3, mm0
        sub     eax, 2

Lfinish_no_two:
; eax   -4 or -3 representing respectively 0 or 1 limbs remaining
;
; mm2   src prev qword, from [ebx+eax*4+16]
; mm3   dst qword, for [edx+eax*4+24]

        test    al, 1
        movd    eax, mm5                ; retval
        pop     edi
        jz      Lfinish_zero            ; tests ZF from the test above

; One extra src limb, destination was aligned.
;
;                 source                  ebx
;                 --+---------------+-------+
;                   |      mm2      |       |
;                 --+---------------+-------+
;
; dest         edx+12           edx+4     edx
; --+---------------+---------------+-------+
;   |      mm3      |               |       |
; --+---------------+---------------+-------+
;
; mm6 = shift
; mm7 = ecx = 64-shift
;
;
; One extra src limb, destination was unaligned.
;
;                 source                  ebx
;                 --+---------------+-------+
;                   |      mm2      |       |
;                 --+---------------+-------+
;
;         dest         edx+12           edx+4
;         --+---------------+---------------+
;           |      mm3      |               |
;         --+---------------+---------------+
;
; mm6 = shift+32
; mm7 = ecx = 64-(shift+32)
;
;
; In both cases there's one extra limb of src to fetch and combine
; with mm2 to make a qword at [edx+4], and in the aligned case
; there's an extra limb of dst to be formed from that extra src limb
; left shifted.

        movd    mm0, [ebx]
        psllq   mm2, mm6
        movq    [edx+12], mm3
        psllq   mm0, 32
        movq    mm1, mm0
        psrlq   mm0, mm7
        por     mm0, mm2
        psllq   mm1, mm6
        movq    [edx+4], mm0
        psrlq   mm1, 32
        and     ecx, 32                 ; bit 5 of ecx: set when ecx = 64-shift
                                        ; (aligned), clear when ecx = 32-shift
                                        ; (unaligned), for shift in 1..31
        pop     ebx
        jz      Lfinish_one_unaligned
        movd    [edx], mm1              ; aligned case only: lowest dst limb

Lfinish_one_unaligned:
        emms
        ret
; Lfinish_zero: tail of the unrolled path when no odd src limb remains.
; Reached by a jump taken after eax has been loaded with the return value
; and edi has been popped, so only ebx is still on the stack here.

Lfinish_zero:

; No extra src limbs, destination was aligned.
;
;                 source          ebx
;                 --+---------------+
;                   |      mm2      |
;                 --+---------------+
;
; dest          edx+8             edx
; --+---------------+---------------+
;   |      mm3      |               |
; --+---------------+---------------+
;
; mm6 = shift
; mm7 = ecx = 64-shift
;
;
; No extra src limbs, destination was unaligned.
;
;                 source          ebx
;                 --+---------------+
;                   |      mm2      |
;                 --+---------------+
;
;         dest          edx+8   edx+4
;         --+---------------+-------+
;           |      mm3      |       |
;         --+---------------+-------+
;
; mm6 = shift+32
; mm7 = ecx = 64-(shift+32)
;
;
; The movd for the unaligned case writes the same data to [edx+4]
; that the movq does for the aligned case.

        movq    [edx+8], mm3
        and     ecx, 32                 ; bit 5 of ecx: set when ecx = 64-shift
                                        ; (aligned), clear when ecx = 32-shift
                                        ; (unaligned), for shift in 1..31
        psllq   mm2, mm6
        jz      Lfinish_zero_unaligned
        movq    [edx], mm2              ; aligned case only: low dst qword

Lfinish_zero_unaligned:
        psrlq   mm2, 32
        pop     ebx
        movd    eax, mm5                ; retval (eax already holds this value
                                        ; on entry; reload kept as in the
                                        ; original)
        movd    [edx+4], mm2
        emms
        ret

; NOTE(review): NASM has no `end` directive; unless x86i.inc defines it,
; this line is presumably assembled as a colon-less label -- confirm
; before removing it.
end