362 lines
8.7 KiB
NASM
362 lines
8.7 KiB
NASM
|
|
; Copyright 2001 Free Software Foundation, Inc.
|
|
;
|
|
; This file is part of the GNU MP Library.
|
|
;
|
|
; The GNU MP Library is free software; you can redistribute it and/or
|
|
; modify it under the terms of the GNU Lesser General Public License as
|
|
; published by the Free Software Foundation; either version 2.1 of the
|
|
; License, or (at your option) any later version.
|
|
;
|
|
; The GNU MP Library is distributed in the hope that it will be useful,
|
|
; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
; Lesser General Public License for more details.
|
|
;
|
|
; You should have received a copy of the GNU Lesser General Public
|
|
; License along with the GNU MP Library; see the file COPYING.LIB. If
|
|
; not, write to the Free Software Foundation, Inc., 59 Temple Place -
|
|
; Suite 330, Boston, MA 02111-1307, USA.
|
|
;
|
|
; Translation of AT&T syntax code by Brian Gladman
|
|
|
|
%include "..\..\x86i.inc"
|
|
|
|
%define frame                   8

; Stack-relative argument slots.  "frame" is the number of bytes the
; prologue pushes (ebx and edi), so each PARAM_* steps over the saved
; registers and then the return address.  NASM expands %define macros
; where they are used, so the order of these definitions is immaterial.
%define PARAM_DST               esp+frame+4
%define PARAM_SRC               esp+frame+8
%define PARAM_SIZE              esp+frame+12
%define PARAM_SHIFT             esp+frame+16

; Sizes below this value take the simple loop.  5 is the minimum,
; because the unrolled code cannot handle fewer limbs.
%define UNROLL_THRESHOLD        5
|
|
|
|
section .text

        global  ___gmpn_rshift
%ifdef DLL
        export  ___gmpn_rshift
%endif

;-----------------------------------------------------------------------
; ___gmpn_rshift
;
; Stack arguments (read through the PARAM_* macros): dst, src, size,
; shift.  Presumably the C prototype is
;     mp_limb_t __gmpn_rshift (mp_ptr dst, mp_srcptr src,
;                              mp_size_t size, unsigned shift)
; -- confirm against gmp.h.
;
; Out:   eax = the bits shifted out of src[0], held in the high end of
;        the register.
; Saved: ebx and edi are pushed here and popped on every exit path.
; Uses:  eax, ecx, edx and, for size > 1, MMX registers (each MMX path
;        executes emms before returning).
;
; Dispatch: size >= UNROLL_THRESHOLD goes to Lunroll, size == 1 is
; handled inline below, and everything else goes to Lsimple.
;-----------------------------------------------------------------------
        align   8
___gmpn_rshift:
        push    ebx
        push    edi
        mov     eax, [PARAM_SIZE]
        mov     edx, [PARAM_DST]
        mov     ebx, [PARAM_SRC]
        mov     ecx, [PARAM_SHIFT]
        cmp     eax, UNROLL_THRESHOLD
        jae     Lunroll                 ; unsigned compare on the size

        dec     eax                     ; eax = size-1, sets ZF for the jnz
        mov     edi, [ebx]              ; src low limb (mov keeps the flags)
        jnz     Lsimple                 ; more than one limb

        ; Single limb.  eax was just decremented to zero, so shrd leaves
        ; only the bits shifted out of edi in the top of eax.
        shrd    eax, edi, cl            ; return value
        shr     edi, cl
        mov     [edx], edi              ; dst low limb
        pop     edi                     ; risk of data cache bank clash
        pop     ebx
        ret
|
|
|
|
;-----------------------------------------------------------------------
; Lsimple: sizes above 1 and below UNROLL_THRESHOLD.
;
; On entry:
;   eax  size-1 (non-zero)
;   ebx  src
;   ecx  shift
;   edx  dst
;   edi  src[0], not used on this path (restored from the stack later)
;-----------------------------------------------------------------------
        align   8
Lsimple:
        movd    mm5, [ebx]              ; src[0]
        lea     ebx, [ebx+eax*4]        ; &src[size-1]
        movd    mm6, ecx                ; rshift
        lea     edx, [edx+eax*4-4]      ; &dst[size-2]
        psllq   mm5, 32                 ; src[0] into the high dword
        neg     eax                     ; -(size-1)

; The loop below is 5 or 8 cycles, with every second load unaligned and
; a wasted cycle waiting for the mm0 result to be ready.  For comparison
; a shrdl is 4 cycles and would be 8 in a simple loop.  Using MMX helps
; the return value and last limb calculations too.
;
; In the loop:
;   eax  counter, limbs, negative, counting up to zero
;   ebx  &src[size-1]
;   ecx  shift (the loop itself uses the copy in mm6)
;   edx  &dst[size-2]
;   mm0  scratch: a qword holding two adjacent src limbs
;   mm5  src[0] << 32, becomes the return value after the loop
;   mm6  shift
Lsimple_top:
        movq    mm0, [ebx+eax*4]        ; limbs i and i+1
        inc     eax                     ; sets ZF for the jnz below
        psrlq   mm0, mm6
        movd    [edx+eax*4], mm0        ; low dword is dst[i]
        jnz     Lsimple_top             ; MMX ops leave the flags alone

        ; The top limb has nothing above it to shift in.
        movd    mm0, [ebx]              ; src[size-1]
        psrlq   mm5, mm6                ; return value
        psrlq   mm0, mm6
        pop     edi
        movd    eax, mm5
        pop     ebx
        movd    [edx+4], mm0            ; dst[size-1]
        emms
        ret
|
|
|
|
;-----------------------------------------------------------------------
; Lunroll: size >= UNROLL_THRESHOLD.  Align src and dst to 8 bytes,
; then prime mm2/mm3 for the unrolled loop.
;
; On entry:
;   eax  size
;   ebx  src
;   ecx  shift
;   edx  dst
;-----------------------------------------------------------------------
        align   8
Lunroll:
        movd    mm5, [ebx]              ; src[0]
        mov     edi, 4                  ; mask for address bit 2
        movd    mm6, ecx                ; rshift
        test    ebx, edi
        psllq   mm5, 32
        jz      Lstart_src_aligned

        ; src is 4 mod 8.  Produce the low limb of the result on its
        ; own, then step src and dst on by one limb, which leaves src
        ; 8-byte aligned.
        movq    mm0, [ebx]              ; unaligned load
        psrlq   mm0, mm6
        add     ebx, 4
        dec     eax
        movd    [edx], mm0
        add     edx, 4

Lstart_src_aligned:
        movq    mm1, [ebx]
        test    edx, edi
        psrlq   mm5, mm6                ; retval
        jz      Lstart_dst_aligned

        ; dst is 4 mod 8.  Store its low limb separately (from mm1 with
        ; the original shift), add 4 to dst to make it aligned, and from
        ; here on pretend the shift is 32 bits bigger.
        movq    mm0, mm1
        add     ecx, 32                 ; new shift
        psrlq   mm0, mm6                ; still the original shift
        movd    mm6, ecx
        movd    [edx], mm0
        add     edx, 4

Lstart_dst_aligned:
        movq    mm3, [ebx+8]
        neg     ecx
        movq    mm2, mm3                ; mm2 src qword
        add     ecx, 64                 ; ecx = 64 - (shift in mm6)
        movd    mm7, ecx                ; lshift
        psrlq   mm1, mm6
        lea     ebx, [ebx+eax*4-12]     ; bias pointers for the negative
        lea     edx, [edx+eax*4-20]     ; index used by the loop
        psllq   mm3, mm7
        sub     eax, 7                  ; size-7
        por     mm3, mm1                ; mm3 ready to store
        neg     eax                     ; -(size-7), sets SF
        jns     Lfinish                 ; not negative: skip the loop
|
|
|
|
; This loop is the important part; everything else is support.  Careful
; instruction scheduling achieves the claimed 1.75 c/l, so the
; instruction order must be preserved.  The relevant pairing rules are:
;
;   - MMX loads and stores execute only in the U pipe
;   - only one MMX shift in a pair
;   - wait one cycle before storing an MMX register result
;   - the usual address generation interlock
;
; Two qword calculations are slightly interleaved: the second qword of
; one iteration is completed by the store at the top of the next.
;
;   eax  counter, limbs, negative, stepping up by 4
;   ebx  src pointer biased by eax*4 - 12 bytes (set up in Lunroll)
;   edx  dst pointer biased by eax*4 - 20 bytes (set up in Lunroll)
;   mm0  scratch
;   mm1  scratch
;   mm2  src qword from [ebx+eax*4-8]
;   mm3  dst qword ready to store to [edx+eax*4-8]
;   mm5  return value
;   mm6  rshift
;   mm7  lshift
        align   8
Lunroll_loop:
        movq    mm0, [ebx+eax*4]
        psrlq   mm2, mm6
        movq    mm1, mm0
        psllq   mm0, mm7
        movq    [edx+eax*4-8], mm3      ; second qword, previous iteration
        por     mm0, mm2

        movq    mm3, [ebx+eax*4+8]      ; second qword
        psrlq   mm1, mm6
        movq    [edx+eax*4], mm0
        movq    mm2, mm3                ; second qword
        psllq   mm3, mm7                ; second qword
        add     eax, 4                  ; sets SF for the js below
        por     mm3, mm1                ; second qword
        js      Lunroll_loop
|
|
|
|
Lfinish:
        ; eax is 0 to 3, representing respectively 3 to 0 limbs
        ; remaining.  Bit 1 clear means at least two of them are left.
        test    al, 2
        jnz     Lfinish_no_two

        ; One more qword step, the same shape as the first half of the
        ; unrolled loop.
        movq    mm0, [ebx+eax*4]
        psrlq   mm2, mm6
        movq    mm1, mm0
        psllq   mm0, mm7
        movq    [edx+eax*4-8], mm3      ; prev
        por     mm0, mm2
        movq    mm2, mm1
        movq    mm3, mm0
        add     eax, 2

Lfinish_no_two:
        ; eax is 2 or 3, representing respectively 1 or 0 limbs
        ; remaining.
        ;
        ;   mm2  src prev qword, from [ebx+eax*4-8]
        ;   mm3  dst qword, for [edx+eax*4-8]
        test    al, 1
        pop     edi                     ; pop and movd leave the flags alone
        movd    eax, mm5                ; retval
        jnz     Lfinish_zero

        ; One extra limb (eax was 2, so the fixed displacements below
        ; stand in for the eax*4 index).
        ;
        ;   destination was aligned:    mm6 = shift
        ;                               mm7 = ecx = 64-shift
        ;   destination was unaligned:  mm6 = shift+32
        ;                               mm7 = ecx = 64-(shift+32)
        ;
        ; In both cases there is one extra limb of src to fetch and
        ; combine with mm2 to make a qword at [edx+8], and in the
        ; aligned case there is a further extra limb of dst to be
        ; formed.
        movd    mm0, [ebx+8]
        psrlq   mm2, mm6
        movq    mm1, mm0
        psllq   mm0, mm7
        movq    [edx], mm3
        por     mm0, mm2
        psrlq   mm1, mm6
        and     ecx, 32                 ; test bit 5 of the lshift count
        pop     ebx                     ; pop leaves ZF for the jz
        jz      Lfinish_one_unaligned

        ; dst was aligned, must store one extra limb
        movd    [edx+16], mm1

Lfinish_one_unaligned:
        movq    [edx+8], mm0
        emms
        ret
|
|
Lfinish_zero:
        ; No extra limbs (the index was 3, so [edx+4] below is the
        ; [edx+eax*4-8] slot that mm3 belongs in).
        ;
        ;   mm2  last src qword
        ;   mm3  dst qword ready to store at [edx+4]
        ;
        ;   destination was aligned:    mm6 = shift
        ;                               mm7 = ecx = 64-shift
        ;   destination was unaligned:  mm6 = shift+32
        ;                               mm7 = ecx = 64-(shift+32)
        ;
        ; The movd for the unaligned case is clearly the same data as
        ; the movq for the aligned case; it is just a choice of whether
        ; one or two limbs should be written.
        movq    [edx+4], mm3
        psrlq   mm2, mm6
        movd    [edx+12], mm2           ; always write one limb
        and     ecx, 32                 ; test bit 5 of the lshift count
        pop     ebx                     ; pop leaves ZF for the jz
        jz      Lfinish_zero_unaligned

        movq    [edx+12], mm2           ; dst was aligned: write two limbs

Lfinish_zero_unaligned:
        emms
        ret
|
|
|
|
end
|