; Copyright 2001 Free Software Foundation, Inc.
;
; This file is part of the GNU MP Library.
;
; The GNU MP Library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public License as
; published by the Free Software Foundation; either version 2.1 of the
; License, or (at your option) any later version.
;
; The GNU MP Library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with the GNU MP Library; see the file COPYING.LIB.  If
; not, write to the Free Software Foundation, Inc., 59 Temple Place -
; Suite 330, Boston, MA 02111-1307, USA.
;
; Translation of AT&T syntax code by Brian Gladman

%include "..\..\x86i.inc"
%define PARAM_SHIFT esp+frame+16
%define PARAM_SIZE  esp+frame+12
%define PARAM_SRC   esp+frame+8
%define PARAM_DST   esp+frame+4
%define frame       8

; Minimum 5, because the unrolled loop can't handle less.
%define UNROLL_THRESHOLD 5

section .text
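; For reference, the routine below corresponds to the usual GMP mpn_rshift
; operation.  Roughly the following C sketch, assuming 32-bit limbs and
; 1 <= shift <= 31 (illustrative only, not part of the build):
;
;     mp_limb_t mpn_rshift (mp_limb_t *dst, const mp_limb_t *src,
;                           mp_size_t size, unsigned shift)
;     {
;         mp_limb_t retval = src[0] << (32 - shift);
;         for (mp_size_t i = 0; i < size - 1; i++)
;             dst[i] = (src[i] >> shift) | (src[i + 1] << (32 - shift));
;         dst[size - 1] = src[size - 1] >> shift;
;         return retval;
;     }
;
; i.e. shift the limb vector right and return the bits shifted out of the
; low limb, left justified.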
global ___gmpn_rshift
%ifdef DLL
export ___gmpn_rshift
%endif

        align   8
___gmpn_rshift:
        push    ebx
        push    edi
        mov     eax,[PARAM_SIZE]
        mov     edx,[PARAM_DST]
        mov     ebx,[PARAM_SRC]
        mov     ecx,[PARAM_SHIFT]
        cmp     eax,UNROLL_THRESHOLD
        jae     Lunroll
        dec     eax
        mov     edi,[ebx]               ; src low limb
        jnz     Lsimple
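; Here size == 1: eax has just reached zero, so the shrd below leaves
; src[0] << (32-shift) in eax as the return value, while the shr forms
; the single destination limb.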
        shrd    eax,edi,cl
        shr     edi,cl
        mov     [edx],edi               ; dst low limb
        pop     edi                     ; risk of data cache bank clash
        pop     ebx
        ret
; eax   size-1
; ebx   src
; ecx   shift
; edx   dst
; esi
; edi
; ebp

        align   8
Lsimple:
        movd    mm5,[ebx]               ; src[0]
        lea     ebx,[ebx+eax*4]         ; &src[size-1]
        movd    mm6,ecx                 ; rshift
        lea     edx,[-4+edx+eax*4]      ; &dst[size-2]
        psllq   mm5,32
        neg     eax
; This loop is 5 or 8 cycles, with every second load unaligned and a wasted
; cycle waiting for the mm0 result to be ready.  For comparison a shrdl is 4
; cycles and would be 8 in a simple loop.  Using mmx helps the return value
; and last limb calculations too.

; eax   counter, limbs, negative
; ebx   &src[size-1]
; ecx   return value
; edx   &dst[size-2]
;
; mm0   scratch
; mm5   return value
; mm6   shift
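; Each iteration loads the qword spanning src[i] and src[i+1], shifts the
; pair right, and stores only the low dword, i.e.
; (src[i]>>shift) | (src[i+1]<<(32-shift)).  The top limb and the return
; value are formed separately after the loop.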
Lsimple_top:
        movq    mm0,[ebx+eax*4]
        inc     eax
        psrlq   mm0,mm6
        movd    [edx+eax*4],mm0
        jnz     Lsimple_top
        movd    mm0,[ebx]
        psrlq   mm5,mm6                 ; return value
        psrlq   mm0,mm6
        pop     edi
        movd    eax,mm5
        pop     ebx
        movd    [4+edx],mm0
        emms
        ret
; eax   size
; ebx   src
; ecx   shift
; edx   dst
; esi
; edi
; ebp

        align   8
Lunroll:
        movd    mm5,[ebx]               ; src[0]
        mov     edi,4
        movd    mm6,ecx                 ; rshift
        test    ebx,edi
        psllq   mm5,32
        jz      Lstart_src_aligned
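; (edi holds the constant 4, so the test ebx,edi above examines bit 2 of
; the src pointer, i.e. whether src is aligned to an 8-byte boundary for
; the movq loads used below.)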
; src isn't aligned, process low limb separately (marked xxx) and
; step src and dst by one limb, making src aligned.
;
; source                 ebx
; --+-------+-------+-------+
;                   |  xxx  |
; --+-------+-------+-------+
;           4mod8   0mod8   4mod8
;
; dest          edx
; --+-------+-------+
;   |       |  xxx  |
; --+-------+-------+

        movq    mm0,[ebx]               ; unaligned load
        psrlq   mm0,mm6
        add     ebx,4
        dec     eax
        movd    [edx],mm0
        add     edx,4
Lstart_src_aligned:
        movq    mm1,[ebx]
        test    edx,edi
        psrlq   mm5,mm6                 ; retval
        jz      Lstart_dst_aligned
; dst isn't aligned, add 4 to make it so, and pretend the shift is
; 32 bits extra.  Low limb of dst (marked xxx) handled here
; separately.
;
; source          ebx
; --+-------+-------+
;   |      mm1      |
; --+-------+-------+
;           4mod8   0mod8
;
; dest                  edx
; --+-------+-------+-------+
;                   |  xxx  |
; --+-------+-------+-------+
;           4mod8   0mod8   4mod8
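; In this fix-up ecx becomes shift+32 and is copied to mm6; the
; complementary left shift later placed in mm7 is then
; 64-(shift+32) = 32-shift, matching the diagrams above and the Lfinish
; code below.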
        movq    mm0,mm1
        add     ecx,32                  ; new shift
        psrlq   mm0,mm6
        movd    mm6,ecx
        movd    [edx],mm0
        add     edx,4
Lstart_dst_aligned:
        movq    mm3,[8+ebx]
        neg     ecx
        movq    mm2,mm3                 ; mm2 src qword
        add     ecx,64
        movd    mm7,ecx
        psrlq   mm1,mm6
        lea     ebx,[-12+ebx+eax*4]
        lea     edx,[-20+edx+eax*4]
        psllq   mm3,mm7
        sub     eax,7                   ; size-7
        por     mm3,mm1                 ; mm3 ready to store
        neg     eax                     ; -(size-7)
        jns     Lfinish
; This loop is the important bit, the rest is just support.  Careful
; instruction scheduling achieves the claimed 1.75 c/l.  The
; relevant parts of the pairing rules are:
;
; - mmx loads and stores execute only in the U pipe
; - only one mmx shift in a pair
; - wait one cycle before storing an mmx register result
; - the usual address generation interlock
;
; Two qword calculations are slightly interleaved.  The instructions
; marked "C" belong to the second qword, and the "C prev" one is for
; the second qword from the previous iteration.
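; Each pass handles two qwords, i.e. four limbs, so the claimed 1.75 c/l
; corresponds to 7 cycles for the 14 instructions of the loop body,
; i.e. near-perfect pairing.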
; eax   counter, limbs, negative
; ebx   &src[size-12]
; ecx
; edx   &dst[size-12]
; esi
; edi
;
; mm0
; mm1
; mm2   src qword from [-8+ebx+eax*4]
; mm3   dst qword ready to store to [-8+edx+eax*4]
;
; mm5   return value
; mm6   rshift
; mm7   lshift
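; Each stored qword is built as (src qword >> shift) | (next higher src
; qword << (64-shift)); mm6 supplies the right shift, mm7 the
; complementary left shift, and por merges the two halves.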
        align   8
Lunroll_loop:
        movq    mm0,[ebx+eax*4]
        psrlq   mm2,mm6
        movq    mm1,mm0
        psllq   mm0,mm7
        movq    [-8+edx+eax*4],mm3
        por     mm0,mm2

        movq    mm3,[ebx+eax*4+8]
        psrlq   mm1,mm6
        movq    [edx+eax*4],mm0
        movq    mm2,mm3
        psllq   mm3,mm7
        add     eax,4
        por     mm3,mm1
        js      Lunroll_loop

Lfinish:
; eax   0 to 3 representing respectively 3 to 0 limbs remaining
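; Bit 1 of eax says whether a further qword (two limbs) still has to be
; processed; bit 0 then distinguishes one leftover limb from none.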
        test    al,2
        jnz     Lfinish_no_two
        movq    mm0,[ebx+eax*4]
        psrlq   mm2,mm6
        movq    mm1,mm0
        psllq   mm0,mm7
        movq    [-8+edx+eax*4],mm3      ; prev
        por     mm0,mm2
        movq    mm2,mm1
        movq    mm3,mm0
        add     eax,2
Lfinish_no_two:

; eax   2 or 3 representing respectively 1 or 0 limbs remaining
;
; mm2   src prev qword, from [-8+ebx+eax*4]
; mm3   dst qword, for [-8+edx+eax*4]

        test    al,1
        pop     edi
        movd    eax,mm5                 ; retval
        jnz     Lfinish_zero
; One extra limb, destination was aligned.
;
; source  ebx
; +-------+---------------+--
; |       |      mm2      |
; +-------+---------------+--
;
; dest                    edx
; +-------+---------------+---------------+--
; |       |               |      mm3      |
; +-------+---------------+---------------+--
;
; mm6 = shift
; mm7 = ecx = 64-shift

; One extra limb, destination was unaligned.
;
; source  ebx
; +-------+---------------+--
; |       |      mm2      |
; +-------+---------------+--
;
; dest            edx
; +---------------+---------------+--
; |               |      mm3      |
; +---------------+---------------+--
;
; mm6 = shift+32
; mm7 = ecx = 64-(shift+32)

; In both cases there's one extra limb of src to fetch and combine
; with mm2 to make a qword at [8+edx], and in the aligned case
; there's a further extra limb of dst to be formed.
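; The and ecx,32 below distinguishes the two cases: ecx is 64-shift when
; dst was aligned (bit 5 set) and 32-shift when it wasn't (bit 5 clear),
; so jz takes the unaligned path.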
        movd    mm0,[8+ebx]
        psrlq   mm2,mm6
        movq    mm1,mm0
        psllq   mm0,mm7
        movq    [edx],mm3
        por     mm0,mm2
        psrlq   mm1,mm6
        and     ecx,32
        pop     ebx
        jz      Lfinish_one_unaligned

; dst was aligned, must store one extra limb
        movd    [16+edx],mm1
Lfinish_one_unaligned:

        movq    [8+edx],mm0
        emms
        ret
Lfinish_zero:

; No extra limbs, destination was aligned.
;
; source ebx
; +---------------+--
; |      mm2      |
; +---------------+--
;
; dest            edx+4
; +---------------+---------------+--
; |               |      mm3      |
; +---------------+---------------+--
;
; mm6 = shift
; mm7 = ecx = 64-shift

; No extra limbs, destination was unaligned.
;
; source ebx
; +---------------+--
; |      mm2      |
; +---------------+--
;
; dest    edx+4
; +-------+---------------+--
; |       |      mm3      |
; +-------+---------------+--
;
; mm6 = shift+32
; mm7 = 64-(shift+32)

; The movd for the unaligned case is clearly the same data as the
; movq for the aligned case, it's just a choice between whether one
; or two limbs should be written.
        movq    [4+edx],mm3
        psrlq   mm2,mm6
        movd    [12+edx],mm2
        and     ecx,32
        pop     ebx
        jz      Lfinish_zero_unaligned
        movq    [12+edx],mm2
Lfinish_zero_unaligned:
        emms
        ret

end