mpir/mpn/x86w/pentium4/mmx/rshift.asm
brgladman 9c467c6415
2008-07-04 10:39:15 +00:00

362 lines
8.7 KiB
NASM

; Copyright 2001 Free Software Foundation, Inc.
;
; This file is part of the GNU MP Library.
;
; The GNU MP Library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public License as
; published by the Free Software Foundation; either version 2.1 of the
; License, or (at your option) any later version.
;
; The GNU MP Library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with the GNU MP Library; see the file COPYING.LIB. If
; not, write to the Free Software Foundation, Inc., 59 Temple Place -
; Suite 330, Boston, MA 02111-1307, USA.
;
; Translation of AT&T syntax code by Brian Gladman
%include "..\..\x86i.inc"

; Stack layout as seen after ___gmpn_rshift has pushed ebx and edi.
; `frame` is the number of bytes those two pushes occupy, so
; [esp+frame] is the return address and the four arguments follow it.
; %define expands lazily, so each PARAM_* picks up `frame` at the
; point of use.
%define frame            8
%define PARAM_DST        esp+frame+4
%define PARAM_SRC        esp+frame+8
%define PARAM_SIZE       esp+frame+12
%define PARAM_SHIFT      esp+frame+16

; Sizes of at least this many limbs take the unrolled loop.
; Minimum 5, because the unrolled loop can't handle less.
%define UNROLL_THRESHOLD 5

        section .text
        global  ___gmpn_rshift
%ifdef DLL
        export  ___gmpn_rshift
%endif
;-----------------------------------------------------------------------
; ___gmpn_rshift -- shift a multi-limb number right by a bit count.
;
; Arguments (32-bit, on the stack, read through the PARAM_* macros;
; the routine returns with a plain ret and does not pop them):
;   PARAM_DST    destination limb pointer
;   PARAM_SRC    source limb pointer
;   PARAM_SIZE   number of 32-bit limbs
;   PARAM_SHIFT  right-shift count in bits
;   NOTE(review): the code presumes size >= 1, 1 <= shift <= 31 and
;   4-byte aligned limb pointers -- confirm against the callers.
;
; Returns in eax the bits shifted out of src[0], left-justified in the
; limb (the low 32 bits of src[0] << (32 - shift)).
;
; ebx and edi are saved on entry and restored on every exit; esi and
; ebp are never touched.  The MMX paths use mm0-mm3 and mm5-mm7 and
; execute emms before returning.
;
; Three paths, chosen by size:
;   size == 1                      integer shrd/shr, no MMX
;   2 <= size < UNROLL_THRESHOLD   .simple, one limb per iteration
;   size >= UNROLL_THRESHOLD       .unroll, four limbs per iteration
;-----------------------------------------------------------------------
        align   8
___gmpn_rshift:
        push    ebx
        push    edi
        mov     eax, [PARAM_SIZE]
        mov     edx, [PARAM_DST]
        mov     ebx, [PARAM_SRC]
        mov     ecx, [PARAM_SHIFT]
        cmp     eax, UNROLL_THRESHOLD
        jae     .unroll
        dec     eax                     ; eax = size-1, ZF set when size == 1
        mov     edi, [ebx]              ; src low limb (mov keeps the flags)
        jnz     .simple

        ; size == 1.  eax is zero here, so shrd leaves the bits shifted
        ; out of the limb at the top of eax: that is the return value.
        shrd    eax, edi, cl
        shr     edi, cl
        mov     [edx], edi              ; dst low limb
        pop     edi                     ; risk of data cache bank clash
        pop     ebx
        ret

;-----------------------------------------------------------------------
; Small sizes.  On entry:
;   eax  size-1
;   ebx  src
;   ecx  shift
;   edx  dst
;   edi  src[0] (not read below; the saved edi is popped over it)
;-----------------------------------------------------------------------
        align   8
.simple:
        movd    mm5, [ebx]              ; src[0]
        lea     ebx, [ebx + eax*4]      ; &src[size-1]
        movd    mm6, ecx                ; rshift
        lea     edx, [edx + eax*4 - 4]  ; &dst[size-2]
        psllq   mm5, 32                 ; src[0] moved to the high dword
        neg     eax                     ; eax = -(size-1)

; Original author's timing note: this loop is 5 or 8 cycles, with every
; second load unaligned and a wasted cycle waiting for the mm0 result
; to be ready.  For comparison a shrd is 4 cycles and would be 8 in a
; simple loop.  Using mmx helps the return value and last limb
; calculations too.
;
;   eax  counter, limbs, negative
;   ebx  &src[size-1]
;   ecx  shift (already copied to mm6; not read in the loop)
;   edx  &dst[size-2]
;   mm0  scratch
;   mm5  return value (not yet shifted)
;   mm6  shift
.simple_top:
        movq    mm0, [ebx + eax*4]      ; limb pair src[i], src[i+1]
        inc     eax
        psrlq   mm0, mm6                ; low dword is now dst[i]
        movd    [edx + eax*4], mm0
        jnz     .simple_top             ; ZF comes from the inc above

        ; Top limb: movd zero-fills the high dword, so zeros shift in.
        movd    mm0, [ebx]              ; src[size-1]
        psrlq   mm5, mm6                ; return value
        psrlq   mm0, mm6
        pop     edi
        movd    eax, mm5
        pop     ebx
        movd    [edx + 4], mm0          ; dst[size-1]
        emms
        ret

;-----------------------------------------------------------------------
; Large sizes.  On entry:
;   eax  size
;   ebx  src
;   ecx  shift
;   edx  dst
;   edi  src[0] (overwritten immediately)
;-----------------------------------------------------------------------
        align   8
.unroll:
        movd    mm5, [ebx]              ; src[0]
        mov     edi, 4                  ; mask for both alignment tests
        movd    mm6, ecx                ; rshift
        test    ebx, edi
        psllq   mm5, 32
        jz      .start_src_aligned

        ; src is 4 mod 8.  Produce dst[0] on its own from one unaligned
        ; qword load, then step src and dst by one limb so that the
        ; remaining qword loads from src are aligned.
        movq    mm0, [ebx]              ; unaligned load
        psrlq   mm0, mm6
        add     ebx, 4
        dec     eax                     ; one limb fewer left to do
        movd    [edx], mm0
        add     edx, 4

.start_src_aligned:
        movq    mm1, [ebx]              ; first aligned src qword
        test    edx, edi
        psrlq   mm5, mm6                ; retval
        jz      .start_dst_aligned

        ; dst is 4 mod 8.  Store its low limb separately, add 4 to make
        ; dst aligned, and from here on pretend the shift is 32 bits
        ; extra so that each stored qword still lines up with the limbs.
        movq    mm0, mm1
        add     ecx, 32                 ; new shift
        psrlq   mm0, mm6                ; mm6 still holds the original shift
        movd    mm6, ecx                ; mm6 = shift+32
        movd    [edx], mm0
        add     edx, 4

.start_dst_aligned:
        ; Prime the pipeline: mm1 is the first src qword, mm2/mm3 the
        ; second.  ebx and edx are biased so that the loop and the
        ; finish code can index them with the negative counter in eax.
        movq    mm3, [ebx + 8]
        neg     ecx
        movq    mm2, mm3                ; mm2 src qword
        add     ecx, 64                 ; ecx = 64 - rshift
        movd    mm7, ecx                ; lshift
        psrlq   mm1, mm6
        lea     ebx, [ebx + eax*4 - 12] ; aligned src + 4*limbs - 12
        lea     edx, [edx + eax*4 - 20] ; aligned dst + 4*limbs - 20
        psllq   mm3, mm7
        sub     eax, 7                  ; limbs-7
        por     mm3, mm1                ; mm3 ready to store
        neg     eax                     ; -(limbs-7)
        jns     .finish                 ; at most 7 limbs: skip the loop

; This loop is the important bit, the rest is just support.  Per the
; original author's notes, careful instruction scheduling achieves the
; claimed 1.75 c/l, and the relevant parts of the pairing rules are:
;
;   - mmx loads and stores execute only in the U pipe
;   - only one mmx shift in a pair
;   - wait one cycle before storing an mmx register result
;   - the usual address generation interlock
;
; Two qword calculations are slightly interleaved.  Instructions tagged
; [2nd] belong to the second qword, and the one tagged [2nd, prev]
; stores the second qword from the previous iteration.
;
;   eax  counter, limbs, negative
;   ebx  biased src pointer (see the lea above)
;   ecx  64 - rshift (not read in the loop)
;   edx  biased dst pointer (see the lea above)
;   mm0  scratch, first qword
;   mm1  scratch, carries src bits from the first qword to the second
;   mm2  src qword from [ebx + eax*4 - 8]
;   mm3  dst qword ready to store to [edx + eax*4 - 8]
;   mm5  return value
;   mm6  rshift
;   mm7  lshift
        align   8
.unroll_loop:
        movq    mm0, [ebx + eax*4]
        psrlq   mm2, mm6
        movq    mm1, mm0
        psllq   mm0, mm7
        movq    [edx + eax*4 - 8], mm3  ; [2nd, prev]
        por     mm0, mm2
        movq    mm3, [ebx + eax*4 + 8]  ; [2nd]
        psrlq   mm1, mm6                ; [2nd]
        movq    [edx + eax*4], mm0
        movq    mm2, mm3                ; [2nd]
        psllq   mm3, mm7                ; [2nd]
        add     eax, 4
        por     mm3, mm1                ; [2nd]
        js      .unroll_loop            ; SF comes from the add above

.finish:
        ; eax 0 to 3 representing respectively 3 to 0 limbs remaining
        test    al, 2
        jnz     .finish_no_two

        ; Two or three limbs remain: run one more single-qword step.
        movq    mm0, [ebx + eax*4]
        psrlq   mm2, mm6
        movq    mm1, mm0
        psllq   mm0, mm7
        movq    [edx + eax*4 - 8], mm3  ; prev
        por     mm0, mm2
        movq    mm2, mm1
        movq    mm3, mm0
        add     eax, 2

.finish_no_two:
        ; eax 2 or 3 representing respectively 1 or 0 limbs remaining
        ;
        ;   mm2  src prev qword, from [ebx + eax*4 - 8]
        ;   mm3  dst qword, for [edx + eax*4 - 8]
        test    al, 1
        pop     edi                     ; pop and movd keep the flags
        movd    eax, mm5                ; retval
        jnz     .finish_zero

        ; One extra limb (eax was 2, so mm3 belongs at [edx]).
        ;
        ;   dst was aligned:    mm6 = shift,    mm7 = ecx = 64-shift
        ;   dst was unaligned:  mm6 = shift+32, mm7 = ecx = 64-(shift+32)
        ;
        ; In both cases there's one extra limb of src to fetch and
        ; combine with mm2 to make a qword at [edx + 8], and in the
        ; aligned case there's a further extra limb of dst to be formed.
        movd    mm0, [ebx + 8]
        psrlq   mm2, mm6
        movq    mm1, mm0
        psllq   mm0, mm7
        movq    [edx], mm3
        por     mm0, mm2
        psrlq   mm1, mm6
        and     ecx, 32                 ; bit 5 of the lshift: set only if dst was aligned
        pop     ebx
        jz      .finish_one_unaligned

        ; dst was aligned, must store one extra limb
        movd    [edx + 16], mm1

.finish_one_unaligned:
        movq    [edx + 8], mm0
        emms
        ret

.finish_zero:
        ; No extra limbs (eax was 3, so mm3 belongs at [edx + 4]).
        ;
        ;   dst was aligned:    mm6 = shift,    mm7 = ecx = 64-shift
        ;   dst was unaligned:  mm6 = shift+32, mm7 = 64-(shift+32)
        ;
        ; The movd for the unaligned case is clearly the same data as
        ; the movq for the aligned case, it's just a choice between
        ; whether one or two limbs should be written.
        movq    [edx + 4], mm3
        psrlq   mm2, mm6
        movd    [edx + 12], mm2
        and     ecx, 32                 ; bit 5 of the lshift: set only if dst was aligned
        pop     ebx
        jz      .finish_zero_unaligned
        movq    [edx + 12], mm2

.finish_zero_unaligned:
        emms
        ret
end