mpir/mpn/x86i/pentium4/mmx/rshift.asm
brgladman 48248cda46 1. longlong.h change to add MSVC intrinsics
2. longlong.h rearrangement for Intel compiler
3. MSVC additions in test  code 
4. GMP 4.2.1 bug fixes
5. Intel format assembly code
2008-05-18 22:20:43 +00:00

362 lines
9.0 KiB
NASM

; Copyright 2001 Free Software Foundation, Inc.
;
; This file is part of the GNU MP Library.
;
; The GNU MP Library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public License as
; published by the Free Software Foundation; either version 2.1 of the
; License, or (at your option) any later version.
;
; The GNU MP Library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with the GNU MP Library; see the file COPYING.LIB. If
; not, write to the Free Software Foundation, Inc., 59 Temple Place -
; Suite 330, Boston, MA 02111-1307, USA.
;
; Translation of AT&T syntax code by Brian Gladman
%include "..\..\x86i.inc"
; Stack-argument offsets.  The entry code pushes two registers (ebx and
; edi) before it reads any argument, so `frame` is the 8 bytes those pushes
; place between esp and the return address.  The macros are expanded where
; they are used, so they are only valid while exactly two pushes are live.
%define frame            8
%define PARAM_DST        esp+frame+4
%define PARAM_SRC        esp+frame+8
%define PARAM_SIZE       esp+frame+12
%define PARAM_SHIFT      esp+frame+16

; Sizes of UNROLL_THRESHOLD limbs or more take the unrolled path.  Must be
; at least 5, because the unrolled loop can't handle fewer limbs.
%define UNROLL_THRESHOLD 5

        section .text

        global  ___gmpn_rshift
%ifdef DLL
        export  ___gmpn_rshift
%endif

        align   8
;-----------------------------------------------------------------------------
; ___gmpn_rshift -- shift a multi-limb number right by a bit count.
;
; Stack arguments (see the PARAM_* defines), 32 bits each:
;       dst     where the shifted limbs are written
;       src     limbs to shift, least significant limb first
;       size    number of 32-bit limbs
;       shift   bit count
;
; Returns in eax the bits shifted out of src[0], placed at the most
; significant end of the register.
;
; Saves/restores ebx and edi.  Modifies eax, ecx, edx, flags and (for
; size > 1) mm0-mm3, mm5-mm7; those paths execute emms before returning.
;
; NOTE(review): presumably callers guarantee size >= 1 and 1 <= shift <= 31
; (the alignment test on ecx near the end relies on that range) -- confirm
; against the mpn_rshift contract.
;
; The instruction order below is kept exactly as in the original, which was
; hand-scheduled for pairing; do not reorder without re-measuring.
;-----------------------------------------------------------------------------
___gmpn_rshift:
        push    ebx
        push    edi
        mov     eax, [PARAM_SIZE]
        mov     edx, [PARAM_DST]
        mov     ebx, [PARAM_SRC]
        mov     ecx, [PARAM_SHIFT]
        cmp     eax, UNROLL_THRESHOLD
        jae     .unroll

        dec     eax                     ; eax = size-1; ZF set when size == 1
        mov     edi, [ebx]              ; src[0] (mov leaves the dec flags alone)
        jnz     .simple

        ; size == 1: plain integer shifts, no mmx needed.
        shrd    eax, edi, cl            ; eax is 0 here, so it just collects
                                        ; the bits leaving the bottom of edi
        shr     edi, cl
        mov     [edx], edi              ; dst[0]
        pop     edi                     ; original note: risk of data cache
                                        ; bank clash
        pop     ebx
        ret

;-----------------------------------------------------------------------------
; Simple loop, 2 <= size < UNROLL_THRESHOLD.
;
; On entry:
;       eax     size-1
;       ebx     src
;       ecx     shift
;       edx     dst
;-----------------------------------------------------------------------------
        align   8
.simple:
        movd    mm5, [ebx]              ; src[0], source of the return value
        lea     ebx, [ebx+eax*4]        ; ebx = &src[size-1]
        movd    mm6, ecx                ; mm6 = shift count
        lea     edx, [-4+edx+eax*4]     ; edx = &dst[size-2]
        psllq   mm5, 32                 ; src[0] into the high dword
        neg     eax                     ; eax = -(size-1), counts up to zero

        ; Original timing note: this loop is 5 or 8 cycles, with every
        ; second load unaligned and a wasted cycle waiting for the mm0
        ; result.  For comparison a shrd is 4 cycles and would be 8 in a
        ; simple loop.  Using mmx helps the return value and last limb too.
        ;
        ;       eax     negative limb counter
        ;       ebx     &src[size-1]
        ;       ecx     shift (already copied to mm6)
        ;       edx     &dst[size-2]
        ;       mm0     scratch
        ;       mm5     return value in progress
        ;       mm6     shift
.simple_top:
        movq    mm0, [ebx+eax*4]        ; two adjacent src limbs as one qword
        inc     eax
        psrlq   mm0, mm6
        movd    [edx+eax*4], mm0        ; low dword is the finished dst limb
        jnz     .simple_top             ; flags still those of the inc

        movd    mm0, [ebx]              ; top limb, nothing above to shift in
        psrlq   mm5, mm6                ; low dword of mm5 = return value
        psrlq   mm0, mm6
        pop     edi
        movd    eax, mm5
        pop     ebx
        movd    [4+edx], mm0            ; dst[size-1]
        emms
        ret

;-----------------------------------------------------------------------------
; Unrolled path, size >= UNROLL_THRESHOLD.
;
; On entry:
;       eax     size
;       ebx     src
;       ecx     shift
;       edx     dst
;-----------------------------------------------------------------------------
        align   8
.unroll:
        movd    mm5, [ebx]              ; src[0], source of the return value
        mov     edi, 4                  ; mask for the "4 mod 8" address bit
        movd    mm6, ecx                ; mm6 = shift count
        test    ebx, edi
        psllq   mm5, 32
        jz      .start_src_aligned

        ; src is at 4 mod 8.  Produce dst[0] on its own from an unaligned
        ; qword load, then step src and dst forward one limb so that every
        ; later qword load from src is 8-byte aligned.
        movq    mm0, [ebx]              ; unaligned load
        psrlq   mm0, mm6
        add     ebx, 4
        dec     eax
        movd    [edx], mm0
        add     edx, 4

.start_src_aligned:
        movq    mm1, [ebx]              ; first aligned src qword
        test    edx, edi
        psrlq   mm5, mm6                ; return value; done here while mm6
                                        ; still holds the caller's shift
        jz      .start_dst_aligned

        ; dst is at 4 mod 8.  Store its low limb separately, step dst
        ; forward one limb, and from now on shift by an extra 32 bits so
        ; that whole qwords can be stored to the (now aligned) dst.
        movq    mm0, mm1
        add     ecx, 32                 ; effective shift = shift+32
        psrlq   mm0, mm6                ; this limb still uses the old shift
        movd    mm6, ecx
        movd    [edx], mm0
        add     edx, 4

.start_dst_aligned:
        movq    mm3, [8+ebx]            ; second src qword
        neg     ecx
        movq    mm2, mm3                ; mm2 = unshifted copy for next step
        add     ecx, 64                 ; ecx = 64 - effective shift
        movd    mm7, ecx                ; mm7 = left-shift count
        psrlq   mm1, mm6
        lea     ebx, [-12+ebx+eax*4]    ; bias pointers so the negative
        lea     edx, [-20+edx+eax*4]    ; counter in eax indexes them
        psllq   mm3, mm7
        sub     eax, 7                  ; size-7
        por     mm3, mm1                ; mm3 = first dst qword, ready to store
        neg     eax                     ; -(size-7)
        jns     .finish                 ; nothing left for the 4-limb loop

        ; Main loop: four limbs (two qwords) per pass, the two qword
        ; calculations slightly interleaved.  Original notes: careful
        ; scheduling achieves the claimed 1.75 c/l, the relevant pairing
        ; rules being
        ;   - mmx loads and stores execute only in the U pipe
        ;   - only one mmx shift in a pair
        ;   - wait one cycle before storing an mmx register result
        ;   - the usual address generation interlock
        ;
        ;       eax     negative limb counter
        ;       ebx     biased src pointer
        ;       edx     biased dst pointer
        ;       mm2     src qword previously loaded from [-8+ebx+eax*4]
        ;       mm3     dst qword ready to store to [-8+edx+eax*4]
        ;       mm5     return value
        ;       mm6     right-shift count
        ;       mm7     left-shift count (64 - mm6)
        align   8
.unroll_loop:
        movq    mm0, [ebx+eax*4]
        psrlq   mm2, mm6
        movq    mm1, mm0
        psllq   mm0, mm7
        movq    [-8+edx+eax*4], mm3     ; store qword from the previous pass
        por     mm0, mm2
        movq    mm3, [ebx+eax*4+8]      ; second qword of this pass
        psrlq   mm1, mm6
        movq    [edx+eax*4], mm0
        movq    mm2, mm3
        psllq   mm3, mm7
        add     eax, 4
        por     mm3, mm1
        js      .unroll_loop

.finish:
        ; eax is 0 to 3 here, meaning 3 to 0 limbs remain respectively.
        test    al, 2
        jnz     .finish_no_two

        ; At least two limbs remain: do one more qword step.
        movq    mm0, [ebx+eax*4]
        psrlq   mm2, mm6
        movq    mm1, mm0
        psllq   mm0, mm7
        movq    [-8+edx+eax*4], mm3     ; store the pending qword
        por     mm0, mm2
        movq    mm2, mm1                ; keep the mm2/mm3 roles for the tail
        movq    mm3, mm0
        add     eax, 2

.finish_no_two:
        ; eax is 2 or 3 here, meaning 1 or 0 limbs remain respectively.
        ;
        ;       mm2     previous src qword, from [-8+ebx+eax*4]
        ;       mm3     dst qword pending for [-8+edx+eax*4]
        test    al, 1
        pop     edi                     ; pop and movd leave the flags alone
        movd    eax, mm5                ; return value
        jnz     .finish_zero

        ; One limb remains.  Fetch it, combine it with mm2 to make the
        ; qword for [8+edx], and store the pending mm3 to [edx].
        ;
        ; If dst was already aligned (mm6 = shift, ecx = 64-shift) a further
        ; single limb has to be stored at [16+edx].  If dst was stepped
        ; forward (mm6 = shift+32, ecx = 32-shift) that limb is not needed.
        movd    mm0, [8+ebx]            ; last src limb
        psrlq   mm2, mm6
        movq    mm1, mm0
        psllq   mm0, mm7
        movq    [edx], mm3
        por     mm0, mm2
        psrlq   mm1, mm6
        and     ecx, 32                 ; bit 5 of (64 - effective shift):
                                        ; set only when dst was aligned
        pop     ebx
        jz      .finish_one_unaligned

        movd    [16+edx], mm1           ; dst was aligned: one extra limb
.finish_one_unaligned:
        movq    [8+edx], mm0
        emms
        ret

.finish_zero:
        ; No limbs remain.  Store the pending mm3 to [4+edx], then the top
        ; of mm2 shifted down goes to [12+edx].
        ;
        ; If dst was already aligned (mm6 = shift) that last store is a full
        ; qword; if dst was stepped forward (mm6 = shift+32) only one limb
        ; belongs there.  The data is the same either way, so the limb is
        ; stored unconditionally and the qword store just covers it again
        ; when two limbs are wanted.
        movq    [4+edx], mm3
        psrlq   mm2, mm6
        movd    [12+edx], mm2
        and     ecx, 32                 ; same alignment test as above
        pop     ebx
        jz      .finish_zero_unaligned

        movq    [12+edx], mm2
.finish_zero_unaligned:
        emms
        ret
end