mpir/mpn/x86w/pentium4/mmx/rshift.asm


;  Copyright 2001 Free Software Foundation, Inc.
;
;  This file is part of the GNU MP Library.
;
;  The GNU MP Library is free software; you can redistribute it and/or
;  modify it under the terms of the GNU Lesser General Public License as
;  published by the Free Software Foundation; either version 2.1 of the
;  License, or (at your option) any later version.
;
;  The GNU MP Library is distributed in the hope that it will be useful,
;  but WITHOUT ANY WARRANTY; without even the implied warranty of
;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;  Lesser General Public License for more details.
;
;  You should have received a copy of the GNU Lesser General Public
;  License along with the GNU MP Library; see the file COPYING.LIB.  If
;  not, write to the Free Software Foundation, Inc., 59 Temple Place -
;  Suite 330, Boston, MA 02111-1307, USA.
;
; Translation of AT&T syntax code by Brian Gladman

%include "..\..\x86i.inc"

%define	PARAM_SHIFT esp+frame+16
%define PARAM_SIZE  esp+frame+12
%define PARAM_SRC   esp+frame+8
%define PARAM_DST   esp+frame+4
%define frame		8

;   Minimum 5,because the unrolled loop can't handle less.
%define	UNROLL_THRESHOLD  5

	section .text

	global	___gmpn_rshift
%ifdef	DLL
	export	___gmpn_rshift
%endif

	align   8
___gmpn_rshift:
    push    ebx
    push    edi
    mov     eax,[PARAM_SIZE]
    mov     edx,[PARAM_DST]
    mov     ebx,[PARAM_SRC]
    mov     ecx,[PARAM_SHIFT]
	cmp     eax,UNROLL_THRESHOLD
	jae     Lunroll
    dec     eax
    mov     edi,[ebx]		;  src low limb
    jnz     Lsimple
	shrd	eax,edi,cl
    shr     edi,cl
    mov     [edx],edi       ;  dst low limb
    pop     edi             ;  risk of data cache bank clash
    pop     ebx
    ret

;  eax size-1
;  ebx src
;  ecx shift
;  edx dst
;  esi
;  edi
;  ebp

	align   8
Lsimple:
    movd    mm5,[ebx]       ;  src[0]
    lea     ebx,[ebx+eax*4]  ;  &src[size-1]
    movd    mm6,ecx         ;  rshift
    lea     edx,[-4+edx+eax*4] ;  &dst[size-2]
    psllq   mm5,32
    neg     eax

;  This loop is 5 or 8 cycles,with every second load unaligned and a wasted
;  cycle waiting for the mm0 result to be ready.  For comparison a shrdl is 4
;  cycles and would be 8 in a simple loop.  Using mmx helps the return value
;  and last limb calculations too.

;  eax counter,limbs,negative
;  ebx &src[size-1]
;  ecx return value
;  edx &dst[size-2]
;
;  mm0 scratch
;  mm5 return value
;  mm6 shift

Lsimple_top:
    movq    mm0,[ebx+eax*4]
    inc     eax
    psrlq   mm0,mm6
    movd    [edx+eax*4],mm0
    jnz     Lsimple_top
    movd    mm0,[ebx]
    psrlq   mm5,mm6         ;  return value
    psrlq   mm0,mm6
    pop     edi
    movd    eax,mm5
    pop     ebx
    movd    [4+edx],mm0
    emms
    ret

;  eax size
;  ebx src
;  ecx shift
;  edx dst
;  esi
;  edi
;  ebp

	align   8
Lunroll:
    movd    mm5,[ebx]       ;  src[0]
    mov     edi,4
    movd    mm6,ecx         ;  rshift
    test    ebx,edi
    psllq   mm5,32
    jz      Lstart_src_aligned

;  src isn't aligned,process low limb separately (marked xxx) and
;  step src and dst by one limb,making src aligned.
;
;  source                  ebx
;  --+-------+-------+-------+
;            |          xxx  |
;  --+-------+-------+-------+
;          4mod8   0mod8   4mod8
;
;          dest            edx
;          --+-------+-------+
;            |       |  xxx  |
;          --+-------+-------+

    movq    mm0,[ebx]       ;  unaligned load
    psrlq   mm0,mm6
    add     ebx,4
    dec     eax
    movd    [edx],mm0
    add     edx,4
Lstart_src_aligned:
    movq    mm1,[ebx]
    test    edx,edi
    psrlq   mm5,mm6         ;  retval
    jz      Lstart_dst_aligned

;  dst isn't aligned,add 4 to make it so,and pretend the shift is
;  32 bits extra.  Low limb of dst (marked xxx) handled here
;  separately.
;
;           source          ebx
;           --+-------+-------+
;             |      mm1      |
;           --+-------+-------+
;                   4mod8   0mod8
;
;   dest                    edx
;   --+-------+-------+-------+
;                     |  xxx  |
;   --+-------+-------+-------+
;           4mod8   0mod8   4mod8

    movq    mm0,mm1
    add     ecx,32         ;  new shift
    psrlq   mm0,mm6
    movd    mm6,ecx
    movd    [edx],mm0
    add     edx,4
Lstart_dst_aligned:
    movq    mm3,[8+ebx]
    neg     ecx
    movq    mm2,mm3			;  mm2 src qword
    add     ecx,64
    movd    mm7,ecx
    psrlq   mm1,mm6
    lea     ebx,[-12+ebx+eax*4]
    lea     edx,[-20+edx+eax*4]
    psllq   mm3,mm7
    sub     eax,7			;  size-7
    por     mm3,mm1         ;  mm3 ready to store
    neg     eax             ;  -(size-7)
    jns     Lfinish

;  This loop is the important bit,the rest is just support.  Careful
;  instruction scheduling achieves the claimed 1.75 c/l.  The
;  relevant parts of the pairing rules are:
;
;  - mmx loads and stores execute only in the U pipe
;  - only one mmx shift in a pair
;  - wait one cycle before storing an mmx register result
;  - the usual address generation interlock
;
;  Two qword calculations are slightly interleaved.  The instructions
;  marked "C" belong to the second qword,and the "C prev" one is for
;  the second qword from the previous iteration.

;  eax counter,limbs,negative
;  ebx &src[size-12]
;  ecx
;  edx &dst[size-12]
;  esi
;  edi
;
;  mm0
;  mm1
;  mm2 src qword from -8(%ebx,%eax,4)
;  mm3 dst qword ready to store to -8(%edx,%eax,4)
;
;  mm5 return value
;  mm6 rshift
;  mm7 lshift

	align   8
Lunroll_loop:
    movq    mm0,[ebx+eax*4]
    psrlq   mm2,mm6
    movq    mm1,mm0
    psllq   mm0,mm7
    movq    [-8+edx+eax*4],mm3
    por     mm0,mm2

	movq    mm3,[ebx+eax*4+8]
	psrlq   mm1,mm6
    movq    [edx+eax*4],mm0
	movq    mm2,mm3
	psllq   mm3,mm7
    add     eax,4
	por     mm3,mm1
    js      Lunroll_loop

Lfinish:
;  eax 0 to 3 representing respectively 3 to 0 limbs remaining

    test    al,2
    jnz     Lfinish_no_two
    movq    mm0,[ebx+eax*4]
    psrlq   mm2,mm6
    movq    mm1,mm0
    psllq   mm0,mm7
    movq    [-8+edx+eax*4],mm3  ;  prev
    por     mm0,mm2
    movq    mm2,mm1
    movq    mm3,mm0
    add     eax,2
Lfinish_no_two:

;  eax 2 or 3 representing respectively 1 or 0 limbs remaining
;
;  mm2 src prev qword,from -8(%ebx,%eax,4)
;  mm3 dst qword,for -8(%edx,%eax,4)

    test    al,1
    pop     edi
    movd    eax,mm5  ;  retval
    jnz     Lfinish_zero

;  One extra limb,destination was aligned.
;
;  source                ebx
;  +-------+---------------+--
;  |       |      mm2      |
;  +-------+---------------+--
;
;  dest                                  edx
;  +-------+---------------+---------------+--
;  |       |               |      mm3      |
;  +-------+---------------+---------------+--
;
;  mm6 = shift
;  mm7 = ecx = 64-shift

;  One extra limb,destination was unaligned.
;
;  source                ebx
;  +-------+---------------+--
;  |       |      mm2      |
;  +-------+---------------+--
;
;  dest                          edx
;  +---------------+---------------+--
;  |               |      mm3      |
;  +---------------+---------------+--
;
;  mm6 = shift+32
;  mm7 = ecx = 64-(shift+32)

;  In both cases there's one extra limb of src to fetch and combine
;  with mm2 to make a qword at 8(%edx),and in the aligned case
;  there's a further extra limb of dst to be formed.


    movd    mm0,[8+ebx]
    psrlq   mm2,mm6
    movq    mm1,mm0
    psllq   mm0,mm7
    movq    [edx],mm3
    por     mm0,mm2
    psrlq   mm1,mm6
    and     ecx,32
    pop     ebx
    jz      Lfinish_one_unaligned

    ;  dst was aligned,must store one extra limb
    movd    [16+edx],mm1
Lfinish_one_unaligned:

    movq    [8+edx],mm0
    emms
    ret
Lfinish_zero:

;  No extra limbs,destination was aligned.
;
;  source        ebx
;  +---------------+--
;  |      mm2      |
;  +---------------+--
;
;  dest                        edx+4
;  +---------------+---------------+--
;  |               |      mm3      |
;  +---------------+---------------+--
;
;  mm6 = shift
;  mm7 = ecx = 64-shift

;  No extra limbs,destination was unaligned.
;
;  source        ebx
;  +---------------+--
;  |      mm2      |
;  +---------------+--
;
;  dest                edx+4
;  +-------+---------------+--
;  |       |      mm3      |
;  +-------+---------------+--
;
;  mm6 = shift+32
;  mm7 = 64-(shift+32)

;  The movd for the unaligned case is clearly the same data as the
;  movq for the aligned case,it's just a choice between whether one
;  or two limbs should be written.

    movq    [4+edx],mm3
    psrlq   mm2,mm6
    movd    [12+edx],mm2
    and     ecx,32
    pop     ebx
    jz      Lfinish_zero_unaligned
    movq    [12+edx],mm2
Lfinish_zero_unaligned:
    emms
    ret

	end