mpir/mpn/x86/nehalem/mod_34lsub1.asm
2009-09-27 14:54:29 +00:00

167 lines
3.3 KiB
NASM

dnl Intel Pentium 4 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
dnl Copyright 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or
dnl modify it under the terms of the GNU Lesser General Public License as
dnl published by the Free Software Foundation; either version 2.1 of the
dnl License, or (at your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful,
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
dnl Lesser General Public License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
dnl not, write to the Free Software Foundation, Inc., 51 Franklin Street,
dnl Fifth Floor, Boston, MA 02110-1301, USA.
include(`../config.m4')
C Pentium4: 1.0 cycles/limb
C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
C
C Enhancements:
C
C There might be a couple of cycles to save by using plain integer code for
C more small sizes. 2 limbs measures about 20 cycles, but 3 limbs jumps to
C about 46 (inclusive of some function call overheads).
C Stack arguments of mpn_mod_34lsub1 (src, size), named by their defframe
C offsets: src at 4, size at 8.
C
C The routine uses only eax, ecx, edx and the MMX registers, so no register
C save slots are defined here.
defframe(PARAM_SIZE, 8)
defframe(PARAM_SRC,  4)
TEXT
ALIGN(16)
PROLOGUE(mpn_mod_34lsub1)
deflit(`FRAME',0)

	C Returns in eax a value congruent modulo 2^24-1 to the number formed
	C by the size limbs at src.  The result is not fully reduced: it is
	C only folded far enough to fit one 32-bit limb.
	C
	C Key identities modulo 2^24-1:
	C   2^24 == 1,  2^32 == 2^8,  2^64 == 2^16,  2^96 == 1
	C so limb i carries weight 2^0, 2^8 or 2^16 according to i mod 3.

	movl	PARAM_SRC, %edx
	movl	PARAM_SIZE, %ecx
	movl	(%edx), %eax		C eax = src[0]

	subl	$2, %ecx
	ja	L(mmx_path)		C three or more limbs
	jne	L(done)			C fewer than two: return src[0] unchanged

	C Exactly two limbs, plain integer code.
	C   src[0]        -> (src[0] >> 24) + (src[0] & 0xFFFFFF)
	C   src[1] * 2^32 -> (src[1] >> 16) + ((src[1] << 8) & 0xFFFF00)

	movl	4(%edx), %edx		C edx = src[1]

	movl	%eax, %ecx
	andl	$0x00FFFFFF, %ecx	C src[0] bits 0-23
	shrl	$24, %eax		C src[0] bits 24-31
	addl	%ecx, %eax

	movl	%edx, %ecx
	shrl	$16, %ecx		C src[1] bits 16-31, land at bit 0
	shll	$8, %edx
	andl	$0x00FFFF00, %edx	C src[1] bits 0-15, land at bit 8
	addl	%ecx, %eax
	addl	%edx, %eax

L(done):
	ret


L(mmx_path):
	C Build the two masks, then clear the three 64-bit accumulators.

	pcmpeqd	%mm6, %mm6
	pcmpeqd	%mm7, %mm7
	psrlq	$40, %mm6		C mm6 = 0x0000000000FFFFFF, low 24 bits
	psrlq	$32, %mm7		C mm7 = 0x00000000FFFFFFFF, low 32 bits

	pxor	%mm0, %mm0
	pxor	%mm1, %mm1
	pxor	%mm2, %mm2

L(loop):
	C eax	src[0], not used again until the result is stored
	C ecx	counter, starts at size-2, drops by 3 per pass
	C edx	src pointer, advances 12 bytes per pass
	C
	C mm0	sum of limbs at index 0 mod 3
	C mm1	sum of limbs at index 1 mod 3
	C mm2	sum of limbs at index 2 mod 3
	C mm3	scratch
	C mm4	scratch
	C mm5	scratch
	C mm6	24-bit mask
	C mm7	32-bit mask

	movd	(%edx), %mm3
	movd	4(%edx), %mm4
	movd	8(%edx), %mm5
	addl	$12, %edx

	paddq	%mm3, %mm0
	paddq	%mm4, %mm1
	paddq	%mm5, %mm2

	subl	$3, %ecx
	ja	L(loop)


	C Here ecx is -2, -1 or 0, meaning 0, 1 or 2 limbs are still unread.
	C After the increment: negative means none, zero means one, positive
	C means two.

	addl	$1, %ecx
	js	L(fold)			C nothing left

	C The MMX instructions do not touch EFLAGS, so the zero flag from
	C the addl above is still the one tested by the jz.
	movd	(%edx), %mm3
	paddq	%mm3, %mm0
	jz	L(fold)			C that was the last limb

	movd	4(%edx), %mm3
	paddq	%mm3, %mm1

L(fold):
	C Split every accumulator into 32-bit halves.  A high half has 2^32
	C times the weight of its low half, which modulo 2^24-1 is the
	C weight of the next accumulator in the 0, 1, 2, 0 cycle.

	movq	%mm0, %mm3
	movq	%mm1, %mm4
	movq	%mm2, %mm5
	pand	%mm7, %mm3		C low half of sum 0
	pand	%mm7, %mm4		C low half of sum 1
	pand	%mm7, %mm5		C low half of sum 2

	psrlq	$32, %mm0		C high half of sum 0
	psrlq	$32, %mm1		C high half of sum 1
	psrlq	$32, %mm2		C high half of sum 2

	paddq	%mm2, %mm3		C weight 2^0  total, 33 bits
	paddq	%mm0, %mm4		C weight 2^8  total, 33 bits
	paddq	%mm1, %mm5		C weight 2^16 total, 33 bits

	psllq	$8, %mm4		C move each total to its bit offset
	psllq	$16, %mm5
	paddq	%mm4, %mm3
	paddq	%mm5, %mm3		C grand total, at most 50 bits

	pand	%mm3, %mm6		C bits 0-23 of the grand total
	psrlq	$24, %mm3		C bits 24 and up, weight 2^24 == 1
	paddq	%mm6, %mm3

	movd	%mm3, %eax		C return value

	ASSERT(z,	C the upper dword of the folded sum must be clear
	`psrlq	$32, %mm3
	movd	%mm3, %ecx
	testl	%ecx, %ecx')

	emms				C leave the MMX state clean before returning
	ret

EPILOGUE()