mpir/mpn/x86/nehalem/mod_1.asm
dnl Intel Pentium-4 mpn_mod_1 -- mpn by limb remainder.
dnl Copyright 2001, 2002, 2003 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or
dnl modify it under the terms of the GNU Lesser General Public License as
dnl published by the Free Software Foundation; either version 2.1 of the
dnl License, or (at your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful,
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
dnl Lesser General Public License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
dnl not, write to the Free Software Foundation, Inc., 51 Franklin Street,
dnl Fifth Floor, Boston, MA 02110-1301, USA.
include(`../config.m4')
dnl P4: 31 cycles/limb.
C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
C                       mp_limb_t carry);
C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
C                             mp_limb_t inverse);
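C As far as this code is concerned, mpn_preinv_mod_1 expects the divisor
C already normalized (the shift count is taken as 0 on entry) and the
C inverse in the form computed by the mul_by_inverse setup below, namely
C floor((b*(b-d)-1)/d) with b=2^32.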
C
C An idea was tried in the mul-by-inverse code of processing the last limb
C by a jump back to the top of the loop, skipping the -4(%esi) fetch. But
C that seemed to produce slightly strange timings, like 9-limb and 10-limb
C operations running at about the same speed. The jump would be
C successively taken and not-taken, which in theory should predict fine,
C but perhaps isn't enjoyed by the chip. Duplicating the loop for the
C last limb seems to be a couple of cycles quicker in any case.
C
C Enhancements:
C
C The loop measures 31 cycles, but the dependent chain suggests it could
C be done in 30. Not sure where to start looking for the extra cycle.
dnl MUL_THRESHOLD is the size at which the multiply-by-inverse method is
dnl used rather than plain "divl"s. Minimum value 2.
dnl
dnl The inverse takes about 80-90 cycles to calculate, but after that the
dnl multiply is 31 c/l versus division at about 58 c/l.
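dnl
dnl A rough break-even estimate from those figures: the 80-90 cycle setup
dnl is recovered at 58-31 = 27 cycles per limb, so the crossover is around
dnl 85/27, a bit over 3 limbs. The threshold of 5 leaves some margin over
dnl that estimate.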
deflit(MUL_THRESHOLD, 5)
defframe(PARAM_INVERSE,16) dnl mpn_preinv_mod_1
defframe(PARAM_CARRY, 16) dnl mpn_mod_1c
defframe(PARAM_DIVISOR,12)
defframe(PARAM_SIZE, 8)
defframe(PARAM_SRC, 4)
dnl re-use parameter space
define(SAVE_ESI,`PARAM_SIZE')
define(SAVE_EBP,`PARAM_SRC')
TEXT
ALIGN(16)
PROLOGUE(mpn_preinv_mod_1)
deflit(`FRAME',0)
movl PARAM_SIZE, %ecx
movl %esi, SAVE_ESI
movl $32, %eax
movd %eax, %mm6 C l = 0, so 32-l = 32
movl PARAM_SRC, %esi
movl %ebp, SAVE_EBP
movd PARAM_DIVISOR, %mm5
pxor %mm7, %mm7 C l = 0
movd -4(%esi,%ecx,4), %mm0 C src high limb
leal -8(%esi,%ecx,4), %esi C &src[size-2]
movd PARAM_INVERSE, %mm4
subl $2, %ecx C size-2
psubq %mm5, %mm0 C high-divisor
movq %mm0, %mm2
psrlq $32, %mm0 C -1 if underflow
pand %mm5, %mm0 C divisor if underflow
paddq %mm2, %mm0 C addback if underflow
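C In C terms the above is roughly (a sketch):
C
C     n2 = src[size-1];
C     if (n2 >= divisor)
C       n2 -= divisor;      /* done branch-free via the psrlq/pand mask */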
jz L(inverse_last) C if size==2
ja L(inverse_top) C if size>2
C if size==1
movl SAVE_ESI, %esi
movd %mm0, %eax
emms
ret
EPILOGUE()
ALIGN(16)
PROLOGUE(mpn_mod_1c)
deflit(`FRAME',0)
movl PARAM_SIZE, %ecx
movl %esi, SAVE_ESI
movl PARAM_SRC, %esi
movl %ebp, SAVE_EBP
movl PARAM_CARRY, %edx
orl %ecx, %ecx
jz L(divide_done) C result==carry if size==0
movl PARAM_DIVISOR, %ebp
jmp L(start_1c)
EPILOGUE()
ALIGN(16)
PROLOGUE(mpn_mod_1)
deflit(`FRAME',0)
movl PARAM_SIZE, %ecx
movl %esi, SAVE_ESI
movl PARAM_SRC, %esi
movl %ebp, SAVE_EBP
movl PARAM_DIVISOR, %ebp
xorl %edx, %edx C result 0 if size==0
orl %ecx, %ecx
jz L(divide_done)
movl -4(%esi,%ecx,4), %eax C src high limb
leal -1(%ecx), %edx
cmpl %ebp, %eax C c if high<divisor
cmovc( %edx, %ecx) C size-1 if high<divisor
movl $0, %edx C initial carry
cmovc( %eax, %edx) C src high limb if high<divisor
orl %ecx, %ecx
jz L(divide_done) C if size==1 and high<divisor, no div needed
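C In C terms the entry above is roughly (a sketch):
C
C     carry = 0;
C     if (src[size-1] < divisor)
C       {
C         carry = src[--size];
C         if (size == 0)
C           return carry;     /* high limb is the remainder */
C       }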
L(start_1c):
C eax
C ebx
C ecx size
C edx carry
C esi src
C edi
C ebp divisor
leal -4(%esi,%ecx,4), %esi C &src[size-1]
cmpl $MUL_THRESHOLD, %ecx
jae L(mul_by_inverse)
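C The small-size loop below is schoolbook division, one divl per limb;
C in C terms roughly (a sketch):
C
C     do
C       remainder = (((uint64_t) remainder << 32) | *src--) % divisor;
C     while (--size != 0);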
L(divide_top):
C eax
C ebx
C ecx counter, limbs, decrementing
C edx remainder
C esi src, decrementing
C edi
C ebp divisor
movl (%esi), %eax
subl $4, %esi
divl %ebp
subl $1, %ecx
jnz L(divide_top)
L(divide_done):
movl SAVE_ESI, %esi
movl SAVE_EBP, %ebp
movl %edx, %eax
ret
C -----------------------------------------------------------------------------
L(mul_by_inverse):
C eax
C ebx
C ecx size
C edx carry
C esi src
C edi
C ebp divisor
bsrl %ebp, %eax C 31-l
movd %edx, %mm1 C carry
movl %ecx, %edx C size
movl $31, %ecx
C
xorl %eax, %ecx C l = leading zeros on d
addl $1, %eax C 32-l
shll %cl, %ebp C normalize d
movd %ecx, %mm7 C l
leal -1(%edx), %ecx C size-1
movd %eax, %mm6 C 32-l
movl $-1, %edx
movl $-1, %eax
C
subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1
divl %ebp C floor (b*(b-d)-1 / d)
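C The divl above forms m = floor((b*(b-d)-1)/d) for the normalized d,
C with b=2^32; in C terms (a sketch):
C
C     m = (mp_limb_t) (((((uint64_t) (uint32_t) ~d) << 32) | 0xFFFFFFFF) / d);
C
C since ~d = (b-d)-1 in 32 bits.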
movd %ebp, %mm5 C d
movd (%esi), %mm0 C src high limb
punpckldq %mm1, %mm0
psrlq %mm6, %mm0 C n2 = high (carry:srchigh << l)
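C i.e. n2 = (carry << l) | (src[size-1] >> (32-l)), the first normalized
C partial remainder (assuming carry < divisor, as the mod_1 entry
C arranges, so carry << l cannot overflow).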
C
movd %eax, %mm4 C m
C The dependent chain here consists of
C
C 2 paddd n1+n2
C 8 pmuludq m*(n1+n2)
C 2 paddq n2:nadj + m*(n1+n2)
C 2 psrlq q1
C 8 pmuludq d*q1
C 2 psubq (n-d)-q1*d
C 2 psrlq high mask
C 2 pand d masked
C 2 paddd n2+d addback
C --
C 30
C
C But it seems to run at 31 cycles, so presumably there's something else
C going on.
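C One iteration of the loop below, in C terms (a sketch; b=2^32, n10 is
C the next normalized limb, variables 32-bit except where cast):
C
C     n1   = n10 >> 31;                        /* high bit of n10 */
C     nadj = n10 + (n1 ? d : 0);               /* low 32 bits, overflow ignored */
C     q1   = (mp_limb_t) ((((uint64_t) n2 << 32) + nadj
C                          + (uint64_t) m * (n2 + n1)) >> 32);
C     r    = (((uint64_t) n2 << 32) | n10) - d - (uint64_t) q1 * d;
C     if (r >> 32)                             /* borrow: high word is -1 */
C       r += d;                                /* addback */
C     n2 = (mp_limb_t) r;                      /* remainder carries on */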
ALIGN(16)
L(inverse_top):
C eax
C ebx
C ecx counter, size-1 to 1
C edx
C esi src, decrementing
C edi
C ebp
C
C mm0 n2
C mm4 m
C mm5 d
C mm6 32-l
C mm7 l
ASSERT(b,`C n2<d
movd %mm0, %eax
movd %mm5, %edx
cmpl %edx, %eax')
movd -4(%esi), %mm1 C next src limbs
movd (%esi), %mm2
leal -4(%esi), %esi
punpckldq %mm2, %mm1
psrlq %mm6, %mm1 C n10
movq %mm1, %mm2 C n10
movq %mm1, %mm3 C n10
psrad $31, %mm1 C -n1
pand %mm5, %mm1 C -n1 & d
paddd %mm2, %mm1 C nadj = n10+(-n1&d), ignore overflow
psrld $31, %mm2 C n1
paddd %mm0, %mm2 C n2+n1
punpckldq %mm0, %mm1 C n2:nadj
pmuludq %mm4, %mm2 C m*(n2+n1)
paddq %mm2, %mm1 C n2:nadj + m*(n2+n1)
psrlq $32, %mm1 C q1 = high(n2:nadj + m*(n2+n1))
pmuludq %mm5, %mm1 C q1*d
punpckldq %mm0, %mm3 C n
psubq %mm5, %mm3 C n - d
pxor %mm0, %mm0 C zero, so the por copies the remainder
psubq %mm1, %mm3 C n - (q1+1)*d
por %mm3, %mm0 C remainder -> n2
psrlq $32, %mm3 C high n - (q1+1)*d, 0 or -1
ASSERT(be,`C 0 or -1
movd %mm3, %eax
addl $1, %eax
cmpl $1, %eax')
pand %mm5, %mm3 C mask & d
paddd %mm3, %mm0 C addback if necessary
subl $1, %ecx
jnz L(inverse_top)
C Least significant limb.
C Same code as the loop, but there's no -4(%esi) limb to fetch.
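C Here n10 is simply src[0] << l (psllq by l), since there's no lower
C limb to pair with; the loop instead took a two-limb window shifted
C right by 32-l.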
L(inverse_last):
C eax
C ebx
C ecx
C edx
C esi &src[0]
C
C mm0 n2
C mm4 m
C mm5 d
C mm6 32-l
C mm7 l
movd (%esi), %mm1 C src[0]
psllq %mm7, %mm1 C n10
movq %mm1, %mm2 C n10
movq %mm1, %mm3 C n10
psrad $31, %mm1 C -n1
pand %mm5, %mm1 C -n1 & d
paddd %mm2, %mm1 C nadj = n10+(-n1&d), ignore overflow
psrld $31, %mm2 C n1
paddd %mm0, %mm2 C n2+n1
punpckldq %mm0, %mm1 C n2:nadj
pmuludq %mm4, %mm2 C m*(n2+n1)
paddq %mm2, %mm1 C n2:nadj + m*(n2+n1)
psrlq $32, %mm1 C q1 = high(n2:nadj + m*(n2+n1))
pmuludq %mm5, %mm1 C q1*d
punpckldq %mm0, %mm3 C n
psubq %mm5, %mm3 C n - d
pxor %mm0, %mm0 C zero, so the por copies the remainder
psubq %mm1, %mm3 C n - (q1+1)*d
por %mm3, %mm0 C remainder -> n2
psrlq $32, %mm3 C high n - (q1+1)*d, 0 or -1
ASSERT(be,`C 0 or -1
movd %mm3, %eax
addl $1, %eax
cmpl $1, %eax')
movl SAVE_EBP, %ebp
pand %mm5, %mm3 C mask & d
movl SAVE_ESI, %esi
paddd %mm3, %mm0 C addback if necessary
psrld %mm7, %mm0 C denormalize remainder, >> l
movd %mm0, %eax
emms
ret
EPILOGUE()