165 lines
3.5 KiB
NASM
165 lines
3.5 KiB
NASM
dnl Intel Pentium-4 mpn_mul_basecase -- mpn by mpn multiplication.
|
|
|
|
dnl Copyright 2001, 2002 Free Software Foundation, Inc.
|
|
dnl
|
|
dnl This file is part of the GNU MP Library.
|
|
dnl
|
|
dnl The GNU MP Library is free software; you can redistribute it and/or
|
|
dnl modify it under the terms of the GNU Lesser General Public License as
|
|
dnl published by the Free Software Foundation; either version 2.1 of the
|
|
dnl License, or (at your option) any later version.
|
|
dnl
|
|
dnl The GNU MP Library is distributed in the hope that it will be useful,
|
|
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
dnl Lesser General Public License for more details.
|
|
dnl
|
|
dnl You should have received a copy of the GNU Lesser General Public
|
|
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
|
|
dnl not, write to the Free Software Foundation, Inc., 51 Franklin Street,
|
|
dnl Fifth Floor, Boston, MA 02110-1301, USA.
|
|
|
|
include(`../config.m4')
|
|
|
|
|
|
C P4: 6.0 cycles/crossproduct (approx)
|
|
|
|
|
|
C void mpn_mul_basecase (mp_ptr wp,
|
|
C mp_srcptr xp, mp_size_t xsize,
|
|
C mp_srcptr yp, mp_size_t ysize);
|
|
C
|
|
C Nothing special here, basically just mpn/generic/mul_basecase.c done with
|
|
C mpn_mul_1 and mpn_addmul_1 inline. As per mpn_addmul_1, the dependent
|
|
C chain in the inner loop is 4 c/l, but measures about 6.
|
|
C
|
|
C Enhancements:
|
|
C
|
|
C Perhaps some sort of vertical method would suit, though there'd be branch
|
|
C mispredictions on the end sections. But it's not clear how to get less
|
|
C than 4 instructions per crossproduct, and unless that can be done then a
|
|
C basic addmul_1 style may as well be used (assuming it can be brought up to
|
|
C its proper 4 c/l).
|
|
|
|
C Incoming arguments, in the order of the C prototype above: wp, xp, xsize,
C yp, ysize.  The numbers step by 4 from 4 to 20, one 32-bit slot each.
C NOTE(review): defframe comes from the included config.m4; the second
C argument is presumably the byte offset of the slot from esp at function
C entry (return address at offset 0) -- confirm against the macro.
defframe(PARAM_YSIZE, 20)
|
|
defframe(PARAM_YP, 16)
|
|
defframe(PARAM_XSIZE, 12)
|
|
defframe(PARAM_XP, 8)
|
|
defframe(PARAM_WP, 4)
|
|
|
|
C Register save slots.  Each SAVE_ name is an alias for an argument slot, so
C storing a register there destroys that argument.  The function body must
C therefore load a PARAM_ value into a register before writing the SAVE_
C name that shares its slot.  PARAM_XSIZE is the only argument slot that is
C not reused, which is why it can be re-read on every outer pass.
define(SAVE_EBX,`PARAM_XP')
|
|
define(SAVE_ESI,`PARAM_YP')
|
|
define(SAVE_EDI,`PARAM_YSIZE')
|
|
define(SAVE_EBP,`PARAM_WP')
|
|
|
|
TEXT
|
|
ALIGN(8)
|
|
C mpn_mul_basecase -- schoolbook multiplication of the xsize-limb operand
C at xp by the ysize-limb operand at yp, storing xsize+ysize limbs at wp.
C
C Limbs are 32 bits here: they are moved with movd, pointers step by 4 and
C the carry is extracted with a 32-bit right shift.  Each 32x32->64 product
C is formed with pmuludq in an MMX register.
C
C Structure:
C   L(mul1)   wp[0 .. xsize] = xp[] * yp[0]            (plain store pass)
C   L(outer)  for every further yp limb, advance wp and yp by one limb and
C   L(inner)  add xp[] * (that yp limb) into wp[], then store the carry-out
C             one limb above the part just updated.
C
C Both loop counters are decremented before they are tested, so the code as
C written expects xsize >= 1 and ysize >= 1.
C
C ebx, esi, edi and ebp are saved on entry and restored at L(done).  They
C are stored into the argument slots (the SAVE_ names alias PARAM_ names)
C instead of being pushed, so esp is never adjusted in this function.
PROLOGUE(mpn_mul_basecase)
|
|
C NOTE(review): FRAME is presumably the number of bytes pushed since entry,
C as consumed by the defframe operands; it stays 0 because nothing is
C pushed -- confirm against config.m4.
deflit(`FRAME',0)
|
|
|
|
C Prologue.  Order matters: every PARAM_ load below comes before the SAVE_
C store that overwrites the same stack slot (XP/EBX, YP/ESI, WP/EBP,
C YSIZE/EDI).
movl PARAM_XP, %eax
|
|
movl %ebx, SAVE_EBX
|
|
pxor %mm0, %mm0 C initial carry
|
|
|
|
movl PARAM_YP, %edx
|
|
movl %esi, SAVE_ESI
|
|
|
|
movl PARAM_WP, %ebx
|
|
movl %ebp, SAVE_EBP
|
|
C esi keeps the start of xp so that each outer pass can rewind eax.
movl %eax, %esi C xp
|
|
|
|
movd (%edx), %mm7 C yp[0]
|
|
|
|
movl PARAM_XSIZE, %ecx
|
|
|
|
movl PARAM_YSIZE, %ebp
|
|
movl %edi, SAVE_EDI
|
|
C edi keeps the base of the current output row; ebx walks along it.
movl %ebx, %edi C wp
|
|
|
|
C First pass: wp[i] = low 32 bits of (xp[i] * yp[0] + carry), with the high
C 32 bits becoming the next carry.  Nothing is read from wp in this pass.
L(mul1):
|
|
C eax xp, incrementing
|
|
C ebx wp, incrementing
|
|
C ecx xsize, decrementing
|
|
C edx yp
|
|
C esi xp
|
|
C edi wp
|
|
C ebp ysize
|
|
C
|
|
C mm0 carry limb
|
|
C mm7 multiplier
|
|
|
|
movd (%eax), %mm1
|
|
addl $4, %eax
|
|
pmuludq %mm7, %mm1
|
|
C product (at most (2^32-1)^2) plus a carry below 2^32 fits in 64 bits
paddq %mm1, %mm0
|
|
movd %mm0, (%ebx)
|
|
addl $4, %ebx
|
|
psrlq $32, %mm0
|
|
C this subl sets the flags tested by the jnz
subl $1, %ecx
|
|
jnz L(mul1)
|
|
|
|
C final carry of the first pass becomes wp[xsize]
movd %mm0, (%ebx)
|
|
|
|
C one yp limb consumed; finished if that was the only one
subl $1, %ebp
|
|
jz L(done)
|
|
|
|
|
|
C One iteration per remaining yp limb.  eax, ebx and ecx hold nothing live
C on entry here; they are reloaded below for the inner loop.
L(outer):
|
|
C eax
|
|
C ebx
|
|
C ecx
|
|
C edx yp, incrementing
|
|
C esi xp
|
|
C edi wp, incrementing
|
|
C ebp ysize, decrementing
|
|
|
|
movl %esi, %eax C xp
|
|
|
|
C ebx gets the advanced row base first, then edi itself is advanced by the
C same amount, so both point one limb further into wp than on the last pass.
leal 4(%edi), %ebx C next wp
|
|
addl $4, %edi
|
|
|
|
movd 4(%edx), %mm7 C next yp limb
|
|
addl $4, %edx
|
|
|
|
pxor %mm0, %mm0 C initial carry
|
|
|
|
C xsize is re-read from the stack: its slot is the one argument slot that
C is not reused as a register save slot.
movl PARAM_XSIZE, %ecx
|
|
|
|
|
|
C Accumulating pass: wp[i] = low 32 bits of (xp[i] * mm7 + wp[i] + carry),
C with the high 32 bits becoming the next carry.
L(inner):
|
|
C eax xp, incrementing
|
|
C ebx wp, incrementing
|
|
C ecx xsize, decrementing
|
|
C edx outer yp
|
|
C esi outer xp
|
|
C edi outer wp
|
|
C ebp outer ysize
|
|
|
|
movd (%eax), %mm1
|
|
leal 4(%eax), %eax
|
|
movd (%ebx),%mm2
|
|
pmuludq %mm7, %mm1
|
|
C (2^32-1)^2 + 2*(2^32-1) = 2^64-1, so adding both the existing wp limb and
C the carry to the product cannot overflow the 64-bit MMX register.
paddq %mm2, %mm1
|
|
paddq %mm1, %mm0
|
|
C The flags from this subl are consumed by the jnz below.  The instructions
C in between (movd, psrlq, leal) do not modify the integer flags, which is
C why the wp pointer is advanced with leal rather than addl.
subl $1, %ecx
|
|
movd %mm0, (%ebx)
|
|
psrlq $32, %mm0
|
|
leal 4(%ebx), %ebx
|
|
jnz L(inner)
|
|
|
|
C carry-out of this row is stored (not added) one limb above the row
movd %mm0, (%ebx)
|
|
|
|
subl $1, %ebp
|
|
jnz L(outer)
|
|
|
|
|
|
C Epilogue: reload the four registers saved in the prologue from the
C argument slots they were stored in.
L(done):
|
|
movl SAVE_EBX, %ebx
|
|
movl SAVE_ESI, %esi
|
|
movl SAVE_EDI, %edi
|
|
movl SAVE_EBP, %ebp
|
|
C leave MMX state so that the caller can use the x87 FPU again
emms
|
|
ret
|
|
|
|
EPILOGUE()
|