dnl Intel Pentium 4 mpn_mod_34lsub1 -- remainder modulo 2^24-1.

dnl Copyright 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or
dnl modify it under the terms of the GNU Lesser General Public License as
dnl published by the Free Software Foundation; either version 2.1 of the
dnl License, or (at your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful,
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
dnl Lesser General Public License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
dnl not, write to the Free Software Foundation, Inc., 51 Franklin Street,
dnl Fifth Floor, Boston, MA 02110-1301, USA.

include(`../config.m4')


C Pentium4: 1.0 cycles/limb


C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
C
C Enhancements:
C
C There might be a couple of cycles to save by using plain integer code for
C more small sizes. 2 limbs measures about 20 cycles, but 3 limbs jumps to
C about 46 (inclusive of some function call overheads).

defframe(PARAM_SIZE, 8)
defframe(PARAM_SRC, 4)

dnl re-use parameter space
dnl NOTE(review): SAVE_EBX and SAVE_ESI are defined here but are not
dnl referenced by any instruction in this routine; confirm whether they
dnl are still needed.
define(SAVE_EBX, `PARAM_SRC')
define(SAVE_ESI, `PARAM_SIZE')

C ---------------------------------------------------------------------------
C mpn_mod_34lsub1 -- fold the limbs at src into a single 32-bit value that
C is congruent to the operand modulo 2^24-1.
C
C Inputs:   PARAM_SRC   pointer to the limbs, read as 32-bit words at
C                       offsets 0, 4, 8, ...
C           PARAM_SIZE  limb count.  src[0] is loaded unconditionally, so
C                       size is assumed to be at least 1 -- TODO confirm
C                       against callers.
C Output:   eax         the folded value.  It is congruent to the operand
C                       but not necessarily fully reduced: the one limb
C                       path returns src[0] unchanged.
C Writes:   eax, ecx, edx, flags; for 3 or more limbs also mm0 to mm7.
C           emms is executed before returning from the MMX path.  The one
C           and two limb paths use integer registers only.
C
C Method:   modulo 2^24-1 we have 2^24 == 1, hence for 32-bit limbs
C           2^32 == 2^8, 2^64 == 2^16 and 2^96 == 1.  Limbs are therefore
C           accumulated in three sums according to index mod 3, and the
C           sums are combined at bit offsets 0, 8 and 16 before a final
C           fold at 24 bits.
C ---------------------------------------------------------------------------

	TEXT
	ALIGN(16)
PROLOGUE(mpn_mod_34lsub1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	PARAM_SRC, %edx
	movl	(%edx), %eax		C src[0], also the result when size is 1

	C ecx = size-2.  ja is taken for size >= 3, jne for size != 2, so
	C the fall through below is the exactly-two-limbs case.
	subl	$2, %ecx
	ja	L(three_or_more)
	jne	L(one)

	C Two limbs, plain integer code.  Instructions working on src[0]
	C and src[1] are interleaved.

	movl	4(%edx), %edx		C src[1]
	movl	%eax, %ecx

	shrl	$24, %eax		C src[0] high 8 bits, weight 2^24 == 1
	andl	$0x00FFFFFF, %ecx	C src[0] low 24 bits

	addl	%ecx, %eax
	movl	%edx, %ecx

	shll	$8, %edx
	shrl	$16, %ecx		C src[1] high 16 bits, weight 2^48 == 1

	addl	%ecx, %eax
	andl	$0x00FFFF00, %edx	C src[1] low 16 bits, weight 2^32 == 2^8

	addl	%edx, %eax

L(one):
	ret


L(three_or_more):
	C Clear the three accumulators and build the two masks without
	C loading constants from memory.

	pxor	%mm0, %mm0
	pxor	%mm1, %mm1
	pxor	%mm2, %mm2

	pcmpeqd	%mm7, %mm7
	psrlq	$32, %mm7	C 0x00000000FFFFFFFF, low 32 bits

	pcmpeqd	%mm6, %mm6
	psrlq	$40, %mm6	C 0x0000000000FFFFFF, low 24 bits

L(top):
	C eax
	C ebx
	C ecx	counter, size-2 to 0, -1 or -2
	C edx	src, incrementing
	C
	C mm0	sum 0mod3
	C mm1	sum 1mod3
	C mm2	sum 2mod3
	C mm3
	C mm4
	C mm5
	C mm6	0x0000000000FFFFFF
	C mm7	0x00000000FFFFFFFF

	C Three limbs per iteration.  movd zero extends each 32-bit limb
	C and paddq adds it into a 64-bit accumulator.

	movd	(%edx), %mm3
	paddq	%mm3, %mm0

	movd	4(%edx), %mm3
	paddq	%mm3, %mm1

	movd	8(%edx), %mm3
	paddq	%mm3, %mm2

	addl	$12, %edx
	subl	$3, %ecx
	ja	L(top)		C loop while the counter was above 3 (unsigned)


	C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively

	addl	$1, %ecx
	js	L(combine)		C 0 more

	movd	(%edx), %mm3
	paddq	%mm3, %mm0

	C movd and paddq do not modify the flags, so ZF tested here is still
	C the one produced by the addl above.
	jz	L(combine)		C 1 more

	movd	4(%edx), %mm3
	paddq	%mm3, %mm1

L(combine):
	C Split each 64-bit sum into 32-bit halves.  The high half of a sum
	C carries 2^32 times the weight of its low half, so it is added to
	C the low half of the next residue class: mm0 high joins mm1 low,
	C mm1 high joins mm2 low, and mm2 high (weight 2^96 == 1) joins
	C mm0 low.

	movq	%mm7, %mm3		C low halves
	pand	%mm0, %mm3

	movq	%mm7, %mm4
	pand	%mm1, %mm4

	movq	%mm7, %mm5
	pand	%mm2, %mm5

	psrlq	$32, %mm0		C high halves
	psrlq	$32, %mm1
	psrlq	$32, %mm2

	paddq	%mm0, %mm4		C fold high halves to give 33 bits each
	paddq	%mm1, %mm5
	paddq	%mm2, %mm3

	psllq	$8, %mm4		C combine at respective offsets
	psllq	$16, %mm5
	paddq	%mm4, %mm3
	paddq	%mm5, %mm3		C 0x000cxxxxxxxxxxxx, 50 bits

	pand	%mm3, %mm6		C fold at 24 bits
	psrlq	$24, %mm3

	paddq	%mm6, %mm3
	movd	%mm3, %eax		C return value, low 32 bits of the fold

	C NOTE(review): ASSERT is defined outside this file; presumably it
	C emits the quoted instructions only in assertion-enabled builds and
	C checks that the z condition holds afterwards -- confirm.
	ASSERT(z,	C nothing left in high dword
	`psrlq	$32, %mm3
	movd	%mm3, %ecx
	orl	%ecx, %ecx')

	emms				C leave MMX state before returning
	ret

EPILOGUE()