dnl Intel Pentium-II mpn_divrem_1 -- mpn by limb division. dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc. dnl dnl This file is part of the GNU MP Library. dnl dnl The GNU MP Library is free software; you can redistribute it and/or dnl modify it under the terms of the GNU Lesser General Public License as dnl published by the Free Software Foundation; either version 2.1 of the dnl License, or (at your option) any later version. dnl dnl The GNU MP Library is distributed in the hope that it will be useful, dnl but WITHOUT ANY WARRANTY; without even the implied warranty of dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU dnl Lesser General Public License for more details. dnl dnl You should have received a copy of the GNU Lesser General Public dnl License along with the GNU MP Library; see the file COPYING.LIB. If dnl not, write to the Free Software Foundation, Inc., 51 Franklin Street, dnl Fifth Floor, Boston, MA 02110-1301, USA. include(`../config.m4') C P6MMX: 25.0 cycles/limb integer part, 17.5 cycles/limb fraction part. C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize, C mp_srcptr src, mp_size_t size, C mp_limb_t divisor); C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize, C mp_srcptr src, mp_size_t size, C mp_limb_t divisor, mp_limb_t carry); C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize, C mp_srcptr src, mp_size_t size, C mp_limb_t divisor, mp_limb_t inverse, C unsigned shift); C C This code is a lightly reworked version of mpn/x86/k7/mmx/divrem_1.asm, C see that file for some comments. It's possible what's here can be improved. dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by dnl inverse method is used, rather than plain "divl"s. Minimum value 1. dnl dnl The different speeds of the integer and fraction parts means that using dnl xsize+size isn't quite right. The threshold wants to be a bit higher dnl for the integer part and a bit lower for the fraction part. (Or what's dnl really wanted is to speed up the integer part!) dnl dnl The threshold is set to make the integer part right. At 4 limbs the dnl div and mul are about the same there, but on the fractional part the dnl mul is much faster. deflit(MUL_THRESHOLD, 4) defframe(PARAM_PREINV_SHIFT, 28) dnl mpn_preinv_divrem_1 defframe(PARAM_PREINV_INVERSE, 24) dnl mpn_preinv_divrem_1 defframe(PARAM_CARRY, 24) dnl mpn_divrem_1c defframe(PARAM_DIVISOR,20) defframe(PARAM_SIZE, 16) defframe(PARAM_SRC, 12) defframe(PARAM_XSIZE, 8) defframe(PARAM_DST, 4) defframe(SAVE_EBX, -4) defframe(SAVE_ESI, -8) defframe(SAVE_EDI, -12) defframe(SAVE_EBP, -16) defframe(VAR_NORM, -20) defframe(VAR_INVERSE, -24) defframe(VAR_SRC, -28) defframe(VAR_DST, -32) defframe(VAR_DST_STOP,-36) deflit(STACK_SPACE, 36) TEXT ALIGN(16) PROLOGUE(mpn_preinv_divrem_1) deflit(`FRAME',0) movl PARAM_XSIZE, %ecx subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE) movl %esi, SAVE_ESI movl PARAM_SRC, %esi movl %ebx, SAVE_EBX movl PARAM_SIZE, %ebx movl %ebp, SAVE_EBP movl PARAM_DIVISOR, %ebp movl %edi, SAVE_EDI movl PARAM_DST, %edx movl -4(%esi,%ebx,4), %eax C src high limb xorl %edi, %edi C initial carry (if can't skip a div) C leal 8(%edx,%ecx,4), %edx C &dst[xsize+2] xor %ecx, %ecx movl %edx, VAR_DST_STOP C &dst[xsize+2] cmpl %ebp, %eax C high cmp divisor cmovc( %eax, %edi) C high is carry if high n2 leal (%ebp,%esi), %edx cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 movd %mm0, %esi sbbl $0, %ebx C q subl $4, %ecx movl %ebx, (%ecx) cmpl %eax, %ecx movl %ecx, VAR_DST jne L(integer_top) L(integer_loop_done): C ----------------------------------------------------------------------------- C C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz C q1_ff special case. This make the code a bit smaller and simpler, and C costs only 2 cycles (each). L(integer_two_left): C eax scratch C ebx scratch (nadj, q1) C ecx scratch (src, dst) C edx scratch C esi n10 C edi n2 C ebp divisor C C mm7 rshift movl %esi, %eax movl %ebp, %ebx sarl $31, %eax C -n1 movl PARAM_SRC, %ecx andl %eax, %ebx C -n1 & d negl %eax C n1 addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow addl %edi, %eax C n2+n1 mull VAR_INVERSE C m*(n2+n1) movd (%ecx), %mm0 C src low limb movl VAR_DST_STOP, %ecx C C addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag leal 1(%edi), %ebx C n2+1 movl %ebp, %eax C d adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 sbbl $0, %ebx mull %ebx C (q1+1)*d psllq $32, %mm0 psrlq %mm7, %mm0 C C subl %eax, %esi sbbl %edx, %edi C n - (q1+1)*d movl %esi, %edi C remainder -> n2 leal (%ebp,%esi), %edx cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 movd %mm0, %esi sbbl $0, %ebx C q movl %ebx, -4(%ecx) C ----------------------------------------------------------------------------- L(integer_one_left): C eax scratch C ebx scratch (nadj, q1) C ecx scratch (dst) C edx scratch C esi n10 C edi n2 C ebp divisor C C mm7 rshift movl %esi, %eax movl %ebp, %ebx sarl $31, %eax C -n1 movl VAR_DST_STOP, %ecx andl %eax, %ebx C -n1 & d negl %eax C n1 addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow addl %edi, %eax C n2+n1 mull VAR_INVERSE C m*(n2+n1) C C C addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag leal 1(%edi), %ebx C n2+1 movl %ebp, %eax C d C adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 sbbl $0, %ebx C q1 if q1+1 overflowed mull %ebx C C C C subl %eax, %esi movl PARAM_XSIZE, %eax sbbl %edx, %edi C n - (q1+1)*d movl %esi, %edi C remainder -> n2 leal (%ebp,%esi), %edx cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 sbbl $0, %ebx C q movl %ebx, -8(%ecx) subl $8, %ecx orl %eax, %eax C xsize jnz L(fraction_some) movl %edi, %eax L(fraction_done): movl VAR_NORM, %ecx L(zero_done): movl SAVE_EBP, %ebp movl SAVE_EDI, %edi movl SAVE_ESI, %esi movl SAVE_EBX, %ebx addl $STACK_SPACE, %esp shrl %cl, %eax emms ret C ----------------------------------------------------------------------------- C C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword C of q*d is simply -d and the remainder n-q*d = n10+d L(q1_ff): C eax (divisor) C ebx (q1+1 == 0) C ecx C edx C esi n10 C edi n2 C ebp divisor movl VAR_DST, %ecx movl VAR_DST_STOP, %edx subl $4, %ecx movl %ecx, VAR_DST psrlq %mm7, %mm0 leal (%ebp,%esi), %edi C n-q*d remainder -> next n2 movl $-1, (%ecx) movd %mm0, %esi C next n10 cmpl %ecx, %edx jne L(integer_top) jmp L(integer_loop_done) C ----------------------------------------------------------------------------- C C In the current implementation, the following successively dependent C micro-ops seem to exist. C C uops C mul 5 C q1+1 1 (addl) C mul 5 C sub 3 (negl/sbbl) C addback 2 (cmov) C --- C 16 C C The loop in fact runs at about 17.5 cycles. Using a sarl/andl/addl for C the addback was found to be a touch slower. ALIGN(16) L(fraction_some): C eax C ebx C ecx C edx C esi C edi carry C ebp divisor movl PARAM_DST, %esi movl VAR_DST_STOP, %ecx C &dst[xsize+2] movl %edi, %eax subl $8, %ecx C &dst[xsize] ALIGN(16) L(fraction_top): C eax n2, then scratch C ebx scratch (nadj, q1) C ecx dst, decrementing C edx scratch C esi dst stop point C edi n2 C ebp divisor mull VAR_INVERSE C m*n2 movl %ebp, %eax C d subl $4, %ecx C dst leal 1(%edi), %ebx C C C addl %edx, %ebx C 1 + high(n2<<32 + m*n2) = q1+1 mull %ebx C (q1+1)*d C C C C negl %eax C low of n - (q1+1)*d sbbl %edx, %edi C high of n - (q1+1)*d, caring only about carry leal (%ebp,%eax), %edx cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1 sbbl $0, %ebx C q movl %eax, %edi C remainder->n2 cmpl %esi, %ecx movl %ebx, (%ecx) C previous q jne L(fraction_top) jmp L(fraction_done) EPILOGUE()