; Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
;
; This file is part of the GNU MP Library.
;
; The GNU MP Library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public License as
; published by the Free Software Foundation; either version 2.1 of the
; License, or (at your option) any later version.
;
; The GNU MP Library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with the GNU MP Library; see the file COPYING.LIB. If
; not, write to the Free Software Foundation, Inc., 59 Temple Place -
; Suite 330, Boston, MA 02111-1307, USA.
;
; Translation of AT&T syntax code by Brian Gladman

;-----------------------------------------------------------------------
; Syntax:  NASM, Intel operand order (op dst,src).
; Target:  32-bit x86; the code uses cmov and MMX (movd/psllq/psrlq/emms).
; ABI:     all arguments are read from the stack at esp+4 .. esp+28 and
;          the result is returned in eax.  ebx, esi, edi and ebp are
;          saved in the local frame on entry and reloaded before ret.
;
; NOTE(review): this copy of the file is incomplete - see the marker
; inside ___gmpn_preinv_divrem_1 below.  The symbols ___gmpn_divrem_1c,
; ___gmpn_divrem_1 and Linteger_top are referenced here but their
; definitions are not present in this text.
;-----------------------------------------------------------------------

%include "..\..\x86i.inc"

        global  ___gmpn_preinv_divrem_1
        global  ___gmpn_divrem_1c
        global  ___gmpn_divrem_1

%ifdef DLL
        export  ___gmpn_divrem_1c
        export  ___gmpn_divrem_1
%endif

%define MUL_THRESHOLD           4

; Incoming stack arguments.  Offset +24 is shared: it is the inverse for
; the preinv entry point and the carry for the 1c entry point.
%define PARAM_PREINV_SHIFT      esp+frame+28
%define PARAM_PREINV_INVERSE    esp+frame+24
%define PARAM_CARRY             esp+frame+24
%define PARAM_DIVISOR           esp+frame+20
%define PARAM_SIZE              esp+frame+16
%define PARAM_SRC               esp+frame+12
%define PARAM_XSIZE             esp+frame+8
%define PARAM_DST               esp+frame+4

; Local frame: four register save slots followed by five variables.
%define SAVE_EBX                esp+frame-4
%define SAVE_ESI                esp+frame-8
%define SAVE_EDI                esp+frame-12
%define SAVE_EBP                esp+frame-16

%define VAR_NORM                esp+frame-20
%define VAR_INVERSE             esp+frame-24
%define VAR_SRC                 esp+frame-28
%define VAR_DST                 esp+frame-32
%define VAR_DST_STOP            esp+frame-36

%define STACK_SPACE             36      ; bytes covered by the slots above

; frame starts at 0; FR_sesp (from x86i.inc) is presumably what updates
; it after esp is lowered - confirm against x86i.inc.
%define frame                   0

        section .text

        align   16

;-----------------------------------------------------------------------
; ___gmpn_preinv_divrem_1
;
; Stack arguments on entry (offsets from esp, from the PARAM_* defines):
;    +4  dst       destination pointer
;    +8  xsize     extra (fraction) quotient limb count; dst stop
;                  address is computed as &dst[xsize+2]
;   +12  src       source pointer
;   +16  size      source limb count
;   +20  divisor
;   +24  inverse   PARAM_PREINV_INVERSE
;   +28  shift     PARAM_PREINV_SHIFT
;
; Return: eax, loaded from n2 (the running remainder) and shifted right
;         by the count held in VAR_NORM (see Lfraction_done/Lzero_done).
;-----------------------------------------------------------------------
___gmpn_preinv_divrem_1:
        mov     ecx,[PARAM_XSIZE]
        sub     esp,STACK_SPACE
        FR_sesp STACK_SPACE

        mov     [SAVE_ESI],esi
        mov     esi,[PARAM_SRC]

        mov     [SAVE_EBX],ebx
        mov     ebx,[PARAM_SIZE]

        mov     [SAVE_EBP],ebp
        mov     ebp,[PARAM_DIVISOR]

        mov     [SAVE_EDI],edi
        mov     edx,[PARAM_DST]

        mov     eax,[-4+esi+ebx*4]      ; src high limb
        xor     edi,edi                 ; initial carry (if can't skip a div)

        lea     edx,[8+edx+ecx*4]       ; &dst[xsize+2]
        xor     ecx,ecx

        mov     [VAR_DST_STOP],edx      ; &dst[xsize+2]
        cmp     eax,ebp                 ; high cmp divisor

        ; The carry from the cmp above stays live through the cmovs, the
        ; store and into the sbb (none of mov/cmov alter the flags).
        cmovc   edi,eax                 ; carry set: high limb becomes the carry-in
        cmovnc  ecx,eax                 ; carry clear: keep src high limb, else 0
                                        ; (the latter in case src==dst)

        mov     [-12+edx+ebx*4],ecx     ; dst high limb

        sbb     ebx,0                   ; skip one division if high is below divisor

; ======================================================================
; NOTE(review): SOURCE TEXT IS MISSING AT THIS POINT.
;
; The comment on the sbb above was cut off after the word "high" and the
; text resumes in the middle of a different comment (the stray "n2"
; below).  Everything that stood in between has been lost.  Judging by
; what the surviving code references but never defines or initialises,
; the lost region held at least:
;   - the rest of ___gmpn_preinv_divrem_1,
;   - the entry points ___gmpn_divrem_1c and ___gmpn_divrem_1 (declared
;     global/export above),
;   - the code that stores VAR_NORM, VAR_INVERSE and VAR_DST and loads
;     mm7 (all of which are only ever read below),
;   - the label Linteger_top and the start of its loop body.
; Restore this region from the original source before assembling; it is
; deliberately not reconstructed here.
; ======================================================================

        ; Tail of the lost region.  It ends with a backward jne to
        ; Linteger_top, so it is presumably the end of that loop's body -
        ; confirm once the missing text is restored.
                                        ; ... n2   (end of a truncated comment)
        lea     edx,[ebp+esi]
        cmovc   edi,edx
        movd    esi,mm0
        sbb     ebx,0                   ; q
        sub     ecx,4

        mov     [ecx],ebx
        cmp     ecx,eax

        mov     [VAR_DST],ecx
        jne     Linteger_top

Linteger_loop_done:

; Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
; q1_ff special case.  This makes the code a bit smaller and simpler, and
; costs only 2 cycles (each).

; eax   scratch
; ebx   scratch (nadj,q1)
; ecx   scratch (src,dst)
; edx   scratch
; esi   n10
; edi   n2
; ebp   divisor
;
; mm7   rshift

Linteger_two_left:
        mov     eax,esi
        mov     ebx,ebp

        sar     eax,31                  ; -n1
        mov     ecx,[PARAM_SRC]

        and     ebx,eax                 ; -n1 & d
        neg     eax                     ; n1

        add     ebx,esi                 ; nadj = n10 + (-n1 & d), ignoring overflow
        add     eax,edi                 ; n2+n1

        mul     dword [VAR_INVERSE]     ; m*(n2+n1)

        movd    mm0,[ecx]               ; src low limb

        mov     ecx,[VAR_DST_STOP]

        add     eax,ebx                 ; m*(n2+n1) + nadj, low giving carry flag
        lea     ebx,[1+edi]             ; n2+1 (lea leaves the carry untouched)
        mov     eax,ebp                 ; d

        adc     ebx,edx                 ; 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1

        sbb     ebx,0                   ; back to q1 if the adc carried out

        mul     ebx                     ; (q1+1)*d

        psllq   mm0,32

        psrlq   mm0,mm7

        sub     esi,eax

        sbb     edi,edx                 ; n - (q1+1)*d

        mov     edi,esi                 ; remainder -> n2
        lea     edx,[ebp+esi]           ; remainder + d, used if the subtract borrowed

        cmovc   edi,edx                 ; add d back on borrow
        movd    esi,mm0

        sbb     ebx,0                   ; q

        mov     [-4+ecx],ebx

; eax   scratch
; ebx   scratch (nadj,q1)
; ecx   scratch (dst)
; edx   scratch
; esi   n10
; edi   n2
; ebp   divisor
;
; mm7   rshift

Linteger_one_left:
        mov     eax,esi
        mov     ebx,ebp

        sar     eax,31                  ; -n1
        mov     ecx,[VAR_DST_STOP]

        and     ebx,eax                 ; -n1 & d
        neg     eax                     ; n1

        add     ebx,esi                 ; nadj = n10 + (-n1 & d), ignoring overflow
        add     eax,edi                 ; n2+n1

        mul     dword [VAR_INVERSE]     ; m*(n2+n1)

        add     eax,ebx                 ; m*(n2+n1) + nadj, low giving carry flag
        lea     ebx,[1+edi]             ; n2+1 (lea leaves the carry untouched)
        mov     eax,ebp                 ; d

        adc     ebx,edx                 ; 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1

        sbb     ebx,0                   ; q1 if q1+1 overflowed

        mul     ebx

        sub     esi,eax
        mov     eax,[PARAM_XSIZE]       ; mov does not disturb the borrow from sub

        sbb     edi,edx                 ; n - (q1+1)*d

        mov     edi,esi                 ; remainder -> n2
        lea     edx,[ebp+esi]           ; remainder + d, used if the subtract borrowed

        cmovc   edi,edx                 ; add d back on borrow

        sbb     ebx,0                   ; q

        mov     [-8+ecx],ebx
        sub     ecx,8

        or      eax,eax                 ; xsize
        jnz     Lfraction_some          ; fraction limbs requested

        mov     eax,edi                 ; no fraction part: return value comes from n2

Lfraction_done:
        mov     ecx,[VAR_NORM]          ; shift count for the final shr
Lzero_done:
        ; Common exit: reload the saved registers, drop the frame, shift
        ; eax right by cl, clear MMX state and return.
        mov     ebp,[SAVE_EBP]

        mov     edi,[SAVE_EDI]
        mov     esi,[SAVE_ESI]

        mov     ebx,[SAVE_EBX]
        add     esp,STACK_SPACE

        shr     eax,cl
        emms

        ret

; Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
; of q*d is simply -d and the remainder n-q*d = n10+d
;
; eax   (divisor)
; ebx   (q1+1 == 0)
; ecx
; edx
; esi   n10
; edi   n2
; ebp   divisor

Lq1_ff:
        mov     ecx,[VAR_DST]
        mov     edx,[VAR_DST_STOP]
        sub     ecx,4

        mov     [VAR_DST],ecx
        psrlq   mm0,mm7
        lea     edi,[ebp+esi]           ; n-q*d remainder -> next n2

        mov     [ecx],dword -1          ; store q = 0xFFFFFFFF
        movd    esi,mm0                 ; next n10

        cmp     edx,ecx
        jne     Linteger_top

        jmp     Linteger_loop_done

;
; In the current implementation, the following successively dependent
; micro-ops seem to exist.
;
;                    uops
;       mul            5
;       q1+1           1   (addl)
;       mul            5
;       sub            3   (negl/sbbl)
;       addback        2   (cmov)
;                     ---
;                     16
;
; The loop in fact runs at about 17.5 cycles.  Using a sarl/andl/addl for
; the addback was found to be a touch slower.

; eax
; ebx
; ecx
; edx
; esi
; edi   carry
; ebp   divisor

        align   16
Lfraction_some:
        mov     esi,[PARAM_DST]
        mov     ecx,[VAR_DST_STOP]      ; &dst[xsize+2]
        mov     eax,edi

        sub     ecx,8                   ; &dst[xsize]

; eax   n2, then scratch
; ebx   scratch (nadj,q1)
; ecx   dst, decrementing
; edx   scratch
; esi   dst stop point
; edi   n2
; ebp   divisor

        align   16
Lfraction_top:
        mul     dword [VAR_INVERSE]     ; m*n2

        mov     eax,ebp                 ; d
        sub     ecx,4                   ; dst
        lea     ebx,[edi+1]

        add     ebx,edx                 ; 1 + high(n2<<32 + m*n2) = q1+1

        mul     ebx                     ; (q1+1)*d

        neg     eax                     ; low of n - (q1+1)*d

        sbb     edi,edx                 ; high of n - (q1+1)*d, caring only about carry
        lea     edx,[ebp+eax]           ; low + d, used if the subtract borrowed

        cmovc   eax,edx                 ; add d back on borrow

        sbb     ebx,0                   ; q
        mov     edi,eax                 ; remainder->n2
        cmp     ecx,esi

        mov     [ecx],ebx               ; previous q
        jne     Lfraction_top           ; flags are from the cmp; mov leaves them alone

        jmp     Lfraction_done

        end