;  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
;
;  This file is part of the GNU MP Library.
;
;  The GNU MP Library is free software; you can redistribute it and/or
;  modify it under the terms of the GNU Lesser General Public License as
;  published by the Free Software Foundation; either version 2.1 of the
;  License, or (at your option) any later version.
;
;  The GNU MP Library is distributed in the hope that it will be useful,
;  but WITHOUT ANY WARRANTY; without even the implied warranty of
;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;  Lesser General Public License for more details.
;
;  You should have received a copy of the GNU Lesser General Public
;  License along with the GNU MP Library; see the file COPYING.LIB.  If
;  not, write to the Free Software Foundation, Inc., 59 Temple Place -
;  Suite 330, Boston, MA 02111-1307, USA.
;
;  Translation of AT&T syntax code by Brian Gladman

; -----------------------------------------------------------------------------
; Syntax:  NASM (Intel operand order), 32-bit x86.
; ABI:     all arguments are read from the stack at [esp+4 .. esp+16] on
;          entry; the result is returned in eax.  ebx, esi, edi and ebp are
;          saved to the local frame before being modified and restored
;          before returning.
;
; Entry points (argument names follow the PARAM_* defines below):
;   ___gmpn_mod_1        (src, size, divisor)
;   ___gmpn_mod_1c       (src, size, divisor, carry)
;   ___gmpn_preinv_mod_1 (src, size, divisor, inverse)
;
; Each computes the remainder of the `size`-limb number at `src` (limbs are
; 32 bits, least significant first) divided by the single limb `divisor`.
;
; NOTE(review): this file had been flattened onto three physical lines and
; two regions of text had been lost.  The regions bracketed below by
; "reconstructed" markers were rebuilt from the surviving code (labels
; referenced, registers live, stack slots used); verify them against a
; pristine copy of the translation before relying on them.
; -----------------------------------------------------------------------------

%include "..\x86i.inc"

        global  ___gmpn_preinv_mod_1
        global  ___gmpn_mod_1c
        global  ___gmpn_mod_1

%ifdef  DLL
        export  ___gmpn_mod_1c
        export  ___gmpn_mod_1
%endif

; Size (in limbs) at which the multiply-by-inverse loop is used instead of
; plain div instructions: one threshold for a divisor with its top bit set
; (normalized), another for any other divisor.
%define MUL_NORM_THRESHOLD      4
%define MUL_UNNORM_THRESHOLD    5
; Parenthesised so the expansion is a single term wherever it is used.
%define MUL_NORM_DELTA          (MUL_NORM_THRESHOLD - MUL_UNNORM_THRESHOLD)

; Stack arguments.  `frame` is the number of bytes esp has been lowered
; since entry, so these always address the caller's arguments.
%define PARAM_INVERSE   esp+frame+16            ; ___gmpn_preinv_mod_1 only
%define PARAM_CARRY     esp+frame+16            ; ___gmpn_mod_1c only
%define PARAM_DIVISOR   esp+frame+12
%define PARAM_SIZE      esp+frame+8
%define PARAM_SRC       esp+frame+4

; Local frame: register save slots and two variables.
%define SAVE_EBX        esp+frame-4
%define SAVE_ESI        esp+frame-8
%define SAVE_EDI        esp+frame-12
%define SAVE_EBP        esp+frame-16
%define VAR_NORM        esp+frame-20            ; left-shift applied to divisor
%define VAR_INVERSE     esp+frame-24            ; inverse of (shifted) divisor
%define STACK_SPACE     24

        section .text

; -----------------------------------------------------------------------------
; ___gmpn_preinv_mod_1 (src, size, divisor, inverse)
;
; Uses the caller-supplied inverse instead of computing one.  VAR_NORM is
; set to 0, i.e. no shift is applied to the divisor.
; NOTE(review): presumably requires size >= 1 and a divisor with its top
; bit set -- confirm against callers.
; -----------------------------------------------------------------------------

        align   16
%define frame   0
___gmpn_preinv_mod_1:
        mov     edx,[PARAM_SRC]
        sub     esp,STACK_SPACE
        FR_sesp STACK_SPACE

        mov     [SAVE_EBX],ebx
        mov     ebx,[PARAM_SIZE]
        mov     [SAVE_EBP],ebp
        mov     ebp,[PARAM_DIVISOR]
        mov     [SAVE_ESI],esi
        mov     eax,[PARAM_INVERSE]
        mov     [SAVE_EDI],edi
        mov     edi,[-4+edx+ebx*4]              ; src high limb
        mov     [VAR_NORM],dword 0
        lea     ecx,[-8+edx+ebx*4]              ; &src[size-2]

        mov     esi,edi
        sub     edi,ebp                         ; high-divisor
        cmovc   edi,esi                         ; keep high limb if that underflowed
        dec     ebx
        jnz     Lpreinv_entry                   ; more limbs: join the inverse loop
        jmp     Ldone_edi                       ; size==1: edi is the result

; -----------------------------------------------------------------------------
; ___gmpn_mod_1c (src, size, divisor, carry)
;
; As ___gmpn_mod_1 but with an initial carry-in.  Returns carry unchanged
; when size==0.
; NOTE(review): presumably requires carry < divisor (the div in the short
; path would otherwise overflow) -- confirm against callers.
; -----------------------------------------------------------------------------

        align   16
%define frame   0
___gmpn_mod_1c:
        mov     ecx,[PARAM_SIZE]
        sub     esp,STACK_SPACE
        FR_sesp STACK_SPACE

        mov     [SAVE_EBP],ebp
        mov     eax,[PARAM_DIVISOR]
        mov     [SAVE_ESI],esi
        mov     edx,[PARAM_CARRY]
        mov     esi,[PARAM_SRC]
        or      ecx,ecx
        jz      Ldone_edx                       ; result==carry if size==0

        ; eax = MUL_NORM_THRESHOLD if the divisor's top bit is set,
        ; otherwise MUL_UNNORM_THRESHOLD.
        sar     eax,31
        mov     ebp,[PARAM_DIVISOR]
        and     eax,MUL_NORM_DELTA
        add     eax,MUL_UNNORM_THRESHOLD
        cmp     ecx,eax
        jb      Ldivide_top                     ; small size: plain div loop

        ; The carry parameter pretends to be the src high limb.

        mov     [SAVE_EBX],ebx
        lea     ebx,[1+ecx]                     ; size+1
        mov     eax,edx                         ; carry
        jmp     Lmul_by_inverse_1c

; -----------------------------------------------------------------------------
; ___gmpn_mod_1 (src, size, divisor)
;
; Returns 0 when size==0.  Small sizes use a div per limb; larger sizes
; compute an inverse of the divisor once and use two multiplies per limb.
; -----------------------------------------------------------------------------

        align   16
%define frame   0
___gmpn_mod_1:
        mov     ecx,[PARAM_SIZE]
        sub     esp,STACK_SPACE
        FR_sesp STACK_SPACE
        mov     edx,0                           ; initial carry (if can't skip a div)

        mov     [SAVE_ESI],esi
        mov     eax,[PARAM_SRC]
        mov     [SAVE_EBP],ebp
        mov     ebp,[PARAM_DIVISOR]
        mov     esi,[PARAM_DIVISOR]
        or      ecx,ecx
        jz      Ldone_edx

        mov     eax,[-4+eax+ecx*4]              ; src high limb

        ; ebp = threshold selected by the divisor's top bit (see mod_1c).
        sar     ebp,31
        and     ebp,MUL_NORM_DELTA
        add     ebp,MUL_UNNORM_THRESHOLD
        cmp     eax,esi                         ; carry flag if high<divisor

; ---- begin reconstructed region (NOTE(review): verify) ----------------------

        cmovc   edx,eax                         ; src high limb as initial carry
        mov     esi,[PARAM_SRC]
        sbb     ecx,0                           ; size-1 to skip one div
        jz      Ldone_eax                       ; done if had size==1

        cmp     ecx,ebp
        mov     ebp,[PARAM_DIVISOR]
        jae     Lmul_by_inverse

;       eax     scratch (quotient)
;       ebx
;       ecx     counter,limbs,decrementing
;       edx     scratch (remainder)
;       esi     src
;       edi
;       ebp     divisor

Ldivide_top:
        mov     eax,[-4+esi+ecx*4]
        div     ebp                             ; edx = (edx:eax) mod divisor
        dec     ecx
        jnz     Ldivide_top

Ldone_edx:
        mov     eax,edx
Ldone_eax:
        ; Only esi and ebp have been modified on the paths reaching here.
        mov     esi,[SAVE_ESI]
        mov     ebp,[SAVE_EBP]
        add     esp,STACK_SPACE
        ret

;       eax     src high limb
;       ebx
;       ecx
;       edx
;       esi     src
;       edi
;       ebp     divisor

Lmul_by_inverse:
        mov     [SAVE_EBX],ebx
        mov     ebx,[PARAM_SIZE]

Lmul_by_inverse_1c:
        bsr     ecx,ebp                         ; 31-l
        mov     [SAVE_EDI],edi
        xor     ecx,31                          ; l
        mov     [VAR_NORM],ecx
        shl     ebp,cl                          ; d normalized
        mov     edi,eax                         ; src high -> n2

; ---- end reconstructed region -----------------------------------------------

        sub     eax,ebp
        cmovnc  edi,eax                         ; n2-divisor if no underflow
        mov     eax,-1
        mov     edx,-1
        sub     edx,ebp                         ; (b-d)-1 so edx:eax = b*(b-d)-1
        lea     ecx,[-8+esi+ebx*4]              ; &src[size-2]
        div     ebp                             ; floor (b*(b-d)-1) / d

Lpreinv_entry:
        mov     [VAR_INVERSE],eax

; No special scheduling of loads is necessary in this loop, out of order
; execution hides the latencies already.
;
; The way q1+1 is generated in ebx and d is moved to eax for the multiply
; seems fastest.  The obvious change to generate q1+1 in eax and then just
; multiply by ebp (as per mpn/x86/pentium/mod_1.asm in fact) runs 1 cycle
; slower, for no obvious reason.

;       eax     n10 (then scratch)
;       ebx     scratch (nadj,q1)
;       ecx     src pointer,decrementing
;       edx     scratch
;       esi     n10
;       edi     n2
;       ebp     divisor

        align   16
Linverse_top:
        mov     eax,[ecx]                       ; next src limb
        mov     esi,eax
        sar     eax,31                          ; -n1
        mov     ebx,ebp
        and     ebx,eax                         ; -n1 & d
        neg     eax                             ; n1
        add     eax,edi                         ; n2+n1
        mul     dword [VAR_INVERSE]             ; m*(n2+n1)
        add     ebx,esi                         ; nadj = n10 + (-n1 & d),ignoring overflow
        sub     ecx,4

        add     eax,ebx                         ; m*(n2+n1) + nadj,low giving carry flag
        lea     ebx,[1+edi]                     ; n2+1 (lea leaves the carry intact)
        mov     eax,ebp                         ; d
        adc     ebx,edx                         ; 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
        jz      Lq1_ff                          ; q1+1 wrapped to 0: special case
        mul     ebx                             ; (q1+1)*d

        sub     esi,eax                         ; low n - (q1+1)*d
        sbb     edi,edx                         ; high n - (q1+1)*d,0 or -1
        and     edi,ebp                         ; d if underflow
        add     edi,esi                         ; remainder with addback if necessary
        cmp     ecx,[PARAM_SRC]
        jae     Linverse_top

; edi is the remainder modulo d*2^n and now must be reduced to 0<=r<d by
; calculating r*2^n mod d*2^n and then right shifting by n.  If d was
; already normalized on entry so that n==0 then nothing is needed here.

; ---- begin reconstructed region (NOTE(review): verify) ----------------------

;       eax
;       ebx
;       ecx
;       edx
;       esi
;       edi     remainder
;       ebp     divisor (normalized)

Linverse_loop_done:
        mov     ecx,[VAR_NORM]
        mov     esi,0
        or      ecx,ecx
        jz      Ldone_edi

        ; Here use edi=n10 and esi=n2, opposite to the loop above.
        ;
        ; The q1=0xFFFFFFFF case is handled with an sbb to adjust q1+1
        ; back, rather than q1_ff special case code.

        shld    esi,edi,cl
        shl     edi,cl
        mov     eax,edi                         ; n10
        mov     ebx,ebp                         ; d
        sar     eax,31                          ; -n1
        and     ebx,eax                         ; -n1 & d
        neg     eax                             ; n1
        add     ebx,edi                         ; nadj = n10 + (-n1 & d),ignoring overflow
        add     eax,esi                         ; n2+n1
        mul     dword [VAR_INVERSE]             ; m*(n2+n1)

        add     eax,ebx                         ; m*(n2+n1) + nadj,low giving carry flag
        lea     ebx,[1+esi]                     ; n2+1
        adc     ebx,edx                         ; 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
        sbb     ebx,0                           ; back to 0xFFFFFFFF if q1+1 wrapped
        mov     eax,ebp                         ; d
        mul     ebx                             ; (q1+1)*d
        mov     ebx,[SAVE_EBX]

        sub     edi,eax                         ; low n - (q1+1)*d is remainder
        sbb     esi,edx                         ; high n - (q1+1)*d,0 or -1
        and     esi,ebp                         ; d if underflow
        mov     ebp,[SAVE_EBP]
        lea     eax,[esi+edi]                   ; remainder
        mov     esi,[SAVE_ESI]
        shr     eax,cl                          ; denorm remainder
        mov     edi,[SAVE_EDI]
        add     esp,STACK_SPACE
        ret

Ldone_edi:
        mov     ebx,[SAVE_EBX]
        mov     eax,edi
        mov     esi,[SAVE_ESI]
        mov     edi,[SAVE_EDI]
        mov     ebp,[SAVE_EBP]
        add     esp,STACK_SPACE
        ret

; Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
; of q*d is simply -d and the remainder n-q*d = n10+d.
;
;       eax     (divisor)
;       ebx     (q1+1 == 0)
;       ecx     src pointer
;       edx
;       esi     n10
;       edi     (n2)
;       ebp     divisor

Lq1_ff:
        lea     edi,[ebp+esi]                   ; n-q*d remainder -> next n2

; ---- end reconstructed region -----------------------------------------------

        cmp     ecx,[PARAM_SRC]
        jae     Linverse_top
        jmp     Linverse_loop_done

        end