310 lines
8.3 KiB
NASM
310 lines
8.3 KiB
NASM
|
|
; Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
|
|
;
|
|
; This file is part of the GNU MP Library.
|
|
;
|
|
; The GNU MP Library is free software; you can redistribute it and/or
|
|
; modify it under the terms of the GNU Lesser General Public License as
|
|
; published by the Free Software Foundation; either version 2.1 of the
|
|
; License, or (at your option) any later version.
|
|
;
|
|
; The GNU MP Library is distributed in the hope that it will be useful,
|
|
; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
; Lesser General Public License for more details.
|
|
;
|
|
; You should have received a copy of the GNU Lesser General Public
|
|
; License along with the GNU MP Library; see the file COPYING.LIB. If
|
|
; not, write to the Free Software Foundation, Inc., 59 Temple Place -
|
|
; Suite 330, Boston, MA 02111-1307, USA.
|
|
;
|
|
; Translation of AT&T syntax code by Brian Gladman
|
|
|
|
%include "..\x86i.inc"
|
|
|
|
; Entry points defined in this file, made visible to the linker.
        global  ___gmpn_preinv_mod_1
        global  ___gmpn_mod_1c
        global  ___gmpn_mod_1

; When DLL is defined the entry points below are also marked for export.
; NOTE(review): ___gmpn_preinv_mod_1 is declared global above but is not
; listed here - confirm that leaving it out of the DLL exports is intended.
%ifdef DLL
        export  ___gmpn_mod_1c
        export  ___gmpn_mod_1
%endif
|
|
|
|
; Limb-count thresholds used by ___gmpn_mod_1 and ___gmpn_mod_1c to choose
; between the plain `div` loop (count below the threshold) and the
; multiply-by-inverse loop (count at or above it).  The NORM value applies
; when the divisor has bit 31 set, the UNNORM value otherwise.
%define MUL_NORM_THRESHOLD      4
%define MUL_UNNORM_THRESHOLD    5

; NORM minus UNNORM.  The callers compute
;     threshold = ((divisor >> 31, arithmetic) & MUL_NORM_DELTA) + MUL_UNNORM_THRESHOLD
; The expression is parenthesised so that the macro still expands to a
; single value if it is ever used next to a higher-precedence operator.
%define MUL_NORM_DELTA          (MUL_NORM_THRESHOLD - MUL_UNNORM_THRESHOLD)

; Stack arguments, addressed relative to esp.  `frame` is defined as 0 at
; each entry point and FR_sesp is invoked after each `sub esp`.
; NOTE(review): FR_sesp comes from x86i.inc and presumably adds its operand
; to `frame` so these names keep resolving to the same slots - confirm.
; PARAM_INVERSE and PARAM_CARRY name the same fourth-argument slot: it is
; read as INVERSE by ___gmpn_preinv_mod_1 and as CARRY by ___gmpn_mod_1c.
%define PARAM_INVERSE           esp+frame+16
%define PARAM_CARRY             esp+frame+16
%define PARAM_DIVISOR           esp+frame+12
%define PARAM_SIZE              esp+frame+8
%define PARAM_SRC               esp+frame+4

; Save slots for the registers this file overwrites, located just below
; the return address inside the STACK_SPACE area.
%define SAVE_EBX                esp+frame-4
%define SAVE_ESI                esp+frame-8
%define SAVE_EDI                esp+frame-12
%define SAVE_EBP                esp+frame-16

; Locals: the normalisation shift count and the divisor inverse.
%define VAR_NORM                esp+frame-20
%define VAR_INVERSE             esp+frame-24

; Bytes reserved by each entry point: the six dword slots above.
%define STACK_SPACE             24
|
|
|
|
section .text

;-----------------------------------------------------------------------
; ___gmpn_preinv_mod_1
;
; Stack arguments (read through the PARAM_* defines):
;   src, size, divisor, inverse
; Result: eax = remainder.  The plain `ret` in the shared tail pops no
; arguments.
;
; This entry takes the inverse from the caller instead of computing it
; and records a normalisation shift of 0, so the shared tail returns the
; loop remainder without a final shift.
; NOTE(review): that presumably requires a divisor with bit 31 set and
; a matching inverse - confirm against the callers.
; There is no size==0 check: src[size-1] is read unconditionally.
;
; State handed to Lpreinv_entry (or Ldone_edi when size==1):
;   eax  inverse
;   ebx  size-1
;   ecx  &src[size-2]
;   edi  high limb, reduced by the divisor when high >= divisor
;   ebp  divisor
;-----------------------------------------------------------------------

        align   16

%define frame 0

___gmpn_preinv_mod_1:
        mov     edx, [PARAM_SRC]        ; fetched before esp moves (frame is 0)
        sub     esp, STACK_SPACE
        FR_sesp STACK_SPACE             ; frame bookkeeping macro from x86i.inc
        mov     [SAVE_EBX], ebx
        mov     ebx, [PARAM_SIZE]
        mov     [SAVE_EBP], ebp
        mov     ebp, [PARAM_DIVISOR]
        mov     [SAVE_ESI], esi
        mov     eax, [PARAM_INVERSE]
        mov     [SAVE_EDI], edi
        mov     edi, [edx+ebx*4-4]      ; src high limb
        mov     dword [VAR_NORM], 0     ; shift count 0: tail skips the denormalise step
        lea     ecx, [edx+ebx*4-8]      ; &src[size-2]
        mov     esi, edi                ; keep an unreduced copy of the high limb
        sub     edi, ebp                ; high - divisor
        cmovc   edi, esi                ; borrow: high < divisor, take the copy back
        dec     ebx
        jnz     Lpreinv_entry           ; more limbs remain: enter the inverse loop
        jmp     Ldone_edi               ; size was 1: edi already holds the result
|
|
|
|
;-----------------------------------------------------------------------
; ___gmpn_mod_1c
;
; Stack arguments (read through the PARAM_* defines):
;   src, size, divisor, carry
; Result: eax = remainder; eax = carry when size == 0.
;
; Picks one of two shared code paths by limb count:
;   size <  threshold  -> Ldivide_top          (one `div` per limb)
;   size >= threshold  -> Lmul_by_inverse_1c   (multiply-by-inverse loop)
; where threshold is MUL_NORM_THRESHOLD when the divisor has bit 31 set
; and MUL_UNNORM_THRESHOLD otherwise.
;
; ebx is saved only on the inverse path; the divide path does not touch it.
;-----------------------------------------------------------------------

        align   16

%define frame 0

___gmpn_mod_1c:
        mov     ecx, [PARAM_SIZE]       ; fetched before esp moves (frame is 0)
        sub     esp, STACK_SPACE
        FR_sesp STACK_SPACE             ; frame bookkeeping macro from x86i.inc
        mov     [SAVE_EBP], ebp
        mov     eax, [PARAM_DIVISOR]
        mov     [SAVE_ESI], esi
        mov     edx, [PARAM_CARRY]
        mov     esi, [PARAM_SRC]
        or      ecx, ecx
        jz      Ldone_edx               ; size == 0: the result is the carry

        sar     eax, 31                 ; -1 if divisor bit 31 is set, else 0
        mov     ebp, [PARAM_DIVISOR]
        and     eax, MUL_NORM_DELTA     ; delta for a bit-31 divisor, else 0
        add     eax, MUL_UNNORM_THRESHOLD ; threshold for this divisor
        cmp     ecx, eax
        jb      Ldivide_top             ; few limbs: edx=carry, esi=src, ecx=size, ebp=divisor

        ; Inverse path: the carry stands in for a src high limb, so the
        ; limb count handed on is size+1.
        mov     [SAVE_EBX], ebx
        lea     ebx, [ecx+1]            ; size+1
        mov     eax, edx                ; carry takes the "src high limb" role
        jmp     Lmul_by_inverse_1c
|
|
|
|
;-----------------------------------------------------------------------
; ___gmpn_mod_1
;
; Stack arguments (read through the PARAM_* defines):
;   src, size, divisor
; Result: eax = remainder; eax = 0 when size == 0.
;
; If the high limb is already below the divisor it becomes the initial
; remainder and one division step is skipped.  The remaining limb count
; then selects either the `div` loop below or Lmul_by_inverse.
;
; Ldivide_top, Ldone_edx and Ldone_eax are also jump targets for
; ___gmpn_mod_1c.  This exit restores only esi and ebp; no path that
; reaches it has changed ebx or edi.
;-----------------------------------------------------------------------

        align   16

%define frame 0

___gmpn_mod_1:
        mov     ecx, [PARAM_SIZE]       ; fetched before esp moves (frame is 0)
        sub     esp, STACK_SPACE
        FR_sesp STACK_SPACE             ; frame bookkeeping macro from x86i.inc
        mov     edx, 0                  ; initial remainder when no div can be skipped
        mov     [SAVE_ESI], esi
        mov     eax, [PARAM_SRC]
        mov     [SAVE_EBP], ebp
        mov     ebp, [PARAM_DIVISOR]    ; this copy is turned into the threshold
        mov     esi, [PARAM_DIVISOR]    ; this copy is compared with the high limb
        or      ecx, ecx
        jz      Ldone_edx               ; size == 0: return edx, which is 0

        mov     eax, [eax+ecx*4-4]      ; src high limb
        sar     ebp, 31                 ; -1 if divisor bit 31 is set, else 0
        and     ebp, MUL_NORM_DELTA
        add     ebp, MUL_UNNORM_THRESHOLD ; threshold for this divisor
        cmp     eax, esi                ; carry set when high < divisor
        cmovc   edx, eax                ; then high is the starting remainder
        mov     esi, [PARAM_SRC]        ; cmovc and mov leave the carry alone
        sbb     ecx, 0                  ; size-1 when a div is skipped
        jz      Ldone_eax               ; size was 1 and high < divisor: eax is the result

        cmp     ecx, ebp
        mov     ebp, [PARAM_DIVISOR]    ; mov keeps the flags for the jae
        jae     Lmul_by_inverse

; Plain division loop, one `div` per limb from the high end down.
;
;   eax  scratch (limb in, quotient out)
;   ebx  -
;   ecx  limbs left, counting down
;   edx  running remainder (high half of each dividend)
;   esi  src
;   edi  -
;   ebp  divisor

Ldivide_top:
        mov     eax, [esi+ecx*4-4]
        div     ebp                     ; edx:eax / divisor, remainder stays in edx
        dec     ecx
        jnz     Ldivide_top

Ldone_edx:
        mov     eax, edx                ; remainder to the return register

Ldone_eax:
        mov     esi, [SAVE_ESI]
        mov     ebp, [SAVE_EBP]
        add     esp, STACK_SPACE
        ret
|
|
|
|
; Set-up for the multiply-by-inverse loop: normalise the divisor and
; compute its inverse.  A zero divisor is not handled here.
;
; On entry at Lmul_by_inverse (from ___gmpn_mod_1):
;   eax  src high limb
;   ebx  -  (saved here, then loaded with size)
;   ecx  -
;   edx  -
;   esi  src
;   edi  -
;   ebp  divisor
;
; ___gmpn_mod_1c enters at Lmul_by_inverse_1c with ebx already saved and
; set to size+1, and with its carry in eax.
;
; ___gmpn_preinv_mod_1 enters at Lpreinv_entry with the inverse in eax.

Lmul_by_inverse:
        mov     [SAVE_EBX], ebx
        mov     ebx, [PARAM_SIZE]

Lmul_by_inverse_1c:
        bsr     ecx, ebp                ; index of the top set bit = 31-l
        mov     [SAVE_EDI], edi
        xor     ecx, 31                 ; l = number of leading zero bits
        mov     [VAR_NORM], ecx
        shl     ebp, cl                 ; d, normalised so bit 31 is set
        mov     edi, eax                ; n2 = high limb
        sub     eax, ebp
        cmovnc  edi, eax                ; n2 = high - d when that did not borrow
        mov     eax, -1
        mov     edx, -1
        sub     edx, ebp                ; edx:eax = b*(b-d) - 1, with b = 2^32
        lea     ecx, [esi+ebx*4-8]      ; &src[size-2], the loop's first limb
        div     ebp                     ; eax = floor((b*(b-d) - 1) / d)

Lpreinv_entry:
        mov     [VAR_INVERSE], eax      ; inverse m, used by both multiplies below
|
|
|
|
; Multiply-by-inverse loop: one src limb per pass, from high to low.
;
; Loads are not specially scheduled here; out-of-order execution hides
; their latency already.
;
; q1+1 is built in ebx and d is moved into eax for the second multiply.
; That arrangement seems fastest: building q1+1 in eax and multiplying by
; ebp instead (as mpn/x86/pentium/mod_1.asm does) runs 1 cycle slower,
; for no obvious reason.
;
;   eax  n10, then scratch
;   ebx  scratch (nadj, then q1+1)
;   ecx  src pointer, moving down
;   edx  scratch
;   esi  n10
;   edi  n2 (running remainder)
;   ebp  divisor
;
; The add/lea/mov/adc sequence must stay in this order: lea and mov leave
; the carry from the add intact for the adc.

        align   16
Linverse_top:
        mov     eax, [ecx]              ; n10 = next src limb
        mov     esi, eax
        sar     eax, 31                 ; -n1, where n1 is the top bit of n10
        mov     ebx, ebp
        and     ebx, eax                ; -n1 & d
        neg     eax                     ; n1
        add     eax, edi                ; n2 + n1
        mul     dword [VAR_INVERSE]     ; edx:eax = m * (n2+n1)
        add     ebx, esi                ; nadj = n10 + (-n1 & d), overflow ignored
        sub     ecx, 4                  ; step down to the next limb
        add     eax, ebx                ; low(m*(n2+n1)) + nadj: only the carry matters
        lea     ebx, [edi+1]            ; n2 + 1
        mov     eax, ebp                ; d, ready for the multiply
        adc     ebx, edx                ; q1+1 = 1 + high(n2<<32 + m*(n2+n1) + nadj)
        jz      Lq1_ff                  ; q1+1 wrapped to 0: rare special case
        mul     ebx                     ; edx:eax = (q1+1) * d
        sub     esi, eax                ; low  of n - (q1+1)*d
        sbb     edi, edx                ; high of n - (q1+1)*d, either 0 or -1
        and     edi, ebp                ; d if that underflowed, else 0
        add     edi, esi                ; remainder, with d added back when needed
        cmp     ecx, [PARAM_SRC]
        jae     Linverse_top            ; continue while the pointer is still >= src
|
|
|
|
; Final reduction after the loop.
;
; edi holds the remainder modulo d*2^n, where n is the normalisation
; shift saved in VAR_NORM.  To obtain 0 <= r < d, compute
; r*2^n mod d*2^n and shift the result right by n.  When the divisor was
; already normalised (n == 0) nothing is needed and edi is returned as
; is.  The chance of n == 0 is low, but it is true of, for example, PP
; from gmp-impl.h.
;
;   eax  -
;   ebx  -
;   ecx  -
;   edx  -
;   esi  -
;   edi  remainder
;   ebp  divisor (normalised)

Linverse_loop_done:
        mov     ecx, [VAR_NORM]
        mov     esi, 0
        or      ecx, ecx
        jz      Ldone_edi               ; n == 0: edi is already the answer

        ; One more division step, with the roles swapped relative to the
        ; loop: here edi = n10 and esi = n2.
        ;
        ; The q1 = 0xFFFFFFFF case is handled by the `sbb ebx, 0` below,
        ; which steps q1+1 back after it wraps, rather than by separate
        ; Lq1_ff-style code.  This is simpler and costs only 2 uops.

        shld    esi, edi, cl            ; n2  = bits shifted out of the top of r
        shl     edi, cl                 ; n10 = r << n
        mov     eax, edi                ; n10
        mov     ebx, ebp                ; d
        sar     eax, 31                 ; -n1
        and     ebx, eax                ; -n1 & d
        neg     eax                     ; n1
        add     ebx, edi                ; nadj = n10 + (-n1 & d), overflow ignored
        add     eax, esi                ; n2 + n1
        mul     dword [VAR_INVERSE]     ; edx:eax = m * (n2+n1)
        add     eax, ebx                ; low(m*(n2+n1)) + nadj: only the carry matters
        lea     ebx, [esi+1]            ; n2 + 1
        adc     ebx, edx                ; q1+1 = 1 + high(n2<<32 + m*(n2+n1) + nadj)
        sbb     ebx, 0                  ; undo the wrap if the adc carried out
        mov     eax, ebp                ; d
        mul     ebx                     ; edx:eax = (q1+1) * d
        mov     ebx, [SAVE_EBX]         ; register restores are interleaved from here on
        sub     edi, eax                ; low  of n - (q1+1)*d
        sbb     esi, edx                ; high of n - (q1+1)*d, either 0 or -1
        and     esi, ebp                ; d if that underflowed, else 0
        mov     ebp, [SAVE_EBP]
        lea     eax, [esi+edi]          ; remainder, with d added back when needed
        mov     esi, [SAVE_ESI]
        shr     eax, cl                 ; shift back down by n (ecx untouched since the load)
        mov     edi, [SAVE_EDI]
        add     esp, STACK_SPACE
        ret
|
|
; Common exit for the inverse paths when the result is in edi: move it to
; eax, restore the four saved registers and release the stack area.
Ldone_edi:
        mov     eax, edi                ; result must leave edi before edi is restored
        mov     ebx, [SAVE_EBX]
        mov     esi, [SAVE_ESI]
        mov     edi, [SAVE_EDI]
        mov     ebp, [SAVE_EBP]
        add     esp, STACK_SPACE
        ret
|
|
|
|
; Special case for q1 = 0xFFFFFFFF, reached from Linverse_top when q1+1
; wraps to 0.  Then q = 0xFFFFFFFF, so the low dword of q*d is simply -d
; and the remainder n - q*d is n10 + d.
;
; This is reached only very rarely.
;
;   eax  (divisor)
;   ebx  (q1+1 == 0)
;   ecx  src pointer, already stepped down
;   edx  -
;   esi  n10
;   edi  (n2)
;   ebp  divisor

Lq1_ff:
        lea     edi, [ebp+esi]          ; n10 + d = remainder, which is the next n2
        cmp     ecx, [PARAM_SRC]
        jae     Linverse_top            ; more limbs: rejoin the loop
        jmp     Linverse_loop_done

; NOTE(review): `end` is kept from the original.  It is not a built-in
; NASM directive, so unless x86i.inc defines it NASM will read it as a
; label - confirm it is wanted.
end
|