193 lines
4.9 KiB
NASM
193 lines
4.9 KiB
NASM
|
|
; Copyright 2001, 2002 Free Software Foundation, Inc.
|
|
;
|
|
; This file is part of the GNU MP Library.
|
|
;
|
|
; The GNU MP Library is free software; you can redistribute it and/or
|
|
; modify it under the terms of the GNU Lesser General Public License as
|
|
; published by the Free Software Foundation; either version 2.1 of the
|
|
; License, or (at your option) any later version.
|
|
;
|
|
; The GNU MP Library is distributed in the hope that it will be useful,
|
|
; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
; Lesser General Public License for more details.
|
|
;
|
|
; You should have received a copy of the GNU Lesser General Public
|
|
; License along with the GNU MP Library; see the file COPYING.LIB. If
|
|
; not, write to the Free Software Foundation, Inc., 59 Temple Place -
|
|
; Suite 330, Boston, MA 02111-1307, USA.
|
|
;
|
|
; Translation of AT&T syntax code by Brian Gladman
|
|
|
|
%include "..\x86i.inc"

; ---------------------------------------------------------------------
; Linkage.
;   ___gmp_modlimb_invert_table  byte table defined elsewhere; this file
;                                indexes it with a 7-bit value to obtain
;                                an 8-bit inverse seed.
;   ___gmpn_divexact_1           entry point defined in this file.
; ---------------------------------------------------------------------

        global  ___gmpn_divexact_1
        extern  ___gmp_modlimb_invert_table

%ifdef DLL
        export  ___gmpn_divexact_1
%endif

; ---------------------------------------------------------------------
; Stack layout.  Every slot is written relative to esp plus `frame`.
; These are single-line macros, so `frame` is substituted where a slot
; is used, not where it is defined here.
; NOTE(review): FR_sesp comes from x86i.inc (not visible here); it
; presumably bumps `frame` by its argument -- confirm, and keep `frame`
; and STACK_SPACE as %define (not equ) in case it relies on that.
; ---------------------------------------------------------------------

%define frame           0               ; extra bytes currently below the return address
%define STACK_SPACE     20              ; 4 register save slots + 1 local, 4 bytes each

; Incoming arguments (above the return address).
%define PARAM_DST       esp+frame+4
%define PARAM_SRC       esp+frame+8
%define PARAM_SIZE      esp+frame+12
%define PARAM_DIVISOR   esp+frame+16

; Callee-save spill slots and the local variable (below the return address).
%define SAVE_EBX        esp+frame-4
%define SAVE_ESI        esp+frame-8
%define SAVE_EDI        esp+frame-12
%define SAVE_EBP        esp+frame-16
%define VAR_INVERSE     esp+frame-20

        section .text

        align   16
|
|
|
|
;-----------------------------------------------------------------------
; ___gmpn_divexact_1
;
; Stack arguments (32-bit, read relative to esp on entry):
;   [esp+4]   dst      destination limb pointer
;   [esp+8]   src      source limb pointer
;   [esp+12]  size     number of 32-bit limbs
;   [esp+16]  divisor  single-limb divisor
;
; Method: strip the trailing zero bits from the divisor, build the
; inverse of the remaining odd divisor modulo 2^32 (8-bit table seed
; followed by two Newton steps), then produce each quotient limb as
;   (shifted source limb - borrow - high limb of previous q*d) * inverse.
;
; Preserved: ebx, esi, edi, ebp (spilled to the SAVE_* slots).
; Clobbered: eax, ecx, edx, flags.  No return value is set up; eax just
;            holds the last quotient limb at exit.
; Side effects on the caller's argument area: the divisor slot is
;            overwritten with the odd part of the divisor and the dst
;            slot with &dst[size].
;
; Assumptions not checked here: divisor != 0 (bsf result is undefined
; for 0) and size >= 1 (src[0] is read and one limb is always stored).
;
; Instruction order is kept exactly as scheduled in the original code.
;-----------------------------------------------------------------------

___gmpn_divexact_1:
        mov     eax,[PARAM_DIVISOR]
        sub     esp,STACK_SPACE
        FR_sesp STACK_SPACE             ; NOTE(review): macro from x86i.inc, presumably
                                        ; accounts for the sub above in `frame` - confirm
        mov     [SAVE_ESI],esi
        mov     esi,[PARAM_SRC]
        mov     [SAVE_EBX],ebx
        mov     ebx,[PARAM_SIZE]
        bsf     ecx,eax                 ; ecx = number of trailing zero bits in d
        mov     [SAVE_EBP],ebp
        shr     eax,cl                  ; d without twos (odd part)
        mov     edx,eax
        shr     eax,1                   ; d/2 without twos
        mov     [PARAM_DIVISOR],edx     ; argument slot now holds the odd divisor
        and     eax,127                 ; 7-bit table index

%ifdef PIC
        ; Position-independent table lookup.
        ; The earlier form of this sequence added edx into the GOT load
        ; address; edx holds the odd divisor here, so the address depended
        ; on the divisor value.  It also lacked NASM's GOT relocation
        ; qualifiers.  The sequence below follows the NASM manual idiom:
        ; ebp = address of Lgot_base (return address of the thunk call),
        ; then ebp = GOT base, then ebp = table address read from the GOT.
        ; NOTE(review): `wrt ..gotpc` / `wrt ..got` require an output format
        ; that supports them (e.g. elf32) - confirm for the intended target.
        extern  _GLOBAL_OFFSET_TABLE_
        call    Lmovl_eip_ebp
Lgot_base:
        add     ebp,_GLOBAL_OFFSET_TABLE_+$$-Lgot_base wrt ..gotpc
        mov     ebp,[ebp+___gmp_modlimb_invert_table wrt ..got]
        movzx   ebp,byte [eax+ebp]      ; inv 8 bits
%else
        movzx   ebp,byte [___gmp_modlimb_invert_table+eax] ; inv 8 bits
%endif

        ; Two Newton steps: inv = 2*inv - inv*inv*d (8 -> 16 -> 32 bits),
        ; interleaved with setting up the end pointers and negative counter.
        lea     eax,[ebp+ebp]           ; 2*inv
        imul    ebp,ebp                 ; inv*inv
        mov     [SAVE_EDI],edi
        mov     edi,[PARAM_DST]
        lea     esi,[esi+ebx*4]         ; src end
        imul    ebp,[PARAM_DIVISOR]     ; inv*inv*d
        sub     eax,ebp                 ; inv = 2*inv - inv*inv*d
        lea     ebp,[eax+eax]           ; 2*inv
        imul    eax,eax                 ; inv*inv
        lea     edi,[edi+ebx*4]         ; dst end
        neg     ebx                     ; -size
        mov     [PARAM_DST],edi         ; park &dst[size]; edi is scratch in the even loop
        imul    eax,[PARAM_DIVISOR]     ; inv*inv*d
        sub     ebp,eax                 ; inv = 2*inv - inv*inv*d

        mov     [VAR_INVERSE],ebp
        mov     eax,[esi+ebx*4]         ; src[0]
        or      ecx,ecx                 ; any trailing zero bits?
        jnz     Leven
        jmp     Lodd_entry              ; ecx (= 0 here) doubles as the initial carry bit

; The dependent chain here is
;
;       sub     eax,edx                 1
;       imul    eax,inverse             4
;       mul     PARAM_DIVISOR           5
;                                    ----
;       total                          10
;
; and this is the measured speed.  No special scheduling is necessary;
; out-of-order execution hides the load latency.
;
; eax   scratch (src limb)
; ebx   counter, limbs, negative
; ecx   carry bit
; edx   carry limb, high of last product
; esi   &src[size]
; edi   &dst[size]

Lodd_top:
        mul     dword [PARAM_DIVISOR]   ; edx = high limb of (previous quotient limb * d)
        mov     eax,[esi+ebx*4]
        sub     eax,ecx                 ; subtract carry bit
        sbb     ecx,ecx                 ; ecx = -borrow
        sub     eax,edx                 ; subtract carry limb
        sbb     ecx,0                   ; fold in the second borrow
Lodd_entry:
        imul    eax,[VAR_INVERSE]       ; quotient limb
        mov     [edi+ebx*4],eax
        neg     ecx                     ; turn -borrow into a positive carry bit
        inc     ebx
        jnz     Lodd_top

        mov     esi,[SAVE_ESI]
        mov     edi,[SAVE_EDI]
        mov     ebp,[SAVE_EBP]
        mov     ebx,[SAVE_EBX]
        add     esp,STACK_SPACE
        ret

; Even divisor: the source is shifted right by ecx bits on the fly.
;
; eax   src[0]
; ebx   counter, limbs, negative
; ecx   shift

Leven:
        xor     ebp,ebp                 ; initial carry bit
        xor     edx,edx                 ; initial carry limb (for size==1)
        inc     ebx
        jz      Leven_one               ; size == 1: only the top-limb step is needed
        mov     edi,[esi+ebx*4]         ; src[1]
        shrd    eax,edi,cl              ; low limb of the shifted source
        jmp     Leven_entry

; eax   scratch
; ebx   counter, limbs, negative
; ecx   shift
; edx   scratch
; esi   &src[size]
; edi   &dst[size] and scratch
; ebp   carry bit

Leven_top:
        mov     edi,[esi+ebx*4]         ; next higher source limb, feeds the shrd
        mul     dword [PARAM_DIVISOR]   ; edx = high limb of (previous quotient limb * d)
        mov     eax,[-4+esi+ebx*4]
        shrd    eax,edi,cl              ; current limb of the shifted source
        sub     eax,ebp                 ; subtract carry bit
        sbb     ebp,ebp                 ; ebp = -borrow
        sub     eax,edx                 ; subtract carry limb
        sbb     ebp,0                   ; fold in the second borrow

Leven_entry:
        imul    eax,[VAR_INVERSE]       ; quotient limb
        mov     edi,[PARAM_DST]         ; reload &dst[size] (edi was used as scratch)
        neg     ebp                     ; turn -borrow into a positive carry bit
        mov     [-4+edi+ebx*4],eax
        inc     ebx
        jnz     Leven_top

        mul     dword [PARAM_DIVISOR]   ; carry limb for the final step
        mov     eax,[-4+esi]            ; top source limb

Leven_one:
        shr     eax,cl                  ; top limb: nothing above it to shift in
        mov     esi,[SAVE_ESI]
        sub     eax,ebp
        mov     ebp,[SAVE_EBP]
        sub     eax,edx
        mov     ebx,[SAVE_EBX]
        imul    eax,[VAR_INVERSE]
        mov     [-4+edi],eax            ; dst[size-1]; edi still holds &dst[size]
        mov     edi,[SAVE_EDI]
        add     esp,STACK_SPACE
        ret
|
|
|
|
%ifdef PIC

; Lmovl_eip_ebp
;   Call-thunk used by position-independent code to learn its own
;   address: copies the dword at the top of the stack (the caller's
;   return address, i.e. the address of the instruction following the
;   `call`) into ebp and returns.
;   Out:   ebp = return address of the call
;   Clobb: ebp only (mov does not modify flags)

Lmovl_eip_ebp:
        mov     ebp,dword [esp]
        ret

%endif

; NOTE(review): NASM itself has no END directive; unless x86i.inc
; provides one, the next line is taken as a label named `end` - confirm.

        end
|