mpir/mpn/x86w/p6/mode1o.asm
brgladman 9c467c6415
2008-07-04 10:39:15 +00:00

138 lines
3.6 KiB
NASM

; Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
;
; This file is part of the GNU MP Library.
;
; The GNU MP Library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public License as
; published by the Free Software Foundation; either version 2.1 of the
; License, or (at your option) any later version.
;
; The GNU MP Library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with the GNU MP Library; see the file COPYING.LIB. If
; not, write to the Free Software Foundation, Inc., 59 Temple Place -
; Suite 330, Boston, MA 02111-1307, USA.
;
; Translation of AT&T syntax code by Brian Gladman
; ---------------------------------------------------------------------
; Build set-up: shared x86 macros, imported and exported symbols, and
; the stack-frame layout used by both entry points in this file.
; ---------------------------------------------------------------------
%include "..\x86i.inc"

extern  ___gmp_modlimb_invert_table

global  ___gmpn_modexact_1c_odd
global  ___gmpn_modexact_1_odd

%ifdef DLL
export  ___gmpn_modexact_1c_odd
export  ___gmpn_modexact_1_odd
%endif

; Every stack reference below is written relative to `frame`, which
; starts at 0 (esp as it is on entry, pointing at the return address).
%define frame           0

; Incoming stack arguments, in ascending address order.
%define PARAM_SRC       esp+frame+4
%define PARAM_SIZE      esp+frame+8
%define PARAM_DIVISOR   esp+frame+12
%define PARAM_CARRY     esp+frame+16

; Register save slots.  Not enough room under modexact_1 to make these
; re-use the parameter space, unfortunately.
%define STACK_SPACE     12
%define SAVE_EBX        esp+frame-4
%define SAVE_ESI        esp+frame-8
%define SAVE_EDI        esp+frame-12

section .text
; ---------------------------------------------------------------------
; ___gmpn_modexact_1c_odd
;
; Entry point that takes an explicit carry-in.  It loads the stack
; argument named PARAM_CARRY into ecx and then continues in the body
; shared with ___gmpn_modexact_1_odd, at Lstart_1c.
; ---------------------------------------------------------------------
        align   16
___gmpn_modexact_1c_odd:
        mov     ecx, [PARAM_CARRY]      ; ecx = carry-in
        jmp     Lstart_1c               ; join the common code path
; ---------------------------------------------------------------------
; ___gmpn_modexact_1_odd / Lstart_1c
;
; ___gmpn_modexact_1_odd is the entry point without a carry-in: it
; clears ecx and falls through to Lstart_1c.  Lstart_1c is also the jump
; target of ___gmpn_modexact_1c_odd, which arrives with ecx already
; loaded from PARAM_CARRY.
;
; Stack arguments at entry (see the PARAM_* defines):
;   [esp+4]   src      address of the source limbs (read as dwords)
;   [esp+8]   size     number of limbs
;   [esp+12]  divisor
;   [esp+16]  carry    read only by the 1c entry point
;
; Returns in eax the sum of the final carry bit (ecx) and the final
; carry limb (edx).  ebx, esi and edi are saved to the stack and
; restored before ret; eax, ecx, edx and the flags are overwritten.
;
; NOTE(review): the entry point names say the divisor is odd; nothing
; here checks that -- presumably a caller obligation, confirm.
; NOTE(review): the main loop tests its counter only after the first
; iteration, so size == 0 is not handled -- presumably callers
; guarantee size >= 1, confirm.
; ---------------------------------------------------------------------
align 16
___gmpn_modexact_1_odd:
xor ecx,ecx ; no carry-in for this entry point
Lstart_1c:
; ecx = initial carry for both entry points from here on.
mov eax,[PARAM_DIVISOR]
sub esp,STACK_SPACE
; FR_sesp comes from x86i.inc (not visible here); presumably it advances
; `frame` by STACK_SPACE so that the PARAM_* and SAVE_* expressions keep
; addressing the same stack slots after the esp adjustment -- confirm.
FR_sesp STACK_SPACE
mov [SAVE_ESI],esi
mov esi,[PARAM_SRC]
shr eax,1 ; d/2
mov [SAVE_EDI],edi
and eax,127 ; table index = (d/2) mod 128
; Fetch the 8-bit starting inverse for d from ___gmp_modlimb_invert_table.
%ifdef PIC
; NOTE(review): this branch looks like a literal translation of ELF
; GOT-relative addressing; _GLOBAL_OFFSET_TABLE_ is not declared in this
; file and no GOT relocation operator is used -- confirm it assembles
; and addresses the table correctly before enabling PIC.
call Lmovl_eip_edi
add edi,_GLOBAL_OFFSET_TABLE_
mov edi,[___gmp_modlimb_invert_table+edi]
movzx edi,byte [eax+edi] ; inv 8 bits
%else
movzx edi,byte [___gmp_modlimb_invert_table+eax] ; inv 8 bits
%endif
; Two refinement steps of inv = 2*inv - inv*inv*d follow, interleaved
; with saving ebx and setting up the loop pointer and counter.  The first
; step leaves its result in eax, the second leaves the final inverse in
; edi.  (Each step presumably doubles the number of valid low bits,
; 8 -> 16 -> 32.)
xor edx,edx ; initial extra carry
lea eax,[edi+edi] ; 2*inv
imul edi,edi ; inv*inv
mov [SAVE_EBX],ebx
mov ebx,[PARAM_SIZE]
imul edi,[PARAM_DIVISOR] ; inv*inv*d
sub eax,edi ; inv = 2*inv - inv*inv*d
lea edi,[eax+eax] ; 2*inv
imul eax,eax ; inv*inv
imul eax,[PARAM_DIVISOR] ; inv*inv*d
lea esi,[esi+ebx*4] ; src end
neg ebx ; -size
sub edi,eax ; inv = 2*inv - inv*inv*d
; The dependent chain here is
;
;       sub     eax,edx                 1
;       imul    eax,edi                 4
;       mul     dword [PARAM_DIVISOR]   5
;                                     ----
;       total                          10
;
; and this is the measured speed. No special scheduling is necessary; out
; of order execution hides the load latency.
;
; Register use in the loop:
;
; eax scratch (src limb)
; ebx counter, limbs, negative
; ecx carry bit, 0 or 1
; edx carry limb, high of last product
; esi &src[size]
; edi inverse
; ebp (unused)
Ltop:
mov eax,[esi+ebx*4] ; next source limb (ebx counts up from -size to 0)
sub eax,ecx ; subtract the carry bit
sbb ecx,ecx ; ecx = 0, or -1 if that subtract borrowed
sub eax,edx ; subtract the carry limb
sbb ecx,0 ; also fold in a borrow from the carry-limb subtract
imul eax,edi ; eax = low 32 bits of (limb - carries) * inverse
neg ecx ; turn the negated borrow into the carry bit for the next limb
mul dword [PARAM_DIVISOR] ; edx:eax = eax * divisor; edx is the next carry limb
inc ebx
jnz Ltop ; loop until the negative limb counter reaches zero
; Restore the saved registers and return carry bit + carry limb in eax.
mov esi,[SAVE_ESI]
lea eax,[ecx+edx] ; return value = carry bit + carry limb
mov edi,[SAVE_EDI]
mov ebx,[SAVE_EBX]
add esp,STACK_SPACE
ret
%ifdef PIC
; ---------------------------------------------------------------------
; Lmovl_eip_edi
;
; PIC helper, assembled only when PIC is defined.  Copies the dword at
; the top of the stack -- the return address pushed by the `call` that
; reached this label -- into edi, then returns to that address.
; ---------------------------------------------------------------------
Lmovl_eip_edi:
        mov     edi, [esp]              ; edi = caller's return address
        ret
%endif
end