mpir/mpn/x86i/p6/dive_1.asm
brgladman 48248cda46 1. longlong.h change to add MSVC intrinsics
2. longlong.h rearrangement for Intel compiler
3. MSVC additions in test code
4. GMP 4.2.1 bug fixes
5. Intel format assembly code
2008-05-18 22:20:43 +00:00

193 lines
5.1 KiB
NASM

; Copyright 2001, 2002 Free Software Foundation, Inc.
;
; This file is part of the GNU MP Library.
;
; The GNU MP Library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public License as
; published by the Free Software Foundation; either version 2.1 of the
; License, or (at your option) any later version.
;
; The GNU MP Library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with the GNU MP Library; see the file COPYING.LIB. If
; not, write to the Free Software Foundation, Inc., 59 Temple Place -
; Suite 330, Boston, MA 02111-1307, USA.
;
; Translation of AT&T syntax code by Brian Gladman
%include "..\x86i.inc"

; Table read by the routine below to seed its divisor inverse.
extern  ___gmp_modlimb_invert_table

global  ___gmpn_divexact_1
%ifdef DLL
export  ___gmpn_divexact_1
%endif

; Every stack slot is addressed as esp+frame+offset.  'frame' starts at 0 on
; entry; the routine invokes FR_sesp (from x86i.inc) right after lowering esp,
; which presumably advances 'frame' by the same amount so the names below
; keep referring to the same memory -- confirm against x86i.inc.
%define frame           0
%define STACK_SPACE     20              ; bytes reserved below the return address

; Local slots (below the return address), lowest address first.
%define VAR_INVERSE     esp+frame-20
%define SAVE_EBP        esp+frame-16
%define SAVE_EDI        esp+frame-12
%define SAVE_ESI        esp+frame-8
%define SAVE_EBX        esp+frame-4

; Incoming stack arguments (above the return address), first argument first.
%define PARAM_DST       esp+frame+4
%define PARAM_SRC       esp+frame+8
%define PARAM_SIZE      esp+frame+12
%define PARAM_DIVISOR   esp+frame+16

section .text
align 16
;-----------------------------------------------------------------------
; ___gmpn_divexact_1 -- divide a limb vector by a single limb using
; multiplication by the divisor's inverse modulo 2^32.
;
; Syntax: NASM / Intel, 32-bit x86.  All arguments are passed on the stack:
;   [esp+4]   dst      destination limb vector (size limbs written)
;   [esp+8]   src      source limb vector (size limbs read)
;   [esp+12]  size     limb count; must be >= 1 (src[0] is read and the
;                      loops store before testing the counter)
;   [esp+16]  divisor  must be non-zero (bsf result is undefined for 0)
;
; The name suggests the quotient is only meaningful when src is an exact
; multiple of divisor; this routine performs no check of that.
;
; Registers: ebx, esi, edi, ebp are saved in the local frame and restored.
;            eax, ecx, edx and the flags are clobbered; no value is returned.
; Stack:     STACK_SPACE bytes of locals.  The caller's PARAM_DIVISOR and
;            PARAM_DST argument slots are overwritten (with the odd part of
;            the divisor and with &dst[size] respectively).
;
; Method:    strip the trailing zero bits from the divisor (count kept in
;            ecx), look up an 8-bit inverse of the odd part d in
;            ___gmp_modlimb_invert_table indexed by (d>>1)&127, then apply
;            inv = 2*inv - inv*inv*d twice to widen it (8 -> 16 -> 32 bits).
;            Each quotient limb is (src limb - carry) * inv, and the next
;            carry is the high half of quotient*d plus the borrow.
;-----------------------------------------------------------------------
___gmpn_divexact_1:
        mov     eax,[PARAM_DIVISOR]
        sub     esp,STACK_SPACE
        FR_sesp STACK_SPACE
        mov     [SAVE_ESI],esi
        mov     esi,[PARAM_SRC]
        mov     [SAVE_EBX],ebx
        mov     ebx,[PARAM_SIZE]
        bsf     ecx,eax                 ; trailing twos
        mov     [SAVE_EBP],ebp
        shr     eax,cl                  ; d without twos
        mov     edx,eax
        shr     eax,1                   ; d/2 without twos
        mov     [PARAM_DIVISOR],edx     ; argument slot now holds the odd d
        and     eax,127                 ; table index, 0..127

%ifdef PIC
        ; Position-independent lookup (NASM ELF idiom): form the GOT address
        ; in ebp, fetch the table's address from its GOT entry, then index.
        ; The previous code added edx (the divisor) into the GOT load and
        ; used a plain absolute reference to _GLOBAL_OFFSET_TABLE_.
        extern  _GLOBAL_OFFSET_TABLE_
        call    Lmovl_eip_ebp           ; ebp = address of Lgot_anchor
Lgot_anchor:
        add     ebp,_GLOBAL_OFFSET_TABLE_+$$-Lgot_anchor wrt ..gotpc
        mov     ebp,[ebp+___gmp_modlimb_invert_table wrt ..got]
        movzx   ebp,byte [eax+ebp]      ; inv 8 bits
%else
        movzx   ebp,byte [___gmp_modlimb_invert_table+eax] ; inv 8 bits
%endif

        lea     eax,[ebp+ebp]           ; 2*inv
        imul    ebp,ebp                 ; inv*inv
        mov     [SAVE_EDI],edi
        mov     edi,[PARAM_DST]
        lea     esi,[esi+ebx*4]         ; src end
        imul    ebp,[PARAM_DIVISOR]     ; inv*inv*d
        sub     eax,ebp                 ; inv = 2*inv - inv*inv*d
        lea     ebp,[eax+eax]           ; 2*inv
        imul    eax,eax                 ; inv*inv
        lea     edi,[edi+ebx*4]         ; dst end
        neg     ebx                     ; -size
        mov     [PARAM_DST],edi         ; argument slot now holds &dst[size]
        imul    eax,[PARAM_DIVISOR]     ; inv*inv*d
        sub     ebp,eax                 ; inv = 2*inv - inv*inv*d
        mov     [VAR_INVERSE],ebp
        mov     eax,[esi+ebx*4]         ; src[0]
        test    ecx,ecx                 ; any trailing twos stripped?
        jnz     Leven
        jmp     Lodd_entry              ; ecx is zero here = initial carry bit

; Odd divisor loop.
;
; The dependent chain here is
;
;       sub     eax,edx                 1
;       imul    eax,[VAR_INVERSE]       4
;       mul     dword [PARAM_DIVISOR]   5
;                                    ----
;       total                          10
;
; and, per the original notes, this is the measured speed.  No special
; scheduling is necessary; out-of-order execution hides the load latency.
;
; eax   scratch (src limb)
; ebx   counter, limbs, negative
; ecx   carry bit
; edx   carry limb, high of last product
; esi   &src[size]
; edi   &dst[size]
Lodd_top:
        mul     dword [PARAM_DIVISOR]   ; edx = high(previous quotient * d)
        mov     eax,[esi+ebx*4]
        sub     eax,ecx                 ; subtract carry bit
        sbb     ecx,ecx                 ; ecx = -borrow
        sub     eax,edx                 ; subtract carry limb
        sbb     ecx,0                   ; ecx = -(combined borrow)
Lodd_entry:
        imul    eax,[VAR_INVERSE]       ; quotient limb
        mov     [edi+ebx*4],eax
        neg     ecx                     ; carry bit back to 0 or 1
        inc     ebx
        jnz     Lodd_top

        mov     esi,[SAVE_ESI]
        mov     edi,[SAVE_EDI]
        mov     ebp,[SAVE_EBP]
        mov     ebx,[SAVE_EBX]
        add     esp,STACK_SPACE
        ret

; Even divisor: each source limb is shifted right by ecx bits (pulling in
; the low bits of the next limb) before the odd-divisor step is applied.
;
; eax   src[0]
; ebx   counter, limbs, negative
; ecx   shift
Leven:
        xor     ebp,ebp                 ; initial carry bit
        xor     edx,edx                 ; initial carry limb (for size==1)
        inc     ebx
        jz      Leven_one               ; size==1: edi is still &dst[size]
        mov     edi,[esi+ebx*4]         ; src[1]
        shrd    eax,edi,cl
        jmp     Leven_entry

; eax   scratch
; ebx   counter, limbs, negative
; ecx   shift
; edx   scratch
; esi   &src[size]
; edi   &dst[size] and scratch
; ebp   carry bit
Leven_top:
        mov     edi,[esi+ebx*4]         ; next-higher limb, supplies shifted-in bits
        mul     dword [PARAM_DIVISOR]   ; edx = high(previous quotient * d)
        mov     eax,[-4+esi+ebx*4]
        shrd    eax,edi,cl
        sub     eax,ebp                 ; subtract carry bit
        sbb     ebp,ebp
        sub     eax,edx                 ; subtract carry limb
        sbb     ebp,0
Leven_entry:
        imul    eax,[VAR_INVERSE]       ; quotient limb
        mov     edi,[PARAM_DST]         ; reload &dst[size] (edi was scratch)
        neg     ebp                     ; carry bit back to 0 or 1
        mov     [-4+edi+ebx*4],eax
        inc     ebx
        jnz     Leven_top

        ; Final limb: no higher limb to shift in, so use a plain shr.
        mul     dword [PARAM_DIVISOR]
        mov     eax,[-4+esi]            ; src[size-1]
Leven_one:
        shr     eax,cl
        mov     esi,[SAVE_ESI]
        sub     eax,ebp
        mov     ebp,[SAVE_EBP]
        sub     eax,edx
        mov     ebx,[SAVE_EBX]
        imul    eax,[VAR_INVERSE]
        mov     [-4+edi],eax            ; dst[size-1]
        mov     edi,[SAVE_EDI]
        add     esp,STACK_SPACE
        ret
%ifdef PIC
; Helper for position-independent builds: loads the caller's return address
; (the address of the instruction following the call) into ebp.  The stack
; pointer is left unchanged and no other register is written.
Lmovl_eip_ebp:
mov ebp,[esp] ; ebp = return address at top of stack
ret
%endif
end