;  Copyright 2001, 2002 Free Software Foundation, Inc.
; 
;  This file is part of the GNU MP Library.
; 
;  The GNU MP Library is free software; you can redistribute it and/or
;  modify it under the terms of the GNU Lesser General Public License as
;  published by the Free Software Foundation; either version 2.1 of the
;  License, or (at your option) any later version.
; 
;  The GNU MP Library is distributed in the hope that it will be useful,
;  but WITHOUT ANY WARRANTY; without even the implied warranty of
;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;  Lesser General Public License for more details.
; 
;  You should have received a copy of the GNU Lesser General Public
;  License along with the GNU MP Library; see the file COPYING.LIB.  If
;  not, write to the Free Software Foundation, Inc., 59 Temple Place -
;  Suite 330, Boston, MA 02111-1307, USA.
;
; Translation of AT&T syntax code by Brian Gladman 

%include "..\x86i.inc" 

	extern	___gmp_modlimb_invert_table
	global  ___gmpn_divexact_1 

%ifdef	DLL
	export	___gmpn_divexact_1
%endif

%define	PARAM_DIVISOR	esp+frame+16 
%define PARAM_SIZE      esp+frame+12 
%define PARAM_SRC       esp+frame+8 
%define PARAM_DST       esp+frame+4 

%define SAVE_EBX        esp+frame-4 
%define SAVE_ESI        esp+frame-8 
%define SAVE_EDI		esp+frame-12 
%define SAVE_EBP		esp+frame-16 
%define VAR_INVERSE		esp+frame-20 
%define STACK_SPACE		20 
%define frame			0 

	section .text

	align   16

___gmpn_divexact_1: 
	mov     eax,[PARAM_DIVISOR]
    sub     esp,STACK_SPACE
	FR_sesp	STACK_SPACE
    mov     [SAVE_ESI],esi
    mov     esi,[PARAM_SRC]
    mov     [SAVE_EBX],ebx
    mov     ebx,[PARAM_SIZE]
    bsf     ecx,eax         ;  trailing twos 
    mov     [SAVE_EBP],ebp
    shr     eax,cl          ;  d without twos 
    mov     edx,eax
    shr     eax,1           ;  d/2 without twos 
    mov     [PARAM_DIVISOR],edx
    and     eax,127

%ifdef	PIC
    call    Lmovl_eip_ebp
    add     ebp,_GLOBAL_OFFSET_TABLE_
    mov     ebp,[___gmp_modlimb_invert_table+edx+ebp]
    movzx   ebp,byte [eax+ebp]							;  inv 8 bits 
%else
	movzx   ebp,byte [___gmp_modlimb_invert_table+eax]	;  inv 8 bits 
%endif

    lea     eax,[ebp+ebp]		;  2*inv 
    imul    ebp,ebp				;  inv*inv 
    mov     [SAVE_EDI],edi
    mov     edi,[PARAM_DST]
    lea     esi,[esi+ebx*4]		;  src end 
    imul    ebp,[PARAM_DIVISOR]	;  inv*inv*d 
    sub     eax,ebp				;  inv = 2*inv - inv*inv*d 
    lea     ebp,[eax+eax]		;  2*inv 
    imul    eax,eax				;  inv*inv 
    lea     edi,[edi+ebx*4]		;  dst end 
    neg     ebx					;  -size 
    mov     [PARAM_DST],edi
    imul    eax,[PARAM_DIVISOR] ;  inv*inv*d 
    sub     ebp,eax				;  inv = 2*inv - inv*inv*d 

    mov     [VAR_INVERSE],ebp
    mov     eax,[esi+ebx*4]		;  src[0] 
    or      ecx,ecx
    jnz     Leven
    jmp     Lodd_entry			;  ecx initial carry is zero 

;  The dependent chain here is 
; 
;      subl    %edx,%eax        1 
;      imull   %ebp,%eax        4 
;      mull    PARAM_DIVISOR    5 
;                             ---- 
;        total                 10 
; 
;  and this is the measured speed.  No special scheduling is necessary,out 
;  of order execution hides the load latency. 
;
;  eax scratch (src limb) 
;  ebx counter,limbs,negative 
;  ecx carry bit 
;  edx carry limb,high of last product 
;  esi &src[size] 
;  edi &dst[size] 

Lodd_top: 
    mul     dword [PARAM_DIVISOR]
    mov     eax,[esi+ebx*4]
    sub     eax,ecx
    sbb     ecx,ecx
    sub     eax,edx
    sbb     ecx,0
Lodd_entry: 
    imul    eax,[VAR_INVERSE]
    mov     [edi+ebx*4],eax
    neg     ecx
    inc     ebx
    jnz     Lodd_top
    mov     esi,[SAVE_ESI]
    mov     edi,[SAVE_EDI]
    mov     ebp,[SAVE_EBP]
    mov     ebx,[SAVE_EBX]
    add     esp,STACK_SPACE
    ret

;  eax src[0] 
;  ebx counter,limbs,negative 
;  ecx shift 

Leven: 
    xor     ebp,ebp         ;  initial carry bit 
    xor     edx,edx         ;  initial carry limb (for size==1) 
    inc     ebx
    jz      Leven_one
    mov     edi,[esi+ebx*4]	;  src[1] 
	shrd	eax,edi,cl
    jmp     Leven_entry

;  eax scratch 
;  ebx counter,limbs,negative 
;  ecx shift 
;  edx scratch 
;  esi &src[size] 
;  edi &dst[size] and scratch 
;  ebp carry bit 

Leven_top: 
    mov     edi,[esi+ebx*4]
    mul     dword [PARAM_DIVISOR]
    mov     eax,[-4+esi+ebx*4]
	shrd	eax,edi,cl
    sub     eax,ebp
    sbb     ebp,ebp
    sub     eax,edx
    sbb     ebp,0

Leven_entry: 
    imul    eax,[VAR_INVERSE]
    mov     edi,[PARAM_DST]
    neg     ebp
    mov     [-4+edi+ebx*4],eax
    inc     ebx
    jnz     Leven_top
    mul     dword [PARAM_DIVISOR]
    mov     eax,[-4+esi]
Leven_one: 
    shr     eax,cl
    mov     esi,[SAVE_ESI]
    sub     eax,ebp
    mov     ebp,[SAVE_EBP]
    sub     eax,edx
    mov     ebx,[SAVE_EBX]
    imul    eax,[VAR_INVERSE]
    mov     [-4+edi],eax
    mov     edi,[SAVE_EDI]
    add     esp,STACK_SPACE
    ret

%ifdef	PIC
Lmovl_eip_ebp: 
    mov     ebp,[esp]
    ret
%endif

	end