;  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
; 
;  This file is part of the GNU MP Library.
; 
;  The GNU MP Library is free software; you can redistribute it and/or
;  modify it under the terms of the GNU Lesser General Public License as
;  published by the Free Software Foundation; either version 2.1 of the
;  License, or (at your option) any later version.
; 
;  The GNU MP Library is distributed in the hope that it will be useful,
;  but WITHOUT ANY WARRANTY; without even the implied warranty of
;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;  Lesser General Public License for more details.
; 
;  You should have received a copy of the GNU Lesser General Public
;  License along with the GNU MP Library; see the file COPYING.LIB.  If
;  not, write to the Free Software Foundation, Inc., 59 Temple Place -
;  Suite 330, Boston, MA 02111-1307, USA.
;
; Translation of AT&T syntax code by Brian Gladman 

%include "..\..\x86i.inc" 

	global  ___gmpn_preinv_divrem_1 
    global  ___gmpn_divrem_1c 
    global  ___gmpn_divrem_1 

%ifdef	DLL
	export	___gmpn_divrem_1c
	export	___gmpn_divrem_1
%endif

%define	MUL_THRESHOLD		4 
%define	PARAM_PREINV_SHIFT      esp+frame+28 
%define PARAM_PREINV_INVERSE	esp+frame+24 
%define PARAM_CARRY     esp+frame+24 
%define PARAM_DIVISOR   esp+frame+20 
%define PARAM_SIZE      esp+frame+16 
%define PARAM_SRC       esp+frame+12 
%define PARAM_XSIZE     esp+frame+8 
%define PARAM_DST       esp+frame+4 

%define SAVE_EBX        esp+frame-4 
%define SAVE_ESI        esp+frame-8 
%define SAVE_EDI        esp+frame-12 
%define SAVE_EBP        esp+frame-16 

%define VAR_NORM        esp+frame-20 
%define VAR_INVERSE     esp+frame-24 
%define VAR_SRC			esp+frame-28 
%define VAR_DST			esp+frame-32 
%define VAR_DST_STOP    esp+frame-36 
%define STACK_SPACE		36 
%define frame			0 

	section .text

	align   16   

___gmpn_preinv_divrem_1: 
    mov     ecx,[PARAM_XSIZE]
    sub     esp,STACK_SPACE
	FR_sesp	STACK_SPACE
    mov     [SAVE_ESI],esi
    mov     esi,[PARAM_SRC]
    mov     [SAVE_EBX],ebx
    mov     ebx,[PARAM_SIZE]
    mov     [SAVE_EBP],ebp
    mov     ebp,[PARAM_DIVISOR]
    mov     [SAVE_EDI],edi
    mov     edx,[PARAM_DST]
    mov     eax,[-4+esi+ebx*4]	;  src high limb 
    xor     edi,edi				;  initial carry (if can't skip a div) 
	lea     edx,[8+edx+ecx*4]	;  &dst[xsize+2] 
	xor     ecx,ecx
    mov     [VAR_DST_STOP],edx	;  &dst[xsize+2] 
    cmp     eax,ebp				;  high cmp divisor 
	cmovc	edi,eax
	cmovnc	ecx,eax				;  (the latter in case src==dst) 
    mov     [-12+edx+ebx*4],ecx	;  dst high limb 
	sbb     ebx,0				;  skip one division if high<divisor 
    mov     ecx,[PARAM_PREINV_SHIFT]
    lea     edx,[-8+edx+ebx*4]	;  &dst[xsize+size] 
    mov     eax,32
    mov     [VAR_DST],edx		;  &dst[xsize+size] 
    shl     ebp,cl				;  d normalized 
    sub     eax,ecx
    mov     [VAR_NORM],ecx
    movd    mm7,eax				;  rshift 
    mov     eax,[PARAM_PREINV_INVERSE]
    jmp     Lstart_preinv

	align   16

%define       frame   0 

___gmpn_divrem_1c: 
    mov     edx,[PARAM_CARRY]
    mov     ecx,[PARAM_SIZE]
    sub     esp,STACK_SPACE
%define frame   STACK_SPACE 
    mov     [SAVE_EBX],ebx
    mov     ebx,[PARAM_XSIZE]
    mov     [SAVE_EDI],edi
    mov     edi,[PARAM_DST]
    mov     [SAVE_EBP],ebp
    mov     ebp,[PARAM_DIVISOR]
    mov     [SAVE_ESI],esi
    mov     esi,[PARAM_SRC]
    lea     edi,[-4+edi+ebx*4]
    jmp     Lstart_1c

;  offset 0x31,close enough to aligned 

%define       frame   0 

___gmpn_divrem_1: 
    mov     ecx,[PARAM_SIZE]
    mov     edx,0				;  initial carry (if can't skip a div) 
    sub     esp,STACK_SPACE
%define frame   STACK_SPACE 
    mov     [SAVE_EBP],ebp
    mov     ebp,[PARAM_DIVISOR]
    mov     [SAVE_EBX],ebx
    mov     ebx,[PARAM_XSIZE]
    mov     [SAVE_ESI],esi
    mov     esi,[PARAM_SRC]
    or      ecx,ecx				;  size 
    mov     [SAVE_EDI],edi
    mov     edi,[PARAM_DST]
    lea     edi,[-4+edi+ebx*4]	;  &dst[xsize-1] 
    jz      Lno_skip_div		;  if size==0 
    mov     eax,[-4+esi+ecx*4]	;  src high limb 
    xor     esi,esi
    cmp     eax,ebp				;  high cmp divisor 
	cmovc	edx,eax
	cmovnc	esi,eax				;  (the latter in case src==dst) 
    mov     [edi+ecx*4],esi		;  dst high limb 
    sbb     ecx,0				;  size-1 if high<divisor 
    mov     esi,[PARAM_SRC]		;  reload 
Lno_skip_div: 

;  eax  
;  ebx xsize 
;  ecx size 
;  edx carry 
;  esi src 
;  edi &dst[xsize-1] 
;  ebp divisor 

Lstart_1c: 
    lea     eax,[ebx+ecx]		;  size+xsize 
    cmp     eax,MUL_THRESHOLD
    jae     Lmul_by_inverse
    or      ecx,ecx
    jz      Ldivide_no_integer

;  eax scratch (quotient) 
;  ebx xsize 
;  ecx counter 
;  edx scratch (remainder) 
;  esi src 
;  edi &dst[xsize-1] 
;  ebp divisor 

Ldivide_integer: 
    mov     eax,[-4+esi+ecx*4]
    div     ebp
    mov     [edi+ecx*4],eax
    dec     ecx
    jnz     Ldivide_integer
Ldivide_no_integer: 
    mov     edi,[PARAM_DST]
    or      ebx,ebx
    jnz     Ldivide_fraction
Ldivide_done: 
    mov     esi,[SAVE_ESI]
    mov     edi,[SAVE_EDI]
    mov     ebx,[SAVE_EBX]
    mov     eax,edx
    mov     ebp,[SAVE_EBP]
    add     esp,STACK_SPACE
    ret

;  eax scratch (quotient) 
;  ebx counter 
;  ecx 
;  edx scratch (remainder) 
;  esi 
;  edi dst 
;  ebp divisor 

Ldivide_fraction: 
    mov     eax,0
    div     ebp
    mov     [-4+edi+ebx*4],eax
    dec     ebx
    jnz     Ldivide_fraction
    jmp     Ldivide_done

;  eax 
;  ebx xsize 
;  ecx size 
;  edx carry 
;  esi src 
;  edi &dst[xsize-1] 
;  ebp divisor 

Lmul_by_inverse: 
    lea     ebx,[12+edi]   ;  &dst[xsize+2],loop dst stop 
    mov     [VAR_DST_STOP],ebx
    lea     edi,[4+edi+ecx*4] ;  &dst[xsize+size] 
    mov     [VAR_DST],edi
    mov     ebx,ecx         ;  size 
    bsr     ecx,ebp         ;  31-l 
    mov     edi,edx         ;  carry 
    lea     eax,[1+ecx]    ;  32-l 
    xor     ecx,31         ;  l 
    mov     [VAR_NORM],ecx
    mov     edx,-1
    shl     ebp,cl          ;  d normalized 
    movd    mm7,eax
    mov     eax,-1
    sub     edx,ebp         ;  (b-d)-1 giving edx:eax = b*(b-d)-1 
    div     ebp             ;  floor (b*(b-d)-1) / d 

;  eax inverse 
;  ebx size 
;  ecx shift 
;  edx 
;  esi src 
;  edi carry 
;  ebp divisor 
;
;  mm7 rshift 

Lstart_preinv: 
    mov     [VAR_INVERSE],eax
    or      ebx,ebx         ;  size 
    lea     eax,[-12+esi+ebx*4] ;  &src[size-3] 
    mov     [VAR_SRC],eax
    jz      Lstart_zero
    mov     esi,[8+eax]    ;  src high limb 
    cmp     ebx,1
    jz      Lstart_one
Lstart_two_or_more: 
    mov     edx,[4+eax]    ;  src second highest limb 
	shld	edi,esi,cl
	shld	esi,edx,cl
    cmp     ebx,2
    je      Linteger_two_left
    jmp     Linteger_top

Lstart_one: 
	shld	edi,esi,cl
    shl     esi,cl          ;  n10 = high << l 
    jmp     Linteger_one_left

Lstart_zero: 
;  Can be here with xsize==0 if mpn_preinv_divrem_1 had size==1 and 
;  skipped a division. 

    shl     edi,cl          ;  n2 = carry << l 
    mov     eax,edi         ;  return value for zero_done 
    cmp     [PARAM_XSIZE],dword 0
    je      Lzero_done
    jmp     Lfraction_some

;  This loop runs at about 25 cycles,which is probably sub-optimal,and 
;  certainly more than the dependent chain would suggest.  A better loop,or 
;  a better rough analysis of what's possible,would be welcomed. 
; 
;  In the current implementation,the following successively dependent 
;  micro-ops seem to exist. 
; 
;                     uops 
;              n2+n1   1   (addl) 
;              mul     5 
;              q1+1    3   (addl/adcl) 
;              mul     5 
;              sub     3   (subl/sbbl) 
;              addback 2   (cmov) 
;                     --- 
;                     19 
; 
;  Lack of registers hinders explicit scheduling and it might be that the 
;  normal out of order execution isn't able to hide enough under the mul 
;  latencies. 
; 
;  Using sarl/negl to pick out n1 for the n2+n1 stage is a touch faster than 
;  cmov (and takes one uop off the dependent chain).  A sarl/andl/addl 
;  combination was tried for the addback (despite the fact it would lengthen 
;  the dependent chain) but found to be no faster. 

;  eax scratch 
;  ebx scratch (nadj,q1) 
;  ecx scratch (src,dst) 
;  edx scratch 
;  esi n10 
;  edi n2 
;  ebp d 
;
;  mm0 scratch (src qword) 
;  mm7 rshift for normalization 

	align   16
Linteger_top: 
    mov     eax,esi
    mov     ebx,ebp
    sar     eax,31				;  -n1 
    mov     ecx,[VAR_SRC]
    and     ebx,eax				;  -n1 & d 
    neg     eax					;  n1 
    add     ebx,esi				;  nadj = n10 + (-n1 & d),ignoring overflow 
    add     eax,edi				;  n2+n1 
    movq    mm0,[ecx]			;  next src limb and the one below it 
    mul     dword [VAR_INVERSE] ;  m*(n2+n1) 
    sub     ecx,4
    mov     [VAR_SRC],ecx
    add     eax,ebx				;  m*(n2+n1) + nadj,low giving carry flag 
    mov     eax,ebp				;  d 
    lea     ebx,[1+edi]			;  n2+1 
    adc     ebx,edx				;  1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 
    jz      Lq1_ff
    mul     ebx					;  (q1+1)*d 
    mov     ecx,[VAR_DST]
    psrlq   mm0,mm7
    sub     esi,eax
    mov     eax,[VAR_DST_STOP]
    sbb     edi,edx				;  n - (q1+1)*d 
    mov     edi,esi				;  remainder -> n2 
    lea     edx,[ebp+esi]
	cmovc	edi,edx
    movd    esi,mm0
    sbb     ebx,0    ;  q 
    sub     ecx,4
    mov     [ecx],ebx
    cmp     ecx,eax
    mov     [VAR_DST],ecx
    jne     Linteger_top
Linteger_loop_done: 
 
;  Here,and in integer_one_left below,an sbbl $0 is used rather than a jz 
;  q1_ff special case.  This make the code a bit smaller and simpler,and 
;  costs only 2 cycles (each). 

;  eax scratch 
;  ebx scratch (nadj,q1) 
;  ecx scratch (src,dst) 
;  edx scratch 
;  esi n10 
;  edi n2 
;  ebp divisor 
;
;  mm7 rshift 

Linteger_two_left: 
    mov     eax,esi
    mov     ebx,ebp
    sar     eax,31				;  -n1 
    mov     ecx,[PARAM_SRC]
    and     ebx,eax				;  -n1 & d 
    neg     eax					;  n1 
    add     ebx,esi				;  nadj = n10 + (-n1 & d),ignoring overflow 
    add     eax,edi				;  n2+n1 
    mul     dword [VAR_INVERSE] ;  m*(n2+n1) 
    movd    mm0,[ecx]			;  src low limb 
    mov     ecx,[VAR_DST_STOP]
    add     eax,ebx				;  m*(n2+n1) + nadj,low giving carry flag 
    lea     ebx,[1+edi]			;  n2+1 
    mov     eax,ebp				;  d 
    adc     ebx,edx				;  1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 
    sbb     ebx,0
    mul     ebx					;  (q1+1)*d 
    psllq   mm0,32
    psrlq   mm0,mm7
    sub     esi,eax
    sbb     edi,edx				;  n - (q1+1)*d 
    mov     edi,esi				;  remainder -> n2 
    lea     edx,[ebp+esi]
	cmovc	edi,edx
    movd    esi,mm0
    sbb     ebx,0				;  q 
    mov     [-4+ecx],ebx

;  eax scratch 
;  ebx scratch (nadj,q1) 
;  ecx scratch (dst) 
;  edx scratch 
;  esi n10 
;  edi n2 
;  ebp divisor 
;
;  mm7 rshift 

Linteger_one_left: 
    mov     eax,esi
    mov     ebx,ebp
    sar     eax,31				;  -n1 
    mov     ecx,[VAR_DST_STOP]
    and     ebx,eax				;  -n1 & d 
    neg     eax					;  n1 
    add     ebx,esi				;  nadj = n10 + (-n1 & d),ignoring overflow 
    add     eax,edi				;  n2+n1 
    mul     dword [VAR_INVERSE]	;  m*(n2+n1) 
    add     eax,ebx				;  m*(n2+n1) + nadj,low giving carry flag 
    lea     ebx,[1+edi]			;  n2+1 
    mov     eax,ebp				;  d 
    adc     ebx,edx				;  1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 
    sbb     ebx,0				;  q1 if q1+1 overflowed 
    mul     ebx
    sub     esi,eax
    mov     eax,[PARAM_XSIZE]
    sbb     edi,edx				;  n - (q1+1)*d 
    mov     edi,esi				;  remainder -> n2 
    lea     edx,[ebp+esi]
	cmovc	edi,edx
    sbb     ebx,0				;  q 
    mov     [-8+ecx],ebx
    sub     ecx,8
    or      eax,eax				;  xsize 
    jnz     Lfraction_some
    mov     eax,edi
Lfraction_done: 
    mov     ecx,[VAR_NORM]
Lzero_done: 
    mov     ebp,[SAVE_EBP]
    mov     edi,[SAVE_EDI]
    mov     esi,[SAVE_ESI]
    mov     ebx,[SAVE_EBX]
    add     esp,STACK_SPACE
    shr     eax,cl
    emms
    ret

;  Special case for q1=0xFFFFFFFF,giving q=0xFFFFFFFF meaning the low dword 
;  of q*d is simply -d and the remainder n-q*d = n10+d 
;
;  eax (divisor) 
;  ebx (q1+1 == 0) 
;  ecx 
;  edx 
;  esi n10 
;  edi n2 
;  ebp divisor 

Lq1_ff: 
    mov     ecx,[VAR_DST]
    mov     edx,[VAR_DST_STOP]
    sub     ecx,4
    mov     [VAR_DST],ecx
    psrlq   mm0,mm7
    lea     edi,[ebp+esi]		;  n-q*d remainder -> next n2 
    mov     [ecx],dword -1
    movd    esi,mm0				;  next n10 
    cmp     edx,ecx
    jne     Linteger_top
    jmp     Linteger_loop_done

; 
;  In the current implementation,the following successively dependent 
;  micro-ops seem to exist. 
; 
;                     uops 
;              mul     5 
;              q1+1    1   (addl) 
;              mul     5 
;              sub     3   (negl/sbbl) 
;              addback 2   (cmov) 
;                     --- 
;                     16 
; 
;  The loop in fact runs at about 17.5 cycles.  Using a sarl/andl/addl for 
;  the addback was found to be a touch slower. 

;  eax 
;  ebx 
;  ecx 
;  edx 
;  esi 
;  edi carry 
;  ebp divisor 

	align   16
Lfraction_some: 
    mov     esi,[PARAM_DST]
    mov     ecx,[VAR_DST_STOP]	;  &dst[xsize+2] 
    mov     eax,edi
    sub     ecx,8				;  &dst[xsize] 

;  eax n2,then scratch 
;  ebx scratch (nadj,q1) 
;  ecx dst,decrementing 
;  edx scratch 
;  esi dst stop point 
;  edi n2 
;  ebp divisor 

	align   16
Lfraction_top: 
    mul     dword [VAR_INVERSE]	;  m*n2 
    mov     eax,ebp				;  d 
    sub     ecx,4				;  dst 
    lea     ebx,[edi+1]
    add     ebx,edx				;  1 + high(n2<<32 + m*n2) = q1+1 
    mul     ebx					;  (q1+1)*d 
    neg     eax					;  low of n - (q1+1)*d 
    sbb     edi,edx				;  high of n - (q1+1)*d,caring only about carry 
    lea     edx,[ebp+eax]
	cmovc	eax,edx
    sbb     ebx,0				;  q 
    mov     edi,eax				;  remainder->n2 
    cmp     ecx,esi
    mov     [ecx],ebx			;  previous q 
    jne     Lfraction_top
    jmp     Lfraction_done

	end