mpir/mpn/x86w/p3/divrem_1.asm


;  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
; 
;  This file is part of the GNU MP Library.
; 
;  The GNU MP Library is free software; you can redistribute it and/or
;  modify it under the terms of the GNU Lesser General Public License as
;  published by the Free Software Foundation; either version 2.1 of the
;  License, or (at your option) any later version.
; 
;  The GNU MP Library is distributed in the hope that it will be useful,
;  but WITHOUT ANY WARRANTY; without even the implied warranty of
;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;  Lesser General Public License for more details.
; 
;  You should have received a copy of the GNU Lesser General Public
;  License along with the GNU MP Library; see the file COPYING.LIB.  If
;  not, write to the Free Software Foundation, Inc., 59 Temple Place -
;  Suite 330, Boston, MA 02111-1307, USA.
;
; Translation of AT&T syntax code by Brian Gladman 

%include "..\x86i.inc" 

	global  ___gmpn_preinv_divrem_1 
    global  ___gmpn_divrem_1c 
    global  ___gmpn_divrem_1 

%ifdef	DLL
	export	___gmpn_divrem_1c
	export	___gmpn_divrem_1
%endif

;  size+xsize at or above which Lstart_1c selects the multiply-by-inverse 
;  code instead of the plain div loops. 
%define	MUL_THRESHOLD		4 

;  Stack arguments.  All offsets are relative to esp as it was at function 
;  entry; "frame" is the number of bytes esp has been lowered since then,so 
;  the same names keep working after the prologue's sub esp,STACK_SPACE. 
;  PARAM_PREINV_INVERSE and PARAM_CARRY deliberately share offset 24: the 
;  first is read only by ___gmpn_preinv_divrem_1,the second only by 
;  ___gmpn_divrem_1c. 
%define	PARAM_PREINV_SHIFT      esp+frame+28 
%define PARAM_PREINV_INVERSE	esp+frame+24 
%define PARAM_CARRY     esp+frame+24 
%define PARAM_DIVISOR   esp+frame+20 
%define PARAM_SIZE      esp+frame+16 
%define PARAM_SRC       esp+frame+12 
%define PARAM_XSIZE     esp+frame+8 
%define PARAM_DST       esp+frame+4 

;  Save slots for the registers restored in the epilogues.  These,and the 
;  VAR_ slots below,live in the STACK_SPACE bytes reserved by the prologues. 
%define SAVE_EBX        esp+frame-4 
%define SAVE_ESI        esp+frame-8 
%define SAVE_EDI        esp+frame-12 
%define SAVE_EBP        esp+frame-16 

;  Local variables: 
;    VAR_NORM      normalization shift count l 
;    VAR_INVERSE   inverse of the normalized divisor 
;    VAR_SRC       src read pointer for the integer loop 
;    VAR_DST       dst write pointer for the integer loop 
;    VAR_DST_STOP  &dst[xsize+2],where the integer loop stops 
%define VAR_NORM        esp+frame-20 
%define VAR_INVERSE     esp+frame-24 
%define VAR_SRC			esp+frame-28 
%define VAR_DST			esp+frame-32 
%define VAR_DST_STOP    esp+frame-36 

;  Total bytes reserved: 4 save slots + 5 variables,4 bytes each. 
%define STACK_SPACE		36 

;  frame is 0 at each function entry and is redefined after the sub esp. 
%define frame			0 

	section .text

	align   16   

;  Entry point that takes the divisor inverse and normalization shift from 
;  the caller (PARAM_PREINV_INVERSE,PARAM_PREINV_SHIFT) instead of computing 
;  them.  It saves ebx/esi/edi/ebp,deals with the high src limb,fills in 
;  VAR_DST_STOP,VAR_DST and VAR_NORM,and jumps to Lstart_preinv with: 
;    eax inverse,ebx size (one less if a division was skipped),ecx shift, 
;    esi src,edi carry,ebp divisor<<shift,mm7 = 32-shift 
;  The high limb is read unconditionally,so size is assumed to be at least 1. 

___gmpn_preinv_divrem_1: 
    mov     ecx,[PARAM_XSIZE]
    sub     esp,STACK_SPACE
;  FR_sesp comes from x86i.inc; presumably it rebases "frame" to account for 
;  the sub esp above - confirm against the include file. 
	FR_sesp	STACK_SPACE
    mov     [SAVE_ESI],esi
    mov     esi,[PARAM_SRC]
    mov     [SAVE_EBX],ebx
    mov     ebx,[PARAM_SIZE]
    mov     [SAVE_EBP],ebp
    mov     ebp,[PARAM_DIVISOR]
    mov     [SAVE_EDI],edi
    mov     edx,[PARAM_DST]
    mov     eax,[-4+esi+ebx*4]	;  src high limb 
    xor     edi,edi				;  initial carry (if can't skip a div) 
	lea     edx,[8+edx+ecx*4]	;  &dst[xsize+2] 
	xor     ecx,ecx				;  dst high limb value if the div is skipped 
    mov     [VAR_DST_STOP],edx	;  &dst[xsize+2] 
    cmp     eax,ebp				;  high cmp divisor,carry set if high<divisor 
	cmovc	edi,eax				;  high limb becomes the carry if high<divisor 
	cmovnc	ecx,eax				;  0 if skip div,src high if not 
								;  (the latter in case src==dst) 
    mov     [-12+edx+ebx*4],ecx	;  dst high limb,&dst[xsize+size-1] 
	sbb     ebx,0				;  skip one division if high<divisor 
								;  (carry from the cmp is still intact) 
    mov     ecx,[PARAM_PREINV_SHIFT]
    lea     edx,[-8+edx+ebx*4]	;  &dst[xsize+size] 
    mov     eax,32
    mov     [VAR_DST],edx		;  &dst[xsize+size] 
    shl     ebp,cl				;  d normalized 
    sub     eax,ecx				;  32-shift 
    mov     [VAR_NORM],ecx
    movd    mm7,eax				;  rshift 
    mov     eax,[PARAM_PREINV_INVERSE]
    jmp     Lstart_preinv

	align   16

%define       frame   0 

;  Entry point with an explicit initial carry (PARAM_CARRY).  Unlike 
;  ___gmpn_divrem_1 no attempt is made to skip the high limb division.  It 
;  saves ebx/esi/edi/ebp and jumps to Lstart_1c with: 
;    ebx xsize,ecx size,edx carry,esi src,edi &dst[xsize-1],ebp divisor 

___gmpn_divrem_1c: 
    mov     edx,[PARAM_CARRY]
    mov     ecx,[PARAM_SIZE]
    sub     esp,STACK_SPACE
;  esp has moved: rebase the PARAM_/SAVE_/VAR_ offsets from here on. 
%define frame   STACK_SPACE 
    mov     [SAVE_EBX],ebx
    mov     ebx,[PARAM_XSIZE]
    mov     [SAVE_EDI],edi
    mov     edi,[PARAM_DST]
    mov     [SAVE_EBP],ebp
    mov     ebp,[PARAM_DIVISOR]
    mov     [SAVE_ESI],esi
    mov     esi,[PARAM_SRC]
    lea     edi,[-4+edi+ebx*4]	;  &dst[xsize-1] 
    jmp     Lstart_1c

;  offset 0x31,close enough to aligned 

%define       frame   0 

;  Entry point with a zero initial carry.  When size is non-zero and the 
;  high src limb is below the divisor,the high quotient limb is known to be 
;  zero: it is stored directly,the high limb becomes the carry and size is 
;  reduced by one.  Execution then falls through to Lstart_1c. 

___gmpn_divrem_1: 
    mov     ecx,[PARAM_SIZE]
    mov     edx,0				;  initial carry (if can't skip a div) 
    sub     esp,STACK_SPACE
;  esp has moved: rebase the PARAM_/SAVE_/VAR_ offsets from here on. 
%define frame   STACK_SPACE 
    mov     [SAVE_EBP],ebp
    mov     ebp,[PARAM_DIVISOR]
    mov     [SAVE_EBX],ebx
    mov     ebx,[PARAM_XSIZE]
    mov     [SAVE_ESI],esi
    mov     esi,[PARAM_SRC]
    or      ecx,ecx				;  size,sets ZF for the jz below 
    mov     [SAVE_EDI],edi		;  (mov and lea leave the flags alone) 
    mov     edi,[PARAM_DST]
    lea     edi,[-4+edi+ebx*4]	;  &dst[xsize-1] 
    jz      Lno_skip_div		;  if size==0 
    mov     eax,[-4+esi+ecx*4]	;  src high limb 
    xor     esi,esi				;  esi borrowed as the zero quotient limb 
    cmp     eax,ebp				;  high cmp divisor,carry set if high<divisor 
	cmovc	edx,eax				;  high limb becomes the carry if high<divisor 
	cmovnc	esi,eax				;  0 if skip div,src high if not 
								;  (the latter in case src==dst) 
    mov     [edi+ecx*4],esi		;  dst high limb,&dst[xsize+size-1] 
    sbb     ecx,0				;  size-1 if high<divisor 
    mov     esi,[PARAM_SRC]		;  reload 
Lno_skip_div: 

;  eax  
;  ebx xsize 
;  ecx size 
;  edx carry 
;  esi src 
;  edi &dst[xsize-1] 
;  ebp divisor 

;  Choose the method.  With size+xsize >= MUL_THRESHOLD (unsigned compare) 
;  the multiply-by-inverse code is used,otherwise the plain div loops 
;  below.  ecx is the number of integer limbs still to divide and can be 
;  zero,in which case the integer loop is bypassed. 

Lstart_1c: 
    lea     eax,[ebx+ecx]		;  size+xsize 
    cmp     eax,MUL_THRESHOLD
    jae     Lmul_by_inverse
    or      ecx,ecx				;  any integer limbs? 
    jz      Ldivide_no_integer

;  eax scratch (quotient) 
;  ebx xsize 
;  ecx counter 
;  edx scratch (remainder) 
;  esi src 
;  edi &dst[xsize-1] 
;  ebp divisor 

;  Plain division of the integer limbs,high to low.  Each div divides 
;  edx:eax (previous remainder : src limb) by the divisor; the quotient goes 
;  to dst[xsize+ecx-1] and the remainder stays in edx for the next limb. 

Ldivide_integer: 
    mov     eax,[-4+esi+ecx*4]	;  src[ecx-1] 
    div     ebp
    mov     [edi+ecx*4],eax		;  dst[xsize+ecx-1] 
    dec     ecx
    jnz     Ldivide_integer
Ldivide_no_integer: 
    mov     edi,[PARAM_DST]		;  dst base for the fraction limbs 
    or      ebx,ebx				;  xsize 
    jnz     Ldivide_fraction
;  Epilogue for the plain div path: restore the saved registers and return 
;  the remainder.  No MMX register is touched on this path,so unlike 
;  Lzero_done there is no emms here. 
Ldivide_done: 
    mov     esi,[SAVE_ESI]
    mov     edi,[SAVE_EDI]
    mov     ebx,[SAVE_EBX]
    mov     eax,edx				;  return value is the remainder 
    mov     ebp,[SAVE_EBP]
    add     esp,STACK_SPACE
    ret

;  eax scratch (quotient) 
;  ebx counter 
;  ecx 
;  edx scratch (remainder) 
;  esi 
;  edi dst 
;  ebp divisor 

;  Plain division of the xsize fraction limbs.  Each div divides edx:0 
;  (remainder followed by a zero limb) by the divisor; the quotient goes to 
;  dst[ebx-1] with ebx counting down from xsize to 1. 

Ldivide_fraction: 
    mov     eax,0				;  low limb of the dividend is zero 
    div     ebp
    mov     [-4+edi+ebx*4],eax	;  dst[ebx-1] 
    dec     ebx
    jnz     Ldivide_fraction
    jmp     Ldivide_done

;  eax 
;  ebx xsize 
;  ecx size 
;  edx carry 
;  esi src 
;  edi &dst[xsize-1] 
;  ebp divisor 

;  Set up for the multiply-by-inverse loops: record the dst pointers, 
;  normalize the divisor (shift left until its top bit is set) and compute 
;  its inverse with a single div.  l is the number of leading zero bits of 
;  the divisor; bsr leaves its result undefined for a zero source,so the 
;  divisor is assumed non-zero.  Falls through to Lstart_preinv. 

Lmul_by_inverse: 
    lea     ebx,[12+edi]   ;  &dst[xsize+2],loop dst stop 
    mov     [VAR_DST_STOP],ebx
    lea     edi,[4+edi+ecx*4] ;  &dst[xsize+size] 
    mov     [VAR_DST],edi
    mov     ebx,ecx         ;  size 
    bsr     ecx,ebp         ;  31-l (index of the highest set bit) 
    mov     edi,edx         ;  carry 
    lea     eax,[1+ecx]    ;  32-l 
    xor     ecx,31         ;  l 
    mov     [VAR_NORM],ecx
    mov     edx,-1
    shl     ebp,cl          ;  d normalized 
    movd    mm7,eax         ;  rshift = 32-l 
    mov     eax,-1
    sub     edx,ebp         ;  (b-d)-1 giving edx:eax = b*(b-d)-1 
    div     ebp             ;  floor (b*(b-d)-1) / d 

;  eax inverse 
;  ebx size 
;  ecx shift 
;  edx 
;  esi src 
;  edi carry 
;  ebp divisor 
;
;  mm7 rshift 

;  Common start of the multiply-by-inverse code,reached by fall through 
;  from Lmul_by_inverse or by jump from ___gmpn_preinv_divrem_1.  Stores the 
;  inverse,points VAR_SRC at &src[size-3] and dispatches on size: 
;    0 -> Lstart_zero,1 -> Lstart_one,2 -> Linteger_two_left, 
;    3 or more -> Linteger_top 
;  For size >= 2 the first n2 and n10 are formed here by shifting 
;  carry:high:second-high left by l bits. 

Lstart_preinv: 
    mov     [VAR_INVERSE],eax
    or      ebx,ebx         ;  size,sets ZF for the jz below 
    lea     eax,[-12+esi+ebx*4] ;  &src[size-3] (lea/mov keep the flags) 
    mov     [VAR_SRC],eax
    jz      Lstart_zero
    mov     esi,[8+eax]    ;  src high limb 
    cmp     ebx,1
    jz      Lstart_one
Lstart_two_or_more: 
    mov     edx,[4+eax]    ;  src second highest limb 
	shld	edi,esi,cl      ;  n2 = carry:high << l 
	shld	esi,edx,cl      ;  n10 = high:second << l 
    cmp     ebx,2
    je      Linteger_two_left
    jmp     Linteger_top

;  size==1: only the high limb exists,so n10 is shifted in with zeros 
;  rather than with bits from a lower limb. 

Lstart_one: 
	shld	edi,esi,cl      ;  n2 = carry:high << l 
    shl     esi,cl          ;  n10 = high << l 
    jmp     Linteger_one_left

;  size==0: there are no integer limbs to divide,only the carry. 

Lstart_zero: 
;  Can be here with xsize==0 if mpn_preinv_divrem_1 had size==1 and 
;  skipped a division. 

    shl     edi,cl          ;  n2 = carry << l 
    mov     eax,edi         ;  return value for zero_done (which shifts it 
                            ;  back right by cl) 
    cmp     [PARAM_XSIZE],dword 0
    je      Lzero_done      ;  no fraction limbs either,just return 
    jmp     Lfraction_some

;  This loop runs at about 25 cycles,which is probably sub-optimal,and 
;  certainly more than the dependent chain would suggest.  A better loop,or 
;  a better rough analysis of what's possible,would be welcomed. 
; 
;  In the current implementation,the following successively dependent 
;  micro-ops seem to exist. 
; 
;                     uops 
;              n2+n1   1   (addl) 
;              mul     5 
;              q1+1    3   (addl/adcl) 
;              mul     5 
;              sub     3   (subl/sbbl) 
;              addback 2   (cmov) 
;                     --- 
;                     19 
; 
;  Lack of registers hinders explicit scheduling and it might be that the 
;  normal out of order execution isn't able to hide enough under the mul 
;  latencies. 
; 
;  Using sarl/negl to pick out n1 for the n2+n1 stage is a touch faster than 
;  cmov (and takes one uop off the dependent chain).  A sarl/andl/addl 
;  combination was tried for the addback (despite the fact it would lengthen 
;  the dependent chain) but found to be no faster. 

;  eax scratch 
;  ebx scratch (nadj,q1) 
;  ecx scratch (src,dst) 
;  edx scratch 
;  esi n10 
;  edi n2 
;  ebp d 
;
;  mm0 scratch (src qword) 
;  mm7 rshift for normalization 

	align   16
;  Main integer loop,one quotient limb per iteration,entered with at least 
;  three src limbs still to process.  Each pass loads the two src limbs at 
;  VAR_SRC as a qword and shifts them right by mm7 to form the next 
;  normalized n10,steps VAR_SRC and VAR_DST down by one limb,and stops when 
;  VAR_DST reaches VAR_DST_STOP (&dst[xsize+2]),leaving two limbs for the 
;  code below.  Several instructions are placed between a flag-setting 
;  instruction and its consumer; they are all mov/lea/cmov/MMX,none of 
;  which modify the flags. 

Linteger_top: 
    mov     eax,esi
    mov     ebx,ebp
    sar     eax,31				;  -n1 
    mov     ecx,[VAR_SRC]
    and     ebx,eax				;  -n1 & d 
    neg     eax					;  n1 
    add     ebx,esi				;  nadj = n10 + (-n1 & d),ignoring overflow 
    add     eax,edi				;  n2+n1 
    movq    mm0,[ecx]			;  next src limb and the one below it 
    mul     dword [VAR_INVERSE] ;  m*(n2+n1) 
    sub     ecx,4
    mov     [VAR_SRC],ecx
    add     eax,ebx				;  m*(n2+n1) + nadj,low giving carry flag 
    mov     eax,ebp				;  d 
    lea     ebx,[1+edi]			;  n2+1 
    adc     ebx,edx				;  1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 
    jz      Lq1_ff				;  q1+1 wrapped to zero,i.e. q1 = 0xFFFFFFFF 
    mul     ebx					;  (q1+1)*d 
    mov     ecx,[VAR_DST]
    psrlq   mm0,mm7				;  low dword becomes the next normalized n10 
    sub     esi,eax
    mov     eax,[VAR_DST_STOP]
    sbb     edi,edx				;  n - (q1+1)*d,only the borrow is wanted 
    mov     edi,esi				;  remainder -> n2 
    lea     edx,[ebp+esi]		;  remainder + d 
	cmovc	edi,edx				;  n - q1*d if underflow from using q1+1 
    movd    esi,mm0				;  next n10 
    sbb     ebx,0    ;  q (borrow from the sbb above is still in carry) 
    sub     ecx,4
    mov     [ecx],ebx			;  store quotient limb 
    cmp     ecx,eax				;  reached &dst[xsize+2]? 
    mov     [VAR_DST],ecx
    jne     Linteger_top
;  Loop exit; Lq1_ff also jumps here.  Execution continues into 
;  Linteger_two_left. 
Linteger_loop_done: 
 
;  Here,and in integer_one_left below,an sbb ebx,0 (sbbl $0 in the AT&T 
;  original) is used rather than a jz q1_ff special case.  This makes the 
;  code a bit smaller and simpler,and costs only 2 cycles (each). 

;  eax scratch 
;  ebx scratch (nadj,q1) 
;  ecx scratch (src,dst) 
;  edx scratch 
;  esi n10 
;  edi n2 
;  ebp divisor 
;
;  mm7 rshift 

;  Second-last integer limb.  Same quotient step as Linteger_top,except 
;  that a q1+1 which wraps is corrected with sbb instead of branching,and 
;  the next n10 is built from the lowest src limb alone: it is loaded with 
;  movd,moved to the high dword and shifted right by mm7,which yields 
;  src[0] << l in the low dword.  The quotient goes to dst[xsize+1] and 
;  execution falls through to Linteger_one_left. 

Linteger_two_left: 
    mov     eax,esi
    mov     ebx,ebp
    sar     eax,31				;  -n1 
    mov     ecx,[PARAM_SRC]
    and     ebx,eax				;  -n1 & d 
    neg     eax					;  n1 
    add     ebx,esi				;  nadj = n10 + (-n1 & d),ignoring overflow 
    add     eax,edi				;  n2+n1 
    mul     dword [VAR_INVERSE] ;  m*(n2+n1) 
    movd    mm0,[ecx]			;  src low limb 
    mov     ecx,[VAR_DST_STOP]
    add     eax,ebx				;  m*(n2+n1) + nadj,low giving carry flag 
    lea     ebx,[1+edi]			;  n2+1 
    mov     eax,ebp				;  d 
    adc     ebx,edx				;  1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 
    sbb     ebx,0				;  q1 if q1+1 overflowed 
    mul     ebx					;  (q1+1)*d 
    psllq   mm0,32				;  src[0] into the high dword 
    psrlq   mm0,mm7				;  low dword = src[0] << l 
    sub     esi,eax
    sbb     edi,edx				;  n - (q1+1)*d,only the borrow is wanted 
    mov     edi,esi				;  remainder -> n2 
    lea     edx,[ebp+esi]		;  remainder + d 
	cmovc	edi,edx				;  n - q1*d if underflow from using q1+1 
    movd    esi,mm0				;  next n10 
    sbb     ebx,0				;  q 
    mov     [-4+ecx],ebx		;  dst[xsize+1] 

;  eax scratch 
;  ebx scratch (nadj,q1) 
;  ecx scratch (dst) 
;  edx scratch 
;  esi n10 
;  edi n2 
;  ebp divisor 
;
;  mm7 rshift 

;  Last integer limb.  Same quotient step again,with no further src limb to 
;  fetch.  The quotient goes to dst[xsize].  If xsize is non-zero the 
;  remainder in edi is carried into the fraction loop,otherwise it is moved 
;  to eax and execution falls through to the epilogue at Lfraction_done. 

Linteger_one_left: 
    mov     eax,esi
    mov     ebx,ebp
    sar     eax,31				;  -n1 
    mov     ecx,[VAR_DST_STOP]
    and     ebx,eax				;  -n1 & d 
    neg     eax					;  n1 
    add     ebx,esi				;  nadj = n10 + (-n1 & d),ignoring overflow 
    add     eax,edi				;  n2+n1 
    mul     dword [VAR_INVERSE]	;  m*(n2+n1) 
    add     eax,ebx				;  m*(n2+n1) + nadj,low giving carry flag 
    lea     ebx,[1+edi]			;  n2+1 
    mov     eax,ebp				;  d 
    adc     ebx,edx				;  1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 
    sbb     ebx,0				;  q1 if q1+1 overflowed 
    mul     ebx					;  (q1+1)*d 
    sub     esi,eax
    mov     eax,[PARAM_XSIZE]
    sbb     edi,edx				;  n - (q1+1)*d,only the borrow is wanted 
    mov     edi,esi				;  remainder -> n2 
    lea     edx,[ebp+esi]		;  remainder + d 
	cmovc	edi,edx				;  n - q1*d if underflow from using q1+1 
    sbb     ebx,0				;  q 
    mov     [-8+ecx],ebx		;  dst[xsize] 
    sub     ecx,8				;  &dst[xsize] 
    or      eax,eax				;  xsize 
    jnz     Lfraction_some
    mov     eax,edi				;  remainder,still normalized 
;  Epilogue for the multiply-by-inverse paths.  eax holds the remainder 
;  still shifted left by l; it is shifted back right by cl before 
;  returning.  At Lfraction_done the shift is reloaded from VAR_NORM,which 
;  has to happen before esp is restored because VAR_NORM is esp relative. 
;  Lstart_zero enters at Lzero_done with the shift still live in ecx. 

Lfraction_done: 
    mov     ecx,[VAR_NORM]
Lzero_done: 
    mov     ebp,[SAVE_EBP]
    mov     edi,[SAVE_EDI]
    mov     esi,[SAVE_ESI]
    mov     ebx,[SAVE_EBX]
    add     esp,STACK_SPACE
    shr     eax,cl				;  un-normalize the remainder 
    emms						;  leave MMX state clean (mm0/mm7 were used) 
    ret

;  Special case for q1=0xFFFFFFFF,giving q=0xFFFFFFFF meaning the low dword 
;  of q*d is simply -d and the remainder n-q*d = n10+d 
;
;  eax (divisor) 
;  ebx (q1+1 == 0) 
;  ecx 
;  edx 
;  esi n10 
;  edi n2 
;  ebp divisor 

;  Reached from Linteger_top when q1+1 wrapped to zero.  Performs the rest 
;  of that loop iteration without the second mul: stores a quotient limb of 
;  0xFFFFFFFF,sets the next n2 to n10+d,takes the next n10 from mm0,steps 
;  VAR_DST and then either re-enters the loop or leaves it,using the same 
;  stop test as Linteger_top. 

Lq1_ff: 
    mov     ecx,[VAR_DST]
    mov     edx,[VAR_DST_STOP]
    sub     ecx,4
    mov     [VAR_DST],ecx
    psrlq   mm0,mm7				;  low dword becomes the next normalized n10 
    lea     edi,[ebp+esi]		;  n-q*d remainder -> next n2 
    mov     [ecx],dword -1		;  quotient limb 0xFFFFFFFF 
    movd    esi,mm0				;  next n10 
    cmp     edx,ecx				;  reached &dst[xsize+2]? 
    jne     Linteger_top
    jmp     Linteger_loop_done

; 
;  In the current implementation,the following successively dependent 
;  micro-ops seem to exist. 
; 
;                     uops 
;              mul     5 
;              q1+1    1   (addl) 
;              mul     5 
;              sub     3   (negl/sbbl) 
;              addback 2   (cmov) 
;                     --- 
;                     16 
; 
;  The loop in fact runs at about 17.5 cycles.  Using a sarl/andl/addl for 
;  the addback was found to be a touch slower. 

;  eax 
;  ebx 
;  ecx 
;  edx 
;  esi 
;  edi carry 
;  ebp divisor 

	align   16
;  Set up the fraction loop: esi is the stop pointer (dst),ecx starts at 
;  &dst[xsize] and is decremented before each store,eax and edi both hold 
;  the normalized remainder n2.  Only entered with xsize != 0. 

Lfraction_some: 
    mov     esi,[PARAM_DST]		;  dst,loop stop point 
    mov     ecx,[VAR_DST_STOP]	;  &dst[xsize+2] 
    mov     eax,edi				;  n2 
    sub     ecx,8				;  &dst[xsize] 

;  eax n2,then scratch 
;  ebx scratch (nadj,q1) 
;  ecx dst,decrementing 
;  edx scratch 
;  esi dst stop point 
;  edi n2 
;  ebp divisor 

	align   16
;  Fraction loop,one quotient limb per iteration,storing from dst[xsize-1] 
;  down to dst[0].  The dividend is n2 followed by a zero limb,so there is 
;  no n1/nadj term and the low limb of n - (q1+1)*d is just the negated low 
;  product.  On exit eax holds the final normalized remainder for 
;  Lfraction_done. 

Lfraction_top: 
    mul     dword [VAR_INVERSE]	;  m*n2 
    mov     eax,ebp				;  d 
    sub     ecx,4				;  dst 
    lea     ebx,[edi+1]			;  n2+1 
    add     ebx,edx				;  1 + high(n2<<32 + m*n2) = q1+1 
    mul     ebx					;  (q1+1)*d 
    neg     eax					;  low of n - (q1+1)*d 
    sbb     edi,edx				;  high of n - (q1+1)*d,caring only about carry 
    lea     edx,[ebp+eax]		;  remainder + d 
	cmovc	eax,edx				;  n - q1*d if underflow from using q1+1 
    sbb     ebx,0				;  q 
    mov     edi,eax				;  remainder->n2 
    cmp     ecx,esi				;  reached dst[0]? (mov keeps the flags) 
    mov     [ecx],ebx			;  previous q 
    jne     Lfraction_top
    jmp     Lfraction_done

	end
update win32 builds 2012-02-12 18:40:14 -05:00
			`; Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.`
			`;`
			`; This file is part of the GNU MP Library.`
			`;`
			`; The GNU MP Library is free software; you can redistribute it and/or`
			`; modify it under the terms of the GNU Lesser General Public License as`
			`; published by the Free Software Foundation; either version 2.1 of the`
			`; License, or (at your option) any later version.`
			`;`
			`; The GNU MP Library is distributed in the hope that it will be useful,`
			`; but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`; Lesser General Public License for more details.`
			`;`
			`; You should have received a copy of the GNU Lesser General Public`
			`; License along with the GNU MP Library; see the file COPYING.LIB. If`
			`; not, write to the Free Software Foundation, Inc., 59 Temple Place -`
			`; Suite 330, Boston, MA 02111-1307, USA.`
			`;`
			`; Translation of AT&T syntax code by Brian Gladman`

correct 32-bit build error reporteed by Case 2012-03-19 05:46:25 -04:00			`%include "..\x86i.inc"`
update win32 builds 2012-02-12 18:40:14 -05:00
			`global ___gmpn_preinv_divrem_1`
			`global ___gmpn_divrem_1c`
			`global ___gmpn_divrem_1`

			`%ifdef DLL`
			`export ___gmpn_divrem_1c`
			`export ___gmpn_divrem_1`
			`%endif`

			`%define MUL_THRESHOLD 4`
			`%define PARAM_PREINV_SHIFT esp+frame+28`
			`%define PARAM_PREINV_INVERSE esp+frame+24`
			`%define PARAM_CARRY esp+frame+24`
			`%define PARAM_DIVISOR esp+frame+20`
			`%define PARAM_SIZE esp+frame+16`
			`%define PARAM_SRC esp+frame+12`
			`%define PARAM_XSIZE esp+frame+8`
			`%define PARAM_DST esp+frame+4`

			`%define SAVE_EBX esp+frame-4`
			`%define SAVE_ESI esp+frame-8`
			`%define SAVE_EDI esp+frame-12`
			`%define SAVE_EBP esp+frame-16`

			`%define VAR_NORM esp+frame-20`
			`%define VAR_INVERSE esp+frame-24`
			`%define VAR_SRC esp+frame-28`
			`%define VAR_DST esp+frame-32`
			`%define VAR_DST_STOP esp+frame-36`
			`%define STACK_SPACE 36`
			`%define frame 0`

			`section .text`

			`align 16`

			`___gmpn_preinv_divrem_1:`
			`mov ecx,[PARAM_XSIZE]`
			`sub esp,STACK_SPACE`
			`FR_sesp STACK_SPACE`
			`mov [SAVE_ESI],esi`
			`mov esi,[PARAM_SRC]`
			`mov [SAVE_EBX],ebx`
			`mov ebx,[PARAM_SIZE]`
			`mov [SAVE_EBP],ebp`
			`mov ebp,[PARAM_DIVISOR]`
			`mov [SAVE_EDI],edi`
			`mov edx,[PARAM_DST]`
			`mov eax,[-4+esi+ebx*4] ; src high limb`
			`xor edi,edi ; initial carry (if can't skip a div)`
			`lea edx,[8+edx+ecx*4] ; &dst[xsize+2]`
			`xor ecx,ecx`
			`mov [VAR_DST_STOP],edx ; &dst[xsize+2]`
			`cmp eax,ebp ; high cmp divisor`
			`cmovc edi,eax`
			`cmovnc ecx,eax ; (the latter in case src==dst)`
			`mov [-12+edx+ebx*4],ecx ; dst high limb`
			`sbb ebx,0 ; skip one division if high<divisor`
			`mov ecx,[PARAM_PREINV_SHIFT]`
			`lea edx,[-8+edx+ebx*4] ; &dst[xsize+size]`
			`mov eax,32`
			`mov [VAR_DST],edx ; &dst[xsize+size]`
			`shl ebp,cl ; d normalized`
			`sub eax,ecx`
			`mov [VAR_NORM],ecx`
			`movd mm7,eax ; rshift`
			`mov eax,[PARAM_PREINV_INVERSE]`
			`jmp Lstart_preinv`

			`align 16`

			`%define frame 0`

			`___gmpn_divrem_1c:`
			`mov edx,[PARAM_CARRY]`
			`mov ecx,[PARAM_SIZE]`
			`sub esp,STACK_SPACE`
			`%define frame STACK_SPACE`
			`mov [SAVE_EBX],ebx`
			`mov ebx,[PARAM_XSIZE]`
			`mov [SAVE_EDI],edi`
			`mov edi,[PARAM_DST]`
			`mov [SAVE_EBP],ebp`
			`mov ebp,[PARAM_DIVISOR]`
			`mov [SAVE_ESI],esi`
			`mov esi,[PARAM_SRC]`
			`lea edi,[-4+edi+ebx*4]`
			`jmp Lstart_1c`

			`; offset 0x31,close enough to aligned`

			`%define frame 0`

			`___gmpn_divrem_1:`
			`mov ecx,[PARAM_SIZE]`
			`mov edx,0 ; initial carry (if can't skip a div)`
			`sub esp,STACK_SPACE`
			`%define frame STACK_SPACE`
			`mov [SAVE_EBP],ebp`
			`mov ebp,[PARAM_DIVISOR]`
			`mov [SAVE_EBX],ebx`
			`mov ebx,[PARAM_XSIZE]`
			`mov [SAVE_ESI],esi`
			`mov esi,[PARAM_SRC]`
			`or ecx,ecx ; size`
			`mov [SAVE_EDI],edi`
			`mov edi,[PARAM_DST]`
			`lea edi,[-4+edi+ebx*4] ; &dst[xsize-1]`
			`jz Lno_skip_div ; if size==0`
			`mov eax,[-4+esi+ecx*4] ; src high limb`
			`xor esi,esi`
			`cmp eax,ebp ; high cmp divisor`
			`cmovc edx,eax`
			`cmovnc esi,eax ; (the latter in case src==dst)`
			`mov [edi+ecx*4],esi ; dst high limb`
			`sbb ecx,0 ; size-1 if high<divisor`
			`mov esi,[PARAM_SRC] ; reload`
			`Lno_skip_div:`

			`; eax`
			`; ebx xsize`
			`; ecx size`
			`; edx carry`
			`; esi src`
			`; edi &dst[xsize-1]`
			`; ebp divisor`

			`Lstart_1c:`
			`lea eax,[ebx+ecx] ; size+xsize`
			`cmp eax,MUL_THRESHOLD`
			`jae Lmul_by_inverse`
			`or ecx,ecx`
			`jz Ldivide_no_integer`

			`; eax scratch (quotient)`
			`; ebx xsize`
			`; ecx counter`
			`; edx scratch (remainder)`
			`; esi src`
			`; edi &dst[xsize-1]`
			`; ebp divisor`

			`Ldivide_integer:`
			`mov eax,[-4+esi+ecx*4]`
			`div ebp`
			`mov [edi+ecx*4],eax`
			`dec ecx`
			`jnz Ldivide_integer`
			`Ldivide_no_integer:`
			`mov edi,[PARAM_DST]`
			`or ebx,ebx`
			`jnz Ldivide_fraction`
			`Ldivide_done:`
			`mov esi,[SAVE_ESI]`
			`mov edi,[SAVE_EDI]`
			`mov ebx,[SAVE_EBX]`
			`mov eax,edx`
			`mov ebp,[SAVE_EBP]`
			`add esp,STACK_SPACE`
			`ret`

			`; eax scratch (quotient)`
			`; ebx counter`
			`; ecx`
			`; edx scratch (remainder)`
			`; esi`
			`; edi dst`
			`; ebp divisor`

			`Ldivide_fraction:`
			`mov eax,0`
			`div ebp`
			`mov [-4+edi+ebx*4],eax`
			`dec ebx`
			`jnz Ldivide_fraction`
			`jmp Ldivide_done`

			`; eax`
			`; ebx xsize`
			`; ecx size`
			`; edx carry`
			`; esi src`
			`; edi &dst[xsize-1]`
			`; ebp divisor`

			`Lmul_by_inverse:`
			`lea ebx,[12+edi] ; &dst[xsize+2],loop dst stop`
			`mov [VAR_DST_STOP],ebx`
			`lea edi,[4+edi+ecx*4] ; &dst[xsize+size]`
			`mov [VAR_DST],edi`
			`mov ebx,ecx ; size`
			`bsr ecx,ebp ; 31-l`
			`mov edi,edx ; carry`
			`lea eax,[1+ecx] ; 32-l`
			`xor ecx,31 ; l`
			`mov [VAR_NORM],ecx`
			`mov edx,-1`
			`shl ebp,cl ; d normalized`
			`movd mm7,eax`
			`mov eax,-1`
			`sub edx,ebp ; (b-d)-1 giving edx:eax = b*(b-d)-1`
			`div ebp ; floor (b*(b-d)-1) / d`

			`; eax inverse`
			`; ebx size`
			`; ecx shift`
			`; edx`
			`; esi src`
			`; edi carry`
			`; ebp divisor`
			`;`
			`; mm7 rshift`

			`Lstart_preinv:`
			`mov [VAR_INVERSE],eax`
			`or ebx,ebx ; size`
			`lea eax,[-12+esi+ebx*4] ; &src[size-3]`
			`mov [VAR_SRC],eax`
			`jz Lstart_zero`
			`mov esi,[8+eax] ; src high limb`
			`cmp ebx,1`
			`jz Lstart_one`
			`Lstart_two_or_more:`
			`mov edx,[4+eax] ; src second highest limb`
			`shld edi,esi,cl`
			`shld esi,edx,cl`
			`cmp ebx,2`
			`je Linteger_two_left`
			`jmp Linteger_top`

			`Lstart_one:`
			`shld edi,esi,cl`
			`shl esi,cl ; n10 = high << l`
			`jmp Linteger_one_left`

			`Lstart_zero:`
			`; Can be here with xsize==0 if mpn_preinv_divrem_1 had size==1 and`
			`; skipped a division.`

			`shl edi,cl ; n2 = carry << l`
			`mov eax,edi ; return value for zero_done`
			`cmp [PARAM_XSIZE],dword 0`
			`je Lzero_done`
			`jmp Lfraction_some`

			`; This loop runs at about 25 cycles,which is probably sub-optimal,and`
			`; certainly more than the dependent chain would suggest. A better loop,or`
			`; a better rough analysis of what's possible,would be welcomed.`
			`;`
			`; In the current implementation,the following successively dependent`
			`; micro-ops seem to exist.`
			`;`
			`; uops`
			`; n2+n1 1 (addl)`
			`; mul 5`
			`; q1+1 3 (addl/adcl)`
			`; mul 5`
			`; sub 3 (subl/sbbl)`
			`; addback 2 (cmov)`
			`; ---`
			`; 19`
			`;`
			`; Lack of registers hinders explicit scheduling and it might be that the`
			`; normal out of order execution isn't able to hide enough under the mul`
			`; latencies.`
			`;`
			`; Using sarl/negl to pick out n1 for the n2+n1 stage is a touch faster than`
			`; cmov (and takes one uop off the dependent chain). A sarl/andl/addl`
			`; combination was tried for the addback (despite the fact it would lengthen`
			`; the dependent chain) but found to be no faster.`

			`; eax scratch`
			`; ebx scratch (nadj,q1)`
			`; ecx scratch (src,dst)`
			`; edx scratch`
			`; esi n10`
			`; edi n2`
			`; ebp d`
			`;`
			`; mm0 scratch (src qword)`
			`; mm7 rshift for normalization`

			`align 16`
			`Linteger_top:`
			`mov eax,esi`
			`mov ebx,ebp`
			`sar eax,31 ; -n1`
			`mov ecx,[VAR_SRC]`
			`and ebx,eax ; -n1 & d`
			`neg eax ; n1`
			`add ebx,esi ; nadj = n10 + (-n1 & d),ignoring overflow`
			`add eax,edi ; n2+n1`
			`movq mm0,[ecx] ; next src limb and the one below it`
			`mul dword [VAR_INVERSE] ; m*(n2+n1)`
			`sub ecx,4`
			`mov [VAR_SRC],ecx`
			`add eax,ebx ; m*(n2+n1) + nadj,low giving carry flag`
			`mov eax,ebp ; d`
			`lea ebx,[1+edi] ; n2+1`
			`adc ebx,edx ; 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1`
			`jz Lq1_ff`
			`mul ebx ; (q1+1)*d`
			`mov ecx,[VAR_DST]`
			`psrlq mm0,mm7`
			`sub esi,eax`
			`mov eax,[VAR_DST_STOP]`
			`sbb edi,edx ; n - (q1+1)*d`
			`mov edi,esi ; remainder -> n2`
			`lea edx,[ebp+esi]`
			`cmovc edi,edx`
			`movd esi,mm0`
			`sbb ebx,0 ; q`
			`sub ecx,4`
			`mov [ecx],ebx`
			`cmp ecx,eax`
			`mov [VAR_DST],ecx`
			`jne Linteger_top`
			`Linteger_loop_done:`

			`; Here,and in integer_one_left below,an sbbl $0 is used rather than a jz`
			`; q1_ff special case. This make the code a bit smaller and simpler,and`
			`; costs only 2 cycles (each).`

			`; eax scratch`
			`; ebx scratch (nadj,q1)`
			`; ecx scratch (src,dst)`
			`; edx scratch`
			`; esi n10`
			`; edi n2`
			`; ebp divisor`
			`;`
			`; mm7 rshift`

			`Linteger_two_left:`
			`mov eax,esi`
			`mov ebx,ebp`
			`sar eax,31 ; -n1`
			`mov ecx,[PARAM_SRC]`
			`and ebx,eax ; -n1 & d`
			`neg eax ; n1`
			`add ebx,esi ; nadj = n10 + (-n1 & d),ignoring overflow`
			`add eax,edi ; n2+n1`
			`mul dword [VAR_INVERSE] ; m*(n2+n1)`
			`movd mm0,[ecx] ; src low limb`
			`mov ecx,[VAR_DST_STOP]`
			`add eax,ebx ; m*(n2+n1) + nadj,low giving carry flag`
			`lea ebx,[1+edi] ; n2+1`
			`mov eax,ebp ; d`
			`adc ebx,edx ; 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1`
			`sbb ebx,0`
			`mul ebx ; (q1+1)*d`
			`psllq mm0,32`
			`psrlq mm0,mm7`
			`sub esi,eax`
			`sbb edi,edx ; n - (q1+1)*d`
			`mov edi,esi ; remainder -> n2`
			`lea edx,[ebp+esi]`
			`cmovc edi,edx`
			`movd esi,mm0`
			`sbb ebx,0 ; q`
			`mov [-4+ecx],ebx`

			`; eax scratch`
			`; ebx scratch (nadj,q1)`
			`; ecx scratch (dst)`
			`; edx scratch`
			`; esi n10`
			`; edi n2`
			`; ebp divisor`
			`;`
			`; mm7 rshift`

			`Linteger_one_left:`
			`mov eax,esi`
			`mov ebx,ebp`
			`sar eax,31 ; -n1`
			`mov ecx,[VAR_DST_STOP]`
			`and ebx,eax ; -n1 & d`
			`neg eax ; n1`
			`add ebx,esi ; nadj = n10 + (-n1 & d),ignoring overflow`
			`add eax,edi ; n2+n1`
			`mul dword [VAR_INVERSE] ; m*(n2+n1)`
			`add eax,ebx ; m*(n2+n1) + nadj,low giving carry flag`
			`lea ebx,[1+edi] ; n2+1`
			`mov eax,ebp ; d`
			`adc ebx,edx ; 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1`
			`sbb ebx,0 ; q1 if q1+1 overflowed`
			`mul ebx`
			`sub esi,eax`
			`mov eax,[PARAM_XSIZE]`
			`sbb edi,edx ; n - (q1+1)*d`
			`mov edi,esi ; remainder -> n2`
			`lea edx,[ebp+esi]`
			`cmovc edi,edx`
			`sbb ebx,0 ; q`
			`mov [-8+ecx],ebx`
			`sub ecx,8`
			`or eax,eax ; xsize`
			`jnz Lfraction_some`
			`mov eax,edi`
			`Lfraction_done:`
			`mov ecx,[VAR_NORM]`
			`Lzero_done:`
			`mov ebp,[SAVE_EBP]`
			`mov edi,[SAVE_EDI]`
			`mov esi,[SAVE_ESI]`
			`mov ebx,[SAVE_EBX]`
			`add esp,STACK_SPACE`
			`shr eax,cl`
			`emms`
			`ret`

			`; Special case for q1=0xFFFFFFFF,giving q=0xFFFFFFFF meaning the low dword`
			`; of qd is simply -d and the remainder n-qd = n10+d`
			`;`
			`; eax (divisor)`
			`; ebx (q1+1 == 0)`
			`; ecx`
			`; edx`
			`; esi n10`
			`; edi n2`
			`; ebp divisor`

			`Lq1_ff:`
			`mov ecx,[VAR_DST]`
			`mov edx,[VAR_DST_STOP]`
			`sub ecx,4`
			`mov [VAR_DST],ecx`
			`psrlq mm0,mm7`
			`lea edi,[ebp+esi] ; n-q*d remainder -> next n2`
			`mov [ecx],dword -1`
			`movd esi,mm0 ; next n10`
			`cmp edx,ecx`
			`jne Linteger_top`
			`jmp Linteger_loop_done`

			`;`
			`; In the current implementation,the following successively dependent`
			`; micro-ops seem to exist.`
			`;`
			`; uops`
			`; mul 5`
			`; q1+1 1 (addl)`
			`; mul 5`
			`; sub 3 (negl/sbbl)`
			`; addback 2 (cmov)`
			`; ---`
			`; 16`
			`;`
			`; The loop in fact runs at about 17.5 cycles. Using a sarl/andl/addl for`
			`; the addback was found to be a touch slower.`

			`; eax`
			`; ebx`
			`; ecx`
			`; edx`
			`; esi`
			`; edi carry`
			`; ebp divisor`

			`align 16`
			`Lfraction_some:`
			`mov esi,[PARAM_DST]`
			`mov ecx,[VAR_DST_STOP] ; &dst[xsize+2]`
			`mov eax,edi`
			`sub ecx,8 ; &dst[xsize]`

			`; eax n2,then scratch`
			`; ebx scratch (nadj,q1)`
			`; ecx dst,decrementing`
			`; edx scratch`
			`; esi dst stop point`
			`; edi n2`
			`; ebp divisor`

			`align 16`
			`Lfraction_top:`
			`mul dword [VAR_INVERSE] ; m*n2`
			`mov eax,ebp ; d`
			`sub ecx,4 ; dst`
			`lea ebx,[edi+1]`
			`add ebx,edx ; 1 + high(n2<<32 + m*n2) = q1+1`
			`mul ebx ; (q1+1)*d`
			`neg eax ; low of n - (q1+1)*d`
			`sbb edi,edx ; high of n - (q1+1)*d,caring only about carry`
			`lea edx,[ebp+eax]`
			`cmovc eax,edx`
			`sbb ebx,0 ; q`
			`mov edi,eax ; remainder->n2`
			`cmp ecx,esi`
			`mov [ecx],ebx ; previous q`
			`jne Lfraction_top`
			`jmp Lfraction_done`

			`end`