; mpir/mpn/x86w/p3/divexact_by3c.asm


;  Copyright 2000, 2002 Free Software Foundation, Inc.
; 
;  This file is part of the GNU MP Library.
; 
;  The GNU MP Library is free software; you can redistribute it and/or
;  modify it under the terms of the GNU Lesser General Public License as
;  published by the Free Software Foundation; either version 2.1 of the
;  License, or (at your option) any later version.
; 
;  The GNU MP Library is distributed in the hope that it will be useful,
;  but WITHOUT ANY WARRANTY; without even the implied warranty of
;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;  Lesser General Public License for more details.
; 
;  You should have received a copy of the GNU Lesser General Public
;  License along with the GNU MP Library; see the file COPYING.LIB.  If
;  not, write to the Free Software Foundation, Inc., 59 Temple Place -
;  Suite 330, Boston, MA 02111-1307, USA.
;
; Translation of AT&T syntax code by Brian Gladman 

%include "..\x86i.inc" 

	global  ___gmpn_divexact_by3c 

%ifdef	DLL
	export	___gmpn_divexact_by3c
%endif

;  Stack-relative locations of the four arguments.  Each expands to
;  esp+frame+N and is only ever used inside [] as a memory operand.
;  NASM expands a %define at the point of use, so these pick up whatever
;  value `frame` has at that point, even though `frame` is defined after them.
;  NOTE(review): FR_push comes from x86i.inc (not visible here); it presumably
;  bumps `frame` by 4 per push so these offsets stay valid after the register
;  saves -- confirm against x86i.inc.
%define	PARAM_CARRY esp+frame+16 
%define PARAM_SIZE  esp+frame+12 
%define PARAM_SRC   esp+frame+8 
%define PARAM_DST   esp+frame+4 
%define	frame		0 

;   multiplicative inverse of 3,modulo 2^32 
;   ceil(b/3),ceil(b*2/3) and floor(b*2/3) where b=2^32 
;
;   INVERSE_3 is written as a negative literal: -0x55555555 is 0xAAAAAAAB
;   modulo 2^32, and 3*0xAAAAAAAB = 0x200000001, i.e. 1 modulo 2^32.
;   The ceil/floor constants are the thresholds a quotient limb is compared
;   against to work out the carry into the next limb.
%define	INVERSE_3		-0x55555555
%define	ONE_THIRD_CEIL		0x55555556
%define	TWO_THIRDS_CEIL		0xAAAAAAAB
%define	TWO_THIRDS_FLOOR	0xAAAAAAAA

	section .text

	align   8   

;  ___gmpn_divexact_by3c -- entry point and size==1 fast path.
;
;  Stack arguments (see the PARAM_* defines): dst pointer, src pointer,
;  size in limbs, carry-in.  Result carry is returned in eax.
;
;  Each quotient limb is q = (src limb - carry) * INVERSE_3 mod 2^32.
;  The carry passed on is the borrow from the subtract, plus 1 if
;  q >= ceil(b/3), plus 1 more if q >= ceil(b*2/3), where b = 2^32.
;
;  Sizes of two or more branch to Ltwo_or_more with ecx = src and
;  edx = size-1.  This path uses only eax, ecx and edx and pushes nothing,
;  so the PARAM_* offsets are used with frame = 0 throughout.
;  NOTE(review): size==0 is not handled here (dec gives -1, branch taken);
;  presumably callers guarantee size >= 1 -- confirm.
___gmpn_divexact_by3c: 
    mov     ecx,[PARAM_SRC]
    mov     edx,[PARAM_SIZE]
    dec     edx							;  size-1, ZF set only when size==1 
    jnz     Ltwo_or_more
    mov     edx,[ecx]					;  the single source limb 
    mov     eax,[PARAM_CARRY]			;  risk of cache bank clash here 
    mov     ecx,[PARAM_DST]
    sub     edx,eax						;  limb - carry-in, CF = borrow 
    sbb     eax,eax						;  0 or -1 
    imul    edx,edx,INVERSE_3			;  quotient limb 
    neg     eax							;  0 or 1 
    cmp     edx,ONE_THIRD_CEIL			;  CF set when edx < ceil(b/3) 
    sbb     eax,-1						;  +1 if edx>=ceil(b/3) 
    cmp     edx,TWO_THIRDS_CEIL			;  CF set when edx < ceil(b*2/3) 
    sbb     eax,-1						;  +1 if edx>=ceil(b*2/3) 
    mov     [ecx],edx					;  store quotient limb; mov leaves eax alone 
    ret

;  Ltwo_or_more -- general path for size >= 2: setup, main loop, final limb.
;
;  Register state on arrival from the entry code (blank = not yet in use):
;
;  eax 
;  ebx 
;  ecx src 
;  edx size-1 
;  esi 
;  edi 
;  ebp 

Ltwo_or_more: 
;  Save the four registers that are popped again before the ret below.
;  NOTE(review): FR_push is defined in x86i.inc (not visible here); the
;  matching plain pops at the end imply it pushes the register, and it
;  presumably also advances `frame` so PARAM_* below still resolve correctly.
	FR_push	ebx
	FR_push	esi
	FR_push	edi
	FR_push	ebp
    mov     edi,[PARAM_DST]
    mov     esi,[PARAM_CARRY]
    mov     eax,[ecx]				;  src low limb 
    xor     ebx,ebx					;  cleared before the sub so CF is not disturbed later 
	sub     eax,esi					;  low limb - carry-in, CF = borrow 
    mov     esi,TWO_THIRDS_FLOOR	;  mov and lea below leave CF intact for the adc 
    lea     ecx,[ecx+edx*4]			;  &src[size-1] 
    lea     edi,[edi+edx*4]			;  &dst[size-1] 
    adc     ebx,0					;  carry,0 or 1 
    neg     edx						;  -(size-1) 

;  The loop needs a source limb ready at the top,which leads to one limb 
;  handled separately at the end,and the special case above for size==1. 
;  There doesn't seem to be any scheduling that would keep the speed but move 
;  the source load and carry subtract up to the top. 
; 
;  The destination cache line prefetching adds 1 cycle to the loop but is 
;  considered worthwhile.  The slowdown is a factor of 1.07,but will prevent 
;  repeated write-throughs if the destination isn't in L1.  A version using 
;  an outer loop to prefetch only every 8 limbs (a cache line) proved to be 
;  no faster,due to unavoidable branch mispredictions in the inner loop. 
; 
;  setc is 2 cycles on P54,so an adc is used instead.  If the "mov ebx,0" 
;  could be avoided then the src limb fetch could pair up and save a cycle. 
;  This would probably mean going to a two limb loop with the carry limb 
;  alternately positive or negative,since an "sbb ebx,ebx" will leave a 
;  value which is in the opposite sense to the preceding "sbb/adc eax,ebx". 
;  Note the "mov ebx,0" cannot become "xor ebx,ebx": xor would clear the 
;  carry flag that the following adc reads. 
; 
;  A register is used for TWO_THIRDS_FLOOR because cmp cannot take an 
;  immediate as its first operand (there is no "cmp n,ebp" form),and the 
;  compare is wanted in that direction so CF is set when ebp>=ceil(b*2/3). 
; 
;  The "4" source displacement is in the loop rather than the setup because 
;  this gets Ltop aligned to 8 bytes at no cost. 

;  eax source limb,carry subtracted 
;  ebx carry (0 or 1) 
;  ecx &src[size-1] 
;  edx counter,limbs,negative 
;  esi TWO_THIRDS_FLOOR 
;  edi &dst[size-1] 
;  ebp scratch (result limb) 
;
;  The loop runs size-1 times (edx counts from -(size-1) up to 0).  Each pass 
;  stores dst[size-1+edx] and fetches src[size+edx] with the carry already 
;  subtracted,so the last source limb is left in eax when the loop exits. 

	align   8
Ltop: 
    imul    ebp,eax,INVERSE_3	;  quotient limb 
    cmp     ebp,ONE_THIRD_CEIL	;  CF set when ebp < ceil(b/3) 
    mov     eax,[edi+edx*4]		;  dst cache line prefetch (value discarded,eax reloaded below) 
    sbb     ebx,-1				;  +1 if ebp>=ceil(b/3) 
    cmp     esi,ebp				;  CF set when ebp > floor(b*2/3) 
    mov     eax,[4+ecx+edx*4]	;  next src limb 
    sbb     eax,ebx				;  and further -1 if ebp>=ceil(b*2/3) 
    mov     ebx,0				;  must be mov,not xor: CF from the sbb is still needed 
    adc     ebx,0				;  new carry 
    mov     [edi+edx*4],ebp		;  store quotient limb 
    inc     edx
    jnz     Ltop

;  Final limb: eax holds src[size-1] with the carry subtracted and ebx the 
;  borrow from that subtract.  The return carry is accumulated in ebx using 
;  the same two threshold tests as the size==1 path. 
    imul    edx,eax,INVERSE_3
    cmp     edx,ONE_THIRD_CEIL
    mov     [edi],edx			;  dst[size-1]; mov does not disturb CF 
    sbb     ebx,-1				;  +1 if edx>=ceil(b/3) 
    cmp     edx,TWO_THIRDS_CEIL
    sbb     ebx,-1				;  +1 if edx>=ceil(b*2/3) 
    pop     ebp					;  restores run in reverse order of the FR_push saves 
    mov     eax,ebx				;  return value must be copied out before ebx is restored 
    pop     edi
    pop     esi
    pop     ebx
    ret

	end
Line endings 2012-11-25 17:13:44 -05:00
			`; Copyright 2000, 2002 Free Software Foundation, Inc.`
			`;`
			`; This file is part of the GNU MP Library.`
			`;`
			`; The GNU MP Library is free software; you can redistribute it and/or`
			`; modify it under the terms of the GNU Lesser General Public License as`
			`; published by the Free Software Foundation; either version 2.1 of the`
			`; License, or (at your option) any later version.`
			`;`
			`; The GNU MP Library is distributed in the hope that it will be useful,`
			`; but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`; Lesser General Public License for more details.`
			`;`
			`; You should have received a copy of the GNU Lesser General Public`
			`; License along with the GNU MP Library; see the file COPYING.LIB. If`
			`; not, write to the Free Software Foundation, Inc., 59 Temple Place -`
			`; Suite 330, Boston, MA 02111-1307, USA.`
			`;`
			`; Translation of AT&T syntax code by Brian Gladman`

			`%include "..\x86i.inc"`

			`global ___gmpn_divexact_by3c`

			`%ifdef DLL`
			`export ___gmpn_divexact_by3c`
			`%endif`

			`%define PARAM_CARRY esp+frame+16`
			`%define PARAM_SIZE esp+frame+12`
			`%define PARAM_SRC esp+frame+8`
			`%define PARAM_DST esp+frame+4`
			`%define frame 0`

			`; multiplicative inverse of 3,modulo 2^32`
			`; ceil(b/3),ceil(b*2/3) and floor(b*2/3) where b=2^32`
			`%define INVERSE_3 -0x55555555`
			`%define ONE_THIRD_CEIL 0x55555556`
			`%define TWO_THIRDS_CEIL 0xAAAAAAAB`
			`%define TWO_THIRDS_FLOOR 0xAAAAAAAA`

			`section .text`

			`align 8`

			`___gmpn_divexact_by3c:`
			`mov ecx,[PARAM_SRC]`
			`mov edx,[PARAM_SIZE]`
			`dec edx`
			`jnz Ltwo_or_more`
			`mov edx,[ecx]`
			`mov eax,[PARAM_CARRY] ; risk of cache bank clash here`
			`mov ecx,[PARAM_DST]`
			`sub edx,eax`
			`sbb eax,eax ; 0 or -1`
			`imul edx,edx,INVERSE_3`
			`neg eax ; 0 or 1`
			`cmp edx,ONE_THIRD_CEIL`
			`sbb eax,-1 ; +1 if edx>=ceil(b/3)`
			`cmp edx,TWO_THIRDS_CEIL`
			`sbb eax,-1 ; +1 if edx>=ceil(b*2/3)`
			`mov [ecx],edx`
			`ret`

			`; eax`
			`; ebx`
			`; ecx src`
			`; edx size-1`
			`; esi`
			`; edi`
			`; ebp`

			`Ltwo_or_more:`
			`FR_push ebx`
			`FR_push esi`
			`FR_push edi`
			`FR_push ebp`
			`mov edi,[PARAM_DST]`
			`mov esi,[PARAM_CARRY]`
			`mov eax,[ecx] ; src low limb`
			`xor ebx,ebx`
			`sub eax,esi`
			`mov esi,TWO_THIRDS_FLOOR`
			`lea ecx,[ecx+edx*4] ; &src[size-1]`
			`lea edi,[edi+edx*4] ; &dst[size-1]`
			`adc ebx,0 ; carry,0 or 1`
			`neg edx ; -(size-1)`

			`; The loop needs a source limb ready at the top,which leads to one limb`
			`; handled separately at the end,and the special case above for size==1.`
			`; There doesn't seem to be any scheduling that would keep the speed but move`
			`; the source load and carry subtract up to the top.`
			`;`
			`; The destination cache line prefetching adds 1 cycle to the loop but is`
			`; considered worthwhile. The slowdown is a factor of 1.07,but will prevent`
			`; repeated write-throughs if the destination isn't in L1. A version using`
			`; an outer loop to prefetch only every 8 limbs (a cache line) proved to be`
			`; no faster,due to unavoidable branch mispredictions in the inner loop.`
			`;`
			`; setc is 2 cycles on P54,so an adcl is used instead. If the movl $0,%ebx`
			`; could be avoided then the src limb fetch could pair up and save a cycle.`
			`; This would probably mean going to a two limb loop with the carry limb`
			`; alternately positive or negative,since an sbbl %ebx,%ebx will leave a`
			`; value which is in the opposite sense to the preceding sbbl/adcl %ebx,%eax.`
			`;`
			`; A register is used for TWO_THIRDS_FLOOR because a cmp can't be done as`
			`; "cmpl %edx,$n" with the immediate as the second operand.`
			`;`
			`; The "4" source displacement is in the loop rather than the setup because`
			`; this gets Ltop aligned to 8 bytes at no cost.`

			`; eax source limb,carry subtracted`
			`; ebx carry (0 or 1)`
			`; ecx &src[size-1]`
			`; edx counter,limbs,negative`
			`; esi TWO_THIRDS_FLOOR`
			`; edi &dst[size-1]`
			`; ebp scratch (result limb)`

			`align 8`
			`Ltop:`
			`imul ebp,eax,INVERSE_3`
			`cmp ebp,ONE_THIRD_CEIL`
			`mov eax,[edi+edx*4] ; dst cache line prefetch`
			`sbb ebx,-1 ; +1 if ebp>=ceil(b/3)`
			`cmp esi,ebp`
			`mov eax,[4+ecx+edx*4] ; next src limb`
			`sbb eax,ebx ; and further -1 if ebp>=ceil(b*2/3)`
			`mov ebx,0`
			`adc ebx,0 ; new carry`
			`mov [edi+edx*4],ebp`
			`inc edx`
			`jnz Ltop`
			`imul edx,eax,INVERSE_3`
			`cmp edx,ONE_THIRD_CEIL`
			`mov [edi],edx`
			`sbb ebx,-1 ; +1 if edx>=ceil(b/3)`
			`cmp edx,TWO_THIRDS_CEIL`
			`sbb ebx,-1 ; +1 if edx>=ceil(b*2/3)`
			`pop ebp`
			`mov eax,ebx`
			`pop edi`
			`pop esi`
			`pop ebx`
			`ret`

			`end`