2012-11-25 17:13:44 -05:00
|
|
|
|
|
|
|
; Copyright 2000, 2002 Free Software Foundation, Inc.
;
; This file is part of the GNU MP Library.
;
; The GNU MP Library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public License as
; published by the Free Software Foundation; either version 2.1 of the
; License, or (at your option) any later version.
;
; The GNU MP Library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with the GNU MP Library; see the file COPYING.LIB. If
; not, write to the Free Software Foundation, Inc., 59 Temple Place -
; Suite 330, Boston, MA 02111-1307, USA.
;
; Translation of AT&T syntax code by Brian Gladman
|
|
|
|
|
|
|
|
%include "..\x86i.inc"

        global  ___gmpn_divexact_by3c

%ifdef DLL
        export  ___gmpn_divexact_by3c
%endif

; Stack arguments, addressed relative to esp.  `frame` is the number of
; bytes pushed since function entry and starts at 0.  %define bodies are
; expanded at the point of use, so every PARAM_* reference picks up the
; value `frame` has at that point in the code.
; NOTE(review): FR_push (from x86i.inc) is presumably what advances
; `frame` after each push -- confirm against x86i.inc.
%define frame           0
%define PARAM_DST       esp+frame+4
%define PARAM_SRC       esp+frame+8
%define PARAM_SIZE      esp+frame+12
%define PARAM_CARRY     esp+frame+16

; Multiplicative inverse of 3 modulo 2^32.  Written as a negative
; literal; as a 32-bit pattern it is 0xAAAAAAAB (3 * 0xAAAAAAAB == 1
; modulo 2^32).
%define INVERSE_3        -0x55555555

; Thresholds with b = 2^32: ceil(b/3), ceil(b*2/3) and floor(b*2/3).
%define ONE_THIRD_CEIL   0x55555556
%define TWO_THIRDS_CEIL  0xAAAAAAAB
%define TWO_THIRDS_FLOOR 0xAAAAAAAA

        section .text

        align   8
|
|
|
|
|
|
|
|
;-----------------------------------------------------------------------
; ___gmpn_divexact_by3c
;
; Processes `size` 32-bit limbs from src, low limb first.  Each limb has
; the running carry subtracted and is then multiplied by INVERSE_3
; (the inverse of 3 modulo 2^32); the product is stored to dst.  The
; carry into the next limb is the borrow from that subtraction plus one
; for each of the thresholds ceil(b/3) and ceil(b*2/3) (b = 2^32) that
; the product reaches.
;
; In (stack, 32-bit, see the PARAM_* defines):
;       [esp+4]   dst    destination limb pointer
;       [esp+8]   src    source limb pointer
;       [esp+12]  size   limb count
;       [esp+16]  carry  carry in
; Out:  eax = carry out of the most significant limb
; Saved/restored on the multi-limb path: ebx, esi, edi, ebp
; Clobbered: ecx, edx, flags
;
; size == 0 is not handled: `dec edx` would wrap and the multi-limb
; path would be entered with a bogus count.
;
; The instruction order is deliberate (see the scheduling notes before
; the loop); do not reorder without re-measuring.
;-----------------------------------------------------------------------
___gmpn_divexact_by3c:
        mov     ecx, [PARAM_SRC]
        mov     edx, [PARAM_SIZE]
        dec     edx                     ; edx = size-1
        jnz     .two_or_more

        ; ---- size == 1: one limb, no loop and no registers to save ----
        mov     edx, [ecx]              ; the only source limb
        mov     eax, [PARAM_CARRY]      ; risk of cache bank clash here
        mov     ecx, [PARAM_DST]
        sub     edx, eax                ; limb - carry in, CF = borrow
        sbb     eax, eax                ; 0 or -1
        imul    edx, edx, INVERSE_3     ; edx = result limb
        neg     eax                     ; 0 or 1
        cmp     edx, ONE_THIRD_CEIL
        sbb     eax, -1                 ; +1 if edx >= ceil(b/3)
        cmp     edx, TWO_THIRDS_CEIL
        sbb     eax, -1                 ; +1 if edx >= ceil(b*2/3)
        mov     [ecx], edx
        ret

        ; ---- size >= 2 -------------------------------------------------
        ; On arrival: ecx = src, edx = size-1, everything else is free.
.two_or_more:
        FR_push ebx
        FR_push esi
        FR_push edi
        FR_push ebp

        ; The PARAM_* reads below come after four pushes.
        ; NOTE(review): this relies on FR_push keeping `frame` in step
        ; with esp -- confirm against x86i.inc.
        mov     edi, [PARAM_DST]
        mov     esi, [PARAM_CARRY]
        mov     eax, [ecx]              ; src low limb
        xor     ebx, ebx
        sub     eax, esi                ; low limb - carry in, CF = borrow
        mov     esi, TWO_THIRDS_FLOOR   ; mov/lea leave CF untouched
        lea     ecx, [ecx + edx*4]      ; &src[size-1]
        lea     edi, [edi + edx*4]      ; &dst[size-1]
        adc     ebx, 0                  ; carry, 0 or 1 (borrow from the sub)
        neg     edx                     ; -(size-1)

        ; Scheduling notes (carried over from the original source):
        ;
        ; * The loop needs a source limb ready at the top, which is why
        ;   one limb is handled separately after the loop and why
        ;   size == 1 has its own path above.  No scheduling was found
        ;   that keeps the speed but moves the source load and carry
        ;   subtract up to the top.
        ;
        ; * The destination cache line prefetch adds 1 cycle to the loop
        ;   but is considered worthwhile.  The slowdown is a factor of
        ;   1.07, but it prevents repeated write-throughs if the
        ;   destination isn't in L1.  A version using an outer loop to
        ;   prefetch only every 8 limbs (a cache line) proved to be no
        ;   faster, due to unavoidable branch mispredictions in the
        ;   inner loop.
        ;
        ; * setc is 2 cycles on P54, so adc is used instead.  If the
        ;   `mov ebx,0` could be avoided then the src limb fetch could
        ;   pair up and save a cycle.  That would probably mean a two
        ;   limb loop with the carry limb alternately positive or
        ;   negative, since `sbb ebx,ebx` leaves a value in the opposite
        ;   sense to the preceding `sbb/adc eax,ebx`.
        ;
        ; * TWO_THIRDS_FLOOR lives in a register because cmp cannot take
        ;   an immediate as its first operand (there is no
        ;   `cmp imm, reg`).
        ;
        ; * The "4" source displacement is in the loop rather than the
        ;   setup because this gets the loop top aligned to 8 bytes at
        ;   no cost.
        ;
        ; Register use inside the loop:
        ;   eax  source limb, carry already subtracted
        ;   ebx  carry (0 or 1)
        ;   ecx  &src[size-1]
        ;   edx  limb counter, negative, counts up to 0
        ;   esi  TWO_THIRDS_FLOOR
        ;   edi  &dst[size-1]
        ;   ebp  scratch (result limb)

        align   8
.top:
        imul    ebp, eax, INVERSE_3     ; ebp = result limb
        cmp     ebp, ONE_THIRD_CEIL
        mov     eax, [edi + edx*4]      ; dst cache line prefetch (value discarded)
        sbb     ebx, -1                 ; +1 if ebp >= ceil(b/3)
        cmp     esi, ebp                ; CF set when ebp > floor(b*2/3)
        mov     eax, [ecx + edx*4 + 4]  ; next src limb
        sbb     eax, ebx                ; and further -1 if ebp >= ceil(b*2/3)
        mov     ebx, 0                  ; mov, not xor: CF from the sbb must survive
        adc     ebx, 0                  ; new carry
        mov     [edi + edx*4], ebp
        inc     edx
        jnz     .top

        ; ---- final (most significant) limb -----------------------------
        imul    edx, eax, INVERSE_3
        cmp     edx, ONE_THIRD_CEIL
        mov     [edi], edx              ; dst[size-1]
        sbb     ebx, -1                 ; +1 if edx >= ceil(b/3)
        cmp     edx, TWO_THIRDS_CEIL
        sbb     ebx, -1                 ; +1 if edx >= ceil(b*2/3)
        pop     ebp
        mov     eax, ebx                ; return the carry out
        pop     edi
        pop     esi
        pop     ebx
        ret
|
|
|
|
|
|
|
|
end
|