201 lines
5.2 KiB
NASM
201 lines
5.2 KiB
NASM
dnl Alpha mpn_modexact_1c_odd -- mpn exact remainder
|
|
|
|
dnl Copyright 2003, 2004 Free Software Foundation, Inc.
|
|
dnl
|
|
dnl This file is part of the GNU MP Library.
|
|
dnl
|
|
dnl The GNU MP Library is free software; you can redistribute it and/or
|
|
dnl modify it under the terms of the GNU Lesser General Public License as
|
|
dnl published by the Free Software Foundation; either version 2.1 of the
|
|
dnl License, or (at your option) any later version.
|
|
dnl
|
|
dnl The GNU MP Library is distributed in the hope that it will be useful,
|
|
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
dnl Lesser General Public License for more details.
|
|
dnl
|
|
dnl You should have received a copy of the GNU Lesser General Public
|
|
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
|
|
dnl not, write to the Free Software Foundation, Inc., 51 Franklin Street,
|
|
dnl Fifth Floor, Boston, MA 02110-1301, USA.
|
|
|
|
include(`../config.m4')
|
|
|
|
|
|
C cycles/limb
|
|
C EV4: 47
|
|
C EV5: 30
|
|
C EV6: 15
|
|
|
|
|
|
C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d,
|
|
C mp_limb_t c)
|
|
C
|
|
C This code follows the "alternate" code in mpn/generic/mode1o.c,
|
|
C eliminating cbit+climb from the dependent chain. This leaves,
|
|
C
|
|
C ev4 ev5 ev6
|
|
C 1 3 1 subq y = x - h
|
|
C 23 13 7 mulq q = y * inverse
|
|
C 23 14 7 umulh h = high (q * d)
|
|
C -- -- --
|
|
C 47 30 15
|
|
C
|
|
C In each case, the load latency, loop control, and extra carry bit handling
|
|
C hide under the multiply latencies. Those latencies are long enough that
|
|
C we don't need to worry about alignment or pairing to squeeze out
|
|
C performance.
|
|
C
|
|
C For the first limb, some of the loop code is broken out and scheduled back
|
|
C since it can be done earlier.
|
|
C
|
|
C - The first ldq src[0] is near the start of the routine, for maximum
|
|
C time from memory.
|
|
C
|
|
C - The subq y=x-climb can be done without waiting for the inverse.
|
|
C
|
|
C - The mulq y*inverse is replicated after the final subq for the inverse,
|
|
C instead of branching to the mulq in the main loop. On ev4 a branch
|
|
C there would cost cycles, but we can hide them under the mulq latency.
|
|
C
|
|
C For the last limb, high<divisor is tested and if that's true a subtract
|
|
C and addback is done, as per the main mpn/generic/mode1o.c code. This is a
|
|
C data-dependent branch, but we're waiting for umulh so any penalty should
|
|
C hide there. The multiplies saved would be worth the cost anyway.
|
|
C
|
|
C Enhancements:
|
|
C
|
|
C For size==1, a plain division (done bitwise say) might be faster than
|
|
C calculating an inverse, the latter taking about 130 cycles on ev4 or 70 on
|
|
C ev5. A call to gcc __remqu might be a possibility.
|
|
|
|
ASM_START()
|
|
PROLOGUE(mpn_modexact_1c_odd,gp)
|
|
|
|
C r16 src
|
|
C r17 size
|
|
C r18 d
|
|
C r19 c
|
|
|
|
LEA(r0, modlimb_invert_table)
|
|
srl r18, 1, r20 C d >> 1
|
|
|
|
and r20, 127, r20 C idx = d>>1 & 0x7F
|
|
|
|
addq r0, r20, r21 C table + idx
|
|
|
|
ifelse(bwx_available_p,1,
|
|
` ldbu r20, 0(r21) C table[idx], inverse 8 bits
|
|
',`
|
|
ldq_u r20, 0(r21) C table[idx] qword
|
|
extbl r20, r21, r20 C table[idx], inverse 8 bits
|
|
')
|
|
|
|
mull r20, r20, r7 C i*i
|
|
addq r20, r20, r20 C 2*i
|
|
|
|
ldq r2, 0(r16) C x = s = src[0]
|
|
lda r17, -1(r17) C size--
|
|
clr r0 C initial cbit=0
|
|
|
|
mull r7, r18, r7 C i*i*d
|
|
|
|
subq r20, r7, r20 C 2*i-i*i*d, inverse 16 bits
|
|
|
|
mull r20, r20, r7 C i*i
|
|
addq r20, r20, r20 C 2*i
|
|
|
|
mull r7, r18, r7 C i*i*d
|
|
|
|
subq r20, r7, r20 C 2*i-i*i*d, inverse 32 bits
|
|
|
|
mulq r20, r20, r7 C i*i
|
|
addq r20, r20, r20 C 2*i
|
|
|
|
mulq r7, r18, r7 C i*i*d
|
|
subq r2, r19, r3 C y = x - climb
|
|
|
|
subq r20, r7, r20 C inv = 2*i-i*i*d, inverse 64 bits
|
|
|
|
ASSERT(r7, C should have d*inv==1 mod 2^64
|
|
` mulq r18, r20, r7
|
|
cmpeq r7, 1, r7')
|
|
|
|
mulq r3, r20, r4 C first q = y * inv
|
|
|
|
beq r17, L(one) C if size==1
|
|
br L(entry)
|
|
|
|
|
|
L(top):
|
|
C r0 cbit
|
|
C r16 src, incrementing
|
|
C r17 size, decrementing
|
|
C r18 d
|
|
C r19 climb
|
|
C r20 inv
|
|
|
|
ldq r1, 0(r16) C s = src[i]
|
|
subq r1, r0, r2 C x = s - cbit
|
|
cmpult r1, r0, r0 C new cbit = s < cbit
|
|
|
|
subq r2, r19, r3 C y = x - climb
|
|
|
|
mulq r3, r20, r4 C q = y * inv
|
|
L(entry):
|
|
cmpult r2, r19, r5 C cbit2 = x < climb
|
|
addq r5, r0, r0 C cbit += cbit2
|
|
lda r16, 8(r16) C src++
|
|
lda r17, -1(r17) C size--
|
|
|
|
umulh r4, r18, r19 C climb = q * d
|
|
bne r17, L(top) C while 2 or more limbs left
|
|
|
|
|
|
|
|
C r0 cbit
|
|
C r18 d
|
|
C r19 climb
|
|
C r20 inv
|
|
|
|
ldq r1, 0(r16) C s = src[size-1] high limb
|
|
|
|
cmpult r1, r18, r2 C test high<divisor
|
|
bne r2, L(skip) C skip if so
|
|
|
|
C can't skip a division, repeat loop code
|
|
|
|
subq r1, r0, r2 C x = s - cbit
|
|
cmpult r1, r0, r0 C new cbit = s < cbit
|
|
|
|
subq r2, r19, r3 C y = x - climb
|
|
|
|
mulq r3, r20, r4 C q = y * inv
|
|
L(one):
|
|
cmpult r2, r19, r5 C cbit2 = x < climb
|
|
addq r5, r0, r0 C cbit += cbit2
|
|
|
|
umulh r4, r18, r19 C climb = q * d
|
|
|
|
addq r19, r0, r0 C return climb + cbit
|
|
ret r31, (r26), 1
|
|
|
|
|
|
ALIGN(8)
|
|
L(skip):
|
|
C with high<divisor, the final step can be just (cbit+climb)-s and
|
|
C an addback of d if that underflows
|
|
|
|
addq r19, r0, r19 C c = climb + cbit
|
|
|
|
subq r19, r1, r2 C c - s
|
|
cmpult r19, r1, r3 C c < s
|
|
|
|
addq r2, r18, r0 C return c-s + divisor
|
|
|
|
cmoveq r3, r2, r0 C return c-s if no underflow
|
|
ret r31, (r26), 1
|
|
|
|
EPILOGUE()
|
|
ASM_END()
|