474 lines
14 KiB
NASM
474 lines
14 KiB
NASM
dnl Alpha ev6 mpn_submul_1 -- Multiply a limb vector with a limb and subtract
|
|
dnl the result from a second limb vector.
|
|
|
|
dnl Copyright 2000, 2002 Free Software Foundation, Inc.
|
|
|
|
dnl This file is part of the GNU MP Library.
|
|
|
|
dnl The GNU MP Library is free software; you can redistribute it and/or modify
|
|
dnl it under the terms of the GNU Lesser General Public License as published
|
|
dnl by the Free Software Foundation; either version 2.1 of the License, or (at
|
|
dnl your option) any later version.
|
|
|
|
dnl The GNU MP Library is distributed in the hope that it will be useful, but
|
|
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
|
dnl License for more details.
|
|
|
|
dnl You should have received a copy of the GNU Lesser General Public License
|
|
dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write
|
|
dnl to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
|
|
dnl Boston, MA 02110-1301, USA.
|
|
|
|
include(`../config.m4')
|
|
|
|
dnl INPUT PARAMETERS
|
|
dnl res_ptr r16
|
|
dnl s1_ptr r17
|
|
dnl size r18
|
|
dnl s2_limb r19
|
|
|
|
dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and
|
|
dnl exactly 3.5 cycles/limb on EV6...
|
|
|
|
dnl This code was written in close cooperation with ev6 pipeline expert
|
|
dnl Steve Root. Any errors are tege's fault, though.
|
|
dnl
|
|
dnl Register usages for unrolled loop:
|
|
dnl 0-3 mul's
|
|
dnl 4-7 acc's
|
|
dnl 8-15 mul results
|
|
dnl 20,21 carry's
|
|
dnl 22,23 save for stores
|
|
|
|
dnl Sustains 8 mul-adds in 28 cycles in the unrolled inner loop.
|
|
|
|
dnl The stores can issue a cycle late so we have paired no-op's to 'catch'
|
|
dnl them, so that further disturbance to the schedule is damped.
|
|
|
|
dnl We couldn't pair the loads, because the entangled schedule of the
|
|
dnl carry's has to happen on one side {0} of the machine. Note, the total
|
|
dnl use of U0, and the total use of L0 (after attending to the stores).
|
|
dnl which is part of the reason why....
|
|
|
|
dnl This is a great schedule for the d_cache, a poor schedule for the
|
|
dnl b_cache. The lockup on U0 means that any stall can't be recovered
|
|
dnl from. Consider a ldq in L1. Say that load gets stalled because it
|
|
dnl collides with a fill from the b_Cache. On the next cycle, this load
|
|
dnl gets priority. It first looks at L0, and goes there. The instruction
|
|
dnl we intended for L0 gets to look at L1, which is NOT where we want
|
|
dnl it. It either stalls 1, because it can't go in L0, or goes there, and
|
|
dnl causes a further instruction to stall.
|
|
|
|
dnl So for b_cache, we're likely going to want to put one or more cycles
|
|
dnl back into the code! And, of course, put in prefetches. For the
|
|
dnl accumulator, lds, intent to modify. For the multiplier, you might
|
|
dnl want ldq, evict next, if you're not wanting to use it again soon. Use
|
|
dnl 256 ahead of present pointer value. At a place where we have an mt
|
|
dnl followed by a bookkeeping, put the bookkeeping in upper, and the
|
|
dnl prefetch into lower.
|
|
|
|
dnl Note, the usage of physical registers per cycle is smoothed off, as
|
|
dnl much as possible.
|
|
|
|
dnl Note, the ldq's and stq's are at the end of the quadpacks. Note, we'd
|
|
dnl like not to have a ldq or stq to precede a conditional branch in a
|
|
dnl quadpack. The conditional branch moves the retire pointer one cycle
|
|
dnl later.
|
|
|
|
dnl Optimization notes:
|
|
dnl Callee-saves regs: r9 r10 r11 r12 r13 r14 r15 r26 ?r27?
|
|
dnl Reserved regs: r29 r30 r31
|
|
dnl Free caller-saves regs in unrolled code: r24 r25 r28
|
|
dnl We should swap some of the callee-saves regs for some of the free
|
|
dnl caller-saves regs, saving some overhead cycles.
|
|
dnl Most importantly, we should write fast code for the 0-7 case.
|
|
dnl The code we use there is for the 21164, and runs at 7 cycles/limb
|
|
dnl on the 21264. Should not be hard, if we write specialized code for
|
|
dnl 1-7 limbs (the one for 0 limbs should be straightforward). We then just
|
|
dnl need a jump table indexed by the low 3 bits of the count argument.
|
|
|
|
|
|
ASM_START()
|
|
PROLOGUE(mpn_submul_1)
C Entry point.  r16 = res_ptr, r17 = s1_ptr, r18 = size, r19 = s2_limb
C (see INPUT PARAMETERS above).
C Sizes below 8 are handled by the simple one-limb-at-a-time code that
C follows; sizes of 8 or more branch to $Large.
|
|
cmpult r18, 8, r1
|
|
beq r1, $Large
|
|
|
|
C First limb: there is no carry limb yet, so only the low product is
C subtracted from *res_ptr; the high product is kept in r0.
ldq r2, 0(r17) C r2 = s1_limb
|
|
addq r17, 8, r17 C s1_ptr++
|
|
subq r18, 1, r18 C size--
|
|
mulq r2, r19, r3 C r3 = prod_low
|
|
ldq r5, 0(r16) C r5 = *res_ptr
|
|
umulh r2, r19, r0 C r0 = prod_high
|
|
beq r18, $Lend0b C jump if size was == 1
|
|
ldq r2, 0(r17) C r2 = s1_limb
|
|
addq r17, 8, r17 C s1_ptr++
|
|
subq r18, 1, r18 C size--
|
|
C r3 = *res_ptr - prod_low; r4 = 1 if that subtraction borrowed, else 0.
subq r5, r3, r3
|
|
cmpult r5, r3, r4
|
|
stq r3, 0(r16)
|
|
addq r16, 8, r16 C res_ptr++
|
|
beq r18, $Lend0a C jump if size was == 2
|
|
|
|
ALIGN(8)
|
|
C One limb per iteration.  On entry r0 and r4 together hold the carry
C limb; their sum is formed first and added to the new low product
C before the subtraction from *res_ptr.
$Loop0: mulq r2, r19, r3 C r3 = prod_low
|
|
ldq r5, 0(r16) C r5 = *res_ptr
|
|
addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
|
|
subq r18, 1, r18 C size--
|
|
umulh r2, r19, r4 C r4 = cy_limb
|
|
ldq r2, 0(r17) C r2 = s1_limb
|
|
addq r17, 8, r17 C s1_ptr++
|
|
addq r3, r0, r3 C r3 = cy_limb + prod_low
|
|
cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
|
|
C r3 = *res_ptr - (cy_limb + prod_low); r5 = borrow from that subtraction.
subq r5, r3, r3
|
|
cmpult r5, r3, r5
|
|
stq r3, 0(r16)
|
|
addq r16, 8, r16 C res_ptr++
|
|
addq r5, r0, r0 C combine carries
|
|
bne r18, $Loop0
|
|
C Last limb when size >= 2: the same step as the loop body, but with no
C further s1 load and no pointer updates.  The final carry limb (high
C product plus the pending carry/borrow bits) is left in r0.
$Lend0a:
|
|
mulq r2, r19, r3 C r3 = prod_low
|
|
ldq r5, 0(r16) C r5 = *res_ptr
|
|
addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
|
|
umulh r2, r19, r4 C r4 = cy_limb
|
|
addq r3, r0, r3 C r3 = cy_limb + prod_low
|
|
cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
|
|
subq r5, r3, r3
|
|
cmpult r5, r3, r5
|
|
stq r3, 0(r16)
|
|
addq r5, r0, r0 C combine carries
|
|
addq r4, r0, r0 C cy_limb = prod_high + cy
|
|
ret r31, (r26), 1
|
|
C size == 1: store *res_ptr - prod_low and leave prod_high plus the
C borrow bit in r0.
$Lend0b:
|
|
subq r5, r3, r3
|
|
cmpult r5, r3, r5
|
|
stq r3, 0(r16)
|
|
addq r0, r5, r0
|
|
ret r31, (r26), 1
|
|
|
|
$Large:
C Reached when size >= 8.  Lower $30 by 240 bytes and store $9-$15 in
C the new frame; these registers are overwritten by the unrolled code
C and are reloaded before the final return.
|
|
lda $30, -240($30)
|
|
stq $9, 8($30)
|
|
stq $10, 16($30)
|
|
stq $11, 24($30)
|
|
stq $12, 32($30)
|
|
stq $13, 40($30)
|
|
stq $14, 48($30)
|
|
stq $15, 56($30)
|
|
|
|
C Split the work: the first (size mod 8) limbs are processed one at a
C time below, the remaining size/8 groups of 8 by the unrolled code.
C r0 is cleared so that the unrolled code starts with a zero carry limb
C when there are no leading limbs.
and r18, 7, r20 C count for the first loop, 0-7
|
|
srl r18, 3, r18 C count for unrolled loop
|
|
bis r31, r31, r0
|
|
beq r20, $Lunroll
|
|
C Leading limbs: same scheme as the small-size code, but counting in
C r20 (the "size" in the comments below refers to that count).
ldq r2, 0(r17) C r2 = s1_limb
|
|
addq r17, 8, r17 C s1_ptr++
|
|
subq r20, 1, r20 C size--
|
|
mulq r2, r19, r3 C r3 = prod_low
|
|
ldq r5, 0(r16) C r5 = *res_ptr
|
|
umulh r2, r19, r0 C r0 = prod_high
|
|
beq r20, $Lend1b C jump if size was == 1
|
|
ldq r2, 0(r17) C r2 = s1_limb
|
|
addq r17, 8, r17 C s1_ptr++
|
|
subq r20, 1, r20 C size--
|
|
C r3 = *res_ptr - prod_low; r4 = 1 if that subtraction borrowed, else 0.
subq r5, r3, r3
|
|
cmpult r5, r3, r4
|
|
stq r3, 0(r16)
|
|
addq r16, 8, r16 C res_ptr++
|
|
beq r20, $Lend1a C jump if size was == 2
|
|
|
|
ALIGN(8)
|
|
C One limb per iteration; r0 and r4 together carry the pending carry limb.
$Loop1: mulq r2, r19, r3 C r3 = prod_low
|
|
ldq r5, 0(r16) C r5 = *res_ptr
|
|
addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
|
|
subq r20, 1, r20 C size--
|
|
umulh r2, r19, r4 C r4 = cy_limb
|
|
ldq r2, 0(r17) C r2 = s1_limb
|
|
addq r17, 8, r17 C s1_ptr++
|
|
addq r3, r0, r3 C r3 = cy_limb + prod_low
|
|
cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
|
|
subq r5, r3, r3
|
|
cmpult r5, r3, r5
|
|
stq r3, 0(r16)
|
|
addq r16, 8, r16 C res_ptr++
|
|
addq r5, r0, r0 C combine carries
|
|
bne r20, $Loop1
|
|
|
|
C Last of the leading limbs: no further s1 load, but res_ptr is still
C advanced because the unrolled code continues from it.  The carry limb
C is left in r0.
$Lend1a:
|
|
mulq r2, r19, r3 C r3 = prod_low
|
|
ldq r5, 0(r16) C r5 = *res_ptr
|
|
addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
|
|
umulh r2, r19, r4 C r4 = cy_limb
|
|
addq r3, r0, r3 C r3 = cy_limb + prod_low
|
|
cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
|
|
subq r5, r3, r3
|
|
cmpult r5, r3, r5
|
|
stq r3, 0(r16)
|
|
addq r16, 8, r16 C res_ptr++
|
|
addq r5, r0, r0 C combine carries
|
|
addq r4, r0, r0 C cy_limb = prod_high + cy
|
|
br r31, $Lunroll
|
|
C Exactly one leading limb: store *res_ptr - prod_low, advance res_ptr,
C leave prod_high plus the borrow bit in r0, and fall through to the
C code that follows.
$Lend1b:
|
|
subq r5, r3, r3
|
|
cmpult r5, r3, r5
|
|
stq r3, 0(r16)
|
|
addq r16, 8, r16 C res_ptr++
|
|
addq r0, r5, r0
|
|
|
|
$Lunroll:
C Bias both pointers by -16 to match the displacements used by the
C unrolled code, and copy the incoming carry limb from r0 into r12.
|
|
lda r17, -16(r17) C L1 bookkeeping
|
|
lda r16, -16(r16) C L1 bookkeeping
|
|
bis r0, r31, r12
|
|
|
|
C ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
C Loads the first 8 s1 limbs and the first 6 result limbs, starts their
C multiplies, and completes and stores the first two result limbs.
C r18 is decremented once here; if no further group of 8 remains the
C code goes straight to $Lend instead of entering the main loop.
C NOTE: the per-line comments say "add"/"carry", but the instructions
C are subq/cmpult, i.e. the products are subtracted and r20/r21 hold
C borrow bits.
|
|
|
|
ldq r2, 16(r17) C L1
|
|
ldq r3, 24(r17) C L1
|
|
lda r18, -1(r18) C L1 bookkeeping
|
|
ldq r6, 16(r16) C L1
|
|
ldq r7, 24(r16) C L1
|
|
ldq r0, 32(r17) C L1
|
|
mulq r19, r2, r13 C U1
|
|
ldq r1, 40(r17) C L1
|
|
umulh r19, r2, r14 C U1
|
|
mulq r19, r3, r15 C U1
|
|
lda r17, 64(r17) C L1 bookkeeping
|
|
ldq r4, 32(r16) C L1
|
|
ldq r5, 40(r16) C L1
|
|
umulh r19, r3, r8 C U1
|
|
ldq r2, -16(r17) C L1
|
|
mulq r19, r0, r9 C U1
|
|
ldq r3, -8(r17) C L1
|
|
umulh r19, r0, r10 C U1
|
|
subq r6, r13, r13 C L0 lo + acc
|
|
mulq r19, r1, r11 C U1
|
|
cmpult r6, r13, r20 C L0 lo add => carry
|
|
lda r16, 64(r16) C L1 bookkeeping
|
|
subq r13, r12, r22 C U0 hi add => answer
|
|
cmpult r13, r12, r21 C L0 hi add => carry
|
|
addq r14, r20, r14 C U0 hi mul + carry
|
|
ldq r6, -16(r16) C L1
|
|
subq r7, r15, r28 C L0 lo + acc
|
|
addq r14, r21, r14 C U0 hi mul + carry
|
|
cmpult r7, r15, r20 C L0 lo add => carry
|
|
ldq r7, -8(r16) C L1
|
|
umulh r19, r1, r12 C U1
|
|
subq r28, r14, r23 C U0 hi add => answer
|
|
ldq r0, 0(r17) C L1
|
|
mulq r19, r2, r13 C U1
|
|
cmpult r28, r14, r21 C L0 hi add => carry
|
|
addq r8, r20, r8 C U0 hi mul + carry
|
|
ldq r1, 8(r17) C L1
|
|
umulh r19, r2, r14 C U1
|
|
subq r4, r9, r9 C L0 lo + acc
|
|
stq r22, -48(r16) C L0
|
|
stq r23, -40(r16) C L1
|
|
mulq r19, r3, r15 C U1
|
|
addq r8, r21, r8 C U0 hi mul + carry
|
|
cmpult r4, r9, r20 C L0 lo add => carry
|
|
subq r9, r8, r22 C U0 hi add => answer
|
|
ble r18, $Lend C U1 bookkeeping
|
|
|
|
C ____ MAIN UNROLLED LOOP ____
|
|
ALIGN(16)
|
|
$Loop:
C Each pass handles 8 limbs: it issues 8 mulq and 8 umulh, performs 8
C stores, advances r16 and r17 by 64 bytes each, and decrements the
C pass counter r18 once; the loop repeats while r18 > 0.
C The bis r31, r31, r31 instructions are no-ops that hold the issue
C slots in place (see the scheduling notes at the top of the file), so
C the instruction order here must not be changed.
C NOTE: the per-line comments say "add"/"carry", but the instructions
C are subq/cmpult, i.e. the products are subtracted and r20/r21 hold
C borrow bits.
|
|
bis r31, r31, r31 C U1 mt
|
|
cmpult r9, r8, r21 C L0 hi add => carry
|
|
addq r10, r20, r10 C U0 hi mul + carry
|
|
ldq r4, 0(r16) C L1
|
|
|
|
bis r31, r31, r31 C U1 mt
|
|
subq r5, r11, r23 C L0 lo + acc
|
|
addq r10, r21, r10 C L0 hi mul + carry
|
|
ldq r2, 16(r17) C L1
|
|
|
|
umulh r19, r3, r8 C U1
|
|
cmpult r5, r11, r20 C L0 lo add => carry
|
|
subq r23, r10, r28 C U0 hi add => answer
|
|
ldq r5, 8(r16) C L1
|
|
|
|
mulq r19, r0, r9 C U1
|
|
cmpult r23, r10, r21 C L0 hi add => carry
|
|
addq r12, r20, r12 C U0 hi mul + carry
|
|
ldq r3, 24(r17) C L1
|
|
|
|
umulh r19, r0, r10 C U1
|
|
subq r6, r13, r13 C U0 lo + acc
|
|
stq r22, -32(r16) C L0
|
|
stq r28, -24(r16) C L1
|
|
|
|
bis r31, r31, r31 C L0 st slosh
|
|
mulq r19, r1, r11 C U1
|
|
bis r31, r31, r31 C L1 st slosh
|
|
addq r12, r21, r12 C U0 hi mul + carry
|
|
|
|
cmpult r6, r13, r20 C L0 lo add => carry
|
|
bis r31, r31, r31 C U1 mt
|
|
lda r18, -1(r18) C L1 bookkeeping
|
|
subq r13, r12, r22 C U0 hi add => answer
|
|
|
|
bis r31, r31, r31 C U1 mt
|
|
cmpult r13, r12, r21 C L0 hi add => carry
|
|
addq r14, r20, r14 C U0 hi mul + carry
|
|
ldq r6, 16(r16) C L1
|
|
|
|
bis r31, r31, r31 C U1 mt
|
|
subq r7, r15, r23 C L0 lo + acc
|
|
addq r14, r21, r14 C U0 hi mul + carry
|
|
ldq r0, 32(r17) C L1
|
|
|
|
umulh r19, r1, r12 C U1
|
|
cmpult r7, r15, r20 C L0 lo add => carry
|
|
subq r23, r14, r28 C U0 hi add => answer
|
|
ldq r7, 24(r16) C L1
|
|
|
|
mulq r19, r2, r13 C U1
|
|
cmpult r23, r14, r21 C L0 hi add => carry
|
|
addq r8, r20, r8 C U0 hi mul + carry
|
|
ldq r1, 40(r17) C L1
|
|
|
|
umulh r19, r2, r14 C U1
|
|
subq r4, r9, r9 C U0 lo + acc
|
|
stq r22, -16(r16) C L0
|
|
stq r28, -8(r16) C L1
|
|
|
|
bis r31, r31, r31 C L0 st slosh
|
|
mulq r19, r3, r15 C U1
|
|
bis r31, r31, r31 C L1 st slosh
|
|
addq r8, r21, r8 C L0 hi mul + carry
|
|
|
|
cmpult r4, r9, r20 C L0 lo add => carry
|
|
bis r31, r31, r31 C U1 mt
|
|
lda r17, 64(r17) C L1 bookkeeping
|
|
subq r9, r8, r22 C U0 hi add => answer
|
|
|
|
bis r31, r31, r31 C U1 mt
|
|
cmpult r9, r8, r21 C L0 hi add => carry
|
|
addq r10, r20, r10 C U0 hi mul + carry
|
|
ldq r4, 32(r16) C L1
|
|
|
|
bis r31, r31, r31 C U1 mt
|
|
subq r5, r11, r23 C L0 lo + acc
|
|
addq r10, r21, r10 C L0 hi mul + carry
|
|
ldq r2, -16(r17) C L1
|
|
|
|
umulh r19, r3, r8 C U1
|
|
cmpult r5, r11, r20 C L0 lo add => carry
|
|
subq r23, r10, r28 C U0 hi add => answer
|
|
ldq r5, 40(r16) C L1
|
|
|
|
mulq r19, r0, r9 C U1
|
|
cmpult r23, r10, r21 C L0 hi add => carry
|
|
addq r12, r20, r12 C U0 hi mul + carry
|
|
ldq r3, -8(r17) C L1
|
|
|
|
umulh r19, r0, r10 C U1
|
|
subq r6, r13, r13 C U0 lo + acc
|
|
stq r22, 0(r16) C L0
|
|
stq r28, 8(r16) C L1
|
|
|
|
bis r31, r31, r31 C L0 st slosh
|
|
mulq r19, r1, r11 C U1
|
|
bis r31, r31, r31 C L1 st slosh
|
|
addq r12, r21, r12 C U0 hi mul + carry
|
|
|
|
cmpult r6, r13, r20 C L0 lo add => carry
|
|
bis r31, r31, r31 C U1 mt
|
|
lda r16, 64(r16) C L1 bookkeeping
|
|
subq r13, r12, r22 C U0 hi add => answer
|
|
|
|
bis r31, r31, r31 C U1 mt
|
|
cmpult r13, r12, r21 C L0 hi add => carry
|
|
addq r14, r20, r14 C U0 hi mul + carry
|
|
ldq r6, -16(r16) C L1
|
|
|
|
bis r31, r31, r31 C U1 mt
|
|
subq r7, r15, r23 C L0 lo + acc
|
|
addq r14, r21, r14 C U0 hi mul + carry
|
|
ldq r0, 0(r17) C L1
|
|
|
|
umulh r19, r1, r12 C U1
|
|
cmpult r7, r15, r20 C L0 lo add => carry
|
|
subq r23, r14, r28 C U0 hi add => answer
|
|
ldq r7, -8(r16) C L1
|
|
|
|
mulq r19, r2, r13 C U1
|
|
cmpult r23, r14, r21 C L0 hi add => carry
|
|
addq r8, r20, r8 C U0 hi mul + carry
|
|
ldq r1, 8(r17) C L1
|
|
|
|
umulh r19, r2, r14 C U1
|
|
subq r4, r9, r9 C U0 lo + acc
|
|
stq r22, -48(r16) C L0
|
|
stq r28, -40(r16) C L1
|
|
|
|
bis r31, r31, r31 C L0 st slosh
|
|
mulq r19, r3, r15 C U1
|
|
bis r31, r31, r31 C L1 st slosh
|
|
addq r8, r21, r8 C U0 hi mul + carry
|
|
|
|
cmpult r4, r9, r20 C L0 lo add => carry
|
|
subq r9, r8, r22 C U0 hi add => answer
|
|
bis r31, r31, r31 C L1 mt
|
|
bgt r18, $Loop C U1 bookkeeping
|
|
|
|
C ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____
|
|
$Lend:
C Pipeline drain.  Entered with the same register state as the top of
C the main loop.  No further s1 limbs are loaded; only the two remaining
C result limbs are fetched, the multiplies already in flight are
C completed, and the last six result limbs are stored.
C NOTE: the per-line comments say "add"/"carry", but the instructions
C are subq/cmpult, i.e. the products are subtracted and r20/r21 hold
C borrow bits.
|
|
cmpult r9, r8, r21 C L0 hi add => carry
|
|
addq r10, r20, r10 C U0 hi mul + carry
|
|
ldq r4, 0(r16) C L1
|
|
subq r5, r11, r23 C L0 lo + acc
|
|
addq r10, r21, r10 C L0 hi mul + carry
|
|
umulh r19, r3, r8 C U1
|
|
cmpult r5, r11, r20 C L0 lo add => carry
|
|
subq r23, r10, r28 C U0 hi add => answer
|
|
ldq r5, 8(r16) C L1
|
|
mulq r19, r0, r9 C U1
|
|
cmpult r23, r10, r21 C L0 hi add => carry
|
|
addq r12, r20, r12 C U0 hi mul + carry
|
|
umulh r19, r0, r10 C U1
|
|
subq r6, r13, r13 C L0 lo + acc
|
|
stq r22, -32(r16) C L0
|
|
stq r28, -24(r16) C L1
|
|
mulq r19, r1, r11 C U1
|
|
addq r12, r21, r12 C U0 hi mul + carry
|
|
cmpult r6, r13, r20 C L0 lo add => carry
|
|
subq r13, r12, r22 C U0 hi add => answer
|
|
cmpult r13, r12, r21 C L0 hi add => carry
|
|
addq r14, r20, r14 C U0 hi mul + carry
|
|
subq r7, r15, r23 C L0 lo + acc
|
|
addq r14, r21, r14 C U0 hi mul + carry
|
|
umulh r19, r1, r12 C U1
|
|
cmpult r7, r15, r20 C L0 lo add => carry
|
|
subq r23, r14, r28 C U0 hi add => answer
|
|
cmpult r23, r14, r21 C L0 hi add => carry
|
|
addq r8, r20, r8 C U0 hi mul + carry
|
|
subq r4, r9, r9 C U0 lo + acc
|
|
stq r22, -16(r16) C L0
|
|
stq r28, -8(r16) C L1
|
|
addq r8, r21, r8 C L0 hi mul + carry
|
|
cmpult r4, r9, r20 C L0 lo add => carry
|
|
subq r9, r8, r22 C U0 hi add => answer
|
|
cmpult r9, r8, r21 C L0 hi add => carry
|
|
addq r10, r20, r10 C U0 hi mul + carry
|
|
subq r5, r11, r23 C L0 lo + acc
|
|
addq r10, r21, r10 C L0 hi mul + carry
|
|
cmpult r5, r11, r20 C L0 lo add => carry
|
|
subq r23, r10, r28 C U0 hi add => answer
|
|
cmpult r23, r10, r21 C L0 hi add => carry
|
|
addq r12, r20, r12 C U0 hi mul + carry
|
|
stq r22, 0(r16) C L0
|
|
stq r28, 8(r16) C L1
|
|
C r0 = last high product plus the two pending borrow bits: the final
C carry limb.
addq r12, r21, r0 C U0 hi mul + carry
|
|
|
|
C Reload $9-$15 from the frame, raise $30 back by 240 bytes, and return.
ldq $9, 8($30)
|
|
ldq $10, 16($30)
|
|
ldq $11, 24($30)
|
|
ldq $12, 32($30)
|
|
ldq $13, 40($30)
|
|
ldq $14, 48($30)
|
|
ldq $15, 56($30)
|
|
lda $30, 240($30)
|
|
ret r31, (r26), 1
|
|
EPILOGUE(mpn_submul_1)
|
|
ASM_END()
|