mpir/mpn/alpha/ev6/submul_1.asm

474 lines
14 KiB
NASM

dnl Alpha ev6 mpn_submul_1 -- Multiply a limb vector with a limb and subtract
dnl the result from a second limb vector.
dnl Copyright 2000, 2002 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 2.1 of the License, or (at
dnl your option) any later version.
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write
dnl to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
dnl Boston, MA 02110-1301, USA.
include(`../config.m4')
dnl INPUT PARAMETERS
dnl res_ptr r16
dnl s1_ptr r17
dnl size r18
dnl s2_limb r19
dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and
dnl exactly 3.5 cycles/limb on EV6...
dnl This code was written in close cooperation with ev6 pipeline expert
dnl Steve Root. Any errors are tege's fault, though.
dnl
dnl Register usages for unrolled loop:
dnl 0-3 mul's
dnl 4-7 acc's
dnl 8-15 mul results
dnl 20,21 carries
dnl 22,23 save for stores
dnl Sustains 8 mul-adds in 28 cycles in the unrolled inner loop.
dnl The stores can issue a cycle late so we have paired no-op's to 'catch'
dnl them, so that further disturbance to the schedule is damped.
dnl We couldn't pair the loads, because the entangled schedule of the
dnl carries has to happen on one side {0} of the machine. Note, the total
dnl use of U0, and the total use of L0 (after attending to the stores).
dnl which is part of the reason why....
dnl This is a great schedule for the d_cache, a poor schedule for the
dnl b_cache. The lockup on U0 means that any stall can't be recovered
dnl from. Consider a ldq in L1. say that load gets stalled because it
dnl collides with a fill from the b_Cache. On the next cycle, this load
dnl gets priority. It first looks at L0, and goes there. The instruction
dnl we intended for L0 gets to look at L1, which is NOT where we want
dnl it. It either stalls 1, because it can't go in L0, or goes there, and
dnl causes a further instruction to stall.
dnl So for b_cache, we're likely going to want to put one or more cycles
dnl back into the code! And, of course, put in prefetches. For the
dnl accumulator, lds, intent to modify. For the multiplier, you might
dnl want ldq, evict next, if you're not wanting to use it again soon. Use
dnl 256 ahead of present pointer value. At a place where we have an mt
dnl followed by a bookkeeping, put the bookkeeping in upper, and the
dnl prefetch into lower.
dnl Note, the usage of physical registers per cycle is smoothed off, as
dnl much as possible.
dnl Note, the ldq's and stq's are at the end of the quadpacks. note, we'd
dnl like not to have a ldq or stq precede a conditional branch in a
dnl quadpack. The conditional branch moves the retire pointer one cycle
dnl later.
dnl Optimization notes:
dnl Callee-saves regs: r9 r10 r11 r12 r13 r14 r15 r26 ?r27?
dnl Reserved regs: r29 r30 r31
dnl Free caller-saves regs in unrolled code: r24 r25 r28
dnl We should swap some of the callee-saves regs for some of the free
dnl caller-saves regs, saving some overhead cycles.
dnl Most importantly, we should write fast code for the 0-7 case.
dnl The code we use there is for the 21164, and runs at 7 cycles/limb
dnl on the 21264. Should not be hard, if we write specialized code for
dnl 1-7 limbs (the one for 0 limbs should be straightforward). We then just
dnl need a jump table indexed by the low 3 bits of the count argument.
ASM_START()
C mpn_submul_1: for each of the size limbs, multiply the s1 limb by s2_limb
C and subtract the product, together with the running carry limb, from the
C limb at res_ptr, storing the difference back. The final carry limb is
C left in r0 when the routine returns through r26.
C
C Inputs: r16 = res_ptr, r17 = s1_ptr, r18 = size, r19 = s2_limb.
C
C Layout:
C   size below 8     one-limb-at-a-time code, ending at $Lend0a or $Lend0b
C   size 8 or more   $Large: the size mod 8 leading limbs go through a copy
C                    of the one-limb-at-a-time code, then $Lunroll handles
C                    the rest 8 limbs per pass.
C The small path loads and stores the first limb before it tests the count,
C so size = 0 is not handled here.
C
C NOTE(review): many per-line comments below say add and carry; the
C instructions are subq, so the values are differences and borrows.
PROLOGUE(mpn_submul_1)
C r1 = 1 when size is below 8 (unsigned compare); r1 = 0 selects $Large.
cmpult r18, 8, r1
beq r1, $Large
C ---- Small case. First limb: there is no carry limb to apply yet. ----
ldq r2, 0(r17) C r2 = s1_limb
addq r17, 8, r17 C s1_ptr++
subq r18, 1, r18 C size--
mulq r2, r19, r3 C r3 = prod_low
ldq r5, 0(r16) C r5 = *res_ptr
umulh r2, r19, r0 C r0 = prod_high
beq r18, $Lend0b C jump if size was == 1
C Preload the second s1 limb, then finish the first limb:
C r3 = *res_ptr - prod_low, r4 = borrow out of that subtract.
ldq r2, 0(r17) C r2 = s1_limb
addq r17, 8, r17 C s1_ptr++
subq r18, 1, r18 C size--
subq r5, r3, r3
cmpult r5, r3, r4
stq r3, 0(r16)
addq r16, 8, r16 C res_ptr++
beq r18, $Lend0a C jump if size was == 2
ALIGN(8)
C On entry to each pass r2 holds the s1 limb to process and r0 + r4 is the
C carry limb owed by the previous limb. The pass also loads the following
C s1 limb, so the last limb is handled at $Lend0a, not here.
$Loop0: mulq r2, r19, r3 C r3 = prod_low
ldq r5, 0(r16) C r5 = *res_ptr
addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
subq r18, 1, r18 C size--
umulh r2, r19, r4 C r4 = cy_limb
ldq r2, 0(r17) C r2 = s1_limb
addq r17, 8, r17 C s1_ptr++
addq r3, r0, r3 C r3 = cy_limb + prod_low
cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
C r3 = *res_ptr - (cy_limb + prod_low), r5 = borrow out of that subtract.
subq r5, r3, r3
cmpult r5, r3, r5
stq r3, 0(r16)
addq r16, 8, r16 C res_ptr++
addq r5, r0, r0 C combine carries
bne r18, $Loop0
C Last limb of the small case: same step without loading another s1 limb,
C then fold this limb's high product into r0 and return.
$Lend0a:
mulq r2, r19, r3 C r3 = prod_low
ldq r5, 0(r16) C r5 = *res_ptr
addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
umulh r2, r19, r4 C r4 = cy_limb
addq r3, r0, r3 C r3 = cy_limb + prod_low
cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
subq r5, r3, r3
cmpult r5, r3, r5
stq r3, 0(r16)
addq r5, r0, r0 C combine carries
addq r4, r0, r0 C cy_limb = prod_high + cy
ret r31, (r26), 1
C size was 1: store *res_ptr - prod_low and return prod_high + borrow.
$Lend0b:
subq r5, r3, r3
cmpult r5, r3, r5
stq r3, 0(r16)
addq r0, r5, r0
ret r31, (r26), 1
C ---- size 8 or more ----
C Allocate a 240-byte frame and save $9 through $15 at offsets 8 to 56;
C the unrolled code below overwrites r9-r15, which the notes at the top of
C the file list as callee-saves.
C NOTE(review): only offsets 8 to 56 of the frame are used in this routine.
$Large:
lda $30, -240($30)
stq $9, 8($30)
stq $10, 16($30)
stq $11, 24($30)
stq $12, 32($30)
stq $13, 40($30)
stq $14, 48($30)
stq $15, 56($30)
and r18, 7, r20 C count for the first loop, 0-7
srl r18, 3, r18 C count for unrolled loop
C r0 = 0: carry limb handed to the unrolled code when there are no
C leading limbs.
bis r31, r31, r0
beq r20, $Lunroll
C Leading limbs (size mod 8 of them): same scheme as the small case, with
C r20 as the counter. The exits advance res_ptr and continue at $Lunroll
C with the carry limb in r0 instead of returning.
ldq r2, 0(r17) C r2 = s1_limb
addq r17, 8, r17 C s1_ptr++
subq r20, 1, r20 C size--
mulq r2, r19, r3 C r3 = prod_low
ldq r5, 0(r16) C r5 = *res_ptr
umulh r2, r19, r0 C r0 = prod_high
beq r20, $Lend1b C jump if size was == 1
ldq r2, 0(r17) C r2 = s1_limb
addq r17, 8, r17 C s1_ptr++
subq r20, 1, r20 C size--
subq r5, r3, r3
cmpult r5, r3, r4
stq r3, 0(r16)
addq r16, 8, r16 C res_ptr++
beq r20, $Lend1a C jump if size was == 2
ALIGN(8)
C Same invariant as $Loop0: r2 = s1 limb to process, r0 + r4 = carry limb
C owed by the previous limb.
$Loop1: mulq r2, r19, r3 C r3 = prod_low
ldq r5, 0(r16) C r5 = *res_ptr
addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
subq r20, 1, r20 C size--
umulh r2, r19, r4 C r4 = cy_limb
ldq r2, 0(r17) C r2 = s1_limb
addq r17, 8, r17 C s1_ptr++
addq r3, r0, r3 C r3 = cy_limb + prod_low
cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
subq r5, r3, r3
cmpult r5, r3, r5
stq r3, 0(r16)
addq r16, 8, r16 C res_ptr++
addq r5, r0, r0 C combine carries
bne r20, $Loop1
C Last leading limb; leaves the complete carry limb in r0.
$Lend1a:
mulq r2, r19, r3 C r3 = prod_low
ldq r5, 0(r16) C r5 = *res_ptr
addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
umulh r2, r19, r4 C r4 = cy_limb
addq r3, r0, r3 C r3 = cy_limb + prod_low
cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
subq r5, r3, r3
cmpult r5, r3, r5
stq r3, 0(r16)
addq r16, 8, r16 C res_ptr++
addq r5, r0, r0 C combine carries
addq r4, r0, r0 C cy_limb = prod_high + cy
br r31, $Lunroll
C Exactly one leading limb: finish it and fall through into $Lunroll.
$Lend1b:
subq r5, r3, r3
cmpult r5, r3, r5
stq r3, 0(r16)
addq r16, 8, r16 C res_ptr++
addq r0, r5, r0
C ---- Unrolled code, 8 limbs per group; r18 = number of groups ----
C Both pointers are moved back 16 bytes, so the startup loads below use
C offsets 16 and up. r12 receives the carry limb from the leading limbs
C (zero when there were none).
C
C Per-limb pattern used from here on:
C   subq   acc, lo, t     t = res limb - low product
C   cmpult ...            r20 = borrow out of that subtract
C   subq   t, cy, ans     ans = t - incoming carry limb
C   cmpult t, cy, r21     r21 = borrow out of that subtract
C   addq   hi, r20, hi
C   addq   hi, r21, hi    carry limb for the next limb
$Lunroll:
lda r17, -16(r17) C L1 bookkeeping
lda r16, -16(r16) C L1 bookkeeping
bis r0, r31, r12
C ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
C Loads all 8 s1 limbs and the first 6 res limbs of the first group, starts
C the multiplies, and stores result limbs 0 and 1. r18 is decremented to
C the number of groups remaining after this one.
ldq r2, 16(r17) C L1
ldq r3, 24(r17) C L1
lda r18, -1(r18) C L1 bookkeeping
ldq r6, 16(r16) C L1
ldq r7, 24(r16) C L1
ldq r0, 32(r17) C L1
mulq r19, r2, r13 C U1
ldq r1, 40(r17) C L1
umulh r19, r2, r14 C U1
mulq r19, r3, r15 C U1
lda r17, 64(r17) C L1 bookkeeping
ldq r4, 32(r16) C L1
ldq r5, 40(r16) C L1
umulh r19, r3, r8 C U1
ldq r2, -16(r17) C L1
mulq r19, r0, r9 C U1
ldq r3, -8(r17) C L1
umulh r19, r0, r10 C U1
subq r6, r13, r13 C L0 lo + acc
mulq r19, r1, r11 C U1
cmpult r6, r13, r20 C L0 lo add => carry
lda r16, 64(r16) C L1 bookkeeping
subq r13, r12, r22 C U0 hi add => answer
cmpult r13, r12, r21 C L0 hi add => carry
addq r14, r20, r14 C U0 hi mul + carry
ldq r6, -16(r16) C L1
subq r7, r15, r28 C L0 lo + acc
addq r14, r21, r14 C U0 hi mul + carry
cmpult r7, r15, r20 C L0 lo add => carry
ldq r7, -8(r16) C L1
umulh r19, r1, r12 C U1
subq r28, r14, r23 C U0 hi add => answer
ldq r0, 0(r17) C L1
mulq r19, r2, r13 C U1
cmpult r28, r14, r21 C L0 hi add => carry
addq r8, r20, r8 C U0 hi mul + carry
ldq r1, 8(r17) C L1
umulh r19, r2, r14 C U1
subq r4, r9, r9 C L0 lo + acc
stq r22, -48(r16) C L0
stq r23, -40(r16) C L1
mulq r19, r3, r15 C U1
addq r8, r21, r8 C U0 hi mul + carry
cmpult r4, r9, r20 C L0 lo add => carry
subq r9, r8, r22 C U0 hi add => answer
C No further groups: go straight to the pipeline finish.
ble r18, $Lend C U1 bookkeeping
C ____ MAIN UNROLLED LOOP ____
C Each pass stores 8 result limbs: limbs 2-7 of the current group and
C limbs 0-1 of the next one, whose loads and multiplies it also starts.
C Per pass r18 drops by 1 and r16 and r17 each advance by 64 bytes.
C The bis r31, r31, r31 lines target r31 and serve as the no-op fillers
C described in the notes at the top of the file.
ALIGN(16)
$Loop:
bis r31, r31, r31 C U1 mt
cmpult r9, r8, r21 C L0 hi add => carry
addq r10, r20, r10 C U0 hi mul + carry
ldq r4, 0(r16) C L1
bis r31, r31, r31 C U1 mt
subq r5, r11, r23 C L0 lo + acc
addq r10, r21, r10 C L0 hi mul + carry
ldq r2, 16(r17) C L1
umulh r19, r3, r8 C U1
cmpult r5, r11, r20 C L0 lo add => carry
subq r23, r10, r28 C U0 hi add => answer
ldq r5, 8(r16) C L1
mulq r19, r0, r9 C U1
cmpult r23, r10, r21 C L0 hi add => carry
addq r12, r20, r12 C U0 hi mul + carry
ldq r3, 24(r17) C L1
umulh r19, r0, r10 C U1
subq r6, r13, r13 C U0 lo + acc
stq r22, -32(r16) C L0
stq r28, -24(r16) C L1
bis r31, r31, r31 C L0 st slosh
mulq r19, r1, r11 C U1
bis r31, r31, r31 C L1 st slosh
addq r12, r21, r12 C U0 hi mul + carry
cmpult r6, r13, r20 C L0 lo add => carry
bis r31, r31, r31 C U1 mt
lda r18, -1(r18) C L1 bookkeeping
subq r13, r12, r22 C U0 hi add => answer
bis r31, r31, r31 C U1 mt
cmpult r13, r12, r21 C L0 hi add => carry
addq r14, r20, r14 C U0 hi mul + carry
ldq r6, 16(r16) C L1
bis r31, r31, r31 C U1 mt
subq r7, r15, r23 C L0 lo + acc
addq r14, r21, r14 C U0 hi mul + carry
ldq r0, 32(r17) C L1
umulh r19, r1, r12 C U1
cmpult r7, r15, r20 C L0 lo add => carry
subq r23, r14, r28 C U0 hi add => answer
ldq r7, 24(r16) C L1
mulq r19, r2, r13 C U1
cmpult r23, r14, r21 C L0 hi add => carry
addq r8, r20, r8 C U0 hi mul + carry
ldq r1, 40(r17) C L1
umulh r19, r2, r14 C U1
subq r4, r9, r9 C U0 lo + acc
stq r22, -16(r16) C L0
stq r28, -8(r16) C L1
bis r31, r31, r31 C L0 st slosh
mulq r19, r3, r15 C U1
bis r31, r31, r31 C L1 st slosh
addq r8, r21, r8 C L0 hi mul + carry
cmpult r4, r9, r20 C L0 lo add => carry
bis r31, r31, r31 C U1 mt
lda r17, 64(r17) C L1 bookkeeping
subq r9, r8, r22 C U0 hi add => answer
bis r31, r31, r31 C U1 mt
cmpult r9, r8, r21 C L0 hi add => carry
addq r10, r20, r10 C U0 hi mul + carry
ldq r4, 32(r16) C L1
bis r31, r31, r31 C U1 mt
subq r5, r11, r23 C L0 lo + acc
addq r10, r21, r10 C L0 hi mul + carry
ldq r2, -16(r17) C L1
umulh r19, r3, r8 C U1
cmpult r5, r11, r20 C L0 lo add => carry
subq r23, r10, r28 C U0 hi add => answer
ldq r5, 40(r16) C L1
mulq r19, r0, r9 C U1
cmpult r23, r10, r21 C L0 hi add => carry
addq r12, r20, r12 C U0 hi mul + carry
ldq r3, -8(r17) C L1
umulh r19, r0, r10 C U1
subq r6, r13, r13 C U0 lo + acc
stq r22, 0(r16) C L0
stq r28, 8(r16) C L1
bis r31, r31, r31 C L0 st slosh
mulq r19, r1, r11 C U1
bis r31, r31, r31 C L1 st slosh
addq r12, r21, r12 C U0 hi mul + carry
cmpult r6, r13, r20 C L0 lo add => carry
bis r31, r31, r31 C U1 mt
lda r16, 64(r16) C L1 bookkeeping
subq r13, r12, r22 C U0 hi add => answer
bis r31, r31, r31 C U1 mt
cmpult r13, r12, r21 C L0 hi add => carry
addq r14, r20, r14 C U0 hi mul + carry
ldq r6, -16(r16) C L1
bis r31, r31, r31 C U1 mt
subq r7, r15, r23 C L0 lo + acc
addq r14, r21, r14 C U0 hi mul + carry
ldq r0, 0(r17) C L1
umulh r19, r1, r12 C U1
cmpult r7, r15, r20 C L0 lo add => carry
subq r23, r14, r28 C U0 hi add => answer
ldq r7, -8(r16) C L1
mulq r19, r2, r13 C U1
cmpult r23, r14, r21 C L0 hi add => carry
addq r8, r20, r8 C U0 hi mul + carry
ldq r1, 8(r17) C L1
umulh r19, r2, r14 C U1
subq r4, r9, r9 C U0 lo + acc
stq r22, -48(r16) C L0
stq r28, -40(r16) C L1
bis r31, r31, r31 C L0 st slosh
mulq r19, r3, r15 C U1
bis r31, r31, r31 C L1 st slosh
addq r8, r21, r8 C U0 hi mul + carry
cmpult r4, r9, r20 C L0 lo add => carry
subq r9, r8, r22 C U0 hi add => answer
bis r31, r31, r31 C L1 mt
bgt r18, $Loop C U1 bookkeeping
C ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____
C Drains the pipeline without reading any further s1 limbs: loads the last
C two res limbs of the final group, completes and stores its limbs 2-7,
C and forms the final carry limb in r0.
$Lend:
cmpult r9, r8, r21 C L0 hi add => carry
addq r10, r20, r10 C U0 hi mul + carry
ldq r4, 0(r16) C L1
subq r5, r11, r23 C L0 lo + acc
addq r10, r21, r10 C L0 hi mul + carry
umulh r19, r3, r8 C U1
cmpult r5, r11, r20 C L0 lo add => carry
subq r23, r10, r28 C U0 hi add => answer
ldq r5, 8(r16) C L1
mulq r19, r0, r9 C U1
cmpult r23, r10, r21 C L0 hi add => carry
addq r12, r20, r12 C U0 hi mul + carry
umulh r19, r0, r10 C U1
subq r6, r13, r13 C L0 lo + acc
stq r22, -32(r16) C L0
stq r28, -24(r16) C L1
mulq r19, r1, r11 C U1
addq r12, r21, r12 C U0 hi mul + carry
cmpult r6, r13, r20 C L0 lo add => carry
subq r13, r12, r22 C U0 hi add => answer
cmpult r13, r12, r21 C L0 hi add => carry
addq r14, r20, r14 C U0 hi mul + carry
subq r7, r15, r23 C L0 lo + acc
addq r14, r21, r14 C U0 hi mul + carry
umulh r19, r1, r12 C U1
cmpult r7, r15, r20 C L0 lo add => carry
subq r23, r14, r28 C U0 hi add => answer
cmpult r23, r14, r21 C L0 hi add => carry
addq r8, r20, r8 C U0 hi mul + carry
subq r4, r9, r9 C U0 lo + acc
stq r22, -16(r16) C L0
stq r28, -8(r16) C L1
addq r8, r21, r8 C L0 hi mul + carry
cmpult r4, r9, r20 C L0 lo add => carry
subq r9, r8, r22 C U0 hi add => answer
cmpult r9, r8, r21 C L0 hi add => carry
addq r10, r20, r10 C U0 hi mul + carry
subq r5, r11, r23 C L0 lo + acc
addq r10, r21, r10 C L0 hi mul + carry
cmpult r5, r11, r20 C L0 lo add => carry
subq r23, r10, r28 C U0 hi add => answer
cmpult r23, r10, r21 C L0 hi add => carry
addq r12, r20, r12 C U0 hi mul + carry
stq r22, 0(r16) C L0
stq r28, 8(r16) C L1
C r0 = high product of the last limb plus both of its borrows.
addq r12, r21, r0 C U0 hi mul + carry
C Reload $9 through $15 from the frame, release the 240 bytes, and return.
ldq $9, 8($30)
ldq $10, 16($30)
ldq $11, 24($30)
ldq $12, 32($30)
ldq $13, 40($30)
ldq $14, 48($30)
ldq $15, 56($30)
lda $30, 240($30)
ret r31, (r26), 1
EPILOGUE(mpn_submul_1)
ASM_END()