include(`../config.m4')
C FILE: addmul_4_asm.s
C
C AUTHOR: Jason Worth Martin <jason.worth.martin@gmail.com>
C
C LICENSE: GNU LGPLv3+
C
C DESCRIPTION: uint64_t addmul_4(uint64_t *rp,
C                                uint64_t *up,
C                                uint64_t n,
C                                uint64_t *vp)
C
C Form the product of the n-limb number pointed to by up with the
C 4-limb number pointed to by vp. Accumulate n limbs of the product
C to the n-limb number pointed to by rp. Returns the sum of the
C most significant limb of the product with the last carry from the
C addition.
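C
C For reference, the one-limb analogue below is a hedged C sketch (not
C part of this file; addmul_1_model is an illustrative name) of the
C accumulate-and-carry contract that addmul_4 extends to four v limbs:
C
C   uint64_t addmul_1_model(uint64_t *rp, const uint64_t *up,
C                           uint64_t n, uint64_t v)
C   {
C       uint64_t cy = 0;
C       for (uint64_t i = 0; i < n; i++) {
C           unsigned __int128 t = (unsigned __int128)up[i] * v
C                                 + rp[i] + cy;
C           rp[i] = (uint64_t)t;       /* accumulate the low limb   */
C           cy = (uint64_t)(t >> 64);  /* carry the high limb along */
C       }
C       return cy;  /* MS product limb plus the final addition carry */
C   }
C
C addmul_4 runs four such lanes at once, one per limb of vp, each lane
C staggered one limb from the previous.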
C
C WARNING: This code will only work correctly if n > 1. This is okay since
C for n = 1 it isn't worth the overhead of starting up all the pipelines.
C So, mul_basecase needs to check the size of n and use a smaller routine,
C such as addmul_2 or addmul_1, when n is small.
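C
C A hypothetical caller-side shape (sketch only; the real dispatch
C lives in mul_basecase, and the small case needs its own
C addmul_1/addmul_2 sequence rather than a single call --
C addmul_small is a made-up name):
C
C   if (n > 1)
C       cy = mpn_addmul_4(rp, up, n, vp);
C   else
C       cy = addmul_small(rp, up, vp);   /* hypothetical helper */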
C
C HOW THIS CODE WORKS
C -------------------
C
C This code is highly pipelined. There are four multiplication pipelines,
C two load pipelines, and four addition pipelines. All the pipelines are run
C in parallel to hide the latency of the operations. This means we have a
C large startup penalty, but once the pipelines are full we get four
C multiplications every six cycles for a throughput of 1.5 cycles/mul.
C
C The multiplication pipelines are m1, m2, m3, m4 with the mklo and mkhi
C for k=1,2,3,4 denoting the lo and hi words.
C
C The addition pipelines are a1, a2, a3, a4. The addition pipelines have
C varying lengths to accommodate the staggered additions. In other words,
C we have add lines that look like this:
C
C **** **** **** a1_1 a1_0
C **** **** a2_2 a2_1 a2_0
C **** a3_3 a3_2 a3_1 a3_0
C a4_4 a4_3 a4_2 a4_1 a4_0
C
C where the additions are performed in columns. We put the results from mul1
C into a1, mul2 into a2, etc.
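C
C In C terms, column i gathers one low word from each mul line, offset
C by the line number (an illustrative sketch; lo_k[j] is the low word
C of up[j]*vp[k-1], taken as 0 when j < 0):
C
C   col_i = rp[i] + lo_1[i] + lo_2[i-1] + lo_3[i-2] + lo_4[i-3];
C
C with each line's carry bit kept within its own add line rather than
C rippling across the column.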
C
C Note that we use the add+add+cmp+cmp carry propagation method instead of
C the add+cmp+add method. This is not a disadvantage because the main loop
C needs the ldf8, ld8, st8, and br instructions anyway, so we bury the extra
C compares in the bundles used by those instructions. This means that we are
C propagating single-bit carries along each of these add lines separately
C instead of dealing with word-wise carries. Using word-wise carry
C propagation would reduce the number of instructions needed (add+cmp+add)
C but increase the latency. At this point, we need at least 16 limbs in the
C input before we break even with two calls to the addmul_2 routine, so it
C isn't worth using the longer-latency loops because somewhere between 50
C and 60 limbs we transition into Karatsuba multiplication anyway.
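C
C One stage of one add line, written as a hedged C sketch (cy is the
C single-bit carry that a predicate pair like c1_0/c1n_0 encodes; x is
C the incoming column value and y the limb joining it):
C
C   res = x + y + cy;                  /* (c) add res=x,y,1 / (cn) add res=x,y */
C   cy  = cy ? (res <= x) : (res < x); /* (c) cmp.leu       / (cn) cmp.ltu     */
C
C The word-wise add+cmp+add alternative must order its second add
C behind the compare, which is the extra latency mentioned above.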
C
C The code uses the register rotation supported on Itanium. On each iteration
C of the main loop, the value appears to move up a register. We label the
C registers to take this into account so that, for example, the value in a1_1
C appears in a1_0 on the next loop cycle.
C
C We also use the rotating predicate registers to turn on and off the loads
C and stores. This is because we need to wait until the first result has moved
C all the way through the pipelines before storing it. Likewise, we stop the
C loads in the epilogue stages of the loop but continue the stores. See the
C Itanium documentation for a full explanation of register rotation and the
C br.ctop instruction that supports it.
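C
C A rough C model of the effect (illustrative only; produce() and
C consume() are hypothetical stand-ins, and real rotation renames
C registers on br.ctop instead of copying values):
C
C   for (i = 0; i < trips; i++) {
C       if (stage_live)          /* a rotating predicate, e.g. p16 */
C           a1[1] = produce();   /* this iteration's new value     */
C       consume(a1[0]);          /* value produced last iteration  */
C       a1[0] = a1[1];           /* br.ctop: a1_1 becomes a1_0     */
C   }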
C
C
C General Register use:
C
C r0 - Always 0
C r1 - Global Pointer (preserve)
C r2 - stores ar.pfs (scratch)
C r3 - scratch
C r4-r7 - preserve
C r8 - return0 (structure/union return pointer on entry)
C r9 - lc store (return1)
C r10 - small loop trip count (return2)
C r11 - scratch (return3)
C r12 - stack pointer
C r13 - thread pointer (preserve)
C v1p = vp -- only used at beginning of routine. Originally in r35
C v2p = vp + 1*8
C v3p = vp + 2*8
C v4p = vp + 3*8
C numlim = number of limbs
C res1 = (*rp) + a1_0
C res2 = res1 + a2_0
C res3 = res2 + a3_0
C res4 = res3 + a4_0
C sptr = store pointer. This is where we write the result. It is originally rp
pr_store = r3
lc_store = r9
tripcnt_s = r10
rptr = r14
uptr = r15
v1p = r35
v2p = r16
v3p = r17
v4p = r18
tripcnt = r19
res1 = r20
res2 = r21
res3 = r22
res4 = r8
sptr = r23
m1hi_s = r24
m2hi_s = r25
m3hi_s = r26
m4hi_s = r27
C
C General Register Rotating Pipelines
C
C
C r32 - add line 1 stage 1 (rp on entry)
C r33 - add line 1 stage 0 (uptr on entry)
a1_1 = r32
a1_0 = r33
C r34 - add line 2 stage 2 (n on entry)
C r35 - add line 2 stage 1 (vp on entry)
C r36 - add line 2 stage 0
a2_2 = r34
a2_1 = r35
a2_0 = r36
C r37 - add line 3 stage 3
C r38 - add line 3 stage 2
C r39 - add line 3 stage 1
C r40 - add line 3 stage 0
a3_3 = r37
a3_2 = r38
a3_1 = r39
a3_0 = r40
C r41 - add line 4 stage 4
C r42 - add line 4 stage 3
C r43 - add line 4 stage 2
C r44 - add line 4 stage 1
C r45 - add line 4 stage 0
a4_4 = r41
a4_3 = r42
a4_2 = r43
a4_1 = r44
a4_0 = r45
C rp input pipeline
C rpin_c = r46
rpin_c = r47
rpin_3 = r48
rpin_2 = r49
rpin_1 = r50
rpin_0 = r51
C Remainder of rotating registers
toprot = r55
C
C Floating Point Register Use
C
v1 = f6
v2 = f7
v3 = f8
v4 = f9
C
C Floating point rotating register pipelines:
C
C fp load pipeline
upld_c = f127
upld_2 = f32
upld_1 = f33
upld_0 = f34
C mul_line_1:
m1lo_1 = f35
m1lo_0 = f36
m1hi_1 = f37
m1hi_0 = f38
C mul_line_2:
m2lo_1 = f39
m2lo_0 = f40
m2hi_1 = f41
m2hi_0 = f42
C mul_line_3:
m3lo_1 = f43
m3lo_0 = f44
m3hi_1 = f45
m3hi_0 = f46
C mul_line_4:
m4lo_1 = f47
m4lo_0 = f48
m4hi_1 = f49
m4hi_0 = f50
C
C Predicate Register Use
C
c1_1 = p47
c1_0 = p48
c1n_1 = p49
c1n_0 = p50
c2_1 = p51
c2_0 = p52
c2n_1 = p53
c2n_0 = p54
c3_1 = p55
c3_0 = p56
c3n_1 = p57
c3n_0 = p58
c4_1 = p59
c4_0 = p60
c4n_1 = p61
c4n_0 = p62
ASM_START()
PROLOGUE(mpn_addmul_4)
.explicit
.pred.rel "mutex", p14, p15
.pred.rel "mutex", c1_0, c1n_0
.pred.rel "mutex", c1_1, c1n_1
.pred.rel "mutex", c2_0, c2n_0
.pred.rel "mutex", c2_1, c2n_1
.pred.rel "mutex", c3_0, c3n_0
.pred.rel "mutex", c3_1, c3n_1
.pred.rel "mutex", c4_0, c4n_0
.pred.rel "mutex", c4_1, c4n_1
C cycle 1
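C (alloc: 4 inputs, 20 locals, 0 outputs, 24 rotating -- r32-r55)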
{
.prologue
.save ar.pfs, r2
alloc r2 = ar.pfs,4,20,0,24
.save ar.lc, lc_store
mov lc_store = ar.lc
.save pr, pr_store
mov pr_store = pr
}
{
.body
mov rptr = r32
mov sptr = r32
mov uptr = r33
}
;;
C cycle 2
{
ldf8 v1 = [v1p]
cmp.eq p14,p15 = 1, r34
add v2p = 8, r35
}
{
ldf8 upld_0 = [uptr], 8
add v3p = 16, r35
add v4p = 24, r35
}
;;
C cycle 3
{
ldf8 v2 = [v2p]
(p15) add tripcnt = -2, r34
mov m1lo_0 = f0
}
{
ldf8 v3 = [v3p]
(p14) mov tripcnt = 0
mov m1hi_0 = f0
}
C cycle 4
{
ldf8 v4 = [v4p]
mov m2lo_0 = f0
(p15) mov pr.rot = 0x0fff0000 C PR16 = 16-27 set to 1
}
{
ld8 rpin_2 = [rptr], 8
mov m2hi_0 = f0
(p14) mov pr.rot = 0x0ffe0000 C Single Limb
}
;;
C cycle 5
{
mov rpin_3 = 0
mov rpin_1 = 0
mov m3lo_0 = f0
}
{
mov rpin_0 = 0
cmp.ne c3_0, c3n_0 = r0, r0
mov m3hi_0 = f0
}
C cycle 6
{
cmp.ne c4_0, c4n_0 = r0, r0
mov a1_1 = 0
mov m4lo_0 = f0
}
{
mov a1_0 = 0
mov a2_2 = 0
mov m4hi_0 = f0
}
C cycle 7
{
mov a2_1 = 0
mov a2_0 = 0
mov upld_1 = f0
}
{
mov a3_3 = 0
mov upld_2 = f0
(p14) mov ar.ec = 7
}
;;
C cycle 8
{
mov a3_2 = 0
mov upld_c = f0
(p15) mov ar.ec = 8
}
{
cmp.ne c1_0, c1n_0 = r0, r0
mov toprot = 0
mov ar.lc = tripcnt
}
C cycle 9
{
cmp.ne c2_0, c2n_0 = r0, r0
mov a3_0 = 0
mov a3_1 = 0
}
{
mov a4_4 = 0
mov a4_3 = 0
mov a4_0 = 0
}
C cycle 10
{
mov a4_1 = 0
mov a4_2 = 0
nop 0
}
{
nop 0
nop 0
nop 0
}
;;
addmul_4_main_loop:
.pred.rel "mutex", c1_0, c1n_0
.pred.rel "mutex", c1_1, c1n_1
.pred.rel "mutex", c2_0, c2n_0
.pred.rel "mutex", c2_1, c2n_1
.pred.rel "mutex", c3_0, c3n_0
.pred.rel "mutex", c3_1, c3n_1
.pred.rel "mutex", c4_0, c4n_0
.pred.rel "mutex", c4_1, c4n_1
C loop cycle 1
{
getf.sig a1_1 = m1lo_0
(p16) ldf8 upld_1 = [uptr], 8
xma.lu m1lo_1 = upld_0, v1, m1hi_0
}
{
nop 0
nop 0
xma.hu m1hi_1 = upld_0, v1, m1hi_0
}
;;
C loop cycle 2
{
getf.sig a2_2 = m2lo_0
xma.lu m2lo_1 = upld_0, v2, m2hi_0
(c1_0) add res1 = rpin_0, a1_0, 1
}
{
(p30) st8 [sptr] = res4, 8
(c1n_0) add res1 = rpin_0, a1_0
xma.hu m2hi_1 = upld_0, v2, m2hi_0
}
;;
C loop cycle 3
{
getf.sig a3_3 = m3lo_0
(p16) ld8 rpin_3 = [rptr], 8
xma.lu m3lo_1 = upld_0, v3, m3hi_0
}
{
(c2_0) add res2 = res1, a2_0, 1
(c2n_0) add res2 = res1, a2_0
xma.hu m3hi_1 = upld_0, v3, m3hi_0
}
;;
C loop cycle 4
{
getf.sig a4_4 = m4lo_0
xma.lu m4lo_1 = upld_0, v4, m4hi_0
(c1n_0) cmp.ltu c1_1, c1n_1 = res1, rpin_0
}
{
(c3_0) add res3 = res2, a3_0, 1
(c3n_0) add res3 = res2, a3_0
xma.hu m4hi_1 = upld_0, v4, m4hi_0
}
;;
C loop cycle 5
{
(c1_0) cmp.leu c1_1, c1n_1 = res1, rpin_0
(c4_0) add res4 = res3, a4_0, 1
(c4n_0) add res4 = res3, a4_0
}
{
(c2n_0) cmp.ltu c2_1, c2n_1 = res2, res1
(c2_0) cmp.leu c2_1, c2n_1 = res2, res1
(c3n_0) cmp.ltu c3_1, c3n_1 = res3, res2
}
;;
C loop cycle 6
{
(c3_0) cmp.leu c3_1, c3n_1 = res3, res2
mov upld_c = f0
mov rpin_c = 0
}
{
(c4_0) cmp.leu c4_1, c4n_1 = res4, res3
(c4n_0) cmp.ltu c4_1, c4n_1 = res4, res3
br.ctop.sptk.many addmul_4_main_loop
}
;;
.auto
mov pr = pr_store,-1
mov ar.lc = lc_store
mov ar.pfs = r2
br.ret.sptk.many b0
EPILOGUE()
ASM_END()