459 lines
9.1 KiB
NASM
459 lines
9.1 KiB
NASM
include(`../config.m4')
|
|
C FILE: addmul_4_asm.s
|
|
C
|
|
C AUTHOR: Jason Worth Martin <jason.worth.martin@gmail.com>
|
|
C
|
|
C LICENSE: GNU LGPLv3+
|
|
C
|
|
C DESCRIPTION: uint64_t addmul_4(uint64_t *rp,
|
|
C uint64_t *up,
|
|
C uint64_t n,
|
|
C uint64_t *vp)
|
|
C
|
|
C Form the product of the n-limb number pointed to by up with the
|
|
C 4-limb number pointed to by vp. Accumulate n limbs of the product
|
|
C to the n-limb number pointed to by rp. Returns the sum of the
|
|
C most significant limb of the product with the last carry from the
|
|
C addition.
|
|
C
|
|
C WARNING: This code will only work correctly if n > 1. This is okay since
|
|
C for n = 1 it isn't worth the overhead of starting up all the pipelines.
|
|
C So, mul_basecase needs to check for the size of n and use a smaller routine,
|
|
C such as addmul_2 or addmul_1 if n is small.
|
|
C
|
|
C HOW THIS CODE WORKS
|
|
C -------------------
|
|
C
|
|
C This code is highly pipelined. There are four multiplication pipelines,
|
|
C two load pipelines, and four addition pipelines. All the pipelines are run
|
|
C in parallel to hide the latency of the operations. This means we have a
|
|
C large startup penalty, but once the pipelines are full we get four
|
|
C multiplications every six cycles for a throughput of 1.5 cycles/mul.
|
|
C
|
|
C The multiplication pipelines are m1, m2, m3, m4 with the mklo and mkhi
|
|
C for k=1,2,3,4 denoting the lo and hi words.
|
|
C
|
|
C The addition pipelines are a1, a2, a3, a4. The addition pipelines have
|
|
C varying lengths to accommodate the staggered additions. In other words,
|
|
C we have add lines that look like this:
|
|
C
|
|
C **** **** **** a1_1 a1_0
|
|
C **** **** a2_2 a2_1 a2_0
|
|
C **** a3_3 a3_2 a3_1 a3_0
|
|
C a4_4 a4_3 a4_2 a4_1 a4_0
|
|
C
|
|
C where the additions are performed in columns. We put the results from mul1
|
|
C into a1, mul2 into a2, etc.
|
|
C
|
|
C Note that we use the add+add+cmp+cmp carry propagation method instead of
|
|
C the add+cmp+add method. This is not a disadvantage because the main loop
|
|
C needs the ldf8, ld8, st8, and br instructions anyway, so we bury the extra
|
|
C compares in the bundles used by those instructions. This means that we are
|
|
C propagating single-bit carries along each of these add lines separately instead
|
|
C of dealing with word-wise carries. Using the word-wise carry propagation
|
|
C would reduce the number of instructions needed (add+cmp+add) but increases
|
|
C the latency. At this point, we need at least 16 limbs in the input before
|
|
C we break even with two times the addmul_2 routine, so it isn't worth using
|
|
C the longer latency loops because somewhere between 50 and 60 limbs we
|
|
C transition into Karatsuba multiplication anyway.
|
|
C
|
|
C The code uses the register rotation supported on Itanium. On each iteration
|
|
C of the main loop, the value appears to move up a register. We label the
|
|
C registers to take this into account so that, for example, the value in a1_1
|
|
C appears in a1_0 on the next loop cycle.
|
|
C
|
|
C We also use the rotating predicate registers to turn on and off the loads and
|
|
C stores. This is because we need to wait until the first result has moved
|
|
C all the way through the pipelines before storing it. Likewise, we stop the
|
|
C loads in the epilogue stages of the loop but continue the stores. See the
|
|
C Itanium documentation for a full explanation of register rotation and the
|
|
C br.ctop instruction that supports it.
|
|
C
|
|
|
|
|
|
C
|
|
C General Register use:
|
|
C
|
|
C r0 - Always 0
|
|
C r1 - Global Pointer (preserve)
|
|
C r2 - stores ar.pfs (scratch)
|
|
C r3 - scratch
|
|
C r4-r7 - preserve
|
|
C r8 - return0 (structure/union return pointer on entry)
|
|
C r9 - lc store (return1)
|
|
C r10 - small loop trip count (return2)
|
|
C r11 - scratch (return3)
|
|
C r12 - stack pointer
|
|
C r13 - thread pointer (preserve)
|
|
|
|
|
|
C v1p = vp -- only used at beginning of routine. Originally in r35
|
|
C v2p = vp + 1*8
|
|
C v3p = vp + 2*8
|
|
C v4p = vp + 3*8
|
|
C numlim = number of limbs
|
|
C res1 = (*rp) + a1_0
|
|
C res2 = res1 + a2_0
|
|
C res3 = res2 + a3_0
|
|
C res4 = res3 + a4_0
|
|
C sptr = store pointer. This is where we write the result. It is originally rp
|
|
C Fixed (non-rotating) general register aliases.
C
C pr_store and lc_store receive the caller's predicate registers and ar.lc at
C entry and are used to restore them just before the return.
pr_store = r3
|
|
lc_store = r9
|
|
C tripcnt_s is not referenced by the code in this file.
tripcnt_s = r10
|
|
C rptr and uptr are post-incremented load pointers into rp and up.
rptr = r14
|
|
uptr = r15
|
|
C v1p aliases the incoming vp argument register (r35); v2p, v3p and v4p are
C computed as vp+8, vp+16 and vp+24 and are only used for the initial loads.
v1p = r35
|
|
v2p = r16
|
|
v3p = r17
|
|
v4p = r18
|
|
C tripcnt is the value written to ar.lc before entering the main loop.
tripcnt = r19
|
|
C res1..res4 are the running column sums. res4 is placed in r8 so that the
C last sum formed by the loop is left in the return-value register.
res1 = r20
|
|
res2 = r21
|
|
res3 = r22
|
|
res4 = r8
|
|
C sptr is the post-incremented store pointer; it starts out equal to rp.
sptr = r23
|
|
C m1hi_s..m4hi_s are not referenced by the code in this file.
m1hi_s = r24
|
|
m2hi_s = r25
|
|
m3hi_s = r26
|
|
m4hi_s = r27
|
|
|
|
C
|
|
C General Register Rotating Pipelines
|
|
C
|
|
C The names below cover r32-r55, the 24 rotating registers requested by the
C alloc at function entry. On each loop trip a value appears to move up one
C register, so a value written to xxx_1 is read back as xxx_0 on the next
C trip. r32-r35 are also the four incoming argument registers; the setup
C code copies the arguments out before these registers are cleared.
C
|
|
C r32 - add line 1 stage 1 (rp on entry)
|
|
C r33 - add line 1 stage 0 (up on entry)
|
|
a1_1 = r32
|
|
a1_0 = r33
|
|
|
|
C r34 - add line 2 stage 2 (n on entry)
|
|
C r35 - add line 2 stage 1 (vp on entry)
|
|
C r36 - add line 2 stage 0
|
|
a2_2 = r34
|
|
a2_1 = r35
|
|
a2_0 = r36
|
|
|
|
C r37 - add line 3 stage 3
|
|
C r38 - add line 3 stage 2
|
|
C r39 - add line 3 stage 1
|
|
C r40 - add line 3 stage 0
|
|
a3_3 = r37
|
|
a3_2 = r38
|
|
a3_1 = r39
|
|
a3_0 = r40
|
|
|
|
C r41 - add line 4 stage 4
|
|
C r42 - add line 4 stage 3
|
|
C r43 - add line 4 stage 2
|
|
C r44 - add line 4 stage 1
|
|
C r45 - add line 4 stage 0
|
|
a4_4 = r41
|
|
a4_3 = r42
|
|
a4_2 = r43
|
|
a4_1 = r44
|
|
a4_0 = r45
|
|
|
|
C rp input pipeline
C Limbs loaded from rp enter at rpin_3 (rpin_2 for the very first limb) and
C are consumed by the first add line as rpin_0. rpin_c sits one register
C below rpin_3 and is cleared on every loop trip.
C NOTE(review): the commented-out r46 assignment below looks like a leftover
C from an earlier layout; the live definition is r47.
|
|
C rpin_c = r46
|
|
rpin_c = r47
|
|
rpin_3 = r48
|
|
rpin_2 = r49
|
|
rpin_1 = r50
|
|
rpin_0 = r51
|
|
|
|
C Remainder of rotating registers
C toprot is r55, the last register of the rotating region; it is only
C cleared during setup.
|
|
toprot = r55
|
|
|
|
|
|
C
|
|
C Floating Point Register Use
|
|
C
|
|
C v1..v4 hold the four multiplier limbs loaded from vp[0..3] during setup.
C They are static registers and stay constant for the whole loop.
v1 = f6
|
|
v2 = f7
|
|
v3 = f8
|
|
v4 = f9
|
|
|
|
C
|
|
C Floating point rotating register pipelines:
|
|
C
C As with the general registers, a value written to xxx_1 is read as xxx_0
C on the following loop trip.
|
|
|
|
C fp load pipeline
C Limbs of up are loaded into upld_1 inside the loop (upld_0 for the very
C first limb) and consumed by all four multiplies as upld_0. upld_c (f127)
C is cleared on every loop trip.
|
|
upld_c = f127
|
|
upld_2 = f32
|
|
upld_1 = f33
|
|
upld_0 = f34
|
|
|
|
C mul_line_1:
C mKlo_1 / mKhi_1 receive the low / high word of upld_0 * vK + mKhi_0;
C mKlo_0 is the low word from the previous trip, moved to the integer side
C with getf.sig. The same layout repeats for lines 2, 3 and 4.
|
|
m1lo_1 = f35
|
|
m1lo_0 = f36
|
|
m1hi_1 = f37
|
|
m1hi_0 = f38
|
|
|
|
C mul_line_2:
|
|
m2lo_1 = f39
|
|
m2lo_0 = f40
|
|
m2hi_1 = f41
|
|
m2hi_0 = f42
|
|
|
|
C mul_line_3:
|
|
m3lo_1 = f43
|
|
m3lo_0 = f44
|
|
m3hi_1 = f45
|
|
m3hi_0 = f46
|
|
|
|
C mul_line_4:
|
|
m4lo_1 = f47
|
|
m4lo_0 = f48
|
|
m4hi_1 = f49
|
|
m4hi_0 = f50
|
|
|
|
|
|
|
|
C
|
|
C Predicate Register Use
|
|
C
C Carry predicates for the four add lines. For line K:
C   cK_0  - carry into this trip's add (selects the "+1" form of the add)
C   cKn_0 - complement of cK_0 (selects the plain add)
C   cK_1 / cKn_1 - carry out / its complement, written by the compares in
C                  this trip and read as cK_0 / cKn_0 on the next trip
C Each _1 predicate is numbered one below its _0 partner so that predicate
C rotation performs the hand-over.
C The loop also uses p16 (gates the loads) and p30 (gates the store)
C directly, and p14/p15 distinguish n == 1 from n != 1 during setup.
|
|
c1_1 = p47
|
|
c1_0 = p48
|
|
c1n_1 = p49
|
|
c1n_0 = p50
|
|
|
|
c2_1 = p51
|
|
c2_0 = p52
|
|
c2n_1 = p53
|
|
c2n_0 = p54
|
|
|
|
c3_1 = p55
|
|
c3_0 = p56
|
|
c3n_1 = p57
|
|
c3n_0 = p58
|
|
|
|
c4_1 = p59
|
|
c4_0 = p60
|
|
c4n_1 = p61
|
|
c4n_0 = p62
|
|
|
|
|
|
|
|
ASM_START()
|
|
C mpn_addmul_4 entry and pipeline setup.
C On entry r32 = rp, r33 = up, r34 = n, r35 = vp. Everything from here to
C addmul_4_main_loop saves the caller's state, loads the four multiplier
C limbs and the first up/rp limbs, and clears every pipeline register and
C carry predicate that the loop reads before it writes.
PROLOGUE(mpn_addmul_4)
|
|
C Explicit mode: instruction groups are delimited by the ";;" stops written
C in this file. The "cycle N" labels below are the author's schedule notes;
C several labelled cycles share one instruction group.
.explicit
|
|
C Declare the predicate pairs below as mutually exclusive.
.pred.rel "mutex", p14, p15
|
|
.pred.rel "mutex", c1_0, c1n_0
|
|
.pred.rel "mutex", c1_1, c1n_1
|
|
.pred.rel "mutex", c2_0, c2n_0
|
|
.pred.rel "mutex", c2_1, c2n_1
|
|
.pred.rel "mutex", c3_0, c3n_0
|
|
.pred.rel "mutex", c3_1, c3n_1
|
|
.pred.rel "mutex", c4_0, c4n_0
|
|
.pred.rel "mutex", c4_1, c4n_1
|
|
C cycle 1
C Allocate the frame (4 inputs, 20 locals, 0 outputs, 24 rotating) and save
C ar.pfs, ar.lc and the predicate registers, with matching unwind directives.
C Then copy rp into both the load pointer and the store pointer, and up into
C uptr, because r32/r33 are reused as pipeline registers.
|
|
{
|
|
.prologue
|
|
.save ar.pfs, r2
|
|
alloc r2 = ar.pfs,4,20,0,24
|
|
.save ar.lc, lc_store
|
|
mov lc_store = ar.lc
|
|
.save pr, pr_store
|
|
mov pr_store = pr
|
|
}
|
|
{
|
|
.body
|
|
mov rptr = r32
|
|
mov sptr = r32
|
|
mov uptr = r33
|
|
}
|
|
;;
|
|
C cycle 2
C Load vp[0] and the first limb of up; set p14 = (n == 1), p15 = (n != 1);
C form the addresses of vp[1], vp[2] and vp[3].
|
|
{
|
|
ldf8 v1 = [v1p]
|
|
cmp.eq p14,p15 = 1, r34
|
|
add v2p = 8, r35
|
|
}
|
|
{
|
|
ldf8 upld_0 = [uptr], 8
|
|
add v3p = 16, r35
|
|
add v4p = 24, r35
|
|
}
|
|
;;
|
|
C cycle 3
C Load vp[1] and vp[2]. The loop count is n-2, or 0 when n == 1.
C Start clearing the multiply pipelines (f0 reads as 0.0).
|
|
{
|
|
ldf8 v2 = [v2p]
|
|
(p15) add tripcnt = -2, r34
|
|
mov m1lo_0 = f0
|
|
}
|
|
{
|
|
ldf8 v3 = [v3p]
|
|
(p14) mov tripcnt = 0
|
|
mov m1hi_0 = f0
|
|
}
|
|
C cycle 4
C Load vp[3] and the first limb of rp, and initialise the rotating
C predicates from an immediate mask: 0x0fff0000 has bits 16-27 set,
C 0x0ffe0000 has bits 17-27 set, so in the single-limb case p16 starts
C clear and the p16-gated loads in the loop are off on the first trip.
|
|
{
|
|
ldf8 v4 = [v4p]
|
|
mov m2lo_0 = f0
|
|
(p15) mov pr.rot = 0x0fff0000 C PR16 = 16-27 set to 1
|
|
}
|
|
{
|
|
ld8 rpin_2 = [rptr], 8
|
|
mov m2hi_0 = f0
|
|
(p14) mov pr.rot = 0x0ffe0000 C Single Limb
|
|
}
|
|
;;
|
|
C cycle 5
C From here to the loop label: zero the remaining pipeline registers and
C clear the carries. "cmp.ne cK_0, cKn_0 = r0, r0" is always false, so it
C leaves cK_0 = 0 and cKn_0 = 1 (no carry in). These compares come after
C the stop that follows the pr.rot write, which also covers p47-p62.
|
|
{
|
|
mov rpin_3 = 0
|
|
mov rpin_1 = 0
|
|
mov m3lo_0 = f0
|
|
}
|
|
{
|
|
mov rpin_0 = 0
|
|
cmp.ne c3_0, c3n_0 = r0, r0
|
|
mov m3hi_0 = f0
|
|
}
|
|
C cycle 6
C a1_1, a1_0 and a2_2 are r32, r33 and r34: the rp, up and n arguments are
C overwritten here, after their last use above.
|
|
{
|
|
cmp.ne c4_0, c4n_0 = r0, r0
|
|
mov a1_1 = 0
|
|
mov m4lo_0 = f0
|
|
}
|
|
{
|
|
mov a1_0 = 0
|
|
mov a2_2 = 0
|
|
mov m4hi_0 = f0
|
|
}
|
|
C cycle 7
C a2_1 is r35 (vp), no longer needed once v1 and v2p..v4p have been formed.
C The epilogue count is 7 for the single-limb case ...
|
|
{
|
|
mov a2_1 = 0
|
|
mov a2_0 = 0
|
|
mov upld_1 = f0
|
|
}
|
|
{
|
|
mov a3_3 = 0
|
|
mov upld_2 = f0
|
|
(p14) mov ar.ec = 7
|
|
}
|
|
;;
|
|
C cycle 8
C ... and 8 otherwise. ar.lc receives the trip count computed in cycle 3.
|
|
{
|
|
mov a3_2 = 0
|
|
mov upld_c = f0
|
|
(p15) mov ar.ec = 8
|
|
}
|
|
{
|
|
cmp.ne c1_0, c1n_0 = r0, r0
|
|
mov toprot = 0
|
|
mov ar.lc = tripcnt
|
|
}
|
|
C cycle 9
|
|
{
|
|
cmp.ne c2_0, c2n_0 = r0, r0
|
|
mov a3_0 = 0
|
|
mov a3_1 = 0
|
|
}
|
|
{
|
|
mov a4_4 = 0
|
|
mov a4_3 = 0
|
|
mov a4_0 = 0
|
|
}
|
|
C cycle 10
C The nops only fill out the last two bundles before the stop that precedes
C the loop.
|
|
{
|
|
mov a4_1 = 0
|
|
mov a4_2 = 0
|
|
nop 0
|
|
}
|
|
{
|
|
nop 0
|
|
nop 0
|
|
nop 0
|
|
}
|
|
;;
|
|
C Main loop: six instruction groups per trip, closed by br.ctop, which
C rotates the general, floating-point and predicate registers. Each trip
C starts four multiplies of the current up limb (upld_0) by v1..v4, moves the
C previous trip's low product words to the integer side, and adds one column
C of the staggered add lines, propagating a single-bit carry per line.
addmul_4_main_loop:
|
|
C The mutex declarations are repeated after the label.
.pred.rel "mutex", c1_0, c1n_0
|
|
.pred.rel "mutex", c1_1, c1n_1
|
|
.pred.rel "mutex", c2_0, c2n_0
|
|
.pred.rel "mutex", c2_1, c2n_1
|
|
.pred.rel "mutex", c3_0, c3n_0
|
|
.pred.rel "mutex", c3_1, c3n_1
|
|
.pred.rel "mutex", c4_0, c4n_0
|
|
.pred.rel "mutex", c4_1, c4n_1
|
|
C loop cycle 1
C Line 1: fetch the previous low product word into a1_1, load the next up
C limb while p16 is set, and start lo/hi of upld_0 * v1 + m1hi_0.
|
|
{
|
|
getf.sig a1_1 = m1lo_0
|
|
(p16) ldf8 upld_1 = [uptr], 8
|
|
xma.lu m1lo_1 = upld_0, v1, m1hi_0
|
|
}
|
|
{
|
|
nop 0
|
|
nop 0
|
|
xma.hu m1hi_1 = upld_0, v1, m1hi_0
|
|
}
|
|
;;
|
|
C loop cycle 2
C Line 2 multiply. res1 = rpin_0 + a1_0, plus 1 when carry c1_0 is set.
C The store writes the res4 left by the previous trip and is gated on p30.
|
|
{
|
|
getf.sig a2_2 = m2lo_0
|
|
xma.lu m2lo_1 = upld_0, v2, m2hi_0
|
|
(c1_0) add res1 = rpin_0, a1_0, 1
|
|
}
|
|
{
|
|
(p30) st8 [sptr] = res4, 8
|
|
(c1n_0) add res1 = rpin_0, a1_0
|
|
xma.hu m2hi_1 = upld_0, v2, m2hi_0
|
|
}
|
|
;;
|
|
C loop cycle 3
C Line 3 multiply. Load the next rp limb while p16 is set.
C res2 = res1 + a2_0, plus 1 when carry c2_0 is set.
|
|
{
|
|
getf.sig a3_3 = m3lo_0
|
|
(p16) ld8 rpin_3 = [rptr], 8
|
|
xma.lu m3lo_1 = upld_0, v3, m3hi_0
|
|
}
|
|
{
|
|
(c2_0) add res2 = res1, a2_0, 1
|
|
(c2n_0) add res2 = res1, a2_0
|
|
xma.hu m3hi_1 = upld_0, v3, m3hi_0
|
|
}
|
|
;;
|
|
C loop cycle 4
C Line 4 multiply. Carry out of line 1 when there was no carry in: the sum
C wrapped iff res1 < rpin_0 (unsigned). res3 = res2 + a3_0 (+1 on c3_0).
|
|
{
|
|
getf.sig a4_4 = m4lo_0
|
|
xma.lu m4lo_1 = upld_0, v4, m4hi_0
|
|
(c1n_0) cmp.ltu c1_1, c1n_1 = res1, rpin_0
|
|
}
|
|
{
|
|
(c3_0) add res3 = res2, a3_0, 1
|
|
(c3n_0) add res3 = res2, a3_0
|
|
xma.hu m4hi_1 = upld_0, v4, m4hi_0
|
|
}
|
|
;;
|
|
C loop cycle 5
C Carry out of line 1 when there was a carry in: wrapped iff
C res1 <= rpin_0. res4 = res3 + a4_0 (+1 on c4_0). The same ltu/leu pair
C produces the carry out of line 2, and the ltu half for line 3.
|
|
{
|
|
(c1_0) cmp.leu c1_1, c1n_1 = res1, rpin_0
|
|
(c4_0) add res4 = res3, a4_0, 1
|
|
(c4n_0) add res4 = res3, a4_0
|
|
}
|
|
{
|
|
(c2n_0) cmp.ltu c2_1, c2n_1 = res2, res1
|
|
(c2_0) cmp.leu c2_1, c2n_1 = res2, res1
|
|
(c3n_0) cmp.ltu c3_1, c3n_1 = res3, res2
|
|
}
|
|
;;
|
|
C loop cycle 6
C Finish the line 3 carry and compute the line 4 carry. upld_c and rpin_c
C are cleared every trip, presumably so that zeros rotate into the load
C pipelines once the p16-gated loads stop. br.ctop closes the trip.
|
|
{
|
|
(c3_0) cmp.leu c3_1, c3n_1 = res3, res2
|
|
mov upld_c = f0
|
|
mov rpin_c = 0
|
|
}
|
|
{
|
|
(c4_0) cmp.leu c4_1, c4n_1 = res4, res3
|
|
(c4n_0) cmp.ltu c4_1, c4n_1 = res4, res3
|
|
br.ctop.sptk.many addmul_4_main_loop
|
|
}
|
|
;;
|
|
|
|
|
|
C Return sequence. Switch to automatic mode, restore all predicates (mask
C -1), ar.lc and ar.pfs from the copies taken at entry, and return. res4 is
C r8, so the last sum formed by the loop, which has not been stored, is the
C return value.
.auto
|
|
mov pr = pr_store,-1
|
|
mov ar.lc = lc_store
|
|
mov ar.pfs = r2
|
|
br.ret.sptk.many b0
|
|
EPILOGUE()
|
|
ASM_END()
|
|
|