include(`../config.m4') C FILE: addmul_4_asm.s C C AUTHOR: Jason Worth Martin C C LICENSE: GNU LGPLv3+ C C DESCRIPTION: uint64_t addmul_4(uint64_t *rp, C uint64_t *up, C uint64_t n, C uint64_t *vp) C C Form the product of the n-limb number pointed to by up with the C 4-limb number pointed to by vp. Accumulate n limbs of the product C to the n-limb number pointed to by rp. Returns the sum of the C most significant limb of the product with the last carry from the C addition. C C WARNING: This code will only work correctly if n > 1. This is okay since C for n = 1 it isn't worth the overhead of starting up all the pipelines. C So, mul_basecase needs to check for the size of n and use a smaller routine, C such as addmul_2 or addmul_1 if n is small. C C HOW THIS CODE WORKS C ------------------- C C This code is highly pipelined. There are four multiplication pipelines, C two load pipelines, and four addition pipelines. All the pipelines are run C in parallel to hide the latency of the operations. This means we have a C large startup penalty, but once the pipelines are full we get four C multiplications every six cycles for a throughput of 1.5 cycles/mul. C C The multiplication pipelines are m1, m2, m3, m4 with the mklo and mkhi C for k=1,2,3,4 denoting the lo and hi words. C C The addition pipelines are a1, a2, a3, a4. The addition pipelines have C varying lengths to accomidate the staggered additions. In other words, C we have add lines that look like this: C C **** **** **** a1_1 a1_0 C **** **** a2_2 a2_1 a2_0 C **** a3_3 a3_2 a3_1 a3_0 C a4_4 a4_3 a4_2 a4_1 a4_0 C C where the additions are performed in columns. We put the results from mul1 C into a1, mul2 into a2, etc. C C Note that we use the add+add+cmp+cmp carry propigation method instead of C the add+cmp+add method. This is not a disadvantage because the main loop C needs the ldf8, ld8, st8, and br instructions anyway, so we bury the extra C compares in the bundles used by those instructions. This means that we are C propigatin single bit carries along each of these add lines separately instead C of dealing with word-wise carries. Using the word-wise carry propigation C would reduce the number of instructions needed (add+cmp+add) but increases C the latency. At this point, we need at least 16 limbs in the input before C we break even with two times the addmul_2 routine, so it isn't worth using C the longer latency loops because somewhere between 50 and 60 limbs we C transition into Karasuba multiplication anyway. C C The code uses the register rotation supported on Itanium. On each iteration C of the main loop, the value appears to move up a register. We label the C registers to take this into account so that, for eaxmple, the value in a1_1 C appears in a1_0 on the next loop cycle. C C We also use the rotating predicate registers to turn on and off the loads and C stores. This is because we need to wait until the first result has moved C all the way through the pipelines before storing it. Likewise, we stop the C loads in the epilogue stages of the loop but continue the stores. See the C Itanium documentation for a full explaination of register rotation and the C br.ctop instruction that supports it. C C C General Register use: C C r0 - Always 0 C r1 - Global Pointer (preserve) C r2 - stores ar.pfs (scratch) C r3 - scratch C r4-r7 - preserve C r8 - return0 (structure/union return pointer on entry) C r9 - lc store (return1) C r10 - small loop trip count (return2) C r11 - scratch (return3) C r12 - stack pointer C r13 - thread pointer (preserve) C v1p = vp -- only used at beginning of routine. Originally in r35 C v2p = vp + 1*8 C v3p = vp + 2*8 C v4p = vp + 3*8 C numlim = number of limbs C res1 = (*rp) + a1_0 C res2 = res1 + a2_0 C res3 = res2 + a3_0 C res4 = res3 + a4_0 C sp = store pointer. This is where we write the result. It is originally rp pr_store = r3 lc_store = r9 tripcnt_s = r10 rptr = r14 uptr = r15 v1p = r35 v2p = r16 v3p = r17 v4p = r18 tripcnt = r19 res1 = r20 res2 = r21 res3 = r22 res4 = r8 sptr = r23 m1hi_s = r24 m2hi_s = r25 m3hi_s = r26 m4hi_s = r27 C C General Register Rotating Pipelines C C C r32 - add line 1 stage 1 (rp on entry) C r33 - add line 1 stage 0 (uptr on entry) a1_1 = r32 a1_0 = r33 C r34 - add line 2 stage 2 (n on entry) C r35 - add line 2 stage 1 (vp on entry) C r36 - add line 2 stage 0 (vp on entry) a2_2 = r34 a2_1 = r35 a2_0 = r36 C r37 - add line 3 stage 3 C r38 - add line 3 stage 2 C r39 - add line 3 stage 1 C r40 - add line 3 stage 0 a3_3 = r37 a3_2 = r38 a3_1 = r39 a3_0 = r40 C r41 - add line 4 stage 4 C r42 - add line 4 stage 3 C r43 - add line 4 stage 2 C r44 - add line 4 stage 1 C r45 - add line 4 stage 0 a4_4 = r41 a4_3 = r42 a4_2 = r43 a4_1 = r44 a4_0 = r45 C rp input pipeline C rpin_c = r46 rpin_c = r47 rpin_3 = r48 rpin_2 = r49 rpin_1 = r50 rpin_0 = r51 C Remainder of rotating registers toprot = r55 C C Floating Point Register Use C v1 = f6 v2 = f7 v3 = f8 v4 = f9 C C Floating point rotating register pipelines: C C fp load pipeline upld_c = f127 upld_2 = f32 upld_1 = f33 upld_0 = f34 C mul_line_1: m1lo_1 = f35 m1lo_0 = f36 m1hi_1 = f37 m1hi_0 = f38 C mul_line_2: m2lo_1 = f39 m2lo_0 = f40 m2hi_1 = f41 m2hi_0 = f42 C mul_line_3: m3lo_1 = f43 m3lo_0 = f44 m3hi_1 = f45 m3hi_0 = f46 C mul_line_4: m4lo_1 = f47 m4lo_0 = f48 m4hi_1 = f49 m4hi_0 = f50 C C Predicate Register Use C c1_1 = p47 c1_0 = p48 c1n_1 = p49 c1n_0 = p50 c2_1 = p51 c2_0 = p52 c2n_1 = p53 c2n_0 = p54 c3_1 = p55 c3_0 = p56 c3n_1 = p57 c3n_0 = p58 c4_1 = p59 c4_0 = p60 c4n_1 = p61 c4n_0 = p62 ASM_START() PROLOGUE(mpn_addmul_4) .explicit .pred.rel "mutex", p14, p15 .pred.rel "mutex", c1_0, c1n_0 .pred.rel "mutex", c1_1, c1n_1 .pred.rel "mutex", c2_0, c2n_0 .pred.rel "mutex", c2_1, c2n_1 .pred.rel "mutex", c3_0, c3n_0 .pred.rel "mutex", c3_1, c3n_1 .pred.rel "mutex", c4_0, c4n_0 .pred.rel "mutex", c4_1, c4n_1 C cycle 1 { .prologue .save ar.pfs, r2 alloc r2 = ar.pfs,4,20,0,24 .save ar.lc, lc_store mov lc_store = ar.lc .save pr, pr_store mov pr_store = pr } { .body mov rptr = r32 mov sptr = r32 mov uptr = r33 } ;; C cycle 2 { ldf8 v1 = [v1p] cmp.eq p14,p15 = 1, r34 add v2p = 8, r35 } { ldf8 upld_0 = [uptr], 8 add v3p = 16, r35 add v4p = 24, r35 } ;; C cycle 3 { ldf8 v2 = [v2p] (p15) add tripcnt = -2, r34 mov m1lo_0 = f0 } { ldf8 v3 = [v3p] (p14) mov tripcnt = 0 mov m1hi_0 = f0 } C cycle 4 { ldf8 v4 = [v4p] mov m2lo_0 = f0 (p15) mov pr.rot = 0x0fff0000 C PR16 = 16-27 set to 1 } { ld8 rpin_2 = [rptr], 8 mov m2hi_0 = f0 (p14) mov pr.rot = 0x0ffe0000 C Single Limb } ;; C cycle 5 { mov rpin_3 = 0 mov rpin_1 = 0 mov m3lo_0 = f0 } { mov rpin_0 = 0 cmp.ne c3_0, c3n_0 = r0, r0 mov m3hi_0 = f0 } C cycle 6 { cmp.ne c4_0, c4n_0 = r0, r0 mov a1_1 = 0 mov m4lo_0 = f0 } { mov a1_0 = 0 mov a2_2 = 0 mov m4hi_0 = f0 } C cycle 7 { mov a2_1 = 0 mov a2_0 = 0 mov upld_1 = f0 } { mov a3_3 = 0 mov upld_2 = f0 (p14) mov ar.ec = 7 } ;; C cycle 8 { mov a3_2 = 0 mov upld_c = f0 (p15) mov ar.ec = 8 } { cmp.ne c1_0, c1n_0 = r0, r0 mov toprot = 0 mov ar.lc = tripcnt } C cycle 9 { cmp.ne c2_0, c2n_0 = r0, r0 mov a3_0 = 0 mov a3_1 = 0 } { mov a4_4 = 0 mov a4_3 = 0 mov a4_0 = 0 } C cycle 10 { mov a4_1 = 0 mov a4_2 = 0 nop 0 } { nop 0 nop 0 nop 0 } ;; addmul_4_main_loop: .pred.rel "mutex", c1_0, c1n_0 .pred.rel "mutex", c1_1, c1n_1 .pred.rel "mutex", c2_0, c2n_0 .pred.rel "mutex", c2_1, c2n_1 .pred.rel "mutex", c3_0, c3n_0 .pred.rel "mutex", c3_1, c3n_1 .pred.rel "mutex", c4_0, c4n_0 .pred.rel "mutex", c4_1, c4n_1 C loop cycle 1 { getf.sig a1_1 = m1lo_0 (p16) ldf8 upld_1 = [uptr], 8 xma.lu m1lo_1 = upld_0, v1, m1hi_0 } { nop 0 nop 0 xma.hu m1hi_1 = upld_0, v1, m1hi_0 } ;; C loop cycle 2 { getf.sig a2_2 = m2lo_0 xma.lu m2lo_1 = upld_0, v2, m2hi_0 (c1_0) add res1 = rpin_0, a1_0, 1 } { (p30) st8 [sptr] = res4, 8 (c1n_0) add res1 = rpin_0, a1_0 xma.hu m2hi_1 = upld_0, v2, m2hi_0 } ;; C loop cycle 3 { getf.sig a3_3 = m3lo_0 (p16) ld8 rpin_3 = [rptr], 8 xma.lu m3lo_1 = upld_0, v3, m3hi_0 } { (c2_0) add res2 = res1, a2_0, 1 (c2n_0) add res2 = res1, a2_0 xma.hu m3hi_1 = upld_0, v3, m3hi_0 } ;; C loop cycle 4 { getf.sig a4_4 = m4lo_0 xma.lu m4lo_1 = upld_0, v4, m4hi_0 (c1n_0) cmp.ltu c1_1, c1n_1 = res1, rpin_0 } { (c3_0) add res3 = res2, a3_0, 1 (c3n_0) add res3 = res2, a3_0 xma.hu m4hi_1 = upld_0, v4, m4hi_0 } ;; C loop cycle 5 { (c1_0) cmp.leu c1_1, c1n_1 = res1, rpin_0 (c4_0) add res4 = res3, a4_0, 1 (c4n_0) add res4 = res3, a4_0 } { (c2n_0) cmp.ltu c2_1, c2n_1 = res2, res1 (c2_0) cmp.leu c2_1, c2n_1 = res2, res1 (c3n_0) cmp.ltu c3_1, c3n_1 = res3, res2 } ;; C loop cycle 6 { (c3_0) cmp.leu c3_1, c3n_1 = res3, res2 mov upld_c = f0 mov rpin_c = 0 } { (c4_0) cmp.leu c4_1, c4n_1 = res4, res3 (c4n_0) cmp.ltu c4_1, c4n_1 = res4, res3 br.ctop.sptk.many addmul_4_main_loop } ;; .auto mov pr = pr_store,-1 mov ar.lc = lc_store mov ar.pfs = r2 br.ret.sptk.many b0 EPILOGUE() ASM_END()