include(`../config.m4')
C FILE: mpn/ia64/addmul_4.asm
C
C AUTHOR: Jason Worth Martin <jason.worth.martin@gmail.com>
C
C LICENSE: GNU LGPLv3+
C
C DESCRIPTION: uint64_t addmul_4(uint64_t *rp,
C                                uint64_t *up,
C                                uint64_t n,
C                                uint64_t *vp)
C
C Form the product of the n-limb number pointed to by up with the
C 4-limb number pointed to by vp. Accumulate n limbs of the product
C to the n-limb number pointed to by rp. Returns the sum of the
C most significant limb of the product with the last carry from the
C addition.
C
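C As a cross-check of the description above, here is a rough portable C
C model of the accumulation (a sketch only: the helper name is made up,
C and it assumes the conventional mpn addmul_k contract in which the
C n+3 low limbs of the product are added into {rp, n+3} and the top
C limb is returned; the summary above states this more loosely):
C
C   uint64_t ref_addmul_4(uint64_t *rp, const uint64_t *up,
C                         uint64_t n, const uint64_t *vp)
C   {
C       unsigned __int128 carry = 0;              /* carry into column k */
C       for (uint64_t k = 0; k < n + 3; k++) {
C           unsigned __int128 lo = (uint64_t)carry;
C           unsigned __int128 hi = carry >> 64;
C           lo += rp[k];
C           for (uint64_t j = 0; j < 4; j++) {
C               if (j <= k && k - j < n) {        /* up[k-j]*vp[j] lands here */
C                   unsigned __int128 p = (unsigned __int128)up[k - j] * vp[j];
C                   lo += (uint64_t)p;
C                   hi += p >> 64;
C               }
C           }
C           rp[k] = (uint64_t)lo;                 /* write this result limb */
C           carry = hi + (lo >> 64);              /* propagate to column k+1 */
C       }
C       return (uint64_t)carry;                   /* most significant limb */
C   }
C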
C WARNING: This code will only work correctly if n > 1. This is okay since
C for n = 1 it isn't worth the overhead of starting up all the pipelines.
C So, mul_basecase needs to check the size of n and use a smaller routine,
C such as addmul_2 or addmul_1, when n is small.
C
C HOW THIS CODE WORKS
C -------------------
C
C This code is highly pipelined. There are four multiplication pipelines,
C two load pipelines, and four addition pipelines. All the pipelines are run
C in parallel to hide the latency of the operations. This means we have a
C large startup penalty, but once the pipelines are full we get four
C multiplications every six cycles for a throughput of 1.5 cycles/mul.
C
C The multiplication pipelines are m1, m2, m3, m4 with the mklo and mkhi
C for k=1,2,3,4 denoting the lo and hi words.
C
C The addition pipelines are a1, a2, a3, a4. The addition pipelines have
C varying lengths to accommodate the staggered additions. In other words,
C we have add lines that look like this:
C
C **** **** **** a1_1 a1_0
C **** **** a2_2 a2_1 a2_0
C **** a3_3 a3_2 a3_1 a3_0
C a4_4 a4_3 a4_2 a4_1 a4_0
C
C where the additions are performed in columns. We put the results from mul1
C into a1, mul2 into a2, etc.
C
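C In scalar C terms (a sketch, not part of this file), each multiply line
C is an addmul_1-style lo/hi recurrence; xma.lu and xma.hu in the main
C loop below compute exactly the low and high 64 bits of u*v + acc:
C
C   /* one steady-state step of a single multiply line, e.g. mul1 */
C   unsigned __int128 t = (unsigned __int128)u * v1 + m1hi;
C   uint64_t m1lo = (uint64_t)t;   /* low word, fed into add line a1   */
C   m1hi = (uint64_t)(t >> 64);    /* high word, folded into next step */
C
C The four lines run this same recurrence on the same u with v1..v4; the
C add pipelines a1..a4 then delay the low words by increasing amounts
C (2 to 5 stages) so that the column sums line up as drawn above.
C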
C Note that we use the add+add+cmp+cmp carry propagation method instead of
C the add+cmp+add method. This is not a disadvantage because the main loop
C needs the ldf8, ld8, st8, and br instructions anyway, so we bury the extra
C compares in the bundles used by those instructions. This means that we
C propagate single-bit carries along each of these add lines separately
C instead of dealing with word-wise carries. Using word-wise carry
C propagation would reduce the number of instructions needed (add+cmp+add)
C but would increase the latency. As it stands, we need at least 16 limbs
C of input before we break even with two passes of the addmul_2 routine,
C so it isn't worth using the longer-latency loops: somewhere between 50
C and 60 limbs we transition into Karatsuba multiplication anyway.
C
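C In C, one add line of the add+add+cmp+cmp scheme (loop cycles 2 through
C 6 below) behaves like this helper, where a predicate pair such as
C c1_0/c1n_0 plays the role of the 1-bit carry (sketch, hypothetical name):
C
C   static uint64_t add_with_carry(uint64_t x, uint64_t y, int *c)
C   {
C       uint64_t res = x + y + *c;   /* (c)  add res = x, y, 1
C                                       (cn) add res = x, y           */
C       *c = *c ? (res <= x)         /* (c)  cmp.leu: wrap or equal   */
C               : (res < x);         /* (cn) cmp.ltu: wrapped past x  */
C       return res;
C   }
C
C Each result limb chains four of these: res1 = rpin + a1_0,
C res2 = res1 + a2_0, res3 = res2 + a3_0, res4 = res3 + a4_0,
C with each add line carrying its own bit.
C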
C The code uses the register rotation supported on Itanium. On each iteration
C of the main loop, the value appears to move up a register. We label the
C registers to take this into account so that, for example, the value in a1_1
C appears in a1_0 on the next loop cycle.
C
C We also use the rotating predicate registers to turn on and off the loads and
C stores. This is because we need to wait until the first result has moved
C all the way through the pipelines before storing it. Likewise, we stop the
C loads in the epilogue stages of the loop but continue the stores. See the
C Itanium documentation for a full explanation of register rotation and the
C br.ctop instruction that supports it.
C
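C A C model of the rotation (illustrative only; trips and next_limb are
C made up): br.ctop renames the rotating registers each iteration, which
C acts like shifting a software-pipeline array down one slot, e.g. for
C the five-stage a4 line:
C
C   uint64_t a4[5] = {0};            /* a4[4] = a4_4 .. a4[0] = a4_0     */
C   for (uint64_t i = 0; i < trips; i++) {
C       rp[i] = a4[0];               /* oldest stage, the predicated st8 */
C       for (int s = 0; s < 4; s++)  /* rotation: each value "moves up   */
C           a4[s] = a4[s + 1];       /* a register" every iteration      */
C       a4[4] = next_limb(i);        /* newest stage, produced this trip */
C   }
C
C The rotating predicates p16.. shift the same way, feeding in 1s while
C the loop count is positive and 0s during the epilogue stages, which is
C what turns the ldf8/ld8 loads off at the end and turns the st8 stores
C on once the pipeline is full.
C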
C
C General Register use:
C
C r0 - Always 0
C r1 - Global Pointer (preserve)
C r2 - stores ar.pfs (scratch)
C r3 - scratch
C r4-r7 - preserve
C r8 - return0 (structure/union return pointer on entry)
C r9 - lc store (return1)
C r10 - small loop trip count (return2)
C r11 - scratch (return3)
C r12 - stack pointer
C r13 - thread pointer (preserve)
C v1p = vp -- only used at beginning of routine. Originally in r35
C v2p = vp + 1*8
C v3p = vp + 2*8
C v4p = vp + 3*8
C tripcnt = loop trip count (derived from the number of limbs n)
C res1 = (*rp) + a1_0
C res2 = res1 + a2_0
C res3 = res2 + a3_0
C res4 = res3 + a4_0
C sptr = store pointer. This is where we write the result. It is originally rp
pr_store = r3
lc_store = r9
tripcnt_s = r10
rptr = r14
uptr = r15
v1p = r35
v2p = r16
v3p = r17
v4p = r18
tripcnt = r19
res1 = r20
res2 = r21
res3 = r22
res4 = r8
sptr = r23
m1hi_s = r24
m2hi_s = r25
m3hi_s = r26
m4hi_s = r27
C
C General Register Rotating Pipelines
C
C
C r32 - add line 1 stage 1 (rp on entry)
C r33 - add line 1 stage 0 (uptr on entry)
a1_1 = r32
a1_0 = r33
C r34 - add line 2 stage 2 (n on entry)
C r35 - add line 2 stage 1 (vp on entry)
C r36 - add line 2 stage 0 (vp on entry)
a2_2 = r34
a2_1 = r35
a2_0 = r36
C r37 - add line 3 stage 3
C r38 - add line 3 stage 2
C r39 - add line 3 stage 1
C r40 - add line 3 stage 0
a3_3 = r37
a3_2 = r38
a3_1 = r39
a3_0 = r40
C r41 - add line 4 stage 4
C r42 - add line 4 stage 3
C r43 - add line 4 stage 2
C r44 - add line 4 stage 1
C r45 - add line 4 stage 0
a4_4 = r41
a4_3 = r42
a4_2 = r43
a4_1 = r44
a4_0 = r45
C rp input pipeline
C rpin_c = r46
rpin_c = r47
rpin_3 = r48
rpin_2 = r49
rpin_1 = r50
rpin_0 = r51
C Remainder of rotating registers
toprot = r55
C
C Floating Point Register Use
C
v1 = f6
v2 = f7
v3 = f8
v4 = f9
C
C Floating point rotating register pipelines:
C
C fp load pipeline
upld_c = f127
upld_2 = f32
upld_1 = f33
upld_0 = f34
C mul_line_1:
m1lo_1 = f35
m1lo_0 = f36
m1hi_1 = f37
m1hi_0 = f38
C mul_line_2:
m2lo_1 = f39
m2lo_0 = f40
m2hi_1 = f41
m2hi_0 = f42
C mul_line_3:
m3lo_1 = f43
m3lo_0 = f44
m3hi_1 = f45
m3hi_0 = f46
C mul_line_4:
m4lo_1 = f47
m4lo_0 = f48
m4hi_1 = f49
m4hi_0 = f50
C
C Predicate Register Use
C
c1_1 = p47
c1_0 = p48
c1n_1 = p49
c1n_0 = p50
c2_1 = p51
c2_0 = p52
c2n_1 = p53
c2n_0 = p54
c3_1 = p55
c3_0 = p56
c3n_1 = p57
c3n_0 = p58
c4_1 = p59
c4_0 = p60
c4n_1 = p61
c4n_0 = p62
ASM_START()
PROLOGUE(mpn_addmul_4)
.explicit
.pred.rel "mutex", p14, p15
.pred.rel "mutex", c1_0, c1n_0
.pred.rel "mutex", c1_1, c1n_1
.pred.rel "mutex", c2_0, c2n_0
.pred.rel "mutex", c2_1, c2n_1
.pred.rel "mutex", c3_0, c3n_0
.pred.rel "mutex", c3_1, c3n_1
.pred.rel "mutex", c4_0, c4n_0
.pred.rel "mutex", c4_1, c4n_1
C cycle 1
{
.prologue
.save ar.pfs, r2
alloc r2 = ar.pfs,4,20,0,24
.save ar.lc, lc_store
mov lc_store = ar.lc
.save pr, pr_store
mov pr_store = pr
}
{
.body
mov rptr = r32
mov sptr = r32
mov uptr = r33
}
;;
C cycle 2
{
ldf8 v1 = [v1p]
cmp.eq p14,p15 = 1, r34
add v2p = 8, r35
}
{
ldf8 upld_0 = [uptr], 8
add v3p = 16, r35
add v4p = 24, r35
}
;;
C cycle 3
{
ldf8 v2 = [v2p]
(p15) add tripcnt = -2, r34
mov m1lo_0 = f0
}
{
ldf8 v3 = [v3p]
(p14) mov tripcnt = 0
mov m1hi_0 = f0
}
C cycle 4
{
ldf8 v4 = [v4p]
mov m2lo_0 = f0
(p15) mov pr.rot = 0x0fff0000 C p16 through p27 set to 1
}
{
ld8 rpin_2 = [rptr], 8
mov m2hi_0 = f0
(p14) mov pr.rot = 0x0ffe0000 C Single Limb
}
;;
C cycle 5
{
mov rpin_3 = 0
mov rpin_1 = 0
mov m3lo_0 = f0
}
{
mov rpin_0 = 0
cmp.ne c3_0, c3n_0 = r0, r0
mov m3hi_0 = f0
}
C cycle 6
{
cmp.ne c4_0, c4n_0 = r0, r0
mov a1_1 = 0
mov m4lo_0 = f0
}
{
mov a1_0 = 0
mov a2_2 = 0
mov m4hi_0 = f0
}
C cycle 7
{
mov a2_1 = 0
mov a2_0 = 0
mov upld_1 = f0
}
{
mov a3_3 = 0
mov upld_2 = f0
(p14) mov ar.ec = 7
}
;;
C cycle 8
{
mov a3_2 = 0
mov upld_c = f0
(p15) mov ar.ec = 8
}
{
cmp.ne c1_0, c1n_0 = r0, r0
mov toprot = 0
mov ar.lc = tripcnt
}
C cycle 9
{
cmp.ne c2_0, c2n_0 = r0, r0
mov a3_0 = 0
mov a3_1 = 0
}
{
mov a4_4 = 0
mov a4_3 = 0
mov a4_0 = 0
}
C cycle 10
{
mov a4_1 = 0
mov a4_2 = 0
nop 0
}
{
nop 0
nop 0
nop 0
}
;;
addmul_4_main_loop:
.pred.rel "mutex", c1_0, c1n_0
.pred.rel "mutex", c1_1, c1n_1
.pred.rel "mutex", c2_0, c2n_0
.pred.rel "mutex", c2_1, c2n_1
.pred.rel "mutex", c3_0, c3n_0
.pred.rel "mutex", c3_1, c3n_1
.pred.rel "mutex", c4_0, c4n_0
.pred.rel "mutex", c4_1, c4n_1
C loop cycle 1
{
getf.sig a1_1 = m1lo_0
(p16) ldf8 upld_1 = [uptr], 8
xma.lu m1lo_1 = upld_0, v1, m1hi_0
}
{
nop 0
nop 0
xma.hu m1hi_1 = upld_0, v1, m1hi_0
}
;;
C loop cycle 2
{
getf.sig a2_2 = m2lo_0
xma.lu m2lo_1 = upld_0, v2, m2hi_0
(c1_0) add res1 = rpin_0, a1_0, 1
}
{
(p30) st8 [sptr] = res4, 8
(c1n_0) add res1 = rpin_0, a1_0
xma.hu m2hi_1 = upld_0, v2, m2hi_0
}
;;
C loop cycle 3
{
getf.sig a3_3 = m3lo_0
(p16) ld8 rpin_3 = [rptr], 8
xma.lu m3lo_1 = upld_0, v3, m3hi_0
}
{
(c2_0) add res2 = res1, a2_0, 1
(c2n_0) add res2 = res1, a2_0
xma.hu m3hi_1 = upld_0, v3, m3hi_0
}
;;
C loop cycle 4
{
getf.sig a4_4 = m4lo_0
xma.lu m4lo_1 = upld_0, v4, m4hi_0
(c1n_0) cmp.ltu c1_1, c1n_1 = res1, rpin_0
}
{
(c3_0) add res3 = res2, a3_0, 1
(c3n_0) add res3 = res2, a3_0
xma.hu m4hi_1 = upld_0, v4, m4hi_0
}
;;
C loop cycle 5
{
(c1_0) cmp.leu c1_1, c1n_1 = res1, rpin_0
(c4_0) add res4 = res3, a4_0, 1
(c4n_0) add res4 = res3, a4_0
}
{
(c2n_0) cmp.ltu c2_1, c2n_1 = res2, res1
(c2_0) cmp.leu c2_1, c2n_1 = res2, res1
(c3n_0) cmp.ltu c3_1, c3n_1 = res3, res2
}
;;
C loop cycle 6
{
(c3_0) cmp.leu c3_1, c3n_1 = res3, res2
mov upld_c = f0
mov rpin_c = 0
}
{
(c4_0) cmp.leu c4_1, c4n_1 = res4, res3
(c4n_0) cmp.ltu c4_1, c4n_1 = res4, res3
br.ctop.sptk.many addmul_4_main_loop
}
;;
.auto
mov pr = pr_store,-1
mov ar.lc = lc_store
mov ar.pfs = r2
br.ret.sptk.many b0
EPILOGUE()
ASM_END()