mpir/mpn/ia64/addmul_1.asm
2010-03-08 00:26:51 +00:00

591 lines
13 KiB
NASM

dnl IA-64 mpn_addmul_1 -- Multiply a limb vector with a limb and add the
dnl result to a second limb vector.
dnl Copyright 2000, 2001, 2002, 2003, 2004, 2005, 2007 Free Software
dnl Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
C Itanium: 3.0
C Itanium 2: 2.0
C TODO
C * Further optimize feed-in and wind-down code, both for speed and code size.
C * Handle low limb input and results specially, using a common stf8 in the
C epilogue.
C * Use 1 c/l carry propagation scheme in wind-down code.
C * Use extra pointer registers for `up' and rp to speed up feed-in loads.
C * Work out final differences with mul_1.asm. That function is 300 bytes
C smaller than this due to better loop scheduling and thus simpler feed-in
C code.
C INPUT PARAMETERS
C m4 aliases for the four argument registers r32..r35.
C Roles as used by the code below:
C   rp  limb vector that is read as the addend and written with the result
C   up  limb vector that is multiplied
C   n   number of limbs in each vector
C   vl  the single multiplier limb
define(`rp', `r32')
define(`up', `r33')
define(`n', `r34')
define(`vl', `r35')
ASM_START()
C mpn_addmul_1 -- entry code and dispatch.
C
C Computes up[i] * vl + rp[i] for each of the n limbs, propagates the high
C words upwards and stores the low words back to the rp vector.  The high
C word of the last product, plus the final carry, is left in r8 before
C each br.ret (presumably the return value -- confirm against the ABI).
C
C State established here and relied on by all later sections:
C   r2      caller value of ar.lc, restored before each br.ret
C   r20     store pointer, a copy of rp (rp itself is advanced by the
C           post-incrementing loads)
C   f6      vl, moved to a floating-point register for use by xma
C   f7, f8  first limb of up and first limb of rp
C   ar.lc   (n-1) >> 2, consumed by the br.cloop instructions
C   p6..p9  carry predicates, initialised to "no carry"
C           (p6 = p8 = 0, p7 = p9 = 1)
C
C NOTE(review): one limb is loaded from up and rp before n is examined, so
C n is presumably required to be at least 1 -- confirm with callers.
C
C The trailing letters on instruction lines (M, I, I0, M2 M3, F, B) appear
C to name the execution unit types each instruction can issue to.
PROLOGUE(mpn_addmul_1)
.prologue
.save ar.lc, r2
.body
C For the 32-bit ABI only: convert the two pointer arguments with addp4
C and zero-extend the limb count.
ifdef(`HAVE_ABI_32',
` addp4 rp = 0, rp C M I
addp4 up = 0, up C M I
zxt4 n = n C I
;;
')
C r15 = n - 1, r20 = store pointer, r2 = saved ar.lc.
{.mmi
adds r15 = -1, n C M I
mov r20 = rp C M I
mov.i r2 = ar.lc C I0
}
C Load limb 0 of both vectors; r14 = n mod 4 selects the feed-in variant.
{.mmi
ldf8 f7 = [up], 8 C M
ldf8 f8 = [rp], 8 C M
and r14 = 3, n C M I
;;
}
C p10 <- n mod 4 == 0; r31 = (n-1) >> 2 becomes the loop count.
{.mmi
setf.sig f6 = vl C M2 M3
cmp.eq p10, p0 = 0, r14 C M I
shr.u r31 = r15, 2 C I0
}
C p11 <- n mod 4 == 2, p12 <- n mod 4 == 3.
{.mmi
cmp.eq p11, p0 = 2, r14 C M I
cmp.eq p12, p0 = 3, r14 C M I
nop.i 0 C I
;;
}
C Comparing r0 with itself for inequality is always false, so this clears
C p6 and p8 (carry) and sets p7 and p9 (no carry).
{.mii
cmp.ne p6, p7 = r0, r0 C M I
mov.i ar.lc = r31 C I0
cmp.ne p8, p9 = r0, r0 C M I
}
C Dispatch on n mod 4.  The remaining case, n mod 4 == 1, falls through
C to .Lb01.
{.bbb
(p10) br.dptk .Lb00 C B
(p11) br.dptk .Lb10 C B
(p12) br.dptk .Lb11 C B
;;
}
C Feed-in code for n mod 4 == 1.
C ar.lc starts at (n-1) >> 2 and each taken br.cloop decrements it, so the
C branch below is taken for n >= 5 and falls through only for n == 1.
.Lb01: br.cloop.dptk .grt1 C B
C n == 1: a single multiply-add.  The low word goes to the first rp limb,
C the high word is left in r8, ar.lc is restored, and we return.
xma.l f39 = f7, f6, f8 C F
xma.hu f43 = f7, f6, f8 C F
;;
getf.sig r8 = f43 C M2
stf8 [r20] = f39 C M2 M3
mov.i ar.lc = r2 C I0
br.ret.sptk.many b0 C B
C n >= 5: load limbs 1..4 of both vectors into f32..f35 / f44..f47 and
C form the limb 0 product in f39 (low word) and f43 (high word).
.grt1:
ldf8 f32 = [up], 8
ldf8 f44 = [rp], 8
;;
ldf8 f33 = [up], 8
ldf8 f45 = [rp], 8
;;
ldf8 f34 = [up], 8
xma.l f39 = f7, f6, f8
ldf8 f46 = [rp], 8
xma.hu f43 = f7, f6, f8
;;
ldf8 f35 = [up], 8
ldf8 f47 = [rp], 8
C Taken for n >= 9; falls through for n == 5.
br.cloop.dptk .grt5
C n == 5: no loop iterations.  Store the limb 0 low word directly, form
C the four remaining products, move the words needed first into the
C integer registers (low words in r24..r26, high words in r28, r29, r31)
C and join the wind-down code at .Lcj5.
xma.l f36 = f32, f6, f44
xma.hu f40 = f32, f6, f44
;;
stf8 [r20] = f39, 8
xma.l f37 = f33, f6, f45
xma.hu f41 = f33, f6, f45
;;
getf.sig r31 = f43
getf.sig r24 = f36
xma.l f38 = f34, f6, f46
xma.hu f42 = f34, f6, f46
;;
getf.sig r28 = f40
getf.sig r25 = f37
xma.l f39 = f35, f6, f47
xma.hu f43 = f35, f6, f47
;;
getf.sig r29 = f41
getf.sig r26 = f38
br .Lcj5
C n >= 9: prime the pipeline for entry at the top of the loop.
C r30 = 0 stands in for the high word below limb 0, so the first add at
C .Loop (or .Le0) yields 0 + low word of limb 0 (held in r27).
.grt5:
mov r30 = 0
xma.l f36 = f32, f6, f44
xma.hu f40 = f32, f6, f44
;;
ldf8 f32 = [up], 8
xma.l f37 = f33, f6, f45
ldf8 f44 = [rp], 8
xma.hu f41 = f33, f6, f45
;;
ldf8 f33 = [up], 8
getf.sig r27 = f39
;;
getf.sig r31 = f43
xma.l f38 = f34, f6, f46
ldf8 f45 = [rp], 8
xma.hu f42 = f34, f6, f46
;;
ldf8 f34 = [up], 8
getf.sig r24 = f36
;;
getf.sig r28 = f40
xma.l f39 = f35, f6, f47
ldf8 f46 = [rp], 8
xma.hu f43 = f35, f6, f47
;;
ldf8 f35 = [up], 8
getf.sig r25 = f37
C Enter the main loop when ar.lc is still nonzero (n >= 13); otherwise
C go straight to the wind-down code.
br.cloop.dptk .Loop
br .Le0
C Feed-in code for n mod 4 == 2.
C Load limb 1 of both vectors.  The br.cloop is taken for n >= 6 and
C falls through only for n == 2.
.Lb10: ldf8 f35 = [up], 8
ldf8 f47 = [rp], 8
br.cloop.dptk .grt2
C n == 2: two products.  The limb 0 low word is stored directly; its high
C word (r30) and the limb 1 low word (r27) are summed at .Lcj2, and the
C limb 1 high word is already placed in r8.
xma.l f38 = f7, f6, f8
xma.hu f42 = f7, f6, f8
;;
xma.l f39 = f35, f6, f47
xma.hu f43 = f35, f6, f47
;;
getf.sig r30 = f42
stf8 [r20] = f38, 8
getf.sig r27 = f39
getf.sig r8 = f43
br .Lcj2
C n >= 6: load four more limb pairs while forming the products for
C limb 0 (f38 / f42) and limb 1 (f39 / f43).
.grt2:
ldf8 f32 = [up], 8
ldf8 f44 = [rp], 8
;;
ldf8 f33 = [up], 8
xma.l f38 = f7, f6, f8
ldf8 f45 = [rp], 8
xma.hu f42 = f7, f6, f8
;;
ldf8 f34 = [up], 8
xma.l f39 = f35, f6, f47
ldf8 f46 = [rp], 8
xma.hu f43 = f35, f6, f47
;;
ldf8 f35 = [up], 8
ldf8 f47 = [rp], 8
C Taken for n >= 10; falls through for n == 6.
br.cloop.dptk .grt6
C n == 6: no loop iterations.  Store the limb 0 low word directly, form
C the remaining products, preload the integer registers and join the
C wind-down code at .Lcj6.
stf8 [r20] = f38, 8
xma.l f36 = f32, f6, f44
xma.hu f40 = f32, f6, f44
;;
getf.sig r30 = f42
getf.sig r27 = f39
xma.l f37 = f33, f6, f45
xma.hu f41 = f33, f6, f45
;;
getf.sig r31 = f43
getf.sig r24 = f36
xma.l f38 = f34, f6, f46
xma.hu f42 = f34, f6, f46
;;
getf.sig r28 = f40
getf.sig r25 = f37
xma.l f39 = f35, f6, f47
xma.hu f43 = f35, f6, f47
br .Lcj6
C n >= 10: prime the pipeline and join the main loop at .LL10.
C r29 = 0 stands in for the high word below limb 0, so the add at .LL10
C yields 0 + low word of limb 0 (held in r26).
.grt6:
mov r29 = 0
xma.l f36 = f32, f6, f44
xma.hu f40 = f32, f6, f44
;;
ldf8 f32 = [up], 8
getf.sig r26 = f38
;;
getf.sig r30 = f42
xma.l f37 = f33, f6, f45
ldf8 f44 = [rp], 8
xma.hu f41 = f33, f6, f45
;;
ldf8 f33 = [up], 8
getf.sig r27 = f39
;;
getf.sig r31 = f43
xma.l f38 = f34, f6, f46
ldf8 f45 = [rp], 8
xma.hu f42 = f34, f6, f46
;;
ldf8 f34 = [up], 8
getf.sig r24 = f36
br .LL10
C Feed-in code for n mod 4 == 3.
C Load limbs 1 and 2 of both vectors.  The br.cloop is taken for n >= 7
C and falls through only for n == 3.
.Lb11: ldf8 f34 = [up], 8
ldf8 f46 = [rp], 8
;;
ldf8 f35 = [up], 8
ldf8 f47 = [rp], 8
br.cloop.dptk .grt3
;;
C n == 3: three products.  The limb 0 low word is stored directly, the
C words for the two remaining sums go to r29/r26 and r30/r27, the high
C word of limb 2 goes to r8, and the sums are done at .Lcj3.
xma.l f37 = f7, f6, f8
xma.hu f41 = f7, f6, f8
xma.l f38 = f34, f6, f46
xma.hu f42 = f34, f6, f46
xma.l f39 = f35, f6, f47
xma.hu f43 = f35, f6, f47
;;
getf.sig r29 = f41
stf8 [r20] = f37, 8
getf.sig r26 = f38
getf.sig r30 = f42
getf.sig r27 = f39
getf.sig r8 = f43
br .Lcj3
C n >= 7: load four more limb pairs while forming the products for
C limbs 0..2 (f37 / f41, f38 / f42, f39 / f43).
.grt3:
ldf8 f32 = [up], 8
xma.l f37 = f7, f6, f8
ldf8 f44 = [rp], 8
xma.hu f41 = f7, f6, f8
;;
ldf8 f33 = [up], 8
xma.l f38 = f34, f6, f46
ldf8 f45 = [rp], 8
xma.hu f42 = f34, f6, f46
;;
ldf8 f34 = [up], 8
xma.l f39 = f35, f6, f47
ldf8 f46 = [rp], 8
xma.hu f43 = f35, f6, f47
;;
ldf8 f35 = [up], 8
C NOTE(review): both lines tagged FIXME handle f37, the limb 0 low word.
C r25 loaded here is read at .LL11 on the .grt7 path; the n == 7 path
C below stores f37 directly instead.  The source does not say what the
C FIXME itself refers to.
getf.sig r25 = f37 C FIXME
ldf8 f47 = [rp], 8
C Taken for n >= 11; falls through for n == 7.
br.cloop.dptk .grt7
C n == 7: no loop iterations.  Store the limb 0 low word, form further
C products, preload the integer registers and join the wind-down code
C at .Lcj7.
getf.sig r29 = f41
stf8 [r20] = f37, 8 C FIXME
xma.l f36 = f32, f6, f44
getf.sig r26 = f38
xma.hu f40 = f32, f6, f44
;;
getf.sig r30 = f42
xma.l f37 = f33, f6, f45
getf.sig r27 = f39
xma.hu f41 = f33, f6, f45
;;
getf.sig r31 = f43
xma.l f38 = f34, f6, f46
getf.sig r24 = f36
xma.hu f42 = f34, f6, f46
br .Lcj7
C n >= 11: prime the pipeline and join the main loop at .LL11.
C r28 = 0 stands in for the high word below limb 0, so the add at .LL11
C yields 0 + low word of limb 0 (held in r25).
.grt7:
getf.sig r29 = f41
xma.l f36 = f32, f6, f44
mov r28 = 0
xma.hu f40 = f32, f6, f44
;;
ldf8 f32 = [up], 8
getf.sig r26 = f38
;;
getf.sig r30 = f42
xma.l f37 = f33, f6, f45
ldf8 f44 = [rp], 8
xma.hu f41 = f33, f6, f45
;;
ldf8 f33 = [up], 8
getf.sig r27 = f39
br .LL11
C Feed-in code for n mod 4 == 0.
C Load limbs 1..3 of both vectors and form the limb 0 product in
C f36 (low word) and f40 (high word).
.Lb00: ldf8 f33 = [up], 8
ldf8 f45 = [rp], 8
;;
ldf8 f34 = [up], 8
ldf8 f46 = [rp], 8
;;
ldf8 f35 = [up], 8
xma.l f36 = f7, f6, f8
ldf8 f47 = [rp], 8
xma.hu f40 = f7, f6, f8
C Taken for n >= 8; falls through only for n == 4.
br.cloop.dptk .grt4
C n == 4: form the three remaining products, store the limb 0 low word
C directly, preload the integer registers and join the wind-down code
C at .Lcj4.
xma.l f37 = f33, f6, f45
xma.hu f41 = f33, f6, f45
xma.l f38 = f34, f6, f46
xma.hu f42 = f34, f6, f46
;;
getf.sig r28 = f40
stf8 [r20] = f36, 8
xma.l f39 = f35, f6, f47
getf.sig r25 = f37
xma.hu f43 = f35, f6, f47
;;
getf.sig r29 = f41
getf.sig r26 = f38
getf.sig r30 = f42
getf.sig r27 = f39
br .Lcj4
C n >= 8: load four more limb pairs while forming the products for
C limbs 1..3.
.grt4:
ldf8 f32 = [up], 8
xma.l f37 = f33, f6, f45
ldf8 f44 = [rp], 8
xma.hu f41 = f33, f6, f45
;;
ldf8 f33 = [up], 8
xma.l f38 = f34, f6, f46
ldf8 f45 = [rp], 8
xma.hu f42 = f34, f6, f46
;;
ldf8 f34 = [up], 8
C NOTE(review): both lines tagged FIXME handle f36, the limb 0 low word.
C r24 loaded here is read at .LL00 on the .grt8 path; the n == 8 path
C below stores f36 directly instead.  The source does not say what the
C FIXME itself refers to.
getf.sig r24 = f36 C FIXME
xma.l f39 = f35, f6, f47
ldf8 f46 = [rp], 8
getf.sig r28 = f40
xma.hu f43 = f35, f6, f47
;;
ldf8 f35 = [up], 8
getf.sig r25 = f37
ldf8 f47 = [rp], 8
C Taken for n >= 12; falls through for n == 8.
br.cloop.dptk .grt8
C n == 8: no loop iterations.  Store the limb 0 low word, form further
C products, preload the integer registers and join the wind-down code
C at .Lcj8.
getf.sig r29 = f41
stf8 [r20] = f36, 8 C FIXME
xma.l f36 = f32, f6, f44
getf.sig r26 = f38
getf.sig r30 = f42
xma.hu f40 = f32, f6, f44
;;
xma.l f37 = f33, f6, f45
getf.sig r27 = f39
xma.hu f41 = f33, f6, f45
br .Lcj8
C n >= 12: prime the pipeline and join the main loop at .LL00.
C r31 = 0 stands in for the high word below limb 0, so the add at .LL00
C yields 0 + low word of limb 0 (held in r24).
.grt8:
getf.sig r29 = f41
xma.l f36 = f32, f6, f44
mov r31 = 0
xma.hu f40 = f32, f6, f44
;;
ldf8 f32 = [up], 8
getf.sig r26 = f38
br .LL00
C *** MAIN LOOP START ***
C Software-pipelined loop: four limbs per iteration, two instruction
C groups per limb.
C
C First group of each limb stage:
C   - getf.sig moves a product high word from f40..f43 into r28..r31
C   - an xma.l / xma.hu pair starts the product for a later limb
C   - a predicated add forms high word + low word, plus 1 when the
C     incoming carry predicate is set
C   - the next rp limb is loaded
C Second group of each limb stage:
C   - the next up limb is loaded
C   - an unsigned compare of the sum against the low-word addend sets the
C     outgoing carry predicates (leu when a carry came in, ltu otherwise)
C   - getf.sig moves a product low word from f36..f39 into r24..r27
C   - the sum is stored through r20
C
C The carry alternates between two predicate pairs: p6 (carry) / p7 (no
C carry) feed the stages writing r14, and p8 / p9 feed the stages writing
C r16.  The .pred.rel "mutex" directives tell the assembler that the two
C predicates of a pair are never both set, which is what allows the two
C predicated writes of one register in a single instruction group.
C
C Entry points used by the feed-in code: .Loop (n mod 4 == 1),
C .LL00 (n mod 4 == 0), .LL11 (n mod 4 == 3), .LL10 (n mod 4 == 2).
C
C Trailing columns: instruction number, number(s) of the instruction(s)
C producing its inputs, and issue cycle for what are presumably Itanium
C (i1) and Itanium 2 (i2).
ALIGN(32) C insn fed cycle #
.Loop:
.pred.rel "mutex", p6, p7 C num by i1 i2
getf.sig r29 = f41 C 00 16 0 0
xma.l f36 = f32, f6, f44 C 01 06,15 0 0
(p6) add r14 = r30, r27, 1 C 02 0 0
ldf8 f47 = [rp], 8 C 03 0 0
xma.hu f40 = f32, f6, f44 C 04 06,15 0 0
(p7) add r14 = r30, r27 C 05 0 0
;;
.pred.rel "mutex", p6, p7
ldf8 f32 = [up], 8 C 06 1 1
(p6) cmp.leu p8, p9 = r14, r27 C 07 1 1
(p7) cmp.ltu p8, p9 = r14, r27 C 08 1 1
getf.sig r26 = f38 C 09 25 2 1
st8 [r20] = r14, 8 C 10 2 1
nop.b 0 C 11 2 1
;;
C Entry point for the n mod 4 == 0 feed-in code (.grt8).
.LL00:
.pred.rel "mutex", p8, p9
getf.sig r30 = f42 C 12 28 3 2
xma.l f37 = f33, f6, f45 C 13 18,27 3 2
(p8) add r16 = r31, r24, 1 C 14 3 2
ldf8 f44 = [rp], 8 C 15 3 2
xma.hu f41 = f33, f6, f45 C 16 18,27 3 2
(p9) add r16 = r31, r24 C 17 3 2
;;
.pred.rel "mutex", p8, p9
ldf8 f33 = [up], 8 C 18 4 3
(p8) cmp.leu p6, p7 = r16, r24 C 19 4 3
(p9) cmp.ltu p6, p7 = r16, r24 C 20 4 3
getf.sig r27 = f39 C 21 37 5 3
st8 [r20] = r16, 8 C 22 5 3
nop.b 0 C 23 5 3
;;
C Entry point for the n mod 4 == 3 feed-in code (.grt7).
.LL11:
.pred.rel "mutex", p6, p7
getf.sig r31 = f43 C 24 40 6 4
xma.l f38 = f34, f6, f46 C 25 30,39 6 4
(p6) add r14 = r28, r25, 1 C 26 6 4
ldf8 f45 = [rp], 8 C 27 6 4
xma.hu f42 = f34, f6, f46 C 28 30,39 6 4
(p7) add r14 = r28, r25 C 29 6 4
;;
.pred.rel "mutex", p6, p7
ldf8 f34 = [up], 8 C 30 7 5
(p6) cmp.leu p8, p9 = r14, r25 C 31 7 5
(p7) cmp.ltu p8, p9 = r14, r25 C 32 7 5
getf.sig r24 = f36 C 33 01 8 5
st8 [r20] = r14, 8 C 34 8 5
nop.b 0 C 35 8 5
;;
C Entry point for the n mod 4 == 2 feed-in code (.grt6).
.LL10:
.pred.rel "mutex", p8, p9
getf.sig r28 = f40 C 36 04 9 6
xma.l f39 = f35, f6, f47 C 37 42,03 9 6
(p8) add r16 = r29, r26, 1 C 38 9 6
ldf8 f46 = [rp], 8 C 39 9 6
xma.hu f43 = f35, f6, f47 C 40 42,03 9 6
(p9) add r16 = r29, r26 C 41 9 6
;;
.pred.rel "mutex", p8, p9
ldf8 f35 = [up], 8 C 42 10 7
(p8) cmp.leu p6, p7 = r16, r26 C 43 10 7
(p9) cmp.ltu p6, p7 = r16, r26 C 44 10 7
getf.sig r25 = f37 C 45 13 11 7
st8 [r20] = r16, 8 C 46 11 7
C Repeat while ar.lc is nonzero; otherwise fall into the wind-down code.
br.cloop.dptk .Loop C 47 11 7
C *** MAIN LOOP END ***
;;
C Wind-down code: drains the pipeline using the same add / compare / store
C pattern as the main loop, with the carry alternating between the p6/p7
C and p8/p9 predicate pairs.  No further limbs are loaded from up; the
C ldf8 from rp in the first group below is the final load.
C
C The .LcjN labels are join points for the feed-in paths that run no loop
C iteration: the n == N path of the feed-in code branches to .LcjN
C (N = 2..8).  Execution falls through from each label to the next.
.Le0:
.pred.rel "mutex", p6, p7
getf.sig r29 = f41 C
xma.l f36 = f32, f6, f44 C
(p6) add r14 = r30, r27, 1 C
ldf8 f47 = [rp], 8 C
xma.hu f40 = f32, f6, f44 C
(p7) add r14 = r30, r27 C
;;
.pred.rel "mutex", p6, p7
(p6) cmp.leu p8, p9 = r14, r27 C
(p7) cmp.ltu p8, p9 = r14, r27 C
getf.sig r26 = f38 C
st8 [r20] = r14, 8 C
;;
.pred.rel "mutex", p8, p9
getf.sig r30 = f42 C
xma.l f37 = f33, f6, f45 C
(p8) add r16 = r31, r24, 1 C
xma.hu f41 = f33, f6, f45 C
(p9) add r16 = r31, r24 C
;;
.pred.rel "mutex", p8, p9
(p8) cmp.leu p6, p7 = r16, r24 C
(p9) cmp.ltu p6, p7 = r16, r24 C
getf.sig r27 = f39 C
st8 [r20] = r16, 8 C
;;
.Lcj8:
.pred.rel "mutex", p6, p7
getf.sig r31 = f43 C
xma.l f38 = f34, f6, f46 C
(p6) add r14 = r28, r25, 1 C
xma.hu f42 = f34, f6, f46 C
(p7) add r14 = r28, r25 C
;;
.pred.rel "mutex", p6, p7
(p6) cmp.leu p8, p9 = r14, r25 C
(p7) cmp.ltu p8, p9 = r14, r25 C
getf.sig r24 = f36 C
st8 [r20] = r14, 8 C
;;
C The xma pair in this stage is the last one; it forms the final product
C in f39 (low word) and f43 (high word).
.Lcj7:
.pred.rel "mutex", p8, p9
getf.sig r28 = f40 C
xma.l f39 = f35, f6, f47 C
(p8) add r16 = r29, r26, 1 C
xma.hu f43 = f35, f6, f47 C
(p9) add r16 = r29, r26 C
;;
.pred.rel "mutex", p8, p9
(p8) cmp.leu p6, p7 = r16, r26 C
(p9) cmp.ltu p6, p7 = r16, r26 C
getf.sig r25 = f37 C
st8 [r20] = r16, 8 C
;;
C From here on only the outstanding sums remain.
.Lcj6:
.pred.rel "mutex", p6, p7
getf.sig r29 = f41 C
(p6) add r14 = r30, r27, 1 C
(p7) add r14 = r30, r27 C
;;
.pred.rel "mutex", p6, p7
(p6) cmp.leu p8, p9 = r14, r27 C
(p7) cmp.ltu p8, p9 = r14, r27 C
getf.sig r26 = f38 C
st8 [r20] = r14, 8 C
;;
.Lcj5:
.pred.rel "mutex", p8, p9
getf.sig r30 = f42 C
(p8) add r16 = r31, r24, 1 C
(p9) add r16 = r31, r24 C
;;
.pred.rel "mutex", p8, p9
(p8) cmp.leu p6, p7 = r16, r24 C
(p9) cmp.ltu p6, p7 = r16, r24 C
getf.sig r27 = f39 C
st8 [r20] = r16, 8 C
;;
C r8 receives the high word of the final product (f43).
.Lcj4:
.pred.rel "mutex", p6, p7
getf.sig r8 = f43 C
(p6) add r14 = r28, r25, 1 C
(p7) add r14 = r28, r25 C
;;
.pred.rel "mutex", p6, p7
st8 [r20] = r14, 8 C
(p6) cmp.leu p8, p9 = r14, r25 C
(p7) cmp.ltu p8, p9 = r14, r25 C
;;
.Lcj3:
.pred.rel "mutex", p8, p9
(p8) add r16 = r29, r26, 1 C
(p9) add r16 = r29, r26 C
;;
.pred.rel "mutex", p8, p9
st8 [r20] = r16, 8 C
(p8) cmp.leu p6, p7 = r16, r26 C
(p9) cmp.ltu p6, p7 = r16, r26 C
;;
C Last sum.  Its store does not post-increment r20.
.Lcj2:
.pred.rel "mutex", p6, p7
(p6) add r14 = r30, r27, 1 C
(p7) add r14 = r30, r27 C
;;
.pred.rel "mutex", p6, p7
st8 [r20] = r14 C
(p6) cmp.leu p8, p9 = r14, r27 C
(p7) cmp.ltu p8, p9 = r14, r27 C
;;
C Fold the final carry into r8, restore the caller value of ar.lc from r2
C and return.
(p8) add r8 = 1, r8 C M I
mov.i ar.lc = r2 C I0
br.ret.sptk.many b0 C B
EPILOGUE()
ASM_END()