mpir/mpn/ia64/mul_1.asm
2010-03-08 00:26:51 +00:00

574 lines
11 KiB
NASM

dnl IA-64 mpn_mul_1, mpn_mul_1c -- Multiply a limb vector with a limb and
dnl store the result in a second limb vector.
dnl Copyright 2000, 2001, 2002, 2003, 2004, 2006, 2007 Free Software
dnl Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
C Itanium: 4.0
C Itanium 2: 2.0
C TODO
C * Further optimize feed-in and wind-down code, both for speed and code size.
C * Handle low limb input and results specially, using a common stf8 in the
C epilogue.
C * Use 1 c/l carry propagation scheme in wind-down code.
C * Use extra pointer register for `up' to speed up feed-in loads.
C * Work out final differences with addmul_1.asm.
C INPUT PARAMETERS
define(`rp', `r32')
define(`up', `r33')
define(`n', `r34')
define(`vl', `r35')
define(`cy', `r36') C for mpn_mul_1c
ASM_START()
PROLOGUE(mpn_mul_1)
.prologue
.save ar.lc, r2
.body
ifdef(`HAVE_ABI_32',
` addp4 rp = 0, rp C M I
addp4 up = 0, up C M I
zxt4 n = n C I
;;
')
{.mfi
adds r15 = -1, n C M I
mov f9 = f0 C F
mov.i r2 = ar.lc C I0
}
{.mmi
ldf8 f7 = [up], 8 C M
nop.m 0 C M
and r14 = 3, n C M I
;;
}
.Lcommon:
{.mii
setf.sig f6 = vl C M2 M3
shr.u r31 = r15, 2 C I0
cmp.eq p10, p0 = 0, r14 C M I
}
{.mii
cmp.eq p11, p0 = 2, r14 C M I
cmp.eq p12, p0 = 3, r14 C M I
nop.i 0 C I
;;
}
{.mii
cmp.ne p6, p7 = r0, r0 C M I
mov.i ar.lc = r31 C I0
cmp.ne p8, p9 = r0, r0 C M I
}
{.bbb
(p10) br.dptk .Lb00 C B
(p11) br.dptk .Lb10 C B
(p12) br.dptk .Lb11 C B
;;
}
.Lb01: mov r20 = 0
br.cloop.dptk .grt1 C B
xma.l f39 = f7, f6, f9 C F
xma.hu f43 = f7, f6, f9 C F
;;
getf.sig r8 = f43 C M2
stf8 [rp] = f39 C M2 M3
mov.i ar.lc = r2 C I0
br.ret.sptk.many b0 C B
.grt1:
ldf8 f32 = [up], 8
;;
ldf8 f33 = [up], 8
;;
ldf8 f34 = [up], 8
xma.l f39 = f7, f6, f9
xma.hu f43 = f7, f6, f9
;;
ldf8 f35 = [up], 8
br.cloop.dptk .grt5
xma.l f36 = f32, f6, f0
xma.hu f40 = f32, f6, f0
;;
stf8 [rp] = f39, 8
xma.l f37 = f33, f6, f0
xma.hu f41 = f33, f6, f0
;;
getf.sig r21 = f43
getf.sig r18 = f36
xma.l f38 = f34, f6, f0
xma.hu f42 = f34, f6, f0
;;
getf.sig r22 = f40
getf.sig r19 = f37
xma.l f39 = f35, f6, f0
xma.hu f43 = f35, f6, f0
;;
getf.sig r23 = f41
getf.sig r16 = f38
br .Lcj5
.grt5:
xma.l f36 = f32, f6, f0
xma.hu f40 = f32, f6, f0
;;
getf.sig r17 = f39
ldf8 f32 = [up], 8
xma.l f37 = f33, f6, f0
xma.hu f41 = f33, f6, f0
;;
getf.sig r21 = f43
ldf8 f33 = [up], 8
xma.l f38 = f34, f6, f0
;;
getf.sig r18 = f36
xma.hu f42 = f34, f6, f0
;;
getf.sig r22 = f40
ldf8 f34 = [up], 8
xma.l f39 = f35, f6, f0
;;
getf.sig r19 = f37
xma.hu f43 = f35, f6, f0
br .LL01
.Lb10: ldf8 f35 = [up], 8
mov r23 = 0
br.cloop.dptk .grt2
xma.l f38 = f7, f6, f9
xma.hu f42 = f7, f6, f9
;;
stf8 [rp] = f38, 8
xma.l f39 = f35, f6, f42
xma.hu f43 = f35, f6, f42
;;
getf.sig r8 = f43
stf8 [rp] = f39
mov.i ar.lc = r2
br.ret.sptk.many b0
.grt2:
ldf8 f32 = [up], 8
;;
ldf8 f33 = [up], 8
xma.l f38 = f7, f6, f9
xma.hu f42 = f7, f6, f9
;;
ldf8 f34 = [up], 8
xma.l f39 = f35, f6, f0
xma.hu f43 = f35, f6, f0
;;
ldf8 f35 = [up], 8
br.cloop.dptk .grt6
stf8 [rp] = f38, 8
xma.l f36 = f32, f6, f0
xma.hu f40 = f32, f6, f0
;;
getf.sig r20 = f42
getf.sig r17 = f39
xma.l f37 = f33, f6, f0
xma.hu f41 = f33, f6, f0
;;
getf.sig r21 = f43
getf.sig r18 = f36
xma.l f38 = f34, f6, f0
xma.hu f42 = f34, f6, f0
;;
getf.sig r22 = f40
getf.sig r19 = f37
xma.l f39 = f35, f6, f0
xma.hu f43 = f35, f6, f0
br .Lcj6
.grt6:
getf.sig r16 = f38
xma.l f36 = f32, f6, f0
xma.hu f40 = f32, f6, f0
;;
getf.sig r20 = f42
ldf8 f32 = [up], 8
xma.l f37 = f33, f6, f0
;;
getf.sig r17 = f39
xma.hu f41 = f33, f6, f0
;;
getf.sig r21 = f43
ldf8 f33 = [up], 8
xma.l f38 = f34, f6, f0
;;
getf.sig r18 = f36
xma.hu f42 = f34, f6, f0
br .LL10
.Lb11: ldf8 f34 = [up], 8
mov r22 = 0
;;
ldf8 f35 = [up], 8
br.cloop.dptk .grt3
;;
xma.l f37 = f7, f6, f9
xma.hu f41 = f7, f6, f9
xma.l f38 = f34, f6, f0
xma.hu f42 = f34, f6, f0
xma.l f39 = f35, f6, f0
xma.hu f43 = f35, f6, f0
;;
getf.sig r23 = f41
stf8 [rp] = f37, 8
getf.sig r16 = f38
getf.sig r20 = f42
getf.sig r17 = f39
getf.sig r8 = f43
br .Lcj3
.grt3:
ldf8 f32 = [up], 8
xma.l f37 = f7, f6, f9
xma.hu f41 = f7, f6, f9
;;
ldf8 f33 = [up], 8
xma.l f38 = f34, f6, f0
xma.hu f42 = f34, f6, f0
;;
getf.sig r19 = f37
ldf8 f34 = [up], 8
xma.l f39 = f35, f6, f0
xma.hu f43 = f35, f6, f0
;;
getf.sig r23 = f41
ldf8 f35 = [up], 8
br.cloop.dptk .grt7
getf.sig r16 = f38
xma.l f36 = f32, f6, f0
getf.sig r20 = f42
xma.hu f40 = f32, f6, f0
;;
getf.sig r17 = f39
xma.l f37 = f33, f6, f0
getf.sig r21 = f43
xma.hu f41 = f33, f6, f0
;;
getf.sig r18 = f36
st8 [rp] = r19, 8
xma.l f38 = f34, f6, f0
xma.hu f42 = f34, f6, f0
br .Lcj7
.grt7:
getf.sig r16 = f38
xma.l f36 = f32, f6, f0
xma.hu f40 = f32, f6, f0
;;
getf.sig r20 = f42
ldf8 f32 = [up], 8
xma.l f37 = f33, f6, f0
;;
getf.sig r17 = f39
xma.hu f41 = f33, f6, f0
br .LL11
.Lb00: ldf8 f33 = [up], 8
mov r21 = 0
;;
ldf8 f34 = [up], 8
;;
ldf8 f35 = [up], 8
xma.l f36 = f7, f6, f9
xma.hu f40 = f7, f6, f9
br.cloop.dptk .grt4
xma.l f37 = f33, f6, f0
xma.hu f41 = f33, f6, f0
xma.l f38 = f34, f6, f0
xma.hu f42 = f34, f6, f0
;;
getf.sig r22 = f40
stf8 [rp] = f36, 8
xma.l f39 = f35, f6, f0
getf.sig r19 = f37
xma.hu f43 = f35, f6, f0
;;
getf.sig r23 = f41
getf.sig r16 = f38
getf.sig r20 = f42
getf.sig r17 = f39
br .Lcj4
.grt4:
ldf8 f32 = [up], 8
xma.l f37 = f33, f6, f0
xma.hu f41 = f33, f6, f0
;;
getf.sig r18 = f36
ldf8 f33 = [up], 8
xma.l f38 = f34, f6, f0
xma.hu f42 = f34, f6, f0
;;
getf.sig r22 = f40
ldf8 f34 = [up], 8
xma.l f39 = f35, f6, f0
;;
getf.sig r19 = f37
getf.sig r23 = f41
xma.hu f43 = f35, f6, f0
ldf8 f35 = [up], 8
br.cloop.dptk .grt8
getf.sig r16 = f38
xma.l f36 = f32, f6, f0
getf.sig r20 = f42
xma.hu f40 = f32, f6, f0
;;
getf.sig r17 = f39
st8 [rp] = r18, 8
xma.l f37 = f33, f6, f0
xma.hu f41 = f33, f6, f0
br .Lcj8
.grt8:
getf.sig r16 = f38
xma.l f36 = f32, f6, f0
xma.hu f40 = f32, f6, f0
br .LL00
C *** MAIN LOOP START ***
ALIGN(32)
.Loop:
.pred.rel "mutex",p6,p7
getf.sig r16 = f38
xma.l f36 = f32, f6, f0
(p6) cmp.leu p8, p9 = r24, r17
st8 [rp] = r24, 8
xma.hu f40 = f32, f6, f0
(p7) cmp.ltu p8, p9 = r24, r17
;;
.LL00:
.pred.rel "mutex",p8,p9
getf.sig r20 = f42
(p8) add r24 = r18, r21, 1
nop.b 0
ldf8 f32 = [up], 8
(p9) add r24 = r18, r21
nop.b 0
;;
.pred.rel "mutex",p8,p9
getf.sig r17 = f39
xma.l f37 = f33, f6, f0
(p8) cmp.leu p6, p7 = r24, r18
st8 [rp] = r24, 8
xma.hu f41 = f33, f6, f0
(p9) cmp.ltu p6, p7 = r24, r18
;;
.LL11:
.pred.rel "mutex",p6,p7
getf.sig r21 = f43
(p6) add r24 = r19, r22, 1
nop.b 0
ldf8 f33 = [up], 8
(p7) add r24 = r19, r22
nop.b 0
;;
.pred.rel "mutex",p6,p7
getf.sig r18 = f36
xma.l f38 = f34, f6, f0
(p6) cmp.leu p8, p9 = r24, r19
st8 [rp] = r24, 8
xma.hu f42 = f34, f6, f0
(p7) cmp.ltu p8, p9 = r24, r19
;;
.LL10:
.pred.rel "mutex",p8,p9
getf.sig r22 = f40
(p8) add r24 = r16, r23, 1
nop.b 0
ldf8 f34 = [up], 8
(p9) add r24 = r16, r23
nop.b 0
;;
.pred.rel "mutex",p8,p9
getf.sig r19 = f37
xma.l f39 = f35, f6, f0
(p8) cmp.leu p6, p7 = r24, r16
st8 [rp] = r24, 8
xma.hu f43 = f35, f6, f0
(p9) cmp.ltu p6, p7 = r24, r16
;;
.LL01:
.pred.rel "mutex",p6,p7
getf.sig r23 = f41
(p6) add r24 = r17, r20, 1
nop.b 0
ldf8 f35 = [up], 8
(p7) add r24 = r17, r20
br.cloop.dptk .Loop
C *** MAIN LOOP END ***
;;
.Lcj9:
.pred.rel "mutex",p6,p7
getf.sig r16 = f38
xma.l f36 = f32, f6, f0
(p6) cmp.leu p8, p9 = r24, r17
st8 [rp] = r24, 8
xma.hu f40 = f32, f6, f0
(p7) cmp.ltu p8, p9 = r24, r17
;;
.pred.rel "mutex",p8,p9
getf.sig r20 = f42
(p8) add r24 = r18, r21, 1
(p9) add r24 = r18, r21
;;
.pred.rel "mutex",p8,p9
getf.sig r17 = f39
xma.l f37 = f33, f6, f0
(p8) cmp.leu p6, p7 = r24, r18
st8 [rp] = r24, 8
xma.hu f41 = f33, f6, f0
(p9) cmp.ltu p6, p7 = r24, r18
;;
.Lcj8:
.pred.rel "mutex",p6,p7
getf.sig r21 = f43
(p6) add r24 = r19, r22, 1
(p7) add r24 = r19, r22
;;
.pred.rel "mutex",p6,p7
getf.sig r18 = f36
xma.l f38 = f34, f6, f0
(p6) cmp.leu p8, p9 = r24, r19
st8 [rp] = r24, 8
xma.hu f42 = f34, f6, f0
(p7) cmp.ltu p8, p9 = r24, r19
;;
.Lcj7:
.pred.rel "mutex",p8,p9
getf.sig r22 = f40
(p8) add r24 = r16, r23, 1
(p9) add r24 = r16, r23
;;
.pred.rel "mutex",p8,p9
getf.sig r19 = f37
xma.l f39 = f35, f6, f0
(p8) cmp.leu p6, p7 = r24, r16
st8 [rp] = r24, 8
xma.hu f43 = f35, f6, f0
(p9) cmp.ltu p6, p7 = r24, r16
;;
.Lcj6:
.pred.rel "mutex",p6,p7
getf.sig r23 = f41
(p6) add r24 = r17, r20, 1
(p7) add r24 = r17, r20
;;
.pred.rel "mutex",p6,p7
(p6) cmp.leu p8, p9 = r24, r17
(p7) cmp.ltu p8, p9 = r24, r17
getf.sig r16 = f38
st8 [rp] = r24, 8
;;
.Lcj5:
.pred.rel "mutex",p8,p9
getf.sig r20 = f42
(p8) add r24 = r18, r21, 1
(p9) add r24 = r18, r21
;;
.pred.rel "mutex",p8,p9
(p8) cmp.leu p6, p7 = r24, r18
(p9) cmp.ltu p6, p7 = r24, r18
getf.sig r17 = f39
st8 [rp] = r24, 8
;;
.Lcj4:
.pred.rel "mutex",p6,p7
getf.sig r8 = f43
(p6) add r24 = r19, r22, 1
(p7) add r24 = r19, r22
;;
.pred.rel "mutex",p6,p7
st8 [rp] = r24, 8
(p6) cmp.leu p8, p9 = r24, r19
(p7) cmp.ltu p8, p9 = r24, r19
;;
.Lcj3:
.pred.rel "mutex",p8,p9
(p8) add r24 = r16, r23, 1
(p9) add r24 = r16, r23
;;
.pred.rel "mutex",p8,p9
st8 [rp] = r24, 8
(p8) cmp.leu p6, p7 = r24, r16
(p9) cmp.ltu p6, p7 = r24, r16
;;
.Lcj2:
.pred.rel "mutex",p6,p7
(p6) add r24 = r17, r20, 1
(p7) add r24 = r17, r20
;;
.pred.rel "mutex",p6,p7
st8 [rp] = r24, 8
(p6) cmp.leu p8, p9 = r24, r17
(p7) cmp.ltu p8, p9 = r24, r17
;;
.pred.rel "mutex",p8,p9
(p8) add r8 = 1, r8
mov.i ar.lc = r2
br.ret.sptk.many b0
EPILOGUE()
PROLOGUE(mpn_mul_1c)
.prologue
.save ar.lc, r2
.body
ifdef(`HAVE_ABI_32',
` addp4 rp = 0, rp C M I
addp4 up = 0, up C M I
zxt4 n = n C I
;;
')
{.mmi
adds r15 = -1, n C M I
setf.sig f9 = cy C M2 M3
mov.i r2 = ar.lc C I0
}
{.mmb
ldf8 f7 = [up], 8 C M
and r14 = 3, n C M I
br.sptk .Lcommon
;;
}
EPILOGUE()
ASM_END()