dnl IA-64 mpn_sqr_diagonal. Helper for sqr_basecase.

dnl Copyright 2001, 2002, 2004 Free Software Foundation, Inc.

dnl This file is part of the GNU MP Library.

dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.

dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.

dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C         cycles/limb
C Itanium:    4
C Itanium 2:  2

C TODO
C  * Perhaps avoid ctop loop.  Unfortunately, a cloop loop running at 1 c/l
C    would need prohibitive 8-way unrolling.
C  * Instead of messing too much with this, write a nifty mpn_sqr_basecase.

C INPUT PARAMETERS
C rp = r32
C sp = r33
C n = r34

ASM_START()

C mpn_sqr_diagonal -- square every source limb into a two-limb result.
C
C For i = 0 .. n-1 the code forms the 128-bit product sp[i] * sp[i] and
C stores its low 64 bits at rp[2*i] and its high 64 bits at rp[2*i+1].
C
C In:	r32 = rp	destination, written 8 bytes at a time at rp and rp+8
C	r33 = sp	source, read 8 bytes at a time
C	r34 = n		number of source limbs
C
C Saved on entry and restored on exit: ar.lc in r2, ar.ec in r14, pr in r15.
C Other general registers written: r18, r19.
C
C NOTE(review): n is presumably >= 1.  With n = 0 the value n-1 = -1 would
C be loaded into ar.lc; confirm that callers never pass zero.
C
C The loop is modulo-scheduled on rotating registers.  Stage predicates:
C	p16	load the next source limb into f32
C	p19	multiply, three rotations after the load, so the limb is in f35
C	p21	store, two rotations after the multiply, so the halves are in
C		f38 and f42

PROLOGUE(mpn_sqr_diagonal)
	.prologue
	.save	ar.lc, r2
	.save	pr, r15
	.body
ifdef(`HAVE_ABI_32',
`	addp4		r32 = 0, r32		C form 64-bit rp from a 32-bit pointer
	addp4		r33 = 0, r33		C form 64-bit sp from a 32-bit pointer
	zxt4		r34 = r34		C zero-extend the 32-bit count
	;;
')
	ldf8		f32 = [r33], 8		C M	load sp[0] early
	mov		r2 = ar.lc		C I0	save loop count
	mov		r14 = ar.ec		C I0	save epilogue count
	mov		r15 = pr		C I0	save predicates
	add		r19 = -1, r34		C M I	n - 1
	add		r18 = 8, r32		C M I	rp for high limb
	;;
	mov		ar.lc = r19		C I0	n - 1
	mov		ar.ec = 5		C I0
	mov		pr.rot = 1<<16		C I0	p16 = 1, other rotating predicates = 0
	;;
C This branch supplies the register rotation that follows the early load
C above, so that load counts as the first p16 stage.
C NOTE(review): with ar.ec freshly set to 5 the exit to .Ldone looks like it
C can never be taken here; confirm against the br.cexit rules.
	br.cexit.spnt	.Ldone			C B
	;;
	ALIGN(32)
.Loop:
  (p16)	ldf8		f32 = [r33], 8		C M	next source limb
  (p19)	xma.l		f36 = f35, f35, f0	C F	low half of limb squared
  (p21)	stf8		[r32] = f38, 16		C M2 M3	low half, step two limbs
  (p19)	xma.hu		f40 = f35, f35, f0	C F	high half of limb squared
  (p21)	stf8		[r18] = f42, 16		C M2 M3	high half, step two limbs
	br.ctop.dptk	.Loop			C B
	;;
.Ldone:
C The loop falls through before the p21 stage has run for the last limb, so
C its two halves are stored here without a predicate.
	stf8		[r32] = f38		C M2 M3
	stf8		[r18] = f42		C M2 M3
	mov		ar.ec = r14		C I0	restore epilogue count
	;;
	mov		pr = r15, 0x1ffff	C I0	restore predicates
	mov		ar.lc = r2		C I0	restore loop count
	br.ret.sptk.many b0			C B
EPILOGUE(mpn_sqr_diagonal)
ASM_END()