mpir/mpn/ia64/lorrshift.asm
2008-04-17 21:03:07 +00:00

235 lines
5.2 KiB
NASM

dnl IA-64 mpn_Xshift.
dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 2.1 of the License, or (at
dnl your option) any later version.
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write
dnl to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
dnl Boston, MA 02110-1301, USA.
include(`../config.m4')
C This code runs at 2 cycles/limb for large operands on the Itanium. It needs
C a very deep software pipeline, since shl/shr.u have a 4 cycle latency. The
C main loop here is not great; it is overscheduled with respect to the shr.u
C instructions, and this actually turns out to give considerably more complex
C wind down code. The code runs slowly for operands with <= 8 limbs, since we
C have a non-scheduled loop for that case. We also have a primitive loop for
C the unrolling edge, and as a consequence of the main loop stupidity it is
C executed 1-4 steps instead of 0-3 steps.
C By having 63 separate loops using the shrp instruction, we could easily reach
C 1 cycle/limb. Such loops would require a less deep software pipeline, since
C shrp, unlike shl/shr.u, has a plain one cycle latency.
C INPUT PARAMETERS
C rp = r32
C sp = r33
C n = r34
C cnt = r35
C Operation-specific pieces, selected by whichever of OPERATION_lshift or
C OPERATION_rshift is defined when this file is processed.
C   FSH   shift applied to a limb by cnt, in the direction of the operation
C   BSH   opposite shift by 64-cnt, giving the bits carried into the next limb
C   UPD   byte step added to the pointers after each load and store
C   func  name of the entry point that gets generated
ifdef(`OPERATION_lshift',`
define(`FSH',`shl')
define(`BSH',`shr.u')
define(`UPD',`-8')
define(`func',`mpn_lshift')
')
ifdef(`OPERATION_rshift',`
define(`FSH',`shr.u')
define(`BSH',`shl')
define(`UPD',`8')
define(`func',`mpn_rshift')
')
C Stop with a clear message when neither operation was selected, instead of
C expanding the code below with FSH, BSH, UPD and func left undefined.
C NOTE(review): m4_error is expected to come from the asm-defs.m4 macros that
C config.m4 pulls in; confirm it is available in this tree.
ifdef(`OPERATION_lshift',,`ifdef(`OPERATION_rshift',,`m4_error(`Need OPERATION_lshift or OPERATION_rshift
')')')
ASM_START()
C Entry point, generated under the name that func expands to.  It shifts the
C n limbs at sp by cnt bits and writes the n result limbs at rp.
C Register roles in this routine:
C   r32  rp, destination pointer, stepped by UPD on all but the final store
C   r33  sp, source pointer, stepped by UPD on each load
C   r34  n on entry, n-1 after the first add
C   r35  cnt, the shift count
C   r31  64-cnt, the complementary shift count
C   r2   saved copy of ar.lc, restored before every return
C   r8   BSH of the first limb loaded, i.e. the bits shifted out of it
C   r16-r19  source limbs in flight
C   r20-r27  shifted pieces: even registers take FSH results, odd registers
C            take BSH results and then the combined limb that gets stored
PROLOGUE(func)
.prologue
C With the 32-bit ABI, widen the two pointers with addp4, sign-extend n and
C zero-extend cnt before they are used as 64-bit values.
ifdef(`HAVE_ABI_32',
` addp4 r32 = 0, r32
addp4 r33 = 0, r33
sxt4 r34 = r34
zxt4 r35 = r35
;;
')
C r34 = n-1 and r31 = 64-cnt.
add r34 = -1, r34
sub r31 = 64, r35
C Keep the incoming ar.lc in r2; the loops below overwrite ar.lc.
.save ar.lc, r2
mov r2 = ar.lc;;
.body
C p6 is set when n-1 >= 8 (unsigned compare), which selects the code at .Lbig.
cmp.leu p6, p7 = 8,r34
C For lshift, advance both pointers by 8*(n-1) bytes so that the walk, which
C uses UPD = -8 in that case, starts at the last limb and moves downwards.
ifdef(`OPERATION_lshift',`
shladd r33 = r34, 3, r33
shladd r32 = r34, 3, r32;;
')
C Load the first limb into r19 and derive the return value from it.
ld8 r19 = [r33], UPD ;;
BSH r8 = r19, r31 C function return value
(p6) br.dptk .Lbig
C
C Code for small operands. Not an optimization for the Itanium, it is here
C just to simplify the general case.
C
C ar.lc = n-1.  br.cloop falls through only when that count is zero, meaning a
C single limb, which is shifted, stored and returned right here.
mov ar.lc = r34;;
br.cloop.dptk .Loops
FSH r26 = r19, r35 ;;
st8 [r32] = r26
mov ar.lc = r2
br.ret.sptk.many b0
C One limb per iteration: r19 holds the previous source limb and r16 the newly
C loaded one.  The stored limb is FSH of r19 combined with BSH of r16, and r16
C then becomes the previous limb for the next iteration.
.Loops:
ld8 r16 = [r33], UPD
FSH r26 = r19, r35 ;;
BSH r27 = r16, r31 ;;
{ .mib; nop.b 0;; } C delay to save 6 cycles...
{ .mib; nop.b 0;; } C delay to save 6 cycles...
{ .mib; nop.b 0;; } C delay to save 6 cycles...
or r27 = r27, r26
mov r19 = r16 ;;
st8 [r32] = r27, UPD
br.cloop.dptk .Loops
C The last source limb has no following limb, so only its FSH part is stored.
FSH r26 = r19, r35 ;;
st8 [r32] = r26
mov ar.lc = r2
br.ret.sptk.many b0
C
C Code for operands with >8 limbs. An edge loop and a very deep software
C pipeline.
C
C r15 = (n-1) mod 4 and r14 = (n-1) / 4.  .Loop0 is the same one-limb step as
C the small-operand loop and, with ar.lc = r15, runs r15+1 times.
.Lbig: and r15 = 3, r34
shr.u r14 = r34, 2 ;;
mov ar.lc = r15
.Loop0:
ld8 r16 = [r33], UPD
FSH r26 = r19, r35 ;;
BSH r27 = r16, r31 ;;
{ .mib; nop.b 0;; } C delay to save 6 cycles...
{ .mib; nop.b 0;; } C delay to save 6 cycles...
{ .mib; nop.b 0;; } C delay to save 6 cycles...
or r27 = r27, r26
mov r19 = r16 ;;
st8 [r32] = r27, UPD
br.cloop.dptk .Loop0
C Loop counter for the unrolled loop is r14-2.
.Lunroll:
add r14 = -2, r14 ;;
mov ar.lc = r14
C Pipeline fill, part 1: issue loads into r16, r17, r18 and r19, take FSH of
C the limb left in r19 by .Loop0 before r19 is reloaded, and start the BSH and
C FSH of r16.
.Lphase1:
{ .mmi
ld8 r16 = [r33], UPD ;;
} { .mmi
ld8 r17 = [r33], UPD ;;
} { .mmi
ld8 r18 = [r33], UPD
FSH r26 = r19, r35 ;;
} { .mmi
ld8 r19 = [r33], UPD
BSH r27 = r16, r31 ;;
} { .mib
FSH r20 = r16, r35
}
C Pipeline fill, part 2: two more loads (r16, r17) and the shifts of r17 and
C r18.  r27 becomes the first complete output limb.  br.cloop enters .Loop
C unless the counter is already zero, in which case control goes to .Lend2.
.Lphase2:
{ .mmi
ld8 r16 = [r33], UPD
BSH r21 = r17, r31
} { .mib
FSH r22 = r17, r35 ;;
} { .mmi
ld8 r17 = [r33], UPD
BSH r23 = r18, r31
} { .mib
or r27 = r27, r26
FSH r24 = r18, r35
br.cloop.dptk .Loop
}
br.sptk .Lend2
C Steady state: each iteration stores four limbs and loads four, while the
C shifts and ors for the following limbs are in progress.
.Loop:
{ .mmi
st8 [r32] = r27, UPD
ld8 r18 = [r33], UPD
BSH r25 = r19, r31
} { .mib
or r21 = r21, r20
FSH r26 = r19, r35 ;;
} { .mmi
st8 [r32] = r21, UPD
ld8 r19 = [r33], UPD
BSH r27 = r16, r31
} { .mib
or r23 = r23, r22
FSH r20 = r16, r35 ;;
} { .mmi
st8 [r32] = r23, UPD
ld8 r16 = [r33], UPD
BSH r21 = r17, r31
} { .mib
or r25 = r25, r24
FSH r22 = r17, r35 ;;
} { .mmi
st8 [r32] = r25, UPD
ld8 r17 = [r33], UPD
BSH r23 = r18, r31
} { .mib
or r27 = r27, r26
FSH r24 = r18, r35
br.cloop.sptk .Loop;;
}
C Wind down: one last load (r18), then the limbs still in the pipeline are
C combined and stored.  Eight limbs are stored from here to the return.  The
C very last store writes r24, the FSH of the last source limb alone, and does
C not step the pointer.
.Lend2:
{ .mmi
st8 [r32] = r27, UPD
ld8 r18 = [r33], UPD
BSH r25 = r19, r31
} { .mib
or r21 = r21, r20
FSH r26 = r19, r35 ;;
} { .mmi
st8 [r32] = r21, UPD
BSH r27 = r16, r31
} { .mib
or r23 = r23, r22
FSH r20 = r16, r35 ;;
} { .mmi
st8 [r32] = r23, UPD
BSH r21 = r17, r31
} { .mib
or r25 = r25, r24
FSH r22 = r17, r35 ;;
} { .mmi
st8 [r32] = r25, UPD
BSH r23 = r18, r31
} { .mib
or r27 = r27, r26
FSH r24 = r18, r35 ;;
}
{ .mmi
st8 [r32] = r27, UPD
} { .mib
or r21 = r21, r20 ;;
} { .mmi
st8 [r32] = r21, UPD
} { .mib
or r23 = r23, r22 ;;
} { .mmi
st8 [r32] = r23, UPD;;
} { .mmi
st8 [r32] = r24
}
C Restore the saved ar.lc and return; r8 still holds the return value.
mov ar.lc = r2
br.ret.sptk.many b0
EPILOGUE(func)
ASM_END()