235 lines
5.2 KiB
NASM
235 lines
5.2 KiB
NASM
|
dnl IA-64 mpn_Xshift.
|
||
|
|
||
|
dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
|
||
|
|
||
|
dnl This file is part of the GNU MP Library.
|
||
|
|
||
|
dnl The GNU MP Library is free software; you can redistribute it and/or modify
|
||
|
dnl it under the terms of the GNU Lesser General Public License as published
|
||
|
dnl by the Free Software Foundation; either version 2.1 of the License, or (at
|
||
|
dnl your option) any later version.
|
||
|
|
||
|
dnl The GNU MP Library is distributed in the hope that it will be useful, but
|
||
|
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||
|
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||
|
dnl License for more details.
|
||
|
|
||
|
dnl You should have received a copy of the GNU Lesser General Public License
|
||
|
dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write
|
||
|
dnl to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
|
||
|
dnl Boston, MA 02110-1301, USA.
|
||
|
|
||
|
include(`../config.m4')
|
||
|
|
||
|
C This code runs at 2 cycles/limb for large operands on the Itanium. It needs
|
||
|
C a very deep software pipeline, since shl/shr.u have a 4 cycle latency. The
|
||
|
C main loop here is not great; it is overscheduled with respect to the shr.u
|
||
|
C instructions, and this actually turns out to give considerably more complex
|
||
|
C wind down code. The code runs slowly for operands with <= 8 limbs, since we
|
||
|
C have a non-scheduled loop for that case. We also have a primitive loop for
|
||
|
C the unrolling edge, and as a consequence of the main loop stupidity it is
|
||
|
C executed 1-4 steps instead of 0-3 steps.
|
||
|
|
||
|
C By having 63 separate loops using the shrp instruction, we could easily reach
|
||
|
C 1 cycle/limb. Such loops would require a less deep software pipeline, since
|
||
|
C shrp, unlike shl/shr.u, has a plain one-cycle latency.
|
||
|
|
||
|
C INPUT PARAMETERS
|
||
|
C rp = r32
|
||
|
C sp = r33
|
||
|
C n = r34
|
||
|
C cnt = r35
|
||
|
|
||
|
C Per-operation configuration, selected by the OPERATION_* symbol:
C   FSH  - shift instruction applied to a limb with the count cnt (r35)
C   BSH  - opposite shift, applied with the count 64-cnt (r31)
C   UPD  - byte step used by the post-incrementing ld8/st8 accesses
C   func - name of the generated entry point
C
C Left shift: FSH is shl, BSH is shr.u, and the pointers step by -8 bytes.
ifdef(`OPERATION_lshift',`
|
||
|
define(`FSH',`shl')
|
||
|
define(`BSH',`shr.u')
|
||
|
define(`UPD',`-8')
|
||
|
define(`func',`mpn_lshift')
|
||
|
')
|
||
|
C Right shift: the two shift instructions swap roles and the pointers step
C by +8 bytes.
ifdef(`OPERATION_rshift',`
|
||
|
define(`FSH',`shr.u')
|
||
|
define(`BSH',`shl')
|
||
|
define(`UPD',`8')
|
||
|
define(`func',`mpn_rshift')
|
||
|
')
|
||
|
|
||
|
C Entry point and common setup.
C Register use in this routine:
C   r32 = rp, r33 = sp, r34 = n (n-1 after setup), r35 = cnt
C   r31 = 64-cnt, r2 = saved ar.lc, r19 = most recently consumed source limb
C   r8  = return value (bits shifted out of the first limb processed)
ASM_START()
|
||
|
PROLOGUE(func)
|
||
|
.prologue
|
||
|
C 32-bit ABI only: widen both pointers with addp4, sign-extend n and
C zero-extend cnt so the rest of the code can use full 64-bit registers.
ifdef(`HAVE_ABI_32',
|
||
|
` addp4 r32 = 0, r32
|
||
|
addp4 r33 = 0, r33
|
||
|
sxt4 r34 = r34
|
||
|
zxt4 r35 = r35
|
||
|
;;
|
||
|
')
|
||
|
C r34 = n-1 and r31 = 64-cnt (the complementary shift count used by BSH).
add r34 = -1, r34
|
||
|
sub r31 = 64, r35
|
||
|
C ar.lc is used as the loop counter below; keep the incoming value in r2 so
C it can be restored before each return.
.save ar.lc, r2
|
||
|
mov r2 = ar.lc;;
|
||
|
.body
|
||
|
C p6 is set when 8 <= n-1 (unsigned compare), i.e. for more than 8 limbs.
cmp.leu p6, p7 = 8,r34
|
||
|
C Left shift only: advance both pointers by 8*(n-1) bytes so that processing
C starts at the last limb and walks downwards (UPD is negative in that case).
ifdef(`OPERATION_lshift',`
|
||
|
shladd r33 = r34, 3, r33
|
||
|
shladd r32 = r34, 3, r32;;
|
||
|
')
|
||
|
C Load the first limb to process, derive the return value from it, then
C pick the large-operand path if p6 is set.
ld8 r19 = [r33], UPD ;;
|
||
|
BSH r8 = r19, r31 C function return value
|
||
|
(p6) br.dptk .Lbig
|
||
|
|
||
|
C
|
||
|
C Code for small operands. Not an optimization for the Itanium, it is here
|
||
|
C just to simplify the general case.
|
||
|
C
|
||
|
C Small-operand path, reached when p6 is clear (n-1 < 8).
C ar.lc = n-1 is the number of passes through .Loops. br.cloop only branches
C when ar.lc is nonzero, so for n = 1 execution falls through to the
C single-limb case right below.
mov ar.lc = r34;;
|
||
|
br.cloop.dptk .Loops
|
||
|
C n = 1: the only result limb is the source limb shifted by cnt.
FSH r26 = r19, r35 ;;
|
||
|
st8 [r32] = r26
|
||
|
mov ar.lc = r2
|
||
|
br.ret.sptk.many b0
|
||
|
C One result limb per pass: the previous limb (r19) shifted by cnt is
C combined with the newly loaded limb (r16) shifted the other way by 64-cnt.
C r16 then becomes the previous limb for the next pass.
.Loops:
|
||
|
ld8 r16 = [r33], UPD
|
||
|
FSH r26 = r19, r35 ;;
|
||
|
BSH r27 = r16, r31 ;;
|
||
|
{ .mib; nop.b 0;; } C delay to save 6 cycles...
|
||
|
{ .mib; nop.b 0;; } C delay to save 6 cycles...
|
||
|
{ .mib; nop.b 0;; } C delay to save 6 cycles...
|
||
|
or r27 = r27, r26
|
||
|
mov r19 = r16 ;;
|
||
|
st8 [r32] = r27, UPD
|
||
|
br.cloop.dptk .Loops
|
||
|
C Final result limb: only the FSH part of the last source limb remains.
FSH r26 = r19, r35 ;;
|
||
|
st8 [r32] = r26
|
||
|
C Restore ar.lc from r2 and return; r8 already holds the return value.
mov ar.lc = r2
|
||
|
br.ret.sptk.many b0
|
||
|
|
||
|
C
|
||
|
C Code for operands with >8 limbs. An edge loop and a very deep software
|
||
|
C pipeline.
|
||
|
C
|
||
|
C Large-operand path (n-1 >= 8).
C r15 = (n-1) mod 4 is the loop count for the edge loop .Loop0.
C r14 = (n-1) / 4 is consumed after the edge loop to size the unrolled loop.
.Lbig: and r15 = 3, r34
|
||
|
shr.u r14 = r34, 2 ;;
|
||
|
mov ar.lc = r15
|
||
|
C Edge loop: the same one-limb-per-pass step as the small-operand loop.
C With ar.lc = r15 the body runs r15+1 times, i.e. 1 to 4 passes, and leaves
C the most recent source limb in r19 for the code that follows.
.Loop0:
|
||
|
ld8 r16 = [r33], UPD
|
||
|
FSH r26 = r19, r35 ;;
|
||
|
BSH r27 = r16, r31 ;;
|
||
|
{ .mib; nop.b 0;; } C delay to save 6 cycles...
|
||
|
{ .mib; nop.b 0;; } C delay to save 6 cycles...
|
||
|
{ .mib; nop.b 0;; } C delay to save 6 cycles...
|
||
|
or r27 = r27, r26
|
||
|
mov r19 = r16 ;;
|
||
|
st8 [r32] = r27, UPD
|
||
|
br.cloop.dptk .Loop0
|
||
|
|
||
|
C Set up the 4-way unrolled loop: ar.lc = (n-1)/4 - 2. The pipeline fill
C below (6 loads) and the wind-down at .Lend2 (1 load, 8 stores) account
C for the limbs not handled inside .Loop.
.Lunroll:
|
||
|
add r14 = -2, r14 ;;
|
||
|
mov ar.lc = r14
|
||
|
|
||
|
C Pipeline fill, part 1: load four limbs into r16-r19. The FSH into r26
C reads r19 while it still holds the limb left by the edge loop, one
C instruction group before r19 is reloaded. r27 and r20 start the shifts
C of the first newly loaded limb (r16).
.Lphase1:
|
||
|
{ .mmi
|
||
|
ld8 r16 = [r33], UPD ;;
|
||
|
} { .mmi
|
||
|
ld8 r17 = [r33], UPD ;;
|
||
|
} { .mmi
|
||
|
ld8 r18 = [r33], UPD
|
||
|
FSH r26 = r19, r35 ;;
|
||
|
} { .mmi
|
||
|
ld8 r19 = [r33], UPD
|
||
|
BSH r27 = r16, r31 ;;
|
||
|
} { .mib
|
||
|
FSH r20 = r16, r35
|
||
|
}
|
||
|
|
||
|
C Pipeline fill, part 2: load two more limbs into r16 and r17, issue both
C shifts of the earlier r17 and r18 values, and complete the first result
C limb in r27. Enter .Loop when ar.lc is nonzero; otherwise go straight to
C the wind-down code at .Lend2.
.Lphase2:
|
||
|
{ .mmi
|
||
|
ld8 r16 = [r33], UPD
|
||
|
BSH r21 = r17, r31
|
||
|
} { .mib
|
||
|
FSH r22 = r17, r35 ;;
|
||
|
} { .mmi
|
||
|
ld8 r17 = [r33], UPD
|
||
|
BSH r23 = r18, r31
|
||
|
} { .mib
|
||
|
or r27 = r27, r26
|
||
|
FSH r24 = r18, r35
|
||
|
br.cloop.dptk .Loop
|
||
|
}
|
||
|
br.sptk .Lend2
|
||
|
C Main software-pipelined loop. Each pass stores four result limbs and
C loads four source limbs. Source limbs rotate through r16-r19 and the
C shift results through r20-r27; every or combines a BSH result with an
C FSH result that was issued earlier, and every st8 writes a limb whose or
C was issued in a previous instruction group. The register state at the
C bottom of the loop matches the state at the end of .Lphase2, so the loop
C can be entered from there and left into .Lend2.
.Loop:
|
||
|
{ .mmi
|
||
|
st8 [r32] = r27, UPD
|
||
|
ld8 r18 = [r33], UPD
|
||
|
BSH r25 = r19, r31
|
||
|
} { .mib
|
||
|
or r21 = r21, r20
|
||
|
FSH r26 = r19, r35 ;;
|
||
|
} { .mmi
|
||
|
st8 [r32] = r21, UPD
|
||
|
ld8 r19 = [r33], UPD
|
||
|
BSH r27 = r16, r31
|
||
|
} { .mib
|
||
|
or r23 = r23, r22
|
||
|
FSH r20 = r16, r35 ;;
|
||
|
} { .mmi
|
||
|
st8 [r32] = r23, UPD
|
||
|
ld8 r16 = [r33], UPD
|
||
|
BSH r21 = r17, r31
|
||
|
} { .mib
|
||
|
or r25 = r25, r24
|
||
|
FSH r22 = r17, r35 ;;
|
||
|
} { .mmi
|
||
|
st8 [r32] = r25, UPD
|
||
|
ld8 r17 = [r33], UPD
|
||
|
BSH r23 = r18, r31
|
||
|
} { .mib
|
||
|
or r27 = r27, r26
|
||
|
FSH r24 = r18, r35
|
||
|
br.cloop.sptk .Loop;;
|
||
|
}
|
||
|
C Wind-down: drain the software pipeline. This first part repeats the loop
C body with only one load left (the last source limb, into r18); it stores
C four result limbs and issues the remaining shifts.
.Lend2:
|
||
|
{ .mmi
|
||
|
st8 [r32] = r27, UPD
|
||
|
ld8 r18 = [r33], UPD
|
||
|
BSH r25 = r19, r31
|
||
|
} { .mib
|
||
|
or r21 = r21, r20
|
||
|
FSH r26 = r19, r35 ;;
|
||
|
} { .mmi
|
||
|
st8 [r32] = r21, UPD
|
||
|
BSH r27 = r16, r31
|
||
|
} { .mib
|
||
|
or r23 = r23, r22
|
||
|
FSH r20 = r16, r35 ;;
|
||
|
} { .mmi
|
||
|
st8 [r32] = r23, UPD
|
||
|
BSH r21 = r17, r31
|
||
|
} { .mib
|
||
|
or r25 = r25, r24
|
||
|
FSH r22 = r17, r35 ;;
|
||
|
} { .mmi
|
||
|
st8 [r32] = r25, UPD
|
||
|
BSH r23 = r18, r31
|
||
|
} { .mib
|
||
|
or r27 = r27, r26
|
||
|
FSH r24 = r18, r35 ;;
|
||
|
}
|
||
|
|
||
|
C No loads or shifts remain. Store r27, then combine and store the two
C pending pairs (r21 with r20, r23 with r22). The final result limb is r24,
C the FSH of the last source limb, which has no partner limb to merge with
C and is stored without a pointer update.
{ .mmi
|
||
|
st8 [r32] = r27, UPD
|
||
|
} { .mib
|
||
|
or r21 = r21, r20 ;;
|
||
|
} { .mmi
|
||
|
st8 [r32] = r21, UPD
|
||
|
} { .mib
|
||
|
or r23 = r23, r22 ;;
|
||
|
} { .mmi
|
||
|
st8 [r32] = r23, UPD;;
|
||
|
} { .mmi
|
||
|
st8 [r32] = r24
|
||
|
}
|
||
|
C Restore ar.lc from r2 and return; r8 was set before the branch to .Lbig.
mov ar.lc = r2
|
||
|
br.ret.sptk.many b0
|
||
|
EPILOGUE(func)
|
||
|
ASM_END()
|