dnl  mpir/mpn/ia64/lorrshift.asm

dnl  IA-64 mpn_Xshift.

dnl  Copyright 2000, 2001, 2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 2.1 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library; see the file COPYING.LIB.  If not, write
dnl  to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
dnl  Boston, MA 02110-1301, USA.

include(`../config.m4')

C This code runs at 2 cycles/limb for large operands on the Itanium.  It needs
C a very deep software pipeline, since shl/shr.u have a 4 cycle latency.  The
C main loop here is not great; it is overscheduled with respect to the shr.u
C instructions, and this actually turns out to give considerably more complex
C wind down code.  The code runs slowly for operands with <= 8 limbs, since we
C have a non-scheduled loop for that case.  We also have a primitive loop for
C the unrolling edge, and as a consequence of the main loop stupidity it is
C executed 1-4 steps instead of 0-3 steps.

C By having 63 separate loops using the shrp instruction, we could easily reach
C 1 cycle/limb.  Such loops would require a less deep software pipeline, since
C shrp, unlike shl/shr.u, has a plain one cycle latency.

C INPUT PARAMETERS
C rp = r32
C sp = r33
C n = r34
C cnt = r35

C Operation selection.  Each branch below defines the four macros that the
C function body is written in terms of:
C   FSH   shift instruction applied to the current limb with count cnt
C   BSH   opposite shift, applied to the neighbouring limb with count 64-cnt
C   UPD   post-increment in bytes used on every limb load and store:
C         -8 for lshift, 8 for rshift
C   func  name of the generated entry point
C Presumably the build defines exactly one of OPERATION_lshift and
C OPERATION_rshift -- TODO confirm.  Nothing here reports an error when
C neither is defined; the macros are then simply left undefined.
ifdef(`OPERATION_lshift',`
	define(`FSH',`shl')
	define(`BSH',`shr.u')
	define(`UPD',`-8')
	define(`func',`mpn_lshift')
')
ifdef(`OPERATION_rshift',`
	define(`FSH',`shr.u')
	define(`BSH',`shl')
	define(`UPD',`8')
	define(`func',`mpn_rshift')
')

C -----------------------------------------------------------------------------
C func -- expands to mpn_lshift or mpn_rshift, depending on the OPERATION macro.
C
C In:    r32 = rp, r33 = sp, r34 = n, r35 = cnt
C Out:   r8  = BSH of the first limb loaded by 64-cnt, the function return value
C Uses:  r2, r14-r27, r31, p6, p7, ar.lc.  ar.lc is saved in r2 on entry and
C        restored before each br.ret.
C NOTE(review): assumes n >= 1 and 0 < cnt < 64; neither is checked here --
C confirm against the mpn interface documentation.
C
C Every result limb except the last is FSH(current, cnt) | BSH(next, 64-cnt),
C where current and next are consecutive source limbs in walking order.  The
C last result limb is FSH(last source limb, cnt) alone.
C
C Structure:
C   n-1 <  8   .Loops   plain loop, one limb per iteration
C   n-1 >= 8   .Lbig    edge loop .Loop0, then the 4-way unrolled and software
C                       pipelined .Loop with its fill and wind-down code
C -----------------------------------------------------------------------------
ASM_START()
PROLOGUE(func)
	.prologue
C 32-bit ABI only: turn the two 32-bit pointers into 64-bit addresses with
C addp4, sign-extend n and zero-extend cnt.
ifdef(`HAVE_ABI_32',
`	addp4	r32 = 0, r32
	addp4	r33 = 0, r33
	sxt4	r34 = r34
	zxt4	r35 = r35
	;;
')
C r34 = n-1 from here on, r31 = 64-cnt is the shift count used with BSH.
	add	r34 = -1, r34
	sub	r31 = 64, r35
	.save	ar.lc, r2
	mov	r2 = ar.lc;;
	.body
C p6 is set when 8 <= n-1 as unsigned numbers; it selects the .Lbig path.
	cmp.leu	p6, p7 = 8,r34
C lshift only: advance both pointers by 8*(n-1) bytes, to the limb at the
C highest address.  With UPD = -8 the operands are then walked downwards.
ifdef(`OPERATION_lshift',`
	shladd	r33 = r34, 3, r33
	shladd	r32 = r34, 3, r32;;
')
C r19 = first source limb in walking order; sp steps by UPD.
	ld8	r19 = [r33], UPD	;;
	BSH	r8 = r19, r31		C function return value
   (p6) br.dptk	.Lbig

C
C Code for small operands.  Not an optimization for the Itanium, it is here
C just to simplify the general case.
C
C ar.lc = n-1.  br.cloop only branches, and decrements ar.lc, while ar.lc is
C nonzero, so n = 1 falls through to the single limb case right below it.
	mov	ar.lc = r34;;
	br.cloop.dptk .Loops
C n = 1: the only result limb is FSH of the only source limb.
	FSH	r26 = r19, r35	;;
	st8	[r32] = r26
	mov	ar.lc = r2
	br.ret.sptk.many b0
C One result limb per iteration.  r19 is the current source limb, r16 the
C next one; the stored limb is FSH(r19, cnt) | BSH(r16, 64-cnt).  The body
C runs n-1 times in total.
.Loops:
	ld8	r16 = [r33], UPD
	FSH	r26 = r19, r35	;;
	BSH	r27 = r16, r31	;;
	{ .mib;	nop.b 0;; }			C delay to save 6 cycles...
	{ .mib;	nop.b 0;; }			C delay to save 6 cycles...
	{ .mib;	nop.b 0;; }			C delay to save 6 cycles...
	or	r27 = r27, r26
	mov	r19 = r16	;;
	st8	[r32] = r27, UPD
	br.cloop.dptk .Loops
C Last limb: there is no further source limb, so only the FSH part is stored.
	FSH	r26 = r19, r35	;;
	st8	[r32] = r26
	mov	ar.lc = r2
	br.ret.sptk.many b0

C
C Code for operands with >8 limbs.  An edge loop and a very deep software
C pipeline.
C
C r15 = (n-1) mod 4 and r14 = (n-1) / 4.  The edge loop .Loop0 has the same
C body as .Loops and runs r15+1 times, which leaves 4*r14 result limbs for
C the code that follows.
.Lbig:	and	r15 = 3, r34
	shr.u	r14 = r34, 2	;;
	mov	ar.lc = r15
.Loop0:
	ld8	r16 = [r33], UPD
	FSH	r26 = r19, r35	;;
	BSH	r27 = r16, r31	;;
	{ .mib;	nop.b 0;; }			C delay to save 6 cycles...
	{ .mib;	nop.b 0;; }			C delay to save 6 cycles...
	{ .mib;	nop.b 0;; }			C delay to save 6 cycles...
	or	r27 = r27, r26
	mov	r19 = r16	;;
	st8	[r32] = r27, UPD
	br.cloop.dptk .Loop0

C ar.lc = r14-2, the number of times the body of .Loop is executed.  The
C wind-down code after the loop stores the other 8 of the 4*r14 limbs.
C r14 >= 2 holds on this path because n-1 >= 8.
.Lunroll:
	add	r14 = -2, r14	;;
	mov	ar.lc = r14

C Pipeline fill, no stores yet.  Phase 1 issues four loads into r16, r17,
C r18 and r19, and starts the shifts for the first pending limb.  The old r19
C is consumed by FSH one instruction group before r19 is reloaded.
.Lphase1:
  { .mmi
	ld8	r16 = [r33], UPD	;;
} { .mmi
	ld8	r17 = [r33], UPD	;;
} { .mmi
	ld8	r18 = [r33], UPD
	FSH	r26 = r19, r35	;;
} { .mmi
	ld8	r19 = [r33], UPD
	BSH	r27 = r16, r31	;;
} { .mib
	FSH	r20 = r16, r35
}

C Phase 2 issues two more loads into r16 and r17, and forms the first
C combined limb in r27.  If ar.lc is zero here the loop body is skipped and
C control goes straight to the wind-down code at .Lend2.
.Lphase2:
  { .mmi
	ld8	r16 = [r33], UPD
	BSH	r21 = r17, r31
} { .mib
	FSH	r22 = r17, r35	;;
} { .mmi
	ld8	r17 = [r33], UPD
	BSH	r23 = r18, r31
} { .mib
	or	r27 = r27, r26
	FSH	r24 = r18, r35
	br.cloop.dptk .Loop
}
	br.sptk	.Lend2
C Main loop: four loads and four stores per iteration.  Register use:
C   r16-r19              loaded source limbs
C   r20, r22, r24, r26   FSH results
C   r21, r23, r25, r27   BSH results, and after the or the limbs to store
C Each store writes a limb that was combined in an earlier instruction group.
.Loop:
  { .mmi
	st8	[r32] = r27, UPD
	ld8	r18 = [r33], UPD
	BSH	r25 = r19, r31
} { .mib
	or	r21 = r21, r20
	FSH	r26 = r19, r35	;;
} { .mmi
	st8	[r32] = r21, UPD
	ld8	r19 = [r33], UPD
	BSH	r27 = r16, r31
} { .mib
	or	r23 = r23, r22
	FSH	r20 = r16, r35	;;
} { .mmi
	st8	[r32] = r23, UPD
	ld8	r16 = [r33], UPD
	BSH	r21 = r17, r31
} { .mib
	or	r25 = r25, r24
	FSH	r22 = r17, r35	;;
} { .mmi
	st8	[r32] = r25, UPD
	ld8	r17 = [r33], UPD
	BSH	r23 = r18, r31
} { .mib
	or	r27 = r27, r26
	FSH	r24 = r18, r35
	br.cloop.sptk .Loop;;
}
C Wind-down, first part: the same schedule as one loop iteration, except
C that only the first of the four loads, the one into r18, is issued.
C That load fetches the final source limb.
.Lend2:
  { .mmi
	st8	[r32] = r27, UPD
	ld8	r18 = [r33], UPD
	BSH	r25 = r19, r31
} { .mib
	or	r21 = r21, r20
	FSH	r26 = r19, r35	;;
} { .mmi
	st8	[r32] = r21, UPD
	BSH	r27 = r16, r31
} { .mib
	or	r23 = r23, r22
	FSH	r20 = r16, r35	;;
} { .mmi
	st8	[r32] = r23, UPD
	BSH	r21 = r17, r31
} { .mib
	or	r25 = r25, r24
	FSH	r22 = r17, r35	;;
} { .mmi
	st8	[r32] = r25, UPD
	BSH	r23 = r18, r31
} { .mib
	or	r27 = r27, r26
	FSH	r24 = r18, r35	;;
}

C Wind-down, second part: store the three limbs still in flight, then the
C last limb r24, which is FSH of the final source limb r18 alone.  The last
C store has no post-increment.
  { .mmi
	st8	[r32] = r27, UPD
} { .mib
	or	r21 = r21, r20	;;
} { .mmi
	st8	[r32] = r21, UPD
} { .mib
	or	r23 = r23, r22	;;
} { .mmi
	st8	[r32] = r23, UPD;;
} { .mmi
	st8	[r32] = r24
}
C Restore the ar.lc value saved on entry and return; r8 holds the result.
	mov	ar.lc = r2
	br.ret.sptk.many b0
EPILOGUE(func)
ASM_END()
Basic GMP files with a new core2 directory and amd_64 directory with Martin's and Gaudry's patches. Removed directories for no longer supported architectures. 2008-04-17 17:03:07 -04:00			`dnl IA-64 mpn_Xshift.`

			`dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.`

			`dnl This file is part of the GNU MP Library.`

			`dnl The GNU MP Library is free software; you can redistribute it and/or modify`
			`dnl it under the terms of the GNU Lesser General Public License as published`
			`dnl by the Free Software Foundation; either version 2.1 of the License, or (at`
			`dnl your option) any later version.`

			`dnl The GNU MP Library is distributed in the hope that it will be useful, but`
			`dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY`
			`dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public`
			`dnl License for more details.`

			`dnl You should have received a copy of the GNU Lesser General Public License`
			`dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write`
			`dnl to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,`
			`dnl Boston, MA 02110-1301, USA.`

			include(`../config.m4')

			`C This code runs at 2 cycles/limb for large operands on the Itanium. It needs`
			`C a very deep software pipeline, since shl/shr.u have a 4 cycle latency. The`
			`C main loop here is not great; it is oversheduled with respect to the shr.u`
			`C instructions, and this actually turns out to give considerably more complex`
			`C wind down code. The code runs slowly for operands with <= 8 limbs, since we`
			`C have a non-scheduled loop for that case. We also have a primitive loop for`
			`C the unrolling edge, and as a consequence of the main loop stupidity it is`
			`C executed 1-4 steps instead of 0-3 steps.`

			`C By having 63 separate loops using the shrp instruction, we could easily reach`
			`C 1 cycle/limb. Such loops would require a less deep software pipeline, since`
			`C shrp unlike shl/shr.u have a plain one cycle latency.`

			`C INPUT PARAMETERS`
			`C rp = r32`
			`C sp = r33`
			`C n = r34`
			`C cnt = r35`

			ifdef(`OPERATION_lshift',`
			define(`FSH',`shl')
			define(`BSH',`shr.u')
			define(`UPD',`-8')
			define(`func',`mpn_lshift')
			`')`
			ifdef(`OPERATION_rshift',`
			define(`FSH',`shr.u')
			define(`BSH',`shl')
			define(`UPD',`8')
			define(`func',`mpn_rshift')
			`')`

			`ASM_START()`
			`PROLOGUE(func)`
			`.prologue`
			ifdef(`HAVE_ABI_32',
			` addp4 r32 = 0, r32
			`addp4 r33 = 0, r33`
			`sxt4 r34 = r34`
			`zxt4 r35 = r35`
			`;;`
			`')`
			`add r34 = -1, r34`
			`sub r31 = 64, r35`
			`.save ar.lc, r2`
			`mov r2 = ar.lc;;`
			`.body`
			`cmp.leu p6, p7 = 8,r34`
			ifdef(`OPERATION_lshift',`
			`shladd r33 = r34, 3, r33`
			`shladd r32 = r34, 3, r32;;`
			`')`
			`ld8 r19 = [r33], UPD ;;`
			`BSH r8 = r19, r31 C function return value`
			`(p6) br.dptk .Lbig`

			`C`
			`C Code for small operands. Not an optimization for the Itanium, it is here`
			`C just to simplify the general case.`
			`C`
			`mov ar.lc = r34;;`
			`br.cloop.dptk .Loops`
			`FSH r26 = r19, r35 ;;`
			`st8 [r32] = r26`
			`mov ar.lc = r2`
			`br.ret.sptk.many b0`
			`.Loops:`
			`ld8 r16 = [r33], UPD`
			`FSH r26 = r19, r35 ;;`
			`BSH r27 = r16, r31 ;;`
			`{ .mib; nop.b 0;; } C delay to save 6 cycles...`
			`{ .mib; nop.b 0;; } C delay to save 6 cycles...`
			`{ .mib; nop.b 0;; } C delay to save 6 cycles...`
			`or r27 = r27, r26`
			`mov r19 = r16 ;;`
			`st8 [r32] = r27, UPD`
			`br.cloop.dptk .Loops`
			`FSH r26 = r19, r35 ;;`
			`st8 [r32] = r26`
			`mov ar.lc = r2`
			`br.ret.sptk.many b0`

			`C`
			`C Code for operands with >8 limbs. An edge loop and a very deep software`
			`C pipeline.`
			`C`
			`.Lbig: and r15 = 3, r34`
			`shr.u r14 = r34, 2 ;;`
			`mov ar.lc = r15`
			`.Loop0:`
			`ld8 r16 = [r33], UPD`
			`FSH r26 = r19, r35 ;;`
			`BSH r27 = r16, r31 ;;`
			`{ .mib; nop.b 0;; } C delay to save 6 cycles...`
			`{ .mib; nop.b 0;; } C delay to save 6 cycles...`
			`{ .mib; nop.b 0;; } C delay to save 6 cycles...`
			`or r27 = r27, r26`
			`mov r19 = r16 ;;`
			`st8 [r32] = r27, UPD`
			`br.cloop.dptk .Loop0`

			`.Lunroll:`
			`add r14 = -2, r14 ;;`
			`mov ar.lc = r14`

			`.Lphase1:`
			`{ .mmi`
			`ld8 r16 = [r33], UPD ;;`
			`} { .mmi`
			`ld8 r17 = [r33], UPD ;;`
			`} { .mmi`
			`ld8 r18 = [r33], UPD`
			`FSH r26 = r19, r35 ;;`
			`} { .mmi`
			`ld8 r19 = [r33], UPD`
			`BSH r27 = r16, r31 ;;`
			`} { .mib`
			`FSH r20 = r16, r35`
			`}`

			`.Lphase2:`
			`{ .mmi`
			`ld8 r16 = [r33], UPD`
			`BSH r21 = r17, r31`
			`} { .mib`
			`FSH r22 = r17, r35 ;;`
			`} { .mmi`
			`ld8 r17 = [r33], UPD`
			`BSH r23 = r18, r31`
			`} { .mib`
			`or r27 = r27, r26`
			`FSH r24 = r18, r35`
			`br.cloop.dptk .Loop`
			`}`
			`br.sptk .Lend2`
			`.Loop:`
			`{ .mmi`
			`st8 [r32] = r27, UPD`
			`ld8 r18 = [r33], UPD`
			`BSH r25 = r19, r31`
			`} { .mib`
			`or r21 = r21, r20`
			`FSH r26 = r19, r35 ;;`
			`} { .mmi`
			`st8 [r32] = r21, UPD`
			`ld8 r19 = [r33], UPD`
			`BSH r27 = r16, r31`
			`} { .mib`
			`or r23 = r23, r22`
			`FSH r20 = r16, r35 ;;`
			`} { .mmi`
			`st8 [r32] = r23, UPD`
			`ld8 r16 = [r33], UPD`
			`BSH r21 = r17, r31`
			`} { .mib`
			`or r25 = r25, r24`
			`FSH r22 = r17, r35 ;;`
			`} { .mmi`
			`st8 [r32] = r25, UPD`
			`ld8 r17 = [r33], UPD`
			`BSH r23 = r18, r31`
			`} { .mib`
			`or r27 = r27, r26`
			`FSH r24 = r18, r35`
			`br.cloop.sptk .Loop;;`
			`}`
			`.Lend2:`
			`{ .mmi`
			`st8 [r32] = r27, UPD`
			`ld8 r18 = [r33], UPD`
			`BSH r25 = r19, r31`
			`} { .mib`
			`or r21 = r21, r20`
			`FSH r26 = r19, r35 ;;`
			`} { .mmi`
			`st8 [r32] = r21, UPD`
			`BSH r27 = r16, r31`
			`} { .mib`
			`or r23 = r23, r22`
			`FSH r20 = r16, r35 ;;`
			`} { .mmi`
			`st8 [r32] = r23, UPD`
			`BSH r21 = r17, r31`
			`} { .mib`
			`or r25 = r25, r24`
			`FSH r22 = r17, r35 ;;`
			`} { .mmi`
			`st8 [r32] = r25, UPD`
			`BSH r23 = r18, r31`
			`} { .mib`
			`or r27 = r27, r26`
			`FSH r24 = r18, r35 ;;`
			`}`

			`{ .mmi`
			`st8 [r32] = r27, UPD`
			`} { .mib`
			`or r21 = r21, r20 ;;`
			`} { .mmi`
			`st8 [r32] = r21, UPD`
			`} { .mib`
			`or r23 = r23, r22 ;;`
			`} { .mmi`
			`st8 [r32] = r23, UPD;;`
			`} { .mmi`
			`st8 [r32] = r24`
			`}`
			`mov ar.lc = r2`
			`br.ret.sptk.many b0`
			`EPILOGUE(func)`
			`ASM_END()`