mpir/mpn/alpha/ev5/com_n.asm

dnl  Alpha EV5 mpn_com_n -- mpn one's complement.

dnl  Copyright 2003 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 2.1 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public
dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
dnl  not, write to the Free Software Foundation, Inc., 51 Franklin Street,
dnl  Fifth Floor, Boston, MA 02110-1301, USA.

include(`../config.m4')


C      cycles/limb
C EV4:    4.75
C EV5:    2.0
C EV6:    1.5


C void mpn_com_n (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C For ev5 the main loop is 7 cycles plus 1 taken branch bubble, for a total
C 2.0 c/l.  In general, a pattern like this unrolled to N limbs per loop
C will be 1.5+2/N c/l.
C
C 2 cycles of loop control are unavoidable, for pointer updates and the
C taken branch bubble, but also since ldq cannot issue two cycles after stq
C (and with a run of stqs that means neither of two cycles at the end of the
C loop).
C
C The fbeq is forced into the second cycle of the loop using unops, since
C the first time through it must wait for the cvtqt result.  Once that
C result is ready (a 1 cycle stall) then both the branch and following loads
C can issue together.
C
C The main loop handles an odd count of limbs, being two limbs loaded before
C each size test, plus one pipelined around from the previous iteration (or
C setup in the entry sequence).
C
C An even number of limbs is handled by an explicit dst[0]=~src[0] in the
C entry sequence, and an increment of the pointers.  For an odd size there's
C no increment and the first store in the loop (r24) is a repeat of dst[0].
C
C Note that the load for r24 after the possible pointer increment is done
C before the explicit store to dst[0], in case src==dst.


ASM_START()

FLOAT64(L(dat), 2.0)

	ALIGN(16)

PROLOGUE(mpn_com_n,gp)

	C Store the one's complement of each of the size limbs at src into the
	C corresponding limb at dst.
	C
	C Inputs:
	C r16	dst
	C r17	src
	C r18	size
	C
	C src[0] is loaded and dst[0] is stored unconditionally, so size is
	C assumed to be at least 1.  Nothing is placed in r0.
	C
	C Registers written: r5-r8, r17, r19-r24, f0, f1.  r30 is lowered by 16
	C bytes and restored again before the loop; that slot is only used to
	C move the loop count from an integer register to a floating point one.
	C
	C NOTE(review): the gp argument to PROLOGUE is presumably what makes the
	C LEA of L(dat) below usable -- confirm against the macro definitions.

	C Entry sequence.  Computes the loop count (size-3)/2, handles dst[0]
	C explicitly, and primes r24 with the first pipelined result.

	lda	r30, -16(r30)		C temporary stack space
	lda	r7, -3(r18)		C size - 3

	ldq	r20, 0(r17)		C src[0]
	srl	r7, 1, r6		C (size-3)/2

	stq	r6, 8(r30)		C (size-3)/2
	and	r7, 1, r5		C 1 if size even

	LEA(	r8, L(dat))
	s8addq	r5, r17, r17		C skip src[0] if even

	ornot	r31, r20, r20		C ~src[0]
	unop

	ldt	f0, 8(r30)		C (size-3)/2
	ldq	r24, 0(r17)		C src[0 or 1]

	C The r24 load above comes before this store so that an overlapping
	C src==dst still sees the original source limb.
	stq	r20, 0(r16)		C dst[0]
	s8addq	r5, r16, r19		C skip dst[0] if even

	ldt	f1, 0(r8)		C data 2.0
	lda	r30, 16(r30)		C restore stack
	unop
	cvtqt	f0, f0			C (size-3)/2 as float

	C For size<=2, r7 is negative and the count in f0 is not meaningful,
	C but f0 is only read inside the loop, which the blt below skips.
	C r19/r24 already describe the last limb in that case.
	ornot	r31, r24, r24
	blt	r7, L(done_1)		C if size<=2
	unop
	unop


	C 16-byte alignment here
L(top):
	C r17	src, incrementing
	C r19	dst, incrementing
	C r24	dst[i] result, ready to store
	C f0	(size-3)/2, decrementing
	C f1	2.0
	C
	C On entry to each pass an odd number of limbs (at least 3) remains:
	C the one pipelined in r24 plus the ones still to be loaded.  A full
	C pass stores 4 limbs (r24, r20, r21, r22) and leaves the complement of
	C the fourth loaded limb in r24 for the next pass or for L(done_1).

	ldq	r20, 8(r17)		C src[i+1]
	ldq	r21, 16(r17)		C src[i+2]
	unop
	unop

	C f0 == 0 means exactly 3 limbs remain (r24, r20, r21), so finish in
	C L(done_2) without loading any further limbs.
	fbeq	f0, L(done_2)
	unop
	ldq	r22, 24(r17)		C src[i+3]
	ldq	r23, 32(r17)		C src[i+4]

	stq	r24, 0(r19)		C dst[i]
	ornot	r31, r20, r20
	subt	f0, f1, f0		C count -= 2
	unop

	stq	r20, 8(r19)		C dst[i+1]
	ornot	r31, r21, r21
	unop
	unop

	stq	r21, 16(r19)		C dst[i+2]
	ornot	r31, r22, r22

	stq	r22, 24(r19)		C dst[i+3]
	ornot	r31, r23, r24		C ~src[i+4], pipelined into next pass

	C A count that was 1 before the subtract is now -1: only the limb in
	C r24 remains, so fall through to L(done_1).  Otherwise go round again.
	lda	r17, 32(r17)		C src += 4
	lda	r19, 32(r19)		C dst += 4
	unop
	fbge	f0, L(top)


L(done_1):
	C Reached from the entry sequence when size<=2, or by falling out of
	C the loop with one limb left.
	C
	C r19	&dst[size-1]
	C r24	result for dst[size-1]

	stq	r24, 0(r19)		C dst[size-1]
	ret	r31, (r26), 1


L(done_2):
	C Reached from the fbeq in the loop with three limbs left, two of
	C which are loaded but not yet complemented.
	C
	C r19	&dst[size-3]
	C r20	src[size-2]
	C r21	src[size-1]
	C r24	result for dst[size-3]

	stq	r24, 0(r19)		C dst[size-3]
	ornot	r31, r20, r20

	stq	r20, 8(r19)		C dst[size-2]
	ornot	r31, r21, r21

	stq	r21, 16(r19)		C dst[size-1]
	ret	r31, (r26), 1

EPILOGUE()
ASM_END()
Added all the assembly code back for all supported architectures. 2008-04-27 15:55:56 -04:00			`dnl Alpha EV5 mpn_com_n -- mpn one's complement.`

			`dnl Copyright 2003 Free Software Foundation, Inc.`
			`dnl`
			`dnl This file is part of the GNU MP Library.`
			`dnl`
			`dnl The GNU MP Library is free software; you can redistribute it and/or`
			`dnl modify it under the terms of the GNU Lesser General Public License as`
			`dnl published by the Free Software Foundation; either version 2.1 of the`
			`dnl License, or (at your option) any later version.`
			`dnl`
			`dnl The GNU MP Library is distributed in the hope that it will be useful,`
			`dnl but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`dnl Lesser General Public License for more details.`
			`dnl`
			`dnl You should have received a copy of the GNU Lesser General Public`
			`dnl License along with the GNU MP Library; see the file COPYING.LIB. If`
			`dnl not, write to the Free Software Foundation, Inc., 51 Franklin Street,`
			`dnl Fifth Floor, Boston, MA 02110-1301, USA.`

			include(`../config.m4')


			`C cycles/limb`
			`C EV4: 4.75`
			`C EV5: 2.0`
			`C EV6: 1.5`


			`C mp_limb_t mpn_com_n (mp_ptr dst, mp_srcptr src, mp_size_t size);`
			`C`
			`C For ev5 the main loop is 7 cycles plus 1 taken branch bubble, for a total`
			`C 2.0 c/l. In general, a pattern like this unrolled to N limbs per loop`
			`C will be 1.5+2/N c/l.`
			`C`
			`C 2 cycles of loop control are unavoidable, for pointer updates and the`
			`C taken branch bubble, but also since ldq cannot issue two cycles after stq`
			`C (and with a run of stqs that means neither of two cycles at the end of the`
			`C loop.`
			`C`
			`C The fbeq is forced into the second cycle of the loop using unops, since`
			`C the first time through it must wait for the cvtqt result. Once that`
			`C result is ready (a 1 cycle stall) then both the branch and following loads`
			`C can issue together.`
			`C`
			`C The main loop handles an odd count of limbs, being two limbs loaded before`
			`C each size test, plus one pipelined around from the previous iteration (or`
			`C setup in the entry sequence).`
			`C`
			`C An even number of limbs is handled by an explicit dst[0]=~src[0] in the`
			`C entry sequence, and an increment of the pointers. For an odd size there's`
			`C no increment and the first store in the loop (r24) is a repeat of dst[0].`
			`C`
			`C Note that the load for r24 after the possible pointer increment is done`
			`C before the explicit store to dst[0], in case src==dst.`


			`ASM_START()`

			`FLOAT64(L(dat), 2.0)`

			`ALIGN(16)`

			`PROLOGUE(mpn_com_n,gp)`

			`C r16 dst`
			`C r17 src`
			`C r18 size`

			`lda r30, -16(r30) C temporary stack space`
			`lda r7, -3(r18) C size - 3`

			`ldq r20, 0(r17) C src[0]`
			`srl r7, 1, r6 C (size-3)/2`

			`stq r6, 8(r30) C (size-3)/2`
			`and r7, 1, r5 C 1 if size even`

			`LEA( r8, L(dat))`
			`s8addq r5, r17, r17 C skip src[0] if even`

			`ornot r31, r20, r20 C ~src[0]`
			`unop`

			`ldt f0, 8(r30) C (size-3)/2`
			`ldq r24, 0(r17) C src[0 or 1]`

			`stq r20, 0(r16) C dst[0]`
			`s8addq r5, r16, r19 C skip dst[0] if even`

			`ldt f1, 0(r8) C data 2.0`
			`lda r30, 16(r30) C restore stack`
			`unop`
			`cvtqt f0, f0 C (size-3)/2 as float`

			`ornot r31, r24, r24`
			`blt r7, L(done_1) C if size<=2`
			`unop`
			`unop`


			`C 16-byte alignment here`
			`L(top):`
			`C r17 src, incrementing`
			`C r19 dst, incrementing`
			`C r24 dst[i] result, ready to store`
			`C f0 (size-3)/2, decrementing`
			`C f1 2.0`

			`ldq r20, 8(r17) C src[i+1]`
			`ldq r21, 16(r17) C src[i+2]`
			`unop`
			`unop`

			`fbeq f0, L(done_2)`
			`unop`
			`ldq r22, 24(r17) C src[i+3]`
			`ldq r23, 32(r17) C src[i+4]`

			`stq r24, 0(r19) C dst[i]`
			`ornot r31, r20, r20`
			`subt f0, f1, f0 C count -= 2`
			`unop`

			`stq r20, 8(r19) C dst[i+1]`
			`ornot r31, r21, r21`
			`unop`
			`unop`

			`stq r21, 16(r19) C dst[i+2]`
			`ornot r31, r22, r22`

			`stq r22, 24(r19) C dst[i+3]`
			`ornot r31, r23, r24`

			`lda r17, 32(r17) C src += 4`
			`lda r19, 32(r19) C dst += 4`
			`unop`
			`fbge f0, L(top)`


			`L(done_1):`
			`C r19 &dst[size-1]`
			`C r24 result for dst[size-1]`

			`stq r24, 0(r19) C dst[size-1]`
			`ret r31, (r26), 1`


			`L(done_2):`
			`C r19 &dst[size-3]`
			`C r20 src[size-2]`
			`C r21 src[size-1]`
			`C r24 result for dst[size-3]`

			`stq r24, 0(r19) C dst[size-3]`
			`ornot r31, r20, r20`

			`stq r20, 8(r19) C dst[size-2]`
			`ornot r31, r21, r21`

			`stq r21, 16(r19) C dst[size-1]`
			`ret r31, (r26), 1`

			`EPILOGUE()`
			`ASM_END()`