remove all traces of aors_n.* , we now use separate add and sub versions
This commit is contained in:
parent
c2c5579778
commit
577aeee345
2
configure
vendored
2
configure
vendored
@ -26039,7 +26039,6 @@ esac
|
||||
# functions that can be provided by multi-function files
|
||||
tmp_mulfunc=
|
||||
case $tmp_fn in
|
||||
add_n|sub_n) tmp_mulfunc="aors_n" ;;
|
||||
addmul_1|submul_1) tmp_mulfunc="aorsmul_1" ;;
|
||||
and_n|andn_n|nand_n | ior_n|iorn_n|nior_n | xor_n|xnor_n)
|
||||
tmp_mulfunc="logops_n" ;;
|
||||
@ -26182,7 +26181,6 @@ for tmp_fn in $gmp_mpn_functions; do
|
||||
# functions that can be provided by multi-function files
|
||||
tmp_mulfunc=
|
||||
case $tmp_fn in
|
||||
add_n|sub_n) tmp_mulfunc="aors_n" ;;
|
||||
addmul_1|submul_1) tmp_mulfunc="aorsmul_1" ;;
|
||||
and_n|andn_n|nand_n | ior_n|iorn_n|nior_n | xor_n|xnor_n)
|
||||
tmp_mulfunc="logops_n" ;;
|
||||
|
@ -2363,7 +2363,6 @@ define(GMP_MULFUNC_CHOICES,
|
||||
[# functions that can be provided by multi-function files
|
||||
tmp_mulfunc=
|
||||
case $tmp_fn in
|
||||
add_n|sub_n) tmp_mulfunc="aors_n" ;;
|
||||
addmul_1|submul_1) tmp_mulfunc="aorsmul_1" ;;
|
||||
and_n|andn_n|nand_n | ior_n|iorn_n|nior_n | xor_n|xnor_n)
|
||||
tmp_mulfunc="logops_n" ;;
|
||||
|
@ -54,6 +54,8 @@ define(`CYSH',`GMP_NUMB_BITS')
|
||||
dnl This declaration is munged by configure
|
||||
NAILS_SUPPORT(1-63)
|
||||
|
||||
dnl Force the add variant. The name is quoted so that an OPERATION_add_n
dnl that is already defined (e.g. with -D) is not expanded at this point.
define(`OPERATION_add_n',1)
|
||||
|
||||
ifdef(`OPERATION_add_n', `
|
||||
define(`OP', addq)
|
||||
define(`CYSH',`GMP_NUMB_BITS')
|
226
mpn/alpha/ev6/nails/sub_n.asm
Normal file
226
mpn/alpha/ev6/nails/sub_n.asm
Normal file
@ -0,0 +1,226 @@
|
||||
dnl Alpha ev6 nails mpn_sub_n (add/sub template; OPERATION_sub_n is forced below).
|
||||
|
||||
dnl Copyright 2002, 2006 Free Software Foundation, Inc.
|
||||
dnl
|
||||
dnl This file is part of the GNU MP Library.
|
||||
dnl
|
||||
dnl The GNU MP Library is free software; you can redistribute it and/or
|
||||
dnl modify it under the terms of the GNU Lesser General Public License as
|
||||
dnl published by the Free Software Foundation; either version 2.1 of the
|
||||
dnl License, or (at your option) any later version.
|
||||
dnl
|
||||
dnl The GNU MP Library is distributed in the hope that it will be useful,
|
||||
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
dnl Lesser General Public License for more details.
|
||||
dnl
|
||||
dnl You should have received a copy of the GNU Lesser General Public
|
||||
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
|
||||
dnl not, write to the Free Software Foundation, Inc., 51 Franklin Street,
|
||||
dnl Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
|
||||
dnl Runs at 2.5 cycles/limb. It would be possible to reach 2.0 cycles/limb
|
||||
dnl with 8-way unrolling.
|
||||
|
||||
include(`../config.m4')
|
||||
|
||||
dnl INPUT PARAMETERS
|
||||
define(`rp',`r16')
|
||||
define(`up',`r17')
|
||||
define(`vp',`r18')
|
||||
define(`n',`r19')
|
||||
|
||||
define(`rl0',`r0')
|
||||
define(`rl1',`r1')
|
||||
define(`rl2',`r2')
|
||||
define(`rl3',`r3')
|
||||
|
||||
define(`ul0',`r4')
|
||||
define(`ul1',`r5')
|
||||
define(`ul2',`r6')
|
||||
define(`ul3',`r7')
|
||||
|
||||
define(`vl0',`r22')
|
||||
define(`vl1',`r23')
|
||||
define(`vl2',`r24')
|
||||
define(`vl3',`r25')
|
||||
|
||||
define(`numb_mask',`r21')
|
||||
|
||||
define(`NAIL_BITS',`GMP_NAIL_BITS')
|
||||
define(`CYSH',`GMP_NUMB_BITS')
|
||||
|
||||
dnl This declaration is munged by configure
|
||||
NAILS_SUPPORT(1-63)
|
||||
|
||||
dnl Force the subtract variant. The name is quoted so that an OPERATION_sub_n
dnl that is already defined (e.g. with -D) is not expanded at this point.
define(`OPERATION_sub_n',1)
|
||||
|
||||
ifdef(`OPERATION_add_n', `
|
||||
define(`OP', addq)
|
||||
define(`CYSH',`GMP_NUMB_BITS')
|
||||
define(`func', mpn_add_n)')
|
||||
ifdef(`OPERATION_sub_n', `
|
||||
define(`OP', subq)
|
||||
define(`CYSH',63)
|
||||
define(`func', mpn_sub_n)')
|
||||
|
||||
dnl Only mpn_sub_n is built from this file (OPERATION_sub_n is forced above),
dnl so it is the only entry point advertised here.
MULFUNC_PROLOGUE(mpn_sub_n)
|
||||
|
||||
ASM_START()
|
||||
PROLOGUE(func)
|
||||
lda numb_mask, -1(r31)
|
||||
srl numb_mask, NAIL_BITS, numb_mask
|
||||
bis r31, r31, r20
|
||||
|
||||
and n, 3, r25
|
||||
lda n, -4(n)
|
||||
beq r25, L(ge4)
|
||||
|
||||
L(lp0): ldq ul0, 0(up)
|
||||
lda up, 8(up)
|
||||
ldq vl0, 0(vp)
|
||||
lda vp, 8(vp)
|
||||
lda rp, 8(rp)
|
||||
lda r25, -1(r25)
|
||||
OP ul0, vl0, rl0
|
||||
OP rl0, r20, rl0
|
||||
and rl0, numb_mask, r28
|
||||
stq r28, -8(rp)
|
||||
srl rl0, CYSH, r20
|
||||
bne r25, L(lp0)
|
||||
|
||||
blt n, L(ret)
|
||||
|
||||
L(ge4): ldq ul0, 0(up)
|
||||
ldq vl0, 0(vp)
|
||||
ldq ul1, 8(up)
|
||||
ldq vl1, 8(vp)
|
||||
ldq ul2, 16(up)
|
||||
ldq vl2, 16(vp)
|
||||
ldq ul3, 24(up)
|
||||
ldq vl3, 24(vp)
|
||||
lda up, 32(up)
|
||||
lda vp, 32(vp)
|
||||
lda n, -4(n)
|
||||
bge n, L(ge8)
|
||||
|
||||
OP ul0, vl0, rl0 C main-add 0
|
||||
OP rl0, r20, rl0 C cy-add 0
|
||||
OP ul1, vl1, rl1 C main-add 1
|
||||
srl rl0, CYSH, r20 C gen cy 0
|
||||
OP rl1, r20, rl1 C cy-add 1
|
||||
and rl0,numb_mask, r27
|
||||
br r31, L(cj0)
|
||||
|
||||
L(ge8): OP ul0, vl0, rl0 C main-add 0
|
||||
ldq ul0, 0(up)
|
||||
ldq vl0, 0(vp)
|
||||
OP rl0, r20, rl0 C cy-add 0
|
||||
OP ul1, vl1, rl1 C main-add 1
|
||||
srl rl0, CYSH, r20 C gen cy 0
|
||||
ldq ul1, 8(up)
|
||||
ldq vl1, 8(vp)
|
||||
OP rl1, r20, rl1 C cy-add 1
|
||||
and rl0,numb_mask, r27
|
||||
OP ul2, vl2, rl2 C main-add 2
|
||||
srl rl1, CYSH, r20 C gen cy 1
|
||||
ldq ul2, 16(up)
|
||||
ldq vl2, 16(vp)
|
||||
OP rl2, r20, rl2 C cy-add 2
|
||||
and rl1,numb_mask, r28
|
||||
stq r27, 0(rp)
|
||||
OP ul3, vl3, rl3 C main-add 3
|
||||
srl rl2, CYSH, r20 C gen cy 2
|
||||
ldq ul3, 24(up)
|
||||
ldq vl3, 24(vp)
|
||||
OP rl3, r20, rl3 C cy-add 3
|
||||
and rl2,numb_mask, r27
|
||||
stq r28, 8(rp)
|
||||
lda rp, 32(rp)
|
||||
lda up, 32(up)
|
||||
lda vp, 32(vp)
|
||||
lda n, -4(n)
|
||||
blt n, L(end)
|
||||
|
||||
ALIGN(32)
|
||||
L(top): OP ul0, vl0, rl0 C main-add 0
|
||||
srl rl3, CYSH, r20 C gen cy 3
|
||||
ldq ul0, 0(up)
|
||||
ldq vl0, 0(vp)
|
||||
|
||||
OP rl0, r20, rl0 C cy-add 0
|
||||
and rl3,numb_mask, r28
|
||||
stq r27, -16(rp)
|
||||
bis r31, r31, r31
|
||||
|
||||
OP ul1, vl1, rl1 C main-add 1
|
||||
srl rl0, CYSH, r20 C gen cy 0
|
||||
ldq ul1, 8(up)
|
||||
ldq vl1, 8(vp)
|
||||
|
||||
OP rl1, r20, rl1 C cy-add 1
|
||||
and rl0,numb_mask, r27
|
||||
stq r28, -8(rp)
|
||||
bis r31, r31, r31
|
||||
|
||||
OP ul2, vl2, rl2 C main-add 2
|
||||
srl rl1, CYSH, r20 C gen cy 1
|
||||
ldq ul2, 16(up)
|
||||
ldq vl2, 16(vp)
|
||||
|
||||
OP rl2, r20, rl2 C cy-add 2
|
||||
and rl1,numb_mask, r28
|
||||
stq r27, 0(rp)
|
||||
bis r31, r31, r31
|
||||
|
||||
OP ul3, vl3, rl3 C main-add 3
|
||||
srl rl2, CYSH, r20 C gen cy 2
|
||||
ldq ul3, 24(up)
|
||||
ldq vl3, 24(vp)
|
||||
|
||||
OP rl3, r20, rl3 C cy-add 3
|
||||
and rl2,numb_mask, r27
|
||||
stq r28, 8(rp)
|
||||
bis r31, r31, r31
|
||||
|
||||
bis r31, r31, r31
|
||||
lda n, -4(n)
|
||||
lda up, 32(up)
|
||||
lda vp, 32(vp)
|
||||
|
||||
bis r31, r31, r31
|
||||
bis r31, r31, r31
|
||||
lda rp, 32(rp)
|
||||
bge n, L(top)
|
||||
|
||||
L(end): OP ul0, vl0, rl0 C main-add 0
|
||||
srl rl3, CYSH, r20 C gen cy 3
|
||||
OP rl0, r20, rl0 C cy-add 0
|
||||
and rl3,numb_mask, r28
|
||||
stq r27, -16(rp)
|
||||
OP ul1, vl1, rl1 C main-add 1
|
||||
srl rl0, CYSH, r20 C gen cy 0
|
||||
OP rl1, r20, rl1 C cy-add 1
|
||||
and rl0,numb_mask, r27
|
||||
stq r28, -8(rp)
|
||||
L(cj0): OP ul2, vl2, rl2 C main-add 2
|
||||
srl rl1, CYSH, r20 C gen cy 1
|
||||
OP rl2, r20, rl2 C cy-add 2
|
||||
and rl1,numb_mask, r28
|
||||
stq r27, 0(rp)
|
||||
OP ul3, vl3, rl3 C main-add 3
|
||||
srl rl2, CYSH, r20 C gen cy 2
|
||||
OP rl3, r20, rl3 C cy-add 3
|
||||
and rl2,numb_mask, r27
|
||||
stq r28, 8(rp)
|
||||
|
||||
srl rl3, CYSH, r20 C gen cy 3
|
||||
and rl3,numb_mask, r28
|
||||
stq r27, 16(rp)
|
||||
stq r28, 24(rp)
|
||||
|
||||
L(ret): and r20, 1, r0
|
||||
ret r31, (r26), 1
|
||||
EPILOGUE()
|
||||
ASM_END()
|
@ -1052,10 +1052,6 @@ dnl with ifdef() rather than be expanded.
|
||||
m4_not_for_expansion(`PIC')
|
||||
m4_not_for_expansion(`DLL_EXPORT')
|
||||
|
||||
dnl aors_n
|
||||
m4_not_for_expansion(`OPERATION_add_n')
|
||||
m4_not_for_expansion(`OPERATION_sub_n')
|
||||
|
||||
dnl aorsmul_1
|
||||
m4_not_for_expansion(`OPERATION_addmul_1')
|
||||
m4_not_for_expansion(`OPERATION_submul_1')
|
||||
|
@ -33,6 +33,8 @@ define(`up',`r33')
|
||||
define(`vp',`r34')
|
||||
define(`n',`r35')
|
||||
|
||||
dnl Force the add variant. The name is quoted so that an OPERATION_add_n
dnl that is already defined (e.g. with -D) is not expanded at this point.
define(`OPERATION_add_n',1)
|
||||
|
||||
ifdef(`OPERATION_add_n',`
|
||||
define(ADDSUB, add)
|
||||
define(PRED, ltu)
|
613
mpn/ia64/sub_n.asm
Normal file
613
mpn/ia64/sub_n.asm
Normal file
@ -0,0 +1,613 @@
|
||||
dnl IA-64 mpn_sub_n -- mpn subtraction (add/sub template; OPERATION_sub_n is forced below).
|
||||
|
||||
dnl Copyright 2003, 2004, 2005 Free Software Foundation, Inc.
|
||||
|
||||
dnl This file is part of the GNU MP Library.
|
||||
|
||||
dnl The GNU MP Library is free software; you can redistribute it and/or modify
|
||||
dnl it under the terms of the GNU Lesser General Public License as published
|
||||
dnl by the Free Software Foundation; either version 3 of the License, or (at
|
||||
dnl your option) any later version.
|
||||
|
||||
dnl The GNU MP Library is distributed in the hope that it will be useful, but
|
||||
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||||
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||||
dnl License for more details.
|
||||
|
||||
dnl You should have received a copy of the GNU Lesser General Public License
|
||||
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
|
||||
|
||||
include(`../config.m4')
|
||||
|
||||
C cycles/limb
|
||||
C Itanium: 2.67
|
||||
C Itanium 2: 1.25
|
||||
|
||||
C TODO
|
||||
C * Consider using special code for small n, using something like
|
||||
C "switch (8 * (n >= 8) + (n mod 8))" to enter it and feed-in code.
|
||||
|
||||
C INPUT PARAMETERS
|
||||
define(`rp',`r32')
|
||||
define(`up',`r33')
|
||||
define(`vp',`r34')
|
||||
define(`n',`r35')
|
||||
|
||||
dnl Force the subtract variant. The name is quoted so that an OPERATION_sub_n
dnl that is already defined (e.g. with -D) is not expanded at this point.
define(`OPERATION_sub_n',1)
|
||||
|
||||
ifdef(`OPERATION_add_n',`
|
||||
define(ADDSUB, add)
|
||||
define(PRED, ltu)
|
||||
define(INCR, 1)
|
||||
define(LIM, -1)
|
||||
define(func, mpn_add_n)
|
||||
')
|
||||
ifdef(`OPERATION_sub_n',`
|
||||
define(ADDSUB, sub)
|
||||
define(PRED, gtu)
|
||||
define(INCR, -1)
|
||||
define(LIM, 0)
|
||||
define(func, mpn_sub_n)
|
||||
')
|
||||
|
||||
C Some useful aliases for registers we use
|
||||
define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
|
||||
define(`u4',`r18') define(`u5',`r19') define(`u6',`r20') define(`u7',`r21')
|
||||
define(`v0',`r24') define(`v1',`r25') define(`v2',`r26') define(`v3',`r27')
|
||||
define(`v4',`r28') define(`v5',`r29') define(`v6',`r30') define(`v7',`r31')
|
||||
define(`w0',`r22') define(`w1',`r9') define(`w2',`r8') define(`w3',`r23')
|
||||
define(`w4',`r22') define(`w5',`r9') define(`w6',`r8') define(`w7',`r23')
|
||||
define(`rpx',`r3')
|
||||
|
||||
dnl Only mpn_sub_n is built from this file (OPERATION_sub_n is forced above),
dnl so it is the only entry point advertised here.
MULFUNC_PROLOGUE(mpn_sub_n)
|
||||
|
||||
ASM_START()
|
||||
PROLOGUE(func)
|
||||
.prologue
|
||||
.save ar.lc, r2
|
||||
.body
|
||||
ifdef(`HAVE_ABI_32',`
|
||||
addp4 rp = 0, rp C M I
|
||||
addp4 up = 0, up C M I
|
||||
addp4 vp = 0, vp C M I
|
||||
zxt4 n = n C I
|
||||
;;
|
||||
')
|
||||
{.mmi C 00
|
||||
ld8 r11 = [vp], 8 C M01
|
||||
ld8 r10 = [up], 8 C M01
|
||||
mov.i r2 = ar.lc C I0
|
||||
}
|
||||
{.mmi
|
||||
and r14 = 7, n C M I
|
||||
cmp.lt p15, p14 = 8, n C M I
|
||||
add n = -8, n C M I
|
||||
;;
|
||||
}
|
||||
{.mmi C 01
|
||||
cmp.eq p6, p0 = 1, r14 C M I
|
||||
cmp.eq p7, p0 = 2, r14 C M I
|
||||
cmp.eq p8, p0 = 3, r14 C M I
|
||||
}
|
||||
{.bbb
|
||||
(p6) br.dptk .Lb001 C B
|
||||
(p7) br.dptk .Lb010 C B
|
||||
(p8) br.dptk .Lb011 C B
|
||||
;;
|
||||
}
|
||||
{.mmi C 02
|
||||
cmp.eq p9, p0 = 4, r14 C M I
|
||||
cmp.eq p10, p0 = 5, r14 C M I
|
||||
cmp.eq p11, p0 = 6, r14 C M I
|
||||
}
|
||||
{.bbb
|
||||
(p9) br.dptk .Lb100 C B
|
||||
(p10) br.dptk .Lb101 C B
|
||||
(p11) br.dptk .Lb110 C B
|
||||
;;
|
||||
} C 03
|
||||
{.mmb
|
||||
cmp.eq p12, p0 = 7, r14 C M I
|
||||
add n = -1, n C loop count M I
|
||||
(p12) br.dptk .Lb111 C B
|
||||
}
|
||||
|
||||
|
||||
.Lb000: ld8 v2 = [vp], 8 C M01
|
||||
ld8 u2 = [up], 8 C M01
|
||||
add rpx = 8, rp C M I
|
||||
;;
|
||||
ld8 v3 = [vp], 8 C M01
|
||||
ld8 u3 = [up], 8 C M01
|
||||
ADDSUB w1 = r10, r11 C M I
|
||||
;;
|
||||
ld8 v4 = [vp], 8 C M01
|
||||
ld8 u4 = [up], 8 C M01
|
||||
cmp.PRED p7, p0 = w1, r10 C M I
|
||||
;;
|
||||
ld8 v5 = [vp], 8 C M01
|
||||
ld8 u5 = [up], 8 C M01
|
||||
ADDSUB w2 = u2, v2 C M I
|
||||
;;
|
||||
ld8 v6 = [vp], 8 C M01
|
||||
ld8 u6 = [up], 8 C M01
|
||||
cmp.PRED p8, p0 = w2, u2 C M I
|
||||
;;
|
||||
ld8 v7 = [vp], 8 C M01
|
||||
ld8 u7 = [up], 8 C M01
|
||||
ADDSUB w3 = u3, v3 C M I
|
||||
;;
|
||||
ld8 v0 = [vp], 8 C M01
|
||||
ld8 u0 = [up], 8 C M01
|
||||
cmp.PRED p9, p0 = w3, u3 C M I
|
||||
(p7) cmp.eq.or p8, p0 = LIM, w2 C M I
|
||||
(p7) add w2 = INCR, w2 C M I
|
||||
(p14) br.cond.dptk .Lcj8 C B
|
||||
;;
|
||||
|
||||
.grt8: ld8 v1 = [vp], 8 C M01
|
||||
ld8 u1 = [up], 8 C M01
|
||||
shr.u n = n, 3 C I0
|
||||
;;
|
||||
add r11 = 512, vp
|
||||
ld8 v2 = [vp], 8 C M01
|
||||
add r10 = 512, up
|
||||
ld8 u2 = [up], 8 C M01
|
||||
nop.i 0
|
||||
nop.b 0
|
||||
;;
|
||||
ld8 v3 = [vp], 8 C M01
|
||||
ld8 u3 = [up], 8 C M01
|
||||
mov.i ar.lc = n C I0
|
||||
br .LL000 C B
|
||||
|
||||
.Lb001: add rpx = 16, rp C M I
|
||||
ADDSUB w0 = r10, r11 C M I
|
||||
(p15) br.cond.dpnt .grt1 C B
|
||||
;;
|
||||
cmp.PRED p6, p0 = w0, r10 C M I
|
||||
mov r8 = 0 C M I
|
||||
br .Lcj1 C B
|
||||
|
||||
.grt1: ld8 v1 = [vp], 8 C M01
|
||||
ld8 u1 = [up], 8 C M01
|
||||
shr.u n = n, 3 C I0
|
||||
;;
|
||||
ld8 v2 = [vp], 8 C M01
|
||||
ld8 u2 = [up], 8 C M01
|
||||
cmp.ne p9, p0 = r0, r0 C read near Loop
|
||||
;;
|
||||
ld8 v3 = [vp], 8 C M01
|
||||
ld8 u3 = [up], 8 C M01
|
||||
mov.i ar.lc = n C I0
|
||||
;;
|
||||
ld8 v4 = [vp], 8 C M01
|
||||
ld8 u4 = [up], 8 C M01
|
||||
cmp.PRED p6, p0 = w0, r10 C M I
|
||||
;;
|
||||
ld8 v5 = [vp], 8 C M01
|
||||
ld8 u5 = [up], 8 C M01
|
||||
ADDSUB w1 = u1, v1 C M I
|
||||
;;
|
||||
ld8 v6 = [vp], 8 C M01
|
||||
ld8 u6 = [up], 8 C M01
|
||||
cmp.PRED p7, p0 = w1, u1 C M I
|
||||
;;
|
||||
ld8 v7 = [vp], 8 C M01
|
||||
ld8 u7 = [up], 8 C M01
|
||||
ADDSUB w2 = u2, v2 C M I
|
||||
;;
|
||||
add r11 = 512, vp
|
||||
ld8 v0 = [vp], 8 C M01
|
||||
add r10 = 512, up
|
||||
ld8 u0 = [up], 8 C M01
|
||||
br.cloop.dptk .Loop C B
|
||||
br .Lcj9 C B
|
||||
|
||||
.Lb010: ld8 v0 = [vp], 8 C M01
|
||||
ld8 u0 = [up], 8 C M01
|
||||
add rpx = 24, rp C M I
|
||||
ADDSUB w7 = r10, r11 C M I
|
||||
(p15) br.cond.dpnt .grt2 C B
|
||||
;;
|
||||
cmp.PRED p9, p0 = w7, r10 C M I
|
||||
ADDSUB w0 = u0, v0 C M I
|
||||
br .Lcj2 C B
|
||||
|
||||
.grt2: ld8 v1 = [vp], 8 C M01
|
||||
ld8 u1 = [up], 8 C M01
|
||||
shr.u n = n, 3 C I0
|
||||
;;
|
||||
ld8 v2 = [vp], 8 C M01
|
||||
ld8 u2 = [up], 8 C M01
|
||||
;;
|
||||
ld8 v3 = [vp], 8 C M01
|
||||
ld8 u3 = [up], 8 C M01
|
||||
mov.i ar.lc = n C I0
|
||||
;;
|
||||
ld8 v4 = [vp], 8 C M01
|
||||
ld8 u4 = [up], 8 C M01
|
||||
;;
|
||||
ld8 v5 = [vp], 8 C M01
|
||||
ld8 u5 = [up], 8 C M01
|
||||
cmp.PRED p9, p0 = w7, r10 C M I
|
||||
;;
|
||||
ld8 v6 = [vp], 8 C M01
|
||||
ld8 u6 = [up], 8 C M01
|
||||
ADDSUB w0 = u0, v0 C M I
|
||||
;;
|
||||
add r11 = 512, vp
|
||||
ld8 v7 = [vp], 8 C M01
|
||||
add r10 = 512, up
|
||||
ld8 u7 = [up], 8 C M01
|
||||
br .LL01x C B
|
||||
|
||||
.Lb011: ld8 v7 = [vp], 8 C M01
|
||||
ld8 u7 = [up], 8 C M01
|
||||
ADDSUB w6 = r10, r11 C M I
|
||||
;;
|
||||
ld8 v0 = [vp], 8 C M01
|
||||
ld8 u0 = [up], 8 C M01
|
||||
(p15) br.cond.dpnt .grt3 C B
|
||||
;;
|
||||
cmp.PRED p8, p0 = w6, r10 C M I
|
||||
ADDSUB w7 = u7, v7 C M I
|
||||
;;
|
||||
st8 [rp] = w6, 8 C M23
|
||||
cmp.PRED p9, p0 = w7, u7 C M I
|
||||
br .Lcj3 C B
|
||||
|
||||
.grt3: ld8 v1 = [vp], 8 C M01
|
||||
ld8 u1 = [up], 8 C M01
|
||||
add rpx = 32, rp C M I
|
||||
;;
|
||||
ld8 v2 = [vp], 8 C M01
|
||||
ld8 u2 = [up], 8 C M01
|
||||
shr.u n = n, 3 C I0
|
||||
;;
|
||||
ld8 v3 = [vp], 8 C M01
|
||||
ld8 u3 = [up], 8 C M01
|
||||
cmp.PRED p8, p0 = w6, r10 C M I
|
||||
;;
|
||||
ld8 v4 = [vp], 8 C M01
|
||||
ld8 u4 = [up], 8 C M01
|
||||
mov.i ar.lc = n C I0
|
||||
ADDSUB w7 = u7, v7 C M I
|
||||
nop.i 0
|
||||
nop.b 0
|
||||
;;
|
||||
ld8 v5 = [vp], 8 C M01
|
||||
ld8 u5 = [up], 8 C M01
|
||||
cmp.PRED p9, p0 = w7, u7 C M I
|
||||
;;
|
||||
add r11 = 512, vp
|
||||
ld8 v6 = [vp], 8 C M01
|
||||
add r10 = 512, up
|
||||
ld8 u6 = [up], 8 C M01
|
||||
(p8) cmp.eq.or p9, p0 = LIM, w7 C M I
|
||||
;;
|
||||
ld8 v7 = [vp], 8 C M01
|
||||
ld8 u7 = [up], 8 C M01
|
||||
(p8) add w7 = INCR, w7 C M I
|
||||
st8 [rp] = w6, 8 C M23
|
||||
ADDSUB w0 = u0, v0 C M I
|
||||
br .LL01x C B
|
||||
|
||||
.Lb100: ld8 v6 = [vp], 8 C M01
|
||||
ld8 u6 = [up], 8 C M01
|
||||
add rpx = 8, rp C M I
|
||||
;;
|
||||
ld8 v7 = [vp], 8 C M01
|
||||
ld8 u7 = [up], 8 C M01
|
||||
ADDSUB w5 = r10, r11 C M I
|
||||
;;
|
||||
ld8 v0 = [vp], 8 C M01
|
||||
ld8 u0 = [up], 8 C M01
|
||||
(p15) br.cond.dpnt .grt4 C B
|
||||
;;
|
||||
cmp.PRED p7, p0 = w5, r10 C M I
|
||||
ADDSUB w6 = u6, v6 C M I
|
||||
;;
|
||||
cmp.PRED p8, p0 = w6, u6 C M I
|
||||
ADDSUB w7 = u7, v7 C M I
|
||||
br .Lcj4 C B
|
||||
|
||||
.grt4: ld8 v1 = [vp], 8 C M01
|
||||
ld8 u1 = [up], 8 C M01
|
||||
shr.u n = n, 3 C I0
|
||||
cmp.PRED p7, p0 = w5, r10 C M I
|
||||
;;
|
||||
ld8 v2 = [vp], 8 C M01
|
||||
ld8 u2 = [up], 8 C M01
|
||||
ADDSUB w6 = u6, v6 C M I
|
||||
;;
|
||||
ld8 v3 = [vp], 8 C M01
|
||||
ld8 u3 = [up], 8 C M01
|
||||
cmp.PRED p8, p0 = w6, u6 C M I
|
||||
;;
|
||||
ld8 v4 = [vp], 8 C M01
|
||||
ld8 u4 = [up], 8 C M01
|
||||
mov.i ar.lc = n C I0
|
||||
;;
|
||||
ld8 v5 = [vp], 8 C M01
|
||||
ld8 u5 = [up], 8 C M01
|
||||
ADDSUB w7 = u7, v7 C M I
|
||||
;;
|
||||
add r11 = 512, vp
|
||||
ld8 v6 = [vp], 8 C M01
|
||||
add r10 = 512, up
|
||||
ld8 u6 = [up], 8 C M01
|
||||
cmp.PRED p9, p0 = w7, u7 C M I
|
||||
;;
|
||||
ld8 v7 = [vp], 8 C M01
|
||||
ld8 u7 = [up], 8 C M01
|
||||
(p7) cmp.eq.or p8, p0 = LIM, w6 C M I
|
||||
(p7) add w6 = INCR, w6 C M I
|
||||
br .LL100 C B
|
||||
|
||||
.Lb101: ld8 v5 = [vp], 8 C M01
|
||||
ld8 u5 = [up], 8 C M01
|
||||
add rpx = 16, rp C M I
|
||||
;;
|
||||
ld8 v6 = [vp], 8 C M01
|
||||
ld8 u6 = [up], 8 C M01
|
||||
ADDSUB w4 = r10, r11 C M I
|
||||
;;
|
||||
ld8 v7 = [vp], 8 C M01
|
||||
ld8 u7 = [up], 8 C M01
|
||||
cmp.PRED p6, p0 = w4, r10 C M I
|
||||
;;
|
||||
ld8 v0 = [vp], 8 C M01
|
||||
ld8 u0 = [up], 8 C M01
|
||||
ADDSUB w5 = u5, v5 C M I
|
||||
shr.u n = n, 3 C I0
|
||||
(p15) br.cond.dpnt .grt5 C B
|
||||
;;
|
||||
cmp.PRED p7, p0 = w5, u5 C M I
|
||||
ADDSUB w6 = u6, v6 C M I
|
||||
br .Lcj5 C B
|
||||
|
||||
.grt5: ld8 v1 = [vp], 8 C M01
|
||||
ld8 u1 = [up], 8 C M01
|
||||
;;
|
||||
ld8 v2 = [vp], 8 C M01
|
||||
ld8 u2 = [up], 8 C M01
|
||||
mov.i ar.lc = n C I0
|
||||
;;
|
||||
ld8 v3 = [vp], 8 C M01
|
||||
ld8 u3 = [up], 8 C M01
|
||||
cmp.PRED p7, p0 = w5, u5 C M I
|
||||
;;
|
||||
ld8 v4 = [vp], 8 C M01
|
||||
ld8 u4 = [up], 8 C M01
|
||||
ADDSUB w6 = u6, v6 C M I
|
||||
;;
|
||||
add r11 = 512, vp
|
||||
ld8 v5 = [vp], 8 C M01
|
||||
add r10 = 512, up
|
||||
ld8 u5 = [up], 8 C M01
|
||||
br .LL101 C B
|
||||
|
||||
.Lb110: ld8 v4 = [vp], 8 C M01
|
||||
ld8 u4 = [up], 8 C M01
|
||||
add rpx = 24, rp C M I
|
||||
;;
|
||||
ld8 v5 = [vp], 8 C M01
|
||||
ld8 u5 = [up], 8 C M01
|
||||
ADDSUB w3 = r10, r11 C M I
|
||||
;;
|
||||
ld8 v6 = [vp], 8 C M01
|
||||
ld8 u6 = [up], 8 C M01
|
||||
shr.u n = n, 3 C I0
|
||||
;;
|
||||
ld8 v7 = [vp], 8 C M01
|
||||
ld8 u7 = [up], 8 C M01
|
||||
cmp.PRED p9, p0 = w3, r10 C M I
|
||||
;;
|
||||
ld8 v0 = [vp], 8 C M01
|
||||
ld8 u0 = [up], 8 C M01
|
||||
ADDSUB w4 = u4, v4 C M I
|
||||
(p14) br.cond.dptk .Lcj67 C B
|
||||
;;
|
||||
|
||||
.grt6: ld8 v1 = [vp], 8 C M01
|
||||
ld8 u1 = [up], 8 C M01
|
||||
mov.i ar.lc = n C I0
|
||||
cmp.PRED p9, p0 = w3, r10 C M I
|
||||
nop.i 0
|
||||
nop.b 0
|
||||
;;
|
||||
ld8 v2 = [vp], 8 C M01
|
||||
ld8 u2 = [up], 8 C M01
|
||||
ADDSUB w4 = u4, v4 C M I
|
||||
;;
|
||||
add r11 = 512, vp
|
||||
ld8 v3 = [vp], 8 C M01
|
||||
add r10 = 512, up
|
||||
ld8 u3 = [up], 8 C M01
|
||||
br .LL11x C B
|
||||
|
||||
.Lb111: ld8 v3 = [vp], 8 C M01
|
||||
ld8 u3 = [up], 8 C M01
|
||||
add rpx = 32, rp C M I
|
||||
;;
|
||||
ld8 v4 = [vp], 8 C M01
|
||||
ld8 u4 = [up], 8 C M01
|
||||
ADDSUB w2 = r10, r11 C M I
|
||||
;;
|
||||
ld8 v5 = [vp], 8 C M01
|
||||
ld8 u5 = [up], 8 C M01
|
||||
cmp.PRED p8, p0 = w2, r10 C M I
|
||||
;;
|
||||
ld8 v6 = [vp], 8 C M01
|
||||
ld8 u6 = [up], 8 C M01
|
||||
ADDSUB w3 = u3, v3 C M I
|
||||
;;
|
||||
ld8 v7 = [vp], 8 C M01
|
||||
ld8 u7 = [up], 8 C M01
|
||||
cmp.PRED p9, p0 = w3, u3 C M I
|
||||
;;
|
||||
ld8 v0 = [vp], 8 C M01
|
||||
ld8 u0 = [up], 8 C M01
|
||||
(p15) br.cond.dpnt .grt7 C B
|
||||
;;
|
||||
st8 [rp] = w2, 8 C M23
|
||||
(p8) cmp.eq.or p9, p0 = LIM, w3 C M I
|
||||
(p8) add w3 = INCR, w3 C M I
|
||||
ADDSUB w4 = u4, v4 C M I
|
||||
br .Lcj67 C B
|
||||
|
||||
.grt7: ld8 v1 = [vp], 8 C M01
|
||||
ld8 u1 = [up], 8 C M01
|
||||
shr.u n = n, 3 C I0
|
||||
(p8) cmp.eq.or p9, p0 = LIM, w3 C M I
|
||||
nop.i 0
|
||||
nop.b 0
|
||||
;;
|
||||
add r11 = 512, vp
|
||||
ld8 v2 = [vp], 8 C M01
|
||||
add r10 = 512, up
|
||||
ld8 u2 = [up], 8 C M01
|
||||
(p8) add w3 = INCR, w3 C M I
|
||||
nop.b 0
|
||||
;;
|
||||
ld8 v3 = [vp], 8 C M01
|
||||
ld8 u3 = [up], 8 C M01
|
||||
mov.i ar.lc = n C I0
|
||||
st8 [rp] = w2, 8 C M23
|
||||
ADDSUB w4 = u4, v4 C M I
|
||||
br .LL11x C B
|
||||
|
||||
C *** MAIN LOOP START ***
|
||||
ALIGN(32)
|
||||
.Loop: ld8 v1 = [vp], 8 C M01
|
||||
cmp.PRED p7, p0 = w1, u1 C M I
|
||||
(p9) cmp.eq.or p6, p0 = LIM, w0 C M I
|
||||
ld8 u1 = [up], 8 C M01
|
||||
(p9) add w0 = INCR, w0 C M I
|
||||
ADDSUB w2 = u2, v2 C M I
|
||||
;;
|
||||
ld8 v2 = [vp], 8 C M01
|
||||
cmp.PRED p8, p0 = w2, u2 C M I
|
||||
(p6) cmp.eq.or p7, p0 = LIM, w1 C M I
|
||||
ld8 u2 = [up], 8 C M01
|
||||
(p6) add w1 = INCR, w1 C M I
|
||||
ADDSUB w3 = u3, v3 C M I
|
||||
;;
|
||||
st8 [rp] = w0, 8 C M23
|
||||
ld8 v3 = [vp], 8 C M01
|
||||
cmp.PRED p9, p0 = w3, u3 C M I
|
||||
(p7) cmp.eq.or p8, p0 = LIM, w2 C M I
|
||||
ld8 u3 = [up], 8 C M01
|
||||
(p7) add w2 = INCR, w2 C M I
|
||||
;;
|
||||
.LL000: st8 [rp] = w1, 16 C M23
|
||||
st8 [rpx] = w2, 32 C M23
|
||||
(p8) cmp.eq.or p9, p0 = LIM, w3 C M I
|
||||
lfetch [r10], 64
|
||||
(p8) add w3 = INCR, w3 C M I
|
||||
ADDSUB w4 = u4, v4 C M I
|
||||
;;
|
||||
.LL11x: st8 [rp] = w3, 8 C M23
|
||||
ld8 v4 = [vp], 8 C M01
|
||||
cmp.PRED p6, p0 = w4, u4 C M I
|
||||
ld8 u4 = [up], 8 C M01
|
||||
ADDSUB w5 = u5, v5 C M I
|
||||
;;
|
||||
ld8 v5 = [vp], 8 C M01
|
||||
cmp.PRED p7, p0 = w5, u5 C M I
|
||||
(p9) cmp.eq.or p6, p0 = LIM, w4 C M I
|
||||
ld8 u5 = [up], 8 C M01
|
||||
(p9) add w4 = INCR, w4 C M I
|
||||
ADDSUB w6 = u6, v6 C M I
|
||||
;;
|
||||
.LL101: ld8 v6 = [vp], 8 C M01
|
||||
cmp.PRED p8, p0 = w6, u6 C M I
|
||||
(p6) cmp.eq.or p7, p0 = LIM, w5 C M I
|
||||
ld8 u6 = [up], 8 C M01
|
||||
(p6) add w5 = INCR, w5 C M I
|
||||
ADDSUB w7 = u7, v7 C M I
|
||||
;;
|
||||
st8 [rp] = w4, 8 C M23
|
||||
ld8 v7 = [vp], 8 C M01
|
||||
cmp.PRED p9, p0 = w7, u7 C M I
|
||||
(p7) cmp.eq.or p8, p0 = LIM, w6 C M I
|
||||
ld8 u7 = [up], 8 C M01
|
||||
(p7) add w6 = INCR, w6 C M I
|
||||
;;
|
||||
.LL100: st8 [rp] = w5, 16 C M23
|
||||
st8 [rpx] = w6, 32 C M23
|
||||
(p8) cmp.eq.or p9, p0 = LIM, w7 C M I
|
||||
lfetch [r11], 64
|
||||
(p8) add w7 = INCR, w7 C M I
|
||||
ADDSUB w0 = u0, v0 C M I
|
||||
;;
|
||||
.LL01x: st8 [rp] = w7, 8 C M23
|
||||
ld8 v0 = [vp], 8 C M01
|
||||
cmp.PRED p6, p0 = w0, u0 C M I
|
||||
ld8 u0 = [up], 8 C M01
|
||||
ADDSUB w1 = u1, v1 C M I
|
||||
br.cloop.dptk .Loop C B
|
||||
;;
|
||||
C *** MAIN LOOP END ***
|
||||
|
||||
cmp.PRED p7, p0 = w1, u1 C M I
|
||||
(p9) cmp.eq.or p6, p0 = LIM, w0 C M I
|
||||
(p9) add w0 = INCR, w0 C M I
|
||||
ADDSUB w2 = u2, v2 C M I
|
||||
;;
|
||||
.Lcj9: cmp.PRED p8, p0 = w2, u2 C M I
|
||||
(p6) cmp.eq.or p7, p0 = LIM, w1 C M I
|
||||
st8 [rp] = w0, 8 C M23
|
||||
(p6) add w1 = INCR, w1 C M I
|
||||
ADDSUB w3 = u3, v3 C M I
|
||||
;;
|
||||
cmp.PRED p9, p0 = w3, u3 C M I
|
||||
(p7) cmp.eq.or p8, p0 = LIM, w2 C M I
|
||||
(p7) add w2 = INCR, w2 C M I
|
||||
;;
|
||||
.Lcj8: st8 [rp] = w1, 16 C M23
|
||||
st8 [rpx] = w2, 32 C M23
|
||||
(p8) cmp.eq.or p9, p0 = LIM, w3 C M I
|
||||
(p8) add w3 = INCR, w3 C M I
|
||||
ADDSUB w4 = u4, v4 C M I
|
||||
;;
|
||||
.Lcj67: st8 [rp] = w3, 8 C M23
|
||||
cmp.PRED p6, p0 = w4, u4 C M I
|
||||
ADDSUB w5 = u5, v5 C M I
|
||||
;;
|
||||
cmp.PRED p7, p0 = w5, u5 C M I
|
||||
(p9) cmp.eq.or p6, p0 = LIM, w4 C M I
|
||||
(p9) add w4 = INCR, w4 C M I
|
||||
ADDSUB w6 = u6, v6 C M I
|
||||
;;
|
||||
.Lcj5: cmp.PRED p8, p0 = w6, u6 C M I
|
||||
(p6) cmp.eq.or p7, p0 = LIM, w5 C M I
|
||||
st8 [rp] = w4, 8 C M23
|
||||
(p6) add w5 = INCR, w5 C M I
|
||||
ADDSUB w7 = u7, v7 C M I
|
||||
;;
|
||||
.Lcj4: cmp.PRED p9, p0 = w7, u7 C M I
|
||||
(p7) cmp.eq.or p8, p0 = LIM, w6 C M I
|
||||
(p7) add w6 = INCR, w6 C M I
|
||||
;;
|
||||
st8 [rp] = w5, 16 C M23
|
||||
st8 [rpx] = w6, 32 C M23
|
||||
.Lcj3:
|
||||
(p8) cmp.eq.or p9, p0 = LIM, w7 C M I
|
||||
(p8) add w7 = INCR, w7 C M I
|
||||
ADDSUB w0 = u0, v0 C M I
|
||||
;;
|
||||
.Lcj2: st8 [rp] = w7, 8 C M23
|
||||
cmp.PRED p6, p0 = w0, u0 C M I
|
||||
;;
|
||||
(p9) cmp.eq.or p6, p0 = LIM, w0 C M I
|
||||
(p9) add w0 = INCR, w0 C M I
|
||||
mov r8 = 0 C M I
|
||||
;;
|
||||
.Lcj1: st8 [rp] = w0, 8 C M23
|
||||
mov.i ar.lc = r2 C I0
|
||||
(p6) mov r8 = 1 C M I
|
||||
br.ret.sptk.many b0 C B
|
||||
EPILOGUE()
|
||||
ASM_END()
|
@ -25,6 +25,8 @@ include(`../config.m4')
|
||||
C cycles/limb
|
||||
C 68040: 6
|
||||
|
||||
dnl Force the add variant. The name is quoted so that an OPERATION_add_n
dnl that is already defined (e.g. with -D) is not expanded at this point.
define(`OPERATION_add_n',1)
|
||||
|
||||
ifdef(`OPERATION_add_n',`
|
||||
define(M4_inst, addxl)
|
||||
define(M4_function_n, mpn_add_n)
|
93
mpn/m68k/sub_n.asm
Normal file
93
mpn/m68k/sub_n.asm
Normal file
@ -0,0 +1,93 @@
|
||||
dnl mc68020 mpn_sub_n -- subtract limb vectors (add/sub template; OPERATION_sub_n is forced below).
|
||||
|
||||
dnl Copyright 1992, 1994, 1996, 1999, 2000, 2001, 2002, 2003, 2005 Free
|
||||
dnl Software Foundation, Inc.
|
||||
dnl
|
||||
dnl This file is part of the GNU MP Library.
|
||||
dnl
|
||||
dnl The GNU MP Library is free software; you can redistribute it and/or
|
||||
dnl modify it under the terms of the GNU Lesser General Public License as
|
||||
dnl published by the Free Software Foundation; either version 2.1 of the
|
||||
dnl License, or (at your option) any later version.
|
||||
dnl
|
||||
dnl The GNU MP Library is distributed in the hope that it will be useful,
|
||||
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
dnl Lesser General Public License for more details.
|
||||
dnl
|
||||
dnl You should have received a copy of the GNU Lesser General Public
|
||||
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
|
||||
dnl not, write to the Free Software Foundation, Inc., 51 Franklin Street,
|
||||
dnl Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
include(`../config.m4')
|
||||
|
||||
C cycles/limb
|
||||
C 68040: 6
|
||||
|
||||
dnl Force the subtract variant. The name is quoted so that an OPERATION_sub_n
dnl that is already defined (e.g. with -D) is not expanded at this point.
define(`OPERATION_sub_n',1)
|
||||
|
||||
ifdef(`OPERATION_add_n',`
|
||||
define(M4_inst, addxl)
|
||||
define(M4_function_n, mpn_add_n)
|
||||
',`ifdef(`OPERATION_sub_n',`
|
||||
define(M4_inst, subxl)
|
||||
define(M4_function_n, mpn_sub_n)
|
||||
',
|
||||
`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
|
||||
')')')
|
||||
|
||||
dnl Only mpn_sub_n is built from this file (OPERATION_sub_n is forced above),
dnl so it is the only entry point advertised here.
MULFUNC_PROLOGUE(mpn_sub_n)
|
||||
|
||||
|
||||
C INPUT PARAMETERS
|
||||
C res_ptr (sp + 4)
|
||||
C s1_ptr (sp + 8)
|
||||
C s2_ptr (sp + 12)
|
||||
C size (sp + 16)
|
||||
|
||||
|
||||
PROLOGUE(M4_function_n)
|
||||
|
||||
C Save used registers on the stack.
|
||||
movel d2, M(-,sp)
|
||||
movel a2, M(-,sp)
|
||||
|
||||
C Copy the arguments to registers. Better use movem?
|
||||
movel M(sp,12), a2
|
||||
movel M(sp,16), a0
|
||||
movel M(sp,20), a1
|
||||
movel M(sp,24), d2
|
||||
|
||||
eorw #1, d2
|
||||
lsrl #1, d2
|
||||
bcc L(L1)
|
||||
subql #1, d2 C clears cy as side effect
|
||||
|
||||
L(Loop):
|
||||
movel M(a0,+), d0
|
||||
movel M(a1,+), d1
|
||||
M4_inst d1, d0
|
||||
movel d0, M(a2,+)
|
||||
L(L1): movel M(a0,+), d0
|
||||
movel M(a1,+), d1
|
||||
M4_inst d1, d0
|
||||
movel d0, M(a2,+)
|
||||
|
||||
dbf d2, L(Loop) C loop until 16 lsb of d2 == -1
|
||||
subxl d0, d0 C d0 <= -cy; save cy as 0 or -1 in d0
|
||||
subl #0x10000, d2
|
||||
bcs L(L2)
|
||||
addl d0, d0 C restore cy
|
||||
bra L(Loop)
|
||||
|
||||
L(L2):
|
||||
negl d0
|
||||
|
||||
C Restore used registers from stack frame.
|
||||
movel M(sp,+), a2
|
||||
movel M(sp,+), d2
|
||||
|
||||
rts
|
||||
|
||||
EPILOGUE(M4_function_n)
|
@ -30,6 +30,7 @@ C K6: 3.5
|
||||
C K7: 2.25
|
||||
C P4: 8.75
|
||||
|
||||
dnl Force the add variant. The name is quoted so that an OPERATION_add_n
dnl that is already defined (e.g. with -D) is not expanded at this point.
define(`OPERATION_add_n',1)
|
||||
|
||||
ifdef(`OPERATION_add_n',`
|
||||
define(M4_inst, adcl)
|
@ -35,6 +35,7 @@ dnl Maximum possible with the current code is 64.
|
||||
|
||||
deflit(UNROLL_COUNT, 16)
|
||||
|
||||
dnl Force the add variant. The name is quoted so that an OPERATION_add_n
dnl that is already defined (e.g. with -D) is not expanded at this point.
define(`OPERATION_add_n',1)
|
||||
|
||||
ifdef(`OPERATION_add_n', `
|
||||
define(M4_inst, adcl)
|
@ -35,6 +35,7 @@ dnl Maximum possible with the current code is 64.
|
||||
|
||||
deflit(UNROLL_COUNT, 16)
|
||||
|
||||
dnl Force the subtract variant. The name is quoted so that an OPERATION_sub_n
dnl that is already defined (e.g. with -D) is not expanded at this point.
define(`OPERATION_sub_n',1)
|
||||
|
||||
ifdef(`OPERATION_add_n', `
|
||||
define(M4_inst, adcl)
|
@ -24,6 +24,7 @@ include(`../config.m4')
|
||||
|
||||
C K6: normal 3.25 cycles/limb, in-place 2.75 cycles/limb.
|
||||
|
||||
dnl Force the add variant. The name is quoted so that an OPERATION_add_n
dnl that is already defined (e.g. with -D) is not expanded at this point.
define(`OPERATION_add_n',1)
|
||||
|
||||
ifdef(`OPERATION_add_n', `
|
||||
define(M4_inst, adcl)
|
329
mpn/x86/k6/sub_n.asm
Normal file
329
mpn/x86/k6/sub_n.asm
Normal file
@ -0,0 +1,329 @@
|
||||
dnl AMD K6 mpn_add/sub_n -- mpn addition or subtraction.
|
||||
|
||||
dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
|
||||
dnl
|
||||
dnl This file is part of the GNU MP Library.
|
||||
dnl
|
||||
dnl The GNU MP Library is free software; you can redistribute it and/or
|
||||
dnl modify it under the terms of the GNU Lesser General Public License as
|
||||
dnl published by the Free Software Foundation; either version 2.1 of the
|
||||
dnl License, or (at your option) any later version.
|
||||
dnl
|
||||
dnl The GNU MP Library is distributed in the hope that it will be useful,
|
||||
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
dnl Lesser General Public License for more details.
|
||||
dnl
|
||||
dnl You should have received a copy of the GNU Lesser General Public
|
||||
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
|
||||
dnl not, write to the Free Software Foundation, Inc., 51 Franklin Street,
|
||||
dnl Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
include(`../config.m4')
|
||||
|
||||
|
||||
C K6: normal 3.25 cycles/limb, in-place 2.75 cycles/limb.
|
||||
|
||||
define(OPERATION_sub_n,1)
|
||||
|
||||
ifdef(`OPERATION_add_n', `
|
||||
define(M4_inst, adcl)
|
||||
define(M4_function_n, mpn_add_n)
|
||||
define(M4_function_nc, mpn_add_nc)
|
||||
define(M4_description, add)
|
||||
',`ifdef(`OPERATION_sub_n', `
|
||||
define(M4_inst, sbbl)
|
||||
define(M4_function_n, mpn_sub_n)
|
||||
define(M4_function_nc, mpn_sub_nc)
|
||||
define(M4_description, subtract)
|
||||
',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
|
||||
')')')
|
||||
|
||||
MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
|
||||
|
||||
|
||||
C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
|
||||
C mp_size_t size);
|
||||
C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
|
||||
C mp_size_t size, mp_limb_t carry);
|
||||
C
|
||||
C Calculate src1,size M4_description src2,size, and store the result in
|
||||
C dst,size. The return value is the carry bit from the top of the result
|
||||
C (1 or 0).
|
||||
C
|
||||
C The _nc version accepts 1 or 0 for an initial carry into the low limb of
|
||||
C the calculation. Note values other than 1 or 0 here will lead to garbage
|
||||
C results.
|
||||
C
|
||||
C Instruction decoding limits a normal dst=src1+src2 operation to 3 c/l, and
|
||||
C an in-place dst+=src to 2.5 c/l. The unrolled loops have 1 cycle/loop of
|
||||
C loop control, which with 4 limbs/loop means an extra 0.25 c/l.
|
||||
|
||||
define(PARAM_CARRY, `FRAME+20(%esp)')
|
||||
define(PARAM_SIZE, `FRAME+16(%esp)')
|
||||
define(PARAM_SRC2, `FRAME+12(%esp)')
|
||||
define(PARAM_SRC1, `FRAME+8(%esp)')
|
||||
define(PARAM_DST, `FRAME+4(%esp)')
|
||||
deflit(`FRAME',0)
|
||||
|
||||
dnl minimum 5 because the unrolled code can't handle less
|
||||
deflit(UNROLL_THRESHOLD, 5)
|
||||
|
||||
TEXT
|
||||
ALIGN(32)
|
||||
|
||||
PROLOGUE(M4_function_nc)
|
||||
movl PARAM_CARRY, %eax
|
||||
jmp L(start)
|
||||
EPILOGUE()
|
||||
|
||||
|
||||
PROLOGUE(M4_function_n)
|
||||
xorl %eax, %eax
|
||||
L(start):
|
||||
movl PARAM_SIZE, %ecx
|
||||
pushl %ebx
|
||||
FRAME_pushl()
|
||||
|
||||
movl PARAM_SRC1, %ebx
|
||||
pushl %edi
|
||||
FRAME_pushl()
|
||||
|
||||
movl PARAM_SRC2, %edx
|
||||
cmpl $UNROLL_THRESHOLD, %ecx
|
||||
|
||||
movl PARAM_DST, %edi
|
||||
jae L(unroll)
|
||||
|
||||
|
||||
shrl %eax C initial carry flag
|
||||
|
||||
C offset 0x21 here, close enough to aligned
|
||||
L(simple):
|
||||
C eax scratch
|
||||
C ebx src1
|
||||
C ecx counter
|
||||
C edx src2
|
||||
C esi
|
||||
C edi dst
|
||||
C ebp
|
||||
C
|
||||
C The store to (%edi) could be done with a stosl; it'd be smaller
|
||||
C code, but there's no speed gain and a cld would have to be added
|
||||
C (per mpn/x86/README).
|
||||
|
||||
movl (%ebx), %eax
|
||||
leal 4(%ebx), %ebx
|
||||
|
||||
M4_inst (%edx), %eax
|
||||
|
||||
movl %eax, (%edi)
|
||||
leal 4(%edi), %edi
|
||||
|
||||
leal 4(%edx), %edx
|
||||
loop L(simple)
|
||||
|
||||
|
||||
movl $0, %eax
|
||||
popl %edi
|
||||
|
||||
setc %al
|
||||
|
||||
popl %ebx
|
||||
ret
|
||||
|
||||
|
||||
C -----------------------------------------------------------------------------
|
||||
L(unroll):
|
||||
C eax carry
|
||||
C ebx src1
|
||||
C ecx counter
|
||||
C edx src2
|
||||
C esi
|
||||
C edi dst
|
||||
C ebp
|
||||
|
||||
cmpl %edi, %ebx
|
||||
pushl %esi
|
||||
|
||||
je L(inplace)
|
||||
|
||||
ifdef(`OPERATION_add_n',`
|
||||
cmpl %edi, %edx
|
||||
|
||||
je L(inplace_reverse)
|
||||
')
|
||||
|
||||
movl %ecx, %esi
|
||||
|
||||
andl $-4, %ecx
|
||||
andl $3, %esi
|
||||
|
||||
leal (%ebx,%ecx,4), %ebx
|
||||
leal (%edx,%ecx,4), %edx
|
||||
leal (%edi,%ecx,4), %edi
|
||||
|
||||
negl %ecx
|
||||
shrl %eax
|
||||
|
||||
ALIGN(32)
|
||||
L(normal_top):
|
||||
C eax scratch
|
||||
C ebx src1
|
||||
C ecx counter, limbs, negative
|
||||
C edx src2
|
||||
C esi
|
||||
C edi dst
|
||||
C ebp
|
||||
|
||||
movl (%ebx,%ecx,4), %eax
|
||||
leal 5(%ecx), %ecx
|
||||
M4_inst -20(%edx,%ecx,4), %eax
|
||||
movl %eax, -20(%edi,%ecx,4)
|
||||
|
||||
movl 4-20(%ebx,%ecx,4), %eax
|
||||
M4_inst 4-20(%edx,%ecx,4), %eax
|
||||
movl %eax, 4-20(%edi,%ecx,4)
|
||||
|
||||
movl 8-20(%ebx,%ecx,4), %eax
|
||||
M4_inst 8-20(%edx,%ecx,4), %eax
|
||||
movl %eax, 8-20(%edi,%ecx,4)
|
||||
|
||||
movl 12-20(%ebx,%ecx,4), %eax
|
||||
M4_inst 12-20(%edx,%ecx,4), %eax
|
||||
movl %eax, 12-20(%edi,%ecx,4)
|
||||
|
||||
loop L(normal_top)
|
||||
|
||||
|
||||
decl %esi
|
||||
jz L(normal_finish_one)
|
||||
js L(normal_done)
|
||||
|
||||
C two or three more limbs
|
||||
|
||||
movl (%ebx), %eax
|
||||
M4_inst (%edx), %eax
|
||||
movl %eax, (%edi)
|
||||
|
||||
movl 4(%ebx), %eax
|
||||
M4_inst 4(%edx), %eax
|
||||
decl %esi
|
||||
movl %eax, 4(%edi)
|
||||
|
||||
jz L(normal_done)
|
||||
movl $2, %ecx
|
||||
|
||||
L(normal_finish_one):
|
||||
movl (%ebx,%ecx,4), %eax
|
||||
M4_inst (%edx,%ecx,4), %eax
|
||||
movl %eax, (%edi,%ecx,4)
|
||||
|
||||
L(normal_done):
|
||||
popl %esi
|
||||
popl %edi
|
||||
|
||||
movl $0, %eax
|
||||
popl %ebx
|
||||
|
||||
setc %al
|
||||
|
||||
ret
|
||||
|
||||
|
||||
C -----------------------------------------------------------------------------
|
||||
|
||||
ifdef(`OPERATION_add_n',`
|
||||
L(inplace_reverse):
|
||||
C dst==src2
|
||||
|
||||
movl %ebx, %edx
|
||||
')
|
||||
|
||||
L(inplace):
|
||||
C eax initial carry
|
||||
C ebx
|
||||
C ecx size
|
||||
C edx src
|
||||
C esi
|
||||
C edi dst
|
||||
C ebp
|
||||
|
||||
leal -1(%ecx), %esi
|
||||
decl %ecx
|
||||
|
||||
andl $-4, %ecx
|
||||
andl $3, %esi
|
||||
|
||||
movl (%edx), %ebx C src low limb
|
||||
leal (%edx,%ecx,4), %edx
|
||||
|
||||
leal (%edi,%ecx,4), %edi
|
||||
negl %ecx
|
||||
|
||||
shrl %eax
|
||||
|
||||
|
||||
ALIGN(32)
|
||||
L(inplace_top):
|
||||
C eax scratch
|
||||
C ebx next src limb
|
||||
C ecx counter, limbs, negative
|
||||
C edx src
|
||||
C esi
|
||||
C edi dst
|
||||
C ebp
|
||||
|
||||
M4_inst %ebx, (%edi,%ecx,4)
|
||||
|
||||
movl 4(%edx,%ecx,4), %eax
|
||||
leal 5(%ecx), %ecx
|
||||
|
||||
M4_inst %eax, 4-20(%edi,%ecx,4)
|
||||
|
||||
movl 8-20(%edx,%ecx,4), %eax
|
||||
movl 12-20(%edx,%ecx,4), %ebx
|
||||
|
||||
M4_inst %eax, 8-20(%edi,%ecx,4)
|
||||
M4_inst %ebx, 12-20(%edi,%ecx,4)
|
||||
|
||||
movl 16-20(%edx,%ecx,4), %ebx
|
||||
loop L(inplace_top)
|
||||
|
||||
|
||||
C now %esi is 0 to 3 representing respectively 1 to 4 limbs more
|
||||
|
||||
M4_inst %ebx, (%edi)
|
||||
|
||||
decl %esi
|
||||
jz L(inplace_finish_one)
|
||||
js L(inplace_done)
|
||||
|
||||
C two or three more limbs
|
||||
|
||||
movl 4(%edx), %eax
|
||||
movl 8(%edx), %ebx
|
||||
M4_inst %eax, 4(%edi)
|
||||
M4_inst %ebx, 8(%edi)
|
||||
|
||||
decl %esi
|
||||
movl $2, %ecx
|
||||
|
||||
jz L(normal_done)
|
||||
|
||||
L(inplace_finish_one):
|
||||
movl 4(%edx,%ecx,4), %eax
|
||||
M4_inst %eax, 4(%edi,%ecx,4)
|
||||
|
||||
L(inplace_done):
|
||||
popl %esi
|
||||
popl %edi
|
||||
|
||||
movl $0, %eax
|
||||
popl %ebx
|
||||
|
||||
setc %al
|
||||
|
||||
ret
|
||||
|
||||
EPILOGUE()
|
250
mpn/x86/k7/add_n.asm
Normal file
250
mpn/x86/k7/add_n.asm
Normal file
@ -0,0 +1,250 @@
|
||||
dnl AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract.
|
||||
|
||||
dnl Copyright 1999, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
|
||||
dnl
|
||||
dnl This file is part of the GNU MP Library.
|
||||
dnl
|
||||
dnl The GNU MP Library is free software; you can redistribute it and/or
|
||||
dnl modify it under the terms of the GNU Lesser General Public License as
|
||||
dnl published by the Free Software Foundation; either version 2.1 of the
|
||||
dnl License, or (at your option) any later version.
|
||||
dnl
|
||||
dnl The GNU MP Library is distributed in the hope that it will be useful,
|
||||
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
dnl Lesser General Public License for more details.
|
||||
dnl
|
||||
dnl You should have received a copy of the GNU Lesser General Public
|
||||
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
|
||||
dnl not, write to the Free Software Foundation, Inc., 51 Franklin Street,
|
||||
dnl Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
include(`../config.m4')
|
||||
|
||||
|
||||
C K7: 1.64 cycles/limb (at 16 limbs/loop).
|
||||
|
||||
|
||||
|
||||
dnl K7: UNROLL_COUNT cycles/limb
|
||||
dnl 8 1.9
|
||||
dnl 16 1.64
|
||||
dnl 32 1.7
|
||||
dnl 64 2.0
|
||||
dnl Maximum possible with the current code is 64.
|
||||
|
||||
deflit(UNROLL_COUNT, 16)
|
||||
|
||||
define(OPERATION_add_n,1)
|
||||
|
||||
ifdef(`OPERATION_add_n', `
|
||||
define(M4_inst, adcl)
|
||||
define(M4_function_n, mpn_add_n)
|
||||
define(M4_function_nc, mpn_add_nc)
|
||||
define(M4_description, add)
|
||||
',`ifdef(`OPERATION_sub_n', `
|
||||
define(M4_inst, sbbl)
|
||||
define(M4_function_n, mpn_sub_n)
|
||||
define(M4_function_nc, mpn_sub_nc)
|
||||
define(M4_description, subtract)
|
||||
',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
|
||||
')')')
|
||||
|
||||
MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
|
||||
|
||||
|
||||
C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
|
||||
C mp_size_t size);
|
||||
C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
|
||||
C mp_size_t size, mp_limb_t carry);
|
||||
C
|
||||
C Calculate src1,size M4_description src2,size, and store the result in
|
||||
C dst,size. The return value is the carry bit from the top of the result (1
|
||||
C or 0).
|
||||
C
|
||||
C The _nc version accepts 1 or 0 for an initial carry into the low limb of
|
||||
C the calculation. Note values other than 1 or 0 here will lead to garbage
|
||||
C results.
|
||||
C
|
||||
C This code runs at 1.64 cycles/limb, which might be the best possible with
|
||||
C plain integer operations. Each limb is 2 loads and 1 store, any 2 of
|
||||
C which can be done each cycle, leading to 1.5 c/l.
|
||||
|
||||
dnl Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
|
||||
ifdef(`PIC',`
|
||||
deflit(UNROLL_THRESHOLD, 8)
|
||||
',`
|
||||
deflit(UNROLL_THRESHOLD, 8)
|
||||
')
|
||||
|
||||
defframe(PARAM_CARRY,20)
|
||||
defframe(PARAM_SIZE, 16)
|
||||
defframe(PARAM_SRC2, 12)
|
||||
defframe(PARAM_SRC1, 8)
|
||||
defframe(PARAM_DST, 4)
|
||||
|
||||
defframe(SAVE_EBP, -4)
|
||||
defframe(SAVE_ESI, -8)
|
||||
defframe(SAVE_EBX, -12)
|
||||
defframe(SAVE_EDI, -16)
|
||||
deflit(STACK_SPACE, 16)
|
||||
|
||||
TEXT
|
||||
ALIGN(32)
|
||||
deflit(`FRAME',0)
|
||||
|
||||
PROLOGUE(M4_function_nc)
|
||||
movl PARAM_CARRY, %eax
|
||||
jmp L(start)
|
||||
EPILOGUE()
|
||||
|
||||
PROLOGUE(M4_function_n)
|
||||
|
||||
xorl %eax, %eax C carry
|
||||
L(start):
|
||||
movl PARAM_SIZE, %ecx
|
||||
subl $STACK_SPACE, %esp
|
||||
deflit(`FRAME',STACK_SPACE)
|
||||
|
||||
movl %edi, SAVE_EDI
|
||||
movl %ebx, SAVE_EBX
|
||||
cmpl $UNROLL_THRESHOLD, %ecx
|
||||
|
||||
movl PARAM_SRC2, %edx
|
||||
movl PARAM_SRC1, %ebx
|
||||
jae L(unroll)
|
||||
|
||||
movl PARAM_DST, %edi
|
||||
leal (%ebx,%ecx,4), %ebx
|
||||
leal (%edx,%ecx,4), %edx
|
||||
|
||||
leal (%edi,%ecx,4), %edi
|
||||
negl %ecx
|
||||
shrl %eax
|
||||
|
||||
C This loop is in a single 16 byte code block already, so no
|
||||
C alignment necessary.
|
||||
L(simple):
|
||||
C eax scratch
|
||||
C ebx src1
|
||||
C ecx counter
|
||||
C edx src2
|
||||
C esi
|
||||
C edi dst
|
||||
C ebp
|
||||
|
||||
movl (%ebx,%ecx,4), %eax
|
||||
M4_inst (%edx,%ecx,4), %eax
|
||||
movl %eax, (%edi,%ecx,4)
|
||||
incl %ecx
|
||||
jnz L(simple)
|
||||
|
||||
movl $0, %eax
|
||||
movl SAVE_EDI, %edi
|
||||
|
||||
movl SAVE_EBX, %ebx
|
||||
setc %al
|
||||
addl $STACK_SPACE, %esp
|
||||
|
||||
ret
|
||||
|
||||
|
||||
C -----------------------------------------------------------------------------
|
||||
C This is at 0x55, close enough to aligned.
|
||||
L(unroll):
|
||||
deflit(`FRAME',STACK_SPACE)
|
||||
movl %ebp, SAVE_EBP
|
||||
andl $-2, %ecx C size low bit masked out
|
||||
andl $1, PARAM_SIZE C size low bit kept
|
||||
|
||||
movl %ecx, %edi
|
||||
decl %ecx
|
||||
movl PARAM_DST, %ebp
|
||||
|
||||
shrl $UNROLL_LOG2, %ecx
|
||||
negl %edi
|
||||
movl %esi, SAVE_ESI
|
||||
|
||||
andl $UNROLL_MASK, %edi
|
||||
|
||||
ifdef(`PIC',`
|
||||
call L(pic_calc)
|
||||
L(here):
|
||||
',`
|
||||
leal L(entry) (%edi,%edi,8), %esi C 9 bytes per
|
||||
')
|
||||
negl %edi
|
||||
shrl %eax
|
||||
|
||||
leal ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx
|
||||
leal ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx
|
||||
leal ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi
|
||||
|
||||
jmp *%esi
|
||||
|
||||
|
||||
ifdef(`PIC',`
|
||||
L(pic_calc):
|
||||
C See mpn/x86/README about old gas bugs
|
||||
leal (%edi,%edi,8), %esi
|
||||
addl $L(entry)-L(here), %esi
|
||||
addl (%esp), %esi
|
||||
ret_internal
|
||||
')
|
||||
|
||||
|
||||
C -----------------------------------------------------------------------------
|
||||
ALIGN(32)
|
||||
L(top):
|
||||
C eax zero
|
||||
C ebx src1
|
||||
C ecx counter
|
||||
C edx src2
|
||||
C esi scratch (was computed jump)
|
||||
C edi dst
|
||||
C ebp scratch
|
||||
|
||||
leal UNROLL_BYTES(%edx), %edx
|
||||
|
||||
L(entry):
|
||||
deflit(CHUNK_COUNT, 2)
|
||||
forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
|
||||
deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
|
||||
deflit(`disp1', eval(disp0 + 4))
|
||||
|
||||
Zdisp( movl, disp0,(%ebx), %esi)
|
||||
movl disp1(%ebx), %ebp
|
||||
Zdisp( M4_inst,disp0,(%edx), %esi)
|
||||
Zdisp( movl, %esi, disp0,(%edi))
|
||||
M4_inst disp1(%edx), %ebp
|
||||
movl %ebp, disp1(%edi)
|
||||
')
|
||||
|
||||
decl %ecx
|
||||
leal UNROLL_BYTES(%ebx), %ebx
|
||||
leal UNROLL_BYTES(%edi), %edi
|
||||
jns L(top)
|
||||
|
||||
|
||||
mov PARAM_SIZE, %esi
|
||||
movl SAVE_EBP, %ebp
|
||||
movl $0, %eax
|
||||
|
||||
decl %esi
|
||||
js L(even)
|
||||
|
||||
movl (%ebx), %ecx
|
||||
M4_inst UNROLL_BYTES(%edx), %ecx
|
||||
movl %ecx, (%edi)
|
||||
L(even):
|
||||
|
||||
movl SAVE_EDI, %edi
|
||||
movl SAVE_EBX, %ebx
|
||||
setc %al
|
||||
|
||||
movl SAVE_ESI, %esi
|
||||
addl $STACK_SPACE, %esp
|
||||
|
||||
ret
|
||||
|
||||
EPILOGUE()
|
250
mpn/x86/k7/sub_n.asm
Normal file
250
mpn/x86/k7/sub_n.asm
Normal file
@ -0,0 +1,250 @@
|
||||
dnl AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract.
|
||||
|
||||
dnl Copyright 1999, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
|
||||
dnl
|
||||
dnl This file is part of the GNU MP Library.
|
||||
dnl
|
||||
dnl The GNU MP Library is free software; you can redistribute it and/or
|
||||
dnl modify it under the terms of the GNU Lesser General Public License as
|
||||
dnl published by the Free Software Foundation; either version 2.1 of the
|
||||
dnl License, or (at your option) any later version.
|
||||
dnl
|
||||
dnl The GNU MP Library is distributed in the hope that it will be useful,
|
||||
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
dnl Lesser General Public License for more details.
|
||||
dnl
|
||||
dnl You should have received a copy of the GNU Lesser General Public
|
||||
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
|
||||
dnl not, write to the Free Software Foundation, Inc., 51 Franklin Street,
|
||||
dnl Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
include(`../config.m4')
|
||||
|
||||
|
||||
C K7: 1.64 cycles/limb (at 16 limbs/loop).
|
||||
|
||||
|
||||
|
||||
dnl K7: UNROLL_COUNT cycles/limb
|
||||
dnl 8 1.9
|
||||
dnl 16 1.64
|
||||
dnl 32 1.7
|
||||
dnl 64 2.0
|
||||
dnl Maximum possible with the current code is 64.
|
||||
|
||||
deflit(UNROLL_COUNT, 16)
|
||||
|
||||
define(OPERATION_sub_n,1)
|
||||
|
||||
ifdef(`OPERATION_add_n', `
|
||||
define(M4_inst, adcl)
|
||||
define(M4_function_n, mpn_add_n)
|
||||
define(M4_function_nc, mpn_add_nc)
|
||||
define(M4_description, add)
|
||||
',`ifdef(`OPERATION_sub_n', `
|
||||
define(M4_inst, sbbl)
|
||||
define(M4_function_n, mpn_sub_n)
|
||||
define(M4_function_nc, mpn_sub_nc)
|
||||
define(M4_description, subtract)
|
||||
',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
|
||||
')')')
|
||||
|
||||
MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
|
||||
|
||||
|
||||
C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
|
||||
C mp_size_t size);
|
||||
C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
|
||||
C mp_size_t size, mp_limb_t carry);
|
||||
C
|
||||
C Calculate src1,size M4_description src2,size, and store the result in
|
||||
C dst,size. The return value is the carry bit from the top of the result (1
|
||||
C or 0).
|
||||
C
|
||||
C The _nc version accepts 1 or 0 for an initial carry into the low limb of
|
||||
C the calculation. Note values other than 1 or 0 here will lead to garbage
|
||||
C results.
|
||||
C
|
||||
C This code runs at 1.64 cycles/limb, which might be the best possible with
|
||||
C plain integer operations. Each limb is 2 loads and 1 store, any 2 of
|
||||
C which can be done each cycle, leading to 1.5 c/l.
|
||||
|
||||
dnl Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
|
||||
ifdef(`PIC',`
|
||||
deflit(UNROLL_THRESHOLD, 8)
|
||||
',`
|
||||
deflit(UNROLL_THRESHOLD, 8)
|
||||
')
|
||||
|
||||
defframe(PARAM_CARRY,20)
|
||||
defframe(PARAM_SIZE, 16)
|
||||
defframe(PARAM_SRC2, 12)
|
||||
defframe(PARAM_SRC1, 8)
|
||||
defframe(PARAM_DST, 4)
|
||||
|
||||
defframe(SAVE_EBP, -4)
|
||||
defframe(SAVE_ESI, -8)
|
||||
defframe(SAVE_EBX, -12)
|
||||
defframe(SAVE_EDI, -16)
|
||||
deflit(STACK_SPACE, 16)
|
||||
|
||||
TEXT
|
||||
ALIGN(32)
|
||||
deflit(`FRAME',0)
|
||||
|
||||
PROLOGUE(M4_function_nc)
|
||||
movl PARAM_CARRY, %eax
|
||||
jmp L(start)
|
||||
EPILOGUE()
|
||||
|
||||
PROLOGUE(M4_function_n)
|
||||
|
||||
xorl %eax, %eax C carry
|
||||
L(start):
|
||||
movl PARAM_SIZE, %ecx
|
||||
subl $STACK_SPACE, %esp
|
||||
deflit(`FRAME',STACK_SPACE)
|
||||
|
||||
movl %edi, SAVE_EDI
|
||||
movl %ebx, SAVE_EBX
|
||||
cmpl $UNROLL_THRESHOLD, %ecx
|
||||
|
||||
movl PARAM_SRC2, %edx
|
||||
movl PARAM_SRC1, %ebx
|
||||
jae L(unroll)
|
||||
|
||||
movl PARAM_DST, %edi
|
||||
leal (%ebx,%ecx,4), %ebx
|
||||
leal (%edx,%ecx,4), %edx
|
||||
|
||||
leal (%edi,%ecx,4), %edi
|
||||
negl %ecx
|
||||
shrl %eax
|
||||
|
||||
C This loop is in a single 16 byte code block already, so no
|
||||
C alignment necessary.
|
||||
L(simple):
|
||||
C eax scratch
|
||||
C ebx src1
|
||||
C ecx counter
|
||||
C edx src2
|
||||
C esi
|
||||
C edi dst
|
||||
C ebp
|
||||
|
||||
movl (%ebx,%ecx,4), %eax
|
||||
M4_inst (%edx,%ecx,4), %eax
|
||||
movl %eax, (%edi,%ecx,4)
|
||||
incl %ecx
|
||||
jnz L(simple)
|
||||
|
||||
movl $0, %eax
|
||||
movl SAVE_EDI, %edi
|
||||
|
||||
movl SAVE_EBX, %ebx
|
||||
setc %al
|
||||
addl $STACK_SPACE, %esp
|
||||
|
||||
ret
|
||||
|
||||
|
||||
C -----------------------------------------------------------------------------
|
||||
C This is at 0x55, close enough to aligned.
|
||||
L(unroll):
|
||||
deflit(`FRAME',STACK_SPACE)
|
||||
movl %ebp, SAVE_EBP
|
||||
andl $-2, %ecx C size low bit masked out
|
||||
andl $1, PARAM_SIZE C size low bit kept
|
||||
|
||||
movl %ecx, %edi
|
||||
decl %ecx
|
||||
movl PARAM_DST, %ebp
|
||||
|
||||
shrl $UNROLL_LOG2, %ecx
|
||||
negl %edi
|
||||
movl %esi, SAVE_ESI
|
||||
|
||||
andl $UNROLL_MASK, %edi
|
||||
|
||||
ifdef(`PIC',`
|
||||
call L(pic_calc)
|
||||
L(here):
|
||||
',`
|
||||
leal L(entry) (%edi,%edi,8), %esi C 9 bytes per
|
||||
')
|
||||
negl %edi
|
||||
shrl %eax
|
||||
|
||||
leal ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx
|
||||
leal ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx
|
||||
leal ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi
|
||||
|
||||
jmp *%esi
|
||||
|
||||
|
||||
ifdef(`PIC',`
|
||||
L(pic_calc):
|
||||
C See mpn/x86/README about old gas bugs
|
||||
leal (%edi,%edi,8), %esi
|
||||
addl $L(entry)-L(here), %esi
|
||||
addl (%esp), %esi
|
||||
ret_internal
|
||||
')
|
||||
|
||||
|
||||
C -----------------------------------------------------------------------------
|
||||
ALIGN(32)
|
||||
L(top):
|
||||
C eax zero
|
||||
C ebx src1
|
||||
C ecx counter
|
||||
C edx src2
|
||||
C esi scratch (was computed jump)
|
||||
C edi dst
|
||||
C ebp scratch
|
||||
|
||||
leal UNROLL_BYTES(%edx), %edx
|
||||
|
||||
L(entry):
|
||||
deflit(CHUNK_COUNT, 2)
|
||||
forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
|
||||
deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
|
||||
deflit(`disp1', eval(disp0 + 4))
|
||||
|
||||
Zdisp( movl, disp0,(%ebx), %esi)
|
||||
movl disp1(%ebx), %ebp
|
||||
Zdisp( M4_inst,disp0,(%edx), %esi)
|
||||
Zdisp( movl, %esi, disp0,(%edi))
|
||||
M4_inst disp1(%edx), %ebp
|
||||
movl %ebp, disp1(%edi)
|
||||
')
|
||||
|
||||
decl %ecx
|
||||
leal UNROLL_BYTES(%ebx), %ebx
|
||||
leal UNROLL_BYTES(%edi), %edi
|
||||
jns L(top)
|
||||
|
||||
|
||||
mov PARAM_SIZE, %esi
|
||||
movl SAVE_EBP, %ebp
|
||||
movl $0, %eax
|
||||
|
||||
decl %esi
|
||||
js L(even)
|
||||
|
||||
movl (%ebx), %ecx
|
||||
M4_inst UNROLL_BYTES(%edx), %ecx
|
||||
movl %ecx, (%edi)
|
||||
L(even):
|
||||
|
||||
movl SAVE_EDI, %edi
|
||||
movl SAVE_EBX, %ebx
|
||||
setc %al
|
||||
|
||||
movl SAVE_ESI, %esi
|
||||
addl $STACK_SPACE, %esp
|
||||
|
||||
ret
|
||||
|
||||
EPILOGUE()
|
@ -39,5 +39,4 @@ C couple of experiments didn't get much joy, but such an approach would at
|
||||
C least avoid serialization, presumably.
|
||||
C
|
||||
|
||||
MULFUNC_PROLOGUE(mpn_add_n mpn_sub_n)
|
||||
include_mpn(`x86/k7/aors_n.asm')
|
||||
include_mpn(`x86/k7/add_n.asm')
|
42
mpn/x86/p6/sub_n.asm
Normal file
42
mpn/x86/p6/sub_n.asm
Normal file
@ -0,0 +1,42 @@
|
||||
dnl Intel P6 mpn_add_n, mpn_sub_n -- mpn add or subtract.
|
||||
|
||||
dnl Copyright 2003 Free Software Foundation, Inc.
|
||||
dnl
|
||||
dnl This file is part of the GNU MP Library.
|
||||
dnl
|
||||
dnl The GNU MP Library is free software; you can redistribute it and/or
|
||||
dnl modify it under the terms of the GNU Lesser General Public License as
|
||||
dnl published by the Free Software Foundation; either version 2.1 of the
|
||||
dnl License, or (at your option) any later version.
|
||||
dnl
|
||||
dnl The GNU MP Library is distributed in the hope that it will be useful,
|
||||
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
dnl Lesser General Public License for more details.
|
||||
dnl
|
||||
dnl You should have received a copy of the GNU Lesser General Public
|
||||
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
|
||||
dnl not, write to the Free Software Foundation, Inc., 51 Franklin Street,
|
||||
dnl Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
include(`../config.m4')
|
||||
|
||||
|
||||
C P6: 2.7 cycles/limb
|
||||
|
||||
|
||||
C The K7 code runs quite well on P6, but this seems mainly due to the larger
|
||||
C amount of unrolling than in mpn/x86/sub_n.asm.
|
||||
C
|
||||
C P6 apparently doesn't separately rename the carry flag, or something, so a
|
||||
C loop holding a carry across decl or incl takes 4 cycles for the loop
|
||||
C control. Perhaps it's more when relying on out-of-order execution to hide
|
||||
C load latencies too.
|
||||
C
|
||||
C Not sure what the best approach would be. sbbl then addl to save and
|
||||
C restore the carry across the loop control would be a possibility. A
|
||||
C couple of experiments didn't get much joy, but such an approach would at
|
||||
C least avoid serialization, presumably.
|
||||
C
|
||||
|
||||
include_mpn(`x86/k7/sub_n.asm')
|
@ -25,6 +25,7 @@ include(`../config.m4')
|
||||
|
||||
C P5: 2.375 cycles/limb
|
||||
|
||||
define(OPERATION_add_n,1)
|
||||
|
||||
ifdef(`OPERATION_add_n',`
|
||||
define(M4_inst, adcl)
|
196
mpn/x86/pentium/sub_n.asm
Normal file
196
mpn/x86/pentium/sub_n.asm
Normal file
@ -0,0 +1,196 @@
|
||||
dnl Intel Pentium mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
|
||||
|
||||
dnl Copyright 1992, 1994, 1995, 1996, 1999, 2000, 2002 Free Software
|
||||
dnl Foundation, Inc.
|
||||
dnl
|
||||
dnl This file is part of the GNU MP Library.
|
||||
dnl
|
||||
dnl The GNU MP Library is free software; you can redistribute it and/or
|
||||
dnl modify it under the terms of the GNU Lesser General Public License as
|
||||
dnl published by the Free Software Foundation; either version 2.1 of the
|
||||
dnl License, or (at your option) any later version.
|
||||
dnl
|
||||
dnl The GNU MP Library is distributed in the hope that it will be useful,
|
||||
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
dnl Lesser General Public License for more details.
|
||||
dnl
|
||||
dnl You should have received a copy of the GNU Lesser General Public
|
||||
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
|
||||
dnl not, write to the Free Software Foundation, Inc., 51 Franklin Street,
|
||||
dnl Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
include(`../config.m4')
|
||||
|
||||
|
||||
C P5: 2.375 cycles/limb
|
||||
|
||||
define(OPERATION_sub_n,1)
|
||||
|
||||
ifdef(`OPERATION_add_n',`
|
||||
define(M4_inst, adcl)
|
||||
define(M4_function_n, mpn_add_n)
|
||||
define(M4_function_nc, mpn_add_nc)
|
||||
|
||||
',`ifdef(`OPERATION_sub_n',`
|
||||
define(M4_inst, sbbl)
|
||||
define(M4_function_n, mpn_sub_n)
|
||||
define(M4_function_nc, mpn_sub_nc)
|
||||
|
||||
',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
|
||||
')')')
|
||||
|
||||
MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
|
||||
|
||||
|
||||
C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
|
||||
C mp_size_t size);
|
||||
C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
|
||||
C mp_size_t size, mp_limb_t carry);
|
||||
|
||||
defframe(PARAM_CARRY,20)
|
||||
defframe(PARAM_SIZE, 16)
|
||||
defframe(PARAM_SRC2, 12)
|
||||
defframe(PARAM_SRC1, 8)
|
||||
defframe(PARAM_DST, 4)
|
||||
|
||||
TEXT
|
||||
ALIGN(8)
|
||||
PROLOGUE(M4_function_nc)
|
||||
|
||||
pushl %edi
|
||||
pushl %esi
|
||||
pushl %ebx
|
||||
pushl %ebp
|
||||
deflit(`FRAME',16)
|
||||
|
||||
movl PARAM_DST,%edi
|
||||
movl PARAM_SRC1,%esi
|
||||
movl PARAM_SRC2,%ebp
|
||||
movl PARAM_SIZE,%ecx
|
||||
|
||||
movl (%ebp),%ebx
|
||||
|
||||
decl %ecx
|
||||
movl %ecx,%edx
|
||||
shrl $3,%ecx
|
||||
andl $7,%edx
|
||||
testl %ecx,%ecx C zero carry flag
|
||||
jz L(endgo)
|
||||
|
||||
pushl %edx
|
||||
FRAME_pushl()
|
||||
movl PARAM_CARRY,%eax
|
||||
shrl $1,%eax C shift bit 0 into carry
|
||||
jmp L(oop)
|
||||
|
||||
L(endgo):
|
||||
deflit(`FRAME',16)
|
||||
movl PARAM_CARRY,%eax
|
||||
shrl $1,%eax C shift bit 0 into carry
|
||||
jmp L(end)
|
||||
|
||||
EPILOGUE()
|
||||
|
||||
|
||||
ALIGN(8)
|
||||
PROLOGUE(M4_function_n)
|
||||
|
||||
pushl %edi
|
||||
pushl %esi
|
||||
pushl %ebx
|
||||
pushl %ebp
|
||||
deflit(`FRAME',16)
|
||||
|
||||
movl PARAM_DST,%edi
|
||||
movl PARAM_SRC1,%esi
|
||||
movl PARAM_SRC2,%ebp
|
||||
movl PARAM_SIZE,%ecx
|
||||
|
||||
movl (%ebp),%ebx
|
||||
|
||||
decl %ecx
|
||||
movl %ecx,%edx
|
||||
shrl $3,%ecx
|
||||
andl $7,%edx
|
||||
testl %ecx,%ecx C zero carry flag
|
||||
jz L(end)
|
||||
pushl %edx
|
||||
FRAME_pushl()
|
||||
|
||||
ALIGN(8)
|
||||
L(oop): movl 28(%edi),%eax C fetch destination cache line
|
||||
leal 32(%edi),%edi
|
||||
|
||||
L(1): movl (%esi),%eax
|
||||
movl 4(%esi),%edx
|
||||
M4_inst %ebx,%eax
|
||||
movl 4(%ebp),%ebx
|
||||
M4_inst %ebx,%edx
|
||||
movl 8(%ebp),%ebx
|
||||
movl %eax,-32(%edi)
|
||||
movl %edx,-28(%edi)
|
||||
|
||||
L(2): movl 8(%esi),%eax
|
||||
movl 12(%esi),%edx
|
||||
M4_inst %ebx,%eax
|
||||
movl 12(%ebp),%ebx
|
||||
M4_inst %ebx,%edx
|
||||
movl 16(%ebp),%ebx
|
||||
movl %eax,-24(%edi)
|
||||
movl %edx,-20(%edi)
|
||||
|
||||
L(3): movl 16(%esi),%eax
|
||||
movl 20(%esi),%edx
|
||||
M4_inst %ebx,%eax
|
||||
movl 20(%ebp),%ebx
|
||||
M4_inst %ebx,%edx
|
||||
movl 24(%ebp),%ebx
|
||||
movl %eax,-16(%edi)
|
||||
movl %edx,-12(%edi)
|
||||
|
||||
L(4): movl 24(%esi),%eax
|
||||
movl 28(%esi),%edx
|
||||
M4_inst %ebx,%eax
|
||||
movl 28(%ebp),%ebx
|
||||
M4_inst %ebx,%edx
|
||||
movl 32(%ebp),%ebx
|
||||
movl %eax,-8(%edi)
|
||||
movl %edx,-4(%edi)
|
||||
|
||||
leal 32(%esi),%esi
|
||||
leal 32(%ebp),%ebp
|
||||
decl %ecx
|
||||
jnz L(oop)
|
||||
|
||||
popl %edx
|
||||
FRAME_popl()
|
||||
L(end):
|
||||
decl %edx C test %edx w/o clobbering carry
|
||||
js L(end2)
|
||||
incl %edx
|
||||
L(oop2):
|
||||
leal 4(%edi),%edi
|
||||
movl (%esi),%eax
|
||||
M4_inst %ebx,%eax
|
||||
movl 4(%ebp),%ebx
|
||||
movl %eax,-4(%edi)
|
||||
leal 4(%esi),%esi
|
||||
leal 4(%ebp),%ebp
|
||||
decl %edx
|
||||
jnz L(oop2)
|
||||
L(end2):
|
||||
movl (%esi),%eax
|
||||
M4_inst %ebx,%eax
|
||||
movl %eax,(%edi)
|
||||
|
||||
sbbl %eax,%eax
|
||||
negl %eax
|
||||
|
||||
popl %ebp
|
||||
popl %ebx
|
||||
popl %esi
|
||||
popl %edi
|
||||
ret
|
||||
|
||||
EPILOGUE()
|
195
mpn/x86/sub_n.asm
Normal file
195
mpn/x86/sub_n.asm
Normal file
@ -0,0 +1,195 @@
|
||||
dnl x86 mpn_sub_n/mpn_sub_nc -- mpn subtraction.
|
||||
|
||||
dnl Copyright 1992, 1994, 1995, 1996, 1999, 2000, 2001, 2002 Free Software
|
||||
dnl Foundation, Inc.
|
||||
dnl
|
||||
dnl This file is part of the GNU MP Library.
|
||||
dnl
|
||||
dnl The GNU MP Library is free software; you can redistribute it and/or
|
||||
dnl modify it under the terms of the GNU Lesser General Public License as
|
||||
dnl published by the Free Software Foundation; either version 2.1 of the
|
||||
dnl License, or (at your option) any later version.
|
||||
dnl
|
||||
dnl The GNU MP Library is distributed in the hope that it will be useful,
|
||||
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
dnl Lesser General Public License for more details.
|
||||
dnl
|
||||
dnl You should have received a copy of the GNU Lesser General Public
|
||||
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
|
||||
dnl not, write to the Free Software Foundation, Inc., 51 Franklin Street,
|
||||
dnl Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
include(`../config.m4')
|
||||
|
||||
|
||||
C cycles/limb
|
||||
C P5: 3.375
|
||||
C P6: 3.125
|
||||
C K6: 3.5
|
||||
C K7: 2.25
|
||||
C P4: 8.75
|
||||
|
||||
define(OPERATION_sub_n,1)
|
||||
|
||||
ifdef(`OPERATION_add_n',`
|
||||
define(M4_inst, adcl)
|
||||
define(M4_function_n, mpn_add_n)
|
||||
define(M4_function_nc, mpn_add_nc)
|
||||
|
||||
',`ifdef(`OPERATION_sub_n',`
|
||||
define(M4_inst, sbbl)
|
||||
define(M4_function_n, mpn_sub_n)
|
||||
define(M4_function_nc, mpn_sub_nc)
|
||||
|
||||
',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
|
||||
')')')
|
||||
|
||||
MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
|
||||
|
||||
|
||||
C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
|
||||
C mp_size_t size);
|
||||
C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
|
||||
C mp_size_t size, mp_limb_t carry);
|
||||
|
||||
defframe(PARAM_CARRY,20)
|
||||
defframe(PARAM_SIZE, 16)
|
||||
defframe(PARAM_SRC2, 12)
|
||||
defframe(PARAM_SRC1, 8)
|
||||
defframe(PARAM_DST, 4)
|
||||
|
||||
TEXT
|
||||
ALIGN(8)
|
||||
|
||||
PROLOGUE(M4_function_nc)
|
||||
deflit(`FRAME',0)
|
||||
|
||||
pushl %edi FRAME_pushl()
|
||||
pushl %esi FRAME_pushl()
|
||||
|
||||
movl PARAM_DST,%edi
|
||||
movl PARAM_SRC1,%esi
|
||||
movl PARAM_SRC2,%edx
|
||||
movl PARAM_SIZE,%ecx
|
||||
|
||||
movl %ecx,%eax
|
||||
shrl $3,%ecx C compute count for unrolled loop
|
||||
negl %eax
|
||||
andl $7,%eax C get index where to start loop
|
||||
jz L(oopgo) C necessary special case for 0
|
||||
incl %ecx C adjust loop count
|
||||
shll $2,%eax C adjustment for pointers...
|
||||
subl %eax,%edi C ... since they are offset ...
|
||||
subl %eax,%esi C ... by a constant when we ...
|
||||
subl %eax,%edx C ... enter the loop
|
||||
shrl $2,%eax C restore previous value
|
||||
|
||||
ifdef(`PIC',`
|
||||
C Calculate start address in loop for PIC. Due to limitations in
|
||||
C old gas, L(oop)-L(0a)-3 cannot be put into the leal
|
||||
call L(0a)
|
||||
L(0a): leal (%eax,%eax,8),%eax
|
||||
addl (%esp),%eax
|
||||
addl $L(oop)-L(0a)-3,%eax
|
||||
addl $4,%esp
|
||||
',`
|
||||
C Calculate start address in loop for non-PIC.
|
||||
leal L(oop)-3(%eax,%eax,8),%eax
|
||||
')
|
||||
|
||||
C These lines initialize carry from the 5th parameter. Should be
|
||||
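C possible to simplify. %ebp is only a scratch register here: the shrl
C moves bit 0 of the carry parameter into the carry flag, and the popl
C restores %ebp without modifying the flags.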
|
||||
pushl %ebp FRAME_pushl()
|
||||
movl PARAM_CARRY,%ebp
|
||||
shrl $1,%ebp C shift bit 0 into carry
|
||||
popl %ebp FRAME_popl()
|
||||
|
||||
jmp *%eax C jump into loop
|
||||
|
||||
EPILOGUE()
|
||||
|
||||
|
||||
ALIGN(16)
|
||||
PROLOGUE(M4_function_n)
|
||||
deflit(`FRAME',0)
|
||||
|
||||
pushl %edi FRAME_pushl()
|
||||
pushl %esi FRAME_pushl()
|
||||
|
||||
movl PARAM_DST,%edi
|
||||
movl PARAM_SRC1,%esi
|
||||
movl PARAM_SRC2,%edx
|
||||
movl PARAM_SIZE,%ecx
|
||||
|
||||
movl %ecx,%eax
|
||||
shrl $3,%ecx C compute count for unrolled loop
|
||||
negl %eax
|
||||
andl $7,%eax C get index where to start loop
|
||||
jz L(oop) C necessary special case for 0
|
||||
incl %ecx C adjust loop count
|
||||
shll $2,%eax C adjustment for pointers...
|
||||
subl %eax,%edi C ... since they are offset ...
|
||||
subl %eax,%esi C ... by a constant when we ...
|
||||
subl %eax,%edx C ... enter the loop
|
||||
shrl $2,%eax C restore previous value
|
||||
|
||||
ifdef(`PIC',`
|
||||
C Calculate start address in loop for PIC. Due to limitations in
|
||||
C some assemblers, L(oop)-L(0b)-3 cannot be put into the leal
|
||||
call L(0b)
|
||||
L(0b): leal (%eax,%eax,8),%eax
|
||||
addl (%esp),%eax
|
||||
addl $L(oop)-L(0b)-3,%eax
|
||||
addl $4,%esp
|
||||
',`
|
||||
C Calculate start address in loop for non-PIC.
|
||||
leal L(oop)-3(%eax,%eax,8),%eax
|
||||
')
|
||||
jmp *%eax C jump into loop
|
||||
|
||||
L(oopgo):
|
||||
pushl %ebp FRAME_pushl()
|
||||
movl PARAM_CARRY,%ebp
|
||||
shrl $1,%ebp C shift bit 0 into carry
|
||||
popl %ebp FRAME_popl()
|
||||
|
||||
ALIGN(16)
|
||||
L(oop): movl (%esi),%eax
|
||||
M4_inst (%edx),%eax
|
||||
movl %eax,(%edi)
|
||||
movl 4(%esi),%eax
|
||||
M4_inst 4(%edx),%eax
|
||||
movl %eax,4(%edi)
|
||||
movl 8(%esi),%eax
|
||||
M4_inst 8(%edx),%eax
|
||||
movl %eax,8(%edi)
|
||||
movl 12(%esi),%eax
|
||||
M4_inst 12(%edx),%eax
|
||||
movl %eax,12(%edi)
|
||||
movl 16(%esi),%eax
|
||||
M4_inst 16(%edx),%eax
|
||||
movl %eax,16(%edi)
|
||||
movl 20(%esi),%eax
|
||||
M4_inst 20(%edx),%eax
|
||||
movl %eax,20(%edi)
|
||||
movl 24(%esi),%eax
|
||||
M4_inst 24(%edx),%eax
|
||||
movl %eax,24(%edi)
|
||||
movl 28(%esi),%eax
|
||||
M4_inst 28(%edx),%eax
|
||||
movl %eax,28(%edi)
|
||||
leal 32(%edi),%edi
|
||||
leal 32(%esi),%esi
|
||||
leal 32(%edx),%edx
|
||||
decl %ecx
|
||||
jnz L(oop)
|
||||
|
||||
sbbl %eax,%eax
|
||||
negl %eax
|
||||
|
||||
popl %esi
|
||||
popl %edi
|
||||
ret
|
||||
|
||||
EPILOGUE()
|
@ -99,8 +99,7 @@ sub process_asm {
|
||||
my $base = basename ($file, '.asm');
|
||||
|
||||
my @funs;
|
||||
if ($base eq 'aors_n') { @funs = qw(add_n sub_n); }
|
||||
elsif ($base eq 'aorsmul_1') { @funs = qw(addmul_1 submul_1); }
|
||||
if ($base eq 'aorsmul_1') { @funs = qw(addmul_1 submul_1); }
|
||||
elsif ($base eq 'logops_n') { @funs = qw(and_n andn_n nand_n ior_n iorn_n nior_n xor_n xnor_n); }
|
||||
else { @funs = ($base); }
|
||||
|
||||
|
@ -284,15 +284,6 @@ my @table =
|
||||
'speed' => 'SPEED_ROUTINE_MPN_BINARY_N',
|
||||
'speed_flags'=> 'FLAG_R_OPTIONAL',
|
||||
},
|
||||
{
|
||||
'regexp'=> 'aors_n',
|
||||
'mulfunc'=> ['add_n','sub_n'],
|
||||
'ret' => 'mp_limb_t',
|
||||
'args' => 'mp_ptr wp, mp_srcptr xp, mp_srcptr yp, mp_size_t size',
|
||||
'speed' => 'SPEED_ROUTINE_MPN_BINARY_N',
|
||||
'speed_flags'=> 'FLAG_R_OPTIONAL',
|
||||
},
|
||||
|
||||
{
|
||||
'regexp'=> 'addmul_1|submul_1',
|
||||
'ret' => 'mp_limb_t',
|
||||
|
Loading…
Reference in New Issue
Block a user