Remove all traces of aors_n.*; we now use separate add and sub versions

This commit is contained in:
jasonmoxham 2010-12-30 07:20:29 +00:00
parent c2c5579778
commit 577aeee345
23 changed files with 2207 additions and 20 deletions

2
configure vendored
View File

@ -26039,7 +26039,6 @@ esac
# functions that can be provided by multi-function files
tmp_mulfunc=
case $tmp_fn in
add_n|sub_n) tmp_mulfunc="aors_n" ;;
addmul_1|submul_1) tmp_mulfunc="aorsmul_1" ;;
and_n|andn_n|nand_n | ior_n|iorn_n|nior_n | xor_n|xnor_n)
tmp_mulfunc="logops_n" ;;
@ -26182,7 +26181,6 @@ for tmp_fn in $gmp_mpn_functions; do
# functions that can be provided by multi-function files
tmp_mulfunc=
case $tmp_fn in
add_n|sub_n) tmp_mulfunc="aors_n" ;;
addmul_1|submul_1) tmp_mulfunc="aorsmul_1" ;;
and_n|andn_n|nand_n | ior_n|iorn_n|nior_n | xor_n|xnor_n)
tmp_mulfunc="logops_n" ;;

View File

@ -2363,7 +2363,6 @@ define(GMP_MULFUNC_CHOICES,
[# functions that can be provided by multi-function files
tmp_mulfunc=
case $tmp_fn in
add_n|sub_n) tmp_mulfunc="aors_n" ;;
addmul_1|submul_1) tmp_mulfunc="aorsmul_1" ;;
and_n|andn_n|nand_n | ior_n|iorn_n|nior_n | xor_n|xnor_n)
tmp_mulfunc="logops_n" ;;

View File

@ -54,6 +54,8 @@ define(`CYSH',`GMP_NUMB_BITS')
dnl This declaration is munged by configure
NAILS_SUPPORT(1-63)
define(OPERATION_add_n,1)
ifdef(`OPERATION_add_n', `
define(`OP', addq)
define(`CYSH',`GMP_NUMB_BITS')

View File

@ -0,0 +1,226 @@
dnl Alpha ev6 nails mpn_add_n and mpn_sub_n; this copy is fixed to mpn_sub_n.
dnl Copyright 2002, 2006 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or
dnl modify it under the terms of the GNU Lesser General Public License as
dnl published by the Free Software Foundation; either version 2.1 of the
dnl License, or (at your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful,
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
dnl Lesser General Public License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
dnl not, write to the Free Software Foundation, Inc., 51 Franklin Street,
dnl Fifth Floor, Boston, MA 02110-1301, USA.
dnl Runs at 2.5 cycles/limb. It would be possible to reach 2.0 cycles/limb
dnl with 8-way unrolling.
include(`../config.m4')
dnl INPUT PARAMETERS
dnl rp = result pointer, up and vp = source pointers, n = limb count.
define(`rp',`r16')
define(`up',`r17')
define(`vp',`r18')
define(`n',`r19')
dnl rl0..rl3 hold raw results, ul0..ul3 and vl0..vl3 hold limbs loaded
dnl from up and vp.  Note that vl3 is r25, which the entry code also uses
dnl as a temporary for n mod 4 before vl3 is first loaded.
define(`rl0',`r0')
define(`rl1',`r1')
define(`rl2',`r2')
define(`rl3',`r3')
define(`ul0',`r4')
define(`ul1',`r5')
define(`ul2',`r6')
define(`ul3',`r7')
define(`vl0',`r22')
define(`vl1',`r23')
define(`vl2',`r24')
define(`vl3',`r25')
define(`numb_mask',`r21')
define(`NAIL_BITS',`GMP_NAIL_BITS')
define(`CYSH',`GMP_NUMB_BITS')
dnl This declaration is munged by configure
NAILS_SUPPORT(1-63)
dnl This copy of the source is fixed to the subtract operation.  The macro
dnl name is quoted so that it is not expanded before being defined in case
dnl it already exists (for instance from an m4 command line definition).
define(`OPERATION_sub_n',1)
dnl OP is the limb instruction, CYSH the right shift that brings the carry
dnl (add) or borrow (sub) of a raw result down to bit 0, func the entry point.
dnl Both tests are independent, so if both OPERATION macros were defined the
dnl later (sub) definitions would win.
ifdef(`OPERATION_add_n', `
define(`OP', addq)
define(`CYSH',`GMP_NUMB_BITS')
define(`func', mpn_add_n)')
ifdef(`OPERATION_sub_n', `
define(`OP', subq)
define(`CYSH',63)
define(`func', mpn_sub_n)')
dnl NOTE(review): the list below still names mpn_add_n although this copy
dnl only assembles mpn_sub_n -- confirm whether configure relies on it.
MULFUNC_PROLOGUE(mpn_add_n mpn_sub_n)
ASM_START()
C func(rp, up, vp, n) -- combine the n limbs at up with the n limbs at vp
C using OP (selected by the m4 setup above), store the results masked with
C numb_mask at rp, and return the final carry or borrow (0 or 1) in r0.
C
C r20 holds the carry/borrow between limbs: it is the previous raw result
C shifted right by CYSH, and is applied to the next limb with a second OP.
C r31 is used as the source of zero throughout.
PROLOGUE(func)
C numb_mask = all ones shifted right by NAIL_BITS
lda numb_mask, -1(r31)
srl numb_mask, NAIL_BITS, numb_mask
C clear the incoming carry/borrow
bis r31, r31, r20
C r25 = n mod 4 leftover limbs, n -= 4; skip the one-limb loop if none
and n, 3, r25
lda n, -4(n)
beq r25, L(ge4)
C One limb per iteration for the n mod 4 leftover limbs.
L(lp0): ldq ul0, 0(up)
lda up, 8(up)
ldq vl0, 0(vp)
lda vp, 8(vp)
lda rp, 8(rp)
lda r25, -1(r25)
OP ul0, vl0, rl0
OP rl0, r20, rl0
and rl0, numb_mask, r28
stq r28, -8(rp)
srl rl0, CYSH, r20
bne r25, L(lp0)
C n is negative here when the original count was below 4: all done
blt n, L(ret)
C At least 4 limbs remain: load 4 limb pairs, then pick the pipelined path
C (ge8) if a further 4 are available, otherwise finish these 4 via cj0.
L(ge4): ldq ul0, 0(up)
ldq vl0, 0(vp)
ldq ul1, 8(up)
ldq vl1, 8(vp)
ldq ul2, 16(up)
ldq vl2, 16(vp)
ldq ul3, 24(up)
ldq vl3, 24(vp)
lda up, 32(up)
lda vp, 32(vp)
lda n, -4(n)
bge n, L(ge8)
OP ul0, vl0, rl0 C main-add 0
OP rl0, r20, rl0 C cy-add 0
OP ul1, vl1, rl1 C main-add 1
srl rl0, CYSH, r20 C gen cy 0
OP rl1, r20, rl1 C cy-add 1
and rl0,numb_mask, r27
br r31, L(cj0)
C Pipeline fill: process the 4 loaded limb pairs while loading the next 4.
C Masked results alternate between r27 and r28 and are stored two steps
C after they are produced.
L(ge8): OP ul0, vl0, rl0 C main-add 0
ldq ul0, 0(up)
ldq vl0, 0(vp)
OP rl0, r20, rl0 C cy-add 0
OP ul1, vl1, rl1 C main-add 1
srl rl0, CYSH, r20 C gen cy 0
ldq ul1, 8(up)
ldq vl1, 8(vp)
OP rl1, r20, rl1 C cy-add 1
and rl0,numb_mask, r27
OP ul2, vl2, rl2 C main-add 2
srl rl1, CYSH, r20 C gen cy 1
ldq ul2, 16(up)
ldq vl2, 16(vp)
OP rl2, r20, rl2 C cy-add 2
and rl1,numb_mask, r28
stq r27, 0(rp)
OP ul3, vl3, rl3 C main-add 3
srl rl2, CYSH, r20 C gen cy 2
ldq ul3, 24(up)
ldq vl3, 24(vp)
OP rl3, r20, rl3 C cy-add 3
and rl2,numb_mask, r27
stq r28, 8(rp)
lda rp, 32(rp)
lda up, 32(up)
lda vp, 32(vp)
lda n, -4(n)
blt n, L(end)
C Main loop, 4 limbs per iteration.  The bis r31, r31, r31 instructions
C write only the zero register, i.e. they are no-ops (presumably there for
C instruction scheduling -- TODO confirm).
ALIGN(32)
L(top): OP ul0, vl0, rl0 C main-add 0
srl rl3, CYSH, r20 C gen cy 3
ldq ul0, 0(up)
ldq vl0, 0(vp)
OP rl0, r20, rl0 C cy-add 0
and rl3,numb_mask, r28
stq r27, -16(rp)
bis r31, r31, r31
OP ul1, vl1, rl1 C main-add 1
srl rl0, CYSH, r20 C gen cy 0
ldq ul1, 8(up)
ldq vl1, 8(vp)
OP rl1, r20, rl1 C cy-add 1
and rl0,numb_mask, r27
stq r28, -8(rp)
bis r31, r31, r31
OP ul2, vl2, rl2 C main-add 2
srl rl1, CYSH, r20 C gen cy 1
ldq ul2, 16(up)
ldq vl2, 16(vp)
OP rl2, r20, rl2 C cy-add 2
and rl1,numb_mask, r28
stq r27, 0(rp)
bis r31, r31, r31
OP ul3, vl3, rl3 C main-add 3
srl rl2, CYSH, r20 C gen cy 2
ldq ul3, 24(up)
ldq vl3, 24(vp)
OP rl3, r20, rl3 C cy-add 3
and rl2,numb_mask, r27
stq r28, 8(rp)
bis r31, r31, r31
bis r31, r31, r31
lda n, -4(n)
lda up, 32(up)
lda vp, 32(vp)
bis r31, r31, r31
bis r31, r31, r31
lda rp, 32(rp)
bge n, L(top)
C Wind-down: finish the limbs already loaded, with no further loads.
L(end): OP ul0, vl0, rl0 C main-add 0
srl rl3, CYSH, r20 C gen cy 3
OP rl0, r20, rl0 C cy-add 0
and rl3,numb_mask, r28
stq r27, -16(rp)
OP ul1, vl1, rl1 C main-add 1
srl rl0, CYSH, r20 C gen cy 0
OP rl1, r20, rl1 C cy-add 1
and rl0,numb_mask, r27
stq r28, -8(rp)
C Also entered from the ge4 path with limbs 0 and 1 already computed.
L(cj0): OP ul2, vl2, rl2 C main-add 2
srl rl1, CYSH, r20 C gen cy 1
OP rl2, r20, rl2 C cy-add 2
and rl1,numb_mask, r28
stq r27, 0(rp)
OP ul3, vl3, rl3 C main-add 3
srl rl2, CYSH, r20 C gen cy 2
OP rl3, r20, rl3 C cy-add 3
and rl2,numb_mask, r27
stq r28, 8(rp)
srl rl3, CYSH, r20 C gen cy 3
and rl3,numb_mask, r28
stq r27, 16(rp)
stq r28, 24(rp)
C Return value: low bit of the last carry/borrow.
L(ret): and r20, 1, r0
ret r31, (r26), 1
EPILOGUE()
ASM_END()

View File

@ -1052,10 +1052,6 @@ dnl with ifdef() rather than be expanded.
m4_not_for_expansion(`PIC')
m4_not_for_expansion(`DLL_EXPORT')
dnl aors_n
m4_not_for_expansion(`OPERATION_add_n')
m4_not_for_expansion(`OPERATION_sub_n')
dnl aorsmul_1
m4_not_for_expansion(`OPERATION_addmul_1')
m4_not_for_expansion(`OPERATION_submul_1')

View File

@ -33,6 +33,8 @@ define(`up',`r33')
define(`vp',`r34')
define(`n',`r35')
define(OPERATION_add_n,1)
ifdef(`OPERATION_add_n',`
define(ADDSUB, add)
define(PRED, ltu)

613
mpn/ia64/sub_n.asm Normal file
View File

@ -0,0 +1,613 @@
dnl IA-64 mpn_add_n/mpn_sub_n -- mpn addition and subtraction (this copy is fixed to mpn_sub_n).
dnl Copyright 2003, 2004, 2005 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
C Itanium: 2.67
C Itanium 2: 1.25
C TODO
C * Consider using special code for small n, using something like
C "switch (8 * (n >= 8) + (n mod 8))" to enter it and feed-in code.
C INPUT PARAMETERS
C rp = result pointer, up and vp = source pointers, n = limb count.
define(`rp',`r32')
define(`up',`r33')
define(`vp',`r34')
define(`n',`r35')
C This copy of the source is fixed to the subtract operation.  The macro
C name is quoted so that it is not expanded before being defined in case it
C already exists (for instance from an m4 command line definition).
define(`OPERATION_sub_n',1)
C Per-operation parameters used by the code below:
C   ADDSUB  the limb instruction
C   PRED    unsigned compare of the result against the first operand that
C           detects a carry (add) or borrow (sub) out of ADDSUB
C   INCR    adjustment applied to a limb when the previous limb carried
C   LIM     the limb value for which applying INCR itself carries out
C   func    the entry point name
C Both tests are independent, so if both OPERATION macros were defined the
C later (sub) definitions would win.
ifdef(`OPERATION_add_n',`
define(ADDSUB, add)
define(PRED, ltu)
define(INCR, 1)
define(LIM, -1)
define(func, mpn_add_n)
')
ifdef(`OPERATION_sub_n',`
define(ADDSUB, sub)
define(PRED, gtu)
define(INCR, -1)
define(LIM, 0)
define(func, mpn_sub_n)
')
C Some useful aliases for registers we use
C uN and vN hold limbs loaded from up and vp, wN hold results.  w4..w7
C name the same registers as w0..w3.
define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
define(`u4',`r18') define(`u5',`r19') define(`u6',`r20') define(`u7',`r21')
define(`v0',`r24') define(`v1',`r25') define(`v2',`r26') define(`v3',`r27')
define(`v4',`r28') define(`v5',`r29') define(`v6',`r30') define(`v7',`r31')
define(`w0',`r22') define(`w1',`r9') define(`w2',`r8') define(`w3',`r23')
define(`w4',`r22') define(`w5',`r9') define(`w6',`r8') define(`w7',`r23')
C rpx is a second store pointer running alongside rp.
define(`rpx',`r3')
C NOTE(review): the list below still names mpn_add_n although this copy
C only assembles mpn_sub_n -- confirm whether configure relies on it.
MULFUNC_PROLOGUE(mpn_add_n mpn_sub_n)
ASM_START()
C func(rp, up, vp, n) -- apply ADDSUB limb by limb to the n limbs at up and
C vp, store the results at rp, and return the final carry or borrow (0 or 1)
C in r8.
C
C Structure: the entry code dispatches on n mod 8 to one of eight feed-in
C sequences (.Lb000 to .Lb111).  Each feed-in either jumps into the 8 limb
C per iteration main loop at the matching point (.LL000, .LL11x, .LL101,
C .LL100, .LL01x, .Loop) or, for short operands, into the wind-down code
C (.Lcj9 down to .Lcj1).
C
C Carry handling: cmp.PRED sets a predicate when ADDSUB on a limb carried
C out.  When the previous limb carried, the current limb gets INCR added and
C its predicate is additionally set if its value was LIM.
PROLOGUE(func)
.prologue
.save ar.lc, r2
.body
C Under HAVE_ABI_32 the pointer arguments are converted with addp4 and n is
C zero extended before use.
ifdef(`HAVE_ABI_32',`
addp4 rp = 0, rp C M I
addp4 up = 0, up C M I
addp4 vp = 0, vp C M I
zxt4 n = n C I
;;
')
C Load the first limb of each operand into r11 and r10 and save ar.lc in r2.
{.mmi C 00
ld8 r11 = [vp], 8 C M01
ld8 r10 = [up], 8 C M01
mov.i r2 = ar.lc C I0
}
C r14 = n mod 8, p15 = (8 < n), p14 = the complement, n -= 8.
{.mmi
and r14 = 7, n C M I
cmp.lt p15, p14 = 8, n C M I
add n = -8, n C M I
;;
}
C Dispatch on n mod 8; the value 0 falls through to .Lb000.
{.mmi C 01
cmp.eq p6, p0 = 1, r14 C M I
cmp.eq p7, p0 = 2, r14 C M I
cmp.eq p8, p0 = 3, r14 C M I
}
{.bbb
(p6) br.dptk .Lb001 C B
(p7) br.dptk .Lb010 C B
(p8) br.dptk .Lb011 C B
;;
}
{.mmi C 02
cmp.eq p9, p0 = 4, r14 C M I
cmp.eq p10, p0 = 5, r14 C M I
cmp.eq p11, p0 = 6, r14 C M I
}
{.bbb
(p9) br.dptk .Lb100 C B
(p10) br.dptk .Lb101 C B
(p11) br.dptk .Lb110 C B
;;
} C 03
{.mmb
cmp.eq p12, p0 = 7, r14 C M I
add n = -1, n C loop count M I
(p12) br.dptk .Lb111 C B
}
C Feed-in for n mod 8 == 0.  p14 set means n is not above 8, so the loaded
C limbs are finished in the wind-down code at .Lcj8.
.Lb000: ld8 v2 = [vp], 8 C M01
ld8 u2 = [up], 8 C M01
add rpx = 8, rp C M I
;;
ld8 v3 = [vp], 8 C M01
ld8 u3 = [up], 8 C M01
ADDSUB w1 = r10, r11 C M I
;;
ld8 v4 = [vp], 8 C M01
ld8 u4 = [up], 8 C M01
cmp.PRED p7, p0 = w1, r10 C M I
;;
ld8 v5 = [vp], 8 C M01
ld8 u5 = [up], 8 C M01
ADDSUB w2 = u2, v2 C M I
;;
ld8 v6 = [vp], 8 C M01
ld8 u6 = [up], 8 C M01
cmp.PRED p8, p0 = w2, u2 C M I
;;
ld8 v7 = [vp], 8 C M01
ld8 u7 = [up], 8 C M01
ADDSUB w3 = u3, v3 C M I
;;
ld8 v0 = [vp], 8 C M01
ld8 u0 = [up], 8 C M01
cmp.PRED p9, p0 = w3, u3 C M I
(p7) cmp.eq.or p8, p0 = LIM, w2 C M I
(p7) add w2 = INCR, w2 C M I
(p14) br.cond.dptk .Lcj8 C B
;;
C Long operand: loop count = n >> 3 goes to ar.lc, and r10 and r11 (no
C longer needed as first limbs) become prefetch pointers 512 bytes ahead of
C up and vp for the lfetch instructions in the main loop.
.grt8: ld8 v1 = [vp], 8 C M01
ld8 u1 = [up], 8 C M01
shr.u n = n, 3 C I0
;;
add r11 = 512, vp
ld8 v2 = [vp], 8 C M01
add r10 = 512, up
ld8 u2 = [up], 8 C M01
nop.i 0
nop.b 0
;;
ld8 v3 = [vp], 8 C M01
ld8 u3 = [up], 8 C M01
mov.i ar.lc = n C I0
br .LL000 C B
C Feed-in for n mod 8 == 1.  Without p15 (n not above 8) the single limb is
C finished at .Lcj1 with r8 preset to 0.
.Lb001: add rpx = 16, rp C M I
ADDSUB w0 = r10, r11 C M I
(p15) br.cond.dpnt .grt1 C B
;;
cmp.PRED p6, p0 = w0, r10 C M I
mov r8 = 0 C M I
br .Lcj1 C B
.grt1: ld8 v1 = [vp], 8 C M01
ld8 u1 = [up], 8 C M01
shr.u n = n, 3 C I0
;;
ld8 v2 = [vp], 8 C M01
ld8 u2 = [up], 8 C M01
cmp.ne p9, p0 = r0, r0 C read near Loop
;;
ld8 v3 = [vp], 8 C M01
ld8 u3 = [up], 8 C M01
mov.i ar.lc = n C I0
;;
ld8 v4 = [vp], 8 C M01
ld8 u4 = [up], 8 C M01
cmp.PRED p6, p0 = w0, r10 C M I
;;
ld8 v5 = [vp], 8 C M01
ld8 u5 = [up], 8 C M01
ADDSUB w1 = u1, v1 C M I
;;
ld8 v6 = [vp], 8 C M01
ld8 u6 = [up], 8 C M01
cmp.PRED p7, p0 = w1, u1 C M I
;;
ld8 v7 = [vp], 8 C M01
ld8 u7 = [up], 8 C M01
ADDSUB w2 = u2, v2 C M I
;;
add r11 = 512, vp
ld8 v0 = [vp], 8 C M01
add r10 = 512, up
ld8 u0 = [up], 8 C M01
br.cloop.dptk .Loop C B
br .Lcj9 C B
C Feed-in for n mod 8 == 2.
.Lb010: ld8 v0 = [vp], 8 C M01
ld8 u0 = [up], 8 C M01
add rpx = 24, rp C M I
ADDSUB w7 = r10, r11 C M I
(p15) br.cond.dpnt .grt2 C B
;;
cmp.PRED p9, p0 = w7, r10 C M I
ADDSUB w0 = u0, v0 C M I
br .Lcj2 C B
.grt2: ld8 v1 = [vp], 8 C M01
ld8 u1 = [up], 8 C M01
shr.u n = n, 3 C I0
;;
ld8 v2 = [vp], 8 C M01
ld8 u2 = [up], 8 C M01
;;
ld8 v3 = [vp], 8 C M01
ld8 u3 = [up], 8 C M01
mov.i ar.lc = n C I0
;;
ld8 v4 = [vp], 8 C M01
ld8 u4 = [up], 8 C M01
;;
ld8 v5 = [vp], 8 C M01
ld8 u5 = [up], 8 C M01
cmp.PRED p9, p0 = w7, r10 C M I
;;
ld8 v6 = [vp], 8 C M01
ld8 u6 = [up], 8 C M01
ADDSUB w0 = u0, v0 C M I
;;
add r11 = 512, vp
ld8 v7 = [vp], 8 C M01
add r10 = 512, up
ld8 u7 = [up], 8 C M01
br .LL01x C B
C Feed-in for n mod 8 == 3.
.Lb011: ld8 v7 = [vp], 8 C M01
ld8 u7 = [up], 8 C M01
ADDSUB w6 = r10, r11 C M I
;;
ld8 v0 = [vp], 8 C M01
ld8 u0 = [up], 8 C M01
(p15) br.cond.dpnt .grt3 C B
;;
cmp.PRED p8, p0 = w6, r10 C M I
ADDSUB w7 = u7, v7 C M I
;;
st8 [rp] = w6, 8 C M23
cmp.PRED p9, p0 = w7, u7 C M I
br .Lcj3 C B
.grt3: ld8 v1 = [vp], 8 C M01
ld8 u1 = [up], 8 C M01
add rpx = 32, rp C M I
;;
ld8 v2 = [vp], 8 C M01
ld8 u2 = [up], 8 C M01
shr.u n = n, 3 C I0
;;
ld8 v3 = [vp], 8 C M01
ld8 u3 = [up], 8 C M01
cmp.PRED p8, p0 = w6, r10 C M I
;;
ld8 v4 = [vp], 8 C M01
ld8 u4 = [up], 8 C M01
mov.i ar.lc = n C I0
ADDSUB w7 = u7, v7 C M I
nop.i 0
nop.b 0
;;
ld8 v5 = [vp], 8 C M01
ld8 u5 = [up], 8 C M01
cmp.PRED p9, p0 = w7, u7 C M I
;;
add r11 = 512, vp
ld8 v6 = [vp], 8 C M01
add r10 = 512, up
ld8 u6 = [up], 8 C M01
(p8) cmp.eq.or p9, p0 = LIM, w7 C M I
;;
ld8 v7 = [vp], 8 C M01
ld8 u7 = [up], 8 C M01
(p8) add w7 = INCR, w7 C M I
st8 [rp] = w6, 8 C M23
ADDSUB w0 = u0, v0 C M I
br .LL01x C B
C Feed-in for n mod 8 == 4.
.Lb100: ld8 v6 = [vp], 8 C M01
ld8 u6 = [up], 8 C M01
add rpx = 8, rp C M I
;;
ld8 v7 = [vp], 8 C M01
ld8 u7 = [up], 8 C M01
ADDSUB w5 = r10, r11 C M I
;;
ld8 v0 = [vp], 8 C M01
ld8 u0 = [up], 8 C M01
(p15) br.cond.dpnt .grt4 C B
;;
cmp.PRED p7, p0 = w5, r10 C M I
ADDSUB w6 = u6, v6 C M I
;;
cmp.PRED p8, p0 = w6, u6 C M I
ADDSUB w7 = u7, v7 C M I
br .Lcj4 C B
.grt4: ld8 v1 = [vp], 8 C M01
ld8 u1 = [up], 8 C M01
shr.u n = n, 3 C I0
cmp.PRED p7, p0 = w5, r10 C M I
;;
ld8 v2 = [vp], 8 C M01
ld8 u2 = [up], 8 C M01
ADDSUB w6 = u6, v6 C M I
;;
ld8 v3 = [vp], 8 C M01
ld8 u3 = [up], 8 C M01
cmp.PRED p8, p0 = w6, u6 C M I
;;
ld8 v4 = [vp], 8 C M01
ld8 u4 = [up], 8 C M01
mov.i ar.lc = n C I0
;;
ld8 v5 = [vp], 8 C M01
ld8 u5 = [up], 8 C M01
ADDSUB w7 = u7, v7 C M I
;;
add r11 = 512, vp
ld8 v6 = [vp], 8 C M01
add r10 = 512, up
ld8 u6 = [up], 8 C M01
cmp.PRED p9, p0 = w7, u7 C M I
;;
ld8 v7 = [vp], 8 C M01
ld8 u7 = [up], 8 C M01
(p7) cmp.eq.or p8, p0 = LIM, w6 C M I
(p7) add w6 = INCR, w6 C M I
br .LL100 C B
C Feed-in for n mod 8 == 5.
.Lb101: ld8 v5 = [vp], 8 C M01
ld8 u5 = [up], 8 C M01
add rpx = 16, rp C M I
;;
ld8 v6 = [vp], 8 C M01
ld8 u6 = [up], 8 C M01
ADDSUB w4 = r10, r11 C M I
;;
ld8 v7 = [vp], 8 C M01
ld8 u7 = [up], 8 C M01
cmp.PRED p6, p0 = w4, r10 C M I
;;
ld8 v0 = [vp], 8 C M01
ld8 u0 = [up], 8 C M01
ADDSUB w5 = u5, v5 C M I
shr.u n = n, 3 C I0
(p15) br.cond.dpnt .grt5 C B
;;
cmp.PRED p7, p0 = w5, u5 C M I
ADDSUB w6 = u6, v6 C M I
br .Lcj5 C B
.grt5: ld8 v1 = [vp], 8 C M01
ld8 u1 = [up], 8 C M01
;;
ld8 v2 = [vp], 8 C M01
ld8 u2 = [up], 8 C M01
mov.i ar.lc = n C I0
;;
ld8 v3 = [vp], 8 C M01
ld8 u3 = [up], 8 C M01
cmp.PRED p7, p0 = w5, u5 C M I
;;
ld8 v4 = [vp], 8 C M01
ld8 u4 = [up], 8 C M01
ADDSUB w6 = u6, v6 C M I
;;
add r11 = 512, vp
ld8 v5 = [vp], 8 C M01
add r10 = 512, up
ld8 u5 = [up], 8 C M01
br .LL101 C B
C Feed-in for n mod 8 == 6.  p14 set means n is not above 8, so the loaded
C limbs are finished in the wind-down code at .Lcj67.
.Lb110: ld8 v4 = [vp], 8 C M01
ld8 u4 = [up], 8 C M01
add rpx = 24, rp C M I
;;
ld8 v5 = [vp], 8 C M01
ld8 u5 = [up], 8 C M01
ADDSUB w3 = r10, r11 C M I
;;
ld8 v6 = [vp], 8 C M01
ld8 u6 = [up], 8 C M01
shr.u n = n, 3 C I0
;;
ld8 v7 = [vp], 8 C M01
ld8 u7 = [up], 8 C M01
cmp.PRED p9, p0 = w3, r10 C M I
;;
ld8 v0 = [vp], 8 C M01
ld8 u0 = [up], 8 C M01
ADDSUB w4 = u4, v4 C M I
(p14) br.cond.dptk .Lcj67 C B
;;
.grt6: ld8 v1 = [vp], 8 C M01
ld8 u1 = [up], 8 C M01
mov.i ar.lc = n C I0
cmp.PRED p9, p0 = w3, r10 C M I
nop.i 0
nop.b 0
;;
ld8 v2 = [vp], 8 C M01
ld8 u2 = [up], 8 C M01
ADDSUB w4 = u4, v4 C M I
;;
add r11 = 512, vp
ld8 v3 = [vp], 8 C M01
add r10 = 512, up
ld8 u3 = [up], 8 C M01
br .LL11x C B
C Feed-in for n mod 8 == 7.
.Lb111: ld8 v3 = [vp], 8 C M01
ld8 u3 = [up], 8 C M01
add rpx = 32, rp C M I
;;
ld8 v4 = [vp], 8 C M01
ld8 u4 = [up], 8 C M01
ADDSUB w2 = r10, r11 C M I
;;
ld8 v5 = [vp], 8 C M01
ld8 u5 = [up], 8 C M01
cmp.PRED p8, p0 = w2, r10 C M I
;;
ld8 v6 = [vp], 8 C M01
ld8 u6 = [up], 8 C M01
ADDSUB w3 = u3, v3 C M I
;;
ld8 v7 = [vp], 8 C M01
ld8 u7 = [up], 8 C M01
cmp.PRED p9, p0 = w3, u3 C M I
;;
ld8 v0 = [vp], 8 C M01
ld8 u0 = [up], 8 C M01
(p15) br.cond.dpnt .grt7 C B
;;
st8 [rp] = w2, 8 C M23
(p8) cmp.eq.or p9, p0 = LIM, w3 C M I
(p8) add w3 = INCR, w3 C M I
ADDSUB w4 = u4, v4 C M I
br .Lcj67 C B
.grt7: ld8 v1 = [vp], 8 C M01
ld8 u1 = [up], 8 C M01
shr.u n = n, 3 C I0
(p8) cmp.eq.or p9, p0 = LIM, w3 C M I
nop.i 0
nop.b 0
;;
add r11 = 512, vp
ld8 v2 = [vp], 8 C M01
add r10 = 512, up
ld8 u2 = [up], 8 C M01
(p8) add w3 = INCR, w3 C M I
nop.b 0
;;
ld8 v3 = [vp], 8 C M01
ld8 u3 = [up], 8 C M01
mov.i ar.lc = n C I0
st8 [rp] = w2, 8 C M23
ADDSUB w4 = u4, v4 C M I
br .LL11x C B
C *** MAIN LOOP START ***
C Eight limbs per iteration, counted by ar.lc through br.cloop.  Results are
C stored through both rp and rpx, and r10 and r11 are advanced by lfetch.
ALIGN(32)
.Loop: ld8 v1 = [vp], 8 C M01
cmp.PRED p7, p0 = w1, u1 C M I
(p9) cmp.eq.or p6, p0 = LIM, w0 C M I
ld8 u1 = [up], 8 C M01
(p9) add w0 = INCR, w0 C M I
ADDSUB w2 = u2, v2 C M I
;;
ld8 v2 = [vp], 8 C M01
cmp.PRED p8, p0 = w2, u2 C M I
(p6) cmp.eq.or p7, p0 = LIM, w1 C M I
ld8 u2 = [up], 8 C M01
(p6) add w1 = INCR, w1 C M I
ADDSUB w3 = u3, v3 C M I
;;
st8 [rp] = w0, 8 C M23
ld8 v3 = [vp], 8 C M01
cmp.PRED p9, p0 = w3, u3 C M I
(p7) cmp.eq.or p8, p0 = LIM, w2 C M I
ld8 u3 = [up], 8 C M01
(p7) add w2 = INCR, w2 C M I
;;
.LL000: st8 [rp] = w1, 16 C M23
st8 [rpx] = w2, 32 C M23
(p8) cmp.eq.or p9, p0 = LIM, w3 C M I
lfetch [r10], 64
(p8) add w3 = INCR, w3 C M I
ADDSUB w4 = u4, v4 C M I
;;
.LL11x: st8 [rp] = w3, 8 C M23
ld8 v4 = [vp], 8 C M01
cmp.PRED p6, p0 = w4, u4 C M I
ld8 u4 = [up], 8 C M01
ADDSUB w5 = u5, v5 C M I
;;
ld8 v5 = [vp], 8 C M01
cmp.PRED p7, p0 = w5, u5 C M I
(p9) cmp.eq.or p6, p0 = LIM, w4 C M I
ld8 u5 = [up], 8 C M01
(p9) add w4 = INCR, w4 C M I
ADDSUB w6 = u6, v6 C M I
;;
.LL101: ld8 v6 = [vp], 8 C M01
cmp.PRED p8, p0 = w6, u6 C M I
(p6) cmp.eq.or p7, p0 = LIM, w5 C M I
ld8 u6 = [up], 8 C M01
(p6) add w5 = INCR, w5 C M I
ADDSUB w7 = u7, v7 C M I
;;
st8 [rp] = w4, 8 C M23
ld8 v7 = [vp], 8 C M01
cmp.PRED p9, p0 = w7, u7 C M I
(p7) cmp.eq.or p8, p0 = LIM, w6 C M I
ld8 u7 = [up], 8 C M01
(p7) add w6 = INCR, w6 C M I
;;
.LL100: st8 [rp] = w5, 16 C M23
st8 [rpx] = w6, 32 C M23
(p8) cmp.eq.or p9, p0 = LIM, w7 C M I
lfetch [r11], 64
(p8) add w7 = INCR, w7 C M I
ADDSUB w0 = u0, v0 C M I
;;
.LL01x: st8 [rp] = w7, 8 C M23
ld8 v0 = [vp], 8 C M01
cmp.PRED p6, p0 = w0, u0 C M I
ld8 u0 = [up], 8 C M01
ADDSUB w1 = u1, v1 C M I
br.cloop.dptk .Loop C B
;;
C *** MAIN LOOP END ***
C Wind-down: same steps as the loop body but with no further loads.  The
C .LcjN labels are the entry points used by the short operand feed-ins.
cmp.PRED p7, p0 = w1, u1 C M I
(p9) cmp.eq.or p6, p0 = LIM, w0 C M I
(p9) add w0 = INCR, w0 C M I
ADDSUB w2 = u2, v2 C M I
;;
.Lcj9: cmp.PRED p8, p0 = w2, u2 C M I
(p6) cmp.eq.or p7, p0 = LIM, w1 C M I
st8 [rp] = w0, 8 C M23
(p6) add w1 = INCR, w1 C M I
ADDSUB w3 = u3, v3 C M I
;;
cmp.PRED p9, p0 = w3, u3 C M I
(p7) cmp.eq.or p8, p0 = LIM, w2 C M I
(p7) add w2 = INCR, w2 C M I
;;
.Lcj8: st8 [rp] = w1, 16 C M23
st8 [rpx] = w2, 32 C M23
(p8) cmp.eq.or p9, p0 = LIM, w3 C M I
(p8) add w3 = INCR, w3 C M I
ADDSUB w4 = u4, v4 C M I
;;
.Lcj67: st8 [rp] = w3, 8 C M23
cmp.PRED p6, p0 = w4, u4 C M I
ADDSUB w5 = u5, v5 C M I
;;
cmp.PRED p7, p0 = w5, u5 C M I
(p9) cmp.eq.or p6, p0 = LIM, w4 C M I
(p9) add w4 = INCR, w4 C M I
ADDSUB w6 = u6, v6 C M I
;;
.Lcj5: cmp.PRED p8, p0 = w6, u6 C M I
(p6) cmp.eq.or p7, p0 = LIM, w5 C M I
st8 [rp] = w4, 8 C M23
(p6) add w5 = INCR, w5 C M I
ADDSUB w7 = u7, v7 C M I
;;
.Lcj4: cmp.PRED p9, p0 = w7, u7 C M I
(p7) cmp.eq.or p8, p0 = LIM, w6 C M I
(p7) add w6 = INCR, w6 C M I
;;
st8 [rp] = w5, 16 C M23
st8 [rpx] = w6, 32 C M23
.Lcj3:
(p8) cmp.eq.or p9, p0 = LIM, w7 C M I
(p8) add w7 = INCR, w7 C M I
ADDSUB w0 = u0, v0 C M I
;;
.Lcj2: st8 [rp] = w7, 8 C M23
cmp.PRED p6, p0 = w0, u0 C M I
;;
(p9) cmp.eq.or p6, p0 = LIM, w0 C M I
(p9) add w0 = INCR, w0 C M I
mov r8 = 0 C M I
;;
C Store the last limb, restore ar.lc from r2 and return: r8 was preset to 0
C and becomes 1 when p6 records a carry or borrow out of the last limb.
.Lcj1: st8 [rp] = w0, 8 C M23
mov.i ar.lc = r2 C I0
(p6) mov r8 = 1 C M I
br.ret.sptk.many b0 C B
EPILOGUE()
ASM_END()

View File

@ -25,6 +25,8 @@ include(`../config.m4')
C cycles/limb
C 68040: 6
define(OPERATION_add_n,1)
ifdef(`OPERATION_add_n',`
define(M4_inst, addxl)
define(M4_function_n, mpn_add_n)

93
mpn/m68k/sub_n.asm Normal file
View File

@ -0,0 +1,93 @@
dnl mc68020 mpn_add_n, mpn_sub_n -- add or subtract limb vectors (this copy is fixed to mpn_sub_n)
dnl Copyright 1992, 1994, 1996, 1999, 2000, 2001, 2002, 2003, 2005 Free
dnl Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or
dnl modify it under the terms of the GNU Lesser General Public License as
dnl published by the Free Software Foundation; either version 2.1 of the
dnl License, or (at your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful,
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
dnl Lesser General Public License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
dnl not, write to the Free Software Foundation, Inc., 51 Franklin Street,
dnl Fifth Floor, Boston, MA 02110-1301, USA.
include(`../config.m4')
C cycles/limb
C 68040: 6
C This copy of the source is fixed to the subtract operation.  The macro
C name is quoted so that it is not expanded before being defined in case it
C already exists (for instance from an m4 command line definition).
define(`OPERATION_sub_n',1)
C Select the extended add or subtract instruction and the entry point name.
C The tests are nested with the add test first, so the add variant would be
C chosen if OPERATION_add_n were defined as well.
ifdef(`OPERATION_add_n',`
define(M4_inst, addxl)
define(M4_function_n, mpn_add_n)
',`ifdef(`OPERATION_sub_n',`
define(M4_inst, subxl)
define(M4_function_n, mpn_sub_n)
',
`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
')')')
C NOTE(review): the list below still names mpn_add_n although this copy
C only assembles mpn_sub_n -- confirm whether configure relies on it.
MULFUNC_PROLOGUE(mpn_add_n mpn_sub_n)
C INPUT PARAMETERS
C res_ptr (sp + 4)
C s1_ptr (sp + 8)
C s2_ptr (sp + 12)
C size (sp + 16)
C
C Applies M4_inst limb by limb to the size limbs at s1_ptr and s2_ptr,
C stores the results at res_ptr and returns the final carry or borrow
C (0 or 1) in d0.  The loop body handles two limbs per pass.
PROLOGUE(M4_function_n)
C Save used registers on the stack.
movel d2, M(-,sp)
movel a2, M(-,sp)
C Copy the arguments to registers. Better use movem?
C The offsets are 8 larger than in the list above because of the two
C registers just pushed.
movel M(sp,12), a2
movel M(sp,16), a0
movel M(sp,20), a1
movel M(sp,24), d2
C Flip the low bit of size and shift it out: carry clear (original size
C odd) enters at L(L1) to do one limb first, otherwise the pair count is
C reduced by one for dbf and the loop is entered at the top.
eorw #1, d2
lsrl #1, d2
bcc L(L1)
subql #1, d2 C clears cy as side effect
L(Loop):
movel M(a0,+), d0
movel M(a1,+), d1
M4_inst d1, d0
movel d0, M(a2,+)
L(L1): movel M(a0,+), d0
movel M(a1,+), d1
M4_inst d1, d0
movel d0, M(a2,+)
dbf d2, L(Loop) C loop until 16 lsb of d2 == -1
C dbf only counts in the low 16 bits of d2, so the high 16 bits are
C handled here: save the carry, step the high half down, and go round
C again unless that borrows.
subxl d0, d0 C d0 <= -cy; save cy as 0 or -1 in d0
subl #0x10000, d2
bcs L(L2)
addl d0, d0 C restore cy
bra L(Loop)
L(L2):
C Turn the saved 0 or -1 into the 0 or 1 return value.
negl d0
C Restore used registers from stack frame.
movel M(sp,+), a2
movel M(sp,+), d2
rts
EPILOGUE(M4_function_n)

View File

@ -30,6 +30,7 @@ C K6: 3.5
C K7: 2.25
C P4: 8.75
define(OPERATION_add_n,1)
ifdef(`OPERATION_add_n',`
define(M4_inst, adcl)

View File

@ -35,6 +35,7 @@ dnl Maximum possible with the current code is 64.
deflit(UNROLL_COUNT, 16)
define(OPERATION_add_n,1)
ifdef(`OPERATION_add_n', `
define(M4_inst, adcl)

View File

@ -35,6 +35,7 @@ dnl Maximum possible with the current code is 64.
deflit(UNROLL_COUNT, 16)
define(OPERATION_sub_n,1)
ifdef(`OPERATION_add_n', `
define(M4_inst, adcl)

View File

@ -24,6 +24,7 @@ include(`../config.m4')
C K6: normal 3.25 cycles/limb, in-place 2.75 cycles/limb.
define(OPERATION_add_n,1)
ifdef(`OPERATION_add_n', `
define(M4_inst, adcl)

329
mpn/x86/k6/sub_n.asm Normal file
View File

@ -0,0 +1,329 @@
dnl AMD K6 mpn_add/sub_n -- mpn addition or subtraction (this copy is fixed to mpn_sub_n and mpn_sub_nc).
dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or
dnl modify it under the terms of the GNU Lesser General Public License as
dnl published by the Free Software Foundation; either version 2.1 of the
dnl License, or (at your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful,
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
dnl Lesser General Public License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
dnl not, write to the Free Software Foundation, Inc., 51 Franklin Street,
dnl Fifth Floor, Boston, MA 02110-1301, USA.
include(`../config.m4')
C K6: normal 3.25 cycles/limb, in-place 2.75 cycles/limb.
C This copy of the source is fixed to the subtract operation.  The macro
C name is quoted so that it is not expanded before being defined in case it
C already exists (for instance from an m4 command line definition).
define(`OPERATION_sub_n',1)
C Select the carry-propagating instruction, the two entry point names and
C the word used in the description below.  The tests are nested with the add
C test first, so the add variant would be chosen if OPERATION_add_n were
C defined as well.
ifdef(`OPERATION_add_n', `
define(M4_inst, adcl)
define(M4_function_n, mpn_add_n)
define(M4_function_nc, mpn_add_nc)
define(M4_description, add)
',`ifdef(`OPERATION_sub_n', `
define(M4_inst, sbbl)
define(M4_function_n, mpn_sub_n)
define(M4_function_nc, mpn_sub_nc)
define(M4_description, subtract)
',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
')')')
C NOTE(review): the list below still names the add entry points although
C this copy only assembles the sub ones -- confirm whether configure relies
C on it.
MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C mp_size_t size);
C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C mp_size_t size, mp_limb_t carry);
C
C Calculate src1,size M4_description src2,size, and store the result in
C dst,size. The return value is the carry bit from the top of the result
C (1 or 0).
C
C The _nc version accepts 1 or 0 for an initial carry into the low limb of
C the calculation. Note values other than 1 or 0 here will lead to garbage
C results.
C
C Instruction decoding limits a normal dst=src1+src2 operation to 3 c/l, and
C an in-place dst+=src to 2.5 c/l. The unrolled loops have 1 cycle/loop of
C loop control, which with 4 limbs/loop means an extra 0.25 c/l.
C Stack parameter locations.  FRAME is adjusted by FRAME_pushl after each
C push (presumably so that these stay valid as the stack grows -- confirm
C against the macro definitions).
define(PARAM_CARRY, `FRAME+20(%esp)')
define(PARAM_SIZE, `FRAME+16(%esp)')
define(PARAM_SRC2, `FRAME+12(%esp)')
define(PARAM_SRC1, `FRAME+8(%esp)')
define(PARAM_DST, `FRAME+4(%esp)')
deflit(`FRAME',0)
dnl minimum 5 because the unrolled code can't handle less
deflit(UNROLL_THRESHOLD, 5)
TEXT
ALIGN(32)
C The _nc entry point loads the caller supplied carry into eax and joins
C the common code; the plain entry point starts with eax = 0.
PROLOGUE(M4_function_nc)
movl PARAM_CARRY, %eax
jmp L(start)
EPILOGUE()
PROLOGUE(M4_function_n)
xorl %eax, %eax
L(start):
movl PARAM_SIZE, %ecx
pushl %ebx
FRAME_pushl()
movl PARAM_SRC1, %ebx
pushl %edi
FRAME_pushl()
movl PARAM_SRC2, %edx
cmpl $UNROLL_THRESHOLD, %ecx
movl PARAM_DST, %edi
jae L(unroll)
shrl %eax C initial carry flag
C offset 0x21 here, close enough to aligned
L(simple):
C eax scratch
C ebx src1
C ecx counter
C edx src2
C esi
C edi dst
C ebp
C
C The store to (%edi) could be done with a stosl; it'd be smaller
C code, but there's no speed gain and a cld would have to be added
C (per mpn/x86/README).
C
C Pointers are advanced with leal and the count with loop, neither of
C which changes the carry flag that M4_inst chains on.
movl (%ebx), %eax
leal 4(%ebx), %ebx
M4_inst (%edx), %eax
movl %eax, (%edi)
leal 4(%edi), %edi
leal 4(%edx), %edx
loop L(simple)
C movl rather than xorl so the carry flag survives until setc
movl $0, %eax
popl %edi
setc %al
popl %ebx
ret
C -----------------------------------------------------------------------------
L(unroll):
C eax carry
C ebx src1
C ecx counter
C edx src2
C esi
C edi dst
C ebp
C
C Choose the in-place code when dst equals src1 (and, in the add
C variant only, when dst equals src2).
cmpl %edi, %ebx
pushl %esi
je L(inplace)
ifdef(`OPERATION_add_n',`
cmpl %edi, %edx
je L(inplace_reverse)
')
C esi = size mod 4 leftover limbs, ecx = size rounded down to a multiple
C of 4.  The pointers are moved past the unrolled part and ecx negated so
C it counts up towards zero.
movl %ecx, %esi
andl $-4, %ecx
andl $3, %esi
leal (%ebx,%ecx,4), %ebx
leal (%edx,%ecx,4), %edx
leal (%edi,%ecx,4), %edi
negl %ecx
C move the initial carry from eax into the carry flag
shrl %eax
ALIGN(32)
L(normal_top):
C eax scratch
C ebx src1
C ecx counter, limbs, negative
C edx src2
C esi
C edi dst
C ebp
C
C Four limbs per pass: ecx is advanced by 5 with leal and loop then
C subtracts 1, so the flags are untouched and the loop ends when ecx
C reaches zero.  The -20 in the offsets compensates for the early leal.
movl (%ebx,%ecx,4), %eax
leal 5(%ecx), %ecx
M4_inst -20(%edx,%ecx,4), %eax
movl %eax, -20(%edi,%ecx,4)
movl 4-20(%ebx,%ecx,4), %eax
M4_inst 4-20(%edx,%ecx,4), %eax
movl %eax, 4-20(%edi,%ecx,4)
movl 8-20(%ebx,%ecx,4), %eax
M4_inst 8-20(%edx,%ecx,4), %eax
movl %eax, 8-20(%edi,%ecx,4)
movl 12-20(%ebx,%ecx,4), %eax
M4_inst 12-20(%edx,%ecx,4), %eax
movl %eax, 12-20(%edi,%ecx,4)
loop L(normal_top)
C esi is 0 to 3 leftover limbs; ecx is zero here.  decl leaves the carry
C flag alone.
decl %esi
jz L(normal_finish_one)
js L(normal_done)
C two or three more limbs
movl (%ebx), %eax
M4_inst (%edx), %eax
movl %eax, (%edi)
movl 4(%ebx), %eax
M4_inst 4(%edx), %eax
decl %esi
movl %eax, 4(%edi)
jz L(normal_done)
movl $2, %ecx
L(normal_finish_one):
movl (%ebx,%ecx,4), %eax
M4_inst (%edx,%ecx,4), %eax
movl %eax, (%edi,%ecx,4)
L(normal_done):
popl %esi
popl %edi
C movl rather than xorl so the carry flag survives until setc
movl $0, %eax
popl %ebx
setc %al
ret
C -----------------------------------------------------------------------------
ifdef(`OPERATION_add_n',`
L(inplace_reverse):
C dst==src2
movl %ebx, %edx
')
L(inplace):
C eax initial carry
C ebx
C ecx size
C edx src
C esi
C edi dst
C ebp
C
C esi = (size-1) mod 4 and ecx = size-1 rounded down to a multiple of 4,
C so between 1 and 4 limbs are left for the code after the loop.
leal -1(%ecx), %esi
decl %ecx
andl $-4, %ecx
andl $3, %esi
movl (%edx), %ebx C src low limb
leal (%edx,%ecx,4), %edx
leal (%edi,%ecx,4), %edi
negl %ecx
C move the initial carry from eax into the carry flag
shrl %eax
ALIGN(32)
L(inplace_top):
C eax scratch
C ebx next src limb
C ecx counter, limbs, negative
C edx src
C esi
C edi dst
C ebp
M4_inst %ebx, (%edi,%ecx,4)
movl 4(%edx,%ecx,4), %eax
leal 5(%ecx), %ecx
M4_inst %eax, 4-20(%edi,%ecx,4)
movl 8-20(%edx,%ecx,4), %eax
movl 12-20(%edx,%ecx,4), %ebx
M4_inst %eax, 8-20(%edi,%ecx,4)
M4_inst %ebx, 12-20(%edi,%ecx,4)
movl 16-20(%edx,%ecx,4), %ebx
loop L(inplace_top)
C now %esi is 0 to 3 representing respectively 1 to 4 limbs more
M4_inst %ebx, (%edi)
decl %esi
jz L(inplace_finish_one)
js L(inplace_done)
C two or three more limbs
movl 4(%edx), %eax
movl 8(%edx), %ebx
M4_inst %eax, 4(%edi)
M4_inst %ebx, 8(%edi)
decl %esi
movl $2, %ecx
C L(normal_done) performs the same epilogue as L(inplace_done)
jz L(normal_done)
L(inplace_finish_one):
movl 4(%edx,%ecx,4), %eax
M4_inst %eax, 4(%edi,%ecx,4)
L(inplace_done):
popl %esi
popl %edi
C movl rather than xorl so the carry flag survives until setc
movl $0, %eax
popl %ebx
setc %al
ret
EPILOGUE()

250
mpn/x86/k7/add_n.asm Normal file
View File

@ -0,0 +1,250 @@
dnl AMD K7 mpn_add_n, mpn_add_nc -- mpn addition.
dnl Copyright 1999, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or
dnl modify it under the terms of the GNU Lesser General Public License as
dnl published by the Free Software Foundation; either version 2.1 of the
dnl License, or (at your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful,
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
dnl Lesser General Public License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
dnl not, write to the Free Software Foundation, Inc., 51 Franklin Street,
dnl Fifth Floor, Boston, MA 02110-1301, USA.
include(`../config.m4')
C K7: 1.64 cycles/limb (at 16 limbs/loop).
dnl K7: UNROLL_COUNT cycles/limb
dnl 8 1.9
dnl 16 1.64
dnl 32 1.7
dnl 64 2.0
dnl Maximum possible with the current code is 64.
deflit(UNROLL_COUNT, 16)
dnl  This file builds only the addition entry points; subtraction lives in
dnl  the sibling sub_n.asm.  OPERATION_add_n is still defined for any code
dnl  that tests it, with the name quoted so that a definition already given
dnl  on the m4 command line is not expanded before define sees it.
define(`OPERATION_add_n',1)
dnl  M4_inst is the carry-propagating instruction used for every limb,
dnl  M4_function_n and M4_function_nc are the two entry point names, and
dnl  M4_description is the word used in the descriptive comments.
define(`M4_inst',        `adcl')
define(`M4_function_n',  `mpn_add_n')
define(`M4_function_nc', `mpn_add_nc')
define(`M4_description', `add')
dnl  configure greps for this line to learn which functions the file
dnl  provides (the PROLOGUEs below use M4_ names, which it cannot resolve),
dnl  so list only the functions actually emitted here.
MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc)
C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C mp_size_t size);
C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C mp_size_t size, mp_limb_t carry);
C
C Calculate src1,size M4_description src2,size, and store the result in
C dst,size. The return value is the carry bit from the top of the result (1
C or 0).
C
C The _nc version accepts 1 or 0 for an initial carry into the low limb of
C the calculation. Note values other than 1 or 0 here will lead to garbage
C results.
C
C This code runs at 1.64 cycles/limb, which might be the best possible with
C plain integer operations. Each limb is 2 loads and 1 store, any 2 of
C which can be done each cycle, leading to 1.5 c/l.
dnl Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
dnl  Sizes of at least UNROLL_THRESHOLD limbs are sent to the unrolled loop
dnl  (see the cmpl and jae after L(start)).  The PIC and non-PIC settings
dnl  are currently the same value.
ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 8)
',`
deflit(UNROLL_THRESHOLD, 8)
')
C Stack locations of the five arguments.
C NOTE(review): offsets presumably count from esp at function entry and are
C adjusted by FRAME -- confirm against defframe in the x86 m4 definitions.
defframe(PARAM_CARRY,20)
defframe(PARAM_SIZE, 16)
defframe(PARAM_SRC2, 12)
defframe(PARAM_SRC1, 8)
defframe(PARAM_DST, 4)
C Slots where ebp, esi, ebx and edi are saved, inside the STACK_SPACE bytes
C the function subtracts from esp.
defframe(SAVE_EBP, -4)
defframe(SAVE_ESI, -8)
defframe(SAVE_EBX, -12)
defframe(SAVE_EDI, -16)
deflit(STACK_SPACE, 16)
C M4_function_nc and M4_function_n share one body.  The _nc entry loads the
C caller supplied carry into eax and joins the plain entry at L(start); the
C plain entry gets there with eax zero.  Sizes below UNROLL_THRESHOLD use
C the one limb per pass loop at L(simple), larger sizes go to L(unroll).
C The return value in eax is the final carry flag, 0 or 1.
TEXT
ALIGN(32)
deflit(`FRAME',0)
PROLOGUE(M4_function_nc)
movl PARAM_CARRY, %eax
jmp L(start)
EPILOGUE()
PROLOGUE(M4_function_n)
xorl %eax, %eax C carry
L(start):
C eax initial carry
movl PARAM_SIZE, %ecx
subl $STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)
movl %edi, SAVE_EDI
movl %ebx, SAVE_EBX
cmpl $UNROLL_THRESHOLD, %ecx
movl PARAM_SRC2, %edx
movl PARAM_SRC1, %ebx
jae L(unroll) C unsigned size >= UNROLL_THRESHOLD
movl PARAM_DST, %edi
C Point all three pointers just past their operands and let ecx count up
C from -size to zero.
leal (%ebx,%ecx,4), %ebx
leal (%edx,%ecx,4), %edx
leal (%edi,%ecx,4), %edi
negl %ecx
shrl %eax C bit 0 of the initial carry goes into the carry flag
C This loop is in a single 16 byte code block already, so no
C alignment necessary.
L(simple):
C eax scratch
C ebx src1
C ecx counter
C edx src2
C esi
C edi dst
C ebp
movl (%ebx,%ecx,4), %eax
M4_inst (%edx,%ecx,4), %eax
movl %eax, (%edi,%ecx,4)
incl %ecx C incl leaves the carry flag alone, so the chain survives
jnz L(simple)
movl $0, %eax C movl rather than xorl, so the carry flag is kept for setc
movl SAVE_EDI, %edi
movl SAVE_EBX, %ebx
setc %al
addl $STACK_SPACE, %esp
ret
C -----------------------------------------------------------------------------
C This is at 0x55, close enough to aligned.
L(unroll):
C eax initial carry
C ebx src1
C ecx size
C edx src2
deflit(`FRAME',STACK_SPACE)
movl %ebp, SAVE_EBP
andl $-2, %ecx C size low bit masked out
andl $1, PARAM_SIZE C size low bit kept
movl %ecx, %edi
decl %ecx
movl PARAM_DST, %ebp
shrl $UNROLL_LOG2, %ecx C loop counter, (even size - 1) >> UNROLL_LOG2
negl %edi
movl %esi, SAVE_ESI
andl $UNROLL_MASK, %edi C (-even size) masked, scales the entry offset
C esi becomes the address to enter the unrolled code at, L(entry) plus
C nine times edi.
ifdef(`PIC',`
call L(pic_calc)
L(here):
',`
leal L(entry) (%edi,%edi,8), %esi C 9 bytes per
')
negl %edi
shrl %eax C bit 0 of the initial carry goes into the carry flag
C The pointers are moved back by the same edi limbs, so the first pass
C entered part way through still starts at limb zero.
leal ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx
leal ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx
leal ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi
jmp *%esi
ifdef(`PIC',`
L(pic_calc):
C See mpn/x86/README about old gas bugs
leal (%edi,%edi,8), %esi
addl $L(entry)-L(here), %esi
addl (%esp), %esi
ret_internal
')
C -----------------------------------------------------------------------------
ALIGN(32)
L(top):
C eax zero
C ebx src1
C ecx counter
C edx src2
C esi scratch (was computed jump)
C edi dst
C ebp scratch
C src2 is advanced here at the top rather than at the bottom, so after the
C final pass edx lags ebx and edi by UNROLL_BYTES.
leal UNROLL_BYTES(%edx), %edx
L(entry):
C The loop body is generated as UNROLL_COUNT/CHUNK_COUNT chunks, each one
C handling the two limbs at disp0 and disp1.
deflit(CHUNK_COUNT, 2)
forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
deflit(`disp1', eval(disp0 + 4))
Zdisp( movl, disp0,(%ebx), %esi)
movl disp1(%ebx), %ebp
Zdisp( M4_inst,disp0,(%edx), %esi)
Zdisp( movl, %esi, disp0,(%edi))
M4_inst disp1(%edx), %ebp
movl %ebp, disp1(%edi)
')
decl %ecx C decl and leal leave the carry flag alone
leal UNROLL_BYTES(%ebx), %ebx
leal UNROLL_BYTES(%edi), %edi
jns L(top)
mov PARAM_SIZE, %esi C the size low bit stored at L(unroll)
movl SAVE_EBP, %ebp
movl $0, %eax
decl %esi
js L(even)
C odd size, one more limb
movl (%ebx), %ecx
M4_inst UNROLL_BYTES(%edx), %ecx C edx was not advanced after the last pass
movl %ecx, (%edi)
L(even):
movl SAVE_EDI, %edi
movl SAVE_EBX, %ebx
setc %al C final carry is the return value
movl SAVE_ESI, %esi
addl $STACK_SPACE, %esp
ret
EPILOGUE()

250
mpn/x86/k7/sub_n.asm Normal file
View File

@ -0,0 +1,250 @@
dnl AMD K7 mpn_sub_n, mpn_sub_nc -- mpn subtraction.
dnl Copyright 1999, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or
dnl modify it under the terms of the GNU Lesser General Public License as
dnl published by the Free Software Foundation; either version 2.1 of the
dnl License, or (at your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful,
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
dnl Lesser General Public License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
dnl not, write to the Free Software Foundation, Inc., 51 Franklin Street,
dnl Fifth Floor, Boston, MA 02110-1301, USA.
include(`../config.m4')
C K7: 1.64 cycles/limb (at 16 limbs/loop).
dnl K7: UNROLL_COUNT cycles/limb
dnl 8 1.9
dnl 16 1.64
dnl 32 1.7
dnl 64 2.0
dnl Maximum possible with the current code is 64.
deflit(UNROLL_COUNT, 16)
dnl  This file builds only the subtraction entry points; addition lives in
dnl  the sibling add_n.asm.  OPERATION_sub_n is still defined for any code
dnl  that tests it, with the name quoted so that a definition already given
dnl  on the m4 command line is not expanded before define sees it.
define(`OPERATION_sub_n',1)
dnl  The settings are made directly instead of through an ifdef chain, which
dnl  tested OPERATION_add_n first and so would have built the add variant if
dnl  that macro happened to be defined as well.
dnl  M4_inst is the borrow-propagating instruction used for every limb,
dnl  M4_function_n and M4_function_nc are the two entry point names, and
dnl  M4_description is the word used in the descriptive comments.
define(`M4_inst',        `sbbl')
define(`M4_function_n',  `mpn_sub_n')
define(`M4_function_nc', `mpn_sub_nc')
define(`M4_description', `subtract')
dnl  configure greps for this line to learn which functions the file
dnl  provides (the PROLOGUEs below use M4_ names, which it cannot resolve),
dnl  so list only the functions actually emitted here.
MULFUNC_PROLOGUE(mpn_sub_n mpn_sub_nc)
C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C mp_size_t size);
C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C mp_size_t size, mp_limb_t carry);
C
C Calculate src1,size M4_description src2,size, and store the result in
C dst,size. The return value is the carry bit from the top of the result (1
C or 0).
C
C The _nc version accepts 1 or 0 for an initial carry into the low limb of
C the calculation. Note values other than 1 or 0 here will lead to garbage
C results.
C
C This code runs at 1.64 cycles/limb, which might be the best possible with
C plain integer operations. Each limb is 2 loads and 1 store, any 2 of
C which can be done each cycle, leading to 1.5 c/l.
dnl Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
dnl  Sizes of at least UNROLL_THRESHOLD limbs are sent to the unrolled loop
dnl  (see the cmpl and jae after L(start)).  The PIC and non-PIC settings
dnl  are currently the same value.
ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 8)
',`
deflit(UNROLL_THRESHOLD, 8)
')
C Stack locations of the five arguments.
C NOTE(review): offsets presumably count from esp at function entry and are
C adjusted by FRAME -- confirm against defframe in the x86 m4 definitions.
defframe(PARAM_CARRY,20)
defframe(PARAM_SIZE, 16)
defframe(PARAM_SRC2, 12)
defframe(PARAM_SRC1, 8)
defframe(PARAM_DST, 4)
C Slots where ebp, esi, ebx and edi are saved, inside the STACK_SPACE bytes
C the function subtracts from esp.
defframe(SAVE_EBP, -4)
defframe(SAVE_ESI, -8)
defframe(SAVE_EBX, -12)
defframe(SAVE_EDI, -16)
deflit(STACK_SPACE, 16)
C M4_function_nc and M4_function_n share one body.  The _nc entry loads the
C caller supplied carry into eax and joins the plain entry at L(start); the
C plain entry gets there with eax zero.  Sizes below UNROLL_THRESHOLD use
C the one limb per pass loop at L(simple), larger sizes go to L(unroll).
C The return value in eax is the final carry flag, 0 or 1.
TEXT
ALIGN(32)
deflit(`FRAME',0)
PROLOGUE(M4_function_nc)
movl PARAM_CARRY, %eax
jmp L(start)
EPILOGUE()
PROLOGUE(M4_function_n)
xorl %eax, %eax C carry
L(start):
C eax initial carry
movl PARAM_SIZE, %ecx
subl $STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)
movl %edi, SAVE_EDI
movl %ebx, SAVE_EBX
cmpl $UNROLL_THRESHOLD, %ecx
movl PARAM_SRC2, %edx
movl PARAM_SRC1, %ebx
jae L(unroll) C unsigned size >= UNROLL_THRESHOLD
movl PARAM_DST, %edi
C Point all three pointers just past their operands and let ecx count up
C from -size to zero.
leal (%ebx,%ecx,4), %ebx
leal (%edx,%ecx,4), %edx
leal (%edi,%ecx,4), %edi
negl %ecx
shrl %eax C bit 0 of the initial carry goes into the carry flag
C This loop is in a single 16 byte code block already, so no
C alignment necessary.
L(simple):
C eax scratch
C ebx src1
C ecx counter
C edx src2
C esi
C edi dst
C ebp
movl (%ebx,%ecx,4), %eax
M4_inst (%edx,%ecx,4), %eax
movl %eax, (%edi,%ecx,4)
incl %ecx C incl leaves the carry flag alone, so the chain survives
jnz L(simple)
movl $0, %eax C movl rather than xorl, so the carry flag is kept for setc
movl SAVE_EDI, %edi
movl SAVE_EBX, %ebx
setc %al
addl $STACK_SPACE, %esp
ret
C -----------------------------------------------------------------------------
C This is at 0x55, close enough to aligned.
L(unroll):
C eax initial carry
C ebx src1
C ecx size
C edx src2
deflit(`FRAME',STACK_SPACE)
movl %ebp, SAVE_EBP
andl $-2, %ecx C size low bit masked out
andl $1, PARAM_SIZE C size low bit kept
movl %ecx, %edi
decl %ecx
movl PARAM_DST, %ebp
shrl $UNROLL_LOG2, %ecx C loop counter, (even size - 1) >> UNROLL_LOG2
negl %edi
movl %esi, SAVE_ESI
andl $UNROLL_MASK, %edi C (-even size) masked, scales the entry offset
C esi becomes the address to enter the unrolled code at, L(entry) plus
C nine times edi.
ifdef(`PIC',`
call L(pic_calc)
L(here):
',`
leal L(entry) (%edi,%edi,8), %esi C 9 bytes per
')
negl %edi
shrl %eax C bit 0 of the initial carry goes into the carry flag
C The pointers are moved back by the same edi limbs, so the first pass
C entered part way through still starts at limb zero.
leal ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx
leal ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx
leal ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi
jmp *%esi
ifdef(`PIC',`
L(pic_calc):
C See mpn/x86/README about old gas bugs
leal (%edi,%edi,8), %esi
addl $L(entry)-L(here), %esi
addl (%esp), %esi
ret_internal
')
C -----------------------------------------------------------------------------
ALIGN(32)
L(top):
C eax zero
C ebx src1
C ecx counter
C edx src2
C esi scratch (was computed jump)
C edi dst
C ebp scratch
C src2 is advanced here at the top rather than at the bottom, so after the
C final pass edx lags ebx and edi by UNROLL_BYTES.
leal UNROLL_BYTES(%edx), %edx
L(entry):
C The loop body is generated as UNROLL_COUNT/CHUNK_COUNT chunks, each one
C handling the two limbs at disp0 and disp1.
deflit(CHUNK_COUNT, 2)
forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
deflit(`disp1', eval(disp0 + 4))
Zdisp( movl, disp0,(%ebx), %esi)
movl disp1(%ebx), %ebp
Zdisp( M4_inst,disp0,(%edx), %esi)
Zdisp( movl, %esi, disp0,(%edi))
M4_inst disp1(%edx), %ebp
movl %ebp, disp1(%edi)
')
decl %ecx C decl and leal leave the carry flag alone
leal UNROLL_BYTES(%ebx), %ebx
leal UNROLL_BYTES(%edi), %edi
jns L(top)
mov PARAM_SIZE, %esi C the size low bit stored at L(unroll)
movl SAVE_EBP, %ebp
movl $0, %eax
decl %esi
js L(even)
C odd size, one more limb
movl (%ebx), %ecx
M4_inst UNROLL_BYTES(%edx), %ecx C edx was not advanced after the last pass
movl %ecx, (%edi)
L(even):
movl SAVE_EDI, %edi
movl SAVE_EBX, %ebx
setc %al C final carry is the return value
movl SAVE_ESI, %esi
addl $STACK_SPACE, %esp
ret
EPILOGUE()

View File

@ -39,5 +39,4 @@ C couple of experiments didn't get much joy, but such an approach would at
C least avoid serialization, presumably.
C
MULFUNC_PROLOGUE(mpn_add_n mpn_sub_n)
include_mpn(`x86/k7/aors_n.asm')
include_mpn(`x86/k7/add_n.asm')

42
mpn/x86/p6/sub_n.asm Normal file
View File

@ -0,0 +1,42 @@
dnl Intel P6 mpn_sub_n -- mpn subtraction.
dnl Copyright 2003 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or
dnl modify it under the terms of the GNU Lesser General Public License as
dnl published by the Free Software Foundation; either version 2.1 of the
dnl License, or (at your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful,
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
dnl Lesser General Public License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
dnl not, write to the Free Software Foundation, Inc., 51 Franklin Street,
dnl Fifth Floor, Boston, MA 02110-1301, USA.
include(`../config.m4')
C P6: 2.7 cycles/limb
C The K7 code runs quite well on P6, but this seems mainly due to the larger
C amount of unrolling than in mpn/x86/sub_n.asm.
C
C P6 apparently doesn't separately rename the carry flag, or something, so a
C loop holding a carry across decl or incl takes 4 cycles for the loop
C control. Perhaps it's more when relying on out-of-order execution to hide
C load latencies too.
C
C Not sure what the best approach would be. sbbl then addl to save and
C restore the carry across the loop control would be a possibility. A
C couple of experiments didn't get much joy, but such an approach would at
C least avoid serialization, presumably.
C
dnl  This wrapper has no PROLOGUE of its own, so the function it provides is
dnl  announced here for configure to grep, in the same way as the P6 add_n
dnl  wrapper does.  The code itself comes from the K7 file.
MULFUNC_PROLOGUE(mpn_sub_n)
include_mpn(`x86/k7/sub_n.asm')

View File

@ -25,6 +25,7 @@ include(`../config.m4')
C P5: 2.375 cycles/limb
define(OPERATION_add_n,1)
ifdef(`OPERATION_add_n',`
define(M4_inst, adcl)

196
mpn/x86/pentium/sub_n.asm Normal file
View File

@ -0,0 +1,196 @@
dnl Intel Pentium mpn_sub_n, mpn_sub_nc -- mpn subtraction.
dnl Copyright 1992, 1994, 1995, 1996, 1999, 2000, 2002 Free Software
dnl Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or
dnl modify it under the terms of the GNU Lesser General Public License as
dnl published by the Free Software Foundation; either version 2.1 of the
dnl License, or (at your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful,
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
dnl Lesser General Public License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
dnl not, write to the Free Software Foundation, Inc., 51 Franklin Street,
dnl Fifth Floor, Boston, MA 02110-1301, USA.
include(`../config.m4')
C P5: 2.375 cycles/limb
dnl  This file builds only the subtraction entry points; addition lives in
dnl  the sibling add_n.asm.  OPERATION_sub_n is still defined for any code
dnl  that tests it, with the name quoted so that a definition already given
dnl  on the m4 command line is not expanded before define sees it.
define(`OPERATION_sub_n',1)
dnl  The settings are made directly instead of through an ifdef chain, which
dnl  tested OPERATION_add_n first and so would have built the add variant if
dnl  that macro happened to be defined as well.
dnl  M4_inst is the borrow-propagating instruction used for every limb;
dnl  M4_function_n and M4_function_nc are the two entry point names.
define(`M4_inst',        `sbbl')
define(`M4_function_n',  `mpn_sub_n')
define(`M4_function_nc', `mpn_sub_nc')
dnl  configure greps for this line to learn which functions the file
dnl  provides (the PROLOGUEs below use M4_ names, which it cannot resolve),
dnl  so list only the functions actually emitted here.
MULFUNC_PROLOGUE(mpn_sub_n mpn_sub_nc)
C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C mp_size_t size);
C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C mp_size_t size, mp_limb_t carry);
C Stack locations of the five arguments.
defframe(PARAM_CARRY,20)
defframe(PARAM_SIZE, 16)
defframe(PARAM_SRC2, 12)
defframe(PARAM_SRC1, 8)
defframe(PARAM_DST, 4)
TEXT
ALIGN(8)
C M4_function_nc does the same set-up as M4_function_n below, then puts
C bit 0 of PARAM_CARRY into the carry flag and joins the shared code at
C L(oop), or at L(end) when there is no full pass of the unrolled loop.
C
C Register use in both entry points:
C eax scratch
C ebx next src2 limb, always loaded one limb ahead
C ecx (size-1)/8, passes of the 8 limb unrolled loop
C edx (size-1) mod 8, limbs for L(oop2); scratch inside L(oop)
C esi src1
C edi dst
C ebp src2
C The one limb not counted by ecx and edx is handled at L(end2).
PROLOGUE(M4_function_nc)
pushl %edi
pushl %esi
pushl %ebx
pushl %ebp
deflit(`FRAME',16)
movl PARAM_DST,%edi
movl PARAM_SRC1,%esi
movl PARAM_SRC2,%ebp
movl PARAM_SIZE,%ecx
movl (%ebp),%ebx
decl %ecx
movl %ecx,%edx
shrl $3,%ecx
andl $7,%edx
testl %ecx,%ecx C zero carry flag
jz L(endgo)
C edx is used as scratch in L(oop), so the leftover count is kept on the
C stack until the loop is finished.
pushl %edx
FRAME_pushl()
movl PARAM_CARRY,%eax
shrl $1,%eax C shift bit 0 into carry
jmp L(oop)
L(endgo):
C nothing was pushed on this path, so the frame is back at 16
deflit(`FRAME',16)
movl PARAM_CARRY,%eax
shrl $1,%eax C shift bit 0 into carry
jmp L(end)
EPILOGUE()
ALIGN(8)
PROLOGUE(M4_function_n)
pushl %edi
pushl %esi
pushl %ebx
pushl %ebp
deflit(`FRAME',16)
movl PARAM_DST,%edi
movl PARAM_SRC1,%esi
movl PARAM_SRC2,%ebp
movl PARAM_SIZE,%ecx
movl (%ebp),%ebx
decl %ecx
movl %ecx,%edx
shrl $3,%ecx
andl $7,%edx
testl %ecx,%ecx C zero carry flag
jz L(end)
C edx is used as scratch in L(oop), so the leftover count is kept on the
C stack until the loop is finished.
pushl %edx
FRAME_pushl()
ALIGN(8)
C Eight limbs per pass, in four pairs.  Only movl, leal, decl and M4_inst
C are used, and of those only M4_inst changes the carry flag, so the carry
C chain runs unbroken through the loop control.
L(oop): movl 28(%edi),%eax C fetch destination cache line
leal 32(%edi),%edi
L(1): movl (%esi),%eax
movl 4(%esi),%edx
M4_inst %ebx,%eax
movl 4(%ebp),%ebx
M4_inst %ebx,%edx
movl 8(%ebp),%ebx
movl %eax,-32(%edi)
movl %edx,-28(%edi)
L(2): movl 8(%esi),%eax
movl 12(%esi),%edx
M4_inst %ebx,%eax
movl 12(%ebp),%ebx
M4_inst %ebx,%edx
movl 16(%ebp),%ebx
movl %eax,-24(%edi)
movl %edx,-20(%edi)
L(3): movl 16(%esi),%eax
movl 20(%esi),%edx
M4_inst %ebx,%eax
movl 20(%ebp),%ebx
M4_inst %ebx,%edx
movl 24(%ebp),%ebx
movl %eax,-16(%edi)
movl %edx,-12(%edi)
L(4): movl 24(%esi),%eax
movl 28(%esi),%edx
M4_inst %ebx,%eax
movl 28(%ebp),%ebx
M4_inst %ebx,%edx
movl 32(%ebp),%ebx C first src2 limb for the next pass or the tail
movl %eax,-8(%edi)
movl %edx,-4(%edi)
leal 32(%esi),%esi
leal 32(%ebp),%ebp
decl %ecx
jnz L(oop)
popl %edx C leftover limb count saved before the loop
FRAME_popl()
L(end):
decl %edx C test %edx w/o clobbering carry
js L(end2)
incl %edx
C one limb per pass, edx times
L(oop2):
leal 4(%edi),%edi
movl (%esi),%eax
M4_inst %ebx,%eax
movl 4(%ebp),%ebx
movl %eax,-4(%edi)
leal 4(%esi),%esi
leal 4(%ebp),%ebp
decl %edx
jnz L(oop2)
L(end2):
C the final limb, its src2 value is already in ebx
movl (%esi),%eax
M4_inst %ebx,%eax
movl %eax,(%edi)
sbbl %eax,%eax C eax = 0 or -1 according to the carry flag
negl %eax C return value 0 or 1
popl %ebp
popl %ebx
popl %esi
popl %edi
ret
EPILOGUE()

195
mpn/x86/sub_n.asm Normal file
View File

@ -0,0 +1,195 @@
dnl x86 mpn_sub_n, mpn_sub_nc -- mpn subtraction.
dnl Copyright 1992, 1994, 1995, 1996, 1999, 2000, 2001, 2002 Free Software
dnl Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or
dnl modify it under the terms of the GNU Lesser General Public License as
dnl published by the Free Software Foundation; either version 2.1 of the
dnl License, or (at your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful,
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
dnl Lesser General Public License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
dnl not, write to the Free Software Foundation, Inc., 51 Franklin Street,
dnl Fifth Floor, Boston, MA 02110-1301, USA.
include(`../config.m4')
C cycles/limb
C P5: 3.375
C P6: 3.125
C K6: 3.5
C K7: 2.25
C P4: 8.75
dnl  This file builds only the subtraction entry points; addition lives in
dnl  the sibling add_n.asm.  OPERATION_sub_n is still defined for any code
dnl  that tests it, with the name quoted so that a definition already given
dnl  on the m4 command line is not expanded before define sees it.
define(`OPERATION_sub_n',1)
dnl  The settings are made directly instead of through an ifdef chain, which
dnl  tested OPERATION_add_n first and so would have built the add variant if
dnl  that macro happened to be defined as well.
dnl  M4_inst is the borrow-propagating instruction used for every limb;
dnl  M4_function_n and M4_function_nc are the two entry point names.
define(`M4_inst',        `sbbl')
define(`M4_function_n',  `mpn_sub_n')
define(`M4_function_nc', `mpn_sub_nc')
dnl  configure greps for this line to learn which functions the file
dnl  provides (the PROLOGUEs below use M4_ names, which it cannot resolve),
dnl  so list only the functions actually emitted here.
MULFUNC_PROLOGUE(mpn_sub_n mpn_sub_nc)
C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C mp_size_t size);
C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C mp_size_t size, mp_limb_t carry);
C Stack locations of the five arguments.
defframe(PARAM_CARRY,20)
defframe(PARAM_SIZE, 16)
defframe(PARAM_SRC2, 12)
defframe(PARAM_SRC1, 8)
defframe(PARAM_DST, 4)
TEXT
ALIGN(8)
C Both entry points run the 8 limb unrolled loop at L(oop) in M4_function_n.
C When size is not a multiple of 8 the first pass is entered part way
C through: eax = (-size) mod 8 limbs are skipped, the pointers are moved
C back by that many limbs to compensate, and the entry address is computed
C as L(oop) + 9*eax - 3.
C NOTE(review): that arithmetic implies 9 bytes of code per limb with the
C first limb 3 bytes shorter -- confirm if any loop instruction is changed.
C
C M4_function_nc additionally puts bit 0 of PARAM_CARRY into the carry flag
C just before entering the loop.
PROLOGUE(M4_function_nc)
deflit(`FRAME',0)
pushl %edi FRAME_pushl()
pushl %esi FRAME_pushl()
movl PARAM_DST,%edi
movl PARAM_SRC1,%esi
movl PARAM_SRC2,%edx
movl PARAM_SIZE,%ecx
movl %ecx,%eax
shrl $3,%ecx C compute count for unrolled loop
negl %eax
andl $7,%eax C get index where to start loop
jz L(oopgo) C necessary special case for 0
incl %ecx C adjust loop count
shll $2,%eax C adjustment for pointers...
subl %eax,%edi C ... since they are offset ...
subl %eax,%esi C ... by a constant when we ...
subl %eax,%edx C ... enter the loop
shrl $2,%eax C restore previous value
ifdef(`PIC',`
C Calculate start address in loop for PIC. Due to limitations in
C old gas, L(oop)-L(0a)-3 cannot be put into the leal
call L(0a)
L(0a): leal (%eax,%eax,8),%eax
addl (%esp),%eax
addl $L(oop)-L(0a)-3,%eax
addl $4,%esp
',`
C Calculate start address in loop for non-PIC.
leal L(oop)-3(%eax,%eax,8),%eax
')
C These lines initialize carry from the 5th parameter. Should be
C possible to simplify.
C ebp is only borrowed as scratch; popl and jmp leave the carry flag alone.
pushl %ebp FRAME_pushl()
movl PARAM_CARRY,%ebp
shrl $1,%ebp C shift bit 0 into carry
popl %ebp FRAME_popl()
jmp *%eax C jump into loop
EPILOGUE()
ALIGN(16)
PROLOGUE(M4_function_n)
deflit(`FRAME',0)
pushl %edi FRAME_pushl()
pushl %esi FRAME_pushl()
movl PARAM_DST,%edi
movl PARAM_SRC1,%esi
movl PARAM_SRC2,%edx
movl PARAM_SIZE,%ecx
movl %ecx,%eax
shrl $3,%ecx C compute count for unrolled loop
negl %eax
andl $7,%eax C get index where to start loop
jz L(oop) C necessary special case for 0
incl %ecx C adjust loop count
shll $2,%eax C adjustment for pointers...
subl %eax,%edi C ... since they are offset ...
subl %eax,%esi C ... by a constant when we ...
subl %eax,%edx C ... enter the loop
shrl $2,%eax C restore previous value
ifdef(`PIC',`
C Calculate start address in loop for PIC. Due to limitations in
C some assemblers, L(oop)-L(0b)-3 cannot be put into the leal
call L(0b)
L(0b): leal (%eax,%eax,8),%eax
addl (%esp),%eax
addl $L(oop)-L(0b)-3,%eax
addl $4,%esp
',`
C Calculate start address in loop for non-PIC.
leal L(oop)-3(%eax,%eax,8),%eax
')
jmp *%eax C jump into loop
C Reached only from M4_function_nc when size is a multiple of 8: load the
C initial carry, then fall through into the start of the loop.
L(oopgo):
pushl %ebp FRAME_pushl()
movl PARAM_CARRY,%ebp
shrl $1,%ebp C shift bit 0 into carry
popl %ebp FRAME_popl()
ALIGN(16)
C eax scratch
C ecx pass counter
C edx src2
C esi src1
C edi dst
L(oop): movl (%esi),%eax
M4_inst (%edx),%eax
movl %eax,(%edi)
movl 4(%esi),%eax
M4_inst 4(%edx),%eax
movl %eax,4(%edi)
movl 8(%esi),%eax
M4_inst 8(%edx),%eax
movl %eax,8(%edi)
movl 12(%esi),%eax
M4_inst 12(%edx),%eax
movl %eax,12(%edi)
movl 16(%esi),%eax
M4_inst 16(%edx),%eax
movl %eax,16(%edi)
movl 20(%esi),%eax
M4_inst 20(%edx),%eax
movl %eax,20(%edi)
movl 24(%esi),%eax
M4_inst 24(%edx),%eax
movl %eax,24(%edi)
movl 28(%esi),%eax
M4_inst 28(%edx),%eax
movl %eax,28(%edi)
C leal and decl leave the carry flag alone, so the chain carries over into
C the next pass.
leal 32(%edi),%edi
leal 32(%esi),%esi
leal 32(%edx),%edx
decl %ecx
jnz L(oop)
sbbl %eax,%eax C eax = 0 or -1 according to the carry flag
negl %eax C return value 0 or 1
popl %esi
popl %edi
ret
EPILOGUE()

View File

@ -99,8 +99,7 @@ sub process_asm {
my $base = basename ($file, '.asm');
my @funs;
if ($base eq 'aors_n') { @funs = qw(add_n sub_n); }
elsif ($base eq 'aorsmul_1') { @funs = qw(addmul_1 submul_1); }
if ($base eq 'aorsmul_1') { @funs = qw(addmul_1 submul_1); }
elsif ($base eq 'logops_n') { @funs = qw(and_n andn_n nand_n ior_n iorn_n nior_n xor_n xnor_n); }
else { @funs = ($base); }

View File

@ -284,15 +284,6 @@ my @table =
'speed' => 'SPEED_ROUTINE_MPN_BINARY_N',
'speed_flags'=> 'FLAG_R_OPTIONAL',
},
{
'regexp'=> 'aors_n',
'mulfunc'=> ['add_n','sub_n'],
'ret' => 'mp_limb_t',
'args' => 'mp_ptr wp, mp_srcptr xp, mp_srcptr yp, mp_size_t size',
'speed' => 'SPEED_ROUTINE_MPN_BINARY_N',
'speed_flags'=> 'FLAG_R_OPTIONAL',
},
{
'regexp'=> 'addmul_1|submul_1',
'ret' => 'mp_limb_t',