465 lines
10 KiB
NASM
465 lines
10 KiB
NASM
dnl IA-64 mpn_divrem_euclidean_qr_1 and mpn_preinv_divrem_1 -- Divide an mpn number by an
|
|
dnl unnormalized limb.
|
|
|
|
dnl Copyright 2002, 2004, 2005 Free Software Foundation, Inc.
|
|
|
|
dnl This file is part of the GNU MP Library.
|
|
|
|
dnl The GNU MP Library is free software; you can redistribute it and/or modify
|
|
dnl it under the terms of the GNU Lesser General Public License as published
|
|
dnl by the Free Software Foundation; either version 3 of the License, or (at
|
|
dnl your option) any later version.
|
|
|
|
dnl The GNU MP Library is distributed in the hope that it will be useful, but
|
|
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
|
dnl License for more details.
|
|
|
|
dnl You should have received a copy of the GNU Lesser General Public License
|
|
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
|
|
|
|
include(`../config.m4')
|
|
|
|
|
|
C cycles/limb
|
|
C Itanium: 40-42
|
|
C Itanium 2: 29-30
|
|
|
|
C This was generated by gcc, then the loops were optimized. The preinv entry
|
|
C point was shoehorned into the file. Lots of things outside the loops could
|
|
C be streamlined. It would probably be a good idea to merge the loops for
|
|
C normalized and unnormalized divisor, since the shifting stuff is done for
|
|
C free in parallel with other operations. It would even be possible to merge
|
|
C all loops, if the ld8 were made conditional.
|
|
|
|
C TODO
|
|
C * Consider delaying inversion for normalized mpn_divrem_euclidean_qr_1 entry till after
|
|
C computing leading limb.
|
|
C * Inline and interleave limb inversion code with loop setup code.
|
|
|
|
ASM_START()
|
|
|
|
C HP's assembler requires these declarations for importing mpn_invert_limb
|
|
.global mpn_invert_limb
|
|
.type mpn_invert_limb,@function
|
|
|
|
C INPUT PARAMETERS
|
|
C rp = r32
|
|
C qxn = r33
|
|
C up = r34
|
|
C n = r35
|
|
C vl = r36
|
|
C vlinv = r37 (preinv only)
|
|
C cnt = r38 (preinv only)
|
|
|
|
C mpn_preinv_divrem_1 -- entry point that takes a caller-supplied divisor
C inverse (vlinv, r37) and shift count (cnt, r38) in addition to the
C rp/qxn/up/n/vl arguments listed under INPUT PARAMETERS.
C
C This block only handles the most significant dividend limb and then
C branches to labels defined inside the mpn_divrem_euclidean_qr_1 body
C (.Lpn, .Lpu, .L435).  Those labels sit after the mpn_invert_limb calls,
C so no inversion call is made on this path.
C
C NOTE(review): up[n-1] is loaded unconditionally and there is no n == 0
C check here; presumably callers guarantee n >= 1 -- confirm.
PROLOGUE(mpn_preinv_divrem_1)
|
|
.prologue
|
|
.save ar.pfs, r42
|
|
C Register frame: 7 inputs (r32-r38), 8 locals, 1 output.
alloc r42 = ar.pfs, 7, 8, 1, 0
|
|
.save ar.lc, r44
|
|
C ar.lc and b0 are kept in r44 and r41; they are restored at .Lret.
mov r44 = ar.lc
|
|
.save rp, r41
|
|
mov r41 = b0
|
|
.body
|
|
ifdef(`HAVE_ABI_32',
|
|
` addp4 r32 = 0, r32
|
|
sxt4 r33 = r33
|
|
addp4 r34 = 0, r34
|
|
sxt4 r35 = r35
|
|
;;
|
|
')
|
|
C r40 = cnt.  r34 is advanced to up[n-1]; the load below fetches the most
C significant limb into r39 and post-decrements r34 to up[n-2].
mov r40 = r38
|
|
shladd r34 = r35, 3, r34
|
|
;;
|
|
adds r34 = -8, r34
|
|
;;
|
|
ld8 r39 = [r34], -8
|
|
;;
|
|
|
|
C r15 = n + qxn, the total number of quotient limbs.
add r15 = r35, r33
|
|
;;
|
|
C r8 = vlinv; the shared code moves r8 into f6 as the divisor inverse.
mov r8 = r37
|
|
shladd r32 = r15, 3, r32 C r32 = rp + n + qxn
|
|
C p8 = (0 <= vl) as a signed compare, i.e. the top bit of vl is clear.
cmp.le p8, p0 = 0, r36
|
|
;;
|
|
adds r32 = -8, r32 C r32 = rp + n + qxn - 1
|
|
C p6 = (vl <= top limb), p7 = (vl > top limb), unsigned.
cmp.leu p6, p7 = r36, r39
|
|
(p8) br.cond.dpnt .Lpunnorm
|
|
;;
|
|
|
|
C Top bit of vl set: the top quotient limb is 1 when vl <= top limb and 0
C otherwise; r38 (partial remainder) is the top limb reduced accordingly.
(p6) addl r15 = 1, r0
|
|
(p7) mov r15 = r0
|
|
;;
|
|
(p6) sub r38 = r39, r36
|
|
(p7) mov r38 = r39
|
|
st8 [r32] = r15, -8
|
|
adds r35 = -2, r35 C un -= 2
|
|
C Continue at the shared setup in front of .Loop1.
br .Lpn
|
|
|
|
.Lpunnorm:
|
|
C Top bit of vl clear.  When vl <= top limb (p6) start with r = 0 and move
C r34 back up by one limb so that .Lpu loads the top limb again.  vl is
C shifted left by cnt in place either way.
(p6) add r34 = 8, r34
|
|
mov r38 = 0 C r = 0
|
|
shl r36 = r36, r40
|
|
(p6) br.cond.dptk .Lpu
|
|
;;
|
|
C Otherwise the top limb seeds the remainder and a zero quotient limb is
C stored.  p8 = (n != 1), i.e. further integer limbs remain.
shl r38 = r39, r40 C r = ahigh << cnt
|
|
cmp.ne p8, p0 = 1, r35
|
|
st8 [r32] = r0, -8
|
|
adds r35 = -1, r35 C un--
|
|
(p8) br.cond.dpnt .Lpu
|
|
|
|
C n == 1: no integer limbs are left.  Set f6 = inverse and f12 = 1 as the
C shared loops use them, then go straight to the fraction-limb code.
mov r23 = 1
|
|
;;
|
|
setf.sig f6 = r8
|
|
setf.sig f12 = r23
|
|
br .L435
|
|
EPILOGUE()
|
|
|
|
|
|
C mpn_divrem_euclidean_qr_1 -- divide the n-limb number at up by the single
C limb vl.  n + qxn quotient limbs are stored downward starting at
C rp[n+qxn-1]; the last qxn of them are developed with zero as the incoming
C dividend limb (.Loop4).  The remainder is returned in r8.  The divisor
C inverse is obtained from mpn_invert_limb.
C
C Register use as visible in the code below:
C   r32  quotient store pointer (post-decremented by each st8)
C   r34  dividend load pointer (post-decremented by each ld8)
C   r35  limb counter, copied to ar.lc for the counted loops
C   r36  divisor, shifted left in place on the unnormalized path
C   r38  partial remainder (nh); mirrored in f7
C   r40  shift count applied to the divisor; 0 on the normalized path
C   r18  quotient limb under construction
C   f6   divisor inverse, f10 divisor, f12 constant 1
C
C The labels .Lpn, .Lpu and .L435 are also branch targets of
C mpn_preinv_divrem_1.
PROLOGUE(mpn_divrem_euclidean_qr_1)
|
|
.prologue
|
|
.save ar.pfs, r42
|
|
C Register frame: 5 inputs (r32-r36), 8 locals (r37-r44), 1 output (r45).
alloc r42 = ar.pfs, 5, 8, 1, 0
|
|
.save ar.lc, r44
|
|
mov r44 = ar.lc
|
|
.save rp, r41
|
|
mov r41 = b0
|
|
.body
|
|
ifdef(`HAVE_ABI_32',
|
|
` addp4 r32 = 0, r32
|
|
sxt4 r33 = r33
|
|
addp4 r34 = 0, r34
|
|
sxt4 r35 = r35
|
|
;;
|
|
')
|
|
C r = 0; r15 = n + qxn.  With no quotient limbs at all, return 0.
mov r38 = r0
|
|
add r15 = r35, r33
|
|
;;
|
|
cmp.ne p6, p7 = 0, r15
|
|
;;
|
|
(p7) mov r8 = r0
|
|
(p7) br.cond.dpnt .Lret
|
|
shladd r14 = r15, 3, r32 C r14 = rp + n + qxn
|
|
C p6 = (0 <= vl) as a signed compare, i.e. the top bit of vl is clear.
cmp.le p6, p7 = 0, r36
|
|
;;
|
|
adds r32 = -8, r14 C r32 = rp + n + qxn - 1
|
|
(p6) br.cond.dpnt .Lunnorm
|
|
C Top bit of vl set.  With n == 0 skip the leading-limb step.
cmp.eq p6, p7 = 0, r35
|
|
(p6) br.cond.dpnt .L179
|
|
C Load the top limb up[n-1] into r38.  The top quotient limb is 1 when
C vl <= top limb (and vl is then subtracted from r38), otherwise 0.
shladd r14 = r35, 3, r34
|
|
;;
|
|
adds r14 = -8, r14
|
|
adds r35 = -1, r35
|
|
;;
|
|
ld8 r38 = [r14]
|
|
;;
|
|
cmp.leu p6, p7 = r36, r38
|
|
;;
|
|
(p6) addl r15 = 1, r0
|
|
(p7) mov r15 = r0
|
|
;;
|
|
st8 [r32] = r15, -8
|
|
(p6) sub r38 = r38, r36
|
|
|
|
.L179:
|
|
C Pass vl in the output register r45 and call mpn_invert_limb; the result
C in r8 is moved to f6 at .Lpn.  r34 is then advanced by r35 limbs.
mov r45 = r36
|
|
adds r35 = -1, r35
|
|
br.call.sptk.many b0 = mpn_invert_limb
|
|
;;
|
|
shladd r34 = r35, 3, r34
|
|
.Lpn:
|
|
C Shared setup for the normalized loop: f6 = inverse, f12 = 1, r40 = 0.
C A negative r35 (p7) means no integer limbs remain, so go to .L435.
mov r23 = 1
|
|
;;
|
|
setf.sig f6 = r8
|
|
setf.sig f12 = r23
|
|
cmp.le p6, p7 = 0, r35
|
|
mov r40 = 0
|
|
(p7) br.cond.dpnt .L435
|
|
setf.sig f10 = r36
|
|
mov ar.lc = r35
|
|
setf.sig f7 = r38
|
|
;;
|
|
C NOTE(review): r28 is rewritten inside the loop before it is read, so
C this initial value appears to be unused.
sub r28 = -1, r36
|
|
C Develop quotient limbs for normalized divisor
|
|
C Each iteration: load the next dividend limb into r20, form the quotient
C candidate q = hi(nh * inverse) + nh, compute the 128-bit product q * d,
C subtract it from nh:r20, then apply up to three conditional q + 1
C corrections while reducing the remainder r15 by d.
.Loop1: C 00 C q=r18 nh=r38/f7
|
|
ld8 r20 = [r34], -8
|
|
xma.hu f11 = f7, f6, f0
|
|
;; C 04
|
|
xma.l f8 = f11, f12, f7 C q = q + nh
|
|
;; C 08
|
|
getf.sig r18 = f8
|
|
xma.hu f9 = f8, f10, f0
|
|
xma.l f8 = f8, f10, f0
|
|
;; C 12
|
|
getf.sig r16 = f9
|
|
C 13
|
|
getf.sig r15 = f8
|
|
;; C 18
|
|
C p6 = borrow out of the low-limb subtraction; r16 = nh - hi(q * d).
cmp.ltu p6, p7 = r20, r15
|
|
sub r15 = r20, r15
|
|
sub r16 = r38, r16
|
|
;; C 19
|
|
(p6) cmp.ne p8, p9 = 1, r16 C is rH != 0?
|
|
(p7) cmp.ne p8, p9 = 0, r16 C is rH != 0?
|
|
(p6) add r16 = -1, r16
|
|
C r0 != r0 is false, so this clears p6 and sets p7.
(p0) cmp.ne.unc p6, p7 = r0, r0
|
|
;; C 20
|
|
(p8) cmp.ltu p6, p7 = r15, r36
|
|
(p8) sub r15 = r15, r36
|
|
(p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0
|
|
;; C 21
|
|
.pred.rel "mutex",p6,p7
|
|
(p6) cmp.ne p8, p9 = 1, r16 C is rH != 0 still?
|
|
(p7) cmp.ne p8, p9 = 0, r16 C is rH != 0 still?
|
|
cmp.ltu p6, p7 = r15, r36 C speculative
|
|
sub r28 = r15, r36 C speculative, just for cmp
|
|
;; C 22
|
|
(p8) cmp.ltu p6, p7 = r28, r36 C redo last cmp if needed
|
|
(p8) mov r15 = r28
|
|
(p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0
|
|
;; C 23
|
|
C p6: r15 < d, remainder final.  p7: one more subtraction of d and q + 1.
(p6) setf.sig f7 = r15
|
|
(p7) sub r15 = r15, r36
|
|
(p7) add r18 = 1, r18 C q = q + 1; done if: rH > 0
|
|
;; C 24
|
|
(p7) setf.sig f7 = r15
|
|
C Store the quotient limb and carry the remainder into the next iteration.
st8 [r32] = r18, -8
|
|
mov r38 = r15
|
|
br.cloop.dptk .Loop1
|
|
C 29/30
|
|
br.sptk .L435
|
|
;;
|
|
.Lunnorm:
|
|
C Top bit of vl clear.  r16 = vl with its bytes reversed, consumed at
C .L322.  With n == 0 go straight to the shift computation.
mux1 r16 = r36, @rev
|
|
cmp.eq p6, p7 = 0, r35
|
|
(p6) br.cond.dpnt .L322
|
|
C Load the top limb up[n-1] into r39 and leave r34 pointing at it.
shladd r34 = r35, 3, r34
|
|
;;
|
|
adds r34 = -8, r34
|
|
;;
|
|
ld8 r39 = [r34]
|
|
;;
|
|
C vl <= top limb: keep r = 0 and let .Lpu load the top limb again.
cmp.leu p6, p7 = r36, r39
|
|
(p6) br.cond.dptk .L322
|
|
C top limb < vl: it becomes the initial remainder and a zero quotient
C limb is stored for it.
adds r34 = -8, r34
|
|
;;
|
|
mov r38 = r39
|
|
;;
|
|
C p7 = (n + qxn == 1): that was the only limb, return it as the remainder.
cmp.ne p6, p7 = 1, r15
|
|
st8 [r32] = r0, -8
|
|
;;
|
|
(p7) mov r8 = r38
|
|
(p7) br.cond.dpnt .Lret
|
|
adds r35 = -1, r35
|
|
.L322:
|
|
C Compute the shift count r40 for vl.  r16 | -r16 has every bit at or
C above the lowest set bit of r16 set; czx1.l then gives a byte index.
C NOTE(review): presumably this locates the most significant nonzero byte
C of vl -- confirm against the czx1.l definition.
sub r14 = r0, r16
|
|
;;
|
|
or r14 = r16, r14
|
|
;;
|
|
mov r16 = -8
|
|
czx1.l r14 = r14
|
|
;;
|
|
C r16 = 8 * r14 - 8, a bit offset; r14 = vl >> r16.
shladd r16 = r14, 3, r16
|
|
;;
|
|
shr.u r14 = r36, r16
|
|
;;
|
|
C Narrow down in steps of 4, 2 and 1 bits, accumulating the offset in r16.
cmp.geu p6, p7 = 15, r14
|
|
;;
|
|
(p7) shr.u r14 = r14, 4
|
|
(p7) adds r16 = 4, r16
|
|
;;
|
|
cmp.geu p6, p7 = 3, r14
|
|
;;
|
|
(p7) shr.u r14 = r14, 2
|
|
(p7) adds r16 = 2, r16
|
|
;;
|
|
tbit.nz p6, p7 = r14, 1
|
|
;;
|
|
.pred.rel "mutex",p6,p7
|
|
C NOTE(review): r40 is presumably the number of leading zero bits of vl.
(p6) sub r40 = 62, r16
|
|
(p7) sub r40 = 63, r16
|
|
;;
|
|
C Shift the divisor and the initial remainder left by r40; the shifted
C divisor is also passed in r45 to mpn_invert_limb.
shl r45 = r36, r40
|
|
shl r36 = r36, r40
|
|
shl r38 = r38, r40
|
|
br.call.sptk.many b0 = mpn_invert_limb
|
|
;;
|
|
.Lpu:
|
|
C Shared setup for the unnormalized loop: f6 = inverse, f12 = 1.
C r35 == 0 means no integer limbs remain, so go to .L435.
mov r23 = 1
|
|
;;
|
|
setf.sig f6 = r8
|
|
setf.sig f12 = r23
|
|
cmp.eq p6, p7 = 0, r35
|
|
(p6) br.cond.dpnt .L435
|
|
C r16 = 64 - r40, the complementary right shift.
sub r16 = 64, r40
|
|
adds r35 = -2, r35
|
|
;;
|
|
C Load the next dividend limb into r39 and merge the bits shifted out of
C its top into r.  p7 (r35 < 0) means it is the last limb: go to .Lend3.
ld8 r39 = [r34], -8
|
|
cmp.le p6, p7 = 0, r35
|
|
;;
|
|
shr.u r14 = r39, r16
|
|
;;
|
|
or r38 = r14, r38
|
|
(p7) br.cond.dpnt .Lend3
|
|
;;
|
|
mov r22 = r16
|
|
setf.sig f10 = r36
|
|
setf.sig f7 = r38
|
|
mov ar.lc = r35
|
|
;;
|
|
C Develop quotient limbs for unnormalized divisor
|
|
C Same quotient step as .Loop1, except that the dividend limb r20 is
C assembled as (r39 << r40) | (r14 >> r22) from the previous limb r39 and
C the newly loaded limb r14.
.Loop3:
|
|
ld8 r14 = [r34], -8
|
|
xma.hu f11 = f7, f6, f0
|
|
;;
|
|
xma.l f8 = f11, f12, f7 C q = q + nh
|
|
;;
|
|
getf.sig r18 = f8
|
|
xma.hu f9 = f8, f10, f0
|
|
shl r20 = r39, r40
|
|
xma.l f8 = f8, f10, f0
|
|
shr.u r24 = r14, r22
|
|
;;
|
|
getf.sig r16 = f9
|
|
getf.sig r15 = f8
|
|
or r20 = r24, r20
|
|
;;
|
|
cmp.ltu p6, p7 = r20, r15
|
|
sub r15 = r20, r15
|
|
sub r16 = r38, r16
|
|
;;
|
|
(p6) cmp.ne p8, p9 = 1, r16 C is rH != 0?
|
|
(p7) cmp.ne p8, p9 = 0, r16 C is rH != 0?
|
|
(p6) add r16 = -1, r16
|
|
C r0 != r0 is false, so this clears p6 and sets p7.
(p0) cmp.ne.unc p6, p7 = r0, r0
|
|
;;
|
|
(p8) cmp.ltu p6, p7 = r15, r36
|
|
(p8) sub r15 = r15, r36
|
|
(p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0
|
|
;;
|
|
.pred.rel "mutex",p6,p7
|
|
(p6) cmp.ne p8, p9 = 1, r16 C is rH != 0 still?
|
|
(p7) cmp.ne p8, p9 = 0, r16 C is rH != 0 still?
|
|
cmp.ltu p6, p7 = r15, r36 C speculative
|
|
sub r28 = r15, r36 C speculative, just for cmp
|
|
;;
|
|
(p8) cmp.ltu p6, p7 = r28, r36 C redo last cmp if needed
|
|
(p8) mov r15 = r28
|
|
(p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0
|
|
;;
|
|
(p6) setf.sig f7 = r15
|
|
(p7) sub r15 = r15, r36
|
|
(p7) add r18 = 1, r18 C q = q + 1; done if: rH > 0
|
|
;;
|
|
(p7) setf.sig f7 = r15
|
|
C Store the quotient limb; the new limb becomes the previous limb and the
C remainder is carried into the next iteration.
st8 [r32] = r18, -8
|
|
mov r39 = r14
|
|
mov r38 = r15
|
|
br.cloop.dptk .Loop3
|
|
;;
|
|
.Lend3:
|
|
C Last integer limb on the unnormalized path: the dividend limb is
C r39 << r40 with nothing shifted in from below.  The quotient step is the
C same as in .Loop3 but without the speculative compare.
setf.sig f10 = r36
|
|
setf.sig f7 = r38
|
|
;;
|
|
xma.hu f11 = f7, f6, f0
|
|
;;
|
|
xma.l f8 = f11, f12, f7 C q = q + nh
|
|
;;
|
|
getf.sig r18 = f8
|
|
xma.hu f9 = f8, f10, f0
|
|
shl r20 = r39, r40
|
|
xma.l f8 = f8, f10, f0
|
|
;;
|
|
getf.sig r16 = f9
|
|
getf.sig r15 = f8
|
|
;;
|
|
cmp.ltu p6, p7 = r20, r15
|
|
sub r15 = r20, r15
|
|
sub r16 = r38, r16
|
|
;;
|
|
(p6) cmp.ne p8, p9 = 1, r16 C is rH != 0?
|
|
(p7) cmp.ne p8, p9 = 0, r16 C is rH != 0?
|
|
(p6) add r16 = -1, r16
|
|
(p0) cmp.ne.unc p6, p7 = r0, r0
|
|
;;
|
|
(p8) cmp.ltu p6, p7 = r15, r36
|
|
(p8) sub r15 = r15, r36
|
|
(p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0
|
|
;;
|
|
.pred.rel "mutex",p6,p7
|
|
(p6) cmp.ne p8, p9 = 1, r16 C is rH != 0 still?
|
|
(p7) cmp.ne p8, p9 = 0, r16 C is rH != 0 still?
|
|
;;
|
|
(p8) sub r15 = r15, r36
|
|
(p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0
|
|
;;
|
|
cmp.ltu p6, p7 = r15, r36
|
|
;;
|
|
(p7) sub r15 = r15, r36
|
|
(p7) add r18 = 1, r18 C q = q + 1; done if: rH > 0
|
|
;;
|
|
st8 [r32] = r18, -8
|
|
mov r38 = r15
|
|
.L435:
|
|
C Fraction limbs: with qxn >= 1, run .Loop4 qxn times (ar.lc = qxn - 1);
C otherwise skip to .Lend4.
adds r35 = -1, r33
|
|
cmp.le p6, p7 = 1, r33
|
|
(p7) br.cond.dpnt .Lend4
|
|
;;
|
|
setf.sig f7 = r38
|
|
setf.sig f10 = r36
|
|
mov ar.lc = r35
|
|
;;
|
|
C Same quotient step as .Loop1 with zero as the incoming dividend limb,
C so nothing is loaded from up.
.Loop4:
|
|
xma.hu f11 = f7, f6, f0
|
|
;;
|
|
xma.l f8 = f11, f12, f7 C q = q + nh
|
|
;;
|
|
getf.sig r18 = f8
|
|
xma.hu f9 = f8, f10, f0
|
|
xma.l f8 = f8, f10, f0
|
|
;;
|
|
getf.sig r16 = f9
|
|
getf.sig r15 = f8
|
|
;;
|
|
C Subtract lo(q * d) from zero: a borrow occurs whenever it is nonzero.
cmp.ltu p6, p7 = 0, r15
|
|
sub r15 = 0, r15
|
|
sub r16 = r38, r16
|
|
;;
|
|
(p6) cmp.ne p8, p9 = 1, r16 C is rH != 0?
|
|
(p7) cmp.ne p8, p9 = 0, r16 C is rH != 0?
|
|
(p6) add r16 = -1, r16
|
|
(p0) cmp.ne.unc p6, p7 = r0, r0
|
|
;;
|
|
(p8) cmp.ltu p6, p7 = r15, r36
|
|
(p8) sub r15 = r15, r36
|
|
(p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0
|
|
;;
|
|
.pred.rel "mutex",p6,p7
|
|
(p6) cmp.ne p8, p9 = 1, r16 C is rH != 0 still?
|
|
(p7) cmp.ne p8, p9 = 0, r16 C is rH != 0 still?
|
|
cmp.ltu p6, p7 = r15, r36 C speculative
|
|
sub r28 = r15, r36 C speculative, just for cmp
|
|
;;
|
|
(p8) cmp.ltu p6, p7 = r28, r36 C redo last cmp if needed
|
|
(p8) mov r15 = r28
|
|
(p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0
|
|
;;
|
|
(p6) setf.sig f7 = r15
|
|
(p7) sub r15 = r15, r36
|
|
(p7) add r18 = 1, r18 C q = q + 1; done if: rH > 0
|
|
;;
|
|
(p7) setf.sig f7 = r15
|
|
st8 [r32] = r18, -8
|
|
mov r38 = r15
|
|
br.cloop.dptk .Loop4
|
|
;;
|
|
.Lend4:
|
|
C Return value: the remainder shifted back right by the shift count r40.
shr.u r8 = r38, r40
|
|
.Lret:
|
|
C Restore ar.pfs, ar.lc and b0 from the registers saved in the prologue.
mov ar.pfs = r42
|
|
mov ar.lc = r44
|
|
mov b0 = r41
|
|
br.ret.sptk.many b0
|
|
EPILOGUE()
|
|
ASM_END()
|