dnl  mpir/mpn/x86_64/nehalem/sub_err1_n.asm
dnl mpn_sub_err1_n
dnl Copyright 2009 Jason Moxham
dnl This file is part of the MPIR Library.
dnl The MPIR Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 2.1 of the License, or (at
dnl your option) any later version.
dnl The MPIR Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the MPIR Library; see the file COPYING.LIB. If not, write
dnl to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
dnl Boston, MA 02110-1301, USA.
include(`../config.m4')
C mp_limb_t mpn_sub_err1_n(mp_ptr rp, mp_ptr up, mp_ptr vp, mp_ptr ep, mp_ptr yp, mp_size_t n, mp_limb_t cy)
C rax                      rdi        rsi        rdx        rcx        r8         r9           8(rsp) => r10
ASM_START()
PROLOGUE(mpn_sub_err1_n)
C if we rearrange the params we could save some moves
C (rdi,r9) = (rsi,r9) - (rdx,r9)    error (rcx) = sum of borrow_i * yp[n-1-i]    rax = borrow out
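C
C A plain-C sketch of what this routine is intended to compute, inferred
C from the code below; ref_sub_err1_n is an illustrative name only (not
C MPIR's implementation) and the mp_* types come from mpir.h:
C
C	mp_limb_t ref_sub_err1_n(mp_ptr rp, mp_ptr up, mp_ptr vp, mp_ptr ep,
C	                         mp_ptr yp, mp_size_t n, mp_limb_t cy)
C	{
C	    mp_limb_t e0 = 0, e1 = 0;                /* 2-limb error sum      */
C	    for (mp_size_t i = 0; i < n; i++)
C	    {
C	        mp_limb_t u = up[i], v = vp[i];
C	        rp[i] = u - v - cy;                  /* subtract with borrow  */
C	        cy = (u < v) | ((u == v) & cy);      /* borrow out of limb i  */
C	        if (cy)                              /* weight borrow by yp   */
C	        {
C	            e0 += yp[n - 1 - i];
C	            e1 += (e0 < yp[n - 1 - i]);      /* carry into high limb  */
C	        }
C	    }
C	    ep[0] = e0;                              /* low limb of error sum */
C	    ep[1] = e1;                              /* high limb             */
C	    return cy;                               /* final borrow out      */
C	}
C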
mov 8(%rsp),%r10 C cy
mov %rbp,-16(%rsp) C save rbp
lea -24(%rdi,%r9,8),%rdi C rp += n - 3
mov %r12,-24(%rsp) C save r12
mov %r13,-32(%rsp) C save r13
lea -24(%rsi,%r9,8),%rsi C up += n - 3
mov %r14,-40(%rsp) C save r14
mov %r15,-48(%rsp) C save r15
lea -24(%rdx,%r9,8),%rdx C vp += n - 3
mov %rcx,-56(%rsp) C save rcx
mov %rbx,-8(%rsp) C save rbx
mov $3,%r11 C i = 3
shl $63,%r10 C put cy in the high bit of r10
lea (%r8,%r9,8),%r8 C yp += n
sub %r9,%r11 C i = 3 - n
mov $0,%r9 C t1 = 0
mov $0,%rax C t2 = 0
mov $0,%rbx C t3 = 0
jnc skiplp C if n <= 3 skip the main loop
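C The running borrow is kept in bit 63 of cy (r10): shl $1 moves it into CF
C for the sbb chain, and rcr $1 puts the borrow out of the chain back into
C bit 63 (the low bits of r10, shifted up and back down, are unchanged).
C This frees CF between those two steps so the add/adc chain can fold the
C carries of the error-limb additions into the low bits of r10, which are
C stored to ep[1] at the end.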
ALIGN(16)
lp:
mov (%rsi,%r11,8),%r12 C s1 = *(up + i + 0)
mov 8(%rsi,%r11,8),%r13 C s2 = *(up + i + 1)
mov 16(%rsi,%r11,8),%r14 C s3 = *(up + i + 2)
mov 24(%rsi,%r11,8),%r15 C s4 = *(up + i + 3)
mov $0,%rbp C t5 = 0
shl $1,%r10 C restore borrow from high bit of cy
sbb (%rdx,%r11,8),%r12 C s1 -= *(vp + i + 0) + borrow
cmovc -8(%r8),%rax C if borrow1, t2 = *(yp - 1)
sbb 8(%rdx,%r11,8),%r13 C s2 -= *(vp + i + 1) + borrow1
cmovc -16(%r8),%rbx C if borrow2 t3 = *(yp - 2)
mov $0,%rcx C t4 = 0
sbb 16(%rdx,%r11,8),%r14 C s3 -= *(vp + i + 2) + borrow2
cmovc -24(%r8),%rcx C if borrow3 t4 = *(yp - 3)
sbb 24(%rdx,%r11,8),%r15 C s4 -= *(vp + i + 3) + borrow3
cmovc -32(%r8),%rbp C if borrow4 t5 = *(yp - 4)
rcr $1,%r10 C high bit of cy = borrow
add %rax,%r9 C t1 += t2
adc $0,%r10 C accumulate cy
add %rbx,%r9 C t1 += t3
adc $0,%r10 C accumulate cy
add %rcx,%r9 C t1 += t4
mov $0,%rax C t2 = 0
adc $0,%r10 C accumulate cy
lea -32(%r8),%r8 C yp -= 4
add %rbp,%r9 C t1 += t5
adc $0,%r10 C accumulate cy
mov %r12,(%rdi,%r11,8) C *(rp + i + 0) = s1
mov %r13,8(%rdi,%r11,8) C *(rp + i + 1) = s2
mov %r14,16(%rdi,%r11,8) C *(rp + i + 2) = s3
mov %r15,24(%rdi,%r11,8) C *(rp + i + 3) = s4
mov $0,%rbx C t3 = 0
add $4,%r11 C i += 4
jnc lp C not done, goto lp
skiplp:
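C 0 to 3 limbs remain and i = 3 - (limbs left); a single cmp gives all the
C flags needed to dispatch: ja -> i = 3 (nothing left), je -> i = 2, and
C the parity flag separates i = 1 (i - 2 = 0xff, even parity) from i = 0
C (0xfe, odd parity) for the jp below.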
cmp $2,%r11 C cmp(i, 2)
mov -16(%rsp),%rbp C restore rbp
mov -48(%rsp),%r15 C restore r15
ja case0 C i == 3 goto case0
je case1 C i == 2 goto case1
jp case2 C i == 1 goto case2
case3:
mov (%rsi,%r11,8),%r12 C s1 = *(up + i + 0)
mov 8(%rsi,%r11,8),%r13 C s2 = *(up + i + 1)
mov 16(%rsi,%r11,8),%r14 C s3 = *(up + i + 2)
shl $1,%r10 C restore borrow1 from high bit of cy
sbb (%rdx,%r11,8),%r12 C s1 -= *(vp + i + 0) + borrow1
cmovc -8(%r8),%rax C if borrow2 t2 = *(yp - 1)
sbb 8(%rdx,%r11,8),%r13 C s2 -= *(vp + i + 1) + borrow2
cmovc -16(%r8),%rbx C if borrow3 t3 = *(yp - 2)
mov $0,%rcx C t4 = 0
sbb 16(%rdx,%r11,8),%r14 C s3 -= *(vp + i + 2) + borrow3
cmovc -24(%r8),%rcx C if borrow4 t4 = *(yp - 3)
rcr $1,%r10 C store borrow4 in high bit of cy
add %rax,%r9 C t1 += t2
adc $0,%r10 C accumulate cy
add %rbx,%r9 C t1 += t3
adc $0,%r10 C accumulate cy
add %rcx,%r9 C t1 += t4
adc $0,%r10 C accumulate cy
mov %r12,(%rdi,%r11,8) C *(rp + i + 0) = s1
mov %r13,8(%rdi,%r11,8) C *(rp + i + 1) = s2
mov %r14,16(%rdi,%r11,8) C *(rp + i + 2) = s3
mov -56(%rsp),%rcx C restore rcx
mov %r9,(%rcx) C ep[0] = t1
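C bit 63 of cy still holds the final borrow: btr moves it into CF and
C clears it, so ep[1] gets only the accumulated high limb and the adc
C below materialises the borrow as the return value in rax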
btr $63,%r10 C retrieve borrow out and reset bit of cy
mov %r10,8(%rcx) C ep[1] = cy
mov -40(%rsp),%r14 C restore r14
mov $0,%rax
mov -32(%rsp),%r13 C restore r13
adc $0,%rax C return borrow out
mov -24(%rsp),%r12 C restore r12
mov -8(%rsp),%rbx C restore rbx
ret
ALIGN(16)
case2:
mov (%rsi,%r11,8),%r12 C s1 = *(up + i + 0)
mov 8(%rsi,%r11,8),%r13 C s2 = *(up + i + 1)
shl $1,%r10 C restore borrow1 from high bit of cy
sbb (%rdx,%r11,8),%r12 C s1 -= *(vp + i + 0) + borrow1
cmovc -8(%r8),%rax C if borrow2 t2 = *(yp - 1)
sbb 8(%rdx,%r11,8),%r13 C s2 -= *(vp + i + 1) + borrow2
cmovc -16(%r8),%rbx C if borrow3 t3 = *(yp - 2)
rcr $1,%r10 C store borrow3 in high bit of cy
add %rax,%r9 C t1 += t2
adc $0,%r10 C accumulate cy
add %rbx,%r9 C t1 += t3
adc $0,%r10 C accumulate cy
mov %r12,(%rdi,%r11,8) C *(rp + i + 0) = s1
mov %r13,8(%rdi,%r11,8) C *(rp + i + 1) = s2
mov -56(%rsp),%rcx C restore rcx
mov %r9,(%rcx) C ep[0] = t1
btr $63,%r10 C retrieve borrow out and reset bit of cy
mov %r10,8(%rcx) C ep[1] = cy
mov -40(%rsp),%r14 C restore r14
mov $0,%rax
mov -32(%rsp),%r13 C restore r13
adc $0,%rax C return borrow out
mov -24(%rsp),%r12 C restore r12
mov -8(%rsp),%rbx C restore rbx
ret
ALIGN(16)
case1:
mov (%rsi,%r11,8),%r12 C s1 = *(up + i + 0)
shl $1,%r10 C restore borrow1 from high bit of cy
sbb (%rdx,%r11,8),%r12 C s1 -= *(vp + i + 0) + borrow1
cmovc -8(%r8),%rax C if borrow2 t2 = *(yp - 1)
rcr $1,%r10 C store borrow2 in high bit of cy
add %rax,%r9 C t1 += t2
adc $0,%r10 C accumulate cy
mov %r12,(%rdi,%r11,8) C *(rp + i + 0) = s1
case0: mov -56(%rsp),%rcx C restore rcx
mov %r9,(%rcx) C ep[0] = t1
btr $63,%r10 C retrieve borrow out and reset bit of cy
mov %r10,8(%rcx) C ep[1] = cy
mov -40(%rsp),%r14 C restore r14
mov $0,%rax
mov -32(%rsp),%r13 C restore r13
adc $0,%rax C return borrow out
mov -24(%rsp),%r12 C restore r12
mov -8(%rsp),%rbx C restore rbx
ret
EPILOGUE()