dnl  mpn_sub_err1_n

dnl  Copyright 2009 Jason Moxham

dnl  This file is part of the MPIR Library.

dnl  The MPIR Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 2.1 of the License, or (at
dnl  your option) any later version.

dnl  The MPIR Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the MPIR Library; see the file COPYING.LIB.  If not, write
dnl  to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
dnl  Boston, MA 02110-1301, USA.

include(`../config.m4')

C	ret mpn_sub_err1_n(mp_ptr rp,mp_ptr up,mp_ptr vp,mp_ptr ep,mp_ptr yp,mp_size_t n,mp_limb_t cy)
C	rax                rdi,      rsi,      rdx,      rcx,      r8,       r9,         8(rsp)=>r10
C
C	Purpose
C	  Subtracts limb by limb, least significant limb first:
C	    (rp,n) = (up,n) - (vp,n) - (cy & 1)
C	  For every limb position j (0 <= j < n) whose subtraction leaves a
C	  borrow, the limb yp[n-1-j] is added into a two limb accumulator.
C	  On exit
C	    ep[0] = low limb of the accumulator
C	    ep[1] = high limb of the accumulator
C	    rax   = borrow out of the most significant limb (0 or 1)
C	  Only bit 0 of cy is used, the shl $63 below discards the other bits.
C
C	Register roles inside the routine
C	  rdi,rsi,rdx  rp, up, vp, each advanced by n - 3 limbs
C	  r11          i, limb index relative to those pointers, starts at 3 - n
C	  r8           starts at yp + n and steps down 4 limbs per loop pass
C	  r9           t1, low limb of the accumulator
C	  r10          bit 63 carries the borrow between chunks, the lower bits
C	               hold the high limb of the accumulator
C	  rax,rbx,rcx,rbp  t2..t5, yp limb picked by cmovc or zero
C	  r12..r15     s1..s4, difference limbs of the current chunk
C
C	Stack
C	  rbx, rbp, r12-r15 and the ep pointer (rcx) are stored at -8(%rsp) to
C	  -56(%rsp) without moving rsp.
C	  NOTE(review): this presumably relies on the SysV AMD64 red zone and on
C	  the routine making no calls, confirm before porting to another ABI.
C
C	Flags
C	  Several mov $0,reg instructions sit between an instruction that sets
C	  CF and one that consumes it.  mov leaves the flags alone, so they must
C	  not be replaced by a flag writing zero idiom such as xor.
C
C	NOTE(review): the control flow below assumes n >= 0.

ASM_START()
PROLOGUE(mpn_sub_err1_n)
	C if we rearrange the params we could save some moves
	C (rdi,r9)=(rsi,r9)-(rdx,r9)  (rcx,2)=sum of the (r8) limbs picked by borrows

	mov 8(%rsp),%r10                C r10 = cy, 7th argument taken from the stack
	mov %rbp,-16(%rsp)              C save rbp
	lea -24(%rdi,%r9,8),%rdi        C rp += n - 3
	mov %r12,-24(%rsp)              C save r12
	mov %r13,-32(%rsp)              C save r13
	lea -24(%rsi,%r9,8),%rsi        C up += n - 3
	mov %r14,-40(%rsp)              C save r14
	mov %r15,-48(%rsp)              C save r15
	lea -24(%rdx,%r9,8),%rdx        C vp += n - 3
	mov %rcx,-56(%rsp)              C save ep, rcx is reused as t4 below
	mov %rbx,-8(%rsp)               C save rbx
	mov $3,%r11                     C i = 3
	shl $63,%r10                    C bit 0 of cy goes to bit 63, other bits dropped
	lea (%r8,%r9,8),%r8             C yp += n
	sub %r9,%r11                    C i = 3 - n, CF set iff n > 3 (unsigned)
	mov $0,%r9                      C t1 = 0, mov keeps CF from the sub
	mov $0,%rax                     C t2 = 0
	mov $0,%rbx                     C t3 = 0
	jnc skiplp                      C n <= 3, no full 4 limb chunk, goto skiplp
ALIGN(16)
lp:
	C main loop, 4 limbs per pass
	mov (%rsi,%r11,8),%r12          C s1 = *(up + i + 0)
	mov 8(%rsi,%r11,8),%r13         C s2 = *(up + i + 1)
	mov 16(%rsi,%r11,8),%r14        C s3 = *(up + i + 2)
	mov 24(%rsi,%r11,8),%r15        C s4 = *(up + i + 3)
	mov $0,%rbp                     C t5 = 0
	shl $1,%r10                     C CF = saved borrow (bit 63 of cy), rest shifts up
	sbb (%rdx,%r11,8),%r12          C s1 -= *(vp + i + 0) + borrow in
	cmovc -8(%r8),%rax              C if borrow1, t2 = *(yp - 1)
	sbb 8(%rdx,%r11,8),%r13         C s2 -= *(vp + i + 1) + borrow1
	cmovc -16(%r8),%rbx             C if borrow2, t3 = *(yp - 2)
	mov $0,%rcx                     C t4 = 0, mov keeps CF alive for the next sbb
	sbb 16(%rdx,%r11,8),%r14        C s3 -= *(vp + i + 2) + borrow2
	cmovc -24(%r8),%rcx             C if borrow3, t4 = *(yp - 3)
	sbb 24(%rdx,%r11,8),%r15        C s4 -= *(vp + i + 3) + borrow3
	cmovc -32(%r8),%rbp             C if borrow4, t5 = *(yp - 4)
	rcr $1,%r10                     C bit 63 of cy = borrow4, lower bits shift back
	add %rax,%r9                    C t1 += t2
	adc $0,%r10                     C carry into the high limb kept in cy
	add %rbx,%r9                    C t1 += t3
	adc $0,%r10                     C carry into the high limb kept in cy
	add %rcx,%r9                    C t1 += t4
	mov $0,%rax                     C t2 = 0 for the next pass, CF untouched
	adc $0,%r10                     C carry into the high limb kept in cy
	lea -32(%r8),%r8                C yp -= 4
	add %rbp,%r9                    C t1 += t5
	adc $0,%r10                     C carry into the high limb kept in cy
	mov %r12,(%rdi,%r11,8)          C *(rp + i + 0) = s1
	mov %r13,8(%rdi,%r11,8)         C *(rp + i + 1) = s2
	mov %r14,16(%rdi,%r11,8)        C *(rp + i + 2) = s3
	mov %r15,24(%rdi,%r11,8)        C *(rp + i + 3) = s4
	mov $0,%rbx                     C t3 = 0 for the next pass
	add $4,%r11                     C i += 4, CF set once i wraps to 0..3
	jnc lp                          C no wrap, at least 4 limbs remain, goto lp
skiplp:
	C Here i is 0..3 and 3 - i limbs remain.  The two restores below are movs
	C and leave the flags of the cmp intact for the three branches.
	cmp $2,%r11                     C cmp(i, 2)
	mov -16(%rsp),%rbp              C restore rbp
	mov -48(%rsp),%r15              C restore r15
	ja case0                        C i == 3, nothing left, goto case0
	je case1                        C i == 2, one limb left, goto case1
	jp case2                        C i == 1, i - 2 has low byte 0xFF so PF is set,
	                                C two limbs left, goto case2
	                                C i == 0 gives low byte 0xFE, PF clear, fall through
case3:
	C three limbs left
	mov (%rsi,%r11,8),%r12          C s1 = *(up + i + 0)
	mov 8(%rsi,%r11,8),%r13         C s2 = *(up + i + 1)
	mov 16(%rsi,%r11,8),%r14        C s3 = *(up + i + 2)
	shl $1,%r10                     C CF = saved borrow from bit 63 of cy
	sbb (%rdx,%r11,8),%r12          C s1 -= *(vp + i + 0) + borrow in
	cmovc -8(%r8),%rax              C if borrow1, t2 = *(yp - 1)
	sbb 8(%rdx,%r11,8),%r13         C s2 -= *(vp + i + 1) + borrow1
	cmovc -16(%r8),%rbx             C if borrow2, t3 = *(yp - 2)
	mov $0,%rcx                     C t4 = 0, mov keeps CF alive for the next sbb
	sbb 16(%rdx,%r11,8),%r14        C s3 -= *(vp + i + 2) + borrow2
	cmovc -24(%r8),%rcx             C if borrow3, t4 = *(yp - 3)
	rcr $1,%r10                     C store borrow3 in bit 63 of cy
	add %rax,%r9                    C t1 += t2
	adc $0,%r10                     C carry into the high limb kept in cy
	add %rbx,%r9                    C t1 += t3
	adc $0,%r10                     C carry into the high limb kept in cy
	add %rcx,%r9                    C t1 += t4
	adc $0,%r10                     C carry into the high limb kept in cy
	mov %r12,(%rdi,%r11,8)          C *(rp + i + 0) = s1
	mov %r13,8(%rdi,%r11,8)         C *(rp + i + 1) = s2
	mov %r14,16(%rdi,%r11,8)        C *(rp + i + 2) = s3
	mov -56(%rsp),%rcx              C reload the saved ep pointer
	mov %r9,(%rcx)                  C ep[0] = t1
	btr $63,%r10                    C CF = final borrow, bit 63 of cy cleared
	mov %r10,8(%rcx)                C ep[1] = high limb of the accumulator
	mov -40(%rsp),%r14              C restore r14
	mov $0,%rax                     C rax = 0, mov keeps CF from the btr
	mov -32(%rsp),%r13              C restore r13
	adc $0,%rax                     C return borrow out
	mov -24(%rsp),%r12              C restore r12
	mov -8(%rsp),%rbx               C restore rbx
	ret
ALIGN(16)
case2:
	C two limbs left
	mov (%rsi,%r11,8),%r12          C s1 = *(up + i + 0)
	mov 8(%rsi,%r11,8),%r13         C s2 = *(up + i + 1)
	shl $1,%r10                     C CF = saved borrow from bit 63 of cy
	sbb (%rdx,%r11,8),%r12          C s1 -= *(vp + i + 0) + borrow in
	cmovc -8(%r8),%rax              C if borrow1, t2 = *(yp - 1)
	sbb 8(%rdx,%r11,8),%r13         C s2 -= *(vp + i + 1) + borrow1
	cmovc -16(%r8),%rbx             C if borrow2, t3 = *(yp - 2)
	rcr $1,%r10                     C store borrow2 in bit 63 of cy
	add %rax,%r9                    C t1 += t2
	adc $0,%r10                     C carry into the high limb kept in cy
	add %rbx,%r9                    C t1 += t3
	adc $0,%r10                     C carry into the high limb kept in cy
	mov %r12,(%rdi,%r11,8)          C *(rp + i + 0) = s1
	mov %r13,8(%rdi,%r11,8)         C *(rp + i + 1) = s2
	mov -56(%rsp),%rcx              C reload the saved ep pointer
	mov %r9,(%rcx)                  C ep[0] = t1
	btr $63,%r10                    C CF = final borrow, bit 63 of cy cleared
	mov %r10,8(%rcx)                C ep[1] = high limb of the accumulator
	mov -40(%rsp),%r14              C restore r14
	mov $0,%rax                     C rax = 0, mov keeps CF from the btr
	mov -32(%rsp),%r13              C restore r13
	adc $0,%rax                     C return borrow out
	mov -24(%rsp),%r12              C restore r12
	mov -8(%rsp),%rbx               C restore rbx
	ret
ALIGN(16)
case1:
	C one limb left, then falls through into the shared exit at case0
	mov (%rsi,%r11,8),%r12          C s1 = *(up + i + 0)
	shl $1,%r10                     C CF = saved borrow from bit 63 of cy
	sbb (%rdx,%r11,8),%r12          C s1 -= *(vp + i + 0) + borrow in
	cmovc -8(%r8),%rax              C if borrow1, t2 = *(yp - 1)
	rcr $1,%r10                     C store borrow1 in bit 63 of cy
	add %rax,%r9                    C t1 += t2
	adc $0,%r10                     C carry into the high limb kept in cy
	mov %r12,(%rdi,%r11,8)          C *(rp + i + 0) = s1
case0:
	C no limbs left, write the accumulator and return the borrow
	mov -56(%rsp),%rcx              C reload the saved ep pointer
	mov %r9,(%rcx)                  C ep[0] = t1
	btr $63,%r10                    C CF = final borrow, bit 63 of cy cleared
	mov %r10,8(%rcx)                C ep[1] = high limb of the accumulator
	mov -40(%rsp),%r14              C restore r14
	mov $0,%rax                     C rax = 0, mov keeps CF from the btr
	mov -32(%rsp),%r13              C restore r13
	adc $0,%rax                     C return borrow out
	mov -24(%rsp),%r12              C restore r12
	mov -8(%rsp),%rbx               C restore rbx
	ret
EPILOGUE()