; ============================================================================
;  Copyright 2016 Jens Nurmann and Alexander Kruppa
;  This file is part of the MPIR Library.
;  The MPIR Library is free software; you can redistribute it and/or modify
;  it under the terms of the GNU Lesser General Public License as published
;  by the Free Software Foundation; either version 2.1 of the License, or (at
;  your option) any later version.
;  The MPIR Library is distributed in the hope that it will be useful, but
;  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
;  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
;  License for more details.
;  You should have received a copy of the GNU Lesser General Public License
;  along with the MPIR Library; see the file COPYING.LIB.  If not, write
;  to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
;  Boston, MA 02110-1301, USA.
; mp_limb_t mpn_nsumdiff_n(mp_ptr Op3, mp_ptr Op4, mp_srcptr Op1, mp_srcptr Op2, mp_size_t Size)
; Linux     RAX           RDI         RSI         RDX            RCX            R8
; Win7      RAX           RCX         RDX         R8             R9             Stack
;
; Description:
; The function computes -(Op2+Op1) and stores the result in Op3 while at the
; same time subtracting Op2 from Op1 with result in Op4. The final carries from
; addition and subtraction are handed back as a combined mp_limb_t. There is a
; gain in execution speed compared to separate addition and subtraction by
; reducing memory access. The factor depends on the size of the operands (the
; cache hierarchy in which the operands can be handled).
;
; Equivalent to the following, assuming no overlap, where r1 = Op3, r2 = Op4,
; s1 = Op1, s2 = Op2 and n = Size:
; cy1 = mpn_add_n(r1, s1, s2, n);
; cy2 = mpn_neg_n(r1, r1, n); /* cy2 = [{r1,n} != 0] */
; cy3 = mpn_sub_n(r2, s1, s2, n);
; return 2*(cy1 + cy2) + cy3;
; ============================================================================


%include 'yasm_mac.inc'

; Register list handed to FRAME_PROC and END_PROC for this procedure.
; NOTE(review): presumably these macros (from yasm_mac.inc) push/pop the
; listed registers - confirm against yasm_mac.inc.
%define reg_save_list rsi, rdi, rbx, rbp, r12, r13, r14, r15

; Operand registers, named after the parameters of the prototype in the
; file header. The procedure entry copies the Win64 argument registers
; into these before the body runs.
;   Op3  - destination of the negated sum  -(Op1 + Op2)
;   Op4  - destination of the difference   Op1 - Op2
;   Op1  - first source operand
;   Op2  - second source operand
;   Size - limb count (consumed as a bit field / loop counter by the body)
%define Op3     RDI
%define Op4     RSI
%define Op1     RDX
%define Op2     RCX
%define Size    R8

; Scratch registers for limbs.
;   Limb0        - temporary for a result limb in the 4-limb block and in
;                  the main loop
;   Limb1..Limb4 - limbs loaded from Op1 (Limb2..Limb4 double as sum
;                  temporaries in the 1- and 2-limb blocks)
;   Limb5..Limb8 - limbs loaded from Op2
%define Limb0   RBP
%define Limb1   RBX
%define Limb2   R9
%define Limb3   R10
%define Limb4   R11
%define Limb5   R12
%define Limb6   R13
%define Limb7   R14
%define Limb8   R15

; Prefetch displacement; only defined (and only used, by the prefetchnta
; instructions in the main loop) when USE_PREFETCH is set.
%ifdef USE_PREFETCH
%define Offs    PREFETCH_STRIDE ; no more regs avail. => fallback to const
%endif

; The addition carry and the subtraction borrow are parked in AL and AH
; while the other chain owns the carry flag (CF).
;
; SaveAC: AL = CF (0 or 1).
; LoadAC: shifts bit 0 of AL out into CF (AL is changed by this).
%define SaveAC  setc    AL
%define LoadAC  shr     AL, 1

; SaveSC: AH = AH - AH - CF, i.e. 0x00 if CF was clear, 0xFF if CF was set.
; LoadSC: AH + AH sets CF from bit 7 of AH (AH is doubled by this).
%define SaveSC  sbb     AH, AH
%define LoadSC  add     AH, AH

    BITS 64

    align   32

    ; ------------------------------------------------------------------------
    ; mp_limb_t mpn_nsumdiff_n(Op3, Op4, Op1, Op2, Size)   -- Win64 entry point
    ;
    ; Writes -(Op1 + Op2) to Op3 and Op1 - Op2 to Op4, both Size limbs long,
    ; and returns 2*(cy1 + cy2) + cy3 in RAX (see the file header for the
    ; definition of cy1, cy2, cy3).
    ;
    ; Structure of the body:
    ;   1. Leading limbs whose sum (including carry) is zero are handled one
    ;      at a time in .zero_sum; their negated-sum limb is 0.
    ;   2. The first limb with a non-zero sum is stored as NEG(sum) in
    ;      .not_zero.
    ;   3. Every later limb is stored as NOT(sum). The remaining count is
    ;      consumed bit by bit: 1 limb, 2 limbs, 4 limbs, then 8 limbs per
    ;      iteration of .n_loop.
    ;   4. .n_post / .all_zero combine the carries into the return value.
    ;
    ; Carry bookkeeping: AL holds the addition carry, AH the subtraction
    ; borrow (via SaveAC/LoadAC/SaveSC/LoadSC). Instructions placed inside an
    ; ADC/SBB chain (MOV, NOT, LEA, DEC) are ones that leave CF untouched.
    ;
    ; NOTE(review): the first limbs are loaded before Size is examined, so the
    ; routine presumably requires Size >= 1 - confirm against the callers.
    ; ------------------------------------------------------------------------
    FRAME_PROC mpn_nsumdiff_n, 0, reg_save_list
    ; Move the Win64 arguments (RCX, RDX, R8, R9, stack) into the registers
    ; the body uses. Each source register is read before it is overwritten.
    mov     rdi, rcx            ; Op3
    mov     rsi, rdx            ; Op4
    mov     rdx, r8             ; Op1
    mov     rcx, r9             ; Op2
    ; Fifth argument comes from the stack.
    ; NOTE(review): stack_use is presumably defined by FRAME_PROC to account
    ; for the saved registers - confirm against yasm_mac.inc.
    mov     r8, [rsp+stack_use+40]

    xor     EAX, EAX            ; clear add & sub carry

; First we handle any words whose sum = 0
    mov     Limb1, [Op1]
    mov     Limb5, [Op2]
    mov     Limb2, Limb1
    add     Limb2, Limb5        ; first limb: plain ADD, no carry in
    SaveAC
    neg	    Limb2               ; ZF=1 iff the sum limb is zero
    jz      .zero_sum ; ajs:notshortform

; Then we handle the first word whose sum !=0. The NOT of this sum needs to
; be incremented, which produces no carry (NOT(x) + 1 = NEG(x))
.not_zero:
    mov     [Op3], Limb2        ; Limb2 already holds NEG(sum)
    LoadSC
    sbb     Limb1, Limb5        ; difference limb: Op1 - Op2 - borrow
    SaveSC
    add     Op1, 8
    add     Op2, 8
    add     Op3, 8
    dec     Size                ; Size = limbs still to process
    mov     [Op4], Limb1
    add     Op4, 8

    ; Bit 0 of the remaining count: one single limb, stored as NOT(sum).
    shr     Size, 1
    jnc     .n_two

    mov     Limb1, [Op1]
    mov     Limb5, [Op2]
    LoadAC
    mov     Limb2, Limb1
    adc     Limb2, Limb5
    SaveAC
    LoadSC
    sbb     Limb1, Limb5
    SaveSC
    not     Limb2
    add     Op1, 8
    add     Op2, 8
    add     Op3, 8
    mov     [Op3-8], Limb2      ; Op3 was already advanced, hence the -8
    mov     [Op4], Limb1
    add     Op4, 8

  .n_two:

    ; Bit 1 of the remaining count: two limbs.
    shr     Size, 1
    jnc     .n_four
    mov     Limb1, [Op1]
    mov     Limb2, [Op1+8]
    mov     Limb5, [Op2]
    mov     Limb6, [Op2+8]
    LoadAC
    mov     Limb3, Limb1
    mov     Limb4, Limb2
    adc     Limb3, Limb5
    not     Limb3               ; NOT leaves CF intact for the next ADC
    adc     Limb4, Limb6
    SaveAC
    LoadSC
    not     Limb4
    sbb     Limb1, Limb5
    sbb     Limb2, Limb6
    SaveSC
    mov     [Op3], Limb3
    mov     [Op3+8], Limb4
    mov     [Op4], Limb1
    mov     [Op4+8], Limb2
    add     Op1, 16
    add     Op2, 16
    add     Op3, 16
    add     Op4, 16


  .n_four:

    ; Bit 2 of the remaining count: four limbs. ZF from this SHR is what
    ; .n_loop_pre tests when the block below is skipped.
    shr     Size, 1
    jnc     .n_loop_pre ;ajs:notshortform

    LoadAC

    ; slight change of scheme here - avoid too many
    ; memory to reg or reg to memory moves in a row
    ; Limb0 is the sum temporary; Limb1..4 / Limb5..8 keep the source limbs
    ; for the subtraction pass further down.
    mov     Limb1, [Op1]
    mov     Limb5, [Op2]
    mov     Limb0, Limb1
    adc     Limb0, Limb5
    not     Limb0
    mov     [Op3], Limb0
    mov     Limb2, [Op1+8]
    mov     Limb6, [Op2+8]
    mov     Limb0, Limb2
    adc     Limb0, Limb6
    not     Limb0
    mov     [Op3+8], Limb0
    mov     Limb3, [Op1+16]
    mov     Limb7, [Op2+16]
    mov     Limb0, Limb3
    adc     Limb0, Limb7
    not     Limb0
    mov     [Op3+16], Limb0
    mov     Limb4, [Op1+24]
    mov     Limb8, [Op2+24]
    mov     Limb0, Limb4
    adc     Limb0, Limb8
    not     Limb0
    mov     [Op3+24], Limb0

    SaveAC
    LoadSC

    sbb     Limb1, Limb5
    mov     [Op4], Limb1
    sbb     Limb2, Limb6
    mov     [Op4+8], Limb2
    sbb     Limb3, Limb7
    mov     [Op4+16], Limb3
    sbb     Limb4, Limb8
    mov     [Op4+24], Limb4

    SaveSC

    add     Op1, 32
    add     Op2, 32
    add     Op3, 32
    add     Op4, 32

    ; The ADDs above clobbered ZF; re-establish ZF=1 iff Size=0.
    test   Size, Size
  .n_loop_pre:		; If we jump here, ZF=1 iff Size=0
    jz     .n_post      ;ajs:notshortform
    LoadAC              ; set carry for addition

    ; Size now counts 8-limb iterations.
    ; main loop - values below are best case - up to 50% fluctuation possible!
    ; - 3.50      cycles per limb in LD1$
    ; - 3.50      cycles per limb in LD2$
    ; - 5.10-5.50 cycles per limb in LD3$
    align   16
  .n_loop:

  %ifdef USE_PREFETCH
    prefetchnta [Op1+Offs]
    prefetchnta [Op2+Offs]
  %endif

    ; On entry to each iteration CF holds the addition carry.
    mov     Limb1, [Op1]        ; add the first quad-limb
    mov     Limb5, [Op2]
    mov     Limb0, Limb1
    adc     Limb0, Limb5
    not     Limb0
    mov     [Op3], Limb0
    mov     Limb2, [Op1+8]
    mov     Limb6, [Op2+8]
    mov     Limb0, Limb2
    adc     Limb0, Limb6
    not     Limb0
    mov     [Op3+8], Limb0
    mov     Limb3, [Op1+16]
    mov     Limb7, [Op2+16]
    mov     Limb0, Limb3
    adc     Limb0, Limb7
    not     Limb0
    mov     [Op3+16], Limb0
    mov     Limb4, [Op1+24]
    mov     Limb8, [Op2+24]
    mov     Limb0, Limb4
    adc     Limb0, Limb8
    not     Limb0
    mov     [Op3+24], Limb0
    ; LEA advances the pointer without touching CF; the second quad is
    ; therefore stored below at [Op3+32-64] .. [Op3+56-64].
    lea     Op3, [Op3 + 64]

    SaveAC              ; memorize add-carry
    LoadSC              ; set carry for subtraction

    sbb     Limb1, Limb5        ; now sub the first quad-limb
    mov     [Op4], Limb1
    sbb     Limb2, Limb6
    mov     [Op4+8], Limb2
    sbb     Limb3, Limb7
    mov     [Op4+16], Limb3
    sbb     Limb4, Limb8
    mov     [Op4+24], Limb4
    ; The difference goes through Limb0 here so that Limb1..4 / Limb5..8
    ; still hold the source limbs for the addition pass that follows.
    mov     Limb1, [Op1+32]     ; sub the second quad-limb
    mov     Limb5, [Op2+32]
    mov     Limb0, Limb1
    sbb     Limb0, Limb5
    mov     [Op4+32], Limb0
    mov     Limb2, [Op1+40]
    mov     Limb6, [Op2+40]
    mov     Limb0, Limb2
    sbb     Limb0, Limb6
    mov     [Op4+40], Limb0
    mov     Limb3, [Op1+48]
    mov     Limb7, [Op2+48]
    mov     Limb0, Limb3
    sbb     Limb0, Limb7
    mov     [Op4+48], Limb0
    mov     Limb4, [Op1+56]
    mov     Limb8, [Op2+56]
    mov     Limb0, Limb4
    sbb     Limb0, Limb8
    mov     [Op4+56], Limb0
    lea     Op4, [Op4 + 64]

    SaveSC                      ; memorize sub-carry
    LoadAC                      ; set carry for addition

    adc     Limb1, Limb5        ; add the second quad-limb
    not     Limb1
    mov     [Op3+32-64], Limb1
    adc     Limb2, Limb6
    not     Limb2
    mov     [Op3+40-64], Limb2
    adc     Limb3, Limb7
    not     Limb3
    mov     [Op3+48-64], Limb3
    adc     Limb4, Limb8
    not     Limb4
    mov     [Op3+56-64], Limb4

    lea     Op1, [Op1 + 64]
    lea     Op2, [Op2 + 64]


    ; DEC does not modify CF, so the addition carry survives into the next
    ; iteration (or into the SaveAC below).
    dec     Size
    jnz     .n_loop     ;ajs:notshortform

    SaveAC                      ; memorize add-carry
    ; hand back carries
  .n_post:
				; AL = cy1, AH = -cy3. cy2 = 1 here, as
                                ; there were non-zero words in the sum
    inc     al			; AL = cy1 + cy2 = cy1 + 1, AH = -cy3
    ; .all_zero is entered directly from .zero_sum when every sum limb was
    ; zero, i.e. cy2 = 0 and the INC above is skipped.
.all_zero:
    LoadSC			; AL = cy1 + cy2, CY = cy3
    adc     AL, AL		; AL = 2*(cy1 + cy2) + cy3
    movsx   EAX, AL             ; widen AL; the 32-bit write clears the upper half of RAX

  .Exit:
  END_PROC reg_save_list
.end:

; Out-of-line path for limbs whose sum is zero. On entry Limb2 = 0 (the
; negated sum limb), Limb1/Limb5 hold the current Op1/Op2 limbs and AL holds
; the addition carry out of this limb.
.zero_sum:
    mov     [Op3], Limb2        ; negated sum limb is 0
    LoadSC
    sbb     Limb1, Limb5
    SaveSC
    mov     [Op4], Limb1
    dec     Size
    jz      .all_zero           ; no limbs left and no non-zero sum limb seen
    add     Op1, 8
    add     Op2, 8
    add     Op3, 8
    add     Op4, 8
    ; Compute the next sum limb and decide whether it is still zero.
    mov     Limb1, [Op1]
    mov     Limb5, [Op2]
    mov     Limb2, Limb1
    LoadAC
    adc     Limb2, Limb5
    SaveAC
    neg     Limb2               ; ZF=1 iff this sum limb is zero as well
    jz      .zero_sum
    jmp     .not_zero