;  Copyright 2016 Jens Nurmann and Alexander Kruppa

;  This file is part of the MPIR Library.

;  The MPIR Library is free software; you can redistribute it and/or modify
;  it under the terms of the GNU Lesser General Public License as published
;  by the Free Software Foundation; either version 2.1 of the License, or (at
;  your option) any later version.

;  The MPIR Library is distributed in the hope that it will be useful, but
;  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
;  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
;  License for more details.

;  You should have received a copy of the GNU Lesser General Public License
;  along with the MPIR Library; see the file COPYING.LIB.  If not, write
;  to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
;  Boston, MA 02110-1301, USA.

; mp_limb_t sublsh1_n(mp_ptr Op3, mp_srcptr Op2, mp_srcptr Op1, mp_size_t Size )
; Linux     RAX       RDI         RSI            RDX            RCX
; Win7      RAX       RCX         RDX            R8             R9
;
; Description:
; The function shifts Op1 left one bit, subtracts it from Op2, stores the result
; in Op3 and hands back the total carry. There is a gain in execution speed
; compared to separate shift and subtract by interleaving the elementary operations
; and reducing memory access. The factor depends on the size of the operands
; (the cache hierarchy in which the operands can be handled).
;
; Caveats:
; - for asm the processor MUST support LAHF/SAHF in 64 bit mode!
; - the total carry is in [0..2]!
;
; Comments:
; - asm version implemented, tested & benched on 16.05.2015 by jn
; - improved asm version implemented, tested & benched on 30.07.2015 by jn
; - On Nehalem per limb saving is 0.7 cycles in LD1$, LD2$ and 1-2 in LD3$
; - includes LAHF / SAHF
; - includes prefetching
; - includes XMM save & restore
;
; Linux: (rdi, rcx) = (rsi, rcx) - (rdx, rcx)<<1
; ============================================================================

; NOTE(review): USE_WIN64 is not referenced anywhere else in this file; it is
; presumably consumed by yasm_mac.inc (included below) - confirm.
%define USE_WIN64

; Elementary limb operations used by the procedure and by ACCUMULATE.
; This file implements mpn_sublsh1_n, i.e. Op3 = Op2 - (Op1 << 1), so the
; first limb needs a plain subtract and every following limb a
; subtract-with-borrow.  (add/adc here would compute Op2 + (Op1 << 1), which
; is the addlsh1 operation, and would return a carry instead of a borrow.)
%define ADDSUB sub
%define ADCSBB sbb

; NOTE(review): FRAME_PROC / END_PROC used further down are not defined in
; this file; presumably they come from this include - confirm.
%include "yasm_mac.inc"

BITS 64

; Register list handed to both FRAME_PROC and END_PROC of the procedure below.
; It names every register the procedure uses besides RAX and the four
; argument registers (Limb0..Limb8 and, with USE_PREFETCH, Offs).
%define reg_save_list RBX, RBP, RSI, RDI, R10, R11, R12, R13, R14, R15

; Argument registers - these match the "Win7" row of the prototype comment in
; the file header (RCX, RDX, R8, R9).
; Op3  = destination limb pointer
; Op2  = source operand that is read unshifted
; Op1  = source operand that is shifted left by one bit
; Size = number of limbs
%define Op3     RCX
%define Op2     RDX
%define Op1     R8
%define Size    R9

; Limb0 carries the last (unshifted) limb of Op1 from one chunk to the next so
; that its top bit can be shifted into the next limb; Limb1.. receive the
; limbs of Op1 loaded for the current chunk.
; The numeric suffix matters: ACCUMULATE builds the register name by pasting
; its argument onto "Limb".
%define Limb0   RBX
%define Limb1   RDI
%define Limb2   RSI

; Limb3..Limb8: Limb3/Limb4 are used by the four-limb tail block and the main
; loop, Limb5..Limb8 by the eight-limb main loop only.
%define Limb3   R10
%define Limb4   R11
%define Limb5   R12
%define Limb6   R13
%define Limb7   R14
%define Limb8   R15

; Offs holds the prefetch distance added to Op1/Op2 in the main loop; it only
; exists when USE_PREFETCH is defined (USE_PREFETCH is not defined in this
; file).
%ifdef USE_PREFETCH
%define Offs    RBP
%endif


; ACCUMULATE n
;   Processes limb n of the current chunk: loads limb n of Op2, applies
;   ADCSBB (the carry-chained instruction selected by the %define near the top
;   of the file) with register Limb<n> as second operand, and stores the
;   result to limb n of Op3.
;   In:    CF = carry/borrow coming from the previous limb
;          Limb<n> = operand prepared by the caller
;   Out:   CF = carry/borrow for the next limb
;   Clobb: RAX (scratch) and the arithmetic flags.  Because RAX is
;          overwritten, flags kept in AH by LAHF must be restored with SAHF
;          before the first ACCUMULATE of a chain and saved again with LAHF
;          after the last one.
%macro ACCUMULATE 1
    mov     rax, [Op2 + 8 * %1]
    ADCSBB  rax, Limb%1
    mov     [Op3 + 8 * %1], rax
%endmacro


    align   32

  ; ---------------------------------------------------------------------------
  ; mpn_sublsh1_n
  ;
  ; Walks Size limbs from the least significant end.  For every limb it forms
  ; the corresponding limb of (Op1 << 1) with SHRD and combines it with the
  ; limb of Op2 using ADDSUB (first limb) / ADCSBB (all further limbs) - the
  ; instructions selected by the %defines near the top of the file - storing
  ; the result to Op3.
  ;
  ; In:    Op3 (RCX) = destination, Op2 (RDX) = unshifted source,
  ;        Op1 (R8)  = source to be shifted, Size (R9) = limb count
  ; Out:   RAX = (top bit of the last Op1 limb) + (final CF), i.e. 0, 1 or 2
  ;
  ; Structure: the low three bits of Size select optional blocks of 1, 2 and 4
  ; limbs; the remaining Size / 8 chunks are handled by the unrolled main loop.
  ;
  ; Carry handling: the pointer updates (add) and the loop counter (sub)
  ; destroy CF, so the carry chain is parked in AH with LAHF after each block
  ; and brought back with SAHF before the next one.  RAX is also the scratch
  ; register of each block, which is safe because every block reloads the
  ; flags before touching RAX and saves them again afterwards.
  ;
  ; NOTE(review): FRAME_PROC / END_PROC are defined outside this file;
  ; presumably they save/restore reg_save_list and emit the return - confirm.
  ; ---------------------------------------------------------------------------
  FRAME_PROC mpn_sublsh1_n, 0, reg_save_list

  %ifdef USE_PREFETCH
    mov     Offs, PREFETCH_STRIDE   ; Attn: check if redefining Offs
  %endif

    ; prepare shift & limb operation with loop-unrolling 8
    ; Limb0 = 0: nothing is shifted in below the least significant limb
    xor     Limb0, Limb0
    lahf                        ; memorize clear carry (from "xor" above)

    ; --- optional single limb (Size is odd) ---
    test    Size, 1
    je      .n_two

    mov     Limb1, [Op1]
    shrd    Limb0, Limb1, 63    ; Limb0 = (Limb1 << 1) | (old Limb0 >> 63)

    mov     rax, [Op2]
    ADDSUB  rax, Limb0          ; very first limb: no incoming carry, so no chaining
    mov     [Op3], rax
    lahf                        ; park carry in AH (the adds below destroy CF)

    add     Op1, 8
    add     Op2, 8
    add     Op3, 8
    mov     Limb0, Limb1        ; keep unshifted limb: its top bit feeds the next shrd

  .n_two:

    ; --- optional block of two limbs (bit 1 of Size set) ---
    test    Size, 2
    je      .n_four

    mov     Limb1, [Op1]
    mov     Limb2, [Op1+8]
    shrd    Limb0, Limb1, 63    ; Limb0/Limb1 now hold two limbs of Op1 << 1
    shrd    Limb1, Limb2, 63    ; Limb2 stays unshifted for the next block

    sahf                        ; restore carry from AH before RAX is reused
    ACCUMULATE 0
    ACCUMULATE 1
    lahf                        ; park carry again

    add     Op1, 16
    add     Op2, 16
    add     Op3, 16
    mov     Limb0, Limb2

  .n_four:

    ; --- optional block of four limbs (bit 2 of Size set) ---
    test    Size, 4
    je      .n_test ;ajs:notshortform

    mov     Limb1, [Op1]
    mov     Limb2, [Op1+8]
    shrd    Limb0, Limb1, 63
    shrd    Limb1, Limb2, 63
    mov     Limb3, [Op1+16]
    mov     Limb4, [Op1+24]
    shrd    Limb2, Limb3, 63
    shrd    Limb3, Limb4, 63    ; Limb4 stays unshifted for the next block

    sahf                        ; restore carry
    ACCUMULATE 0
    ACCUMULATE 1
    ACCUMULATE 2
    ACCUMULATE 3
    lahf                        ; park carry again

    add     Op1, 32
    add     Op2, 32
    add     Op3, 32
    mov     Limb0, Limb4
    jmp     .n_test ;ajs:notshortform

    ; main loop
    ; - 2.40-2.50 cycles per limb in L1D$
    ; - 2.6       cycles per limb in L2D$
    ; - 2.80-3.30 cycles per limb in L3D$
    align   16
  .n_loop:

  %ifdef USE_PREFETCH
    prefetchnta [Op1+Offs]
    prefetchnta [Op2+Offs]
  %endif

    mov     Limb1, [Op1]        ; prepare shifted oct-limb from Op1
    mov     Limb2, [Op1+8]
    mov     Limb3, [Op1+16]
    shrd    Limb0, Limb1, 63
    shrd    Limb1, Limb2, 63
    shrd    Limb2, Limb3, 63
    mov     Limb4, [Op1+24]
    mov     Limb5, [Op1+32]
    mov     Limb6, [Op1+40]
    shrd    Limb3, Limb4, 63
    shrd    Limb4, Limb5, 63
    shrd    Limb5, Limb6, 63
    mov     Limb7, [Op1+48]
    mov     Limb8, [Op1+56]
    shrd    Limb6, Limb7, 63
    shrd    Limb7, Limb8, 63    ; Limb8 stays unshifted for the next round

    sahf                        ; restore carry
    ACCUMULATE 0                ; apply ADCSBB: Op2 limb with shifted Op1 limb, result in Op3
    ACCUMULATE 1
    ACCUMULATE 2
    ACCUMULATE 3
    ACCUMULATE 4
    ACCUMULATE 5
    ACCUMULATE 6
    ACCUMULATE 7
    lahf                        ; remember carry for next round

    add     Op1, 64
    add     Op2, 64
    add     Op3, 64
    mov     Limb0, Limb8

  .n_test:

    ; Size was not reduced by the tail blocks above; its low three bits do not
    ; change how often this subtraction succeeds, so the loop body executes
    ; Size / 8 (rounded down) times.
    sub     Size, 8
    jnc     .n_loop             ; no borrow: at least eight more limbs to do

    ; housekeeping - hand back total carry
    shr     Limb0, 63           ; bit shifted out of the most significant Op1 limb
    sahf                        ; restore carry of the last limb operation
    adc     Limb0, 0            ; Limb0=0/1/2 depending on final carry and shift
    mov     RAX, Limb0
    END_PROC reg_save_list