; mpir/mpn/x86_64w/haswell/sumdiff_n.asm

; ============================================================================
;  Copyright 2016 Jens Nurmann and Alexander Kruppa
;  This file is part of the MPIR Library.
;  The MPIR Library is free software; you can redistribute it and/or modify
;  it under the terms of the GNU Lesser General Public License as published
;  by the Free Software Foundation; either version 2.1 of the License, or (at
;  your option) any later version.
;  The MPIR Library is distributed in the hope that it will be useful, but
;  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
;  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
;  License for more details.
;  You should have received a copy of the GNU Lesser General Public License
;  along with the MPIR Library; see the file COPYING.LIB.  If not, write
;  to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
;  Boston, MA 02110-1301, USA.
; mp_limb_t mpn_sumdiff_n(mp_ptr Op3, mp_ptr Op4, mp_srcptr Op1, mp_srcptr Op2, mp_size_t Size)
; Linux     RAX           RDI         RSI         RDX            RCX            R8
; Win7      RAX           RCX         RDX         R8             R9             Stack
;
; Description:
; The function adds Op2 to Op1 and stores the result in Op3 while at the same
; time subtracting Op2 from Op1 with result in Op4. The final carries from
; addition and subtraction are handed back as a combined mp_limb_t. There is a
; gain in execution speed compared to separate addition and subtraction by
; reducing memory access. The factor depends on the size of the operands (the
; cache hierarchy in which the operands can be handled).
;
; Comments:
; - asm version implemented, tested & benched on 10.06.2015 by jn
; - On Nehalem per limb saving is 0.5 cycle in LD1$, LD2$ and LD3$
; - prefetching is assembled in only when USE_PREFETCH is defined
; - no XMM registers are used; reg_save_list names general-purpose registers only
; ============================================================================

; Build-time switch: selects the Windows x64 register mapping below.
%define USE_WIN64

%include 'yasm_mac.inc'

%ifdef USE_WIN64

; Registers handed to FRAME_PROC / END_PROC for save and restore.
; NOTE(review): R10 and R11 are volatile in the Microsoft x64 ABI, so listing
; them looks unnecessary - confirm against yasm_mac.inc before trimming.
%define reg_save_list RBX, RBP, RSI, RDI, R10, R11, R12, R13, R14, R15

    ; Op3 = destination of the sums, Op4 = destination of the differences,
    ; Op1 / Op2 = source operands, Size = limb count (loaded from the stack
    ; by the procedure), Limb1 / Limb2 = working registers.
    %define Op3     RCX
    %define Op4     RDX
    %define Op1     R8
    %define Op2     R9
    %define Size    RBX
    %define Limb1   RSI
    %define Limb2   RDI
%else
    ; Mapping matching the "Linux" row of the file header.
    ; NOTE(review): this branch defines no reg_save_list, and the procedure
    ; always reads Size from the stack, so it does not look usable as is -
    ; confirm before enabling.
    %define Op3     RDI
    %define Op4     RSI
    %define Op1     RDX
    %define Op2     RCX
    %define Size    R8
    %define Limb1   RBX
    %define Limb2   R9
%endif

    ; Working registers shared by both mappings. In the 4-limb blocks
    ; Limb1..Limb4 hold limbs of Op1, Limb5..Limb8 hold limbs of Op2 and
    ; Limb0 is the scratch register for the result that is stored first.
    %define Limb0   RBP
    %define Limb3   R10
    %define Limb4   R11
    %define Limb5   R12
    %define Limb6   R13
    %define Limb7   R14
    %define Limb8   R15

; Byte offset used by the optional prefetch instructions in the main loop.
; PREFETCH_STRIDE is not defined in this file - presumably supplied by the
; include file or the build; confirm.
%ifdef USE_PREFETCH
%define Offs    PREFETCH_STRIDE
%endif

; The add carry is parked in AL as 0 or 1:
;   SaveAC  AL = CF
;   LoadAC  CF = bit 0 of AL; AL becomes 0 (AL is only ever 0 or 1 here)
%define SaveAC  setc    AL
%define LoadAC  shr     AL, 1

; The sub carry (borrow) is parked in AH as 0 or 0xFF:
;   SaveSC  AH = AH - AH - CF, i.e. 0 or 0xFF
;   LoadSC  CF = 1 iff AH was 0xFF; AH is left as 0 or 0xFE and is
;           rewritten by the next SaveSC
%define SaveSC  sbb     AH, AH
%define LoadSC  add     AH, AH

BITS 64

; ----------------------------------------------------------------------------
; mpn_sumdiff_n
;
; Writes Op1 + Op2 to Op3 and Op1 - Op2 to Op4, limb by limb, and returns
; RAX = 2*add_carry + sub_carry.
;
; Structure:
;   - bit 0 of Size set  -> handle 1 limb
;   - bit 1 of Size set  -> handle 2 limbs
;   - bit 2 of Size set  -> handle 4 limbs
;   - remaining Size>>3  -> main loop, 8 limbs per iteration
;
; Carry handling: there is a single CF but two carry chains. The chain that
; is not currently running is parked in AL (add) or AH (sub) through the
; SaveAC/LoadAC/SaveSC/LoadSC macros. Pointer updates placed between two
; halves of a chain use LEA, and the loop counter uses DEC, because neither
; modifies CF.
; ----------------------------------------------------------------------------
align   32

   FRAME_PROC mpn_sumdiff_n, 0, reg_save_list
    ; Fifth argument lives on the stack: 8 bytes return address + 32 bytes
    ; shadow space = 40 above the entry RSP. stack_use is not defined in this
    ; file - presumably FRAME_PROC sets it to the bytes it pushed; confirm in
    ; yasm_mac.inc.
    mov     Size, [rsp+stack_use+40]

    xor     EAX, EAX            ; clear add & sub carry

    ; ---- 1 limb, taken when bit 0 of Size is set ----
    shr     Size, 1
    jnc     .sumdiff_n_two

    mov     Limb1, [Op1]
    mov     Limb5, [Op2]
    mov     Limb2, Limb1
    add     Limb2, Limb5        ; first limb: plain add, no incoming carry
    mov     [Op3], Limb2

    SaveAC

    sub     Limb1, Limb5        ; first limb: plain sub, no incoming borrow
    mov     [Op4], Limb1

    SaveSC

    ; ADD changes the flags, which is fine here: both carries are parked.
    add     Op1, 8
    add     Op2, 8
    add     Op3, 8
    add     Op4, 8

  .sumdiff_n_two:

    ; ---- 2 limbs, taken when the next bit of Size is set ----
    shr     Size, 1
    jnc     .sumdiff_n_four

    mov     Limb1, [Op1]
    mov     Limb2, [Op1+8]
    mov     Limb5, [Op2]
    mov     Limb6, [Op2+8]

    LoadAC

    ; sums go through Limb3 / Limb4 so Limb1 / Limb2 stay intact for the sub
    mov     Limb3, Limb1
    adc     Limb3, Limb5
    mov     [Op3], Limb3
    mov     Limb4, Limb2
    adc     Limb4, Limb6
    mov     [Op3+8], Limb4

    SaveAC
    LoadSC

    sbb     Limb1, Limb5
    mov     [Op4], Limb1
    sbb     Limb2, Limb6
    mov     [Op4+8], Limb2

    SaveSC

    add     Op1, 16
    add     Op2, 16
    add     Op3, 16
    add     Op4, 16

  .sumdiff_n_four:

    ; ---- 4 limbs, taken when the next bit of Size is set ----
    ; After this shift Size holds the number of 8-limb loop iterations, and
    ; ZF reflects whether that count is zero (used at .sumdiff_n_loop_pre).
    shr     Size, 1
    jnc     .sumdiff_n_loop_pre ;ajs:notshortform

    LoadAC

    ; slight change of scheme here - avoid too many
    ; memory to reg or reg to memory moves in a row
    mov     Limb1, [Op1]
    mov     Limb5, [Op2]
    mov     Limb0, Limb1
    adc     Limb0, Limb5
    mov     [Op3], Limb0
    mov     Limb2, [Op1+8]
    mov     Limb6, [Op2+8]
    mov     Limb0, Limb2
    adc     Limb0, Limb6
    mov     [Op3+8], Limb0
    mov     Limb3, [Op1+16]
    mov     Limb7, [Op2+16]
    mov     Limb0, Limb3
    adc     Limb0, Limb7
    mov     [Op3+16], Limb0
    mov     Limb4, [Op1+24]
    mov     Limb8, [Op2+24]
    mov     Limb0, Limb4
    adc     Limb0, Limb8
    mov     [Op3+24], Limb0

    SaveAC
    LoadSC

    ; the operands are still in Limb1..Limb8, so the sub needs no reloads
    sbb     Limb1, Limb5
    mov     [Op4], Limb1
    sbb     Limb2, Limb6
    mov     [Op4+8], Limb2
    sbb     Limb3, Limb7
    mov     [Op4+16], Limb3
    sbb     Limb4, Limb8
    mov     [Op4+24], Limb4

    SaveSC

    add     Op1, 32
    add     Op2, 32
    add     Op3, 32
    add     Op4, 32

    ; the ADDs above overwrote ZF, so re-derive it from the iteration count
    test   Size, Size
  .sumdiff_n_loop_pre:		; If we jump here, ZF=1 iff Size=0
    ; no full 8-limb group left: AL / AH already hold the final carries
    jz     .sumdiff_n_post      ;ajs:notshortform
    LoadAC              ; set carry for addition

    ; main loop - values below are best case - up to 50% fluctuation possible!
    ; - 3.50      cycles per limb in LD1$
    ; - 3.50      cycles per limb in LD2$
    ; - 5.10-5.50 cycles per limb in LD3$
    ;
    ; Loop invariant at the top of each iteration: CF = add carry,
    ; AH = parked sub carry. The order per iteration is add(lo 4), sub(lo 4),
    ; sub(hi 4), add(hi 4), so the carries are swapped only twice per 8 limbs.
    align   16
  .sumdiff_n_loop:

  %ifdef USE_PREFETCH
    prefetchnta [Op1+Offs]
    prefetchnta [Op2+Offs]
  %endif

    mov     Limb1, [Op1]        ; add the first quad-limb
    mov     Limb5, [Op2]
    mov     Limb0, Limb1
    adc     Limb0, Limb5
    mov     [Op3], Limb0
    mov     Limb2, [Op1+8]
    mov     Limb6, [Op2+8]
    mov     Limb0, Limb2
    adc     Limb0, Limb6
    mov     [Op3+8], Limb0
    mov     Limb3, [Op1+16]
    mov     Limb7, [Op2+16]
    mov     Limb0, Limb3
    adc     Limb0, Limb7
    mov     [Op3+16], Limb0
    mov     Limb4, [Op1+24]
    mov     Limb8, [Op2+24]
    mov     Limb0, Limb4
    adc     Limb0, Limb8
    mov     [Op3+24], Limb0
    ; Op3 is advanced early; the second-half stores below compensate with -64
    lea     Op3, [Op3 + 64]

    SaveAC              ; memorize add-carry
    LoadSC              ; set carry for subtraction

    sbb     Limb1, Limb5        ; now sub the first quad-limb
    mov     [Op4], Limb1
    sbb     Limb2, Limb6
    mov     [Op4+8], Limb2
    sbb     Limb3, Limb7
    mov     [Op4+16], Limb3
    sbb     Limb4, Limb8
    mov     [Op4+24], Limb4

    ; the borrow chain continues straight into the second quad-limb; here the
    ; difference goes through Limb0 so Limb1..Limb4 survive for the add below
    mov     Limb1, [Op1+32]     ; sub the second quad-limb
    mov     Limb5, [Op2+32]
    mov     Limb0, Limb1
    sbb     Limb0, Limb5
    mov     [Op4+32], Limb0
    mov     Limb2, [Op1+40]
    mov     Limb6, [Op2+40]
    mov     Limb0, Limb2
    sbb     Limb0, Limb6
    mov     [Op4+40], Limb0
    mov     Limb3, [Op1+48]
    mov     Limb7, [Op2+48]
    mov     Limb0, Limb3
    sbb     Limb0, Limb7
    mov     [Op4+48], Limb0
    mov     Limb4, [Op1+56]
    mov     Limb8, [Op2+56]
    mov     Limb0, Limb4
    sbb     Limb0, Limb8
    mov     [Op4+56], Limb0
    lea     Op4, [Op4 + 64]

    SaveSC                      ; memorize sub-carry
    LoadAC                      ; set carry for addition

    adc     Limb1, Limb5        ; add the second quad-limb
    mov     [Op3+32-64], Limb1
    adc     Limb2, Limb6
    mov     [Op3+40-64], Limb2
    adc     Limb3, Limb7
    mov     [Op3+48-64], Limb3
    adc     Limb4, Limb8
    mov     [Op3+56-64], Limb4

    ; LEA and DEC leave CF alone, so the add carry reaches the next iteration
    lea     Op1, [Op1 + 64]
    lea     Op2, [Op2 + 64]

    dec     Size
    jnz     .sumdiff_n_loop     ;ajs:notshortform

    SaveAC                      ; memorize add-carry
    ; hand back carries
  .sumdiff_n_post:
				; AL = add_carry, AH = -sub_carry
    LoadSC			; AL = add_carry, CY = sub_carry
    adc     AL, AL		; AL = 2*add_carry + sub_carry
    ; AL is at most 3 here, so the sign extension yields the same value
    movsx   RAX, AL

   END_PROC reg_save_list
Add the latest haswell and skylake code to the Windows x64 build 2016-12-10 09:15:40 -05:00			`; ============================================================================`
			`; Copyright 2016 Jens Nurmann and Alexander Kruppa`
			`; This file is part of the MPIR Library.`
			`; The MPIR Library is free software; you can redistribute it and/or modify`
			`; it under the terms of the GNU Lesser General Public License as published`
			`; by the Free Software Foundation; either version 2.1 of the License, or (at`
			`; your option) any later version.`
			`; The MPIR Library is distributed in the hope that it will be useful, but`
			`; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY`
			`; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public`
			`; License for more details.`
			`; You should have received a copy of the GNU Lesser General Public License`
			`; along with the MPIR Library; see the file COPYING.LIB. If not, write`
			`; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,`
			`; Boston, MA 02110-1301, USA.`
			`; mp_limb_t mpn_sumdiff_n(mp_ptr Op3, mp_ptr Op4, mp_srcptr Op1, mp_srcptr Op2, mp_size_t Size)`
			`; Linux RAX RDI RSI RDX RCX R8`
			`; Win7 RAX RCX RDX R8 R9 Stack`
			`;`
			`; Description:`
			`; The function adds Op2 to Op1 and stores the result in Op3 while at the same`
			`; time subtracting Op2 from Op1 with result in Op4. The final carries from`
			`; addition and subtraction are handed back as a combined mp_limb_t. There is a`
			`; gain in execution speed compared to separate addition and subtraction by`
			`; reducing memory access. The factor depends on the size of the operands (the`
			`; cache hierarchy in which the operands can be handled).`
			`;`
			`; Comments:`
			`; - asm version implemented, tested & benched on 10.06.2015 by jn`
			`; - On Nehalem per limb saving is 0.5 cycle in LD1$, LD2$ and LD3$`
			`; - includes prefetching`
			`; - includes XMM save & restore`
			`; ============================================================================`

			`%define USE_WIN64`

			`%include 'yasm_mac.inc'`

			`%ifdef USE_WIN64`

			`%define reg_save_list RBX, RBP, RSI, RDI, R10, R11, R12, R13, R14, R15`

			`%define Op3 RCX`
			`%define Op4 RDX`
			`%define Op1 R8`
			`%define Op2 R9`
			`%define Size RBX`
			`%define Limb1 RSI`
			`%define Limb2 RDI`
			`%else`
			`%define Op3 RDI`
			`%define Op4 RSI`
			`%define Op1 RDX`
			`%define Op2 RCX`
			`%define Size R8`
			`%define Limb1 RBX`
			`%define Limb2 R9`
			`%endif`

			`%define Limb0 RBP`
			`%define Limb3 R10`
			`%define Limb4 R11`
			`%define Limb5 R12`
			`%define Limb6 R13`
			`%define Limb7 R14`
			`%define Limb8 R15`

			`%ifdef USE_PREFETCH`
			`%define Offs PREFETCH_STRIDE`
			`%endif`

			`%define SaveAC setc AL`
			`%define LoadAC shr AL, 1`

			`%define SaveSC sbb AH, AH`
			`%define LoadSC add AH, AH`

			`BITS 64`

			`align 32`

			`FRAME_PROC mpn_sumdiff_n, 0, reg_save_list`
			`mov Size, [rsp+stack_use+40]`

			`xor EAX, EAX ; clear add & sub carry`

			`shr Size, 1`
			`jnc .sumdiff_n_two`

			`mov Limb1, [Op1]`
			`mov Limb5, [Op2]`
			`mov Limb2, Limb1`
			`add Limb2, Limb5`
			`mov [Op3], Limb2`

			`SaveAC`

			`sub Limb1, Limb5`
			`mov [Op4], Limb1`

			`SaveSC`

			`add Op1, 8`
			`add Op2, 8`
			`add Op3, 8`
			`add Op4, 8`

			`.sumdiff_n_two:`

			`shr Size, 1`
			`jnc .sumdiff_n_four`

			`mov Limb1, [Op1]`
			`mov Limb2, [Op1+8]`
			`mov Limb5, [Op2]`
			`mov Limb6, [Op2+8]`

			`LoadAC`

			`mov Limb3, Limb1`
			`adc Limb3, Limb5`
			`mov [Op3], Limb3`
			`mov Limb4, Limb2`
			`adc Limb4, Limb6`
			`mov [Op3+8], Limb4`

			`SaveAC`
			`LoadSC`

			`sbb Limb1, Limb5`
			`mov [Op4], Limb1`
			`sbb Limb2, Limb6`
			`mov [Op4+8], Limb2`

			`SaveSC`

			`add Op1, 16`
			`add Op2, 16`
			`add Op3, 16`
			`add Op4, 16`

			`.sumdiff_n_four:`

			`shr Size, 1`
			`jnc .sumdiff_n_loop_pre ;ajs:notshortform`

			`LoadAC`

			`; slight change of scheme here - avoid too many`
			`; memory to reg or reg to memory moves in a row`
			`mov Limb1, [Op1]`
			`mov Limb5, [Op2]`
			`mov Limb0, Limb1`
			`adc Limb0, Limb5`
			`mov [Op3], Limb0`
			`mov Limb2, [Op1+8]`
			`mov Limb6, [Op2+8]`
			`mov Limb0, Limb2`
			`adc Limb0, Limb6`
			`mov [Op3+8], Limb0`
			`mov Limb3, [Op1+16]`
			`mov Limb7, [Op2+16]`
			`mov Limb0, Limb3`
			`adc Limb0, Limb7`
			`mov [Op3+16], Limb0`
			`mov Limb4, [Op1+24]`
			`mov Limb8, [Op2+24]`
			`mov Limb0, Limb4`
			`adc Limb0, Limb8`
			`mov [Op3+24], Limb0`

			`SaveAC`
			`LoadSC`

			`sbb Limb1, Limb5`
			`mov [Op4], Limb1`
			`sbb Limb2, Limb6`
			`mov [Op4+8], Limb2`
			`sbb Limb3, Limb7`
			`mov [Op4+16], Limb3`
			`sbb Limb4, Limb8`
			`mov [Op4+24], Limb4`

			`SaveSC`

			`add Op1, 32`
			`add Op2, 32`
			`add Op3, 32`
			`add Op4, 32`

			`test Size, Size`
			`.sumdiff_n_loop_pre: ; If we jump here, ZF=1 iff Size=0`
			`jz .sumdiff_n_post ;ajs:notshortform`
			`LoadAC ; set carry for addition`

			`; main loop - values below are best case - up to 50% fluctuation possible!`
			`; - 3.50 cycles per limb in LD1$`
			`; - 3.50 cycles per limb in LD2$`
			`; - 5.10-5.50 cycles per limb in LD3$`
			`align 16`
			`.sumdiff_n_loop:`

			`%ifdef USE_PREFETCH`
			`prefetchnta [Op1+Offs]`
			`prefetchnta [Op2+Offs]`
			`%endif`

			`mov Limb1, [Op1] ; add the first quad-limb`
			`mov Limb5, [Op2]`
			`mov Limb0, Limb1`
			`adc Limb0, Limb5`
			`mov [Op3], Limb0`
			`mov Limb2, [Op1+8]`
			`mov Limb6, [Op2+8]`
			`mov Limb0, Limb2`
			`adc Limb0, Limb6`
			`mov [Op3+8], Limb0`
			`mov Limb3, [Op1+16]`
			`mov Limb7, [Op2+16]`
			`mov Limb0, Limb3`
			`adc Limb0, Limb7`
			`mov [Op3+16], Limb0`
			`mov Limb4, [Op1+24]`
			`mov Limb8, [Op2+24]`
			`mov Limb0, Limb4`
			`adc Limb0, Limb8`
			`mov [Op3+24], Limb0`
			`lea Op3, [Op3 + 64]`

			`SaveAC ; memorize add-carry`
			`LoadSC ; set carry for subtraction`

			`sbb Limb1, Limb5 ; now sub the first quad-limb`
			`mov [Op4], Limb1`
			`sbb Limb2, Limb6`
			`mov [Op4+8], Limb2`
			`sbb Limb3, Limb7`
			`mov [Op4+16], Limb3`
			`sbb Limb4, Limb8`
			`mov [Op4+24], Limb4`

			`mov Limb1, [Op1+32] ; sub the second quad-limb`
			`mov Limb5, [Op2+32]`
			`mov Limb0, Limb1`
			`sbb Limb0, Limb5`
			`mov [Op4+32], Limb0`
			`mov Limb2, [Op1+40]`
			`mov Limb6, [Op2+40]`
			`mov Limb0, Limb2`
			`sbb Limb0, Limb6`
			`mov [Op4+40], Limb0`
			`mov Limb3, [Op1+48]`
			`mov Limb7, [Op2+48]`
			`mov Limb0, Limb3`
			`sbb Limb0, Limb7`
			`mov [Op4+48], Limb0`
			`mov Limb4, [Op1+56]`
			`mov Limb8, [Op2+56]`
			`mov Limb0, Limb4`
			`sbb Limb0, Limb8`
			`mov [Op4+56], Limb0`
			`lea Op4, [Op4 + 64]`

			`SaveSC ; memorize sub-carry`
			`LoadAC ; set carry for addition`

			`adc Limb1, Limb5 ; add the second quad-limb`
			`mov [Op3+32-64], Limb1`
			`adc Limb2, Limb6`
			`mov [Op3+40-64], Limb2`
			`adc Limb3, Limb7`
			`mov [Op3+48-64], Limb3`
			`adc Limb4, Limb8`
			`mov [Op3+56-64], Limb4`

			`lea Op1, [Op1 + 64]`
			`lea Op2, [Op2 + 64]`

			`dec Size`
			`jnz .sumdiff_n_loop ;ajs:notshortform`

			`SaveAC ; memorize add-carry`
			`; hand back carries`
			`.sumdiff_n_post:`
			`; AL = add_carry, AH = -sub_carry`
			`LoadSC ; AL = add_carry, CY = sub_carry`
			`adc AL, AL ; AL = 2*add_carry + sub_carry`
			`movsx RAX, AL`

			`END_PROC reg_save_list`