; mpir/mpn/x86_64w/haswell/nsumdiff_n.asm
; 2017-01-22 22:41:01 +00:00
;
; 340 lines
; 8.7 KiB
; NASM

; ============================================================================
; Copyright 2016 Jens Nurmann and Alexander Kruppa
; This file is part of the MPIR Library.
; The MPIR Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published
; by the Free Software Foundation; either version 2.1 of the License, or (at
; your option) any later version.
; The MPIR Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
; License for more details.
; You should have received a copy of the GNU Lesser General Public License
; along with the MPIR Library; see the file COPYING.LIB. If not, write
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
; Boston, MA 02110-1301, USA.
;
; mp_limb_t mpn_nsumdiff_n(mp_ptr Op3, mp_ptr Op4, mp_srcptr Op1, mp_srcptr Op2, mp_size_t Size)
;
;           return  Op3  Op4  Op1  Op2  Size
; Linux     RAX     RDI  RSI  RDX  RCX  R8
; Win7      RAX     RCX  RDX  R8   R9   Stack
;
; Description:
; The function computes -(Op2+Op1) and stores the result in Op3 while at the
; same time subtracting Op2 from Op1 with result in Op4. The final carries from
; addition and subtraction are handed back as a combined mp_limb_t. There is a
; gain in execution speed compared to separate addition and subtraction by
; reducing memory access. The factor depends on the size of the operands (the
; cache hierarchy in which the operands can be handled).
;
; Equivalent to the following, assuming no overlap
; (r1 = Op3, r2 = Op4, s1 = Op1, s2 = Op2, n = Size):
; cy1 = mpn_add_n(r1, s1, s2, n);
; cy2 = mpn_neg_n(r1, r1, n); /* cy2 = [{r1,n} != 0] */
; cy3 = mpn_sub_n(r2, s1, s2, n);
; return 2*(cy1 + cy2) + cy3;
; ============================================================================
; Project macro file; FRAME_PROC / END_PROC used by the procedure are not
; defined in this file and presumably come from here.
%include 'yasm_mac.inc'
; Registers handed to FRAME_PROC / END_PROC. Every register in this list is
; callee-saved under the Microsoft x64 convention and is written by this
; routine (RDI/RSI as Op3/Op4, RBX/RBP/R12-R15 as limb scratch).
%define reg_save_list rsi, rdi, rbx, rbp, r12, r13, r14, r15
; Operand registers. These match the "Linux" row of the header table; the
; procedure entry copies the Win64 argument registers into them.
; Op3: destination of the negated sum -(Op1 + Op2)
%define Op3 RDI
; Op4: destination of the difference Op1 - Op2
%define Op4 RSI
; Op1, Op2: source operands
%define Op1 RDX
%define Op2 RCX
; Size: limb count on entry; later reused as the main-loop iteration counter
%define Size R8
; Scratch registers for limbs. Limb1..Limb4 receive limbs of Op1,
; Limb5..Limb8 receive limbs of Op2, Limb0 is a temporary for results.
%define Limb0 RBP
%define Limb1 RBX
%define Limb2 R9
%define Limb3 R10
%define Limb4 R11
%define Limb5 R12
%define Limb6 R13
%define Limb7 R14
%define Limb8 R15
; Optional prefetching: the distance is a compile-time constant rather than a
; register. PREFETCH_STRIDE is not defined in this file and must be supplied
; elsewhere when USE_PREFETCH is defined.
%ifdef USE_PREFETCH
%define Offs PREFETCH_STRIDE ; no more regs avail. => fallback to const
%endif
; Carry bookkeeping. The addition carry and the subtraction borrow are parked
; in AL and AH so that one chain can be suspended while the other one runs.
;   SaveAC: AL = CF (0 or 1)
;   LoadAC: CF = bit 0 of AL (AL is shifted right, so it reads 0 afterwards)
;   SaveSC: AH = 0 - CF (0x00 or 0xFF)
;   LoadSC: CF = top bit of AH (AH is doubled, so a SaveSC must follow before
;           the next LoadSC)
%define SaveAC setc AL
%define LoadAC shr AL, 1
%define SaveSC sbb AH, AH
%define LoadSC add AH, AH
; ----------------------------------------------------------------------------
; mpn_nsumdiff_n - Windows x64 entry point
;
; Computes, limb by limb from the least significant end,
;   {Op3, Size} = -({Op1, Size} + {Op2, Size})
;   {Op4, Size} =   {Op1, Size} - {Op2, Size}
; and returns 2*(cy1 + cy2) + cy3 in EAX (see the file header for cy1..cy3).
;
; Negation is done on the fly: low sum limbs that are zero are stored as zero,
; the first non-zero sum limb is negated (NEG), every higher sum limb is
; complemented (NOT).
;
; State kept across the routine:
;   AL - pending addition carry (0/1) whenever the adc chain is suspended
;   AH - pending subtraction borrow as 0x00/0xFF whenever the sbb chain is
;        suspended
;
; The first limb is loaded before Size is examined, so Size = 0 is not
; handled by this code.
; ----------------------------------------------------------------------------
BITS 64
align 32
FRAME_PROC mpn_nsumdiff_n, 0, reg_save_list
; Move the Win64 arguments (rcx, rdx, r8, r9, stack) into the registers named
; by Op3, Op4, Op1, Op2 and Size. The order matters: rdx and rcx are read
; before they are overwritten, and r8 is read before it is reloaded.
mov rdi, rcx
mov rsi, rdx
mov rdx, r8
mov rcx, r9
; Fifth argument: 40 = return address (8) + shadow space (32) above the
; caller's stack pointer; stack_use is presumably the number of bytes
; FRAME_PROC pushed (defined by the macro, not in this file).
mov r8, [rsp+stack_use+40]
xor EAX, EAX ; clear add & sub carry
; First we handle any words whose sum = 0
mov Limb1, [Op1]
mov Limb5, [Op2]
mov Limb2, Limb1
add Limb2, Limb5
SaveAC
; neg sets ZF when the sum limb is zero; the add carry is already safe in AL
neg Limb2
jz .zero_sum ; ajs:notshortform
; Then we handle the first word whose sum !=0. The NOT of this sum needs to
; be incremented, which produces no carry (NOT(x) + 1 = NEG(x))
.not_zero:
; Limb2 = negated sum limb, Limb1/Limb5 = the matching Op1/Op2 limbs
mov [Op3], Limb2
LoadSC
sbb Limb1, Limb5
SaveSC
add Op1, 8
add Op2, 8
add Op3, 8
dec Size
mov [Op4], Limb1
add Op4, 8
; Size now counts the limbs still to do. Its low three bits select a 1-, 2-
; and 4-limb step; what remains after the three shifts is the number of
; 8-limb passes of the main loop.
shr Size, 1
jnc .n_two
; single limb
mov Limb1, [Op1]
mov Limb5, [Op2]
LoadAC
mov Limb2, Limb1
adc Limb2, Limb5
SaveAC
LoadSC
sbb Limb1, Limb5
SaveSC
not Limb2
add Op1, 8
add Op2, 8
add Op3, 8
; Op3 was already advanced, hence the -8
mov [Op3-8], Limb2
mov [Op4], Limb1
add Op4, 8
.n_two:
shr Size, 1
jnc .n_four
; two limbs
mov Limb1, [Op1]
mov Limb2, [Op1+8]
mov Limb5, [Op2]
mov Limb6, [Op2+8]
LoadAC
mov Limb3, Limb1
mov Limb4, Limb2
; not leaves the flags alone, so it may sit between the two adc
adc Limb3, Limb5
not Limb3
adc Limb4, Limb6
SaveAC
LoadSC
not Limb4
sbb Limb1, Limb5
sbb Limb2, Limb6
SaveSC
mov [Op3], Limb3
mov [Op3+8], Limb4
mov [Op4], Limb1
mov [Op4+8], Limb2
add Op1, 16
add Op2, 16
add Op3, 16
add Op4, 16
.n_four:
shr Size, 1
jnc .n_loop_pre ;ajs:notshortform
; four limbs
LoadAC
; slight change of scheme here - avoid too many
; memory to reg or reg to memory moves in a row
; Each sum limb is built in Limb0 and stored at once; the source limbs stay
; in Limb1..Limb4 / Limb5..Limb8 for the subtraction that follows.
mov Limb1, [Op1]
mov Limb5, [Op2]
mov Limb0, Limb1
adc Limb0, Limb5
not Limb0
mov [Op3], Limb0
mov Limb2, [Op1+8]
mov Limb6, [Op2+8]
mov Limb0, Limb2
adc Limb0, Limb6
not Limb0
mov [Op3+8], Limb0
mov Limb3, [Op1+16]
mov Limb7, [Op2+16]
mov Limb0, Limb3
adc Limb0, Limb7
not Limb0
mov [Op3+16], Limb0
mov Limb4, [Op1+24]
mov Limb8, [Op2+24]
mov Limb0, Limb4
adc Limb0, Limb8
not Limb0
mov [Op3+24], Limb0
SaveAC
LoadSC
sbb Limb1, Limb5
mov [Op4], Limb1
sbb Limb2, Limb6
mov [Op4+8], Limb2
sbb Limb3, Limb7
mov [Op4+16], Limb3
sbb Limb4, Limb8
mov [Op4+24], Limb4
SaveSC
add Op1, 32
add Op2, 32
add Op3, 32
add Op4, 32
; the adds above changed ZF; re-establish ZF = (Size == 0) for the jz below
test Size, Size
.n_loop_pre: ; If we jump here, ZF=1 iff Size=0
jz .n_post ;ajs:notshortform
LoadAC ; set carry for addition
; main loop - values below are best case - up to 50% fluctuation possible!
; - 3.50 cycles per limb in LD1$
; - 3.50 cycles per limb in LD2$
; - 5.10-5.50 cycles per limb in LD3$
;
; Each pass handles 8 limbs in the order
;   add limbs 0-3, sub limbs 0-3, sub limbs 4-7, add limbs 4-7
; so the carry and the borrow are each swapped through AL/AH only once per
; pass. On entry to every pass CF holds the addition carry.
align 16
.n_loop:
%ifdef USE_PREFETCH
prefetchnta [Op1+Offs]
prefetchnta [Op2+Offs]
%endif
mov Limb1, [Op1] ; add the first quad-limb
mov Limb5, [Op2]
mov Limb0, Limb1
adc Limb0, Limb5
not Limb0
mov [Op3], Limb0
mov Limb2, [Op1+8]
mov Limb6, [Op2+8]
mov Limb0, Limb2
adc Limb0, Limb6
not Limb0
mov [Op3+8], Limb0
mov Limb3, [Op1+16]
mov Limb7, [Op2+16]
mov Limb0, Limb3
adc Limb0, Limb7
not Limb0
mov [Op3+16], Limb0
mov Limb4, [Op1+24]
mov Limb8, [Op2+24]
mov Limb0, Limb4
adc Limb0, Limb8
not Limb0
mov [Op3+24], Limb0
; Op3 is advanced early (lea keeps CF intact); the second quad-limb stores
; further down compensate with -64.
lea Op3, [Op3 + 64]
SaveAC ; memorize add-carry
LoadSC ; set carry for subtraction
sbb Limb1, Limb5 ; now sub the first quad-limb
mov [Op4], Limb1
sbb Limb2, Limb6
mov [Op4+8], Limb2
sbb Limb3, Limb7
mov [Op4+16], Limb3
sbb Limb4, Limb8
mov [Op4+24], Limb4
; Here the difference goes through Limb0 so that the freshly loaded source
; limbs remain in Limb1..Limb8 for the addition afterwards.
mov Limb1, [Op1+32] ; sub the second quad-limb
mov Limb5, [Op2+32]
mov Limb0, Limb1
sbb Limb0, Limb5
mov [Op4+32], Limb0
mov Limb2, [Op1+40]
mov Limb6, [Op2+40]
mov Limb0, Limb2
sbb Limb0, Limb6
mov [Op4+40], Limb0
mov Limb3, [Op1+48]
mov Limb7, [Op2+48]
mov Limb0, Limb3
sbb Limb0, Limb7
mov [Op4+48], Limb0
mov Limb4, [Op1+56]
mov Limb8, [Op2+56]
mov Limb0, Limb4
sbb Limb0, Limb8
mov [Op4+56], Limb0
lea Op4, [Op4 + 64]
SaveSC ; memorize sub-carry
LoadAC ; set carry for addition
adc Limb1, Limb5 ; add the second quad-limb
not Limb1
mov [Op3+32-64], Limb1
adc Limb2, Limb6
not Limb2
mov [Op3+40-64], Limb2
adc Limb3, Limb7
not Limb3
mov [Op3+48-64], Limb3
adc Limb4, Limb8
not Limb4
mov [Op3+56-64], Limb4
; not, mov, lea and dec all leave CF untouched, so the addition carry from
; the last adc is still in CF at the top of the next pass.
lea Op1, [Op1 + 64]
lea Op2, [Op2 + 64]
dec Size
jnz .n_loop ;ajs:notshortform
SaveAC ; memorize add-carry
; hand back carries
.n_post:
; AL = cy1, AH = -cy3. cy2 = 1 here, as
; there were non-zero words in the sum
inc al ; AL = cy1 + cy2 = cy1 + 1, AH = -cy3
; .all_zero is also entered directly when every sum limb was zero; in that
; case cy2 = 0 and AL holds cy1 only.
.all_zero:
LoadSC ; AL = cy1 + cy2, CY = cy3
adc AL, AL ; AL = 2*(cy1 + cy2) + cy3
; Extend AL to EAX, dropping the leftover in AH. AL is small and non-negative
; here, and writing EAX clears the upper half of RAX.
movsx EAX, AL
; .Exit and .end are not referenced anywhere in this section of the file.
.Exit:
END_PROC reg_save_list
.end:
; ----------------------------------------------------------------------------
; Out-of-line path for low limbs whose sum is zero. It sits after END_PROC and
; is entered only through the jz instructions (this assumes END_PROC ends with
; a return - the macro is not defined in this file).
; On entry: Limb2 = 0 (the sum limb), Limb1/Limb5 = Op1/Op2 limbs,
;           AL = add carry, AH = pending subtraction borrow.
; ----------------------------------------------------------------------------
.zero_sum:
mov [Op3], Limb2
LoadSC
sbb Limb1, Limb5
SaveSC
mov [Op4], Limb1
dec Size
; no limbs left: the whole sum was zero, so skip the cy2 increment
jz .all_zero
add Op1, 8
add Op2, 8
add Op3, 8
add Op4, 8
; Form the next sum limb, this time including the pending addition carry.
mov Limb1, [Op1]
mov Limb5, [Op2]
mov Limb2, Limb1
LoadAC
adc Limb2, Limb5
SaveAC
neg Limb2
jz .zero_sum
; first non-zero sum limb found: Limb2 already holds its negation
jmp .not_zero