; ============================================================================
; Copyright 2016 Jens Nurmann and Alexander Kruppa
; This file is part of the MPIR Library.
; The MPIR Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published
; by the Free Software Foundation; either version 2.1 of the License, or (at
; your option) any later version.
; The MPIR Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
; License for more details.
; You should have received a copy of the GNU Lesser General Public License
; along with the MPIR Library; see the file COPYING.LIB. If not, write
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
; Boston, MA 02110-1301, USA.
; mp_limb_t mpn_nsumdiff_n(mp_ptr Op3, mp_ptr Op4, mp_srcptr Op1, mp_srcptr Op2, mp_size_t Size)
; Linux     RAX            RDI         RSI         RDX            RCX            R8
; Win7      RAX            RCX         RDX         R8             R9             Stack
;
; Description:
; The function computes -(Op2+Op1) and stores the result in Op3 while at the
; same time subtracting Op2 from Op1 with result in Op4. The final carries from
; addition and subtraction are handed back as a combined mp_limb_t. There is a
; gain in execution speed compared to separate addition and subtraction by
; reducing memory access. The factor depends on the size of the operands (the
; cache hierarchy in which the operands can be handled).
;
; Equivalent to, assuming no overlap
; (with r1 = Op3, r2 = Op4, s1 = Op1, s2 = Op2, n = Size):
; cy1 = mpn_add_n(r1, s1, s2, n);
; cy2 = mpn_neg_n(r1, r1, n); /* cy2 = [{r1,n} != 0] */
; cy3 = mpn_sub_n(r2, s1, s2, n);
; return 2*(cy1 + cy2) + cy3;
; ============================================================================

%include 'yasm_mac.inc'

; Registers handed to FRAME_PROC / END_PROC for preservation. Every register
; in this list is written by the routine below (rdi/rsi as operand pointers,
; rbx/rbp/r12-r15 as limb scratch) and is callee-saved under the Microsoft
; x64 calling convention.
%define reg_save_list rsi, rdi, rbx, rbp, r12, r13, r14, r15

; Operand registers, valid after the argument shuffle at function entry.
%define Op3     RDI                 ; destination of -(Op1 + Op2)
%define Op4     RSI                 ; destination of Op1 - Op2
%define Op1     RDX                 ; first source operand
%define Op2     RCX                 ; second source operand
%define Size    R8                  ; limb count (later: remaining block count)

; Scratch registers holding limbs in flight.
; Limb0 is the temporary result register of the 4- and 8-limb code paths;
; Limb1-Limb4 receive limbs of Op1, Limb5-Limb8 the matching limbs of Op2.
%define Limb0   RBP
%define Limb1   RBX
%define Limb2   R9
%define Limb3   R10
%define Limb4   R11
%define Limb5   R12
%define Limb6   R13
%define Limb7   R14
%define Limb8   R15

%ifdef USE_PREFETCH
%define Offs    PREFETCH_STRIDE     ; no more regs avail. => fallback to const
%endif

; The addition carry and the subtraction borrow live in AL and AH while the
; CPU carry flag is busy with the other chain.
;
; SaveAC: AL = CF (0 or 1).
; LoadAC: CF = bit 0 of AL. AL is shifted in the process, so SaveAC has to be
;         executed again before AL is read as a carry once more.
%define SaveAC  setc    AL
%define LoadAC  shr     AL, 1

; SaveSC: AH = -CF (0x00 or 0xFF).
; LoadSC: CF = top bit of AH, i.e. the saved borrow. AH is doubled in the
;         process, so SaveSC has to be executed again before the next LoadSC.
%define SaveSC  sbb     AH, AH
%define LoadSC  add     AH, AH

BITS 64

align 32

; ----------------------------------------------------------------------------
; mp_limb_t mpn_nsumdiff_n(mp_ptr Op3, mp_ptr Op4,
;                          mp_srcptr Op1, mp_srcptr Op2, mp_size_t Size)
;
; Entry:  Microsoft x64 register layout (rcx, rdx, r8, r9, 5th argument on
;         the stack). The arguments are moved to rdi, rsi, rdx, rcx, r8 right
;         after the prologue so the body can use the Op*/Size names.
; Result: {Op3,Size} = -({Op1,Size} + {Op2,Size})
;         {Op4,Size} =   {Op1,Size} - {Op2,Size}
;         rax        = 2*(cy1 + cy2) + cy3   (see file header)
; State:  AL holds the saved addition carry, AH the saved subtraction borrow
;         (SaveAC/LoadAC and SaveSC/LoadSC switch between the two chains).
; Note:   The first limb pair is read before Size is examined, so Size = 0
;         is not handled.
; Note:   FRAME_PROC, END_PROC and stack_use come from yasm_mac.inc, which is
;         not part of this file; presumably they emit prologue/epilogue for
;         reg_save_list - confirm there.
;
; Method: the low limbs of the sum that are zero stay zero under negation
;         (.zero_sum), the first non-zero sum limb is negated (.not_zero) and
;         every limb above it is complemented (NOT).
; ----------------------------------------------------------------------------
    FRAME_PROC mpn_nsumdiff_n, 0, reg_save_list
    ; Argument shuffle. The order matters: rcx and rdx are copied out before
    ; they are overwritten with Op2 and Op1.
    mov     rdi, rcx                    ; Op3
    mov     rsi, rdx                    ; Op4
    mov     rdx, r8                     ; Op1
    mov     rcx, r9                     ; Op2
    mov     r8, [rsp+stack_use+40]      ; Size, 5th argument taken from the stack

    xor     EAX, EAX                    ; clear add & sub carry

    ; First we handle any words whose sum = 0
    mov     Limb1, [Op1]
    mov     Limb5, [Op2]
    mov     Limb2, Limb1
    add     Limb2, Limb5
    SaveAC
    neg     Limb2                       ; ZF=1 iff this sum limb is zero
    jz      .zero_sum                   ; ajs:notshortform

    ; Then we handle the first word whose sum !=0. The NOT of this sum needs to
    ; be incremented, which produces no carry (NOT(x) + 1 = NEG(x))
.not_zero:
    mov     [Op3], Limb2                ; Limb2 already holds NEG(sum limb)
    LoadSC
    sbb     Limb1, Limb5
    SaveSC
    add     Op1, 8
    add     Op2, 8
    add     Op3, 8
    dec     Size
    mov     [Op4], Limb1
    add     Op4, 8

    ; From here on Size is consumed bit by bit: bit 0 selects the single-limb
    ; block, bit 1 the two-limb block, bit 2 the four-limb block and whatever
    ; remains is the iteration count of the eight-limb main loop.
    shr     Size, 1
    jnc     .n_two

    ; one limb
    mov     Limb1, [Op1]
    mov     Limb5, [Op2]
    LoadAC
    mov     Limb2, Limb1
    adc     Limb2, Limb5
    SaveAC
    LoadSC
    sbb     Limb1, Limb5
    SaveSC
    not     Limb2
    add     Op1, 8
    add     Op2, 8
    add     Op3, 8
    mov     [Op3-8], Limb2              ; Op3 has already been advanced
    mov     [Op4], Limb1
    add     Op4, 8

.n_two:

    shr     Size, 1
    jnc     .n_four
    ; two limbs
    mov     Limb1, [Op1]
    mov     Limb2, [Op1+8]
    mov     Limb5, [Op2]
    mov     Limb6, [Op2+8]
    LoadAC
    mov     Limb3, Limb1
    mov     Limb4, Limb2
    adc     Limb3, Limb5
    not     Limb3                       ; NOT leaves the flags alone, the
    adc     Limb4, Limb6                ; carry chain continues here
    SaveAC
    LoadSC
    not     Limb4
    sbb     Limb1, Limb5
    sbb     Limb2, Limb6
    SaveSC
    mov     [Op3], Limb3
    mov     [Op3+8], Limb4
    mov     [Op4], Limb1
    mov     [Op4+8], Limb2
    add     Op1, 16
    add     Op2, 16
    add     Op3, 16
    add     Op4, 16

.n_four:

    shr     Size, 1                     ; also sets ZF for .n_loop_pre
    jnc     .n_loop_pre                 ;ajs:notshortform

    LoadAC

    ; slight change of scheme here - avoid too many
    ; memory to reg or reg to memory moves in a row
    mov     Limb1, [Op1]
    mov     Limb5, [Op2]
    mov     Limb0, Limb1
    adc     Limb0, Limb5
    not     Limb0
    mov     [Op3], Limb0
    mov     Limb2, [Op1+8]
    mov     Limb6, [Op2+8]
    mov     Limb0, Limb2
    adc     Limb0, Limb6
    not     Limb0
    mov     [Op3+8], Limb0
    mov     Limb3, [Op1+16]
    mov     Limb7, [Op2+16]
    mov     Limb0, Limb3
    adc     Limb0, Limb7
    not     Limb0
    mov     [Op3+16], Limb0
    mov     Limb4, [Op1+24]
    mov     Limb8, [Op2+24]
    mov     Limb0, Limb4
    adc     Limb0, Limb8
    not     Limb0
    mov     [Op3+24], Limb0

    SaveAC
    LoadSC

    ; the source limbs are still in Limb1-Limb8, subtract them in place
    sbb     Limb1, Limb5
    mov     [Op4], Limb1
    sbb     Limb2, Limb6
    mov     [Op4+8], Limb2
    sbb     Limb3, Limb7
    mov     [Op4+16], Limb3
    sbb     Limb4, Limb8
    mov     [Op4+24], Limb4

    SaveSC

    add     Op1, 32
    add     Op2, 32
    add     Op3, 32
    add     Op4, 32

    test    Size, Size                  ; ZF=1 iff no eight-limb block is left
.n_loop_pre:                            ; If we jump here, ZF=1 iff Size=0
    jz      .n_post                     ;ajs:notshortform
    LoadAC                              ; set carry for addition

    ; main loop - values below are best case - up to 50% fluctuation possible!
    ; - 3.50 cycles per limb in LD1$
    ; - 3.50 cycles per limb in LD2$
    ; - 5.10-5.50 cycles per limb in LD3$
    ;
    ; Eight limbs per iteration, processed as add(quad 1), sub(quad 1),
    ; sub(quad 2), add(quad 2) so that the carry flag changes chain only twice
    ; per iteration. On entry to every iteration CF is the addition carry.
    align   16
.n_loop:

%ifdef USE_PREFETCH
    prefetchnta [Op1+Offs]
    prefetchnta [Op2+Offs]
%endif

    mov     Limb1, [Op1]                ; add the first quad-limb
    mov     Limb5, [Op2]
    mov     Limb0, Limb1
    adc     Limb0, Limb5
    not     Limb0
    mov     [Op3], Limb0
    mov     Limb2, [Op1+8]
    mov     Limb6, [Op2+8]
    mov     Limb0, Limb2
    adc     Limb0, Limb6
    not     Limb0
    mov     [Op3+8], Limb0
    mov     Limb3, [Op1+16]
    mov     Limb7, [Op2+16]
    mov     Limb0, Limb3
    adc     Limb0, Limb7
    not     Limb0
    mov     [Op3+16], Limb0
    mov     Limb4, [Op1+24]
    mov     Limb8, [Op2+24]
    mov     Limb0, Limb4
    adc     Limb0, Limb8
    not     Limb0
    mov     [Op3+24], Limb0
    lea     Op3, [Op3 + 64]             ; LEA keeps the flags intact

    SaveAC                              ; memorize add-carry
    LoadSC                              ; set carry for subtraction

    sbb     Limb1, Limb5                ; now sub the first quad-limb
    mov     [Op4], Limb1
    sbb     Limb2, Limb6
    mov     [Op4+8], Limb2
    sbb     Limb3, Limb7
    mov     [Op4+16], Limb3
    sbb     Limb4, Limb8
    mov     [Op4+24], Limb4
    mov     Limb1, [Op1+32]             ; sub the second quad-limb
    mov     Limb5, [Op2+32]
    mov     Limb0, Limb1
    sbb     Limb0, Limb5
    mov     [Op4+32], Limb0
    mov     Limb2, [Op1+40]
    mov     Limb6, [Op2+40]
    mov     Limb0, Limb2
    sbb     Limb0, Limb6
    mov     [Op4+40], Limb0
    mov     Limb3, [Op1+48]
    mov     Limb7, [Op2+48]
    mov     Limb0, Limb3
    sbb     Limb0, Limb7
    mov     [Op4+48], Limb0
    mov     Limb4, [Op1+56]
    mov     Limb8, [Op2+56]
    mov     Limb0, Limb4
    sbb     Limb0, Limb8
    mov     [Op4+56], Limb0
    lea     Op4, [Op4 + 64]

    SaveSC                              ; memorize sub-carry
    LoadAC                              ; set carry for addition

    adc     Limb1, Limb5                ; add the second quad-limb
    not     Limb1
    mov     [Op3+32-64], Limb1          ; -64: Op3 was advanced above
    adc     Limb2, Limb6
    not     Limb2
    mov     [Op3+40-64], Limb2
    adc     Limb3, Limb7
    not     Limb3
    mov     [Op3+48-64], Limb3
    adc     Limb4, Limb8
    not     Limb4
    mov     [Op3+56-64], Limb4

    lea     Op1, [Op1 + 64]
    lea     Op2, [Op2 + 64]

    dec     Size                        ; DEC does not touch CF, so the add
    jnz     .n_loop                     ;ajs:notshortform  carry survives

    SaveAC                              ; memorize add-carry
    ; hand back carries
.n_post:
    ; AL = cy1, AH = -cy3. cy2 = 1 here, as
    ; there were non-zero words in the sum
    inc     al                          ; AL = cy1 + cy2 = cy1 + 1, AH = -cy3
.all_zero:
    ; Reached directly from .zero_sum when every sum limb was zero; in that
    ; case cy2 = 0 and AL = cy1 is used as it stands.
    LoadSC                              ; AL = cy1 + cy2, CY = cy3
    adc     AL, AL                      ; AL = 2*(cy1 + cy2) + cy3
    movsx   EAX, AL                     ; widen the byte result into EAX

.Exit:
    END_PROC reg_save_list
.end:

    ; Out-of-line path for sum limbs that are zero: the limb is stored as is
    ; (NEG(0) = 0), the difference limb is produced and the next limb pair is
    ; examined. The loop leaves via .all_zero when the operands are exhausted
    ; or via .not_zero on the first non-zero sum limb.
    ; NOTE(review): this code sits after END_PROC. If END_PROC also closes
    ; the function's unwind range, these instructions lie outside of it -
    ; confirm against yasm_mac.inc.
.zero_sum:
    mov     [Op3], Limb2                ; Limb2 = 0 here
    LoadSC
    sbb     Limb1, Limb5
    SaveSC
    mov     [Op4], Limb1
    dec     Size
    jz      .all_zero                   ; no limbs left, whole sum was zero
    add     Op1, 8
    add     Op2, 8
    add     Op3, 8
    add     Op4, 8
    mov     Limb1, [Op1]
    mov     Limb5, [Op2]
    mov     Limb2, Limb1
    LoadAC
    adc     Limb2, Limb5
    SaveAC
    neg     Limb2                       ; ZF=1 iff this sum limb is zero, too
    jz      .zero_sum
    jmp     .not_zero
|