1. add revised core 2 assembler to the Windows build
2. add revised nehalem tuning on Windows
3. fix bug in speed on Windows x64

parent e6047c1c9e
commit 94c011b8a3

@@ -1,308 +1,173 @@
; PROLOGUE(mpn_addmul_1)
; x86-64 mpn_addmul_1 and mpn_submul_1, optimized for "Core 2".

; Copyright 2003, 2004, 2005, 2007, 2008, 2009, 2011, 2012 Free Software
; Foundation, Inc.

; Copyright 2006 Jason Worth Martin <jason.worth.martin@gmail.com>
;
; Copyright 2008, 2009 Brian Gladman
;
; This file is part of the GNU MP Library.
;
; The GNU MP Library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public License as
; published by the Free Software Foundation; either version 2.1 of the
; License, or (at your option) any later version.
;
; The GNU MP Library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with the GNU MP Library; see the file COPYING.LIB.  If
; not, write to the Free Software Foundation, Inc., 51 Franklin Street,
; Fifth Floor, Boston, MA 02110-1301, USA.
;
; CREDITS
;
; The code used here is derived from that provided by ct35z at:
;
; http://www.geocities.jp/ct35z/gmp-core2-en.html
;
; This code is based largely on Pierrick Gaudry's excellent assembly
; support for the AMD64 architecture.  (Note that Intel64 and AMD64,
; while using the same instruction set, have very different
; microarchitectures.  So, this code performs very poorly on AMD64
; machines even though it is near-optimal on Intel64.)
;
; Roger Golliver works for Intel and provided insightful improvements,
; particularly in using the "lea" instruction to perform additions
; and register-to-register moves.
;
; Jason Worth Martin's excellent assembly support for the Intel64
; architecture has been used where appropriate.
;
; Eric Bainville has a brilliant exposition of optimizing arithmetic for
; AMD64 (http://www.bealto.it).  I adapted many of the ideas he
; describes to Intel64.
;
; Agner Fog is a demigod in the x86 world.  If you are reading assembly
; code files and you haven't heard of Agner Fog, then take a minute to
; look over his software optimization manuals (http://www.agner.org/).
; They are superb.
;
; Adapted for use with VC++ and YASM using a special mode in which NASM
; preprocessing is used with AT&T assembler syntax.  I am very grateful
; for the support that Peter Johnson (one of the authors of YASM) has
; provided in getting this special YASM mode working.  Without his
; support this port would have been a great deal more difficult.
;
; The principal issue that I have had to address is the difference
; between GCC and MSVC in their register saving and parameter passing
; conventions.  Registers that have to be preserved across function
; calls are:
;
; GCC:  rbx, rbp, r12..r15
; MSVC: rsi, rdi, rbx, rbp, r12..r15, xmm6..xmm15
;
; Parameter passing conventions for non-floating-point parameters:
;
; function(    GCC    MSVC
;     p1,      rdi    rcx
;     p2,      rsi    rdx
;     p3,      rdx    r8
;     p4,      rcx    r9
;     p5,      r8     [rsp+40]
;     p6,      r9     [rsp+48]
;
; Care must be taken with 32-bit values in 64-bit registers or on the
; stack because the upper 32 bits of such parameters are undefined.
;
; Brian Gladman
;
; Calculate src[size] multiplied by mult[1] and add to / subtract from
; dst[size], returning the carry or borrow from the top of the result.
;
; BPL is bytes per limb, which is 8 in the 64-bit code here.
;
; mp_limb_t mpn_addmul_1(mp_ptr, mp_ptr, mp_size_t, mp_limb_t)
; mp_limb_t mpn_inclsh_n(mp_ptr, mp_ptr, mp_size_t, mp_uint)
;    rax                 rdi     rsi     rdx        rcx
;    rax                 rcx     rdx     r8         r9d
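;
; Illustrative note (added commentary, not from the original source): the
; routine below is equivalent to this plain C, assuming a 64-bit limb and
; a compiler with unsigned __int128; ref_addmul_1 is a hypothetical name.
;
;   mp_limb_t ref_addmul_1(mp_limb_t *dst, const mp_limb_t *src,
;                          long size, mp_limb_t mult)
;   {
;       mp_limb_t carry = 0;                /* running carry limb */
;       for (long i = 0; i < size; i++) {
;           unsigned __int128 t = (unsigned __int128)src[i] * mult
;                                 + dst[i] + carry;
;           dst[i] = (mp_limb_t)t;          /* low limb of the sum */
;           carry  = (mp_limb_t)(t >> 64);  /* high limb carries out */
;       }
;       return carry;                       /* carry out of the top limb */
;   }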

%define BPL 8
%define UNROLL_EXPONENT 4
%define UNROLL_SIZE (1 << UNROLL_EXPONENT)
%define UNROLL_MASK (UNROLL_SIZE - 1)
%define ADDR(p,i,d) (d*BPL)(p, i, BPL)
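; Added commentary: the main loop further down is unrolled
; UNROLL_SIZE = 16 ways; UNROLL_MASK = 15 reduces the limb count modulo
; the unroll factor so the code can jump into the middle of the
; unrolled sequence.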
; The GNU MP Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published
; by the Free Software Foundation; either version 3 of the License, or (at
; your option) any later version.

; Register Usage
; -------- -----
; rax   low word from mul
; rbx
; rcx   s2limb
; rdx   high word from mul
; rsi   s1p
; rdi   rp
; rbp   Base Pointer
; rsp   Stack Pointer
; r8    A_x
; r9    A_y
; r10   A_z
; r11   B_x
; r12   B_y
; r13   B_z
; r14   temp
; r15   index
; The GNU MP Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
; License for more details.

%include "yasm_mac.inc"
; You should have received a copy of the GNU Lesser General Public License
; along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

%define reg_save_list rsi, rdi, r12, r13, r14, r15
; mp_limb_t mpn_addmul_1 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t)
;    rax                  rdi     rsi        rdx        rcx
;    rax                  rcx     rdx        r8         r9

%define s2limb rcx
%define s1p    rsi
%define rp     rdi
%define a_x    r8
%define a_y    r9
%define a_z    r10
%define b_x    r11
%define b_y    r12
%define b_z    r13
%define temp   r14
%define index  r15
%include 'yasm_mac.inc'

CPU Core2
BITS 64
%define reg_save_list rbx, rbp, rsi, rdi

LEAF_PROC mpn_addmul_1
    xor a_z, a_z
    jmp entry
TEXT

LEAF_PROC mpn_addmul_1c
    mov a_z, [rsp+0x28]
entry:
    FRAME_PROC ?mpn_core2_addmul, 0, reg_save_list
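    ; Added commentary: the entry shuffle below maps the MSVC argument
    ; registers (rcx, rdx, r8, r9) onto the GCC/SysV registers (rdi, rsi,
    ; rdx, rcx) listed in the convention table above, so one loop body
    ; can serve both conventions.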
    mov rdi, rcx
    mov rsi, rdx
    xor rdx, rdx
    mov rdx, r8
    mov rcx, r9
    xalign 16
WIN64_GCC_PROC mpn_addmul_1c, 4
    lea rbx, [rdx]
    neg rbx
    mov rax, [rsi]
    mov r10, [rdi]
    lea rdi, [rdi+rdx*8-16]
    lea rsi, [rsi+rdx*8]
    mul rcx
    add rax, r8
    adc rdx, 0
    bt  ebx, 0
    jc  .1
    lea r11, [rax]
    mov rax, [rsi+rbx*8+8]
    lea rbp, [rdx]
    mul rcx
    add rbx, 2
    jns .5
    lea r8, [rax]
    mov rax, [rsi+rbx*8]
    lea r9, [rdx]
    jmp .3
.1: add rbx, 1
    jns .6
    lea r8, [rax]
    mov rax, [rsi+rbx*8]
    lea r9, [rdx]
    mul rcx
    lea r11, [rax]
    mov rax, [rsi+rbx*8+8]
    lea rbp, [rdx]
    jmp .4

    lea s1p, [s1p+rdx*8]
    lea rp, [rp+rdx*8]
    xor index, index
    sub index, rdx
    cmp rdx, 4
    jge .6
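    ; Added commentary: for counts below 4 limbs, dispatch through the
    ; offset table at .1 below; the limb count in rdx selects the dq
    ; offset that is added to the table base to reach the handler for
    ; that count.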
    lea rax, [rel .1]
    add rax, [rax+rdx*8]
    jmp rax
    xalign 16
.2: mul rcx
    add r10, r8
    lea r8, [rax]
    mov rax, [rsi+rbx*8]
    adc r11, r9
    mov [rdi+rbx*8-8], r10
    mov r10, [rdi+rbx*8]
    lea r9, [rdx]
    adc rbp, 0
.3: mul rcx
    add r10, r11
    lea r11, [rax]
    mov rax, [rsi+rbx*8+8]
    adc r8, rbp
    mov [rdi+rbx*8], r10
    mov r10, [rdi+rbx*8+8]
    lea rbp, [rdx]
    adc r9, 0
.4: add rbx, 2
    js  .2
    mul rcx
    add r10, r8
    adc r11, r9
    mov [rdi-8], r10
    adc rbp, 0
.5: mov r10, [rdi]
    add r10, r11
    adc rax, rbp
    mov [rdi], r10
    adc rdx, 0
.6: mov r10, [rdi+8]
    add r10, rax
    mov [rdi+8], r10
    mov eax, ebx
    adc rax, rdx
WIN64_GCC_END

    xalign 8
.1: dq .2 - .1
    dq .3 - .1
    dq .4 - .1
    dq .5 - .1
.2: mov rax, a_z
    EXIT_PROC reg_save_list
    xalign 16
WIN64_GCC_PROC mpn_addmul_1, 4
    lea rbx, [rdx]
    neg rbx
    mov rax, [rsi]
    mov r10, [rdi]
    lea rdi, [rdi+rdx*8-16]
    lea rsi, [rsi+rdx*8]
    mul rcx
    bt  ebx, 0
    jc  .1
    lea r11, [rax]
    mov rax, [rsi+rbx*8+8]
    lea rbp, [rdx]
    mul rcx
    add rbx, 2
    jns .5
    lea r8, [rax]
    mov rax, [rsi+rbx*8]
    lea r9, [rdx]
    jmp .3
.1: add rbx, 1
    jns .6
    lea r8, [rax]
    mov rax, [rsi+rbx*8]
    lea r9, [rdx]
    mul rcx
    lea r11, [rax]
    mov rax, [rsi+rbx*8+8]
    lea rbp, [rdx]
    jmp .4

.3: mov rax, [s1p+index*8]
    mul s2limb
    add rax, a_z
    adc rdx, 0
    mov a_z, [rp+index*8]
    add a_z, rax
    mov rax, 0
    mov [rp+index*8], a_z
    adc rax, rdx
    EXIT_PROC reg_save_list
    xalign 16
.2: mul rcx
    add r10, r8
    lea r8, [rax]
    mov rax, [rsi+rbx*8]
    adc r11, r9
    mov [rdi+rbx*8-8], r10
    mov r10, [rdi+rbx*8]
    lea r9, [rdx]
    adc rbp, 0
.3: mul rcx
    add r10, r11
    lea r11, [rax]
    mov rax, [rsi+rbx*8+8]
    adc r8, rbp
    mov [rdi+rbx*8], r10
    mov r10, [rdi+rbx*8+8]
    lea rbp, [rdx]
    adc r9, 0
.4: add rbx, 2
    js  .2
    mul rcx
    add r10, r8
    adc r11, r9
    mov [rdi-8], r10
    adc rbp, 0
.5: mov r10, [rdi]
    add r10, r11
    adc rax, rbp
    mov [rdi], r10
    adc rdx, 0
.6: mov r10, [rdi+8]
    add r10, rax
    mov [rdi+8], r10
    mov eax, ebx
    adc rax, rdx
WIN64_GCC_END

.4: mov rax, [s1p+index*8]
    mul s2limb
    add rax, a_z
    adc rdx, 0
    mov a_z, [rp+index*8]
    mov a_x, rax
    mov a_y, rdx

    mov rax, [s1p+index*8+8]
    mul s2limb
    mov b_z, [rp+index*8+8]
    add a_z, a_x
    adc rax, a_y
    mov [rp+index*8], a_z
    adc rdx, 0
    add b_z, rax
    mov rax, 0
    mov [rp+index*8+8], b_z
    adc rax, rdx
    EXIT_PROC reg_save_list

.5: mov rax, [s1p+index*8]
    mul s2limb
    add rax, a_z
    adc rdx, 0
    mov a_z, [rp+index*8]
    mov a_x, rax
    mov a_y, rdx
    mov rax, [s1p+index*8+8]
    mul s2limb
    mov b_z, [rp+index*8+8]
    mov b_x, rax
    mov b_y, rdx
    mov rax, [s1p+index*8+16]
    mul s2limb
    add a_z, a_x
    adc b_x, a_y
    mov [rp+index*8], a_z
    mov a_z, [rp+index*8+16]
    adc b_y, 0
    add b_z, b_x
    adc rax, b_y
    mov [rp+index*8+8], b_z
    adc rdx, 0
    add a_z, rax
    mov rax, 0
    mov [rp+index*8+16], a_z
    adc rax, rdx
    EXIT_PROC reg_save_list

.6: mov temp, rdx
    test rdx, 1
    jz  .7
    mov rax, [s1p+index*8]
    mul s2limb
    add rax, a_z
    adc rdx, 0
    mov a_z, [rp+index*8]
    mov a_x, rax
    mov a_y, rdx
    mov rax, [s1p+index*8+8]
    mul s2limb
    mov b_z, [rp+index*8+8]
    mov b_x, rax
    mov b_y, rdx
    jmp .8
.7: mov rax, [s1p+index*8]
    mul s2limb
    add rax, a_z
    adc rdx, 0
    mov b_z, [rp+index*8]
    mov b_x, rax
    mov b_y, rdx
    mov rax, [s1p+index*8+8]
    mul s2limb
    mov a_z, [rp+index*8+8]
    mov a_x, rax
    mov a_y, rdx
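    ; Added commentary: compute the entry point into the 16-way unrolled
    ; loop at .9.  temp = ((size - 4) & UNROLL_MASK) + 1 selects how many
    ; of the 16 seq_1 copies run on the first pass; the jump target is
    ; .10 minus temp times the byte length of one copy, which is
    ; (.10 - .9) >> UNROLL_EXPONENT, and index is advanced to match.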
.8: sub temp, 4
    and temp, UNROLL_MASK
    inc temp
    mov rax, (.10 - .9) >> UNROLL_EXPONENT
    mul temp
    lea rdx, [rel .10]
    sub rdx, rax
    mov rax, [s1p+index*8+16]
    lea index, [index+temp+3-UNROLL_SIZE]
    jmp rdx

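; Added commentary: seq_1 expands one limb-step of the unrolled loop.
; %1/%2 hold the previous product's low/high words, %3 the destination
; limb loaded from rp, %4/%5 the carry pair being accumulated, %6 the
; unroll index used in addressing, and %7 the operation applied
; (add or sub).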
%macro seq_1 7
    mul s2limb
    %7  %3, %1
    lea %1, [rax]
    mov rax, [byte s1p+index*8+8*%6]
    adc %4, %2
    mov [byte rp+index*8+8*(%6-3)], %3
    mov %3, [byte rp+index*8+8*(%6-1)]
    lea %2, [rdx]
    adc %5, 0
%endmacro

    xalign 16
.9:
%assign i 0
%rep 16
%if (i & 1)
    seq_1 b_x, b_y, b_z, a_x, a_y, i, add
%else
    seq_1 a_x, a_y, a_z, b_x, b_y, i, add
%endif
%assign i i + 1
%endrep
.10: add index, UNROLL_SIZE
    jnz .9
.11: mul s2limb
    add a_z, a_x
    mov [rp+index*8-24], a_z
    mov a_z, [rp+index*8-8]
    adc b_x, a_y
    adc b_y, 0
    add b_z, b_x
    mov [rp+index*8-16], b_z
    adc rax, b_y
    adc rdx, 0
    add a_z, rax
    mov rax, 0
    mov [rp+index*8-8], a_z
    adc rax, rdx
.12: END_PROC reg_save_list

    end
    end
@@ -1,310 +1,173 @@
; PROLOGUE(mpn_submul_1)
; x86-64 mpn_addmul_1 and mpn_submul_1, optimized for "Core 2".

; Copyright 2003, 2004, 2005, 2007, 2008, 2009, 2011, 2012 Free Software
; Foundation, Inc.

; Copyright 2006 Jason Worth Martin <jason.worth.martin@gmail.com>
;
; Copyright 2008, 2009 Brian Gladman
;
; This file is part of the GNU MP Library.
;
; The GNU MP Library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public License as
; published by the Free Software Foundation; either version 2.1 of the
; License, or (at your option) any later version.
;
; The GNU MP Library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with the GNU MP Library; see the file COPYING.LIB.  If
; not, write to the Free Software Foundation, Inc., 51 Franklin Street,
; Fifth Floor, Boston, MA 02110-1301, USA.
;
; CREDITS
;
; The code used here is derived from that provided by ct35z at:
;
; http://www.geocities.jp/ct35z/gmp-core2-en.html
;
; This code is based largely on Pierrick Gaudry's excellent assembly
; support for the AMD64 architecture.  (Note that Intel64 and AMD64,
; while using the same instruction set, have very different
; microarchitectures.  So, this code performs very poorly on AMD64
; machines even though it is near-optimal on Intel64.)
;
; Roger Golliver works for Intel and provided insightful improvements,
; particularly in using the "lea" instruction to perform additions
; and register-to-register moves.
;
; Jason Worth Martin's excellent assembly support for the Intel64
; architecture has been used where appropriate.
;
; Eric Bainville has a brilliant exposition of optimizing arithmetic for
; AMD64 (http://www.bealto.it).  I adapted many of the ideas he
; describes to Intel64.
;
; Agner Fog is a demigod in the x86 world.  If you are reading assembly
; code files and you haven't heard of Agner Fog, then take a minute to
; look over his software optimization manuals (http://www.agner.org/).
; They are superb.
;
; Adapted for use with VC++ and YASM using a special mode in which NASM
; preprocessing is used with AT&T assembler syntax.  I am very grateful
; for the support that Peter Johnson (one of the authors of YASM) has
; provided in getting this special YASM mode working.  Without his
; support this port would have been a great deal more difficult.
;
; The principal issue that I have had to address is the difference
; between GCC and MSVC in their register saving and parameter passing
; conventions.  Registers that have to be preserved across function
; calls are:
;
; GCC:  rbx, rbp, r12..r15
; MSVC: rsi, rdi, rbx, rbp, r12..r15, xmm6..xmm15
;
; Parameter passing conventions for non-floating-point parameters:
;
; function(    GCC    MSVC
;     p1,      rdi    rcx
;     p2,      rsi    rdx
;     p3,      rdx    r8
;     p4,      rcx    r9
;     p5,      r8     [rsp+40]
;     p6,      r9     [rsp+48]
;
; Care must be taken with 32-bit values in 64-bit registers or on the
; stack because the upper 32 bits of such parameters are undefined.
;
; Brian Gladman
;
; Intel64 mpn_addmul_1 -- Multiply a limb vector with a limb and
; add the result to a second limb vector.
;
; Calculate src[size] multiplied by mult[1] and add to / subtract from
; dst[size], returning the carry or borrow from the top of the result.
;
; BPL is bytes per limb, which is 8 in the 64-bit code here.

; mp_limb_t mpn_submul_1(mp_ptr, mp_ptr, mp_size_t, mp_limb_t)
; mp_limb_t mpn_declsh_n(mp_ptr, mp_ptr, mp_size_t, mp_uint)
;    rax                 rdi     rsi     rdx        rcx
;    rax                 rcx     rdx     r8         r9
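;
; Illustrative note (added commentary, not from the original source): the
; equivalent plain C, assuming a 64-bit limb and unsigned __int128, with
; ref_submul_1 a hypothetical name; the return value is the borrow out
; of the top limb.
;
;   mp_limb_t ref_submul_1(mp_limb_t *dst, const mp_limb_t *src,
;                          long size, mp_limb_t mult)
;   {
;       mp_limb_t borrow = 0;               /* running borrow limb */
;       for (long i = 0; i < size; i++) {
;           unsigned __int128 t = (unsigned __int128)src[i] * mult + borrow;
;           mp_limb_t lo = (mp_limb_t)t;
;           borrow = (mp_limb_t)(t >> 64) + (dst[i] < lo);
;           dst[i] = dst[i] - lo;           /* subtract low limb */
;       }
;       return borrow;
;   }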
;
; The GNU MP Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published
; by the Free Software Foundation; either version 3 of the License, or (at
; your option) any later version.

%define BPL 8
%define UNROLL_EXPONENT 4
%define UNROLL_SIZE (1 << UNROLL_EXPONENT)
%define UNROLL_MASK (UNROLL_SIZE - 1)
%define ADDR(p,i,d) (d*BPL)(p, i, BPL)
; The GNU MP Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
; License for more details.

; Register Usage
; -------- -----
; rax   low word from mul
; rbx
; rcx   s2limb
; rdx   high word from mul
; rsi   s1p
; rdi   rp
; rbp   Base Pointer
; rsp   Stack Pointer
; r8    A_x
; r9    A_y
; r10   A_z
; r11   B_x
; r12   B_y
; r13   B_z
; r14   temp
; r15   index
; You should have received a copy of the GNU Lesser General Public License
; along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

%include "yasm_mac.inc"
; mp_limb_t mpn_addmul_1 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t)
;    rax                  rdi     rsi        rdx        rcx
;    rax                  rcx     rdx        r8         r9

%define reg_save_list rsi, rdi, r12, r13, r14, r15
%include 'yasm_mac.inc'

%define s2limb rcx
%define s1p    rsi
%define rp     rdi
%define a_x    r8
%define a_y    r9
%define a_z    r10
%define b_x    r11
%define b_y    r12
%define b_z    r13
%define temp   r14
%define index  r15
%define reg_save_list rbx, rbp, rsi, rdi

LEAF_PROC mpn_submul_1
    xor a_z, a_z
    jmp entry
TEXT

LEAF_PROC mpn_submul_1c
    mov a_z, [rsp+0x28]
entry:
    FRAME_PROC ?mpn_core2_submul, 0, reg_save_list
    mov rdi, rcx
    mov rsi, rdx
    xor rdx, rdx
    mov rdx, r8
    mov rcx, r9
    xalign 16
WIN64_GCC_PROC mpn_submul_1c, 4
    lea rbx, [rdx]
    neg rbx
    mov rax, [rsi]
    mov r10, [rdi]
    lea rdi, [rdi+rdx*8-16]
    lea rsi, [rsi+rdx*8]
    mul rcx
    add rax, r8
    adc rdx, 0
    bt  ebx, 0
    jc  .1
    lea r11, [rax]
    mov rax, [rsi+rbx*8+8]
    lea rbp, [rdx]
    mul rcx
    add rbx, 2
    jns .5
    lea r8, [rax]
    mov rax, [rsi+rbx*8]
    lea r9, [rdx]
    jmp .3
.1: add rbx, 1
    jns .6
    lea r8, [rax]
    mov rax, [rsi+rbx*8]
    lea r9, [rdx]
    mul rcx
    lea r11, [rax]
    mov rax, [rsi+rbx*8+8]
    lea rbp, [rdx]
    jmp .4

    lea s1p, [s1p+rdx*8]
    lea rp, [rp+rdx*8]
    xor index, index
    sub index, rdx
    cmp rdx, 4
    jge .6
    lea rax, [rel .1]
    add rax, [rax+rdx*8]
    jmp rax
    xalign 16
.2: mul rcx
    sub r10, r8
    lea r8, [rax]
    mov rax, [rsi+rbx*8]
    adc r11, r9
    mov [rdi+rbx*8-8], r10
    mov r10, [rdi+rbx*8]
    lea r9, [rdx]
    adc rbp, 0
.3: mul rcx
    sub r10, r11
    lea r11, [rax]
    mov rax, [rsi+rbx*8+8]
    adc r8, rbp
    mov [rdi+rbx*8], r10
    mov r10, [rdi+rbx*8+8]
    lea rbp, [rdx]
    adc r9, 0
.4: add rbx, 2
    js  .2
    mul rcx
    sub r10, r8
    adc r11, r9
    mov [rdi-8], r10
    adc rbp, 0
.5: mov r10, [rdi]
    sub r10, r11
    adc rax, rbp
    mov [rdi], r10
    adc rdx, 0
.6: mov r10, [rdi+8]
    sub r10, rax
    mov [rdi+8], r10
    mov eax, ebx
    adc rax, rdx
WIN64_GCC_END

    xalign 8
.1: dq .2 - .1
    dq .3 - .1
    dq .4 - .1
    dq .5 - .1
.2: mov rax, a_z
    EXIT_PROC reg_save_list
    xalign 16
WIN64_GCC_PROC mpn_submul_1, 4
    lea rbx, [rdx]
    neg rbx
    mov rax, [rsi]
    mov r10, [rdi]
    lea rdi, [rdi+rdx*8-16]
    lea rsi, [rsi+rdx*8]
    mul rcx
    bt  ebx, 0
    jc  .1
    lea r11, [rax]
    mov rax, [rsi+rbx*8+8]
    lea rbp, [rdx]
    mul rcx
    add rbx, 2
    jns .5
    lea r8, [rax]
    mov rax, [rsi+rbx*8]
    lea r9, [rdx]
    jmp .3
.1: add rbx, 1
    jns .6
    lea r8, [rax]
    mov rax, [rsi+rbx*8]
    lea r9, [rdx]
    mul rcx
    lea r11, [rax]
    mov rax, [rsi+rbx*8+8]
    lea rbp, [rdx]
    jmp .4

.3: mov rax, [s1p+index*8]
    mul s2limb
    add rax, a_z
    adc rdx, 0
    mov a_z, [rp+index*8]
    sub a_z, rax
    mov rax, 0
    mov [rp+index*8], a_z
    adc rax, rdx
    EXIT_PROC reg_save_list
    xalign 16
.2: mul rcx
    sub r10, r8
    lea r8, [rax]
    mov rax, [rsi+rbx*8]
    adc r11, r9
    mov [rdi+rbx*8-8], r10
    mov r10, [rdi+rbx*8]
    lea r9, [rdx]
    adc rbp, 0
.3: mul rcx
    sub r10, r11
    lea r11, [rax]
    mov rax, [rsi+rbx*8+8]
    adc r8, rbp
    mov [rdi+rbx*8], r10
    mov r10, [rdi+rbx*8+8]
    lea rbp, [rdx]
    adc r9, 0
.4: add rbx, 2
    js  .2
    mul rcx
    sub r10, r8
    adc r11, r9
    mov [rdi-8], r10
    adc rbp, 0
.5: mov r10, [rdi]
    sub r10, r11
    adc rax, rbp
    mov [rdi], r10
    adc rdx, 0
.6: mov r10, [rdi+8]
    sub r10, rax
    mov [rdi+8], r10
    mov eax, ebx
    adc rax, rdx
WIN64_GCC_END

.4: mov rax, [s1p+index*8]
    mul s2limb
    add rax, a_z
    adc rdx, 0
    mov a_z, [rp+index*8]
    mov a_x, rax
    mov a_y, rdx

    mov rax, [s1p+index*8+8]
    mul s2limb
    mov b_z, [rp+index*8+8]
    sub a_z, a_x
    adc rax, a_y
    mov [rp+index*8], a_z
    adc rdx, 0
    sub b_z, rax
    mov rax, 0
    mov [rp+index*8+8], b_z
    adc rax, rdx
    EXIT_PROC reg_save_list

.5: mov rax, [s1p+index*8]
    mul s2limb
    add rax, a_z
    adc rdx, 0
    mov a_z, [rp+index*8]
    mov a_x, rax
    mov a_y, rdx
    mov rax, [s1p+index*8+8]
    mul s2limb
    mov b_z, [rp+index*8+8]
    mov b_x, rax
    mov b_y, rdx
    mov rax, [s1p+index*8+16]
    mul s2limb
    sub a_z, a_x
    adc b_x, a_y
    mov [rp+index*8], a_z
    mov a_z, [rp+index*8+16]
    adc b_y, 0
    sub b_z, b_x
    adc rax, b_y
    mov [rp+index*8+8], b_z
    adc rdx, 0
    sub a_z, rax
    mov rax, 0
    mov [rp+index*8+16], a_z
    adc rax, rdx
    EXIT_PROC reg_save_list

.6: mov temp, rdx
    test rdx, 1
    jz  .7
    mov rax, [s1p+index*8]
    mul s2limb
    add rax, a_z
    adc rdx, 0
    mov a_z, [rp+index*8]
    mov a_x, rax
    mov a_y, rdx
    mov rax, [s1p+index*8+8]
    mul s2limb
    mov b_z, [rp+index*8+8]
    mov b_x, rax
    mov b_y, rdx
    jmp .8

.7: mov rax, [s1p+index*8]
    mul s2limb
    add rax, a_z
    adc rdx, 0
    mov b_z, [rp+index*8]
    mov b_x, rax
    mov b_y, rdx
    mov rax, [s1p+index*8+8]
    mul s2limb
    mov a_z, [rp+index*8+8]
    mov a_x, rax
    mov a_y, rdx
.8: sub temp, 4
    and temp, UNROLL_MASK
    inc temp
    mov rax, (.10 - .9) >> UNROLL_EXPONENT
    mul temp
    lea rdx, [rel .10]
    sub rdx, rax
    mov rax, [s1p+index*8+16]
    lea index, [index+temp+3-UNROLL_SIZE]
    jmp rdx

%macro seq_1 7
    mul s2limb
    %7  %3, %1
    lea %1, [rax]
    mov rax, [byte s1p+index*8+8*%6]
    adc %4, %2
    mov [byte rp+index*8+8*(%6-3)], %3
    mov %3, [byte rp+index*8+8*(%6-1)]
    lea %2, [rdx]
    adc %5, 0
%endmacro

    xalign 16
.9:
%assign i 0
%rep 16
%if (i & 1)
    seq_1 b_x, b_y, b_z, a_x, a_y, i, sub
%else
    seq_1 a_x, a_y, a_z, b_x, b_y, i, sub
%endif
%assign i i + 1
%endrep
.10: add index, UNROLL_SIZE
    jnz .9
.11: mul s2limb
    sub a_z, a_x
    mov [rp+index*8-24], a_z
    mov a_z, [rp+index*8-8]
    adc b_x, a_y
    adc b_y, 0
    sub b_z, b_x
    mov [rp+index*8-16], b_z
    adc rax, b_y
    adc rdx, 0
    sub a_z, rax
    mov rax, 0
    mov [rp+index*8-8], a_z
    adc rax, rdx
.12: END_PROC reg_save_list

    end
    end
@@ -6,7 +6,7 @@

#define MUL_TOOM8H_THRESHOLD 270

#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
#define SQR_KARATSUBA_THRESHOLD 27
#define SQR_KARATSUBA_THRESHOLD 26
#define SQR_TOOM3_THRESHOLD 90
#define SQR_TOOM4_THRESHOLD 248
#define SQR_TOOM8_THRESHOLD 351

@@ -40,9 +40,9 @@

#define SET_STR_DC_THRESHOLD 6082
#define SET_STR_PRECOMPUTE_THRESHOLD 7122

#define MUL_FFT_FULL_THRESHOLD 2880
#define MUL_FFT_FULL_THRESHOLD 3528

#define SQR_FFT_FULL_THRESHOLD 2880
#define SQR_FFT_FULL_THRESHOLD 2368

#define MULLOW_BASECASE_THRESHOLD 7
#define MULLOW_DC_THRESHOLD 11

tune/speed.c

@@ -1117,6 +1117,14 @@ check_align_option (const char *name, mp_size_t align)
}
}

#ifdef _WIN64
# define s2_format "%lld-%lld"
# define s3_format "%lld(%lld)%lld"
#else
# define s2_format "%ld-%ld"
# define s3_format "%ld(%ld)%ld"
#endif
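/* Added note (not from the source): Win64 is LLP64, so long is 32 bits
   while mp_size_t is 64 bits; a "%ld" sscanf conversion there writes only
   half of the 64-bit target, leaving the upper half undefined, which is
   the speed bug this hunk fixes.  A minimal illustration, with
   hypothetical variable names:

       long long start, end;                   // mp_size_t-sized on x64
       sscanf ("1000-8000", s2_format, &start, &end);
       // with "%ld-%ld" on Win64 the upper 32 bits would stay undefined

   The s2_format/s3_format macros select the matching conversions on
   each platform. */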

int
main (int argc, char *argv[])
{
@@ -1228,13 +1236,13 @@ main (int argc, char *argv[])
      size_allocnum += 10;
    }
  size_array[size_num].inc = 0;
  if (sscanf (s, "%ld(%ld)%ld",
  if (sscanf (s, s3_format,
              &size_array[size_num].start,
              &size_array[size_num].inc,
              &size_array[size_num].end) != 3)
    {

  if (sscanf (s, "%ld-%ld",
  if (sscanf (s, s2_format,
              &size_array[size_num].start,
              &size_array[size_num].end) != 2)
    {