From 94c011b8a3b096f91dd2eb9fbfaae53d6de045ee Mon Sep 17 00:00:00 2001 From: BrianGladman Date: Mon, 24 Feb 2014 21:07:18 +0000 Subject: [PATCH] 1. add revised core 2 assembler to the Windows build 2. add revised nehalem tuning on Windows 3. fix bug in speed on Windows x64 --- mpn/x86_64w/core2/addmul_1.asm | 453 +++++++++++------------------- mpn/x86_64w/core2/submul_1.asm | 455 +++++++++++-------------------- mpn/x86_64w/nehalem/gmp-mparam.h | 6 +- tune/speed.c | 12 +- 4 files changed, 331 insertions(+), 595 deletions(-) diff --git a/mpn/x86_64w/core2/addmul_1.asm b/mpn/x86_64w/core2/addmul_1.asm index c9cdc37d..142b89b7 100644 --- a/mpn/x86_64w/core2/addmul_1.asm +++ b/mpn/x86_64w/core2/addmul_1.asm @@ -1,308 +1,173 @@ -; PROLOGUE(mpn_addmul_1) +; x86-64 mpn_addmul_1 and mpn_submul_1, optimized for "Core 2". + +; Copyright 2003, 2004, 2005, 2007, 2008, 2009, 2011, 2012 Free Software +; Foundation, Inc. -; Copyright 2006 Jason Worth Martin -; -; Copyright 2008, 2009 Brian Gladman -; ; This file is part of the GNU MP Library. -; -; The GNU MP Library is free software; you can redistribute it and/or -; modify it under the terms of the GNU Lesser General Public License as -; published by the Free Software Foundation; either version 2.1 of the -; License, or (at your option) any later version. -; -; The GNU MP Library is distributed in the hope that it will be useful, -; but WITHOUT ANY WARRANTY; without even the implied warranty of -; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -; Lesser General Public License for more details. -; -; You should have received a copy of the GNU Lesser General Public -; License along with the GNU MP Library; see the file COPYING.LIB. If -; not, write to the Free Software Foundation, Inc., 51 Franklin Street, -; Fifth Floor, Boston, MA 02110-1301, USA. -; -; CREDITS -; -; The code used here is derived from that provided by ct35z at: -; -; http://www.geocities.jp/ct35z/gmp-core2-en.html -; -; This code is based largely on Pierrick Gaudry's excellent assembly -; support for the AMD64 architecture. (Note that Intel64 and AMD64, -; while using the same instruction set, have very different -; microarchitectures. So, this code performs very poorly on AMD64 -; machines even though it is near-optimal on Intel64.) -; -; Roger Golliver works for Intel and provided insightful improvements -; particularly in using the "lea" instruction to perform additions -; and register-to-register moves. -; -; Jason Worth Martin's excellent assembly support for the Intel64 -; architecture has been used where appropriate. -; -; Eric Bainville has a brilliant exposition of optimizing arithmetic for -; AMD64 (http://www.bealto.it). I adapted many of the ideas he -; describes to Intel64. -; -; Agner Fog is a demigod in the x86 world. If you are reading assembly -; code files and you haven't heard of Agner Fog, then take a minute to -; look over his software optimization manuals (http://www.agner.org/). -; They are superb. -; -; Adapted for use with VC++ and YASM using a special mode in which NASM -; preprocessing is used with AT&T assembler syntax. I am very grateful -; for the support that Peter Johnson (one of the authors of YASM) has -; provided in getting this special YASM mode working. Without his -; support this port would have been a great deal more difficult. -; -; The principle issues that I have had to address is the difference -; between GCC and MSVC in their register saving and parameter passing -; conventions. 
Registers that have to be preserved across function -; calls are: -; -; GCC: rbx, rbp, r12..r15 -; MSVC: rsi, rdi, rbx, rbp, r12..r15 xmm6..xmm15 -; -; Parameter passing conventions for non floating point parameters: -; -; function( GCC MSVC -; p1, rdi rcx -; p2, rsi rdx -; p3, rdx r8 -; p4, rcx r9 -; p5, r8 [rsp+40] -; p6, r9 [rsp+48] -; -; Care must be taken with 32-bit values in 64-bit register or on the -; stack because the upper 32-bits of such parameters are undefined. -; -; Brian Gladman -; -; Calculate src[size] multiplied by mult[1] and add to /subtract from dst[size] and -; return the carry or borrow from the top of the result -; -; BPL is bytes per limb, which is 8 in the 64-bit code here -; -; mp_limb_t mpn_addmul_1(mp_ptr, mp_ptr, mp_size_t, mp_limb_t) -; mp_limb_t mpn_inclsh_n(mp_ptr, mp_ptr, mp_size_t, mp_uint) -; rax rdi rsi rdx rcx -; rax rcx rdx r8 r9d -%define BPL 8 -%define UNROLL_EXPONENT 4 -%define UNROLL_SIZE (1 << UNROLL_EXPONENT) -%define UNROLL_MASK (UNROLL_SIZE - 1) -%define ADDR(p,i,d) (d*BPL)(p, i, BPL) +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published +; by the Free Software Foundation; either version 3 of the License, or (at +; your option) any later version. -; Register Usage -; -------- ----- -; rax low word from mul -; rbx -; rcx s2limb -; rdx high word from mul -; rsi s1p -; rdi rp -; rbp Base Pointer -; rsp Stack Pointer -; r8 A_x -; r9 A_y -; r10 A_z -; r11 B_x -; r12 B_y -; r13 B_z -; r14 temp -; r15 index +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. -%include "yasm_mac.inc" +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
-%define reg_save_list rsi, rdi, r12, r13, r14, r15 +; mp_limb_t mpn_addmul_1 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t) +; rax rdi rsi rdx rcx +; rax rcx rdx r8 r9 -%define s2limb rcx -%define s1p rsi -%define rp rdi -%define a_x r8 -%define a_y r9 -%define a_z r10 -%define b_x r11 -%define b_y r12 -%define b_z r13 -%define temp r14 -%define index r15 +%include 'yasm_mac.inc' - CPU Core2 - BITS 64 +%define reg_save_list rbx, rbp, rsi, rdi - LEAF_PROC mpn_addmul_1 - xor a_z, a_z - jmp entry + TEXT - LEAF_PROC mpn_addmul_1c - mov a_z, [rsp+0x28] -entry: - FRAME_PROC ?mpn_core2_addmul, 0, reg_save_list - mov rdi, rcx - mov rsi, rdx - xor rdx, rdx - mov rdx, r8 - mov rcx, r9 + xalign 16 + WIN64_GCC_PROC mpn_addmul_1c, 4 + lea rbx, [rdx] + neg rbx + mov rax, [rsi] + mov r10, [rdi] + lea rdi, [rdi+rdx*8-16] + lea rsi, [rsi+rdx*8] + mul rcx + add rax, r8 + adc rdx, 0 + bt ebx, 0 + jc .1 + lea r11, [rax] + mov rax, [rsi+rbx*8+8] + lea rbp, [rdx] + mul rcx + add rbx, 2 + jns .5 + lea r8, [rax] + mov rax, [rsi+rbx*8] + lea r9, [rdx] + jmp .3 +.1: add rbx, 1 + jns .6 + lea r8, [rax] + mov rax, [rsi+rbx*8] + lea r9, [rdx] + mul rcx + lea r11, [rax] + mov rax, [rsi+rbx*8+8] + lea rbp, [rdx] + jmp .4 - lea s1p, [s1p+rdx*8] - lea rp, [rp+rdx*8] - xor index, index - sub index, rdx - cmp rdx, 4 - jge .6 - lea rax, [rel .1] - add rax, [rax+rdx*8] - jmp rax + xalign 16 +.2: mul rcx + add r10, r8 + lea r8, [rax] + mov rax, [rsi+rbx*8] + adc r11, r9 + mov [rdi+rbx*8-8], r10 + mov r10, [rdi+rbx*8] + lea r9, [rdx] + adc rbp, 0 +.3: mul rcx + add r10, r11 + lea r11, [rax] + mov rax, [rsi+rbx*8+8] + adc r8, rbp + mov [rdi+rbx*8], r10 + mov r10, [rdi+rbx*8+8] + lea rbp, [rdx] + adc r9, 0 +.4: add rbx, 2 + js .2 + mul rcx + add r10, r8 + adc r11, r9 + mov [rdi-8], r10 + adc rbp, 0 +.5: mov r10, [rdi] + add r10, r11 + adc rax, rbp + mov [rdi], r10 + adc rdx, 0 +.6: mov r10, [rdi+8] + add r10, rax + mov [rdi+8], r10 + mov eax, ebx + adc rax, rdx + WIN64_GCC_END - xalign 8 -.1: dq .2 - .1 - dq .3 - .1 - dq .4 - .1 - dq .5 - .1 -.2: mov rax, a_z - EXIT_PROC reg_save_list + xalign 16 + WIN64_GCC_PROC mpn_addmul_1, 4 + lea rbx, [rdx] + neg rbx + mov rax, [rsi] + mov r10, [rdi] + lea rdi, [rdi+rdx*8-16] + lea rsi, [rsi+rdx*8] + mul rcx + bt ebx, 0 + jc .1 + lea r11, [rax] + mov rax, [rsi+rbx*8+8] + lea rbp, [rdx] + mul rcx + add rbx, 2 + jns .5 + lea r8, [rax] + mov rax, [rsi+rbx*8] + lea r9, [rdx] + jmp .3 +.1: add rbx, 1 + jns .6 + lea r8, [rax] + mov rax, [rsi+rbx*8] + lea r9, [rdx] + mul rcx + lea r11, [rax] + mov rax, [rsi+rbx*8+8] + lea rbp, [rdx] + jmp .4 -.3: mov rax, [s1p+index*8] - mul s2limb - add rax, a_z - adc rdx, 0 - mov a_z, [rp+index*8] - add a_z, rax - mov rax, 0 - mov [rp+index*8], a_z - adc rax, rdx - EXIT_PROC reg_save_list + xalign 16 +.2: mul rcx + add r10, r8 + lea r8, [rax] + mov rax, [rsi+rbx*8] + adc r11, r9 + mov [rdi+rbx*8-8], r10 + mov r10, [rdi+rbx*8] + lea r9, [rdx] + adc rbp, 0 +.3: mul rcx + add r10, r11 + lea r11, [rax] + mov rax, [rsi+rbx*8+8] + adc r8, rbp + mov [rdi+rbx*8], r10 + mov r10, [rdi+rbx*8+8] + lea rbp, [rdx] + adc r9, 0 +.4: add rbx, 2 + js .2 + mul rcx + add r10, r8 + adc r11, r9 + mov [rdi-8], r10 + adc rbp, 0 +.5: mov r10, [rdi] + add r10, r11 + adc rax, rbp + mov [rdi], r10 + adc rdx, 0 +.6: mov r10, [rdi+8] + add r10, rax + mov [rdi+8], r10 + mov eax, ebx + adc rax, rdx + WIN64_GCC_END -.4: mov rax, [s1p+index*8] - mul s2limb - add rax, a_z - adc rdx, 0 - mov a_z, [rp+index*8] - mov a_x, rax - mov a_y, rdx - - mov rax, [s1p+index*8+8] - mul s2limb - mov b_z, 
[rp+index*8+8] - add a_z, a_x - adc rax, a_y - mov [rp+index*8], a_z - adc rdx, 0 - add b_z, rax - mov rax, 0 - mov [rp+index*8+8], b_z - adc rax, rdx - EXIT_PROC reg_save_list - -.5: mov rax, [s1p+index*8] - mul s2limb - add rax, a_z - adc rdx, 0 - mov a_z, [rp+index*8] - mov a_x, rax - mov a_y, rdx - mov rax, [s1p+index*8+8] - mul s2limb - mov b_z, [rp+index*8+8] - mov b_x, rax - mov b_y, rdx - mov rax, [s1p+index*8+16] - mul s2limb - add a_z, a_x - adc b_x, a_y - mov [rp+index*8], a_z - mov a_z, [rp+index*8+16] - adc b_y, 0 - add b_z, b_x - adc rax, b_y - mov [rp+index*8+8], b_z - adc rdx, 0 - add a_z, rax - mov rax, 0 - mov [rp+index*8+16], a_z - adc rax, rdx - EXIT_PROC reg_save_list - -.6: mov temp, rdx - test rdx, 1 - jz .7 - mov rax, [s1p+index*8] - mul s2limb - add rax, a_z - adc rdx, 0 - mov a_z, [rp+index*8] - mov a_x, rax - mov a_y, rdx - mov rax, [s1p+index*8+8] - mul s2limb - mov b_z, [rp+index*8+8] - mov b_x, rax - mov b_y, rdx - jmp .8 -.7: mov rax, [s1p+index*8] - mul s2limb - add rax, a_z - adc rdx, 0 - mov b_z, [rp+index*8] - mov b_x, rax - mov b_y, rdx - mov rax, [s1p+index*8+8] - mul s2limb - mov a_z, [rp+index*8+8] - mov a_x, rax - mov a_y, rdx -.8: sub temp, 4 - and temp, UNROLL_MASK - inc temp - mov rax, (.10 - .9) >> UNROLL_EXPONENT - mul temp - lea rdx, [rel .10] - sub rdx, rax - mov rax, [s1p+index*8+16] - lea index, [index+temp+3-UNROLL_SIZE] - jmp rdx - -%macro seq_1 7 - mul s2limb - %7 %3, %1 - lea %1, [rax] - mov rax, [byte s1p+index*8+8*%6] - adc %4, %2 - mov [byte rp+index*8+8*(%6-3)], %3 - mov %3, [byte rp+index*8+8*(%6-1)] - lea %2, [rdx] - adc %5, 0 -%endmacro - - xalign 16 -.9: -%assign i 0 -%rep 16 - %if (i & 1) - seq_1 b_x, b_y, b_z, a_x, a_y, i, add - %else - seq_1 a_x, a_y, a_z, b_x, b_y, i, add - %endif -%assign i i + 1 -%endrep -.10:add index, UNROLL_SIZE - jnz .9 -.11:mul s2limb - add a_z, a_x - mov [rp+index*8-24], a_z - mov a_z, [rp+index*8-8] - adc b_x, a_y - adc b_y, 0 - add b_z, b_x - mov [rp+index*8-16], b_z - adc rax, b_y - adc rdx, 0 - add a_z, rax - mov rax, 0 - mov [rp+index*8-8], a_z - adc rax, rdx -.12:END_PROC reg_save_list - - end + end diff --git a/mpn/x86_64w/core2/submul_1.asm b/mpn/x86_64w/core2/submul_1.asm index d2d24fc3..3ce6966e 100644 --- a/mpn/x86_64w/core2/submul_1.asm +++ b/mpn/x86_64w/core2/submul_1.asm @@ -1,310 +1,173 @@ -; PROLOGUE(mpn_submul_1) +; x86-64 mpn_addmul_1 and mpn_submul_1, optimized for "Core 2". + +; Copyright 2003, 2004, 2005, 2007, 2008, 2009, 2011, 2012 Free Software +; Foundation, Inc. -; Copyright 2006 Jason Worth Martin -; -; Copyright 2008, 2009 Brian Gladman -; ; This file is part of the GNU MP Library. -; -; The GNU MP Library is free software; you can redistribute it and/or -; modify it under the terms of the GNU Lesser General Public License as -; published by the Free Software Foundation; either version 2.1 of the -; License, or (at your option) any later version. -; -; The GNU MP Library is distributed in the hope that it will be useful, -; but WITHOUT ANY WARRANTY; without even the implied warranty of -; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -; Lesser General Public License for more details. -; -; You should have received a copy of the GNU Lesser General Public -; License along with the GNU MP Library; see the file COPYING.LIB. If -; not, write to the Free Software Foundation, Inc., 51 Franklin Street, -; Fifth Floor, Boston, MA 02110-1301, USA. 
-; -; CREDITS -; -; The code used here is derived from that provided by ct35z at: -; -; http://www.geocities.jp/ct35z/gmp-core2-en.html -; -; This code is based largely on Pierrick Gaudry's excellent assembly -; support for the AMD64 architecture. (Note that Intel64 and AMD64, -; while using the same instruction set, have very different -; microarchitectures. So, this code performs very poorly on AMD64 -; machines even though it is near-optimal on Intel64.) -; -; Roger Golliver works for Intel and provided insightful improvements -; particularly in using the "lea" instruction to perform additions -; and register-to-register moves. -; -; Jason Worth Martin's excellent assembly support for the Intel64 -; architecture has been used where appropriate. -; -; Eric Bainville has a brilliant exposition of optimizing arithmetic for -; AMD64 (http://www.bealto.it). I adapted many of the ideas he -; describes to Intel64. -; -; Agner Fog is a demigod in the x86 world. If you are reading assembly -; code files and you haven't heard of Agner Fog, then take a minute to -; look over his software optimization manuals (http://www.agner.org/). -; They are superb. -; -; Adapted for use with VC++ and YASM using a special mode in which NASM -; preprocessing is used with AT&T assembler syntax. I am very grateful -; for the support that Peter Johnson (one of the authors of YASM) has -; provided in getting this special YASM mode working. Without his -; support this port would have been a great deal more difficult. -; -; The principle issues that I have had to address is the difference -; between GCC and MSVC in their register saving and parameter passing -; conventions. Registers that have to be preserved across function -; calls are: -; -; GCC: rbx, rbp, r12..r15 -; MSVC: rsi, rdi, rbx, rbp, r12..r15 xmm6..xmm15 -; -; Parameter passing conventions for non floating point parameters: -; -; function( GCC MSVC -; p1, rdi rcx -; p2, rsi rdx -; p3, rdx r8 -; p4, rcx r9 -; p5, r8 [rsp+40] -; p6, r9 [rsp+48] -; -; Care must be taken with 32-bit values in 64-bit register or on the -; stack because the upper 32-bits of such parameters are undefined. -; -; Brian Gladman -; -; Intel64 mpn_addmul_1 -- Multiply a limb vector with a limb and -; add the result to a second limb vector. -; -; Calculate src[size] multiplied by mult[1] and add to /subtract from dst[size] and -; return the carry or borrow from the top of the result -; -; BPL is bytes per limb, which is 8 in the 64-bit code here -; mp_limb_t mpn_submul_1(mp_ptr, mp_ptr, mp_size_t, mp_limb_t) -; mp_limb_t mpn_declsh_n(mp_ptr, mp_ptr, mp_size_t, mp_uint) -; rax rdi rsi rdx rcx -; rax rcx rdx r8 r9 -; +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published +; by the Free Software Foundation; either version 3 of the License, or (at +; your option) any later version. -%define BPL 8 -%define UNROLL_EXPONENT 4 -%define UNROLL_SIZE (1 << UNROLL_EXPONENT) -%define UNROLL_MASK (UNROLL_SIZE - 1) -%define ADDR(p,i,d) (d*BPL)(p, i, BPL) +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. 
-; Register Usage -; -------- ----- -; rax low word from mul -; rbx -; rcx s2limb -; rdx high word from mul -; rsi s1p -; rdi rp -; rbp Base Pointer -; rsp Stack Pointer -; r8 A_x -; r9 A_y -; r10 A_z -; r11 B_x -; r12 B_y -; r13 B_z -; r14 temp -; r15 index +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. -%include "yasm_mac.inc" +; mp_limb_t mpn_addmul_1 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t) +; rax rdi rsi rdx rcx +; rax rcx rdx r8 r9 -%define reg_save_list rsi, rdi, r12, r13, r14, r15 +%include 'yasm_mac.inc' -%define s2limb rcx -%define s1p rsi -%define rp rdi -%define a_x r8 -%define a_y r9 -%define a_z r10 -%define b_x r11 -%define b_y r12 -%define b_z r13 -%define temp r14 -%define index r15 +%define reg_save_list rbx, rbp, rsi, rdi - LEAF_PROC mpn_submul_1 - xor a_z, a_z - jmp entry + TEXT - LEAF_PROC mpn_submul_1c - mov a_z, [rsp+0x28] -entry: - FRAME_PROC ?mpn_core2_submul, 0, reg_save_list - mov rdi, rcx - mov rsi, rdx - xor rdx, rdx - mov rdx, r8 - mov rcx, r9 + xalign 16 + WIN64_GCC_PROC mpn_submul_1c, 4 + lea rbx, [rdx] + neg rbx + mov rax, [rsi] + mov r10, [rdi] + lea rdi, [rdi+rdx*8-16] + lea rsi, [rsi+rdx*8] + mul rcx + add rax, r8 + adc rdx, 0 + bt ebx, 0 + jc .1 + lea r11, [rax] + mov rax, [rsi+rbx*8+8] + lea rbp, [rdx] + mul rcx + add rbx, 2 + jns .5 + lea r8, [rax] + mov rax, [rsi+rbx*8] + lea r9, [rdx] + jmp .3 +.1: add rbx, 1 + jns .6 + lea r8, [rax] + mov rax, [rsi+rbx*8] + lea r9, [rdx] + mul rcx + lea r11, [rax] + mov rax, [rsi+rbx*8+8] + lea rbp, [rdx] + jmp .4 - lea s1p, [s1p+rdx*8] - lea rp, [rp+rdx*8] - xor index, index - sub index, rdx - cmp rdx, 4 - jge .6 - lea rax, [rel .1] - add rax, [rax+rdx*8] - jmp rax + xalign 16 +.2: mul rcx + sub r10, r8 + lea r8, [rax] + mov rax, [rsi+rbx*8] + adc r11, r9 + mov [rdi+rbx*8-8], r10 + mov r10, [rdi+rbx*8] + lea r9, [rdx] + adc rbp, 0 +.3: mul rcx + sub r10, r11 + lea r11, [rax] + mov rax, [rsi+rbx*8+8] + adc r8, rbp + mov [rdi+rbx*8], r10 + mov r10, [rdi+rbx*8+8] + lea rbp, [rdx] + adc r9, 0 +.4: add rbx, 2 + js .2 + mul rcx + sub r10, r8 + adc r11, r9 + mov [rdi-8], r10 + adc rbp, 0 +.5: mov r10, [rdi] + sub r10, r11 + adc rax, rbp + mov [rdi], r10 + adc rdx, 0 +.6: mov r10, [rdi+8] + sub r10, rax + mov [rdi+8], r10 + mov eax, ebx + adc rax, rdx + WIN64_GCC_END - xalign 8 -.1: dq .2 - .1 - dq .3 - .1 - dq .4 - .1 - dq .5 - .1 -.2: mov rax, a_z - EXIT_PROC reg_save_list + xalign 16 + WIN64_GCC_PROC mpn_submul_1, 4 + lea rbx, [rdx] + neg rbx + mov rax, [rsi] + mov r10, [rdi] + lea rdi, [rdi+rdx*8-16] + lea rsi, [rsi+rdx*8] + mul rcx + bt ebx, 0 + jc .1 + lea r11, [rax] + mov rax, [rsi+rbx*8+8] + lea rbp, [rdx] + mul rcx + add rbx, 2 + jns .5 + lea r8, [rax] + mov rax, [rsi+rbx*8] + lea r9, [rdx] + jmp .3 +.1: add rbx, 1 + jns .6 + lea r8, [rax] + mov rax, [rsi+rbx*8] + lea r9, [rdx] + mul rcx + lea r11, [rax] + mov rax, [rsi+rbx*8+8] + lea rbp, [rdx] + jmp .4 -.3: mov rax, [s1p+index*8] - mul s2limb - add rax, a_z - adc rdx, 0 - mov a_z, [rp+index*8] - sub a_z, rax - mov rax, 0 - mov [rp+index*8], a_z - adc rax, rdx - EXIT_PROC reg_save_list + xalign 16 +.2: mul rcx + sub r10, r8 + lea r8, [rax] + mov rax, [rsi+rbx*8] + adc r11, r9 + mov [rdi+rbx*8-8], r10 + mov r10, [rdi+rbx*8] + lea r9, [rdx] + adc rbp, 0 +.3: mul rcx + sub r10, r11 + lea r11, [rax] + mov rax, [rsi+rbx*8+8] + adc r8, rbp + mov [rdi+rbx*8], r10 + mov r10, [rdi+rbx*8+8] + lea rbp, [rdx] + adc r9, 0 +.4: add rbx, 2 + js .2 + mul rcx + sub r10, r8 + 
adc r11, r9 + mov [rdi-8], r10 + adc rbp, 0 +.5: mov r10, [rdi] + sub r10, r11 + adc rax, rbp + mov [rdi], r10 + adc rdx, 0 +.6: mov r10, [rdi+8] + sub r10, rax + mov [rdi+8], r10 + mov eax, ebx + adc rax, rdx + WIN64_GCC_END -.4: mov rax, [s1p+index*8] - mul s2limb - add rax, a_z - adc rdx, 0 - mov a_z, [rp+index*8] - mov a_x, rax - mov a_y, rdx - - mov rax, [s1p+index*8+8] - mul s2limb - mov b_z, [rp+index*8+8] - sub a_z, a_x - adc rax, a_y - mov [rp+index*8], a_z - adc rdx, 0 - sub b_z, rax - mov rax, 0 - mov [rp+index*8+8], b_z - adc rax, rdx - EXIT_PROC reg_save_list - -.5: mov rax, [s1p+index*8] - mul s2limb - add rax, a_z - adc rdx, 0 - mov a_z, [rp+index*8] - mov a_x, rax - mov a_y, rdx - mov rax, [s1p+index*8+8] - mul s2limb - mov b_z, [rp+index*8+8] - mov b_x, rax - mov b_y, rdx - mov rax, [s1p+index*8+16] - mul s2limb - sub a_z, a_x - adc b_x, a_y - mov [rp+index*8], a_z - mov a_z, [rp+index*8+16] - adc b_y, 0 - sub b_z, b_x - adc rax, b_y - mov [rp+index*8+8], b_z - adc rdx, 0 - sub a_z, rax - mov rax, 0 - mov [rp+index*8+16], a_z - adc rax, rdx - EXIT_PROC reg_save_list - -.6: mov temp, rdx - test rdx, 1 - jz .7 - mov rax, [s1p+index*8] - mul s2limb - add rax, a_z - adc rdx, 0 - mov a_z, [rp+index*8] - mov a_x, rax - mov a_y, rdx - mov rax, [s1p+index*8+8] - mul s2limb - mov b_z, [rp+index*8+8] - mov b_x, rax - mov b_y, rdx - jmp .8 - -.7: mov rax, [s1p+index*8] - mul s2limb - add rax, a_z - adc rdx, 0 - mov b_z, [rp+index*8] - mov b_x, rax - mov b_y, rdx - mov rax, [s1p+index*8+8] - mul s2limb - mov a_z, [rp+index*8+8] - mov a_x, rax - mov a_y, rdx -.8: sub temp, 4 - and temp, UNROLL_MASK - inc temp - mov rax, (.10 - .9) >> UNROLL_EXPONENT - mul temp - lea rdx, [rel .10] - sub rdx, rax - mov rax, [s1p+index*8+16] - lea index, [index+temp+3-UNROLL_SIZE] - jmp rdx - -%macro seq_1 7 - mul s2limb - %7 %3, %1 - lea %1, [rax] - mov rax, [byte s1p+index*8+8*%6] - adc %4, %2 - mov [byte rp+index*8+8*(%6-3)], %3 - mov %3, [byte rp+index*8+8*(%6-1)] - lea %2, [rdx] - adc %5, 0 -%endmacro - - xalign 16 -.9: -%assign i 0 -%rep 16 - %if (i & 1) - seq_1 b_x, b_y, b_z, a_x, a_y, i, sub - %else - seq_1 a_x, a_y, a_z, b_x, b_y, i, sub - %endif -%assign i i + 1 -%endrep -.10:add index, UNROLL_SIZE - jnz .9 -.11:mul s2limb - sub a_z, a_x - mov [rp+index*8-24], a_z - mov a_z, [rp+index*8-8] - adc b_x, a_y - adc b_y, 0 - sub b_z, b_x - mov [rp+index*8-16], b_z - adc rax, b_y - adc rdx, 0 - sub a_z, rax - mov rax, 0 - mov [rp+index*8-8], a_z - adc rax, rdx -.12:END_PROC reg_save_list - - end + end diff --git a/mpn/x86_64w/nehalem/gmp-mparam.h b/mpn/x86_64w/nehalem/gmp-mparam.h index a20a10b7..67e200c0 100644 --- a/mpn/x86_64w/nehalem/gmp-mparam.h +++ b/mpn/x86_64w/nehalem/gmp-mparam.h @@ -6,7 +6,7 @@ #define MUL_TOOM8H_THRESHOLD 270 #define SQR_BASECASE_THRESHOLD 0 /* always (native) */ -#define SQR_KARATSUBA_THRESHOLD 27 +#define SQR_KARATSUBA_THRESHOLD 26 #define SQR_TOOM3_THRESHOLD 90 #define SQR_TOOM4_THRESHOLD 248 #define SQR_TOOM8_THRESHOLD 351 @@ -40,9 +40,9 @@ #define SET_STR_DC_THRESHOLD 6082 #define SET_STR_PRECOMPUTE_THRESHOLD 7122 -#define MUL_FFT_FULL_THRESHOLD 2880 +#define MUL_FFT_FULL_THRESHOLD 3528 -#define SQR_FFT_FULL_THRESHOLD 2880 +#define SQR_FFT_FULL_THRESHOLD 2368 #define MULLOW_BASECASE_THRESHOLD 7 #define MULLOW_DC_THRESHOLD 11 diff --git a/tune/speed.c b/tune/speed.c index 78c1d45e..1edb8db4 100644 --- a/tune/speed.c +++ b/tune/speed.c @@ -1117,6 +1117,14 @@ check_align_option (const char *name, mp_size_t align) } } +#ifdef _WIN64 +# define s2_format "%lld-%lld" +# 
define s3_format "%lld(%lld)%lld" +#else +# define s2_format "%ld-%ld" +# define s3_format "%ld(%ld)%ld" +#endif + int main (int argc, char *argv[]) { @@ -1228,13 +1236,13 @@ main (int argc, char *argv[]) size_allocnum += 10; } size_array[size_num].inc = 0; - if (sscanf (s, "%ld(%ld)%ld", + if (sscanf (s, s3_format, &size_array[size_num].start, &size_array[size_num].inc, &size_array[size_num].end) != 3) { - if (sscanf (s, "%ld-%ld", + if (sscanf (s, s2_format, &size_array[size_num].start, &size_array[size_num].end) != 2) {
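
Implementation notes (not part of the patch itself):

The two assembler files are evidently adapted from GMP's shared Core 2
addmul_1/submul_1 source, which is why each carries the combined
"mpn_addmul_1 and mpn_submul_1" header comment; the WIN64_GCC_PROC
wrapper presumably maps the Microsoft x64 argument registers
(rcx, rdx, r8, r9) onto the System V registers (rdi, rsi, rdx, rcx)
that the GMP register allocation expects, per the calling-convention
table in the old file headers. What the routines compute is rp[] plus
or minus sp[] times a single limb, returning the carry or borrow out of
the top limb. A minimal C model of the addmul semantics, assuming
64-bit limbs and a compiler with unsigned __int128 (MSVC would need the
_umul128 intrinsic instead); the typedefs and names below are
illustrative stand-ins, not MPIR's:

    #include <stdint.h>

    typedef uint64_t  limb_t;    /* stand-in for mp_limb_t */
    typedef long long size_n;    /* stand-in for mp_size_t on Win64 */

    /* rp[0..n-1] += sp[0..n-1] * v; return the carry out of the top limb. */
    limb_t ref_addmul_1(limb_t *rp, const limb_t *sp, size_n n, limb_t v)
    {
        limb_t cy = 0;
        for (size_n i = 0; i < n; i++)
        {
            unsigned __int128 t = (unsigned __int128)sp[i] * v + rp[i] + cy;
            rp[i] = (limb_t)t;         /* low 64 bits of the sum */
            cy = (limb_t)(t >> 64);    /* high 64 bits carry to the next limb */
        }
        return cy;
    }

mpn_submul_1 is the same loop with the product subtracted under borrow
propagation, and the _1c entry points take an initial carry or borrow
as an extra argument.

The gmp-mparam.h hunk retunes Nehalem algorithm crossover points
(Karatsuba squaring and the FFT full-product thresholds), values of the
kind normally produced by the tune/tuneup program rather than set by
hand.

The speed.c hunk is a straight LLP64 fix: on 64-bit Windows, long is
32 bits while mp_size_t is 64 bits, so the old "%ld" conversions made
sscanf store through what it took to be 32-bit fields, leaving the
upper halves of the start/inc/end members undefined. A standalone
illustration of the corrected usage (a hypothetical example, not MPIR
code):

    #include <stdio.h>

    int main(void)
    {
        long long start = 0, end = 0;  /* 64-bit, like mp_size_t on Win64 */
        /* "%ld" would write through (long *) arguments and touch only the
           low 32 bits of each field; "%lld" matches the 64-bit type, which
           is what the patch selects under _WIN64. */
        if (sscanf("128-4096", "%lld-%lld", &start, &end) == 2)
            printf("%lld %lld\n", start, end);
        return 0;
    }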