diff --git a/build.vc14/cdata/mpn/x86_64w/sandybridge/cfg.h b/build.vc14/cdata/mpn/x86_64w/sandybridge/cfg.h index 4effa96c..f33b195a 100644 --- a/build.vc14/cdata/mpn/x86_64w/sandybridge/cfg.h +++ b/build.vc14/cdata/mpn/x86_64w/sandybridge/cfg.h @@ -34,6 +34,7 @@ mpn_modexact_1c_odd mpn_mul_1 mpn_mul_2 mpn_mul_basecase +mpn_mullow_n_basecase mpn_mulmid_basecase mpn_nand_n mpn_nior_n diff --git a/build.vc14/dll_mpir_sandybridge/dll_mpir_sandybridge.vcxproj b/build.vc14/dll_mpir_sandybridge/dll_mpir_sandybridge.vcxproj index 2c8b07ae..f33b7bfd 100644 --- a/build.vc14/dll_mpir_sandybridge/dll_mpir_sandybridge.vcxproj +++ b/build.vc14/dll_mpir_sandybridge/dll_mpir_sandybridge.vcxproj @@ -354,6 +354,10 @@ postbuild "$(TargetPath)" 14 + + + + @@ -385,6 +389,7 @@ postbuild "$(TargetPath)" 14 + @@ -469,6 +474,7 @@ postbuild "$(TargetPath)" 14 + @@ -527,7 +533,6 @@ postbuild "$(TargetPath)" 14 - @@ -537,11 +542,14 @@ postbuild "$(TargetPath)" 14 + + + @@ -615,6 +623,7 @@ postbuild "$(TargetPath)" 14 + diff --git a/build.vc14/dll_mpir_sandybridge/dll_mpir_sandybridge.vcxproj.filters b/build.vc14/dll_mpir_sandybridge/dll_mpir_sandybridge.vcxproj.filters index 30484590..24d6f01d 100644 --- a/build.vc14/dll_mpir_sandybridge/dll_mpir_sandybridge.vcxproj.filters +++ b/build.vc14/dll_mpir_sandybridge/dll_mpir_sandybridge.vcxproj.filters @@ -778,6 +778,18 @@ Source Files\mpz + + Source Files\mpz + + + Source Files\mpz + + + Source Files\mpz + + + Source Files\mpz + Source Files\mpz @@ -871,6 +883,9 @@ Source Files\mpz + + Source Files\mpz + Source Files\mpz @@ -1123,6 +1138,9 @@ Source Files\mpn + + Source Files\mpn + Source Files\mpn @@ -1297,9 +1315,6 @@ Source Files\mpn - - Source Files\mpn - Source Files\mpn @@ -1327,6 +1342,12 @@ Source Files\mpn + + Source Files\mpn + + + Source Files\mpn + Source Files\mpn @@ -1342,6 +1363,9 @@ Source Files\mpn + + Source Files\mpn + Source Files\mpn @@ -1557,6 +1581,9 @@ Source Files\mpn\yasm + + Source Files\mpn\yasm + Source Files\mpn\yasm diff --git a/build.vc14/lib_mpir_sandybridge/lib_mpir_sandybridge.vcxproj b/build.vc14/lib_mpir_sandybridge/lib_mpir_sandybridge.vcxproj index 07552c54..8902ba8a 100644 --- a/build.vc14/lib_mpir_sandybridge/lib_mpir_sandybridge.vcxproj +++ b/build.vc14/lib_mpir_sandybridge/lib_mpir_sandybridge.vcxproj @@ -350,6 +350,10 @@ postbuild "$(TargetPath)" 14 + + + + @@ -381,6 +385,7 @@ postbuild "$(TargetPath)" 14 + @@ -454,6 +459,7 @@ postbuild "$(TargetPath)" 14 + @@ -512,7 +518,6 @@ postbuild "$(TargetPath)" 14 - @@ -522,11 +527,14 @@ postbuild "$(TargetPath)" 14 + + + @@ -600,6 +608,7 @@ postbuild "$(TargetPath)" 14 + diff --git a/build.vc14/lib_mpir_sandybridge/lib_mpir_sandybridge.vcxproj.filters b/build.vc14/lib_mpir_sandybridge/lib_mpir_sandybridge.vcxproj.filters index c47d7142..1ac34ba0 100644 --- a/build.vc14/lib_mpir_sandybridge/lib_mpir_sandybridge.vcxproj.filters +++ b/build.vc14/lib_mpir_sandybridge/lib_mpir_sandybridge.vcxproj.filters @@ -777,6 +777,18 @@ Source Files\mpz + + Source Files\mpz + + + Source Files\mpz + + + Source Files\mpz + + + Source Files\mpz + Source Files\mpz @@ -870,6 +882,9 @@ Source Files\mpz + + Source Files\mpz + Source Files\mpz @@ -1089,6 +1104,9 @@ Source Files\mpn + + Source Files\mpn + Source Files\mpn @@ -1263,9 +1281,6 @@ Source Files\mpn - - Source Files\mpn - Source Files\mpn @@ -1293,6 +1308,12 @@ Source Files\mpn + + Source Files\mpn + + + Source Files\mpn + Source Files\mpn @@ -1308,6 +1329,9 @@ Source Files\mpn + + Source Files\mpn + Source Files\mpn @@ -1523,6 +1547,9 @@ Source Files\mpn\yasm + + Source Files\mpn\yasm + Source Files\mpn\yasm diff --git a/mpn/x86_64w/mullow_n_basecase.asm b/mpn/x86_64w/mullow_n_basecase.asm new file mode 100644 index 00000000..35706248 --- /dev/null +++ b/mpn/x86_64w/mullow_n_basecase.asm @@ -0,0 +1,336 @@ +; AMD64 mpn_mullow_n_basecase +; +; Copyright 2015 Free Software Foundation, Inc. +; +; This file is part of the GNU MP Library. +; +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of either: +; +; * the GNU Lesser General Public License as published by the Free +; Software Foundation; either version 3 of the License, or (at your +; option) any later version. +; +; or +; +; * the GNU General Public License as published by the Free Software +; Foundation; either version 2 of the License, or (at your option) any +; later version. +; +; or both in parallel, as here. +; +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +; for more details. +; +; You should have received copies of the GNU General Public License and the +; GNU Lesser General Public License along with the GNU MP Library. If not, +; see https://www.gnu.org/licenses/. +; +;void mpn_mullow_n_basecase (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n) +; rdi rsi rdx rcx +; rcx rdx r8 r9 + +%include 'yasm_mac.inc' + + TEXT + + LEAF_PROC mpn_mullow_n_basecase + cmp r9, 3 + je asm_sym(?mpn_mullow1) + jg asm_sym(?mpn_mullow2) + mov rax, [rdx] + mov r11, [r8+8] + mov r8, [r8] + cmp r9, 1 + jg .3 + +.2: imul rax, r8 + mov [rcx], rax + ret + +.3: mov r9, [rdx+8] + imul r11, rax + mul r8 + mov [rcx], rax + imul r8, r9 + lea rax, [r11+rdx] + add rax, r8 + mov [rcx+8], rax + ret + +%define reg_save_list rsi, rdi + + FRAME_PROC ?mpn_mullow1, 0, reg_save_list + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov rax, [rsi] + mov r8, [rdx] + mov r9, [rdx+8] + mov r11, [rdx+16] + mul r8 + mov [rdi], rax + mov rax, [rsi] + mov rcx, rdx + mul r9 + imul r9, [rsi+8] + mov r10, [rsi+16] + imul r10, r8 + add rcx, rax + adc r9, rdx + add r9, r10 + mov rax, [rsi+8] + mul r8 + add rcx, rax + adc r9, rdx + mov rax, r11 + imul rax, [rsi] + add r9, rax + mov [rdi+8], rcx + mov [rdi+16], r9 + END_PROC reg_save_list + +%define reg_save_list rsi, rdi, rbx, rbp, r13, r14, r15 + + align 16 + FRAME_PROC ?mpn_mullow2, 0, reg_save_list + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 +.5: mov rax, [rsi] + mov r13, [rdx] + mov r11, rdx + lea rdi, [rdi+rcx*8] + lea rsi, [rsi+rcx*8] + neg rcx + mul r13 + test cl, 1 + jz .12 +.6: lea rdi, [rdi-8] + lea rsi, [rsi-8] + test cl, 2 + jnz .8 +.7: lea r9, [rcx-1] + lea r8, [rel .18] + mov rbx, rax + mov r15, rdx + xor ebp, ebp + xor r10d, r10d + mov rax, [rsi+rcx*8+16] + jmp .11 + +.8: lea r9, [rcx+1] + lea r8, [rel .19] + mov rbp, rax + mov r10, rdx + xor ebx, ebx + jmp .10 + + align 16 +.9: mov [rdi+r9*8-16], rbx + add r15, rax + mov rax, [rsi+r9*8] + adc rbp, rdx + xor ebx, ebx + mul r13 + mov [rdi+r9*8-8], r15 + add rbp, rax + adc r10, rdx +.10:mov rax, [rsi+r9*8+8] + mul r13 + mov [rdi+r9*8], rbp + add r10, rax + adc rbx, rdx + mov rax, [rsi+r9*8+16] + mul r13 + mov [rdi+r9*8+8], r10 + xor ebp, ebp + mov r10, rbp + add rbx, rax + mov rax, [rsi+r9*8+24] + mov r15, rbp + adc r15, rdx +.11:mul r13 + add r9, 4 + js .9 + mov [rdi-16], rbx + add r15, rax + mov [rdi-8], r15 + adc rbp, rdx + imul r13, [rsi] + add rbp, r13 + mov [rdi], rbp + add rcx, 1 + jz .23 + mov r13, [r11+8] + mov r14, [r11+16] + lea rsi, [rsi+16] + lea r11, [r11+8] + lea rdi, [rdi+24] + jmp r8 + +.12:mov r14, [r11+8] + test cl, 2 + jz .14 + + align 16 +.13:lea r9, [rcx+0] + mov r10, rax + mov rbx, rdx + xor r15d, r15d + mov rax, [rsi+rcx*8] + lea r8, [rel .19] + jmp .17 + + align 16 +.14:lea r9, [rcx+2] + mov r10d, 0 + mov r15, rax + mov rax, [rsi+rcx*8] + mov rbp, rdx + lea r8, [rel .18] + jmp .16 + + align 16 +.15:mov rax, [rsi+r9*8-32] + mul r14 + add rbx, rax + adc r15, rdx + mov rax, [rsi+r9*8-24] + xor ebp, ebp + mul r13 + add rbx, rax + mov rax, [rsi+r9*8-24] + adc r15, rdx + adc ebp, 0 + mul r14 + add r15, rax + mov [rdi+r9*8-24], rbx + adc rbp, rdx + mov rax, [rsi+r9*8-16] + mul r13 + mov r10d, 0 + add r15, rax + adc rbp, rdx + mov rax, [rsi+r9*8-16] + adc r10d, 0 +.16:mov ebx, 0 + mov [rdi+r9*8-16], r15 + mul r14 + add rbp, rax + mov rax, [rsi+r9*8-8] + adc r10, rdx + mov r15d, 0 + mul r13 + add rbp, rax + mov rax, [rsi+r9*8-8] + adc r10, rdx + adc ebx, r15d + mul r14 + add r10, rax + mov [rdi+r9*8-8], rbp + adc rbx, rdx + mov rax, [rsi+r9*8] + mul r13 + add r10, rax + adc rbx, rdx + adc r15d, 0 +.17:add r9, 4 + mov [rdi+r9*8-32], r10 + js .15 + imul r14, [rsi-16] + add rbx, r14 + imul r13, [rsi-8] + add rbx, r13 + mov [rdi-8], rbx + add rcx, 2 + jz .23 + mov r13, [r11+16] + mov r14, [r11+24] + lea r11, [r11+16] + lea rdi, [rdi+16] + jmp r8 +.18:lea r9, [rcx-2] + mov rax, [rsi+rcx*8-16] + mul r13 + mov r10, rax + mov rax, [rsi+rcx*8-16] + mov rbx, rdx + xor r15d, r15d + lea r8, [rel .19] + jmp .22 +.19:lea r9, [rcx+0] + mov rax, [rsi+rcx*8-16] + xor r10d, r10d + mul r13 + mov r15, rax + mov rax, [rsi+rcx*8-16] + mov rbp, rdx + lea r8, [rel .18] + jmp .21 + + align 16 +.20:add [rdi+r9*8-32], r10 + adc rbx, rax + mov rax, [rsi+r9*8-24] + adc r15, rdx + xor ebp, ebp + mul r13 + add rbx, rax + mov rax, [rsi+r9*8-24] + adc r15, rdx + adc ebp, ebp + mul r14 + xor r10d, r10d + add [rdi+r9*8-24], rbx + adc r15, rax + mov rax, [rsi+r9*8-16] + adc rbp, rdx + mul r13 + add r15, rax + mov rax, [rsi+r9*8-16] + adc rbp, rdx + adc r10d, 0 +.21:mul r14 + add [rdi+r9*8-16], r15 + adc rbp, rax + mov rax, [rsi+r9*8-8] + adc r10, rdx + mul r13 + xor ebx, ebx + add rbp, rax + adc r10, rdx + mov r15d, 0 + mov rax, [rsi+r9*8-8] + adc ebx, r15d + mul r14 + add [rdi+r9*8-8], rbp + adc r10, rax + adc rbx, rdx + mov rax, [rsi+r9*8] + mul r13 + add r10, rax + mov rax, [rsi+r9*8] + adc rbx, rdx + adc r15d, 0 +.22:mul r14 + add r9, 4 + js .20 + add [rdi-32], r10 + adc rbx, rax + imul r13, [rsi-24] + add rbx, r13 + add [rdi-24], rbx + add rcx, 2 + jns .23 + lea r11, [r11+16] + mov r13, [r11] + mov r14, [r11+8] + lea rsi, [rsi-16] + jmp r8 +.23: + END_PROC reg_save_list diff --git a/mpn/x86_64w/yasm_mac.inc b/mpn/x86_64w/yasm_mac.inc index 52804569..19ee6bbd 100644 --- a/mpn/x86_64w/yasm_mac.inc +++ b/mpn/x86_64w/yasm_mac.inc @@ -63,6 +63,8 @@ %define r6b bpl %define r7b spl +%define asm_sym(x) __g %+ x + ; Standard macro for alignment (used to allow easy subsititution of ; alternative padding schemes) @@ -105,13 +107,13 @@ %macro FRAME_PROC 2-* - global __g%1 + global asm_sym(%1) %ifdef DLL - export __g%1 + export asm_sym(%1) %endif - PROC_FRAME __g%1 + PROC_FRAME asm_sym(%1) %rotate 1 %if %1 < 0 @@ -216,13 +218,13 @@ %macro LEAF_PROC 1 - global __g%1 + global asm_sym(%1) %ifdef DLL - export __g%1 + export asm_sym(%1) %endif -__g%1: +asm_sym(%1): %endmacro