From 3ce4ca48e3865d8addf5406565bd7bee506e7ea1 Mon Sep 17 00:00:00 2001 From: Brian Gladman Date: Wed, 18 Jan 2017 13:09:22 +0000 Subject: [PATCH] add latest skylake AVX code to Windows --- build.vc14/mpir.sln | 135 ++++++++--------- mpn/x86_64w/skylake/avx/add_err1_n.asm | 126 ++++++++++++++++ mpn/x86_64w/skylake/avx/mul_1.asm | 196 +++++++++++++++++++++++++ mpn/x86_64w/skylake/avx/sub_err1_n.asm | 125 ++++++++++++++++ 4 files changed, 515 insertions(+), 67 deletions(-) create mode 100644 mpn/x86_64w/skylake/avx/add_err1_n.asm create mode 100644 mpn/x86_64w/skylake/avx/mul_1.asm create mode 100644 mpn/x86_64w/skylake/avx/sub_err1_n.asm diff --git a/build.vc14/mpir.sln b/build.vc14/mpir.sln index 84b20d86..e0ad2dbb 100644 --- a/build.vc14/mpir.sln +++ b/build.vc14/mpir.sln @@ -1,6 +1,7 @@ + Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio 14 -VisualStudioVersion = 14.0.25420.1 +VisualStudioVersion = 14.0.24720.0 MinimumVisualStudioVersion = 10.0.40219.1 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "lib_mpir_k8", "lib_mpir_k8\lib_mpir_k8.vcxproj", "{22D14BC4-ADC6-48C6-8BA1-4AC45799B945}" EndProject @@ -56,98 +57,98 @@ Global Release|x64 = Release|x64 EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution - {22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Debug|Win32.ActiveCfg = Debug|x64 - {22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Debug|x64.ActiveCfg = Debug|x64 - {22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Release|Win32.ActiveCfg = Release|x64 + {22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Release|Win32.ActiveCfg = Release|Win32 {22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Release|x64.ActiveCfg = Release|x64 - {F4418981-ABAA-4B7A-9538-3E0166149F28}.Debug|Win32.ActiveCfg = Debug|Win32 - {F4418981-ABAA-4B7A-9538-3E0166149F28}.Debug|x64.ActiveCfg = Debug|Win32 + {22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Debug|Win32.ActiveCfg = Debug|Win32 + {22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Debug|x64.ActiveCfg = Debug|x64 
{F4418981-ABAA-4B7A-9538-3E0166149F28}.Release|Win32.ActiveCfg = Release|Win32 - {F4418981-ABAA-4B7A-9538-3E0166149F28}.Release|x64.ActiveCfg = Release|Win32 - {52C97475-88BE-437F-BC53-D3E1205EA08B}.Debug|Win32.ActiveCfg = Debug|Win32 - {52C97475-88BE-437F-BC53-D3E1205EA08B}.Debug|x64.ActiveCfg = Debug|Win32 + {F4418981-ABAA-4B7A-9538-3E0166149F28}.Release|x64.ActiveCfg = Release|x64 + {F4418981-ABAA-4B7A-9538-3E0166149F28}.Debug|Win32.ActiveCfg = Debug|Win32 + {F4418981-ABAA-4B7A-9538-3E0166149F28}.Debug|x64.ActiveCfg = Debug|x64 {52C97475-88BE-437F-BC53-D3E1205EA08B}.Release|Win32.ActiveCfg = Release|Win32 - {52C97475-88BE-437F-BC53-D3E1205EA08B}.Release|x64.ActiveCfg = Release|Win32 - {F4416466-C0B3-4105-B391-CC77FB7A0231}.Debug|Win32.ActiveCfg = Debug|x64 - {F4416466-C0B3-4105-B391-CC77FB7A0231}.Debug|x64.ActiveCfg = Debug|x64 - {F4416466-C0B3-4105-B391-CC77FB7A0231}.Release|Win32.ActiveCfg = Release|x64 + {52C97475-88BE-437F-BC53-D3E1205EA08B}.Release|x64.ActiveCfg = Release|x64 + {52C97475-88BE-437F-BC53-D3E1205EA08B}.Debug|Win32.ActiveCfg = Debug|Win32 + {52C97475-88BE-437F-BC53-D3E1205EA08B}.Debug|x64.ActiveCfg = Debug|x64 + {F4416466-C0B3-4105-B391-CC77FB7A0231}.Release|Win32.ActiveCfg = Release|Win32 {F4416466-C0B3-4105-B391-CC77FB7A0231}.Release|x64.ActiveCfg = Release|x64 - {EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Debug|Win32.ActiveCfg = Debug|Win32 - {EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Debug|x64.ActiveCfg = Debug|x64 + {F4416466-C0B3-4105-B391-CC77FB7A0231}.Debug|Win32.ActiveCfg = Debug|Win32 + {F4416466-C0B3-4105-B391-CC77FB7A0231}.Debug|x64.ActiveCfg = Debug|x64 {EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Release|Win32.ActiveCfg = Release|Win32 {EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Release|x64.ActiveCfg = Release|x64 - {D058893B-87A8-4161-8821-FA5707504B2C}.Debug|Win32.ActiveCfg = Debug|x64 - {D058893B-87A8-4161-8821-FA5707504B2C}.Debug|x64.ActiveCfg = Debug|x64 - {D058893B-87A8-4161-8821-FA5707504B2C}.Release|Win32.ActiveCfg = Release|x64 + 
{EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Debug|Win32.ActiveCfg = Debug|Win32 + {EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Debug|x64.ActiveCfg = Debug|x64 + {D058893B-87A8-4161-8821-FA5707504B2C}.Release|Win32.ActiveCfg = Release|Win32 {D058893B-87A8-4161-8821-FA5707504B2C}.Release|x64.ActiveCfg = Release|x64 - {4A742B65-9836-4F46-8310-728F046A31C1}.Debug|Win32.ActiveCfg = Debug|Win32 - {4A742B65-9836-4F46-8310-728F046A31C1}.Debug|x64.ActiveCfg = Debug|x64 + {D058893B-87A8-4161-8821-FA5707504B2C}.Debug|Win32.ActiveCfg = Debug|Win32 + {D058893B-87A8-4161-8821-FA5707504B2C}.Debug|x64.ActiveCfg = Debug|x64 {4A742B65-9836-4F46-8310-728F046A31C1}.Release|Win32.ActiveCfg = Release|Win32 {4A742B65-9836-4F46-8310-728F046A31C1}.Release|x64.ActiveCfg = Release|x64 - {935C1E48-A26F-4E73-B8EA-B163F21E833B}.Debug|Win32.ActiveCfg = Debug|x64 - {935C1E48-A26F-4E73-B8EA-B163F21E833B}.Debug|x64.ActiveCfg = Debug|x64 - {935C1E48-A26F-4E73-B8EA-B163F21E833B}.Release|Win32.ActiveCfg = Release|x64 + {4A742B65-9836-4F46-8310-728F046A31C1}.Debug|Win32.ActiveCfg = Debug|Win32 + {4A742B65-9836-4F46-8310-728F046A31C1}.Debug|x64.ActiveCfg = Debug|x64 + {935C1E48-A26F-4E73-B8EA-B163F21E833B}.Release|Win32.ActiveCfg = Release|Win32 {935C1E48-A26F-4E73-B8EA-B163F21E833B}.Release|x64.ActiveCfg = Release|x64 - {49DD6B75-0009-40DC-BE9E-383C565494C8}.Debug|Win32.ActiveCfg = Debug|x64 - {49DD6B75-0009-40DC-BE9E-383C565494C8}.Debug|x64.ActiveCfg = Debug|x64 - {49DD6B75-0009-40DC-BE9E-383C565494C8}.Release|Win32.ActiveCfg = Release|x64 + {935C1E48-A26F-4E73-B8EA-B163F21E833B}.Debug|Win32.ActiveCfg = Debug|Win32 + {935C1E48-A26F-4E73-B8EA-B163F21E833B}.Debug|x64.ActiveCfg = Debug|x64 + {49DD6B75-0009-40DC-BE9E-383C565494C8}.Release|Win32.ActiveCfg = Release|Win32 {49DD6B75-0009-40DC-BE9E-383C565494C8}.Release|x64.ActiveCfg = Release|x64 - {D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Debug|Win32.ActiveCfg = Debug|Win32 - {D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Debug|x64.ActiveCfg = Debug|x64 + 
{49DD6B75-0009-40DC-BE9E-383C565494C8}.Debug|Win32.ActiveCfg = Debug|Win32 + {49DD6B75-0009-40DC-BE9E-383C565494C8}.Debug|x64.ActiveCfg = Debug|x64 {D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Release|Win32.ActiveCfg = Release|Win32 {D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Release|x64.ActiveCfg = Release|x64 - {EC599438-E241-49B5-ACA5-CB897F8041B5}.Debug|Win32.ActiveCfg = Debug|x64 - {EC599438-E241-49B5-ACA5-CB897F8041B5}.Debug|x64.ActiveCfg = Debug|x64 - {EC599438-E241-49B5-ACA5-CB897F8041B5}.Release|Win32.ActiveCfg = Release|x64 + {D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Debug|Win32.ActiveCfg = Debug|Win32 + {D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Debug|x64.ActiveCfg = Debug|x64 + {EC599438-E241-49B5-ACA5-CB897F8041B5}.Release|Win32.ActiveCfg = Release|Win32 {EC599438-E241-49B5-ACA5-CB897F8041B5}.Release|x64.ActiveCfg = Release|x64 - {87821166-8F44-41D1-AE5B-151CD73C96D3}.Debug|Win32.ActiveCfg = Debug|x64 - {87821166-8F44-41D1-AE5B-151CD73C96D3}.Debug|x64.ActiveCfg = Debug|x64 - {87821166-8F44-41D1-AE5B-151CD73C96D3}.Release|Win32.ActiveCfg = Release|x64 + {EC599438-E241-49B5-ACA5-CB897F8041B5}.Debug|Win32.ActiveCfg = Debug|Win32 + {EC599438-E241-49B5-ACA5-CB897F8041B5}.Debug|x64.ActiveCfg = Debug|x64 + {87821166-8F44-41D1-AE5B-151CD73C96D3}.Release|Win32.ActiveCfg = Release|Win32 {87821166-8F44-41D1-AE5B-151CD73C96D3}.Release|x64.ActiveCfg = Release|x64 - {3D01F893-119C-4089-A1EA-5645E8C4F366}.Debug|Win32.ActiveCfg = Debug|x64 - {3D01F893-119C-4089-A1EA-5645E8C4F366}.Debug|x64.ActiveCfg = Debug|x64 - {3D01F893-119C-4089-A1EA-5645E8C4F366}.Release|Win32.ActiveCfg = Release|x64 + {87821166-8F44-41D1-AE5B-151CD73C96D3}.Debug|Win32.ActiveCfg = Debug|Win32 + {87821166-8F44-41D1-AE5B-151CD73C96D3}.Debug|x64.ActiveCfg = Debug|x64 + {3D01F893-119C-4089-A1EA-5645E8C4F366}.Release|Win32.ActiveCfg = Release|Win32 {3D01F893-119C-4089-A1EA-5645E8C4F366}.Release|x64.ActiveCfg = Release|x64 - {5E4AA504-9B77-4907-A93F-982FCED077EC}.Debug|Win32.ActiveCfg = Debug|x64 - 
{5E4AA504-9B77-4907-A93F-982FCED077EC}.Debug|x64.ActiveCfg = Debug|x64 - {5E4AA504-9B77-4907-A93F-982FCED077EC}.Release|Win32.ActiveCfg = Release|x64 + {3D01F893-119C-4089-A1EA-5645E8C4F366}.Debug|Win32.ActiveCfg = Debug|Win32 + {3D01F893-119C-4089-A1EA-5645E8C4F366}.Debug|x64.ActiveCfg = Debug|x64 + {5E4AA504-9B77-4907-A93F-982FCED077EC}.Release|Win32.ActiveCfg = Release|Win32 {5E4AA504-9B77-4907-A93F-982FCED077EC}.Release|x64.ActiveCfg = Release|x64 - {51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Debug|Win32.ActiveCfg = Debug|x64 - {51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Debug|x64.ActiveCfg = Debug|x64 - {51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Release|Win32.ActiveCfg = Release|x64 + {5E4AA504-9B77-4907-A93F-982FCED077EC}.Debug|Win32.ActiveCfg = Debug|Win32 + {5E4AA504-9B77-4907-A93F-982FCED077EC}.Debug|x64.ActiveCfg = Debug|x64 + {51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Release|Win32.ActiveCfg = Release|Win32 {51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Release|x64.ActiveCfg = Release|x64 - {6CA24950-D41E-4264-94FE-92AFB04CFD2F}.Debug|Win32.ActiveCfg = Debug|x64 - {6CA24950-D41E-4264-94FE-92AFB04CFD2F}.Debug|x64.ActiveCfg = Debug|x64 - {6CA24950-D41E-4264-94FE-92AFB04CFD2F}.Release|Win32.ActiveCfg = Release|x64 + {51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Debug|Win32.ActiveCfg = Debug|Win32 + {51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Debug|x64.ActiveCfg = Debug|x64 + {6CA24950-D41E-4264-94FE-92AFB04CFD2F}.Release|Win32.ActiveCfg = Release|Win32 {6CA24950-D41E-4264-94FE-92AFB04CFD2F}.Release|x64.ActiveCfg = Release|x64 - {43CB4B7D-F204-4C79-8585-556B95C8C120}.Debug|Win32.ActiveCfg = Debug|x64 - {43CB4B7D-F204-4C79-8585-556B95C8C120}.Debug|x64.ActiveCfg = Debug|x64 - {43CB4B7D-F204-4C79-8585-556B95C8C120}.Release|Win32.ActiveCfg = Release|x64 + {6CA24950-D41E-4264-94FE-92AFB04CFD2F}.Debug|Win32.ActiveCfg = Debug|Win32 + {6CA24950-D41E-4264-94FE-92AFB04CFD2F}.Debug|x64.ActiveCfg = Debug|x64 + {43CB4B7D-F204-4C79-8585-556B95C8C120}.Release|Win32.ActiveCfg = Release|Win32 
{43CB4B7D-F204-4C79-8585-556B95C8C120}.Release|x64.ActiveCfg = Release|x64 - {1BE624B2-5FEA-4DA6-A7FA-9A1340E3361F}.Debug|Win32.ActiveCfg = Debug|x64 - {1BE624B2-5FEA-4DA6-A7FA-9A1340E3361F}.Debug|x64.ActiveCfg = Debug|x64 - {1BE624B2-5FEA-4DA6-A7FA-9A1340E3361F}.Release|Win32.ActiveCfg = Release|x64 + {43CB4B7D-F204-4C79-8585-556B95C8C120}.Debug|Win32.ActiveCfg = Debug|Win32 + {43CB4B7D-F204-4C79-8585-556B95C8C120}.Debug|x64.ActiveCfg = Debug|x64 + {1BE624B2-5FEA-4DA6-A7FA-9A1340E3361F}.Release|Win32.ActiveCfg = Release|Win32 {1BE624B2-5FEA-4DA6-A7FA-9A1340E3361F}.Release|x64.ActiveCfg = Release|x64 - {5811A327-3992-4365-95CC-47CB0F9532A5}.Debug|Win32.ActiveCfg = Debug|x64 - {5811A327-3992-4365-95CC-47CB0F9532A5}.Debug|x64.ActiveCfg = Debug|x64 - {5811A327-3992-4365-95CC-47CB0F9532A5}.Release|Win32.ActiveCfg = Release|x64 + {1BE624B2-5FEA-4DA6-A7FA-9A1340E3361F}.Debug|Win32.ActiveCfg = Debug|Win32 + {1BE624B2-5FEA-4DA6-A7FA-9A1340E3361F}.Debug|x64.ActiveCfg = Debug|x64 + {5811A327-3992-4365-95CC-47CB0F9532A5}.Release|Win32.ActiveCfg = Release|Win32 {5811A327-3992-4365-95CC-47CB0F9532A5}.Release|x64.ActiveCfg = Release|x64 - {58ED6FD2-E37D-4364-9428-6314426FFCC2}.Debug|Win32.ActiveCfg = Debug|x64 - {58ED6FD2-E37D-4364-9428-6314426FFCC2}.Debug|x64.ActiveCfg = Debug|x64 - {58ED6FD2-E37D-4364-9428-6314426FFCC2}.Release|Win32.ActiveCfg = Release|x64 + {5811A327-3992-4365-95CC-47CB0F9532A5}.Debug|Win32.ActiveCfg = Debug|Win32 + {5811A327-3992-4365-95CC-47CB0F9532A5}.Debug|x64.ActiveCfg = Debug|x64 + {58ED6FD2-E37D-4364-9428-6314426FFCC2}.Release|Win32.ActiveCfg = Release|Win32 {58ED6FD2-E37D-4364-9428-6314426FFCC2}.Release|x64.ActiveCfg = Release|x64 - {1B807FDE-B444-4AD4-8A2A-814D52124585}.Debug|Win32.ActiveCfg = Debug|x64 - {1B807FDE-B444-4AD4-8A2A-814D52124585}.Debug|x64.ActiveCfg = Debug|x64 - {1B807FDE-B444-4AD4-8A2A-814D52124585}.Release|Win32.ActiveCfg = Release|x64 + {58ED6FD2-E37D-4364-9428-6314426FFCC2}.Debug|Win32.ActiveCfg = Debug|Win32 + 
{58ED6FD2-E37D-4364-9428-6314426FFCC2}.Debug|x64.ActiveCfg = Debug|x64 + {1B807FDE-B444-4AD4-8A2A-814D52124585}.Release|Win32.ActiveCfg = Release|Win32 {1B807FDE-B444-4AD4-8A2A-814D52124585}.Release|x64.ActiveCfg = Release|x64 - {DF571850-CFB2-4F13-9F15-DDA6B9D0210E}.Debug|Win32.ActiveCfg = Debug|x64 - {DF571850-CFB2-4F13-9F15-DDA6B9D0210E}.Debug|x64.ActiveCfg = Debug|x64 - {DF571850-CFB2-4F13-9F15-DDA6B9D0210E}.Release|Win32.ActiveCfg = Release|x64 + {1B807FDE-B444-4AD4-8A2A-814D52124585}.Debug|Win32.ActiveCfg = Debug|Win32 + {1B807FDE-B444-4AD4-8A2A-814D52124585}.Debug|x64.ActiveCfg = Debug|x64 + {DF571850-CFB2-4F13-9F15-DDA6B9D0210E}.Release|Win32.ActiveCfg = Release|Win32 {DF571850-CFB2-4F13-9F15-DDA6B9D0210E}.Release|x64.ActiveCfg = Release|x64 - {09A387AF-18EA-40EB-AD27-9DF346740987}.Debug|Win32.ActiveCfg = Debug|x64 - {09A387AF-18EA-40EB-AD27-9DF346740987}.Debug|x64.ActiveCfg = Debug|x64 - {09A387AF-18EA-40EB-AD27-9DF346740987}.Release|Win32.ActiveCfg = Release|x64 + {DF571850-CFB2-4F13-9F15-DDA6B9D0210E}.Debug|Win32.ActiveCfg = Debug|Win32 + {DF571850-CFB2-4F13-9F15-DDA6B9D0210E}.Debug|x64.ActiveCfg = Debug|x64 + {09A387AF-18EA-40EB-AD27-9DF346740987}.Release|Win32.ActiveCfg = Release|Win32 {09A387AF-18EA-40EB-AD27-9DF346740987}.Release|x64.ActiveCfg = Release|x64 + {09A387AF-18EA-40EB-AD27-9DF346740987}.Debug|Win32.ActiveCfg = Debug|Win32 + {09A387AF-18EA-40EB-AD27-9DF346740987}.Debug|x64.ActiveCfg = Debug|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/mpn/x86_64w/skylake/avx/add_err1_n.asm b/mpn/x86_64w/skylake/avx/add_err1_n.asm new file mode 100644 index 00000000..63dec4b0 --- /dev/null +++ b/mpn/x86_64w/skylake/avx/add_err1_n.asm @@ -0,0 +1,126 @@ +; AMD64 mpn_add_err1_n +; Copyright 2017 Alexander Kruppa +; This file is part of the MPIR Library. 
+; The MPIR Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published +; by the Free Software Foundation; either version 2.1 of the License, or (at +; your option) any later version. +; The MPIR Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. +; You should have received a copy of the GNU Lesser General Public License +; along with the MPIR Library; see the file COPYING.LIB. If not, write +; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, +; Boston, MA 02110-1301, USA. + +; (rdi,r9) = (rsi,r9)+(rdx,r9)+CyIn +; rax = carry +; (rcx,2) = rev(r8,r9) \dot (carry,r9) where carry is the sequence +; of carries from the addition of (rsi,r9)+(rdx,r9) + +; mp_limb_t mpn_add_err1_n(mp_ptr, mp_ptr, mp_ptr, mp_ptr, mp_ptr, mp_size_t, mp_limb_t) +; mp_limb_t mpn_sub_err1_n(mp_ptr, mp_ptr, mp_ptr, mp_ptr, mp_ptr, mp_size_t, mp_limb_t) +; rax rdi rsi rdx rcx r8 r9 8(rsp) +; rax rcx rdx r8 r9 [rsp+40] [rsp+48] [rsp+56] + +%include 'yasm_mac.inc' + +%define SumP rdi +%define Inp1P rsi +%define Inp2P rdx +%define EP r11 +%define SizeRest rcx +%define YP r8 +%define Size r9 +%define CyIn [rsp+8] +%define LIMB0 rax +%define E0 r12 +%define E1 r13 +%define Zero r14 +%define Dummy rbx + +%define reg_save_list rsi, rdi, rbx, r12, r13, r14 + + align 32 + BITS 64 + +%macro DO_LIMB 1 + mov LIMB0, [Inp1P + %1*8] + adc LIMB0, [Inp2P + %1*8] + mov [SumP + %1*8], LIMB0 + mov LIMB0, [YP - %1*8] + cmovnc LIMB0, Zero + inc Dummy ; OF = 0 (NOTE(review): Dummy is not initialised in this file; OF would be set if it held 0x7FFFFFFFFFFFFFFF) + adox E0, LIMB0 + adox E1, Zero +%endmacro + +FRAME_PROC mpn_add_err1_n, 0, reg_save_list + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov r11, r9 + mov r8, [rsp + stack_use + 40] + mov r9, [rsp + stack_use + 48] + mov LIMB0, [rsp + stack_use + 56] + + mov
SizeRest, Size + lea YP, [YP + Size*8 - 8] + and SizeRest, 7 + xor Zero, Zero + mov E0, Zero + mov E1, Zero + shr Size, 3 + bt LIMB0, 0 + jz .testrest + + align 16 +.loop: + DO_LIMB 0 + DO_LIMB 1 + DO_LIMB 2 + DO_LIMB 3 + DO_LIMB 4 + DO_LIMB 5 + DO_LIMB 6 + DO_LIMB 7 + + lea Inp1P, [Inp1P+64] + lea Inp2P, [Inp2P+64] + lea SumP, [SumP+64] + lea YP, [YP-64] + + dec Size + jne .loop + +.testrest: + inc SizeRest + dec SizeRest + jz .exit + +.rest: + DO_LIMB 0 + dec SizeRest + jz .exit + DO_LIMB 1 + dec SizeRest + jz .exit + DO_LIMB 2 + dec SizeRest + jz .exit + DO_LIMB 3 + dec SizeRest + jz .exit + lea Inp1P, [Inp1P+32] + lea Inp2P, [Inp2P+32] + lea SumP, [SumP+32] + lea YP, [YP-32] + jmp .rest + +.exit: + mov rax, Zero + setc al + mov [EP], E0 + mov [EP+8], E1 + END_PROC reg_save_list diff --git a/mpn/x86_64w/skylake/avx/mul_1.asm b/mpn/x86_64w/skylake/avx/mul_1.asm new file mode 100644 index 00000000..f78fff40 --- /dev/null +++ b/mpn/x86_64w/skylake/avx/mul_1.asm @@ -0,0 +1,196 @@ +; AMD64 mpn_mul_1 +; Copyright 2016 Jens Nurmann and Alexander Kruppa +; This file is part of the MPIR Library. +; The MPIR Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published +; by the Free Software Foundation; either version 2.1 of the License, or (at +; your option) any later version. +; The MPIR Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. +; You should have received a copy of the GNU Lesser General Public License +; along with the MPIR Library; see the file COPYING.LIB. If not, write +; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, +; Boston, MA 02110-1301, USA. 
+ +; (rdi,rdx) = rcx*(rsi,rdx) +; rax = high word of product + +; mp_limb_t mpn_mul_1(mp_ptr, mp_ptr, mp_size_t, mp_limb_t) +; mp_limb_t mpn_mul_1c(mp_ptr, mp_ptr, mp_size_t, mp_limb_t, mp_limb_t) +; rax rdi rsi rdx rcx r8 +; rax rcx rdx r8 r9 [rsp+40] + + +%include 'yasm_mac.inc' + + BITS 64 + +; the following register allocation scheme is valid for Linux + + %define RP RDI + %define S1P RSI + %define Size RCX + %define S2 RDX + + %define MulLo0 R8 + %define MulHi0 R9 + %define MulLo1 R10 + %define MulHi1 R11 + %define MulLo2 R12 + %define MulHi2 R13 + %define MulLo3 R14 + %define MulHi3 RBX + +%define reg_save_list rsi, rdi, rbx, r12, r13, r14 + + align 32 +FRAME_PROC mpn_mul_1, 0, reg_save_list + mov rdi, rcx + mov rsi, rdx + mov rcx, r8 + mov rdx, r9 + mov r8, [rsp + stack_use + 40] + + xor MulHi3, MulHi3 + + mov RAX, Size ; may be increased by 1 at the end + sub Size, 4 + jc .Post ; separate handling of remaining max. 3 limb => + + ; prepare a quadlimb for main-loop entry + mulx MulHi0, MulLo0, [S1P] + mulx MulHi1, MulLo1, [S1P+8] + mulx MulHi2, MulLo2, [S1P+16] + mulx MulHi3, MulLo3, [S1P+24] + add S1P, 32 + add MulLo1, MulHi0 + adc MulLo2, MulHi1 + adc MulLo3, MulHi2 + adc MulHi3, 0 + + jmp .Check ; enter main loop => + + ; main loop (unloaded operands) + ; - 1.25 cycles per limb in L1D$ + ; - 1.25 cycles per limb in L2D$ + ; - 1.60-1.72 cycles per limb in L3D$ + align 32 + .Loop: + + mov [RP], MulLo0 + mov [RP+8], MulLo1 + mov [RP+16], MulLo2 + mov [RP+24], MulLo3 + mulx MulHi0, MulLo0, [S1P] + mulx MulHi1, MulLo1, [S1P+8] + mulx MulHi2, MulLo2, [S1P+16] + add MulLo0, MulHi3 + mov [RP+32], MulLo0 + adc MulLo1, MulHi0 + mov [RP+40], MulLo1 + adc MulLo2, MulHi1 + mov [RP+48], MulLo2 + mulx MulHi3, MulLo3, [S1P+24] + mulx MulHi0, MulLo0, [S1P+32] + mulx MulHi1, MulLo1, [S1P+40] + adc MulLo3, MulHi2 ; no carry-out here + adc MulLo0, MulHi3 + adc MulLo1, MulHi0 + mulx MulHi2, MulLo2, [S1P+48] + adc MulLo2, MulHi1 + mov [RP+56], MulLo3 + mulx MulHi3, MulLo3, 
[S1P+56] + adc MulLo3, MulHi2 + adc MulHi3, 0 + + add S1P, 64 + add RP, 64 + + .Check: + + sub Size, 8 + jnc .Loop + + ; core loop roll-out 8 can generate dangling quad-limb + test Size, 4 + je .Store ; no dangling quad-limb => + + mov [RP], MulLo0 + mulx MulHi0, MulLo0, [S1P] + mov [RP+8], MulLo1 + mulx MulHi1, MulLo1, [S1P+8] + mov [RP+16], MulLo2 + mulx MulHi2, MulLo2, [S1P+16] + add MulLo0, MulHi3 + mov [RP+24], MulLo3 + mulx MulHi3, MulLo3, [S1P+24] + adc MulLo1, MulHi0 + adc MulLo2, MulHi1 + adc MulLo3, MulHi2 + adc MulHi3, 0 + + add S1P, 32 + add RP, 32 + + ; store remaining quad-limb from main loop + .Store: + mov [RP], MulLo0 + mov [RP+8], MulLo1 + mov [RP+16], MulLo2 + mov [RP+24], MulLo3 + add RP, 32 + + ; handle final 0-3 single limb of S1P + .Post: + + and Size, 3 + je .Post0 + + cmp Size, 2 + ja .Post3 + je .Post2 + + .Post1: + + mulx MulHi0, MulLo0, [S1P] + add MulLo0, MulHi3 + adc MulHi0, 0 + mov [RP], MulLo0 + mov rax, MulHi0 + jmp .Exit + + .Post2: + + mulx MulHi0, MulLo0, [S1P] + mulx MulHi1, MulLo1, [S1P+8] + add MulLo0, MulHi3 + adc MulLo1, MulHi0 + adc MulHi1, 0 + mov [RP], MulLo0 + mov [RP+8], MulLo1 + mov rax, MulHi1 + jmp .Exit + + .Post3: + + mulx MulHi0, MulLo0, [S1P] + mulx MulHi1, MulLo1, [S1P+8] + mulx MulHi2, MulLo2, [S1P+16] + add MulLo0, MulHi3 + adc MulLo1, MulHi0 + adc MulLo2, MulHi1 + adc MulHi2, 0 + mov [RP], MulLo0 + mov [RP+8], MulLo1 + mov [RP+16], MulLo2 + mov rax, MulHi2 + jmp .Exit + + .Post0: + + mov rax, MulHi3 + + .Exit: + END_PROC reg_save_list diff --git a/mpn/x86_64w/skylake/avx/sub_err1_n.asm b/mpn/x86_64w/skylake/avx/sub_err1_n.asm new file mode 100644 index 00000000..5c1e1c7a --- /dev/null +++ b/mpn/x86_64w/skylake/avx/sub_err1_n.asm @@ -0,0 +1,125 @@ +; AMD64 mpn_sub_err1_n +; Copyright 2017 Alexander Kruppa +; This file is part of the MPIR Library.
+; The MPIR Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published +; by the Free Software Foundation; either version 2.1 of the License, or (at +; your option) any later version. +; The MPIR Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. +; You should have received a copy of the GNU Lesser General Public License +; along with the MPIR Library; see the file COPYING.LIB. If not, write +; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, +; Boston, MA 02110-1301, USA. + +; (rdi,r9) = (rsi,r9)-(rdx,r9)-BwIn +; rax = borrow +; (rcx,2) = rev(r8,r9) \dot (borrow,r9) where borrow is the sequence +; of borrows from the subtraction of (rsi,r9)-(rdx,r9) + +; mp_limb_t mpn_add_err1_n(mp_ptr, mp_ptr, mp_ptr, mp_ptr, mp_ptr, mp_size_t, mp_limb_t) +; mp_limb_t mpn_sub_err1_n(mp_ptr, mp_ptr, mp_ptr, mp_ptr, mp_ptr, mp_size_t, mp_limb_t) +; rax rdi rsi rdx rcx r8 r9 8(rsp) +; rax rcx rdx r8 r9 [rsp+40] [rsp+48] [rsp+56] + +%include 'yasm_mac.inc' + +%define SumP rdi +%define Inp1P rsi +%define Inp2P rdx +%define EP r11 +%define SizeRest rcx +%define YP r8 +%define Size r9 +%define LIMB0 rax +%define E0 r12 +%define E1 r13 +%define Zero r14 +%define Dummy rbx + +%define reg_save_list rsi, rdi, rbx, r12, r13, r14 + + align 32 + BITS 64 + +%macro DO_LIMB 1 + mov LIMB0, [Inp1P + %1*8] + sbb LIMB0, [Inp2P + %1*8] + mov [SumP + %1*8], LIMB0 + mov LIMB0, [YP - %1*8] + cmovnc LIMB0, Zero + inc Dummy ; OF = 0 (NOTE(review): Dummy is not initialised in this file; OF would be set if it held 0x7FFFFFFFFFFFFFFF) + adox E0, LIMB0 + adox E1, Zero +%endmacro + +FRAME_PROC mpn_sub_err1_n, 0, reg_save_list + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov r11, r9 + mov r8, [rsp + stack_use + 40] + mov r9, [rsp + stack_use + 48] + mov LIMB0, [rsp + stack_use + 56] + + mov SizeRest, Size + lea
YP, [YP + Size*8 - 8] + and SizeRest, 7 + xor Zero, Zero + mov E0, Zero + mov E1, Zero + shr Size, 3 + bt LIMB0, 0 + jz .testrest + + align 16 +.loop: + DO_LIMB 0 + DO_LIMB 1 + DO_LIMB 2 + DO_LIMB 3 + DO_LIMB 4 + DO_LIMB 5 + DO_LIMB 6 + DO_LIMB 7 + + lea Inp1P, [Inp1P+64] + lea Inp2P, [Inp2P+64] + lea SumP, [SumP+64] + lea YP, [YP-64] + + dec Size + jne .loop + +.testrest: + inc SizeRest + dec SizeRest + jz .exit + +.rest: + DO_LIMB 0 + dec SizeRest + jz .exit + DO_LIMB 1 + dec SizeRest + jz .exit + DO_LIMB 2 + dec SizeRest + jz .exit + DO_LIMB 3 + dec SizeRest + jz .exit + lea Inp1P, [Inp1P+32] + lea Inp2P, [Inp2P+32] + lea SumP, [SumP+32] + lea YP, [YP-32] + jmp .rest + +.exit: + mov rax, Zero + setc al + mov [EP], E0 + mov [EP+8], E1 + END_PROC reg_save_list