add latest skylake AVX code to Windows
This commit is contained in:
parent
721da455a0
commit
3ce4ca48e3
@ -1,6 +1,7 @@
|
||||
|
||||
Microsoft Visual Studio Solution File, Format Version 12.00
|
||||
# Visual Studio 14
|
||||
VisualStudioVersion = 14.0.25420.1
|
||||
VisualStudioVersion = 14.0.24720.0
|
||||
MinimumVisualStudioVersion = 10.0.40219.1
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "lib_mpir_k8", "lib_mpir_k8\lib_mpir_k8.vcxproj", "{22D14BC4-ADC6-48C6-8BA1-4AC45799B945}"
|
||||
EndProject
|
||||
@ -56,98 +57,98 @@ Global
|
||||
Release|x64 = Release|x64
|
||||
EndGlobalSection
|
||||
GlobalSection(ProjectConfigurationPlatforms) = postSolution
|
||||
{22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Debug|Win32.ActiveCfg = Debug|x64
|
||||
{22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Release|Win32.ActiveCfg = Release|x64
|
||||
{22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Release|x64.ActiveCfg = Release|x64
|
||||
{F4418981-ABAA-4B7A-9538-3E0166149F28}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{F4418981-ABAA-4B7A-9538-3E0166149F28}.Debug|x64.ActiveCfg = Debug|Win32
|
||||
{22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{F4418981-ABAA-4B7A-9538-3E0166149F28}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{F4418981-ABAA-4B7A-9538-3E0166149F28}.Release|x64.ActiveCfg = Release|Win32
|
||||
{52C97475-88BE-437F-BC53-D3E1205EA08B}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{52C97475-88BE-437F-BC53-D3E1205EA08B}.Debug|x64.ActiveCfg = Debug|Win32
|
||||
{F4418981-ABAA-4B7A-9538-3E0166149F28}.Release|x64.ActiveCfg = Release|x64
|
||||
{F4418981-ABAA-4B7A-9538-3E0166149F28}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{F4418981-ABAA-4B7A-9538-3E0166149F28}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{52C97475-88BE-437F-BC53-D3E1205EA08B}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{52C97475-88BE-437F-BC53-D3E1205EA08B}.Release|x64.ActiveCfg = Release|Win32
|
||||
{F4416466-C0B3-4105-B391-CC77FB7A0231}.Debug|Win32.ActiveCfg = Debug|x64
|
||||
{F4416466-C0B3-4105-B391-CC77FB7A0231}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{F4416466-C0B3-4105-B391-CC77FB7A0231}.Release|Win32.ActiveCfg = Release|x64
|
||||
{52C97475-88BE-437F-BC53-D3E1205EA08B}.Release|x64.ActiveCfg = Release|x64
|
||||
{52C97475-88BE-437F-BC53-D3E1205EA08B}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{52C97475-88BE-437F-BC53-D3E1205EA08B}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{F4416466-C0B3-4105-B391-CC77FB7A0231}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{F4416466-C0B3-4105-B391-CC77FB7A0231}.Release|x64.ActiveCfg = Release|x64
|
||||
{EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{F4416466-C0B3-4105-B391-CC77FB7A0231}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{F4416466-C0B3-4105-B391-CC77FB7A0231}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Release|x64.ActiveCfg = Release|x64
|
||||
{D058893B-87A8-4161-8821-FA5707504B2C}.Debug|Win32.ActiveCfg = Debug|x64
|
||||
{D058893B-87A8-4161-8821-FA5707504B2C}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{D058893B-87A8-4161-8821-FA5707504B2C}.Release|Win32.ActiveCfg = Release|x64
|
||||
{EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{D058893B-87A8-4161-8821-FA5707504B2C}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{D058893B-87A8-4161-8821-FA5707504B2C}.Release|x64.ActiveCfg = Release|x64
|
||||
{4A742B65-9836-4F46-8310-728F046A31C1}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{4A742B65-9836-4F46-8310-728F046A31C1}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{D058893B-87A8-4161-8821-FA5707504B2C}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{D058893B-87A8-4161-8821-FA5707504B2C}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{4A742B65-9836-4F46-8310-728F046A31C1}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{4A742B65-9836-4F46-8310-728F046A31C1}.Release|x64.ActiveCfg = Release|x64
|
||||
{935C1E48-A26F-4E73-B8EA-B163F21E833B}.Debug|Win32.ActiveCfg = Debug|x64
|
||||
{935C1E48-A26F-4E73-B8EA-B163F21E833B}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{935C1E48-A26F-4E73-B8EA-B163F21E833B}.Release|Win32.ActiveCfg = Release|x64
|
||||
{4A742B65-9836-4F46-8310-728F046A31C1}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{4A742B65-9836-4F46-8310-728F046A31C1}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{935C1E48-A26F-4E73-B8EA-B163F21E833B}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{935C1E48-A26F-4E73-B8EA-B163F21E833B}.Release|x64.ActiveCfg = Release|x64
|
||||
{49DD6B75-0009-40DC-BE9E-383C565494C8}.Debug|Win32.ActiveCfg = Debug|x64
|
||||
{49DD6B75-0009-40DC-BE9E-383C565494C8}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{49DD6B75-0009-40DC-BE9E-383C565494C8}.Release|Win32.ActiveCfg = Release|x64
|
||||
{935C1E48-A26F-4E73-B8EA-B163F21E833B}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{935C1E48-A26F-4E73-B8EA-B163F21E833B}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{49DD6B75-0009-40DC-BE9E-383C565494C8}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{49DD6B75-0009-40DC-BE9E-383C565494C8}.Release|x64.ActiveCfg = Release|x64
|
||||
{D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{49DD6B75-0009-40DC-BE9E-383C565494C8}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{49DD6B75-0009-40DC-BE9E-383C565494C8}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Release|x64.ActiveCfg = Release|x64
|
||||
{EC599438-E241-49B5-ACA5-CB897F8041B5}.Debug|Win32.ActiveCfg = Debug|x64
|
||||
{EC599438-E241-49B5-ACA5-CB897F8041B5}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{EC599438-E241-49B5-ACA5-CB897F8041B5}.Release|Win32.ActiveCfg = Release|x64
|
||||
{D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{EC599438-E241-49B5-ACA5-CB897F8041B5}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{EC599438-E241-49B5-ACA5-CB897F8041B5}.Release|x64.ActiveCfg = Release|x64
|
||||
{87821166-8F44-41D1-AE5B-151CD73C96D3}.Debug|Win32.ActiveCfg = Debug|x64
|
||||
{87821166-8F44-41D1-AE5B-151CD73C96D3}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{87821166-8F44-41D1-AE5B-151CD73C96D3}.Release|Win32.ActiveCfg = Release|x64
|
||||
{EC599438-E241-49B5-ACA5-CB897F8041B5}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{EC599438-E241-49B5-ACA5-CB897F8041B5}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{87821166-8F44-41D1-AE5B-151CD73C96D3}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{87821166-8F44-41D1-AE5B-151CD73C96D3}.Release|x64.ActiveCfg = Release|x64
|
||||
{3D01F893-119C-4089-A1EA-5645E8C4F366}.Debug|Win32.ActiveCfg = Debug|x64
|
||||
{3D01F893-119C-4089-A1EA-5645E8C4F366}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{3D01F893-119C-4089-A1EA-5645E8C4F366}.Release|Win32.ActiveCfg = Release|x64
|
||||
{87821166-8F44-41D1-AE5B-151CD73C96D3}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{87821166-8F44-41D1-AE5B-151CD73C96D3}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{3D01F893-119C-4089-A1EA-5645E8C4F366}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{3D01F893-119C-4089-A1EA-5645E8C4F366}.Release|x64.ActiveCfg = Release|x64
|
||||
{5E4AA504-9B77-4907-A93F-982FCED077EC}.Debug|Win32.ActiveCfg = Debug|x64
|
||||
{5E4AA504-9B77-4907-A93F-982FCED077EC}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{5E4AA504-9B77-4907-A93F-982FCED077EC}.Release|Win32.ActiveCfg = Release|x64
|
||||
{3D01F893-119C-4089-A1EA-5645E8C4F366}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{3D01F893-119C-4089-A1EA-5645E8C4F366}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{5E4AA504-9B77-4907-A93F-982FCED077EC}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{5E4AA504-9B77-4907-A93F-982FCED077EC}.Release|x64.ActiveCfg = Release|x64
|
||||
{51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Debug|Win32.ActiveCfg = Debug|x64
|
||||
{51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Release|Win32.ActiveCfg = Release|x64
|
||||
{5E4AA504-9B77-4907-A93F-982FCED077EC}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{5E4AA504-9B77-4907-A93F-982FCED077EC}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Release|x64.ActiveCfg = Release|x64
|
||||
{6CA24950-D41E-4264-94FE-92AFB04CFD2F}.Debug|Win32.ActiveCfg = Debug|x64
|
||||
{6CA24950-D41E-4264-94FE-92AFB04CFD2F}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{6CA24950-D41E-4264-94FE-92AFB04CFD2F}.Release|Win32.ActiveCfg = Release|x64
|
||||
{51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{6CA24950-D41E-4264-94FE-92AFB04CFD2F}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{6CA24950-D41E-4264-94FE-92AFB04CFD2F}.Release|x64.ActiveCfg = Release|x64
|
||||
{43CB4B7D-F204-4C79-8585-556B95C8C120}.Debug|Win32.ActiveCfg = Debug|x64
|
||||
{43CB4B7D-F204-4C79-8585-556B95C8C120}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{43CB4B7D-F204-4C79-8585-556B95C8C120}.Release|Win32.ActiveCfg = Release|x64
|
||||
{6CA24950-D41E-4264-94FE-92AFB04CFD2F}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{6CA24950-D41E-4264-94FE-92AFB04CFD2F}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{43CB4B7D-F204-4C79-8585-556B95C8C120}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{43CB4B7D-F204-4C79-8585-556B95C8C120}.Release|x64.ActiveCfg = Release|x64
|
||||
{1BE624B2-5FEA-4DA6-A7FA-9A1340E3361F}.Debug|Win32.ActiveCfg = Debug|x64
|
||||
{1BE624B2-5FEA-4DA6-A7FA-9A1340E3361F}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{1BE624B2-5FEA-4DA6-A7FA-9A1340E3361F}.Release|Win32.ActiveCfg = Release|x64
|
||||
{43CB4B7D-F204-4C79-8585-556B95C8C120}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{43CB4B7D-F204-4C79-8585-556B95C8C120}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{1BE624B2-5FEA-4DA6-A7FA-9A1340E3361F}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{1BE624B2-5FEA-4DA6-A7FA-9A1340E3361F}.Release|x64.ActiveCfg = Release|x64
|
||||
{5811A327-3992-4365-95CC-47CB0F9532A5}.Debug|Win32.ActiveCfg = Debug|x64
|
||||
{5811A327-3992-4365-95CC-47CB0F9532A5}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{5811A327-3992-4365-95CC-47CB0F9532A5}.Release|Win32.ActiveCfg = Release|x64
|
||||
{1BE624B2-5FEA-4DA6-A7FA-9A1340E3361F}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{1BE624B2-5FEA-4DA6-A7FA-9A1340E3361F}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{5811A327-3992-4365-95CC-47CB0F9532A5}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{5811A327-3992-4365-95CC-47CB0F9532A5}.Release|x64.ActiveCfg = Release|x64
|
||||
{58ED6FD2-E37D-4364-9428-6314426FFCC2}.Debug|Win32.ActiveCfg = Debug|x64
|
||||
{58ED6FD2-E37D-4364-9428-6314426FFCC2}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{58ED6FD2-E37D-4364-9428-6314426FFCC2}.Release|Win32.ActiveCfg = Release|x64
|
||||
{5811A327-3992-4365-95CC-47CB0F9532A5}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{5811A327-3992-4365-95CC-47CB0F9532A5}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{58ED6FD2-E37D-4364-9428-6314426FFCC2}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{58ED6FD2-E37D-4364-9428-6314426FFCC2}.Release|x64.ActiveCfg = Release|x64
|
||||
{1B807FDE-B444-4AD4-8A2A-814D52124585}.Debug|Win32.ActiveCfg = Debug|x64
|
||||
{1B807FDE-B444-4AD4-8A2A-814D52124585}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{1B807FDE-B444-4AD4-8A2A-814D52124585}.Release|Win32.ActiveCfg = Release|x64
|
||||
{58ED6FD2-E37D-4364-9428-6314426FFCC2}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{58ED6FD2-E37D-4364-9428-6314426FFCC2}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{1B807FDE-B444-4AD4-8A2A-814D52124585}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{1B807FDE-B444-4AD4-8A2A-814D52124585}.Release|x64.ActiveCfg = Release|x64
|
||||
{DF571850-CFB2-4F13-9F15-DDA6B9D0210E}.Debug|Win32.ActiveCfg = Debug|x64
|
||||
{DF571850-CFB2-4F13-9F15-DDA6B9D0210E}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{DF571850-CFB2-4F13-9F15-DDA6B9D0210E}.Release|Win32.ActiveCfg = Release|x64
|
||||
{1B807FDE-B444-4AD4-8A2A-814D52124585}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{1B807FDE-B444-4AD4-8A2A-814D52124585}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{DF571850-CFB2-4F13-9F15-DDA6B9D0210E}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{DF571850-CFB2-4F13-9F15-DDA6B9D0210E}.Release|x64.ActiveCfg = Release|x64
|
||||
{09A387AF-18EA-40EB-AD27-9DF346740987}.Debug|Win32.ActiveCfg = Debug|x64
|
||||
{09A387AF-18EA-40EB-AD27-9DF346740987}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{09A387AF-18EA-40EB-AD27-9DF346740987}.Release|Win32.ActiveCfg = Release|x64
|
||||
{DF571850-CFB2-4F13-9F15-DDA6B9D0210E}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{DF571850-CFB2-4F13-9F15-DDA6B9D0210E}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{09A387AF-18EA-40EB-AD27-9DF346740987}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{09A387AF-18EA-40EB-AD27-9DF346740987}.Release|x64.ActiveCfg = Release|x64
|
||||
{09A387AF-18EA-40EB-AD27-9DF346740987}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{09A387AF-18EA-40EB-AD27-9DF346740987}.Debug|x64.ActiveCfg = Debug|x64
|
||||
EndGlobalSection
|
||||
GlobalSection(SolutionProperties) = preSolution
|
||||
HideSolutionNode = FALSE
|
||||
|
126
mpn/x86_64w/skylake/avx/add_err1_n.asm
Normal file
126
mpn/x86_64w/skylake/avx/add_err1_n.asm
Normal file
@ -0,0 +1,126 @@
|
||||
; AMD64 mpn_add_err1_n
|
||||
; Copyright 2017 Alexander Kruppa
|
||||
; This file is part of the MPIR Library.
|
||||
; The MPIR Library is free software; you can redistribute it and/or modify
|
||||
; it under the terms of the GNU Lesser General Public License as published
|
||||
; by the Free Software Foundation; either version 2.1 of the License, or (at
|
||||
; your option) any later version.
|
||||
; The MPIR Library is distributed in the hope that it will be useful, but
|
||||
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||||
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||||
; License for more details.
|
||||
; You should have received a copy of the GNU Lesser General Public License
|
||||
; along with the MPIR Library; see the file COPYING.LIB. If not, write
|
||||
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
|
||||
; Boston, MA 02110-1301, USA.
|
||||
|
||||
;  (rdi,r9) = (rsi,r9)+(rdx,r9)+CyIn
;  rax = carry
;  (rcx,2) = rev(r8,r9) \dot (carry,r9) where carry is the sequence
;  of carries from the addition of (rsi,r9)+(rdx,r9)
|
||||
|
||||
; mp_limb_t mpn_add_err1_n(mp_ptr, mp_ptr, mp_ptr, mp_ptr, mp_ptr, mp_size_t, mp_limb_t)
|
||||
; mp_limb_t mpn_sub_err1_n(mp_ptr, mp_ptr, mp_ptr, mp_ptr, mp_ptr, mp_size_t, mp_limb_t)
|
||||
; rax rdi rsi rdx rcx r8 r9 8(rsp)
|
||||
; rax rcx rdx r8 r9 [rsp+40] [rsp+48] [rsp+56]
|
||||
|
||||
%include 'yasm_mac.inc'
|
||||
|
||||
; Register roles. The names follow the System V (Linux) layout; the
; procedure entry code moves the Win64 arguments into these registers.
%define SumP      rdi               ; destination limb pointer
%define Inp1P     rsi               ; first source operand pointer
%define Inp2P     rdx               ; second source operand pointer
%define EP        r11               ; where the two error limbs are stored
%define SizeRest  rcx               ; limbs left over after the 8-limb groups
%define YP        r8                ; Y operand pointer, read from the top limb down
%define Size      r9                ; limb count, then number of 8-limb groups
%define LIMB0     rax               ; scratch limb; also holds the carry-in on entry
%define E0        r12               ; error accumulator, low limb
%define E1        r13               ; error accumulator, high limb
%define Zero      r14               ; constant 0
%define Dummy     rbx               ; only incremented, so that OF is clear before adox

; Registers used here that the Win64 ABI requires the callee to preserve.
%define reg_save_list rsi, rdi, rbx, r12, r13, r14

    align   32
    BITS 64
|
||||
|
||||
; DO_LIMB k -- handle limb k of the current group.
;   Sum[k] = Inp1[k] + Inp2[k] + CF, and CF carries on to the next limb.
;   If that addition produced a carry, the limb k positions below YP is
;   added into the two-limb accumulator E1:E0, otherwise 0 is added.
; adc drives the CF chain and adox drives the OF chain, so the error
; accumulation does not disturb the carry of the main addition.
%macro DO_LIMB 1
    mov     LIMB0, [Inp1P + 8*%1]
    adc     LIMB0, [Inp2P + 8*%1]
    mov     [SumP + 8*%1], LIMB0
    mov     LIMB0, [YP - 8*%1]
    cmovnc  LIMB0, Zero             ; no carry from this limb: contribute 0
    inc     Dummy                   ; keeps CF; OF = 0 unless Dummy steps to 0x8000...0
    adox    E0, LIMB0
    adox    E1, Zero                ; fold the overflow of E0 into E1
%endmacro
|
||||
|
||||
;-----------------------------------------------------------------------
; mp_limb_t mpn_add_err1_n(mp_ptr, mp_ptr, mp_ptr, mp_ptr, mp_ptr,
;                          mp_size_t, mp_limb_t)
; ABI:  Win64
; In:   rcx = SumP, rdx = Inp1P, r8 = Inp2P, r9 = EP,
;       [rsp+40] = YP, [rsp+48] = Size in limbs, [rsp+56] = carry-in
;       (only bit 0 of the carry-in is used)
; Out:  rax = carry out of the last limb added
;       [EP], [EP+8] = E0, E1: sum of the Y limbs selected by the carries
; The registers in reg_save_list are saved/restored by FRAME_PROC/END_PROC.
;-----------------------------------------------------------------------
FRAME_PROC mpn_add_err1_n, 0, reg_save_list
    ; move the Win64 arguments into the register layout named above
    mov     rdi, rcx
    mov     rsi, rdx
    mov     rdx, r8
    mov     r11, r9
    mov     r8, [rsp + stack_use + 40]
    mov     r9, [rsp + stack_use + 48]
    mov     LIMB0, [rsp + stack_use + 56]

    mov     SizeRest, Size
    lea     YP, [YP + Size*8 - 8]   ; YP -> top limb of Y (index Size-1)
    and     SizeRest, 7             ; limbs not covered by the 8-limb groups
    xor     Zero, Zero
    mov     E0, Zero
    mov     E1, Zero
    mov     Dummy, Zero             ; start from 0 so "inc Dummy" cannot set OF
                                    ; (an inherited rbx near 0x7FFF...F could)
    shr     Size, 3                 ; number of 8-limb groups; ZF if there is none
    bt      LIMB0, 0                ; CF = carry-in; the jz still tests shr's ZF
    jz      .testrest

    align   16
.loop:
    DO_LIMB 0
    DO_LIMB 1
    DO_LIMB 2
    DO_LIMB 3
    DO_LIMB 4
    DO_LIMB 5
    DO_LIMB 6
    DO_LIMB 7

    ; lea and dec both leave CF alone, so the carry chain survives the loop
    lea     Inp1P, [Inp1P+64]
    lea     Inp2P, [Inp2P+64]
    lea     SumP, [SumP+64]
    lea     YP, [YP-64]

    dec     Size
    jne     .loop

.testrest:
    inc     SizeRest                ; inc/dec pair: ZF from SizeRest, CF untouched
    dec     SizeRest
    jz      .exit

    ; 1..7 leftover limbs, handled in runs of up to four
.rest:
    DO_LIMB 0
    dec     SizeRest
    jz      .exit
    DO_LIMB 1
    dec     SizeRest
    jz      .exit
    DO_LIMB 2
    dec     SizeRest
    jz      .exit
    DO_LIMB 3
    dec     SizeRest
    jz      .exit
    lea     Inp1P, [Inp1P+32]
    lea     Inp2P, [Inp2P+32]
    lea     SumP, [SumP+32]
    lea     YP, [YP-32]
    jmp     .rest

.exit:
    mov     rax, Zero               ; mov keeps CF, unlike xor
    setc    al                      ; rax = final carry
    mov     [EP], E0
    mov     [EP+8], E1
END_PROC reg_save_list
|
196
mpn/x86_64w/skylake/avx/mul_1.asm
Normal file
196
mpn/x86_64w/skylake/avx/mul_1.asm
Normal file
@ -0,0 +1,196 @@
|
||||
; AMD64 mpn_mul_1
|
||||
; Copyright 2016 Jens Nurmann and Alexander Kruppa
|
||||
; This file is part of the MPIR Library.
|
||||
; The MPIR Library is free software; you can redistribute it and/or modify
|
||||
; it under the terms of the GNU Lesser General Public License as published
|
||||
; by the Free Software Foundation; either version 2.1 of the License, or (at
|
||||
; your option) any later version.
|
||||
; The MPIR Library is distributed in the hope that it will be useful, but
|
||||
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||||
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||||
; License for more details.
|
||||
; You should have received a copy of the GNU Lesser General Public License
|
||||
; along with the MPIR Library; see the file COPYING.LIB. If not, write
|
||||
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
|
||||
; Boston, MA 02110-1301, USA.
|
||||
|
||||
; (rdi,rdx) = rcx*(rsi,rdx)
|
||||
; rax = high word of product
|
||||
|
||||
; mp_limb_t mpn_mul_1(mp_ptr, mp_ptr, mp_size_t, mp_limb_t)
|
||||
; mp_limb_t mpn_mul_1c(mp_ptr, mp_ptr, mp_size_t, mp_limb_t, mp_limb_t)
|
||||
; rax rdi rsi rdx rcx r8
|
||||
; rax rcx rdx r8 r9 [rsp+40]
|
||||
|
||||
|
||||
%include 'yasm_mac.inc'
|
||||
|
||||
BITS 64
|
||||
|
||||
; the following register allocation scheme is the Linux (System V) one; the Windows x64 arguments are moved into these registers on entry
|
||||
|
||||
; Operand registers (loaded from the Win64 argument registers on entry).
%define RP      rdi                 ; result pointer
%define S1P     rsi                 ; source limb pointer
%define Size    rcx                 ; limb count / remaining-limb counter
%define S2      rdx                 ; multiplier limb; rdx is mulx's implicit operand

; Four low/high product pairs kept in flight by the unrolled code.
%define MulLo0  r8
%define MulHi0  r9
%define MulLo1  r10
%define MulHi1  r11
%define MulLo2  r12
%define MulHi2  r13
%define MulLo3  r14
%define MulHi3  rbx                 ; also carries the high limb between quads

; Registers used here that the Win64 ABI requires the callee to preserve.
%define reg_save_list rsi, rdi, rbx, r12, r13, r14

    align   32
|
||||
;-----------------------------------------------------------------------
; mp_limb_t mpn_mul_1(mp_ptr, mp_ptr, mp_size_t, mp_limb_t)
; ABI:  Win64
; In:   rcx = RP, rdx = S1P, r8 = Size in limbs, r9 = S2
; Out:  Size limbs stored at RP = low limbs of S2 * (S1P, Size)
;       rax = high limb of the product
; The registers in reg_save_list are saved/restored by FRAME_PROC/END_PROC.
;-----------------------------------------------------------------------
FRAME_PROC mpn_mul_1, 0, reg_save_list
    ; move the Win64 arguments into the register layout named above
    mov     rdi, rcx
    mov     rsi, rdx
    mov     rcx, r8
    mov     rdx, r9

    xor     MulHi3, MulHi3          ; no high limb carried in yet

    sub     Size, 4
    jc      .Post                   ; separate handling of remaining max. 3 limb =>

    ; prepare a quadlimb for main-loop entry
    mulx    MulHi0, MulLo0, [S1P]
    mulx    MulHi1, MulLo1, [S1P+8]
    mulx    MulHi2, MulLo2, [S1P+16]
    mulx    MulHi3, MulLo3, [S1P+24]
    add     S1P, 32
    add     MulLo1, MulHi0
    adc     MulLo2, MulHi1
    adc     MulLo3, MulHi2
    adc     MulHi3, 0

    jmp     .Check                  ; enter main loop =>

    ; main loop (unloaded operands)
    ; - 1.25      cycles per limb in L1D$
    ; - 1.25      cycles per limb in L2D$
    ; - 1.60-1.72 cycles per limb in L3D$
    ; Each pass stores the pending quad, then multiplies eight more limbs:
    ; the first four are stored at once, the last four stay pending.
    align   32
.Loop:

    mov     [RP], MulLo0
    mov     [RP+8], MulLo1
    mov     [RP+16], MulLo2
    mov     [RP+24], MulLo3
    mulx    MulHi0, MulLo0, [S1P]
    mulx    MulHi1, MulLo1, [S1P+8]
    mulx    MulHi2, MulLo2, [S1P+16]
    add     MulLo0, MulHi3
    mov     [RP+32], MulLo0
    adc     MulLo1, MulHi0
    mov     [RP+40], MulLo1
    adc     MulLo2, MulHi1
    mov     [RP+48], MulLo2
    mulx    MulHi3, MulLo3, [S1P+24]
    mulx    MulHi0, MulLo0, [S1P+32]
    mulx    MulHi1, MulLo1, [S1P+40]
    adc     MulLo3, MulHi2          ; no carry-out here
    adc     MulLo0, MulHi3
    adc     MulLo1, MulHi0
    mulx    MulHi2, MulLo2, [S1P+48]
    adc     MulLo2, MulHi1
    mov     [RP+56], MulLo3
    mulx    MulHi3, MulLo3, [S1P+56]
    adc     MulLo3, MulHi2
    adc     MulHi3, 0

    add     S1P, 64
    add     RP, 64

.Check:

    sub     Size, 8
    jnc     .Loop                   ; at least 8 more limbs beyond the pending quad

    ; core loop roll-out 8 can generate dangling quad-limb
    ; (Size is now in -8..-1; bit 2 is set exactly when 4..7 limbs remain)
    test    Size, 4
    je      .Store                  ; no dangling quad-limb =>

    mov     [RP], MulLo0
    mulx    MulHi0, MulLo0, [S1P]
    mov     [RP+8], MulLo1
    mulx    MulHi1, MulLo1, [S1P+8]
    mov     [RP+16], MulLo2
    mulx    MulHi2, MulLo2, [S1P+16]
    add     MulLo0, MulHi3
    mov     [RP+24], MulLo3
    mulx    MulHi3, MulLo3, [S1P+24]
    adc     MulLo1, MulHi0
    adc     MulLo2, MulHi1
    adc     MulLo3, MulHi2
    adc     MulHi3, 0

    add     S1P, 32
    add     RP, 32

    ; store remaining quad-limb from main loop
.Store:
    mov     [RP], MulLo0
    mov     [RP+8], MulLo1
    mov     [RP+16], MulLo2
    mov     [RP+24], MulLo3
    add     RP, 32

    ; handle final 0-3 single limb of S1P
.Post:

    and     Size, 3
    je      .Post0

    cmp     Size, 2
    ja      .Post3                  ; ja leaves the flags intact for the je below
    je      .Post2

.Post1:

    mulx    MulHi0, MulLo0, [S1P]
    add     MulLo0, MulHi3
    adc     MulHi0, 0
    mov     [RP], MulLo0
    mov     rax, MulHi0
    jmp     .Exit

.Post2:

    mulx    MulHi0, MulLo0, [S1P]
    mulx    MulHi1, MulLo1, [S1P+8]
    add     MulLo0, MulHi3
    adc     MulLo1, MulHi0
    adc     MulHi1, 0
    mov     [RP], MulLo0
    mov     [RP+8], MulLo1
    mov     rax, MulHi1
    jmp     .Exit

.Post3:

    mulx    MulHi0, MulLo0, [S1P]
    mulx    MulHi1, MulLo1, [S1P+8]
    mulx    MulHi2, MulLo2, [S1P+16]
    add     MulLo0, MulHi3
    adc     MulLo1, MulHi0
    adc     MulLo2, MulHi1
    adc     MulHi2, 0
    mov     [RP], MulLo0
    mov     [RP+8], MulLo1
    mov     [RP+16], MulLo2
    mov     rax, MulHi2
    jmp     .Exit

.Post0:

    mov     rax, MulHi3             ; nothing left: pending high limb is the result

.Exit:
END_PROC reg_save_list
|
125
mpn/x86_64w/skylake/avx/sub_err1_n.asm
Normal file
125
mpn/x86_64w/skylake/avx/sub_err1_n.asm
Normal file
@ -0,0 +1,125 @@
|
||||
;  AMD64 mpn_sub_err1_n
|
||||
; Copyright 2017 Alexander Kruppa
|
||||
; This file is part of the MPIR Library.
|
||||
; The MPIR Library is free software; you can redistribute it and/or modify
|
||||
; it under the terms of the GNU Lesser General Public License as published
|
||||
; by the Free Software Foundation; either version 2.1 of the License, or (at
|
||||
; your option) any later version.
|
||||
; The MPIR Library is distributed in the hope that it will be useful, but
|
||||
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||||
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||||
; License for more details.
|
||||
; You should have received a copy of the GNU Lesser General Public License
|
||||
; along with the MPIR Library; see the file COPYING.LIB. If not, write
|
||||
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
|
||||
; Boston, MA 02110-1301, USA.
|
||||
|
||||
;  (rdi,r9) = (rsi,r9)-(rdx,r9)-BwIn
;  rax = borrow
;  (rcx,2) = rev(r8,r9) \dot (borrow,r9) where borrow is the sequence
;  of borrows from the subtraction of (rsi,r9)-(rdx,r9)
|
||||
|
||||
; mp_limb_t mpn_add_err1_n(mp_ptr, mp_ptr, mp_ptr, mp_ptr, mp_ptr, mp_size_t, mp_limb_t)
|
||||
; mp_limb_t mpn_sub_err1_n(mp_ptr, mp_ptr, mp_ptr, mp_ptr, mp_ptr, mp_size_t, mp_limb_t)
|
||||
; rax rdi rsi rdx rcx r8 r9 8(rsp)
|
||||
; rax rcx rdx r8 r9 [rsp+40] [rsp+48] [rsp+56]
|
||||
|
||||
%include 'yasm_mac.inc'
|
||||
|
||||
; Register roles. The names follow the System V (Linux) layout; the
; procedure entry code moves the Win64 arguments into these registers.
%define SumP      rdi               ; destination limb pointer
%define Inp1P     rsi               ; minuend limb pointer
%define Inp2P     rdx               ; subtrahend limb pointer
%define EP        r11               ; where the two error limbs are stored
%define SizeRest  rcx               ; limbs left over after the 8-limb groups
%define YP        r8                ; Y operand pointer, read from the top limb down
%define Size      r9                ; limb count, then number of 8-limb groups
%define LIMB0     rax               ; scratch limb; also holds the borrow-in on entry
%define E0        r12               ; error accumulator, low limb
%define E1        r13               ; error accumulator, high limb
%define Zero      r14               ; constant 0
%define Dummy     rbx               ; only incremented, so that OF is clear before adox

; Registers used here that the Win64 ABI requires the callee to preserve.
%define reg_save_list rsi, rdi, rbx, r12, r13, r14

    align   32
    BITS 64
|
||||
|
||||
; DO_LIMB k -- handle limb k of the current group.
;   Sum[k] = Inp1[k] - Inp2[k] - CF, and CF (borrow) carries on to the
;   next limb. If that subtraction borrowed, the limb k positions below
;   YP is added into the two-limb accumulator E1:E0, otherwise 0 is added.
; sbb drives the CF chain and adox drives the OF chain, so the error
; accumulation does not disturb the borrow of the main subtraction.
%macro DO_LIMB 1
    mov     LIMB0, [Inp1P + 8*%1]
    sbb     LIMB0, [Inp2P + 8*%1]
    mov     [SumP + 8*%1], LIMB0
    mov     LIMB0, [YP - 8*%1]
    cmovnc  LIMB0, Zero             ; no borrow from this limb: contribute 0
    inc     Dummy                   ; keeps CF; OF = 0 unless Dummy steps to 0x8000...0
    adox    E0, LIMB0
    adox    E1, Zero                ; fold the overflow of E0 into E1
%endmacro
|
||||
|
||||
;-----------------------------------------------------------------------
; mp_limb_t mpn_sub_err1_n(mp_ptr, mp_ptr, mp_ptr, mp_ptr, mp_ptr,
;                          mp_size_t, mp_limb_t)
; ABI:  Win64
; In:   rcx = SumP, rdx = Inp1P, r8 = Inp2P, r9 = EP,
;       [rsp+40] = YP, [rsp+48] = Size in limbs, [rsp+56] = borrow-in
;       (only bit 0 of the borrow-in is used)
; Out:  rax = borrow out of the last limb subtracted
;       [EP], [EP+8] = E0, E1: sum of the Y limbs selected by the borrows
; The registers in reg_save_list are saved/restored by FRAME_PROC/END_PROC.
;-----------------------------------------------------------------------
FRAME_PROC mpn_sub_err1_n, 0, reg_save_list
    ; move the Win64 arguments into the register layout named above
    mov     rdi, rcx
    mov     rsi, rdx
    mov     rdx, r8
    mov     r11, r9
    mov     r8, [rsp + stack_use + 40]
    mov     r9, [rsp + stack_use + 48]
    mov     LIMB0, [rsp + stack_use + 56]

    mov     SizeRest, Size
    lea     YP, [YP + Size*8 - 8]   ; YP -> top limb of Y (index Size-1)
    and     SizeRest, 7             ; limbs not covered by the 8-limb groups
    xor     Zero, Zero
    mov     E0, Zero
    mov     E1, Zero
    mov     Dummy, Zero             ; start from 0 so "inc Dummy" cannot set OF
                                    ; (an inherited rbx near 0x7FFF...F could)
    shr     Size, 3                 ; number of 8-limb groups; ZF if there is none
    bt      LIMB0, 0                ; CF = borrow-in; the jz still tests shr's ZF
    jz      .testrest

    align   16
.loop:
    DO_LIMB 0
    DO_LIMB 1
    DO_LIMB 2
    DO_LIMB 3
    DO_LIMB 4
    DO_LIMB 5
    DO_LIMB 6
    DO_LIMB 7

    ; lea and dec both leave CF alone, so the borrow chain survives the loop
    lea     Inp1P, [Inp1P+64]
    lea     Inp2P, [Inp2P+64]
    lea     SumP, [SumP+64]
    lea     YP, [YP-64]

    dec     Size
    jne     .loop

.testrest:
    inc     SizeRest                ; inc/dec pair: ZF from SizeRest, CF untouched
    dec     SizeRest
    jz      .exit

    ; 1..7 leftover limbs, handled in runs of up to four
.rest:
    DO_LIMB 0
    dec     SizeRest
    jz      .exit
    DO_LIMB 1
    dec     SizeRest
    jz      .exit
    DO_LIMB 2
    dec     SizeRest
    jz      .exit
    DO_LIMB 3
    dec     SizeRest
    jz      .exit
    lea     Inp1P, [Inp1P+32]
    lea     Inp2P, [Inp2P+32]
    lea     SumP, [SumP+32]
    lea     YP, [YP-32]
    jmp     .rest

.exit:
    mov     rax, Zero               ; mov keeps CF, unlike xor
    setc    al                      ; rax = final borrow
    mov     [EP], E0
    mov     [EP+8], E1
END_PROC reg_save_list
|
Loading…
Reference in New Issue
Block a user