diff --git a/build.vc14/dll_mpir_haswell/dll_mpir_haswell.vcxproj b/build.vc14/dll_mpir_haswell/dll_mpir_haswell.vcxproj
index e5cb0750..9d78cfbd 100644
--- a/build.vc14/dll_mpir_haswell/dll_mpir_haswell.vcxproj
+++ b/build.vc14/dll_mpir_haswell/dll_mpir_haswell.vcxproj
@@ -1,4 +1,4 @@
-
+
@@ -51,7 +51,7 @@ prebuild haswell x64 14
- DLL;USE_WIN64
+ DLL
..\..\mpn\x86_64w\
true
$(IntDir)mpn\
@@ -62,6 +62,7 @@ prebuild haswell x64 14
+
cd ..\..\build.vc
postbuild "$(TargetPath)" 14
@@ -75,7 +76,7 @@ prebuild haswell x64 14
- DLL;USE_WIN64
+ DLL
..\..\mpn\x86_64w\
true
$(IntDir)mpn\
@@ -86,6 +87,7 @@ prebuild haswell x64 14
+
cd ..\..\build.vc
postbuild "$(TargetPath)" 14
@@ -464,7 +466,6 @@ postbuild "$(TargetPath)" 14
-
@@ -474,8 +475,6 @@ postbuild "$(TargetPath)" 14
-
-
@@ -526,7 +525,6 @@ postbuild "$(TargetPath)" 14
-
@@ -567,7 +565,6 @@ postbuild "$(TargetPath)" 14
-
@@ -610,17 +607,22 @@ postbuild "$(TargetPath)" 14
+
+
+
+
+
@@ -631,8 +633,8 @@ postbuild "$(TargetPath)" 14
-
-
+
+
-
-
\ No newline at end of file
+
+
diff --git a/build.vc14/dll_mpir_haswell/dll_mpir_haswell.vcxproj.filters b/build.vc14/dll_mpir_haswell/dll_mpir_haswell.vcxproj.filters
index 47b0f2ff..63b6f7c4 100644
--- a/build.vc14/dll_mpir_haswell/dll_mpir_haswell.vcxproj.filters
+++ b/build.vc14/dll_mpir_haswell/dll_mpir_haswell.vcxproj.filters
@@ -1114,9 +1114,6 @@
Source Files\mpn
-
- Source Files\mpn
-
Source Files\mpn
@@ -1144,12 +1141,6 @@
Source Files\mpn
-
- Source Files\mpn
-
-
- Source Files\mpn
-
Source Files\mpn
@@ -1300,9 +1291,6 @@
Source Files\mpn
-
- Source Files\mpn
-
Source Files\mpn
@@ -1423,9 +1411,6 @@
Source Files\mpn
-
- Source Files\mpn
-
Source Files\mpn
@@ -1548,9 +1533,18 @@
Source Files\mpn\yasm
+
+ Source Files\mpn\yasm
+
Source Files\mpn\yasm
+
+ Source Files\mpn\yasm
+
+
+ Source Files\mpn\yasm
+
Source Files\mpn\yasm
@@ -1569,6 +1563,9 @@
Source Files\mpn\yasm
+
+ Source Files\mpn\yasm
+
Source Files\mpn\yasm
@@ -1581,6 +1578,9 @@
Source Files\mpn\yasm
+
+ Source Files\mpn\yasm
+
Source Files\mpn\yasm
diff --git a/build.vc14/lib_mpir_haswell/lib_mpir_haswell.vcxproj b/build.vc14/lib_mpir_haswell/lib_mpir_haswell.vcxproj
index c5c09002..fbd0be25 100644
--- a/build.vc14/lib_mpir_haswell/lib_mpir_haswell.vcxproj
+++ b/build.vc14/lib_mpir_haswell/lib_mpir_haswell.vcxproj
@@ -51,7 +51,8 @@ prebuild haswell x64 14
- USE_WIN64
+
+
..\..\mpn\x86_64w\
true
$(IntDir)mpn\
@@ -73,7 +74,8 @@ prebuild haswell x64 14
- USE_WIN64
+
+
..\..\mpn\x86_64w\
true
$(IntDir)mpn\
@@ -449,7 +451,6 @@ postbuild "$(TargetPath)" 14
-
@@ -459,8 +460,6 @@ postbuild "$(TargetPath)" 14
-
-
@@ -511,7 +510,6 @@ postbuild "$(TargetPath)" 14
-
@@ -552,7 +550,6 @@ postbuild "$(TargetPath)" 14
-
@@ -595,13 +592,29 @@ postbuild "$(TargetPath)" 14
+
+ USE_WIN64
+ USE_WIN64
+
+
+ USE_WIN64
+ USE_WIN64
+
+
+ USE_WIN64
+ USE_WIN64
+
+
+ USE_WIN64
+ USE_WIN64
+
USE_WIN64
USE_WIN64
@@ -609,11 +622,13 @@ postbuild "$(TargetPath)" 14
+
+ USE_WIN64
+ USE_WIN64
+
-
-
-
-
+ USE_WIN64
+ USE_WIN64
diff --git a/build.vc14/lib_mpir_haswell/lib_mpir_haswell.vcxproj.filters b/build.vc14/lib_mpir_haswell/lib_mpir_haswell.vcxproj.filters
index 31860663..fd203d93 100644
--- a/build.vc14/lib_mpir_haswell/lib_mpir_haswell.vcxproj.filters
+++ b/build.vc14/lib_mpir_haswell/lib_mpir_haswell.vcxproj.filters
@@ -1080,9 +1080,6 @@
Source Files\mpn
-
- Source Files\mpn
-
Source Files\mpn
@@ -1110,12 +1107,6 @@
Source Files\mpn
-
- Source Files\mpn
-
-
- Source Files\mpn
-
Source Files\mpn
@@ -1266,9 +1257,6 @@
Source Files\mpn
-
- Source Files\mpn
-
Source Files\mpn
@@ -1389,9 +1377,6 @@
Source Files\mpn
-
- Source Files\mpn
-
Source Files\mpn
@@ -1514,9 +1499,18 @@
Source Files\mpn\yasm
+
+ Source Files\mpn\yasm
+
Source Files\mpn\yasm
+
+ Source Files\mpn\yasm
+
+
+ Source Files\mpn\yasm
+
Source Files\mpn\yasm
@@ -1535,6 +1529,9 @@
Source Files\mpn\yasm
+
+ Source Files\mpn\yasm
+
Source Files\mpn\yasm
@@ -1547,6 +1544,9 @@
Source Files\mpn\yasm
+
+ Source Files\mpn\yasm
+
Source Files\mpn\yasm
diff --git a/build.vc14/mpir-tests/cxx.ops/cxx.ops.vcxproj b/build.vc14/mpir-tests/cxx.ops/cxx.ops.vcxproj
index 2c73cdcd..807b2a1d 100644
--- a/build.vc14/mpir-tests/cxx.ops/cxx.ops.vcxproj
+++ b/build.vc14/mpir-tests/cxx.ops/cxx.ops.vcxproj
@@ -168,7 +168,7 @@ check_config $(Platform) $(Configuration) 14
MachineX64
- No
+ true
diff --git a/build.vc15/cdata/mpn/x86_64w/haswell/cfg.h b/build.vc15/cdata/mpn/x86_64w/haswell/cfg.h
index 122c4baf..50783988 100644
--- a/build.vc15/cdata/mpn/x86_64w/haswell/cfg.h
+++ b/build.vc15/cdata/mpn/x86_64w/haswell/cfg.h
@@ -8,12 +8,14 @@ mpn_divexact_byfobm1
mpn_divrem_2
mpn_divrem_euclidean_qr_1
mpn_divrem_euclidean_qr_2
+mpn_lshift
mpn_lshift1
mpn_modexact_1_odd
mpn_modexact_1c_odd
mpn_mul_2
mpn_mulmid_basecase
mpn_preinv_divrem_1
+mpn_rshift
mpn_rshift1
mpn_sqr_basecase
mpn_sub_err1_n
diff --git a/build.vc15/dll_mpir_haswell/dll_mpir_haswell.vcxproj b/build.vc15/dll_mpir_haswell/dll_mpir_haswell.vcxproj
index f79efb04..8a7f1c9b 100644
--- a/build.vc15/dll_mpir_haswell/dll_mpir_haswell.vcxproj
+++ b/build.vc15/dll_mpir_haswell/dll_mpir_haswell.vcxproj
@@ -1,4 +1,4 @@
-
+
@@ -51,7 +51,7 @@ prebuild haswell x64 15
- DLL;USE_WIN64
+ DLL
..\..\mpn\x86_64w\
true
$(IntDir)mpn\
@@ -62,6 +62,7 @@ prebuild haswell x64 15
+
cd ..\..\build.vc
postbuild "$(TargetPath)" 15
@@ -75,7 +76,7 @@ prebuild haswell x64 15
- DLL;USE_WIN64
+ DLL
..\..\mpn\x86_64w\
true
$(IntDir)mpn\
@@ -86,6 +87,7 @@ prebuild haswell x64 15
+
cd ..\..\build.vc
postbuild "$(TargetPath)" 15
@@ -464,7 +466,6 @@ postbuild "$(TargetPath)" 15
-
@@ -526,7 +527,6 @@ postbuild "$(TargetPath)" 15
-
@@ -567,7 +567,6 @@ postbuild "$(TargetPath)" 15
-
@@ -610,6 +609,7 @@ postbuild "$(TargetPath)" 15
+
@@ -617,10 +617,12 @@ postbuild "$(TargetPath)" 15
+
+
@@ -631,8 +633,8 @@ postbuild "$(TargetPath)" 15
-
-
+
+
-
-
\ No newline at end of file
+
+
diff --git a/build.vc15/dll_mpir_haswell/dll_mpir_haswell.vcxproj.filters b/build.vc15/dll_mpir_haswell/dll_mpir_haswell.vcxproj.filters
index 47b0f2ff..1660d33c 100644
--- a/build.vc15/dll_mpir_haswell/dll_mpir_haswell.vcxproj.filters
+++ b/build.vc15/dll_mpir_haswell/dll_mpir_haswell.vcxproj.filters
@@ -1114,9 +1114,6 @@
Source Files\mpn
-
- Source Files\mpn
-
Source Files\mpn
@@ -1300,9 +1297,6 @@
Source Files\mpn
-
- Source Files\mpn
-
Source Files\mpn
@@ -1423,9 +1417,6 @@
Source Files\mpn
-
- Source Files\mpn
-
Source Files\mpn
@@ -1548,6 +1539,9 @@
Source Files\mpn\yasm
+
+ Source Files\mpn\yasm
+
Source Files\mpn\yasm
@@ -1569,6 +1563,9 @@
Source Files\mpn\yasm
+
+ Source Files\mpn\yasm
+
Source Files\mpn\yasm
@@ -1581,6 +1578,9 @@
Source Files\mpn\yasm
+
+ Source Files\mpn\yasm
+
Source Files\mpn\yasm
diff --git a/build.vc15/lib_mpir_haswell/lib_mpir_haswell.vcxproj b/build.vc15/lib_mpir_haswell/lib_mpir_haswell.vcxproj
index 69420a7e..561ef482 100644
--- a/build.vc15/lib_mpir_haswell/lib_mpir_haswell.vcxproj
+++ b/build.vc15/lib_mpir_haswell/lib_mpir_haswell.vcxproj
@@ -1,4 +1,4 @@
-
+
@@ -51,7 +51,7 @@ prebuild haswell x64 15
- USE_WIN64
+
..\..\mpn\x86_64w\
true
$(IntDir)mpn\
@@ -60,6 +60,7 @@ prebuild haswell x64 15
..\..\
NDEBUG;WIN32;_LIB;HAVE_CONFIG_H;_WIN64;%(PreprocessorDefinitions)
+
cd ..\..\build.vc
postbuild "$(TargetPath)" 15
@@ -73,7 +74,7 @@ prebuild haswell x64 15
- USE_WIN64
+
..\..\mpn\x86_64w\
true
$(IntDir)mpn\
@@ -82,6 +83,7 @@ prebuild haswell x64 15
..\..\
_DEBUG;WIN32;_LIB;HAVE_CONFIG_H;_WIN64;%(PreprocessorDefinitions)
+
cd ..\..\build.vc
postbuild "$(TargetPath)" 15
@@ -449,7 +451,6 @@ postbuild "$(TargetPath)" 15
-
@@ -511,7 +512,6 @@ postbuild "$(TargetPath)" 15
-
@@ -552,7 +552,6 @@ postbuild "$(TargetPath)" 15
-
@@ -595,6 +594,7 @@ postbuild "$(TargetPath)" 15
+
@@ -602,10 +602,12 @@ postbuild "$(TargetPath)" 15
+
+
@@ -616,8 +618,8 @@ postbuild "$(TargetPath)" 15
-
-
+
+
-
-
\ No newline at end of file
+
+
diff --git a/build.vc15/lib_mpir_haswell/lib_mpir_haswell.vcxproj.filters b/build.vc15/lib_mpir_haswell/lib_mpir_haswell.vcxproj.filters
index 31860663..21bf6302 100644
--- a/build.vc15/lib_mpir_haswell/lib_mpir_haswell.vcxproj.filters
+++ b/build.vc15/lib_mpir_haswell/lib_mpir_haswell.vcxproj.filters
@@ -1080,9 +1080,6 @@
Source Files\mpn
-
- Source Files\mpn
-
Source Files\mpn
@@ -1266,9 +1263,6 @@
Source Files\mpn
-
- Source Files\mpn
-
Source Files\mpn
@@ -1389,9 +1383,6 @@
Source Files\mpn
-
- Source Files\mpn
-
Source Files\mpn
@@ -1514,6 +1505,9 @@
Source Files\mpn\yasm
+
+ Source Files\mpn\yasm
+
Source Files\mpn\yasm
@@ -1535,6 +1529,9 @@
Source Files\mpn\yasm
+
+ Source Files\mpn\yasm
+
Source Files\mpn\yasm
@@ -1547,6 +1544,9 @@
Source Files\mpn\yasm
+
+ Source Files\mpn\yasm
+
Source Files\mpn\yasm
diff --git a/build.vc15/mpir.sln b/build.vc15/mpir.sln
index 3f54c2f5..8413ca83 100644
--- a/build.vc15/mpir.sln
+++ b/build.vc15/mpir.sln
@@ -1,6 +1,7 @@
+
Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
-VisualStudioVersion = 15.0.25914.0
+VisualStudioVersion = 15.0
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "lib_mpir_k8", "lib_mpir_k8\lib_mpir_k8.vcxproj", "{22D14BC4-ADC6-48C6-8BA1-4AC45799B945}"
EndProject
@@ -44,74 +45,74 @@ Global
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
- {22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Debug|Win32.ActiveCfg = Debug|x64
- {22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Debug|x64.ActiveCfg = Debug|x64
- {22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Release|Win32.ActiveCfg = Release|x64
+ {22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Release|Win32.ActiveCfg = Release|Win32
{22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Release|x64.ActiveCfg = Release|x64
- {F4418981-ABAA-4B7A-9538-3E0166149F28}.Debug|Win32.ActiveCfg = Debug|Win32
- {F4418981-ABAA-4B7A-9538-3E0166149F28}.Debug|x64.ActiveCfg = Debug|Win32
+ {22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Debug|Win32.ActiveCfg = Debug|Win32
+ {22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Debug|x64.ActiveCfg = Debug|x64
{F4418981-ABAA-4B7A-9538-3E0166149F28}.Release|Win32.ActiveCfg = Release|Win32
- {F4418981-ABAA-4B7A-9538-3E0166149F28}.Release|x64.ActiveCfg = Release|Win32
- {52C97475-88BE-437F-BC53-D3E1205EA08B}.Debug|Win32.ActiveCfg = Debug|Win32
- {52C97475-88BE-437F-BC53-D3E1205EA08B}.Debug|x64.ActiveCfg = Debug|Win32
+ {F4418981-ABAA-4B7A-9538-3E0166149F28}.Release|x64.ActiveCfg = Release|x64
+ {F4418981-ABAA-4B7A-9538-3E0166149F28}.Debug|Win32.ActiveCfg = Debug|Win32
+ {F4418981-ABAA-4B7A-9538-3E0166149F28}.Debug|x64.ActiveCfg = Debug|x64
{52C97475-88BE-437F-BC53-D3E1205EA08B}.Release|Win32.ActiveCfg = Release|Win32
- {52C97475-88BE-437F-BC53-D3E1205EA08B}.Release|x64.ActiveCfg = Release|Win32
- {F4416466-C0B3-4105-B391-CC77FB7A0231}.Debug|Win32.ActiveCfg = Debug|x64
- {F4416466-C0B3-4105-B391-CC77FB7A0231}.Debug|x64.ActiveCfg = Debug|x64
- {F4416466-C0B3-4105-B391-CC77FB7A0231}.Release|Win32.ActiveCfg = Release|x64
+ {52C97475-88BE-437F-BC53-D3E1205EA08B}.Release|x64.ActiveCfg = Release|x64
+ {52C97475-88BE-437F-BC53-D3E1205EA08B}.Debug|Win32.ActiveCfg = Debug|Win32
+ {52C97475-88BE-437F-BC53-D3E1205EA08B}.Debug|x64.ActiveCfg = Debug|x64
+ {F4416466-C0B3-4105-B391-CC77FB7A0231}.Release|Win32.ActiveCfg = Release|Win32
{F4416466-C0B3-4105-B391-CC77FB7A0231}.Release|x64.ActiveCfg = Release|x64
- {EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Debug|Win32.ActiveCfg = Debug|Win32
- {EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Debug|x64.ActiveCfg = Debug|x64
+ {F4416466-C0B3-4105-B391-CC77FB7A0231}.Debug|Win32.ActiveCfg = Debug|Win32
+ {F4416466-C0B3-4105-B391-CC77FB7A0231}.Debug|x64.ActiveCfg = Debug|x64
{EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Release|Win32.ActiveCfg = Release|Win32
{EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Release|x64.ActiveCfg = Release|x64
- {D058893B-87A8-4161-8821-FA5707504B2C}.Debug|Win32.ActiveCfg = Debug|x64
- {D058893B-87A8-4161-8821-FA5707504B2C}.Debug|x64.ActiveCfg = Debug|x64
- {D058893B-87A8-4161-8821-FA5707504B2C}.Release|Win32.ActiveCfg = Release|x64
+ {EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Debug|Win32.ActiveCfg = Debug|Win32
+ {EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Debug|x64.ActiveCfg = Debug|x64
+ {D058893B-87A8-4161-8821-FA5707504B2C}.Release|Win32.ActiveCfg = Release|Win32
{D058893B-87A8-4161-8821-FA5707504B2C}.Release|x64.ActiveCfg = Release|x64
- {4A742B65-9836-4F46-8310-728F046A31C1}.Debug|Win32.ActiveCfg = Debug|Win32
- {4A742B65-9836-4F46-8310-728F046A31C1}.Debug|x64.ActiveCfg = Debug|x64
+ {D058893B-87A8-4161-8821-FA5707504B2C}.Debug|Win32.ActiveCfg = Debug|Win32
+ {D058893B-87A8-4161-8821-FA5707504B2C}.Debug|x64.ActiveCfg = Debug|x64
{4A742B65-9836-4F46-8310-728F046A31C1}.Release|Win32.ActiveCfg = Release|Win32
{4A742B65-9836-4F46-8310-728F046A31C1}.Release|x64.ActiveCfg = Release|x64
- {935C1E48-A26F-4E73-B8EA-B163F21E833B}.Debug|Win32.ActiveCfg = Debug|x64
- {935C1E48-A26F-4E73-B8EA-B163F21E833B}.Debug|x64.ActiveCfg = Debug|x64
- {935C1E48-A26F-4E73-B8EA-B163F21E833B}.Release|Win32.ActiveCfg = Release|x64
+ {4A742B65-9836-4F46-8310-728F046A31C1}.Debug|Win32.ActiveCfg = Debug|Win32
+ {4A742B65-9836-4F46-8310-728F046A31C1}.Debug|x64.ActiveCfg = Debug|x64
+ {935C1E48-A26F-4E73-B8EA-B163F21E833B}.Release|Win32.ActiveCfg = Release|Win32
{935C1E48-A26F-4E73-B8EA-B163F21E833B}.Release|x64.ActiveCfg = Release|x64
- {49DD6B75-0009-40DC-BE9E-383C565494C8}.Debug|Win32.ActiveCfg = Debug|x64
- {49DD6B75-0009-40DC-BE9E-383C565494C8}.Debug|x64.ActiveCfg = Debug|x64
- {49DD6B75-0009-40DC-BE9E-383C565494C8}.Release|Win32.ActiveCfg = Release|x64
+ {935C1E48-A26F-4E73-B8EA-B163F21E833B}.Debug|Win32.ActiveCfg = Debug|Win32
+ {935C1E48-A26F-4E73-B8EA-B163F21E833B}.Debug|x64.ActiveCfg = Debug|x64
+ {49DD6B75-0009-40DC-BE9E-383C565494C8}.Release|Win32.ActiveCfg = Release|Win32
{49DD6B75-0009-40DC-BE9E-383C565494C8}.Release|x64.ActiveCfg = Release|x64
- {D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Debug|Win32.ActiveCfg = Debug|Win32
- {D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Debug|x64.ActiveCfg = Debug|x64
+ {49DD6B75-0009-40DC-BE9E-383C565494C8}.Debug|Win32.ActiveCfg = Debug|Win32
+ {49DD6B75-0009-40DC-BE9E-383C565494C8}.Debug|x64.ActiveCfg = Debug|x64
{D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Release|Win32.ActiveCfg = Release|Win32
{D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Release|x64.ActiveCfg = Release|x64
- {EC599438-E241-49B5-ACA5-CB897F8041B5}.Debug|Win32.ActiveCfg = Debug|x64
- {EC599438-E241-49B5-ACA5-CB897F8041B5}.Debug|x64.ActiveCfg = Debug|x64
- {EC599438-E241-49B5-ACA5-CB897F8041B5}.Release|Win32.ActiveCfg = Release|x64
+ {D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Debug|Win32.ActiveCfg = Debug|Win32
+ {D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Debug|x64.ActiveCfg = Debug|x64
+ {EC599438-E241-49B5-ACA5-CB897F8041B5}.Release|Win32.ActiveCfg = Release|Win32
{EC599438-E241-49B5-ACA5-CB897F8041B5}.Release|x64.ActiveCfg = Release|x64
- {87821166-8F44-41D1-AE5B-151CD73C96D3}.Debug|Win32.ActiveCfg = Debug|x64
- {87821166-8F44-41D1-AE5B-151CD73C96D3}.Debug|x64.ActiveCfg = Debug|x64
- {87821166-8F44-41D1-AE5B-151CD73C96D3}.Release|Win32.ActiveCfg = Release|x64
+ {EC599438-E241-49B5-ACA5-CB897F8041B5}.Debug|Win32.ActiveCfg = Debug|Win32
+ {EC599438-E241-49B5-ACA5-CB897F8041B5}.Debug|x64.ActiveCfg = Debug|x64
+ {87821166-8F44-41D1-AE5B-151CD73C96D3}.Release|Win32.ActiveCfg = Release|Win32
{87821166-8F44-41D1-AE5B-151CD73C96D3}.Release|x64.ActiveCfg = Release|x64
- {3D01F893-119C-4089-A1EA-5645E8C4F366}.Debug|Win32.ActiveCfg = Debug|x64
- {3D01F893-119C-4089-A1EA-5645E8C4F366}.Debug|x64.ActiveCfg = Debug|x64
- {3D01F893-119C-4089-A1EA-5645E8C4F366}.Release|Win32.ActiveCfg = Release|x64
+ {87821166-8F44-41D1-AE5B-151CD73C96D3}.Debug|Win32.ActiveCfg = Debug|Win32
+ {87821166-8F44-41D1-AE5B-151CD73C96D3}.Debug|x64.ActiveCfg = Debug|x64
+ {3D01F893-119C-4089-A1EA-5645E8C4F366}.Release|Win32.ActiveCfg = Release|Win32
{3D01F893-119C-4089-A1EA-5645E8C4F366}.Release|x64.ActiveCfg = Release|x64
- {5E4AA504-9B77-4907-A93F-982FCED077EC}.Debug|Win32.ActiveCfg = Debug|x64
- {5E4AA504-9B77-4907-A93F-982FCED077EC}.Debug|x64.ActiveCfg = Debug|x64
- {5E4AA504-9B77-4907-A93F-982FCED077EC}.Release|Win32.ActiveCfg = Release|x64
+ {3D01F893-119C-4089-A1EA-5645E8C4F366}.Debug|Win32.ActiveCfg = Debug|Win32
+ {3D01F893-119C-4089-A1EA-5645E8C4F366}.Debug|x64.ActiveCfg = Debug|x64
+ {5E4AA504-9B77-4907-A93F-982FCED077EC}.Release|Win32.ActiveCfg = Release|Win32
{5E4AA504-9B77-4907-A93F-982FCED077EC}.Release|x64.ActiveCfg = Release|x64
- {51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Debug|Win32.ActiveCfg = Debug|x64
- {51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Debug|x64.ActiveCfg = Debug|x64
- {51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Release|Win32.ActiveCfg = Release|x64
+ {5E4AA504-9B77-4907-A93F-982FCED077EC}.Debug|Win32.ActiveCfg = Debug|Win32
+ {5E4AA504-9B77-4907-A93F-982FCED077EC}.Debug|x64.ActiveCfg = Debug|x64
+ {51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Release|Win32.ActiveCfg = Release|Win32
{51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Release|x64.ActiveCfg = Release|x64
- {A6F0116E-48CD-4FD5-8F38-3DF1BE5286AE}.Debug|Win32.ActiveCfg = Debug|x64
- {A6F0116E-48CD-4FD5-8F38-3DF1BE5286AE}.Debug|x64.ActiveCfg = Debug|x64
- {A6F0116E-48CD-4FD5-8F38-3DF1BE5286AE}.Release|Win32.ActiveCfg = Release|x64
+ {51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Debug|Win32.ActiveCfg = Debug|Win32
+ {51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Debug|x64.ActiveCfg = Debug|x64
+ {A6F0116E-48CD-4FD5-8F38-3DF1BE5286AE}.Release|Win32.ActiveCfg = Release|Win32
{A6F0116E-48CD-4FD5-8F38-3DF1BE5286AE}.Release|x64.ActiveCfg = Release|x64
- {A4D87C88-2541-473D-A2F3-CB4F37A3841F}.Debug|Win32.ActiveCfg = Debug|x64
- {A4D87C88-2541-473D-A2F3-CB4F37A3841F}.Debug|x64.ActiveCfg = Debug|x64
- {A4D87C88-2541-473D-A2F3-CB4F37A3841F}.Release|Win32.ActiveCfg = Release|x64
+ {A6F0116E-48CD-4FD5-8F38-3DF1BE5286AE}.Debug|Win32.ActiveCfg = Debug|Win32
+ {A6F0116E-48CD-4FD5-8F38-3DF1BE5286AE}.Debug|x64.ActiveCfg = Debug|x64
+ {A4D87C88-2541-473D-A2F3-CB4F37A3841F}.Release|Win32.ActiveCfg = Release|Win32
{A4D87C88-2541-473D-A2F3-CB4F37A3841F}.Release|x64.ActiveCfg = Release|x64
+ {A4D87C88-2541-473D-A2F3-CB4F37A3841F}.Debug|Win32.ActiveCfg = Debug|Win32
+ {A4D87C88-2541-473D-A2F3-CB4F37A3841F}.Debug|x64.ActiveCfg = Debug|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
diff --git a/mpn/x86_64w/haswell/add_n.asm b/mpn/x86_64w/haswell/add_n.asm
new file mode 100644
index 00000000..cac4b8cc
--- /dev/null
+++ b/mpn/x86_64w/haswell/add_n.asm
@@ -0,0 +1,111 @@
+; PROLOGUE(mpn_add_n)
+
+; Version 1.0.3.
+;
+; Copyright 2008 Jason Moxham
+;
+; Windows Conversion Copyright 2008 Brian Gladman
+;
+; This file is part of the MPIR Library.
+; The MPIR Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published
+; by the Free Software Foundation; either version 2.1 of the License, or (at
+; your option) any later version.
+; The MPIR Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+; You should have received a copy of the GNU Lesser General Public License
+; along with the MPIR Library; see the file COPYING.LIB. If not, write
+; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+; Boston, MA 02110-1301, USA.
+;
+; Calculate src1[size] plus src2[size] and store the result in dst[size].
+; The return value is the carry bit from the top of the result (1 or 0).
+; The _nc version accepts 1 or 0 as an initial carry into the low limb of
+; the calculation. Note that values other than 1 or 0 here will lead to
+; garbage results.
+;
+; mp_limb_t mpn_add_n(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t)
+; mp_limb_t mpn_add_nc(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t, mp_limb_t)
+; rax rdi rsi rdx rcx r8
+; rax rcx rdx r8 r9 [rsp+40]
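+;
+; Worked example (added illustration): with size = 2,
+; src1 = { 0xFFFFFFFFFFFFFFFF, 1 } and src2 = { 1, 2 } (least significant
+; limb first), mpn_add_n stores dst = { 0, 4 } and returns 0, while
+; mpn_add_nc with an initial carry of 1 stores dst = { 1, 4 } and returns 0.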
+
+%include "yasm_mac.inc"
+
+ CPU Athlon64
+ BITS 64
+
+ xalign 8
+ LEAF_PROC mpn_add_nc
+ mov r10,[rsp+40]
+ jmp entry
+
+ xalign 8
+ LEAF_PROC mpn_add_n
+ xor r10, r10
+entry:
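+ ; added note: bit 0 of r10 holds the incoming carry; the lea below folds it
+ ; into 2*(size/4)+carry so that the sar leaves the quad-limb count in r9 and
+ ; moves the carry into CF for the adc chains that follow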
+ mov rax, r9
+ and rax, 3
+ shr r9, 2
+ lea r9,[r10+r9*2]
+ sar r9, 1
+ jnz .2
+
+ mov r10, [rdx]
+ adc r10, [r8]
+ mov [rcx], r10
+ dec rax
+ jz .1
+ mov r10, [rdx+8]
+ adc r10, [r8+8]
+ mov [rcx+8], r10
+ dec rax
+ jz .1
+ mov r10, [rdx+16]
+ adc r10, [r8+16]
+ mov [rcx+16], r10
+ dec rax
+.1: adc rax, rax
+ ret
+
+ xalign 8
+.2: mov r10, [rdx]
+ mov r11, [rdx+8]
+ lea rdx, [rdx+32]
+ adc r10, [r8]
+ adc r11, [r8+8]
+ lea r8, [r8+32]
+ mov [rcx], r10
+ mov [rcx+8], r11
+ lea rcx, [rcx+32]
+ mov r10, [rdx-16]
+ mov r11, [rdx-8]
+ adc r10, [r8-16]
+ adc r11, [r8-8]
+ mov [rcx-16], r10
+ dec r9
+ mov [rcx-8], r11
+ jnz .2
+
+ inc rax
+ dec rax
+ jz .3
+ mov r10, [rdx]
+ adc r10, [r8]
+ mov [rcx], r10
+ dec rax
+ jz .3
+ mov r10, [rdx+8]
+ adc r10, [r8+8]
+ mov [rcx+8], r10
+ dec rax
+ jz .3
+ mov r10, [rdx+16]
+ adc r10, [r8+16]
+ mov [rcx+16], r10
+ dec rax
+.3: adc rax, rax
+ ret
+
+ end
diff --git a/mpn/x86_64w/haswell/copyd.asm b/mpn/x86_64w/haswell/copyd.asm
new file mode 100644
index 00000000..1eac1ca6
--- /dev/null
+++ b/mpn/x86_64w/haswell/copyd.asm
@@ -0,0 +1,203 @@
+
+; Copyright 2016 Jens Nurmann and Alexander Kruppa
+
+; This file is part of the MPIR Library.
+
+; The MPIR Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published
+; by the Free Software Foundation; either version 2.1 of the License, or (at
+; your option) any later version.
+
+; The MPIR Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the MPIR Library; see the file COPYING.LIB. If not, write
+; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+; Boston, MA 02110-1301, USA.
+
+
+; mpn_copyd(mp_ptr Op2, mp_srcptr Op1, mp_size_t Size1)
+; Linux RDI RSI RDX
+; Win7 RCX RDX R8
+;
+; Description:
+; The function copies a given number of limbs from source to destination (while
+; moving high to low in memory) and hands back the size (in limbs) of the
+; destination.
+;
+; Result:
+; - Op2[ 0..size-1 ] = Op1[ 0..size-1 ]
+; - number of copied limbs: range [ 0..max tCounter ]
+;
+; Caveats:
+; - if size 0 is given the content of the destination will remain untouched!
+; - if Op1=Op2 no copy is done!
+;
+; Comments:
+; - AVX-based version implemented, tested & benched on 05.01.2016 by jn
+; - did some experiments with the AVX based version with the following results
+; - AVX can be faster in L1$-L3$ if the destination is aligned on 32 bytes
+; - AVX is generally faster on small operands (<=100 limbs) due to the
+; start-up overhead of "rep movsq" - however this could also be achieved by
+; a simple copy loop
+; - startup overhead of "rep movsq" with negative direction is 200 cycles!!!
+; - negative direction is unfavourable compared to positive "rep movsq" and
+; to AVX.
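+; - illustration (added note): mpn_copyd(Op2, Op1, 5) writes Op1[4] first and
+; Op1[0] last, the order needed when Op2 overlaps Op1 at a higher address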
+
+%include 'yasm_mac.inc'
+
+BITS 64
+
+%ifdef USE_WIN64
+ %define Op2 RCX
+ %define Op1 RDX
+ %define Size1 R8
+ %define Limb R9
+ %define Offs R10
+%else
+ %define Op2 RDI
+ %define Op1 RSI
+ %define Size1 RDX
+ %define Limb RCX
+ %define Offs R10
+%endif
+
+%define DLimb0 XMM0
+%define QLimb0 YMM0
+%define QLimb1 YMM1
+%define QLimb2 YMM2
+%define QLimb3 YMM3
+
+ align 32
+
+LEAF_PROC mpn_copyd
+ mov RAX, Size1
+ cmp Op1, Op2
+ je .Exit ; no copy required =>
+
+ or RAX, RAX
+ je .Exit ; Size=0 =>
+
+ lea Op1, [Op1+8*Size1-8]
+ lea Op2, [Op2+8*Size1-8]
+
+ ; align the destination (Op2) to 32 byte
+ test Op2, 8
+ jne .lCpyDecA32
+
+ mov Limb, [Op1]
+ mov [Op2], Limb
+ dec Size1
+ je .Exit
+
+ sub Op1, 8
+ sub Op2, 8
+
+ .lCpyDecA32:
+
+ test Op2, 16
+ jnz .lCpyDecAVX
+
+ mov Limb, [Op1]
+ mov [Op2], Limb
+ dec Size1
+ je .Exit
+
+ mov Limb, [Op1-8]
+ mov [Op2-8], Limb
+ dec Size1
+ je .Exit
+
+ sub Op1, 16
+ sub Op2, 16
+
+ .lCpyDecAVX:
+
+ mov Offs, 128
+ jmp .lCpyDecAVXCheck
+
+ ; main loop (prefetching disabled; unloaded cache)
+ ; - 0.30 cycles / limb in L1$
+ ; - 0.60 cycles / limb in L2$
+ ; - 0.70-0.90 cycles / limb in L3$
+ align 16
+ .lCpyDecAVXLoop:
+
+ vmovdqu QLimb0, [Op1-24]
+ vmovdqu QLimb1, [Op1-56]
+ vmovdqu QLimb2, [Op1-88]
+ vmovdqu QLimb3, [Op1-120]
+ vmovdqa [Op2-24], QLimb0
+ vmovdqa [Op2-56], QLimb1
+ vmovdqa [Op2-88], QLimb2
+ vmovdqa [Op2-120], QLimb3
+
+ sub Op1, Offs
+ sub Op2, Offs
+
+ .lCpyDecAVXCheck:
+
+ sub Size1, 16
+ jnc .lCpyDecAVXLoop
+
+ add Size1, 16
+ je .Exit ; AVX copied operand fully =>
+
+ ; copy remaining max. 15 limb
+ test Size1, 8
+ je .lCpyDecFour
+
+ vmovdqu QLimb0, [Op1-24]
+ vmovdqu QLimb1, [Op1-56]
+ vmovdqa [Op2-24], QLimb0
+ vmovdqa [Op2-56], QLimb1
+
+ sub Op1, 64
+ sub Op2, 64
+
+ .lCpyDecFour:
+
+ test Size1, 4
+ je .lCpyDecTwo
+
+ vmovdqu QLimb0, [Op1-24]
+ vmovdqa [Op2-24], QLimb0
+
+ sub Op1, 32
+ sub Op2, 32
+
+ .lCpyDecTwo:
+
+ test Size1, 2
+ je .lCpyDecOne
+
+%if 1
+ ; Avoid SSE2 instruction due to stall on Haswell
+ mov Limb, [Op1]
+ mov [Op2], Limb
+ mov Limb, [Op1-8]
+ mov [Op2-8], Limb
+%else
+ movdqu DLimb0, [Op1-8]
+ movdqa [Op2-8], DLimb0
+%endif
+
+ sub Op1, 16
+ sub Op2, 16
+
+ .lCpyDecOne:
+
+ test Size1, 1
+ je .Exit
+
+ mov Limb, [Op1]
+ mov [Op2], Limb
+
+ .Exit:
+
+ vzeroupper
+ ret
+.end:
diff --git a/mpn/x86_64w/haswell/copyi.asm b/mpn/x86_64w/haswell/copyi.asm
new file mode 100644
index 00000000..97f30df1
--- /dev/null
+++ b/mpn/x86_64w/haswell/copyi.asm
@@ -0,0 +1,199 @@
+
+; Copyright 2016 Jens Nurmann and Alexander Kruppa
+
+; This file is part of the MPIR Library.
+
+; The MPIR Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published
+; by the Free Software Foundation; either version 2.1 of the License, or (at
+; your option) any later version.
+
+; The MPIR Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the MPIR Library; see the file COPYING.LIB. If not, write
+; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+; Boston, MA 02110-1301, USA.
+
+; mpn_copyi(mp_ptr Op2, mp_srcptr Op1, mp_size_t Size1)
+; Linux RDI RSI RDX
+; Win7 RCX RDX R8
+;
+; Description:
+; The function copies a given number of limbs from source to destination (while
+; moving low to high in memory) and hands back the size (in limbs) of the
+; destination.
+;
+; Result:
+; - Op2[ 0..size-1 ] = Op1[ 0..size-1 ]
+; - number of copied limbs: range [ 0..max tCounter ]
+;
+; Caveats:
+; - if size 0 is given the content of the destination will remain untouched!
+; - if Op1=Op2 no copy is done!
+;
+; Comments:
+; - AVX-based version implemented, tested & benched on 05.01.2016 by jn
+; - did some experiments with the AVX based version with the following results
+; - AVX can be faster in L1$ (30%), L2$ (10%) if the dest. is aligned on 32 bytes
+; - AVX is generally faster on small operands (<=100 limbs) due to the
+; start-up overhead of "rep movsq" - however this could also be achieved by
+; a simple copy loop
+; - the break-even between AVX and "rep movsq" is around 10,000 limbs
+; - the prologue & epilogue can still be optimized!
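+; - illustration (added note): mpn_copyi(Op2, Op1, 5) writes Op1[0] first and
+; Op1[4] last, the order needed when Op2 overlaps Op1 at a lower address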
+
+%include 'yasm_mac.inc'
+
+BITS 64
+
+%ifdef USE_WIN64
+ %define Op2 RCX
+ %define Op1 RDX
+ %define Size1 R8
+ %define Limb R9
+ %define Offs R10
+%else
+ %define Op2 RDI
+ %define Op1 RSI
+ %define Size1 RDX
+ %define Limb RCX
+ %define Offs R10
+%endif
+
+%define DLimb0 XMM0
+%define QLimb0 YMM0
+%define QLimb1 YMM1
+%define QLimb2 YMM2
+%define QLimb3 YMM3
+
+ align 32
+
+LEAF_PROC mpn_copyi
+ mov RAX, Size1
+ cmp Op1, Op2
+ je .Exit ; no copy required =>
+
+ or RAX, RAX
+ je .Exit ; size=0 =>
+
+ ; align the destination (Op2) to 32 byte
+ test Op2, 8
+ je .lCpyIncA32
+
+ mov Limb, [Op1]
+ mov [Op2], Limb
+ dec Size1
+ je .Exit
+
+ add Op1, 8
+ add Op2, 8
+
+ .lCpyIncA32:
+
+ test Op2, 16
+ je .lCpyIncAVX
+
+ mov Limb, [Op1]
+ mov [Op2], Limb
+ dec Size1
+ je .Exit
+
+ mov Limb, [Op1+8]
+ mov [Op2+8], Limb
+ dec Size1
+ je .Exit
+
+ add Op1, 16
+ add Op2, 16
+
+ .lCpyIncAVX:
+
+ mov Offs, 128
+ jmp .lCpyIncAVXCheck
+
+ ; main loop (prefetching disabled; unloaded cache)
+ ; - lCpyInc is slightly slower than lCpyDec through all cache levels?!
+ ; - 0.30 cycles / limb in L1$
+ ; - 0.60 cycles / limb in L2$
+ ; - 0.70-0.90 cycles / limb in L3$
+ align 16
+ .lCpyIncAVXLoop:
+
+ vmovdqu QLimb0, [Op1]
+ vmovdqu QLimb1, [Op1+32]
+ vmovdqu QLimb2, [Op1+64]
+ vmovdqu QLimb3, [Op1+96]
+ vmovdqa [Op2], QLimb0
+ vmovdqa [Op2+32], QLimb1
+ vmovdqa [Op2+64], QLimb2
+ vmovdqa [Op2+96], QLimb3
+
+ add Op1, Offs
+ add Op2, Offs
+
+ .lCpyIncAVXCheck:
+
+ sub Size1, 16
+ jnc .lCpyIncAVXLoop
+
+ add Size1, 16
+ je .Exit ; AVX copied operand fully =>
+
+ ; copy remaining max. 15 limb
+ test Size1, 8
+ je .lCpyIncFour
+
+ vmovdqu QLimb0, [Op1]
+ vmovdqu QLimb1, [Op1+32]
+ vmovdqa [Op2], QLimb0
+ vmovdqa [Op2+32], QLimb1
+
+ add Op1, 64
+ add Op2, 64
+
+ .lCpyIncFour:
+
+ test Size1, 4
+ je .lCpyIncTwo
+
+ vmovdqu QLimb0, [Op1]
+ vmovdqa [Op2], QLimb0
+
+ add Op1, 32
+ add Op2, 32
+
+ .lCpyIncTwo:
+
+ test Size1, 2
+ je .lCpyIncOne
+
+%if 1
+ ; Avoid SSE2 instruction due to stall on Haswell
+ mov Limb, [Op1]
+ mov [Op2], Limb
+ mov Limb, [Op1+8]
+ mov [Op2+8], Limb
+%else
+ movdqu DLimb0, [Op1]
+ movdqa [Op2], DLimb0
+%endif
+
+ add Op1, 16
+ add Op2, 16
+
+ .lCpyIncOne:
+
+ test Size1, 1
+ je .Exit
+
+ mov Limb, [Op1]
+ mov [Op2], Limb
+
+ .Exit:
+
+ vzeroupper
+ ret
+.end:
diff --git a/mpn/x86_64w/haswell/lshift.asm b/mpn/x86_64w/haswell/lshift.asm
new file mode 100644
index 00000000..fbfedc4a
--- /dev/null
+++ b/mpn/x86_64w/haswell/lshift.asm
@@ -0,0 +1,285 @@
+
+; Copyright 2016 Jens Nurmann and Alexander Kruppa
+
+; This file is part of the MPIR Library.
+
+; The MPIR Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published
+; by the Free Software Foundation; either version 2.1 of the License, or (at
+; your option) any later version.
+
+; The MPIR Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the MPIR Library; see the file COPYING.LIB. If not, write
+; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+; Boston, MA 02110-1301, USA.
+
+; mp_limb_t mpn_lshift(mp_ptr Op2, mp_srcptr Op1, mp_size_t Size1, unsigned int Shift)
+; Linux RAX RDI RSI RDX RCX
+; Win7 RAX RCX RDX R8 R9
+;
+; Description:
+; The function shifts Op1 left by Shift bits, stores the result in Op2 (non-
+; destructive shl) and hands back the shifted-out most significant bits of Op1.
+; The function operates decreasing in memory, supporting in-place operation.
+;
+; Result:
+; - Op2[ Size1-1..0 ] := ( Op1[ Size1-1..0 ]:ShlIn ) << Shift
+; - Op1[ Size1-1 ] >> ( 64-Shift )
+;
+; Caveats:
+; - caller must ensure that Shift is in [ 1..63 ]!
+; - this x86_64w version handles both the USE_WIN64 and SysV (Linux64)
+; register conventions
+; - the AVX version uses mnemonics only available on Haswell, Broadwell and
+; Skylake cores
+; - the behaviour of cache prefetching in combination with AVX shifting seems
+; somewhat erratic
+; - slight (a few clock cycles) degradation for 1/2 LD1$ sizes
+; - slight (a few percent) improvement for full LD1$ sizes
+; - substantial (>10%) improvement for 1/2 LD2$ sizes
+; - slight (a few percent) improvement for full LD2$ sizes
+; - slight (a few percent) degradation for 1/2 LD3$ sizes
+; - substantial (around 10%) degradation for full LD3$ sizes
+;
+; Comments:
+; - implemented, tested and benched on 31.03.2016 by jn
+; - includes prefetching
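+;
+; Worked example (added illustration): with Size1 = 2, Shift = 8 and
+; Op1 = { 0x0123456789ABCDEF, 0x1122334455667788 } (least significant limb
+; first), the function stores Op2 = { 0x23456789ABCDEF00, 0x2233445566778801 }
+; and returns 0x11, the eight bits shifted out at the top.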
+; ============================================================================
+
+%include 'yasm_mac.inc'
+
+BITS 64
+
+%ifdef USE_WIN64
+ %define Op2 R11
+ %define Op1 RDX
+ %define Size1 R8
+ %define Shift RCX
+ %define Limb1 R9
+ %define Limb2 R10
+ %ifdef USE_PREFETCH
+ %define Offs -512 ; No caller-saves regs left, use immediate
+ %endif
+ %define reg_save_list XMM, 6, 7
+%else
+ %define Op2 RDI
+ %define Op1 RSI
+ %define Size1 RDX
+ %define Shift RCX
+ %define Limb1 R8
+ %define Limb2 R9
+ %ifdef USE_PREFETCH
+ %define OFFS_REG 1
+ %define Offs R10
+ %endif
+%endif
+
+%define ShlDL0 XMM2 ; Attn: this must match ShlQL0 definition
+%define ShrDL0 XMM3 ; Attn: this must match ShrQL0 definition
+%define ShlDLCnt XMM6 ; Attn: this must match ShlQlCnt definition
+%define ShrDLCnt XMM7 ; Attn: this must match ShrQlCnt definition
+
+%define QLimb0 YMM0
+%define QLimb1 YMM1
+%define ShlQL0 YMM2
+%define ShrQL0 YMM3
+%define ShlQL1 YMM4
+%define ShrQL1 YMM5
+%define ShlQLCnt YMM6
+%define ShrQLCnt YMM7
+
+ align 32
+FRAME_PROC mpn_lshift, 0, reg_save_list
+%ifdef USE_WIN64
+ mov r11, rcx
+ mov rcx, r9
+%endif
+ xor EAX, EAX
+ sub Size1, 1
+ jc .Exit ; Size1=0 =>
+
+ lea Op1, [Op1+8*Size1]
+ lea Op2, [Op2+8*Size1]
+
+ mov Limb1, [Op1]
+ shld RAX, Limb1, CL
+
+ or Size1, Size1
+ je .lShlEquPost ; Size1=1 =>
+
+ %ifdef USE_PREFETCH
+ %ifdef OFFS_REG
+ mov Offs, -512
+ %endif
+ %endif
+
+ cmp Size1, 8
+ jc .lShlEquFour ; AVX inefficient =>
+
+ ; first align Op2 to 32 bytes
+ test Op2, 8
+ jne .lShlEquA16
+
+ mov Limb2, [Op1-8]
+ shld Limb1, Limb2, CL
+ mov [Op2], Limb1
+ mov Limb1, Limb2
+
+ sub Op1, 8
+ sub Op2, 8
+ sub Size1, 1
+
+ .lShlEquA16:
+
+ test Op2, 16
+ jne .lShlEquAVX
+
+ mov Limb2, [Op1-8]
+ shld Limb1, Limb2, CL
+ mov [Op2], Limb1
+ mov Limb1, [Op1-16]
+ shld Limb2, Limb1, CL
+ mov [Op2-8], Limb2
+
+ sub Op1, 16
+ sub Op2, 16
+ sub Size1, 2
+
+ .lShlEquAVX:
+
+ ; initialize AVX shift counter
+ vmovq ShlDLCnt, RCX
+ neg RCX
+ and RCX, 63 ; must do, as AVX shifts set result=0 if Shift>63!
+ vmovq ShrDLCnt, RCX
+ neg RCX
+ and RCX, 63 ; must do, as AVX shifts set result=0 if Shift>63!
+ vpbroadcastq ShlQLCnt, ShlDLCnt
+ vpbroadcastq ShrQLCnt, ShrDLCnt
+
+ ; pre-fetch first quad-limb
+ vmovdqu QLimb0, [Op1-24]
+ vpsrlvq ShrQL0, QLimb0, ShrQLCnt
+ vpermq ShrQL0, ShrQL0, 10010011b
+
+ sub Op1, 32
+ sub Size1, 4
+ jmp .lShlEquAVXCheck
+
+ ; main loop (prefetching enabled; unloaded cache)
+ ; - 0.60 cycles per limb in LD1$
+ ; - 0.60-0.70 cycles per limb in LD2$
+ ; - 0.70-0.90 cycles per limb in LD3$
+ align 16
+ .lShlEquAVXLoop:
+
+ %ifdef USE_PREFETCH
+ prefetchnta [Op1+Offs]
+ %endif
+
+ vmovdqu QLimb1, [Op1-24]
+ vpsllvq ShlQL0, QLimb0, ShlQLCnt
+ vmovdqu QLimb0, [Op1-56]
+ vpsrlvq ShrQL1, QLimb1, ShrQLCnt
+ vpermq ShrQL1, ShrQL1, 10010011b
+ vpblendd ShrQL0, ShrQL0, ShrQL1, 00000011b
+ vpor ShlQL0, ShlQL0, ShrQL0
+ vpsllvq ShlQL1, QLimb1, ShlQLCnt
+ vpsrlvq ShrQL0, QLimb0, ShrQLCnt
+ vpermq ShrQL0, ShrQL0, 10010011b
+ vpblendd ShrQL1, ShrQL1, ShrQL0, 00000011b
+ vmovdqa [Op2-24], ShlQL0
+ vpor ShlQL1, ShlQL1, ShrQL1
+ vmovdqa [Op2-56], ShlQL1
+
+ sub Op1, 64
+ sub Op2, 64
+
+ .lShlEquAVXCheck:
+
+ sub Size1, 8
+ jnc .lShlEquAVXLoop
+
+ mov Limb1, [Op1]
+ xor Limb2, Limb2
+ shld Limb2, Limb1, CL
+%if 1
+ vmovq ShlDL0, Limb2
+ vpblendd ShrQL0, ShrQL0, ShlQL0, 3
+%else
+ ; I am mixing in a single SSE4.1 instruction into otherwise pure AVX2
+ ; this is generating stalls on Haswell & Broadwell architecture (Agner Fog)
+ ; but it is only executed once and there is no AVX2 based alternative
+ pinsrq ShrDL0, Limb2, 0 ; SSE4.1
+%endif
+ vpsllvq ShlQL0, QLimb0, ShlQLCnt
+ vpor ShlQL0, ShlQL0, ShrQL0
+ vmovdqa [Op2-24], ShlQL0
+
+ sub Op2, 32
+ add Size1, 8
+
+ ; shift remaining max. 7 limbs with SHLD mnemonic
+ .lShlEquFour:
+
+ sub Op1, 8
+ test Size1, 4
+ je .lShlEquTwo
+
+ mov Limb2, [Op1]
+ shld Limb1, Limb2, CL
+ mov [Op2], Limb1
+ mov Limb1, [Op1-8]
+ shld Limb2, Limb1, CL
+ mov [Op2-8], Limb2
+ mov Limb2, [Op1-16]
+ shld Limb1, Limb2, CL
+ mov [Op2-16], Limb1
+ mov Limb1, [Op1-24]
+ shld Limb2, Limb1, CL
+ mov [Op2-24], Limb2
+
+ sub Op1, 32
+ sub Op2, 32
+
+ .lShlEquTwo:
+
+ test Size1, 2
+ je .lShlEquOne
+
+ mov Limb2, [Op1]
+ shld Limb1, Limb2, CL
+ mov [Op2], Limb1
+ mov Limb1, [Op1-8]
+ shld Limb2, Limb1, CL
+ mov [Op2-8], Limb2
+
+ sub Op1, 16
+ sub Op2, 16
+
+ .lShlEquOne:
+
+ test Size1, 1
+ je .lShlEquPost
+
+ mov Limb2, [Op1]
+ shld Limb1, Limb2, CL
+ mov [Op2], Limb1
+ mov Limb1, Limb2
+
+ sub Op2, 8
+
+ .lShlEquPost:
+
+ shl Limb1, CL
+ mov [Op2], Limb1
+
+ .Exit:
+
+ vzeroupper
+END_PROC reg_save_list
+.end:
\ No newline at end of file
diff --git a/mpn/x86_64w/haswell/rshift.asm b/mpn/x86_64w/haswell/rshift.asm
new file mode 100644
index 00000000..fbcc7d8e
--- /dev/null
+++ b/mpn/x86_64w/haswell/rshift.asm
@@ -0,0 +1,282 @@
+
+; Copyright 2016 Jens Nurmann and Alexander Kruppa
+
+; This file is part of the MPIR Library.
+
+; The MPIR Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published
+; by the Free Software Foundation; either version 2.1 of the License, or (at
+; your option) any later version.
+
+; The MPIR Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the MPIR Library; see the file COPYING.LIB. If not, write
+; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+; Boston, MA 02110-1301, USA.
+
+; mp_limb_t mpn_rshift(mp_ptr Op2, mp_srcptr Op1, mp_size_t Size1, unsigned int Shift)
+; Linux RAX RDI RSI RDX RCX
+; Windows x64 RAX RCX RDX R8 R9
+;
+; Description:
+; The function shifts Op1 right by Shift bits, stores the result in Op2 (non-
+; destructive shr) and hands back the shifted-out least significant bits of
+; Op1. The function operates increasing in memory, supporting in-place operation.
+;
+; Result:
+; - Op2[ Size1-1..0 ] := ( ShrIn:Op1[ Size1-1..0 ] ) >> Shift
+; - Op1[ 0 ] << ( 64-Shift )
+;
+; Caveats:
+; - caller must ensure that Shift is in [ 1..63 ]!
+; - this x86_64w version handles both the USE_WIN64 and SysV (Linux64)
+; register conventions
+; - the AVX version uses mnemonics only available on Haswell, Broadwell and
+; Skylake cores
+; - the behaviour of cache prefetching in combination with AVX shifting seems
+; somewhat erratic
+; - slight (a few clock cycles) degradation for 1/2 LD1$ sizes
+; - slight (a few percent) improvement for full LD1$ sizes
+; - substantial (>10%) improvement for 1/2 LD2$ sizes
+; - slight (a few percent) improvement for full LD2$ sizes
+; - slight (a few percent) degradation for 1/2 LD3$ sizes
+; - substantial (around 10%) degradation for full LD3$ sizes
+;
+; Comments:
+; - implemented, tested and benchmarked on 30.03.2016 by jn
+; - includes prefetching
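+;
+; Worked example (added illustration): with Size1 = 2, Shift = 8 and
+; Op1 = { 0x0123456789ABCDEF, 0x1122334455667788 } (least significant limb
+; first), the function stores Op2 = { 0x880123456789ABCD, 0x0011223344556677 }
+; and returns 0xEF00000000000000, the eight bits shifted out at the bottom.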
+; ============================================================================
+
+%include 'yasm_mac.inc'
+
+BITS 64
+
+%ifdef USE_WIN64
+ %define Op2 R11
+ %define Op1 RDX
+ %define Size1 R8
+ %define Shift RCX
+ %define Limb1 R9
+ %define Limb2 R10
+ %ifdef USE_PREFETCH
+ %define Offs 512 ; No caller-saves regs left, use immediate
+ %endif
+ %define reg_save_list XMM, 6, 7
+%else
+ %define Op2 RDI
+ %define Op1 RSI
+ %define Size1 RDX
+ %define Shift RCX
+ %define Limb1 R8
+ %define Limb2 R9
+ %ifdef USE_PREFETCH
+ %define OFFS_REG 1
+ %define Offs R10
+ %endif
+%endif
+
+%define ShrDL0 XMM2 ; Attn: this must match ShrQL0 definition
+%define ShlDL0 XMM3 ; Attn: this must match ShlQL0 definition
+%define ShrDLCnt XMM6 ; Attn: this must match ShrQlCnt definition
+%define ShlDLCnt XMM7 ; Attn: this must match ShlQlCnt definition
+
+%define QLimb0 YMM0
+%define QLimb1 YMM1
+%define ShrQL0 YMM2
+%define ShlQL0 YMM3
+%define ShrQL1 YMM4
+%define ShlQL1 YMM5
+%define ShrQLCnt YMM6
+%define ShlQLCnt YMM7
+
+ align 32
+
+FRAME_PROC mpn_rshift, 0, reg_save_list
+%ifdef USE_WIN64
+ mov r11, rcx
+ mov rcx, r9
+%endif
+ xor EAX, EAX
+ or Size1, Size1
+ je .Exit
+
+ mov Limb1, [Op1]
+ shrd RAX, Limb1, CL
+
+ sub Size1, 1
+ je .lShrEquPost ; Size1=1 =>
+
+ %ifdef USE_PREFETCH
+ %ifdef OFFS_REG
+ mov Offs, 512
+ %endif
+ %endif
+
+ cmp Size1, 8
+ jc .lShrEquFour ; AVX inefficient =>
+
+ ; first align Op2 to 32 bytes
+ test Op2, 8
+ je .lShrEquAlign16
+
+ mov Limb2, [Op1+8]
+ shrd Limb1, Limb2, CL
+ mov [Op2], Limb1
+ mov Limb1, Limb2
+
+ add Op1, 8
+ add Op2, 8
+ sub Size1, 1
+
+ .lShrEquAlign16:
+
+ test Op2, 16
+ je .lShrEquAVX
+
+ mov Limb2, [Op1+8]
+ shrd Limb1, Limb2, CL
+ mov [Op2], Limb1
+ mov Limb1, [Op1+16]
+ shrd Limb2, Limb1, CL
+ mov [Op2+8], Limb2
+
+ add Op1, 16
+ add Op2, 16
+ sub Size1, 2
+
+ .lShrEquAVX:
+
+ ; initialize AVX shift counter
+ vmovq ShrDLCnt, RCX
+ neg RCX
+ and RCX, 63 ; must do, as AVX shifts set result=0 if Shift>63!
+ vmovq ShlDLCnt, RCX
+ neg RCX
+ and RCX, 63 ; must do, as AVX shifts set result=0 if Shift>63!
+ vpbroadcastq ShrQLCnt, ShrDLCnt
+ vpbroadcastq ShlQLCnt, ShlDLCnt
+
+ ; pre-fetch first quad-limb
+ vmovdqu QLimb0, [Op1]
+ vpsllvq ShlQL0, QLimb0, ShlQLCnt
+
+ add Op1, 32
+ sub Size1, 4
+ jmp .lShrEquAVXCheck
+
+ ; main loop (prefetching enabled, unloaded data cache)
+ ; - 0.60 cycles per limb in LD1$
+ ; - 0.60-0.70 cycles per limb in LD2$
+ ; - 0.70-0.90 cycles per limb in LD3$
+ align 16
+ .lShrEquAVXLoop:
+
+ %ifdef USE_PREFETCH
+ prefetchnta [Op1+Offs]
+ %endif
+
+ vmovdqu QLimb1, [Op1]
+ vpsrlvq ShrQL0, QLimb0, ShrQLCnt
+ vmovdqu QLimb0, [Op1+32]
+ vpsllvq ShlQL1, QLimb1, ShlQLCnt
+ vpblendd ShlQL0, ShlQL0, ShlQL1, 00000011b
+ vpermq ShlQL0, ShlQL0, 00111001b
+ vpor ShrQL0, ShrQL0, ShlQL0
+ vpsrlvq ShrQL1, QLimb1, ShrQLCnt
+ vpsllvq ShlQL0, QLimb0, ShlQLCnt
+ vpblendd ShlQL1, ShlQL1, ShlQL0, 00000011b
+ vpermq ShlQL1, ShlQL1, 00111001b
+ vmovdqa [Op2], ShrQL0
+ vpor ShrQL1, ShrQL1, ShlQL1
+ vmovdqa [Op2+32], ShrQL1
+
+ add Op1, 64
+ add Op2, 64
+
+ .lShrEquAVXCheck:
+
+ sub Size1, 8
+ jnc .lShrEquAVXLoop
+
+ mov Limb1, [Op1]
+ xor Limb2, Limb2
+ shrd Limb2, Limb1, CL
+%if 1
+ vmovq ShrDL0, Limb2
+ vpblendd ShlQL0, ShlQL0, ShrQL0, 3
+%else
+ ; I am mixing in a single SSE4.1 instruction into otherwise pure AVX2
+ ; this is generating stalls on Haswell & Broadwell architecture (Agner Fog)
+ ; but it is only executed once and there is no AVX2 based alternative
+ pinsrq ShlDL0, Limb2, 0 ; SSE4.1
+%endif
+ vpsrlvq ShrQL0, QLimb0, ShrQLCnt
+ vpermq ShlQL0, ShlQL0, 00111001b
+ vpor ShrQL0, ShrQL0, ShlQL0
+ vmovdqa [Op2], ShrQL0
+
+ add Op2, 32
+ add Size1, 8
+
+ ; shift remaining max. 7 limbs with SHRD mnemonic
+ .lShrEquFour:
+
+ add Op1, 8
+ test Size1, 4
+ je .lShrEquTwo
+
+ mov Limb2, [Op1]
+ shrd Limb1, Limb2, CL
+ mov [Op2], Limb1
+ mov Limb1, [Op1+8]
+ shrd Limb2, Limb1, CL
+ mov [Op2+8], Limb2
+ mov Limb2, [Op1+16]
+ shrd Limb1, Limb2, CL
+ mov [Op2+16], Limb1
+ mov Limb1, [Op1+24]
+ shrd Limb2, Limb1, CL
+ mov [Op2+24], Limb2
+
+ add Op1, 32
+ add Op2, 32
+
+ .lShrEquTwo:
+
+ test Size1, 2
+ je .lShrEquOne
+
+ mov Limb2, [Op1]
+ shrd Limb1, Limb2, CL
+ mov [Op2], Limb1
+ mov Limb1, [Op1+8]
+ shrd Limb2, Limb1, CL
+ mov [Op2+8], Limb2
+
+ add Op1, 16
+ add Op2, 16
+
+ .lShrEquOne:
+
+ test Size1, 1
+ je .lShrEquPost
+
+ mov Limb2, [Op1]
+ shrd Limb1, Limb2, CL
+ mov [Op2], Limb1
+ mov Limb1, Limb2
+
+ add Op2, 8
+
+ ; store most significant limb considering shift-in part
+ .lShrEquPost:
+
+ shr Limb1, CL
+ mov [Op2], Limb1
+
+ .Exit:
+
+ vzeroupper
+END_PROC reg_save_list
+.end:
diff --git a/mpn/x86_64w/yasm_mac.inc b/mpn/x86_64w/yasm_mac.inc
index 141e8ab9..bbb061a1 100644
--- a/mpn/x86_64w/yasm_mac.inc
+++ b/mpn/x86_64w/yasm_mac.inc
@@ -119,24 +119,48 @@
%endif
%rotate 1
- %assign gpr_regs 0
+ %assign stack_slots 0
+ %assign xmm_seen 0
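+ ; added note: the save list may now end with "XMM, n, m, ..." so that, for
+ ; example, the new Haswell shift code can use FRAME_PROC mpn_lshift, 0, XMM, 6, 7;
+ ; general purpose registers must precede the XMM marker and each XMM register
+ ; is saved in a 16-byte aligned stack slot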
%if %0 > 2
%rep %0 - 2
- push_reg %1
- %assign gpr_regs gpr_regs + 1
+ %ifnum %1
+ %if xmm_seen == 0
+ %error Not an XMM register
+ %else
+ alloc_stack 16
+ save_xmm128 XMM%1, 0
+ %assign stack_slots stack_slots + 2
+ %endif
+ %elifid %1
+ %ifidni XMM, %1
+ %if stack_slots & 1 == 0
+ alloc_stack 8
+ %assign stack_slots stack_slots + 1
+ %assign xmm_seen 1
+ %else
+ %assign xmm_seen 2
+ %endif
+ %elif xmm_seen == 0
+ push_reg %1
+ %assign stack_slots stack_slots + 1
+ %else
+ %error XMM registers must be last in the save list
+ %endif
+ %else
+ %error Bad parameter list
+ %endif
%rotate 1
%endrep
%endif
- %if (gpr_regs & 1) == (var_slots & 1)
+ %if (stack_slots & 1) == (var_slots & 1)
%assign var_slots var_slots + 1
%endif
%if var_slots > 0
alloc_stack 8 * var_slots
%endif
- %assign stack_use 8 * (gpr_regs + var_slots)
-
+ %assign stack_use 8 * (stack_slots + var_slots)
END_PROLOGUE
%endmacro
@@ -147,7 +171,16 @@
%if %0 > 0
%rep %0
%rotate -1
- pop %1
+ %ifnum %1
+ movdqa XMM%1, [rsp]
+ add rsp, 16
+ %elifidni %1, XMM
+ %if xmm_seen == 1
+ add rsp, 8
+ %endif
+ %else
+ pop %1
+ %endif
%endrep
%endif
ret
@@ -156,14 +189,25 @@
%macro END_PROC 0-*
- add rsp, 8 * var_slots
+ %if var_slots
+ add rsp, 8 * var_slots
+ %endif
%if %0 > 0
%rep %0
%rotate -1
- pop %1
+ %ifnum %1
+ movdqa XMM%1, [rsp]
+ add rsp, 16
+ %elifidni %1, XMM
+ %if xmm_seen == 1
+ add rsp, 8
+ %endif
+ %else
+ pop %1
+ %endif
%endrep
%endif
- ret
+ ret
ENDPROC_FRAME
%endmacro