From 77b483e79f2d4a728b0d3a1224b8e2305d98ab66 Mon Sep 17 00:00:00 2001 From: Brian Gladman Date: Sat, 26 Nov 2016 22:35:25 +0000 Subject: [PATCH] add more win64 assembler for haswell --- .../dll_mpir_haswell/dll_mpir_haswell.vcxproj | 26 +- .../dll_mpir_haswell.vcxproj.filters | 30 +- .../lib_mpir_haswell/lib_mpir_haswell.vcxproj | 37 ++- .../lib_mpir_haswell.vcxproj.filters | 30 +- build.vc14/mpir-tests/cxx.ops/cxx.ops.vcxproj | 2 +- build.vc15/cdata/mpn/x86_64w/haswell/cfg.h | 2 + .../dll_mpir_haswell/dll_mpir_haswell.vcxproj | 22 +- .../dll_mpir_haswell.vcxproj.filters | 18 +- .../lib_mpir_haswell/lib_mpir_haswell.vcxproj | 22 +- .../lib_mpir_haswell.vcxproj.filters | 18 +- build.vc15/mpir.sln | 99 +++--- mpn/x86_64w/haswell/add_n.asm | 111 +++++++ mpn/x86_64w/haswell/copyd.asm | 203 +++++++++++++ mpn/x86_64w/haswell/copyi.asm | 199 ++++++++++++ mpn/x86_64w/haswell/lshift.asm | 285 ++++++++++++++++++ mpn/x86_64w/haswell/rshift.asm | 282 +++++++++++++++++ mpn/x86_64w/yasm_mac.inc | 64 +++- 17 files changed, 1299 insertions(+), 151 deletions(-) create mode 100644 mpn/x86_64w/haswell/add_n.asm create mode 100644 mpn/x86_64w/haswell/copyd.asm create mode 100644 mpn/x86_64w/haswell/copyi.asm create mode 100644 mpn/x86_64w/haswell/lshift.asm create mode 100644 mpn/x86_64w/haswell/rshift.asm diff --git a/build.vc14/dll_mpir_haswell/dll_mpir_haswell.vcxproj b/build.vc14/dll_mpir_haswell/dll_mpir_haswell.vcxproj index e5cb0750..9d78cfbd 100644 --- a/build.vc14/dll_mpir_haswell/dll_mpir_haswell.vcxproj +++ b/build.vc14/dll_mpir_haswell/dll_mpir_haswell.vcxproj @@ -1,4 +1,4 @@ - + @@ -51,7 +51,7 @@ prebuild haswell x64 14 - DLL;USE_WIN64 + DLL ..\..\mpn\x86_64w\ true $(IntDir)mpn\ @@ -62,6 +62,7 @@ prebuild haswell x64 14 + cd ..\..\build.vc postbuild "$(TargetPath)" 14 @@ -75,7 +76,7 @@ prebuild haswell x64 14 - DLL;USE_WIN64 + DLL ..\..\mpn\x86_64w\ true $(IntDir)mpn\ @@ -86,6 +87,7 @@ prebuild haswell x64 14 + cd ..\..\build.vc postbuild "$(TargetPath)" 14 @@ -464,7 +466,6 @@ postbuild "$(TargetPath)" 14 - @@ -474,8 +475,6 @@ postbuild "$(TargetPath)" 14 - - @@ -526,7 +525,6 @@ postbuild "$(TargetPath)" 14 - @@ -567,7 +565,6 @@ postbuild "$(TargetPath)" 14 - @@ -610,17 +607,22 @@ postbuild "$(TargetPath)" 14 + + + + + @@ -631,8 +633,8 @@ postbuild "$(TargetPath)" 14 - - + + - - \ No newline at end of file + + diff --git a/build.vc14/dll_mpir_haswell/dll_mpir_haswell.vcxproj.filters b/build.vc14/dll_mpir_haswell/dll_mpir_haswell.vcxproj.filters index 47b0f2ff..63b6f7c4 100644 --- a/build.vc14/dll_mpir_haswell/dll_mpir_haswell.vcxproj.filters +++ b/build.vc14/dll_mpir_haswell/dll_mpir_haswell.vcxproj.filters @@ -1114,9 +1114,6 @@ Source Files\mpn - - Source Files\mpn - Source Files\mpn @@ -1144,12 +1141,6 @@ Source Files\mpn - - Source Files\mpn - - - Source Files\mpn - Source Files\mpn @@ -1300,9 +1291,6 @@ Source Files\mpn - - Source Files\mpn - Source Files\mpn @@ -1423,9 +1411,6 @@ Source Files\mpn - - Source Files\mpn - Source Files\mpn @@ -1548,9 +1533,18 @@ Source Files\mpn\yasm + + Source Files\mpn\yasm + Source Files\mpn\yasm + + Source Files\mpn\yasm + + + Source Files\mpn\yasm + Source Files\mpn\yasm @@ -1569,6 +1563,9 @@ Source Files\mpn\yasm + + Source Files\mpn\yasm + Source Files\mpn\yasm @@ -1581,6 +1578,9 @@ Source Files\mpn\yasm + + Source Files\mpn\yasm + Source Files\mpn\yasm diff --git a/build.vc14/lib_mpir_haswell/lib_mpir_haswell.vcxproj b/build.vc14/lib_mpir_haswell/lib_mpir_haswell.vcxproj index c5c09002..fbd0be25 100644 --- 
a/build.vc14/lib_mpir_haswell/lib_mpir_haswell.vcxproj +++ b/build.vc14/lib_mpir_haswell/lib_mpir_haswell.vcxproj @@ -51,7 +51,8 @@ prebuild haswell x64 14 - USE_WIN64 + + ..\..\mpn\x86_64w\ true $(IntDir)mpn\ @@ -73,7 +74,8 @@ prebuild haswell x64 14 - USE_WIN64 + + ..\..\mpn\x86_64w\ true $(IntDir)mpn\ @@ -449,7 +451,6 @@ postbuild "$(TargetPath)" 14 - @@ -459,8 +460,6 @@ postbuild "$(TargetPath)" 14 - - @@ -511,7 +510,6 @@ postbuild "$(TargetPath)" 14 - @@ -552,7 +550,6 @@ postbuild "$(TargetPath)" 14 - @@ -595,13 +592,29 @@ postbuild "$(TargetPath)" 14 + + USE_WIN64 + USE_WIN64 + + + USE_WIN64 + USE_WIN64 + + + USE_WIN64 + USE_WIN64 + + + USE_WIN64 + USE_WIN64 + USE_WIN64 USE_WIN64 @@ -609,11 +622,13 @@ postbuild "$(TargetPath)" 14 + + USE_WIN64 + USE_WIN64 + - - - - + USE_WIN64 + USE_WIN64 diff --git a/build.vc14/lib_mpir_haswell/lib_mpir_haswell.vcxproj.filters b/build.vc14/lib_mpir_haswell/lib_mpir_haswell.vcxproj.filters index 31860663..fd203d93 100644 --- a/build.vc14/lib_mpir_haswell/lib_mpir_haswell.vcxproj.filters +++ b/build.vc14/lib_mpir_haswell/lib_mpir_haswell.vcxproj.filters @@ -1080,9 +1080,6 @@ Source Files\mpn - - Source Files\mpn - Source Files\mpn @@ -1110,12 +1107,6 @@ Source Files\mpn - - Source Files\mpn - - - Source Files\mpn - Source Files\mpn @@ -1266,9 +1257,6 @@ Source Files\mpn - - Source Files\mpn - Source Files\mpn @@ -1389,9 +1377,6 @@ Source Files\mpn - - Source Files\mpn - Source Files\mpn @@ -1514,9 +1499,18 @@ Source Files\mpn\yasm + + Source Files\mpn\yasm + Source Files\mpn\yasm + + Source Files\mpn\yasm + + + Source Files\mpn\yasm + Source Files\mpn\yasm @@ -1535,6 +1529,9 @@ Source Files\mpn\yasm + + Source Files\mpn\yasm + Source Files\mpn\yasm @@ -1547,6 +1544,9 @@ Source Files\mpn\yasm + + Source Files\mpn\yasm + Source Files\mpn\yasm diff --git a/build.vc14/mpir-tests/cxx.ops/cxx.ops.vcxproj b/build.vc14/mpir-tests/cxx.ops/cxx.ops.vcxproj index 2c73cdcd..807b2a1d 100644 --- a/build.vc14/mpir-tests/cxx.ops/cxx.ops.vcxproj +++ b/build.vc14/mpir-tests/cxx.ops/cxx.ops.vcxproj @@ -168,7 +168,7 @@ check_config $(Platform) $(Configuration) 14 MachineX64 - No + true diff --git a/build.vc15/cdata/mpn/x86_64w/haswell/cfg.h b/build.vc15/cdata/mpn/x86_64w/haswell/cfg.h index 122c4baf..50783988 100644 --- a/build.vc15/cdata/mpn/x86_64w/haswell/cfg.h +++ b/build.vc15/cdata/mpn/x86_64w/haswell/cfg.h @@ -8,12 +8,14 @@ mpn_divexact_byfobm1 mpn_divrem_2 mpn_divrem_euclidean_qr_1 mpn_divrem_euclidean_qr_2 +mpn_lshift mpn_lshift1 mpn_modexact_1_odd mpn_modexact_1c_odd mpn_mul_2 mpn_mulmid_basecase mpn_preinv_divrem_1 +mpn_rshift mpn_rshift1 mpn_sqr_basecase mpn_sub_err1_n diff --git a/build.vc15/dll_mpir_haswell/dll_mpir_haswell.vcxproj b/build.vc15/dll_mpir_haswell/dll_mpir_haswell.vcxproj index f79efb04..8a7f1c9b 100644 --- a/build.vc15/dll_mpir_haswell/dll_mpir_haswell.vcxproj +++ b/build.vc15/dll_mpir_haswell/dll_mpir_haswell.vcxproj @@ -1,4 +1,4 @@ - + @@ -51,7 +51,7 @@ prebuild haswell x64 15 - DLL;USE_WIN64 + DLL ..\..\mpn\x86_64w\ true $(IntDir)mpn\ @@ -62,6 +62,7 @@ prebuild haswell x64 15 + cd ..\..\build.vc postbuild "$(TargetPath)" 15 @@ -75,7 +76,7 @@ prebuild haswell x64 15 - DLL;USE_WIN64 + DLL ..\..\mpn\x86_64w\ true $(IntDir)mpn\ @@ -86,6 +87,7 @@ prebuild haswell x64 15 + cd ..\..\build.vc postbuild "$(TargetPath)" 15 @@ -464,7 +466,6 @@ postbuild "$(TargetPath)" 15 - @@ -526,7 +527,6 @@ postbuild "$(TargetPath)" 15 - @@ -567,7 +567,6 @@ postbuild "$(TargetPath)" 15 - @@ -610,6 +609,7 @@ postbuild "$(TargetPath)" 15 + @@ -617,10 +617,12 @@ 
postbuild "$(TargetPath)" 15 + + @@ -631,8 +633,8 @@ postbuild "$(TargetPath)" 15 - - + + - - \ No newline at end of file + + diff --git a/build.vc15/dll_mpir_haswell/dll_mpir_haswell.vcxproj.filters b/build.vc15/dll_mpir_haswell/dll_mpir_haswell.vcxproj.filters index 47b0f2ff..1660d33c 100644 --- a/build.vc15/dll_mpir_haswell/dll_mpir_haswell.vcxproj.filters +++ b/build.vc15/dll_mpir_haswell/dll_mpir_haswell.vcxproj.filters @@ -1114,9 +1114,6 @@ Source Files\mpn - - Source Files\mpn - Source Files\mpn @@ -1300,9 +1297,6 @@ Source Files\mpn - - Source Files\mpn - Source Files\mpn @@ -1423,9 +1417,6 @@ Source Files\mpn - - Source Files\mpn - Source Files\mpn @@ -1548,6 +1539,9 @@ Source Files\mpn\yasm + + Source Files\mpn\yasm + Source Files\mpn\yasm @@ -1569,6 +1563,9 @@ Source Files\mpn\yasm + + Source Files\mpn\yasm + Source Files\mpn\yasm @@ -1581,6 +1578,9 @@ Source Files\mpn\yasm + + Source Files\mpn\yasm + Source Files\mpn\yasm diff --git a/build.vc15/lib_mpir_haswell/lib_mpir_haswell.vcxproj b/build.vc15/lib_mpir_haswell/lib_mpir_haswell.vcxproj index 69420a7e..561ef482 100644 --- a/build.vc15/lib_mpir_haswell/lib_mpir_haswell.vcxproj +++ b/build.vc15/lib_mpir_haswell/lib_mpir_haswell.vcxproj @@ -1,4 +1,4 @@ - + @@ -51,7 +51,7 @@ prebuild haswell x64 15 - USE_WIN64 + ..\..\mpn\x86_64w\ true $(IntDir)mpn\ @@ -60,6 +60,7 @@ prebuild haswell x64 15 ..\..\ NDEBUG;WIN32;_LIB;HAVE_CONFIG_H;_WIN64;%(PreprocessorDefinitions) + cd ..\..\build.vc postbuild "$(TargetPath)" 15 @@ -73,7 +74,7 @@ prebuild haswell x64 15 - USE_WIN64 + ..\..\mpn\x86_64w\ true $(IntDir)mpn\ @@ -82,6 +83,7 @@ prebuild haswell x64 15 ..\..\ _DEBUG;WIN32;_LIB;HAVE_CONFIG_H;_WIN64;%(PreprocessorDefinitions) + cd ..\..\build.vc postbuild "$(TargetPath)" 15 @@ -449,7 +451,6 @@ postbuild "$(TargetPath)" 15 - @@ -511,7 +512,6 @@ postbuild "$(TargetPath)" 15 - @@ -552,7 +552,6 @@ postbuild "$(TargetPath)" 15 - @@ -595,6 +594,7 @@ postbuild "$(TargetPath)" 15 + @@ -602,10 +602,12 @@ postbuild "$(TargetPath)" 15 + + @@ -616,8 +618,8 @@ postbuild "$(TargetPath)" 15 - - + + - - \ No newline at end of file + + diff --git a/build.vc15/lib_mpir_haswell/lib_mpir_haswell.vcxproj.filters b/build.vc15/lib_mpir_haswell/lib_mpir_haswell.vcxproj.filters index 31860663..21bf6302 100644 --- a/build.vc15/lib_mpir_haswell/lib_mpir_haswell.vcxproj.filters +++ b/build.vc15/lib_mpir_haswell/lib_mpir_haswell.vcxproj.filters @@ -1080,9 +1080,6 @@ Source Files\mpn - - Source Files\mpn - Source Files\mpn @@ -1266,9 +1263,6 @@ Source Files\mpn - - Source Files\mpn - Source Files\mpn @@ -1389,9 +1383,6 @@ Source Files\mpn - - Source Files\mpn - Source Files\mpn @@ -1514,6 +1505,9 @@ Source Files\mpn\yasm + + Source Files\mpn\yasm + Source Files\mpn\yasm @@ -1535,6 +1529,9 @@ Source Files\mpn\yasm + + Source Files\mpn\yasm + Source Files\mpn\yasm @@ -1547,6 +1544,9 @@ Source Files\mpn\yasm + + Source Files\mpn\yasm + Source Files\mpn\yasm diff --git a/build.vc15/mpir.sln b/build.vc15/mpir.sln index 3f54c2f5..8413ca83 100644 --- a/build.vc15/mpir.sln +++ b/build.vc15/mpir.sln @@ -1,6 +1,7 @@ + Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio 15 -VisualStudioVersion = 15.0.25914.0 +VisualStudioVersion = 15.0 MinimumVisualStudioVersion = 10.0.40219.1 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "lib_mpir_k8", "lib_mpir_k8\lib_mpir_k8.vcxproj", "{22D14BC4-ADC6-48C6-8BA1-4AC45799B945}" EndProject @@ -44,74 +45,74 @@ Global Release|x64 = Release|x64 EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution 
- {22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Debug|Win32.ActiveCfg = Debug|x64 - {22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Debug|x64.ActiveCfg = Debug|x64 - {22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Release|Win32.ActiveCfg = Release|x64 + {22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Release|Win32.ActiveCfg = Release|Win32 {22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Release|x64.ActiveCfg = Release|x64 - {F4418981-ABAA-4B7A-9538-3E0166149F28}.Debug|Win32.ActiveCfg = Debug|Win32 - {F4418981-ABAA-4B7A-9538-3E0166149F28}.Debug|x64.ActiveCfg = Debug|Win32 + {22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Debug|Win32.ActiveCfg = Debug|Win32 + {22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Debug|x64.ActiveCfg = Debug|x64 {F4418981-ABAA-4B7A-9538-3E0166149F28}.Release|Win32.ActiveCfg = Release|Win32 - {F4418981-ABAA-4B7A-9538-3E0166149F28}.Release|x64.ActiveCfg = Release|Win32 - {52C97475-88BE-437F-BC53-D3E1205EA08B}.Debug|Win32.ActiveCfg = Debug|Win32 - {52C97475-88BE-437F-BC53-D3E1205EA08B}.Debug|x64.ActiveCfg = Debug|Win32 + {F4418981-ABAA-4B7A-9538-3E0166149F28}.Release|x64.ActiveCfg = Release|x64 + {F4418981-ABAA-4B7A-9538-3E0166149F28}.Debug|Win32.ActiveCfg = Debug|Win32 + {F4418981-ABAA-4B7A-9538-3E0166149F28}.Debug|x64.ActiveCfg = Debug|x64 {52C97475-88BE-437F-BC53-D3E1205EA08B}.Release|Win32.ActiveCfg = Release|Win32 - {52C97475-88BE-437F-BC53-D3E1205EA08B}.Release|x64.ActiveCfg = Release|Win32 - {F4416466-C0B3-4105-B391-CC77FB7A0231}.Debug|Win32.ActiveCfg = Debug|x64 - {F4416466-C0B3-4105-B391-CC77FB7A0231}.Debug|x64.ActiveCfg = Debug|x64 - {F4416466-C0B3-4105-B391-CC77FB7A0231}.Release|Win32.ActiveCfg = Release|x64 + {52C97475-88BE-437F-BC53-D3E1205EA08B}.Release|x64.ActiveCfg = Release|x64 + {52C97475-88BE-437F-BC53-D3E1205EA08B}.Debug|Win32.ActiveCfg = Debug|Win32 + {52C97475-88BE-437F-BC53-D3E1205EA08B}.Debug|x64.ActiveCfg = Debug|x64 + {F4416466-C0B3-4105-B391-CC77FB7A0231}.Release|Win32.ActiveCfg = Release|Win32 {F4416466-C0B3-4105-B391-CC77FB7A0231}.Release|x64.ActiveCfg = Release|x64 - {EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Debug|Win32.ActiveCfg = Debug|Win32 - {EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Debug|x64.ActiveCfg = Debug|x64 + {F4416466-C0B3-4105-B391-CC77FB7A0231}.Debug|Win32.ActiveCfg = Debug|Win32 + {F4416466-C0B3-4105-B391-CC77FB7A0231}.Debug|x64.ActiveCfg = Debug|x64 {EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Release|Win32.ActiveCfg = Release|Win32 {EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Release|x64.ActiveCfg = Release|x64 - {D058893B-87A8-4161-8821-FA5707504B2C}.Debug|Win32.ActiveCfg = Debug|x64 - {D058893B-87A8-4161-8821-FA5707504B2C}.Debug|x64.ActiveCfg = Debug|x64 - {D058893B-87A8-4161-8821-FA5707504B2C}.Release|Win32.ActiveCfg = Release|x64 + {EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Debug|Win32.ActiveCfg = Debug|Win32 + {EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Debug|x64.ActiveCfg = Debug|x64 + {D058893B-87A8-4161-8821-FA5707504B2C}.Release|Win32.ActiveCfg = Release|Win32 {D058893B-87A8-4161-8821-FA5707504B2C}.Release|x64.ActiveCfg = Release|x64 - {4A742B65-9836-4F46-8310-728F046A31C1}.Debug|Win32.ActiveCfg = Debug|Win32 - {4A742B65-9836-4F46-8310-728F046A31C1}.Debug|x64.ActiveCfg = Debug|x64 + {D058893B-87A8-4161-8821-FA5707504B2C}.Debug|Win32.ActiveCfg = Debug|Win32 + {D058893B-87A8-4161-8821-FA5707504B2C}.Debug|x64.ActiveCfg = Debug|x64 {4A742B65-9836-4F46-8310-728F046A31C1}.Release|Win32.ActiveCfg = Release|Win32 {4A742B65-9836-4F46-8310-728F046A31C1}.Release|x64.ActiveCfg = Release|x64 - {935C1E48-A26F-4E73-B8EA-B163F21E833B}.Debug|Win32.ActiveCfg = Debug|x64 - 
{935C1E48-A26F-4E73-B8EA-B163F21E833B}.Debug|x64.ActiveCfg = Debug|x64 - {935C1E48-A26F-4E73-B8EA-B163F21E833B}.Release|Win32.ActiveCfg = Release|x64 + {4A742B65-9836-4F46-8310-728F046A31C1}.Debug|Win32.ActiveCfg = Debug|Win32 + {4A742B65-9836-4F46-8310-728F046A31C1}.Debug|x64.ActiveCfg = Debug|x64 + {935C1E48-A26F-4E73-B8EA-B163F21E833B}.Release|Win32.ActiveCfg = Release|Win32 {935C1E48-A26F-4E73-B8EA-B163F21E833B}.Release|x64.ActiveCfg = Release|x64 - {49DD6B75-0009-40DC-BE9E-383C565494C8}.Debug|Win32.ActiveCfg = Debug|x64 - {49DD6B75-0009-40DC-BE9E-383C565494C8}.Debug|x64.ActiveCfg = Debug|x64 - {49DD6B75-0009-40DC-BE9E-383C565494C8}.Release|Win32.ActiveCfg = Release|x64 + {935C1E48-A26F-4E73-B8EA-B163F21E833B}.Debug|Win32.ActiveCfg = Debug|Win32 + {935C1E48-A26F-4E73-B8EA-B163F21E833B}.Debug|x64.ActiveCfg = Debug|x64 + {49DD6B75-0009-40DC-BE9E-383C565494C8}.Release|Win32.ActiveCfg = Release|Win32 {49DD6B75-0009-40DC-BE9E-383C565494C8}.Release|x64.ActiveCfg = Release|x64 - {D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Debug|Win32.ActiveCfg = Debug|Win32 - {D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Debug|x64.ActiveCfg = Debug|x64 + {49DD6B75-0009-40DC-BE9E-383C565494C8}.Debug|Win32.ActiveCfg = Debug|Win32 + {49DD6B75-0009-40DC-BE9E-383C565494C8}.Debug|x64.ActiveCfg = Debug|x64 {D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Release|Win32.ActiveCfg = Release|Win32 {D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Release|x64.ActiveCfg = Release|x64 - {EC599438-E241-49B5-ACA5-CB897F8041B5}.Debug|Win32.ActiveCfg = Debug|x64 - {EC599438-E241-49B5-ACA5-CB897F8041B5}.Debug|x64.ActiveCfg = Debug|x64 - {EC599438-E241-49B5-ACA5-CB897F8041B5}.Release|Win32.ActiveCfg = Release|x64 + {D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Debug|Win32.ActiveCfg = Debug|Win32 + {D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Debug|x64.ActiveCfg = Debug|x64 + {EC599438-E241-49B5-ACA5-CB897F8041B5}.Release|Win32.ActiveCfg = Release|Win32 {EC599438-E241-49B5-ACA5-CB897F8041B5}.Release|x64.ActiveCfg = Release|x64 - {87821166-8F44-41D1-AE5B-151CD73C96D3}.Debug|Win32.ActiveCfg = Debug|x64 - {87821166-8F44-41D1-AE5B-151CD73C96D3}.Debug|x64.ActiveCfg = Debug|x64 - {87821166-8F44-41D1-AE5B-151CD73C96D3}.Release|Win32.ActiveCfg = Release|x64 + {EC599438-E241-49B5-ACA5-CB897F8041B5}.Debug|Win32.ActiveCfg = Debug|Win32 + {EC599438-E241-49B5-ACA5-CB897F8041B5}.Debug|x64.ActiveCfg = Debug|x64 + {87821166-8F44-41D1-AE5B-151CD73C96D3}.Release|Win32.ActiveCfg = Release|Win32 {87821166-8F44-41D1-AE5B-151CD73C96D3}.Release|x64.ActiveCfg = Release|x64 - {3D01F893-119C-4089-A1EA-5645E8C4F366}.Debug|Win32.ActiveCfg = Debug|x64 - {3D01F893-119C-4089-A1EA-5645E8C4F366}.Debug|x64.ActiveCfg = Debug|x64 - {3D01F893-119C-4089-A1EA-5645E8C4F366}.Release|Win32.ActiveCfg = Release|x64 + {87821166-8F44-41D1-AE5B-151CD73C96D3}.Debug|Win32.ActiveCfg = Debug|Win32 + {87821166-8F44-41D1-AE5B-151CD73C96D3}.Debug|x64.ActiveCfg = Debug|x64 + {3D01F893-119C-4089-A1EA-5645E8C4F366}.Release|Win32.ActiveCfg = Release|Win32 {3D01F893-119C-4089-A1EA-5645E8C4F366}.Release|x64.ActiveCfg = Release|x64 - {5E4AA504-9B77-4907-A93F-982FCED077EC}.Debug|Win32.ActiveCfg = Debug|x64 - {5E4AA504-9B77-4907-A93F-982FCED077EC}.Debug|x64.ActiveCfg = Debug|x64 - {5E4AA504-9B77-4907-A93F-982FCED077EC}.Release|Win32.ActiveCfg = Release|x64 + {3D01F893-119C-4089-A1EA-5645E8C4F366}.Debug|Win32.ActiveCfg = Debug|Win32 + {3D01F893-119C-4089-A1EA-5645E8C4F366}.Debug|x64.ActiveCfg = Debug|x64 + {5E4AA504-9B77-4907-A93F-982FCED077EC}.Release|Win32.ActiveCfg = Release|Win32 
{5E4AA504-9B77-4907-A93F-982FCED077EC}.Release|x64.ActiveCfg = Release|x64 - {51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Debug|Win32.ActiveCfg = Debug|x64 - {51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Debug|x64.ActiveCfg = Debug|x64 - {51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Release|Win32.ActiveCfg = Release|x64 + {5E4AA504-9B77-4907-A93F-982FCED077EC}.Debug|Win32.ActiveCfg = Debug|Win32 + {5E4AA504-9B77-4907-A93F-982FCED077EC}.Debug|x64.ActiveCfg = Debug|x64 + {51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Release|Win32.ActiveCfg = Release|Win32 {51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Release|x64.ActiveCfg = Release|x64 - {A6F0116E-48CD-4FD5-8F38-3DF1BE5286AE}.Debug|Win32.ActiveCfg = Debug|x64 - {A6F0116E-48CD-4FD5-8F38-3DF1BE5286AE}.Debug|x64.ActiveCfg = Debug|x64 - {A6F0116E-48CD-4FD5-8F38-3DF1BE5286AE}.Release|Win32.ActiveCfg = Release|x64 + {51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Debug|Win32.ActiveCfg = Debug|Win32 + {51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Debug|x64.ActiveCfg = Debug|x64 + {A6F0116E-48CD-4FD5-8F38-3DF1BE5286AE}.Release|Win32.ActiveCfg = Release|Win32 {A6F0116E-48CD-4FD5-8F38-3DF1BE5286AE}.Release|x64.ActiveCfg = Release|x64 - {A4D87C88-2541-473D-A2F3-CB4F37A3841F}.Debug|Win32.ActiveCfg = Debug|x64 - {A4D87C88-2541-473D-A2F3-CB4F37A3841F}.Debug|x64.ActiveCfg = Debug|x64 - {A4D87C88-2541-473D-A2F3-CB4F37A3841F}.Release|Win32.ActiveCfg = Release|x64 + {A6F0116E-48CD-4FD5-8F38-3DF1BE5286AE}.Debug|Win32.ActiveCfg = Debug|Win32 + {A6F0116E-48CD-4FD5-8F38-3DF1BE5286AE}.Debug|x64.ActiveCfg = Debug|x64 + {A4D87C88-2541-473D-A2F3-CB4F37A3841F}.Release|Win32.ActiveCfg = Release|Win32 {A4D87C88-2541-473D-A2F3-CB4F37A3841F}.Release|x64.ActiveCfg = Release|x64 + {A4D87C88-2541-473D-A2F3-CB4F37A3841F}.Debug|Win32.ActiveCfg = Debug|Win32 + {A4D87C88-2541-473D-A2F3-CB4F37A3841F}.Debug|x64.ActiveCfg = Debug|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/mpn/x86_64w/haswell/add_n.asm b/mpn/x86_64w/haswell/add_n.asm new file mode 100644 index 00000000..cac4b8cc --- /dev/null +++ b/mpn/x86_64w/haswell/add_n.asm @@ -0,0 +1,111 @@ +; PROLOGUE(mpn_add_n) + +; Version 1.0.3. +; +; Copyright 2008 Jason Moxham +; +; Windows Conversion Copyright 2008 Brian Gladman +; +; This file is part of the MPIR Library. +; The MPIR Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published +; by the Free Software Foundation; either version 2.1 of the License, or (at +; your option) any later version. +; The MPIR Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. +; You should have received a copy of the GNU Lesser General Public License +; along with the MPIR Library; see the file COPYING.LIB. If not, write +; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, +; Boston, MA 02110-1301, USA. +; +; Calculate src1[size] plus(minus) src2[size] and store the result in +; dst[size]. The return value is the carry bit from the top of the result +; (1 or 0). The _nc version accepts 1 or 0 for an initial carry into the +; low limb of the calculation. Note values other than 1 or 0 here will +; lead to garbage results. 
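+;
+; For example (illustrative values only): with size = 2, src1 = { ~0, ~0 }
+; (both limbs all ones) and src2 = { 1, 0 },
+;   mpn_add_n(dst, src1, src2, 2)     gives dst = { 0, 0 } and returns 1
+;   mpn_add_nc(dst, src1, src2, 2, 1) gives dst = { 1, 0 } and returns 1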
+; +; mp_limb_t mpn_add_n(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t) +; mp_limb_t mpn_add_nc(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t, mp_limb_t) +; rax rdi rsi rdx rcx r8 +; rax rcx rdx r8 r9 [rsp+40] + +%include "yasm_mac.inc" + + CPU Athlon64 + BITS 64 + + xalign 8 + LEAF_PROC mpn_add_nc + mov r10,[rsp+40] + jmp entry + + xalign 8 + LEAF_PROC mpn_add_n + xor r10, r10 +entry: + mov rax, r9 + and rax, 3 + shr r9, 2 + lea r9,[r10+r9*2] + sar r9, 1 + jnz .2 + + mov r10, [rdx] + adc r10, [r8] + mov [rcx], r10 + dec rax + jz .1 + mov r10, [rdx+8] + adc r10, [r8+8] + mov [rcx+8], r10 + dec rax + jz .1 + mov r10, [rdx+16] + adc r10, [r8+16] + mov [rcx+16], r10 + dec rax +.1: adc rax, rax + ret + + xalign 8 +.2: mov r10, [rdx] + mov r11, [rdx+8] + lea rdx, [rdx+32] + adc r10, [r8] + adc r11, [r8+8] + lea r8, [r8+32] + mov [rcx], r10 + mov [rcx+8], r11 + lea rcx, [rcx+32] + mov r10, [rdx-16] + mov r11, [rdx-8] + adc r10, [r8-16] + adc r11, [r8-8] + mov [rcx-16], r10 + dec r9 + mov [rcx-8], r11 + jnz .2 + + inc rax + dec rax + jz .3 + mov r10, [rdx] + adc r10, [r8] + mov [rcx], r10 + dec rax + jz .3 + mov r10, [rdx+8] + adc r10, [r8+8] + mov [rcx+8], r10 + dec rax + jz .3 + mov r10, [rdx+16] + adc r10, [r8+16] + mov [rcx+16], r10 + dec rax +.3: adc rax, rax + ret + + end diff --git a/mpn/x86_64w/haswell/copyd.asm b/mpn/x86_64w/haswell/copyd.asm new file mode 100644 index 00000000..1eac1ca6 --- /dev/null +++ b/mpn/x86_64w/haswell/copyd.asm @@ -0,0 +1,203 @@ + +; Copyright 2016 Jens Nurmann and Alexander Kruppa + +; This file is part of the MPIR Library. + +; The MPIR Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published +; by the Free Software Foundation; either version 2.1 of the License, or (at +; your option) any later version. + +; The MPIR Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the MPIR Library; see the file COPYING.LIB. If not, write +; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, +; Boston, MA 02110-1301, USA. + + +; mpn_copyd(mp_ptr Op2, mp_srcptr Op1, mp_size_t Size1) +; Linux RDI RSI RDX +; Win7 RCX RDX R8 +; +; Description: +; The function copies a given number of limb from source to destination (while +; moving high to low in memory) and hands back the size (in limb) of the +; destination. +; +; Result: +; - Op2[ 0..size-1 ] = Op1[ 0..size-1 ] +; - number of copied limb: range [ 0..max tCounter ] +; +; Caveats: +; - if size 0 is given the content of the destination will remain untouched! +; - if Op1=Op2 no copy is done! +; +; Comments: +; - AVX-based version implemented, tested & benched on 05.01.2016 by jn +; - did some experiments with AVX based version with following results +; - AVX can be faster in L1$-L3$ if destination is aligned on 32 byte +; - AVX is generally faster on small sized operands (<=100 limb) due too +; start-up overhead of "rep movsq" - however this could also be achieved by +; simple copy loop +; - startup overhead of "rep movsq" with negative direction is 200 cycles!!! +; - negative direction is unfavourable compared to positive "rep movsq" and +; to AVX. 
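+;
+; Outline of the code below: at most three limbs are copied with plain MOV
+; until the destination is aligned for the 32-byte VMOVDQA stores, the main
+; AVX loop then moves 16 limbs (128 bytes) per iteration through four YMM
+; registers, and the remaining 0..15 limbs are handled in 8/4/2/1 limb steps.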
+ +%include 'yasm_mac.inc' + +BITS 64 + +%ifdef USE_WIN64 + %define Op2 RCX + %define Op1 RDX + %define Size1 R8 + %define Limb R9 + %define Offs R10 +%else + %define Op2 RDI + %define Op1 RSI + %define Size1 RDX + %define Limb RCX + %define Offs R10 +%endif + +%define DLimb0 XMM0 +%define QLimb0 YMM0 +%define QLimb1 YMM1 +%define QLimb2 YMM2 +%define QLimb3 YMM3 + + align 32 + +LEAF_PROC mpn_copyd + mov RAX, Size1 + cmp Op1, Op2 + je .Exit ; no copy required => + + or RAX, RAX + je .Exit ; Size=0 => + + lea Op1, [Op1+8*Size1-8] + lea Op2, [Op2+8*Size1-8] + + ; align the destination (Op2) to 32 byte + test Op2, 8 + jne .lCpyDecA32 + + mov Limb, [Op1] + mov [Op2], Limb + dec Size1 + je .Exit + + sub Op1, 8 + sub Op2, 8 + + .lCpyDecA32: + + test Op2, 16 + jnz .lCpyDecAVX + + mov Limb, [Op1] + mov [Op2], Limb + dec Size1 + je .Exit + + mov Limb, [Op1-8] + mov [Op2-8], Limb + dec Size1 + je .Exit + + sub Op1, 16 + sub Op2, 16 + + .lCpyDecAVX: + + mov Offs, 128 + jmp .lCpyDecAVXCheck + + ; main loop (prefetching disabled; unloaded cache) + ; - 0.30 cycles / limb in L1$ + ; - 0.60 cycles / limb in L2$ + ; - 0.70-0.90 cycles / limb in L3$ + align 16 + .lCpyDecAVXLoop: + + vmovdqu QLimb0, [Op1-24] + vmovdqu QLimb1, [Op1-56] + vmovdqu QLimb2, [Op1-88] + vmovdqu QLimb3, [Op1-120] + vmovdqa [Op2-24], QLimb0 + vmovdqa [Op2-56], QLimb1 + vmovdqa [Op2-88], QLimb2 + vmovdqa [Op2-120], QLimb3 + + sub Op1, Offs + sub Op2, Offs + + .lCpyDecAVXCheck: + + sub Size1, 16 + jnc .lCpyDecAVXLoop + + add Size1, 16 + je .Exit ; AVX copied operand fully => + + ; copy remaining max. 15 limb + test Size1, 8 + je .lCpyDecFour + + vmovdqu QLimb0, [Op1-24] + vmovdqu QLimb1, [Op1-56] + vmovdqa [Op2-24], QLimb0 + vmovdqa [Op2-56], QLimb1 + + sub Op1, 64 + sub Op2, 64 + + .lCpyDecFour: + + test Size1, 4 + je .lCpyDecTwo + + vmovdqu QLimb0, [Op1-24] + vmovdqa [Op2-24], QLimb0 + + sub Op1, 32 + sub Op2, 32 + + .lCpyDecTwo: + + test Size1, 2 + je .lCpyDecOne + +%if 1 + ; Avoid SSE2 instruction due to stall on Haswell + mov Limb, [Op1] + mov [Op2], Limb + mov Limb, [Op1-8] + mov [Op2-8], Limb +%else + movdqu DLimb0, [Op1-8] + movdqa [Op2-8], DLimb0 +%endif + + sub Op1, 16 + sub Op2, 16 + + .lCpyDecOne: + + test Size1, 1 + je .Exit + + mov Limb, [Op1] + mov [Op2], Limb + + .Exit: + + vzeroupper + ret +.end: diff --git a/mpn/x86_64w/haswell/copyi.asm b/mpn/x86_64w/haswell/copyi.asm new file mode 100644 index 00000000..97f30df1 --- /dev/null +++ b/mpn/x86_64w/haswell/copyi.asm @@ -0,0 +1,199 @@ + +; Copyright 2016 Jens Nurmann and Alexander Kruppa + +; This file is part of the MPIR Library. + +; The MPIR Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published +; by the Free Software Foundation; either version 2.1 of the License, or (at +; your option) any later version. + +; The MPIR Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the MPIR Library; see the file COPYING.LIB. If not, write +; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, +; Boston, MA 02110-1301, USA. 
+ +; mpn_copyi(mp_ptr Op2, mp_srcptr Op1, mp_size_t Size1) +; Linux RDI RSI RDX +; Win7 RCX RDX R8 +; +; Description: +; The function copies a given number of limb from source to destination (while +; moving low to high in memory) and hands back the size (in limb) of the +; destination. +; +; Result: +; - Op2[ 0..size-1 ] = Op1[ 0..size-1 ] +; - number of copied limb: range [ 0..max tCounter ] +; +; Caveats: +; - if size 0 is given the content of the destination will remain untouched! +; - if Op1=Op2 no copy is done! +; +; Comments: +; - AVX-based version implemented, tested & benched on 05.01.2016 by jn +; - did some experiments with AVX based version with following results +; - AVX can be faster in L1$ (30%), L2$ (10%) if dest. is aligned on 32 byte +; - AVX is generally faster on small sized operands (<=100 limb) due too +; start-up overhead of "rep movsq" - however this could also be achieved by +; simple copy loop +; - the break-even between AVX and "rep movsq" is around 10,000 limb +; - the prologue & epilogue can still be optimized! + +%include 'yasm_mac.inc' + +BITS 64 + +%ifdef USE_WIN64 + %define Op2 RCX + %define Op1 RDX + %define Size1 R8 + %define Limb R9 + %define Offs R10 +%else + %define Op2 RDI + %define Op1 RSI + %define Size1 RDX + %define Limb RCX + %define Offs R10 +%endif + +%define DLimb0 XMM0 +%define QLimb0 YMM0 +%define QLimb1 YMM1 +%define QLimb2 YMM2 +%define QLimb3 YMM3 + + align 32 + +LEAF_PROC mpn_copyi + mov RAX, Size1 + cmp Op1, Op2 + je .Exit ; no copy required => + + or RAX, RAX + je .Exit ; size=0 => + + ; align the destination (Op2) to 32 byte + test Op2, 8 + je .lCpyIncA32 + + mov Limb, [Op1] + mov [Op2], Limb + dec Size1 + je .Exit + + add Op1, 8 + add Op2, 8 + + .lCpyIncA32: + + test Op2, 16 + je .lCpyIncAVX + + mov Limb, [Op1] + mov [Op2], Limb + dec Size1 + je .Exit + + mov Limb, [Op1+8] + mov [Op2+8], Limb + dec Size1 + je .Exit + + add Op1, 16 + add Op2, 16 + + .lCpyIncAVX: + + mov Offs, 128 + jmp .lCpyIncAVXCheck + + ; main loop (prefetching disabled; unloaded cache) + ; - lCpyInc is slightly slower than lCpyDec through all cache levels?! + ; - 0.30 cycles / limb in L1$ + ; - 0.60 cycles / limb in L2$ + ; - 0.70-0.90 cycles / limb in L3$ + align 16 + .lCpyIncAVXLoop: + + vmovdqu QLimb0, [Op1] + vmovdqu QLimb1, [Op1+32] + vmovdqu QLimb2, [Op1+64] + vmovdqu QLimb3, [Op1+96] + vmovdqa [Op2], QLimb0 + vmovdqa [Op2+32], QLimb1 + vmovdqa [Op2+64], QLimb2 + vmovdqa [Op2+96], QLimb3 + + add Op1, Offs + add Op2, Offs + + .lCpyIncAVXCheck: + + sub Size1, 16 + jnc .lCpyIncAVXLoop + + add Size1, 16 + je .Exit ; AVX copied operand fully => + + ; copy remaining max. 
15 limb + test Size1, 8 + je .lCpyIncFour + + vmovdqu QLimb0, [Op1] + vmovdqu QLimb1, [Op1+32] + vmovdqa [Op2], QLimb0 + vmovdqa [Op2+32], QLimb1 + + add Op1, 64 + add Op2, 64 + + .lCpyIncFour: + + test Size1, 4 + je .lCpyIncTwo + + vmovdqu QLimb0, [Op1] + vmovdqa [Op2], QLimb0 + + add Op1, 32 + add Op2, 32 + + .lCpyIncTwo: + + test Size1, 2 + je .lCpyIncOne + +%if 1 + ; Avoid SSE2 instruction due to stall on Haswell + mov Limb, [Op1] + mov [Op2], Limb + mov Limb, [Op1+8] + mov [Op2+8], Limb +%else + movdqu DLimb0, [Op1] + movdqa [Op2], DLimb0 +%endif + + add Op1, 16 + add Op2, 16 + + .lCpyIncOne: + + test Size1, 1 + je .Exit + + mov Limb, [Op1] + mov [Op2], Limb + + .Exit: + + vzeroupper + ret +.end: diff --git a/mpn/x86_64w/haswell/lshift.asm b/mpn/x86_64w/haswell/lshift.asm new file mode 100644 index 00000000..fbfedc4a --- /dev/null +++ b/mpn/x86_64w/haswell/lshift.asm @@ -0,0 +1,285 @@ + +; Copyright 2016 Jens Nurmann and Alexander Kruppa + +; This file is part of the MPIR Library. + +; The MPIR Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published +; by the Free Software Foundation; either version 2.1 of the License, or (at +; your option) any later version. + +; The MPIR Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the MPIR Library; see the file COPYING.LIB. If not, write +; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, +; Boston, MA 02110-1301, USA. + +; mp_limb_t mpn_lshift(mp_ptr Op2, mp_srcptr Op1, mp_size_t Size1, unsigned int Shift) +; Linux RAX RDI RSI RDX RCX +; Win7 RAX RCX RDX R8 R9 +; +; Description: +; The function shifts Op1 left by n bit, stores the result in Op2 (non- +; destructive shl) and hands back the shifted-out most significant bits of Op1. +; The function operates decreasing in memory supporting in-place operation. +; +; Result: +; - Op2[ Size1-1..0 ] := ( Op1[ Size1-1..0 ]:ShlIn ) << 1 +; - Op1[ 0 ] >> 63 +; +; Caveats: +; - caller must ensure that Shift is in [ 1..63 ]! +; - currently Linux64 support only! 
+; - the AVX version uses mnemonics only available on Haswell, Broadwell and +; Skylake cores +; - the behaviour of cache prefetching in combination with AVX shifting seems +; somewhat erratic +; - slight (a few clock cycles) degradation for 1/2 LD1$ sizes +; - slight (a few percent) improvement for full LD1$ sizes +; - substantial (>10%) improvement for 1/2 LD2$ sizes +; - slight (a few percent) improvement for full LD2$ sizes +; - slight (a few percent) degradation for 1/2 LD3$ sizes +; - substantial (around 10%) degradation for full LD3$ sizes +; +; Comments: +; - implemented, tested and benched on 31.03.2016 by jn +; - includes prefetching +; ============================================================================ + +%include 'yasm_mac.inc' + +BITS 64 + +%ifdef USE_WIN64 + %define Op2 R11 + %define Op1 RDX + %define Size1 R8 + %define Shift RCX + %define Limb1 R9 + %define Limb2 R10 + %ifdef USE_PREFETCH + %define Offs -512 ; No caller-saves regs left, use immediate + %endif + %define reg_save_list XMM, 6, 7 +%else + %define Op2 RDI + %define Op1 RSI + %define Size1 RDX + %define Shift RCX + %define Limb1 R8 + %define Limb2 R9 + %ifdef USE_PREFETCH + %define OFFS_REG 1 + %define Offs R10 + %endif +%endif + +%define ShlDL0 XMM2 ; Attn: this must match ShlQL0 definition +%define ShrDL0 XMM3 ; Attn: this must match ShrQL0 definition +%define ShlDLCnt XMM6 ; Attn: this must match ShlQlCnt definition +%define ShrDLCnt XMM7 ; Attn: this must match ShrQlCnt definition + +%define QLimb0 YMM0 +%define QLimb1 YMM1 +%define ShlQL0 YMM2 +%define ShrQL0 YMM3 +%define ShlQL1 YMM4 +%define ShrQL1 YMM5 +%define ShlQLCnt YMM6 +%define ShrQLCnt YMM7 + + align 32 +FRAME_PROC mpn_lshift, 0, reg_save_list +%ifdef USE_WIN64 + mov r11, rcx + mov rcx, r9 +%endif + xor EAX, EAX + sub Size1, 1 + jc .Exit ; Size1=0 => + + lea Op1, [Op1+8*Size1] + lea Op2, [Op2+8*Size1] + + mov Limb1, [Op1] + shld RAX, Limb1, CL + + or Size1, Size1 + je .lShlEquPost ; Size1=1 => + + %ifdef USE_PREFETCH + %ifdef OFFS_REG + mov Offs, -512 + %endif + %endif + + cmp Size1, 8 + jc .lShlEquFour ; AVX inefficient => + + ; first align Op2 to 32 bytes + test Op2, 8 + jne .lShlEquA16 + + mov Limb2, [Op1-8] + shld Limb1, Limb2, CL + mov [Op2], Limb1 + mov Limb1, Limb2 + + sub Op1, 8 + sub Op2, 8 + sub Size1, 1 + + .lShlEquA16: + + test Op2, 16 + jne .lShlEquAVX + + mov Limb2, [Op1-8] + shld Limb1, Limb2, CL + mov [Op2], Limb1 + mov Limb1, [Op1-16] + shld Limb2, Limb1, CL + mov [Op2-8], Limb2 + + sub Op1, 16 + sub Op2, 16 + sub Size1, 2 + + .lShlEquAVX: + + ; initialize AVX shift counter + vmovq ShlDLCnt, RCX + neg RCX + and RCX, 63 ; must do, as AVX shifts set result=0 if Shift>63! + vmovq ShrDLCnt, RCX + neg RCX + and RCX, 63 ; must do, as AVX shifts set result=0 if Shift>63! 
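+        ; RCX again holds the original Shift here; ShlDLCnt = Shift and
+        ; ShrDLCnt = 64-Shift, and the broadcasts below replicate these into
+        ; all four 64-bit lanes so that VPSLLVQ/VPSRLVQ can merge each limb
+        ; with the bits shifted in from its lower neighbour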
+ vpbroadcastq ShlQLCnt, ShlDLCnt + vpbroadcastq ShrQLCnt, ShrDLCnt + + ; pre-fetch first quad-limb + vmovdqu QLimb0, [Op1-24] + vpsrlvq ShrQL0, QLimb0, ShrQLCnt + vpermq ShrQL0, ShrQL0, 10010011b + + sub Op1, 32 + sub Size1, 4 + jmp .lShlEquAVXCheck + + ; main loop (prefetching enabled; unloaded cache) + ; - 0.60 cycles per limb in LD1$ + ; - 0.60-0.70 cycles per limb in LD2$ + ; - 0.70-0.90 cycles per limb in LD3$ + align 16 + .lShlEquAVXLoop: + + %ifdef USE_PREFETCH + prefetchnta [Op1+Offs] + %endif + + vmovdqu QLimb1, [Op1-24] + vpsllvq ShlQL0, QLimb0, ShlQLCnt + vmovdqu QLimb0, [Op1-56] + vpsrlvq ShrQL1, QLimb1, ShrQLCnt + vpermq ShrQL1, ShrQL1, 10010011b + vpblendd ShrQL0, ShrQL0, ShrQL1, 00000011b + vpor ShlQL0, ShlQL0, ShrQL0 + vpsllvq ShlQL1, QLimb1, ShlQLCnt + vpsrlvq ShrQL0, QLimb0, ShrQLCnt + vpermq ShrQL0, ShrQL0, 10010011b + vpblendd ShrQL1, ShrQL1, ShrQL0, 00000011b + vmovdqa [Op2-24], ShlQL0 + vpor ShlQL1, ShlQL1, ShrQL1 + vmovdqa [Op2-56], ShlQL1 + + sub Op1, 64 + sub Op2, 64 + + .lShlEquAVXCheck: + + sub Size1, 8 + jnc .lShlEquAVXLoop + + mov Limb1, [Op1] + xor Limb2, Limb2 + shld Limb2, Limb1, CL +%if 1 + vmovq ShlDL0, Limb2 + vpblendd ShrQL0, ShrQL0, ShlQL0, 3 +%else + ; I am mixing in a single SSE4.1 instruction into otherwise pure AVX2 + ; this is generating stalls on Haswell & Broadwell architecture (Agner Fog) + ; but it is only executed once and there is no AVX2 based alternative + pinsrq ShrDL0, Limb2, 0 ; SSE4.1 +%endif + vpsllvq ShlQL0, QLimb0, ShlQLCnt + vpor ShlQL0, ShlQL0, ShrQL0 + vmovdqa [Op2-24], ShlQL0 + + sub Op2, 32 + add Size1, 8 + + ; shift remaining max. 7 limbs with SHLD mnemonic + .lShlEquFour: + + sub Op1, 8 + test Size1, 4 + je .lShlEquTwo + + mov Limb2, [Op1] + shld Limb1, Limb2, CL + mov [Op2], Limb1 + mov Limb1, [Op1-8] + shld Limb2, Limb1, CL + mov [Op2-8], Limb2 + mov Limb2, [Op1-16] + shld Limb1, Limb2, CL + mov [Op2-16], Limb1 + mov Limb1, [Op1-24] + shld Limb2, Limb1, CL + mov [Op2-24], Limb2 + + sub Op1, 32 + sub Op2, 32 + + .lShlEquTwo: + + test Size1, 2 + je .lShlEquOne + + mov Limb2, [Op1] + shld Limb1, Limb2, CL + mov [Op2], Limb1 + mov Limb1, [Op1-8] + shld Limb2, Limb1, CL + mov [Op2-8], Limb2 + + sub Op1, 16 + sub Op2, 16 + + .lShlEquOne: + + test Size1, 1 + je .lShlEquPost + + mov Limb2, [Op1] + shld Limb1, Limb2, CL + mov [Op2], Limb1 + mov Limb1, Limb2 + + sub Op2, 8 + + .lShlEquPost: + + shl Limb1, CL + mov [Op2], Limb1 + + .Exit: + + vzeroupper +END_PROC reg_save_list +.end: \ No newline at end of file diff --git a/mpn/x86_64w/haswell/rshift.asm b/mpn/x86_64w/haswell/rshift.asm new file mode 100644 index 00000000..fbcc7d8e --- /dev/null +++ b/mpn/x86_64w/haswell/rshift.asm @@ -0,0 +1,282 @@ + +; Copyright 2016 Jens Nurmann and Alexander Kruppa + +; This file is part of the MPIR Library. + +; The MPIR Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published +; by the Free Software Foundation; either version 2.1 of the License, or (at +; your option) any later version. + +; The MPIR Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the MPIR Library; see the file COPYING.LIB. 
If not, write +; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, +; Boston, MA 02110-1301, USA. + +; mp_limb_t mpn_rshift(mp_ptr Op2, mp_srcptr Op1, mp_size_t Size1, unsigned int Shift) +; Linux RAX RDI RSI RDX RCX +; Windows x64 RAX RCX RDX R8 R9 +; +; Description: +; The function shifts Op1 right by Shift bits, stores the result in Op2 (non- +; destructive shr) and hands back the shifted-out least significant bits of +; Op1. The function operates increasing in memory supporting in place shifts. +; +; Result: +; - Op2[ Size1-1..0 ] := ( ShrIn:Op1[ Size1-1..0 ] ) >> Shift +; - Op1[ 0 ] << ( 64-Shift ) +; +; Caveats: +; - caller must ensure that Shift is in [ 1..63 ]! +; - currently Linux64 support only! +; - the AVX version uses mnemonics only available on Haswell, Broadwell and +; Skylake cores +; - the behaviour of cache prefetching in combination with AVX shifting seems +; somewhat erratic +; - slight (a few clock cycles) degradation for 1/2 LD1$ sizes +; - slight (a few percent) improvement for full LD1$ sizes +; - substantial (>10%) improvement for 1/2 LD2$ sizes +; - slight (a few percent) improvement for full LD2$ sizes +; - slight (a few percent) degradation for 1/2 LD3$ sizes +; - substantial (around 10%) degradation for full LD3$ sizes +; +; Comments: +; - implemented, tested and benchmarked on 30.03.2016 by jn +; - includes prefetching +; ============================================================================ + +%include 'yasm_mac.inc' + +BITS 64 + +%ifdef USE_WIN64 + %define Op2 R11 + %define Op1 RDX + %define Size1 R8 + %define Shift RCX + %define Limb1 R9 + %define Limb2 R10 + %ifdef USE_PREFETCH + %define Offs -512 ; No caller-saves regs left, use immediate + %endif + %define reg_save_list XMM, 6, 7 +%else + %define Op2 RDI + %define Op1 RSI + %define Size1 RDX + %define Shift RCX + %define Limb1 R8 + %define Limb2 R9 + %ifdef USE_PREFETCH + %define OFFS_REG 1 + %define Offs R10 + %endif +%endif + +%define ShrDL0 XMM2 ; Attn: this must match ShrQL0 definition +%define ShlDL0 XMM3 ; Attn: this must match ShlQL0 definition +%define ShrDLCnt XMM6 ; Attn: this must match ShrQlCnt definition +%define ShlDLCnt XMM7 ; Attn: this must match ShlQlCnt definition + +%define QLimb0 YMM0 +%define QLimb1 YMM1 +%define ShrQL0 YMM2 +%define ShlQL0 YMM3 +%define ShrQL1 YMM4 +%define ShlQL1 YMM5 +%define ShrQLCnt YMM6 +%define ShlQLCnt YMM7 + + align 32 + +FRAME_PROC mpn_rshift, 0, reg_save_list +%ifdef USE_WIN64 + mov r11, rcx + mov rcx, r9 +%endif + xor EAX, EAX + or Size1, Size1 + je .Exit + + mov Limb1, [Op1] + shrd RAX, Limb1, CL + + sub Size1, 1 + je .lShrEquPost ; Size1=1 => + + %ifdef USE_PREFETCH + mov Offs, 512 + %endif + + cmp Size1, 8 + jc .lShrEquFour ; AVX inefficient => + + ; first align Op2 to 32 bytes + test Op2, 8 + je .lShrEquAlign16 + + mov Limb2, [Op1+8] + shrd Limb1, Limb2, CL + mov [Op2], Limb1 + mov Limb1, Limb2 + + add Op1, 8 + add Op2, 8 + sub Size1, 1 + + .lShrEquAlign16: + + test Op2, 16 + je .lShrEquAVX + + mov Limb2, [Op1+8] + shrd Limb1, Limb2, CL + mov [Op2], Limb1 + mov Limb1, [Op1+16] + shrd Limb2, Limb1, CL + mov [Op2+8], Limb2 + + add Op1, 16 + add Op2, 16 + sub Size1, 2 + + .lShrEquAVX: + + ; initialize AVX shift counter + vmovq ShrDLCnt, RCX + neg RCX + and RCX, 63 ; must do, as AVX shifts set result=0 if Shift>63! + vmovq ShlDLCnt, RCX + neg RCX + and RCX, 63 ; must do, as AVX shifts set result=0 if Shift>63! 
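+        ; mirrored counter setup to lshift: ShrDLCnt = Shift and
+        ; ShlDLCnt = 64-Shift, so the VPSRLVQ/VPSLLVQ pair below merges each
+        ; limb with the bits shifted in from its higher neighbour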
+ vpbroadcastq ShrQLCnt, ShrDLCnt + vpbroadcastq ShlQLCnt, ShlDLCnt + + ; pre-fetch first quad-limb + vmovdqu QLimb0, [Op1] + vpsllvq ShlQL0, QLimb0, ShlQLCnt + + add Op1, 32 + sub Size1, 4 + jmp .lShrEquAVXCheck + + ; main loop (prefetching enabled, unloaded data cache) + ; - 0.60 cycles per limb in LD1$ + ; - 0.60-0.70 cycles per limb in LD2$ + ; - 0.70-0.90 cycles per limb in LD3$ + align 16 + .lShrEquAVXLoop: + + %ifdef USE_PREFETCH + prefetchnta [Op1+Offs] + %endif + + vmovdqu QLimb1, [Op1] + vpsrlvq ShrQL0, QLimb0, ShrQLCnt + vmovdqu QLimb0, [Op1+32] + vpsllvq ShlQL1, QLimb1, ShlQLCnt + vpblendd ShlQL0, ShlQL0, ShlQL1, 00000011b + vpermq ShlQL0, ShlQL0, 00111001b + vpor ShrQL0, ShrQL0, ShlQL0 + vpsrlvq ShrQL1, QLimb1, ShrQLCnt + vpsllvq ShlQL0, QLimb0, ShlQLCnt + vpblendd ShlQL1, ShlQL1, ShlQL0, 00000011b + vpermq ShlQL1, ShlQL1, 00111001b + vmovdqa [Op2], ShrQL0 + vpor ShrQL1, ShrQL1, ShlQL1 + vmovdqa [Op2+32], ShrQL1 + + add Op1, 64 + add Op2, 64 + + .lShrEquAVXCheck: + + sub Size1, 8 + jnc .lShrEquAVXLoop + + mov Limb1, [Op1] + xor Limb2, Limb2 + shrd Limb2, Limb1, CL +%if 1 + vmovq ShrDL0, Limb2 + vpblendd ShlQL0, ShlQL0, ShrQL0, 3 +%else + ; I am mixing in a single SSE4.1 instruction into otherwise pure AVX2 + ; this is generating stalls on Haswell & Broadwell architecture (Agner Fog) + ; but it is only executed once and there is no AVX2 based alternative + pinsrq ShlDL0, Limb2, 0 ; SSE4.1 +%endif + vpsrlvq ShrQL0, QLimb0, ShrQLCnt + vpermq ShlQL0, ShlQL0, 00111001b + vpor ShrQL0, ShrQL0, ShlQL0 + vmovdqa [Op2], ShrQL0 + + add Op2, 32 + add Size1, 8 + + ; shift remaining max. 7 limbs with SHRD mnemonic + .lShrEquFour: + + add Op1, 8 + test Size1, 4 + je .lShrEquTwo + + mov Limb2, [Op1] + shrd Limb1, Limb2, CL + mov [Op2], Limb1 + mov Limb1, [Op1+8] + shrd Limb2, Limb1, CL + mov [Op2+8], Limb2 + mov Limb2, [Op1+16] + shrd Limb1, Limb2, CL + mov [Op2+16], Limb1 + mov Limb1, [Op1+24] + shrd Limb2, Limb1, CL + mov [Op2+24], Limb2 + + add Op1, 32 + add Op2, 32 + + .lShrEquTwo: + + test Size1, 2 + je .lShrEquOne + + mov Limb2, [Op1] + shrd Limb1, Limb2, CL + mov [Op2], Limb1 + mov Limb1, [Op1+8] + shrd Limb2, Limb1, CL + mov [Op2+8], Limb2 + + add Op1, 16 + add Op2, 16 + + .lShrEquOne: + + test Size1, 1 + je .lShrEquPost + + mov Limb2, [Op1] + shrd Limb1, Limb2, CL + mov [Op2], Limb1 + mov Limb1, Limb2 + + add Op2, 8 + + ; store most significant limb considering shift-in part + .lShrEquPost: + + shr Limb1, CL + mov [Op2], Limb1 + + .Exit: + + vzeroupper +END_PROC reg_save_list +.end: diff --git a/mpn/x86_64w/yasm_mac.inc b/mpn/x86_64w/yasm_mac.inc index 141e8ab9..bbb061a1 100644 --- a/mpn/x86_64w/yasm_mac.inc +++ b/mpn/x86_64w/yasm_mac.inc @@ -119,24 +119,48 @@ %endif %rotate 1 - %assign gpr_regs 0 + %assign stack_slots 0 + %assign xmm_seen 0 %if %0 > 2 %rep %0 - 2 - push_reg %1 - %assign gpr_regs gpr_regs + 1 + %ifnum %1 + %if xmm_seen == 0 + %error Not an XMM register + %else + alloc_stack 16 + save_xmm128 XMM%1, 0 + %assign stack_slots stack_slots + 2 + %endif + %elifid %1 + %ifidni XMM, %1 + %if stack_slots & 1 == 0 + alloc_stack 8 + %assign stack_slots stack_slots + 1 + %assign xmm_seen 1 + %else + %assign xmm_seen 2 + %endif + %elif xmm_seen == 0 + push_reg %1 + %assign stack_slots stack_slots + 1 + %else + %error XMM registers must be last in the save list + %endif + %else + %error Bad parameter list + %endif %rotate 1 %endrep %endif - %if (gpr_regs & 1) == (var_slots & 1) + %if (stack_slots & 1) == (var_slots & 1) %assign var_slots var_slots + 1 %endif %if var_slots > 0 
alloc_stack 8 * var_slots %endif - %assign stack_use 8 * (gpr_regs + var_slots) - + %assign stack_use 8 * (stack_slots + var_slots) END_PROLOGUE %endmacro @@ -147,7 +171,16 @@ %if %0 > 0 %rep %0 %rotate -1 - pop %1 + %ifnum %1 + movdqa XMM%1, [rsp] + add rsp, 16 + %elifidni %1, XMM + %if xmm_seen == 1 + add rsp, 8 + %endif + %else + pop %1 + %endif %endrep %endif ret @@ -156,14 +189,25 @@ %macro END_PROC 0-* - add rsp, 8 * var_slots + %if var_slots + add rsp, 8 * var_slots + %endif %if %0 > 0 %rep %0 %rotate -1 - pop %1 + %ifnum %1 + movdqa XMM%1, [rsp] + add rsp, 16 + %elifidni %1, XMM + %if xmm_seen == 1 + add rsp, 8 + %endif + %else + pop %1 + %endif %endrep %endif - ret + ret ENDPROC_FRAME %endmacro
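
The yasm_mac.inc changes above let the FRAME_PROC/END_PROC register save list end with XMM registers, written as the token XMM followed by the register numbers; general-purpose registers must come first, and the macros pad the stack for 16-byte alignment and spill each listed XMM register with save_xmm128, restoring everything in reverse order in END_PROC. A minimal usage sketch in the style of the new lshift.asm and rshift.asm follows; the procedure name and the particular saved registers are illustrative only, not taken from the patch:

    %include 'yasm_mac.inc'

    %define reg_save_list rbx, rsi, XMM, 6, 7   ; GPRs first, XMM registers last

        align 32
    FRAME_PROC mpn_example, 0, reg_save_list    ; 0 extra stack slots, as in lshift.asm
        ; ... procedure body ...
    END_PROC reg_save_list                      ; restores the list in reverse order and returns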