add more win64 assembler for haswell

This commit is contained in:
Brian Gladman 2016-11-26 22:35:25 +00:00
parent a95556b926
commit 77b483e79f
17 changed files with 1299 additions and 151 deletions

View File

@ -1,4 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Release|x64">
@ -51,7 +51,7 @@ prebuild haswell x64 14
</Command>
</PreBuildEvent>
<YASM>
<Defines>DLL;USE_WIN64</Defines>
<Defines>DLL</Defines>
<IncludePaths>..\..\mpn\x86_64w\</IncludePaths>
<Debug>true</Debug>
<ObjectFile>$(IntDir)mpn\</ObjectFile>
@ -62,6 +62,7 @@ prebuild haswell x64 14
</ClCompile>
<Link>
</Link>
<PostBuildEvent>
<Command>cd ..\..\build.vc
postbuild "$(TargetPath)" 14
@ -75,7 +76,7 @@ prebuild haswell x64 14
</Command>
</PreBuildEvent>
<YASM>
<Defines>DLL;USE_WIN64</Defines>
<Defines>DLL</Defines>
<IncludePaths>..\..\mpn\x86_64w\</IncludePaths>
<Debug>true</Debug>
<ObjectFile>$(IntDir)mpn\</ObjectFile>
@ -86,6 +87,7 @@ prebuild haswell x64 14
</ClCompile>
<Link>
</Link>
<PostBuildEvent>
<Command>cd ..\..\build.vc
postbuild "$(TargetPath)" 14
@ -464,7 +466,6 @@ postbuild "$(TargetPath)" 14
<ClCompile Include="..\..\cxx\osmpz.cc" />
<ClCompile Include="..\..\mpn\generic\add.c" />
<ClCompile Include="..\..\mpn\generic\add_1.c" />
<ClCompile Include="..\..\mpn\generic\add_n.c" />
<ClCompile Include="..\..\mpn\generic\addadd_n.c" />
<ClCompile Include="..\..\mpn\generic\addmul_1.c" />
<ClCompile Include="..\..\mpn\generic\addsub_n.c" />
@ -474,8 +475,6 @@ postbuild "$(TargetPath)" 14
<ClCompile Include="..\..\mpn\generic\cmp.c" />
<ClCompile Include="..\..\mpn\generic\com_n.c" />
<ClCompile Include="..\..\mpn\generic\comb_tables.c" />
<ClCompile Include="..\..\mpn\generic\copyd.c" />
<ClCompile Include="..\..\mpn\generic\copyi.c" />
<ClCompile Include="..\..\mpn\generic\dc_bdiv_q.c" />
<ClCompile Include="..\..\mpn\generic\dc_bdiv_q_n.c" />
<ClCompile Include="..\..\mpn\generic\dc_bdiv_qr.c" />
@ -526,7 +525,6 @@ postbuild "$(TargetPath)" 14
<ClCompile Include="..\..\mpn\generic\jacobi.c" />
<ClCompile Include="..\..\mpn\generic\jacobi_2.c" />
<ClCompile Include="..\..\mpn\generic\jacobi_base.c" />
<ClCompile Include="..\..\mpn\generic\lshift.c" />
<ClCompile Include="..\..\mpn\generic\matrix22_mul.c" />
<ClCompile Include="..\..\mpn\generic\matrix22_mul1_inverse_vector.c" />
<ClCompile Include="..\..\mpn\generic\mod_1.c" />
@ -567,7 +565,6 @@ postbuild "$(TargetPath)" 14
<ClCompile Include="..\..\mpn\generic\rsh_divrem_hensel_qr_1.c" />
<ClCompile Include="..\..\mpn\generic\rsh_divrem_hensel_qr_1_1.c" />
<ClCompile Include="..\..\mpn\generic\rsh_divrem_hensel_qr_1_2.c" />
<ClCompile Include="..\..\mpn\generic\rshift.c" />
<ClCompile Include="..\..\mpn\generic\sb_bdiv_q.c" />
<ClCompile Include="..\..\mpn\generic\sb_bdiv_qr.c" />
<ClCompile Include="..\..\mpn\generic\sb_div_q.c" />
@ -610,17 +607,22 @@ postbuild "$(TargetPath)" 14
<ItemGroup>
<YASM Include="..\..\mpn\x86_64w\add_err1_n.asm" />
<YASM Include="..\..\mpn\x86_64w\add_err2_n.asm" />
<YASM Include="..\..\mpn\x86_64w\haswell\add_n.asm" />
<YASM Include="..\..\mpn\x86_64w\addmul_2.asm" />
<YASM Include="..\..\mpn\x86_64w\haswell\copyd.asm" />
<YASM Include="..\..\mpn\x86_64w\haswell\copyi.asm" />
<YASM Include="..\..\mpn\x86_64w\divexact_1.asm" />
<YASM Include="..\..\mpn\x86_64w\divexact_by3c.asm" />
<YASM Include="..\..\mpn\x86_64w\divexact_byfobm1.asm" />
<YASM Include="..\..\mpn\x86_64w\divrem_2.asm" />
<YASM Include="..\..\mpn\x86_64w\divrem_euclidean_qr_1.asm" />
<YASM Include="..\..\mpn\x86_64w\divrem_euclidean_qr_2.asm" />
<YASM Include="..\..\mpn\x86_64w\haswell\lshift.asm" />
<YASM Include="..\..\mpn\x86_64w\haswell\lshift1.asm" />
<YASM Include="..\..\mpn\x86_64w\modexact_1c_odd.asm" />
<YASM Include="..\..\mpn\x86_64w\mul_2.asm" />
<YASM Include="..\..\mpn\x86_64w\mulmid_basecase.asm" />
<YASM Include="..\..\mpn\x86_64w\haswell\rshift.asm" />
<YASM Include="..\..\mpn\x86_64w\haswell\rshift1.asm" />
<YASM Include="..\..\mpn\x86_64w\sqr_basecase.asm" />
<YASM Include="..\..\mpn\x86_64w\sub_err1_n.asm" />
@ -631,8 +633,8 @@ postbuild "$(TargetPath)" 14
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="..\..\build.vc\vsyasm.targets" />
</ImportGroup>
<ItemGroup>
</ImportGroup>
<ItemGroup>
<None Include="..\..\gmp-h.in" />
</ItemGroup>
</Project>
</ItemGroup>
</Project>

View File

@ -1114,9 +1114,6 @@
<ClCompile Include="..\..\mpn\generic\add_1.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
<ClCompile Include="..\..\mpn\generic\add_n.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
<ClCompile Include="..\..\mpn\generic\addadd_n.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
@ -1144,12 +1141,6 @@
<ClCompile Include="..\..\mpn\generic\comb_tables.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
<ClCompile Include="..\..\mpn\generic\copyd.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
<ClCompile Include="..\..\mpn\generic\copyi.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
<ClCompile Include="..\..\mpn\generic\dc_bdiv_q.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
@ -1300,9 +1291,6 @@
<ClCompile Include="..\..\mpn\generic\jacobi_base.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
<ClCompile Include="..\..\mpn\generic\lshift.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
<ClCompile Include="..\..\mpn\generic\matrix22_mul.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
@ -1423,9 +1411,6 @@
<ClCompile Include="..\..\mpn\generic\rsh_divrem_hensel_qr_1_2.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
<ClCompile Include="..\..\mpn\generic\rshift.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
<ClCompile Include="..\..\mpn\generic\sb_bdiv_q.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
@ -1548,9 +1533,18 @@
<YASM Include="..\..\mpn\x86_64w\add_err2_n.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
<YASM Include="..\..\mpn\x86_64w\haswell\add_n.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
<YASM Include="..\..\mpn\x86_64w\addmul_2.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
<YASM Include="..\..\mpn\x86_64w\haswell\copyd.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
<YASM Include="..\..\mpn\x86_64w\haswell\copyi.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
<YASM Include="..\..\mpn\x86_64w\divexact_1.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
@ -1569,6 +1563,9 @@
<YASM Include="..\..\mpn\x86_64w\divrem_euclidean_qr_2.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
<YASM Include="..\..\mpn\x86_64w\haswell\lshift.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
<YASM Include="..\..\mpn\x86_64w\haswell\lshift1.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
@ -1581,6 +1578,9 @@
<YASM Include="..\..\mpn\x86_64w\mulmid_basecase.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
<YASM Include="..\..\mpn\x86_64w\haswell\rshift.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
<YASM Include="..\..\mpn\x86_64w\haswell\rshift1.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>

View File

@ -51,7 +51,8 @@ prebuild haswell x64 14
</Command>
</PreBuildEvent>
<YASM>
<Defines>USE_WIN64</Defines>
<Defines>
</Defines>
<IncludePaths>..\..\mpn\x86_64w\</IncludePaths>
<Debug>true</Debug>
<ObjectFile>$(IntDir)mpn\</ObjectFile>
@ -73,7 +74,8 @@ prebuild haswell x64 14
</Command>
</PreBuildEvent>
<YASM>
<Defines>USE_WIN64</Defines>
<Defines>
</Defines>
<IncludePaths>..\..\mpn\x86_64w\</IncludePaths>
<Debug>true</Debug>
<ObjectFile>$(IntDir)mpn\</ObjectFile>
@ -449,7 +451,6 @@ postbuild "$(TargetPath)" 14
<ClCompile Include="..\..\scanf\vsscanf.c" />
<ClCompile Include="..\..\mpn\generic\add.c" />
<ClCompile Include="..\..\mpn\generic\add_1.c" />
<ClCompile Include="..\..\mpn\generic\add_n.c" />
<ClCompile Include="..\..\mpn\generic\addadd_n.c" />
<ClCompile Include="..\..\mpn\generic\addmul_1.c" />
<ClCompile Include="..\..\mpn\generic\addsub_n.c" />
@ -459,8 +460,6 @@ postbuild "$(TargetPath)" 14
<ClCompile Include="..\..\mpn\generic\cmp.c" />
<ClCompile Include="..\..\mpn\generic\com_n.c" />
<ClCompile Include="..\..\mpn\generic\comb_tables.c" />
<ClCompile Include="..\..\mpn\generic\copyd.c" />
<ClCompile Include="..\..\mpn\generic\copyi.c" />
<ClCompile Include="..\..\mpn\generic\dc_bdiv_q.c" />
<ClCompile Include="..\..\mpn\generic\dc_bdiv_q_n.c" />
<ClCompile Include="..\..\mpn\generic\dc_bdiv_qr.c" />
@ -511,7 +510,6 @@ postbuild "$(TargetPath)" 14
<ClCompile Include="..\..\mpn\generic\jacobi.c" />
<ClCompile Include="..\..\mpn\generic\jacobi_2.c" />
<ClCompile Include="..\..\mpn\generic\jacobi_base.c" />
<ClCompile Include="..\..\mpn\generic\lshift.c" />
<ClCompile Include="..\..\mpn\generic\matrix22_mul.c" />
<ClCompile Include="..\..\mpn\generic\matrix22_mul1_inverse_vector.c" />
<ClCompile Include="..\..\mpn\generic\mod_1.c" />
@ -552,7 +550,6 @@ postbuild "$(TargetPath)" 14
<ClCompile Include="..\..\mpn\generic\rsh_divrem_hensel_qr_1.c" />
<ClCompile Include="..\..\mpn\generic\rsh_divrem_hensel_qr_1_1.c" />
<ClCompile Include="..\..\mpn\generic\rsh_divrem_hensel_qr_1_2.c" />
<ClCompile Include="..\..\mpn\generic\rshift.c" />
<ClCompile Include="..\..\mpn\generic\sb_bdiv_q.c" />
<ClCompile Include="..\..\mpn\generic\sb_bdiv_qr.c" />
<ClCompile Include="..\..\mpn\generic\sb_div_q.c" />
@ -595,13 +592,29 @@ postbuild "$(TargetPath)" 14
<ItemGroup>
<YASM Include="..\..\mpn\x86_64w\add_err1_n.asm" />
<YASM Include="..\..\mpn\x86_64w\add_err2_n.asm" />
<YASM Include="..\..\mpn\x86_64w\haswell\add_n.asm">
<Defines Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">USE_WIN64</Defines>
<Defines Condition="'$(Configuration)|$(Platform)'=='Release|x64'">USE_WIN64</Defines>
</YASM>
<YASM Include="..\..\mpn\x86_64w\addmul_2.asm" />
<YASM Include="..\..\mpn\x86_64w\haswell\copyd.asm">
<Defines Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">USE_WIN64</Defines>
<Defines Condition="'$(Configuration)|$(Platform)'=='Release|x64'">USE_WIN64</Defines>
</YASM>
<YASM Include="..\..\mpn\x86_64w\haswell\copyi.asm">
<Defines Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">USE_WIN64</Defines>
<Defines Condition="'$(Configuration)|$(Platform)'=='Release|x64'">USE_WIN64</Defines>
</YASM>
<YASM Include="..\..\mpn\x86_64w\divexact_1.asm" />
<YASM Include="..\..\mpn\x86_64w\divexact_by3c.asm" />
<YASM Include="..\..\mpn\x86_64w\divexact_byfobm1.asm" />
<YASM Include="..\..\mpn\x86_64w\divrem_2.asm" />
<YASM Include="..\..\mpn\x86_64w\divrem_euclidean_qr_1.asm" />
<YASM Include="..\..\mpn\x86_64w\divrem_euclidean_qr_2.asm" />
<YASM Include="..\..\mpn\x86_64w\haswell\lshift.asm">
<Defines Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">USE_WIN64</Defines>
<Defines Condition="'$(Configuration)|$(Platform)'=='Release|x64'">USE_WIN64</Defines>
</YASM>
<YASM Include="..\..\mpn\x86_64w\haswell\lshift1.asm">
<Defines Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">USE_WIN64</Defines>
<Defines Condition="'$(Configuration)|$(Platform)'=='Release|x64'">USE_WIN64</Defines>
@ -609,11 +622,13 @@ postbuild "$(TargetPath)" 14
<YASM Include="..\..\mpn\x86_64w\modexact_1c_odd.asm" />
<YASM Include="..\..\mpn\x86_64w\mul_2.asm" />
<YASM Include="..\..\mpn\x86_64w\mulmid_basecase.asm" />
<YASM Include="..\..\mpn\x86_64w\haswell\rshift.asm">
<Defines Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">USE_WIN64</Defines>
<Defines Condition="'$(Configuration)|$(Platform)'=='Release|x64'">USE_WIN64</Defines>
</YASM>
<YASM Include="..\..\mpn\x86_64w\haswell\rshift1.asm">
<Defines Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
</Defines>
<Defines Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
</Defines>
<Defines Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">USE_WIN64</Defines>
<Defines Condition="'$(Configuration)|$(Platform)'=='Release|x64'">USE_WIN64</Defines>
</YASM>
<YASM Include="..\..\mpn\x86_64w\sqr_basecase.asm" />
<YASM Include="..\..\mpn\x86_64w\sub_err1_n.asm" />

View File

@ -1080,9 +1080,6 @@
<ClCompile Include="..\..\mpn\generic\add_1.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
<ClCompile Include="..\..\mpn\generic\add_n.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
<ClCompile Include="..\..\mpn\generic\addadd_n.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
@ -1110,12 +1107,6 @@
<ClCompile Include="..\..\mpn\generic\comb_tables.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
<ClCompile Include="..\..\mpn\generic\copyd.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
<ClCompile Include="..\..\mpn\generic\copyi.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
<ClCompile Include="..\..\mpn\generic\dc_bdiv_q.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
@ -1266,9 +1257,6 @@
<ClCompile Include="..\..\mpn\generic\jacobi_base.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
<ClCompile Include="..\..\mpn\generic\lshift.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
<ClCompile Include="..\..\mpn\generic\matrix22_mul.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
@ -1389,9 +1377,6 @@
<ClCompile Include="..\..\mpn\generic\rsh_divrem_hensel_qr_1_2.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
<ClCompile Include="..\..\mpn\generic\rshift.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
<ClCompile Include="..\..\mpn\generic\sb_bdiv_q.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
@ -1514,9 +1499,18 @@
<YASM Include="..\..\mpn\x86_64w\add_err2_n.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
<YASM Include="..\..\mpn\x86_64w\haswell\add_n.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
<YASM Include="..\..\mpn\x86_64w\addmul_2.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
<YASM Include="..\..\mpn\x86_64w\haswell\copyd.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
<YASM Include="..\..\mpn\x86_64w\haswell\copyi.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
<YASM Include="..\..\mpn\x86_64w\divexact_1.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
@ -1535,6 +1529,9 @@
<YASM Include="..\..\mpn\x86_64w\divrem_euclidean_qr_2.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
<YASM Include="..\..\mpn\x86_64w\haswell\lshift.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
<YASM Include="..\..\mpn\x86_64w\haswell\lshift1.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
@ -1547,6 +1544,9 @@
<YASM Include="..\..\mpn\x86_64w\mulmid_basecase.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
<YASM Include="..\..\mpn\x86_64w\haswell\rshift.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
<YASM Include="..\..\mpn\x86_64w\haswell\rshift1.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>

View File

@ -168,7 +168,7 @@ check_config $(Platform) $(Configuration) 14
<DataExecutionPrevention>
</DataExecutionPrevention>
<TargetMachine>MachineX64</TargetMachine>
<GenerateDebugInformation>No</GenerateDebugInformation>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<ItemGroup>

View File

@ -8,12 +8,14 @@ mpn_divexact_byfobm1
mpn_divrem_2
mpn_divrem_euclidean_qr_1
mpn_divrem_euclidean_qr_2
mpn_lshift
mpn_lshift1
mpn_modexact_1_odd
mpn_modexact_1c_odd
mpn_mul_2
mpn_mulmid_basecase
mpn_preinv_divrem_1
mpn_rshift
mpn_rshift1
mpn_sqr_basecase
mpn_sub_err1_n

View File

@ -1,4 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="15.0.25914.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Release|x64">
@ -51,7 +51,7 @@ prebuild haswell x64 15
</Command>
</PreBuildEvent>
<YASM>
<Defines>DLL;USE_WIN64</Defines>
<Defines>DLL</Defines>
<IncludePaths>..\..\mpn\x86_64w\</IncludePaths>
<Debug>true</Debug>
<ObjectFile>$(IntDir)mpn\</ObjectFile>
@ -62,6 +62,7 @@ prebuild haswell x64 15
</ClCompile>
<Link>
</Link>
<PostBuildEvent>
<Command>cd ..\..\build.vc
postbuild "$(TargetPath)" 15
@ -75,7 +76,7 @@ prebuild haswell x64 15
</Command>
</PreBuildEvent>
<YASM>
<Defines>DLL;USE_WIN64</Defines>
<Defines>DLL</Defines>
<IncludePaths>..\..\mpn\x86_64w\</IncludePaths>
<Debug>true</Debug>
<ObjectFile>$(IntDir)mpn\</ObjectFile>
@ -86,6 +87,7 @@ prebuild haswell x64 15
</ClCompile>
<Link>
</Link>
<PostBuildEvent>
<Command>cd ..\..\build.vc
postbuild "$(TargetPath)" 15
@ -464,7 +466,6 @@ postbuild "$(TargetPath)" 15
<ClCompile Include="..\..\cxx\osmpz.cc" />
<ClCompile Include="..\..\mpn\generic\add.c" />
<ClCompile Include="..\..\mpn\generic\add_1.c" />
<ClCompile Include="..\..\mpn\generic\add_n.c" />
<ClCompile Include="..\..\mpn\generic\addadd_n.c" />
<ClCompile Include="..\..\mpn\generic\addmul_1.c" />
<ClCompile Include="..\..\mpn\generic\addsub_n.c" />
@ -526,7 +527,6 @@ postbuild "$(TargetPath)" 15
<ClCompile Include="..\..\mpn\generic\jacobi.c" />
<ClCompile Include="..\..\mpn\generic\jacobi_2.c" />
<ClCompile Include="..\..\mpn\generic\jacobi_base.c" />
<ClCompile Include="..\..\mpn\generic\lshift.c" />
<ClCompile Include="..\..\mpn\generic\matrix22_mul.c" />
<ClCompile Include="..\..\mpn\generic\matrix22_mul1_inverse_vector.c" />
<ClCompile Include="..\..\mpn\generic\mod_1.c" />
@ -567,7 +567,6 @@ postbuild "$(TargetPath)" 15
<ClCompile Include="..\..\mpn\generic\rsh_divrem_hensel_qr_1.c" />
<ClCompile Include="..\..\mpn\generic\rsh_divrem_hensel_qr_1_1.c" />
<ClCompile Include="..\..\mpn\generic\rsh_divrem_hensel_qr_1_2.c" />
<ClCompile Include="..\..\mpn\generic\rshift.c" />
<ClCompile Include="..\..\mpn\generic\sb_bdiv_q.c" />
<ClCompile Include="..\..\mpn\generic\sb_bdiv_qr.c" />
<ClCompile Include="..\..\mpn\generic\sb_div_q.c" />
@ -610,6 +609,7 @@ postbuild "$(TargetPath)" 15
<ItemGroup>
<YASM Include="..\..\mpn\x86_64w\add_err1_n.asm" />
<YASM Include="..\..\mpn\x86_64w\add_err2_n.asm" />
<YASM Include="..\..\mpn\x86_64w\haswell\add_n.asm" />
<YASM Include="..\..\mpn\x86_64w\addmul_2.asm" />
<YASM Include="..\..\mpn\x86_64w\divexact_1.asm" />
<YASM Include="..\..\mpn\x86_64w\divexact_by3c.asm" />
@ -617,10 +617,12 @@ postbuild "$(TargetPath)" 15
<YASM Include="..\..\mpn\x86_64w\divrem_2.asm" />
<YASM Include="..\..\mpn\x86_64w\divrem_euclidean_qr_1.asm" />
<YASM Include="..\..\mpn\x86_64w\divrem_euclidean_qr_2.asm" />
<YASM Include="..\..\mpn\x86_64w\haswell\lshift.asm" />
<YASM Include="..\..\mpn\x86_64w\haswell\lshift1.asm" />
<YASM Include="..\..\mpn\x86_64w\modexact_1c_odd.asm" />
<YASM Include="..\..\mpn\x86_64w\mul_2.asm" />
<YASM Include="..\..\mpn\x86_64w\mulmid_basecase.asm" />
<YASM Include="..\..\mpn\x86_64w\haswell\rshift.asm" />
<YASM Include="..\..\mpn\x86_64w\haswell\rshift1.asm" />
<YASM Include="..\..\mpn\x86_64w\sqr_basecase.asm" />
<YASM Include="..\..\mpn\x86_64w\sub_err1_n.asm" />
@ -631,8 +633,8 @@ postbuild "$(TargetPath)" 15
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="..\..\build.vc\vsyasm.targets" />
</ImportGroup>
<ItemGroup>
</ImportGroup>
<ItemGroup>
<None Include="..\..\gmp-h.in" />
</ItemGroup>
</Project>
</ItemGroup>
</Project>

View File

@ -1114,9 +1114,6 @@
<ClCompile Include="..\..\mpn\generic\add_1.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
<ClCompile Include="..\..\mpn\generic\add_n.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
<ClCompile Include="..\..\mpn\generic\addadd_n.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
@ -1300,9 +1297,6 @@
<ClCompile Include="..\..\mpn\generic\jacobi_base.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
<ClCompile Include="..\..\mpn\generic\lshift.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
<ClCompile Include="..\..\mpn\generic\matrix22_mul.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
@ -1423,9 +1417,6 @@
<ClCompile Include="..\..\mpn\generic\rsh_divrem_hensel_qr_1_2.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
<ClCompile Include="..\..\mpn\generic\rshift.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
<ClCompile Include="..\..\mpn\generic\sb_bdiv_q.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
@ -1548,6 +1539,9 @@
<YASM Include="..\..\mpn\x86_64w\add_err2_n.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
<YASM Include="..\..\mpn\x86_64w\haswell\add_n.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
<YASM Include="..\..\mpn\x86_64w\addmul_2.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
@ -1569,6 +1563,9 @@
<YASM Include="..\..\mpn\x86_64w\divrem_euclidean_qr_2.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
<YASM Include="..\..\mpn\x86_64w\haswell\lshift.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
<YASM Include="..\..\mpn\x86_64w\haswell\lshift1.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
@ -1581,6 +1578,9 @@
<YASM Include="..\..\mpn\x86_64w\mulmid_basecase.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
<YASM Include="..\..\mpn\x86_64w\haswell\rshift.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
<YASM Include="..\..\mpn\x86_64w\haswell\rshift1.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>

View File

@ -1,4 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="15.0.25914.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Release|x64">
@ -51,7 +51,7 @@ prebuild haswell x64 15
</Command>
</PreBuildEvent>
<YASM>
<Defines>USE_WIN64</Defines>
<Defines></Defines>
<IncludePaths>..\..\mpn\x86_64w\</IncludePaths>
<Debug>true</Debug>
<ObjectFile>$(IntDir)mpn\</ObjectFile>
@ -60,6 +60,7 @@ prebuild haswell x64 15
<AdditionalIncludeDirectories>..\..\</AdditionalIncludeDirectories>
<PreprocessorDefinitions>NDEBUG;WIN32;_LIB;HAVE_CONFIG_H;_WIN64;%(PreprocessorDefinitions)</PreprocessorDefinitions>
</ClCompile>
<PostBuildEvent>
<Command>cd ..\..\build.vc
postbuild "$(TargetPath)" 15
@ -73,7 +74,7 @@ prebuild haswell x64 15
</Command>
</PreBuildEvent>
<YASM>
<Defines>USE_WIN64</Defines>
<Defines></Defines>
<IncludePaths>..\..\mpn\x86_64w\</IncludePaths>
<Debug>true</Debug>
<ObjectFile>$(IntDir)mpn\</ObjectFile>
@ -82,6 +83,7 @@ prebuild haswell x64 15
<AdditionalIncludeDirectories>..\..\</AdditionalIncludeDirectories>
<PreprocessorDefinitions>_DEBUG;WIN32;_LIB;HAVE_CONFIG_H;_WIN64;%(PreprocessorDefinitions)</PreprocessorDefinitions>
</ClCompile>
<PostBuildEvent>
<Command>cd ..\..\build.vc
postbuild "$(TargetPath)" 15
@ -449,7 +451,6 @@ postbuild "$(TargetPath)" 15
<ClCompile Include="..\..\scanf\vsscanf.c" />
<ClCompile Include="..\..\mpn\generic\add.c" />
<ClCompile Include="..\..\mpn\generic\add_1.c" />
<ClCompile Include="..\..\mpn\generic\add_n.c" />
<ClCompile Include="..\..\mpn\generic\addadd_n.c" />
<ClCompile Include="..\..\mpn\generic\addmul_1.c" />
<ClCompile Include="..\..\mpn\generic\addsub_n.c" />
@ -511,7 +512,6 @@ postbuild "$(TargetPath)" 15
<ClCompile Include="..\..\mpn\generic\jacobi.c" />
<ClCompile Include="..\..\mpn\generic\jacobi_2.c" />
<ClCompile Include="..\..\mpn\generic\jacobi_base.c" />
<ClCompile Include="..\..\mpn\generic\lshift.c" />
<ClCompile Include="..\..\mpn\generic\matrix22_mul.c" />
<ClCompile Include="..\..\mpn\generic\matrix22_mul1_inverse_vector.c" />
<ClCompile Include="..\..\mpn\generic\mod_1.c" />
@ -552,7 +552,6 @@ postbuild "$(TargetPath)" 15
<ClCompile Include="..\..\mpn\generic\rsh_divrem_hensel_qr_1.c" />
<ClCompile Include="..\..\mpn\generic\rsh_divrem_hensel_qr_1_1.c" />
<ClCompile Include="..\..\mpn\generic\rsh_divrem_hensel_qr_1_2.c" />
<ClCompile Include="..\..\mpn\generic\rshift.c" />
<ClCompile Include="..\..\mpn\generic\sb_bdiv_q.c" />
<ClCompile Include="..\..\mpn\generic\sb_bdiv_qr.c" />
<ClCompile Include="..\..\mpn\generic\sb_div_q.c" />
@ -595,6 +594,7 @@ postbuild "$(TargetPath)" 15
<ItemGroup>
<YASM Include="..\..\mpn\x86_64w\add_err1_n.asm" />
<YASM Include="..\..\mpn\x86_64w\add_err2_n.asm" />
<YASM Include="..\..\mpn\x86_64w\haswell\add_n.asm" />
<YASM Include="..\..\mpn\x86_64w\addmul_2.asm" />
<YASM Include="..\..\mpn\x86_64w\divexact_1.asm" />
<YASM Include="..\..\mpn\x86_64w\divexact_by3c.asm" />
@ -602,10 +602,12 @@ postbuild "$(TargetPath)" 15
<YASM Include="..\..\mpn\x86_64w\divrem_2.asm" />
<YASM Include="..\..\mpn\x86_64w\divrem_euclidean_qr_1.asm" />
<YASM Include="..\..\mpn\x86_64w\divrem_euclidean_qr_2.asm" />
<YASM Include="..\..\mpn\x86_64w\haswell\lshift.asm" />
<YASM Include="..\..\mpn\x86_64w\haswell\lshift1.asm" />
<YASM Include="..\..\mpn\x86_64w\modexact_1c_odd.asm" />
<YASM Include="..\..\mpn\x86_64w\mul_2.asm" />
<YASM Include="..\..\mpn\x86_64w\mulmid_basecase.asm" />
<YASM Include="..\..\mpn\x86_64w\haswell\rshift.asm" />
<YASM Include="..\..\mpn\x86_64w\haswell\rshift1.asm" />
<YASM Include="..\..\mpn\x86_64w\sqr_basecase.asm" />
<YASM Include="..\..\mpn\x86_64w\sub_err1_n.asm" />
@ -616,8 +618,8 @@ postbuild "$(TargetPath)" 15
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="..\..\build.vc\vsyasm.targets" />
</ImportGroup>
<ItemGroup>
</ImportGroup>
<ItemGroup>
<None Include="..\..\gmp-h.in" />
</ItemGroup>
</Project>
</ItemGroup>
</Project>

View File

@ -1080,9 +1080,6 @@
<ClCompile Include="..\..\mpn\generic\add_1.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
<ClCompile Include="..\..\mpn\generic\add_n.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
<ClCompile Include="..\..\mpn\generic\addadd_n.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
@ -1266,9 +1263,6 @@
<ClCompile Include="..\..\mpn\generic\jacobi_base.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
<ClCompile Include="..\..\mpn\generic\lshift.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
<ClCompile Include="..\..\mpn\generic\matrix22_mul.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
@ -1389,9 +1383,6 @@
<ClCompile Include="..\..\mpn\generic\rsh_divrem_hensel_qr_1_2.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
<ClCompile Include="..\..\mpn\generic\rshift.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
<ClCompile Include="..\..\mpn\generic\sb_bdiv_q.c">
<Filter>Source Files\mpn</Filter>
</ClCompile>
@ -1514,6 +1505,9 @@
<YASM Include="..\..\mpn\x86_64w\add_err2_n.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
<YASM Include="..\..\mpn\x86_64w\haswell\add_n.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
<YASM Include="..\..\mpn\x86_64w\addmul_2.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
@ -1535,6 +1529,9 @@
<YASM Include="..\..\mpn\x86_64w\divrem_euclidean_qr_2.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
<YASM Include="..\..\mpn\x86_64w\haswell\lshift.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
<YASM Include="..\..\mpn\x86_64w\haswell\lshift1.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
@ -1547,6 +1544,9 @@
<YASM Include="..\..\mpn\x86_64w\mulmid_basecase.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
<YASM Include="..\..\mpn\x86_64w\haswell\rshift.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>
<YASM Include="..\..\mpn\x86_64w\haswell\rshift1.asm">
<Filter>Source Files\mpn\yasm</Filter>
</YASM>

View File

@ -1,6 +1,7 @@
Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
VisualStudioVersion = 15.0.25914.0
VisualStudioVersion = 15.0
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "lib_mpir_k8", "lib_mpir_k8\lib_mpir_k8.vcxproj", "{22D14BC4-ADC6-48C6-8BA1-4AC45799B945}"
EndProject
@ -44,74 +45,74 @@ Global
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Debug|Win32.ActiveCfg = Debug|x64
{22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Debug|x64.ActiveCfg = Debug|x64
{22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Release|Win32.ActiveCfg = Release|x64
{22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Release|Win32.ActiveCfg = Release|Win32
{22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Release|x64.ActiveCfg = Release|x64
{F4418981-ABAA-4B7A-9538-3E0166149F28}.Debug|Win32.ActiveCfg = Debug|Win32
{F4418981-ABAA-4B7A-9538-3E0166149F28}.Debug|x64.ActiveCfg = Debug|Win32
{22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Debug|Win32.ActiveCfg = Debug|Win32
{22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Debug|x64.ActiveCfg = Debug|x64
{F4418981-ABAA-4B7A-9538-3E0166149F28}.Release|Win32.ActiveCfg = Release|Win32
{F4418981-ABAA-4B7A-9538-3E0166149F28}.Release|x64.ActiveCfg = Release|Win32
{52C97475-88BE-437F-BC53-D3E1205EA08B}.Debug|Win32.ActiveCfg = Debug|Win32
{52C97475-88BE-437F-BC53-D3E1205EA08B}.Debug|x64.ActiveCfg = Debug|Win32
{F4418981-ABAA-4B7A-9538-3E0166149F28}.Release|x64.ActiveCfg = Release|x64
{F4418981-ABAA-4B7A-9538-3E0166149F28}.Debug|Win32.ActiveCfg = Debug|Win32
{F4418981-ABAA-4B7A-9538-3E0166149F28}.Debug|x64.ActiveCfg = Debug|x64
{52C97475-88BE-437F-BC53-D3E1205EA08B}.Release|Win32.ActiveCfg = Release|Win32
{52C97475-88BE-437F-BC53-D3E1205EA08B}.Release|x64.ActiveCfg = Release|Win32
{F4416466-C0B3-4105-B391-CC77FB7A0231}.Debug|Win32.ActiveCfg = Debug|x64
{F4416466-C0B3-4105-B391-CC77FB7A0231}.Debug|x64.ActiveCfg = Debug|x64
{F4416466-C0B3-4105-B391-CC77FB7A0231}.Release|Win32.ActiveCfg = Release|x64
{52C97475-88BE-437F-BC53-D3E1205EA08B}.Release|x64.ActiveCfg = Release|x64
{52C97475-88BE-437F-BC53-D3E1205EA08B}.Debug|Win32.ActiveCfg = Debug|Win32
{52C97475-88BE-437F-BC53-D3E1205EA08B}.Debug|x64.ActiveCfg = Debug|x64
{F4416466-C0B3-4105-B391-CC77FB7A0231}.Release|Win32.ActiveCfg = Release|Win32
{F4416466-C0B3-4105-B391-CC77FB7A0231}.Release|x64.ActiveCfg = Release|x64
{EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Debug|Win32.ActiveCfg = Debug|Win32
{EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Debug|x64.ActiveCfg = Debug|x64
{F4416466-C0B3-4105-B391-CC77FB7A0231}.Debug|Win32.ActiveCfg = Debug|Win32
{F4416466-C0B3-4105-B391-CC77FB7A0231}.Debug|x64.ActiveCfg = Debug|x64
{EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Release|Win32.ActiveCfg = Release|Win32
{EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Release|x64.ActiveCfg = Release|x64
{D058893B-87A8-4161-8821-FA5707504B2C}.Debug|Win32.ActiveCfg = Debug|x64
{D058893B-87A8-4161-8821-FA5707504B2C}.Debug|x64.ActiveCfg = Debug|x64
{D058893B-87A8-4161-8821-FA5707504B2C}.Release|Win32.ActiveCfg = Release|x64
{EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Debug|Win32.ActiveCfg = Debug|Win32
{EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Debug|x64.ActiveCfg = Debug|x64
{D058893B-87A8-4161-8821-FA5707504B2C}.Release|Win32.ActiveCfg = Release|Win32
{D058893B-87A8-4161-8821-FA5707504B2C}.Release|x64.ActiveCfg = Release|x64
{4A742B65-9836-4F46-8310-728F046A31C1}.Debug|Win32.ActiveCfg = Debug|Win32
{4A742B65-9836-4F46-8310-728F046A31C1}.Debug|x64.ActiveCfg = Debug|x64
{D058893B-87A8-4161-8821-FA5707504B2C}.Debug|Win32.ActiveCfg = Debug|Win32
{D058893B-87A8-4161-8821-FA5707504B2C}.Debug|x64.ActiveCfg = Debug|x64
{4A742B65-9836-4F46-8310-728F046A31C1}.Release|Win32.ActiveCfg = Release|Win32
{4A742B65-9836-4F46-8310-728F046A31C1}.Release|x64.ActiveCfg = Release|x64
{935C1E48-A26F-4E73-B8EA-B163F21E833B}.Debug|Win32.ActiveCfg = Debug|x64
{935C1E48-A26F-4E73-B8EA-B163F21E833B}.Debug|x64.ActiveCfg = Debug|x64
{935C1E48-A26F-4E73-B8EA-B163F21E833B}.Release|Win32.ActiveCfg = Release|x64
{4A742B65-9836-4F46-8310-728F046A31C1}.Debug|Win32.ActiveCfg = Debug|Win32
{4A742B65-9836-4F46-8310-728F046A31C1}.Debug|x64.ActiveCfg = Debug|x64
{935C1E48-A26F-4E73-B8EA-B163F21E833B}.Release|Win32.ActiveCfg = Release|Win32
{935C1E48-A26F-4E73-B8EA-B163F21E833B}.Release|x64.ActiveCfg = Release|x64
{49DD6B75-0009-40DC-BE9E-383C565494C8}.Debug|Win32.ActiveCfg = Debug|x64
{49DD6B75-0009-40DC-BE9E-383C565494C8}.Debug|x64.ActiveCfg = Debug|x64
{49DD6B75-0009-40DC-BE9E-383C565494C8}.Release|Win32.ActiveCfg = Release|x64
{935C1E48-A26F-4E73-B8EA-B163F21E833B}.Debug|Win32.ActiveCfg = Debug|Win32
{935C1E48-A26F-4E73-B8EA-B163F21E833B}.Debug|x64.ActiveCfg = Debug|x64
{49DD6B75-0009-40DC-BE9E-383C565494C8}.Release|Win32.ActiveCfg = Release|Win32
{49DD6B75-0009-40DC-BE9E-383C565494C8}.Release|x64.ActiveCfg = Release|x64
{D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Debug|Win32.ActiveCfg = Debug|Win32
{D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Debug|x64.ActiveCfg = Debug|x64
{49DD6B75-0009-40DC-BE9E-383C565494C8}.Debug|Win32.ActiveCfg = Debug|Win32
{49DD6B75-0009-40DC-BE9E-383C565494C8}.Debug|x64.ActiveCfg = Debug|x64
{D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Release|Win32.ActiveCfg = Release|Win32
{D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Release|x64.ActiveCfg = Release|x64
{EC599438-E241-49B5-ACA5-CB897F8041B5}.Debug|Win32.ActiveCfg = Debug|x64
{EC599438-E241-49B5-ACA5-CB897F8041B5}.Debug|x64.ActiveCfg = Debug|x64
{EC599438-E241-49B5-ACA5-CB897F8041B5}.Release|Win32.ActiveCfg = Release|x64
{D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Debug|Win32.ActiveCfg = Debug|Win32
{D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Debug|x64.ActiveCfg = Debug|x64
{EC599438-E241-49B5-ACA5-CB897F8041B5}.Release|Win32.ActiveCfg = Release|Win32
{EC599438-E241-49B5-ACA5-CB897F8041B5}.Release|x64.ActiveCfg = Release|x64
{87821166-8F44-41D1-AE5B-151CD73C96D3}.Debug|Win32.ActiveCfg = Debug|x64
{87821166-8F44-41D1-AE5B-151CD73C96D3}.Debug|x64.ActiveCfg = Debug|x64
{87821166-8F44-41D1-AE5B-151CD73C96D3}.Release|Win32.ActiveCfg = Release|x64
{EC599438-E241-49B5-ACA5-CB897F8041B5}.Debug|Win32.ActiveCfg = Debug|Win32
{EC599438-E241-49B5-ACA5-CB897F8041B5}.Debug|x64.ActiveCfg = Debug|x64
{87821166-8F44-41D1-AE5B-151CD73C96D3}.Release|Win32.ActiveCfg = Release|Win32
{87821166-8F44-41D1-AE5B-151CD73C96D3}.Release|x64.ActiveCfg = Release|x64
{3D01F893-119C-4089-A1EA-5645E8C4F366}.Debug|Win32.ActiveCfg = Debug|x64
{3D01F893-119C-4089-A1EA-5645E8C4F366}.Debug|x64.ActiveCfg = Debug|x64
{3D01F893-119C-4089-A1EA-5645E8C4F366}.Release|Win32.ActiveCfg = Release|x64
{87821166-8F44-41D1-AE5B-151CD73C96D3}.Debug|Win32.ActiveCfg = Debug|Win32
{87821166-8F44-41D1-AE5B-151CD73C96D3}.Debug|x64.ActiveCfg = Debug|x64
{3D01F893-119C-4089-A1EA-5645E8C4F366}.Release|Win32.ActiveCfg = Release|Win32
{3D01F893-119C-4089-A1EA-5645E8C4F366}.Release|x64.ActiveCfg = Release|x64
{5E4AA504-9B77-4907-A93F-982FCED077EC}.Debug|Win32.ActiveCfg = Debug|x64
{5E4AA504-9B77-4907-A93F-982FCED077EC}.Debug|x64.ActiveCfg = Debug|x64
{5E4AA504-9B77-4907-A93F-982FCED077EC}.Release|Win32.ActiveCfg = Release|x64
{3D01F893-119C-4089-A1EA-5645E8C4F366}.Debug|Win32.ActiveCfg = Debug|Win32
{3D01F893-119C-4089-A1EA-5645E8C4F366}.Debug|x64.ActiveCfg = Debug|x64
{5E4AA504-9B77-4907-A93F-982FCED077EC}.Release|Win32.ActiveCfg = Release|Win32
{5E4AA504-9B77-4907-A93F-982FCED077EC}.Release|x64.ActiveCfg = Release|x64
{51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Debug|Win32.ActiveCfg = Debug|x64
{51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Debug|x64.ActiveCfg = Debug|x64
{51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Release|Win32.ActiveCfg = Release|x64
{5E4AA504-9B77-4907-A93F-982FCED077EC}.Debug|Win32.ActiveCfg = Debug|Win32
{5E4AA504-9B77-4907-A93F-982FCED077EC}.Debug|x64.ActiveCfg = Debug|x64
{51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Release|Win32.ActiveCfg = Release|Win32
{51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Release|x64.ActiveCfg = Release|x64
{A6F0116E-48CD-4FD5-8F38-3DF1BE5286AE}.Debug|Win32.ActiveCfg = Debug|x64
{A6F0116E-48CD-4FD5-8F38-3DF1BE5286AE}.Debug|x64.ActiveCfg = Debug|x64
{A6F0116E-48CD-4FD5-8F38-3DF1BE5286AE}.Release|Win32.ActiveCfg = Release|x64
{51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Debug|Win32.ActiveCfg = Debug|Win32
{51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Debug|x64.ActiveCfg = Debug|x64
{A6F0116E-48CD-4FD5-8F38-3DF1BE5286AE}.Release|Win32.ActiveCfg = Release|Win32
{A6F0116E-48CD-4FD5-8F38-3DF1BE5286AE}.Release|x64.ActiveCfg = Release|x64
{A4D87C88-2541-473D-A2F3-CB4F37A3841F}.Debug|Win32.ActiveCfg = Debug|x64
{A4D87C88-2541-473D-A2F3-CB4F37A3841F}.Debug|x64.ActiveCfg = Debug|x64
{A4D87C88-2541-473D-A2F3-CB4F37A3841F}.Release|Win32.ActiveCfg = Release|x64
{A6F0116E-48CD-4FD5-8F38-3DF1BE5286AE}.Debug|Win32.ActiveCfg = Debug|Win32
{A6F0116E-48CD-4FD5-8F38-3DF1BE5286AE}.Debug|x64.ActiveCfg = Debug|x64
{A4D87C88-2541-473D-A2F3-CB4F37A3841F}.Release|Win32.ActiveCfg = Release|Win32
{A4D87C88-2541-473D-A2F3-CB4F37A3841F}.Release|x64.ActiveCfg = Release|x64
{A4D87C88-2541-473D-A2F3-CB4F37A3841F}.Debug|Win32.ActiveCfg = Debug|Win32
{A4D87C88-2541-473D-A2F3-CB4F37A3841F}.Debug|x64.ActiveCfg = Debug|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE

View File

@ -0,0 +1,111 @@
; PROLOGUE(mpn_add_n)
; Version 1.0.3.
;
; Copyright 2008 Jason Moxham
;
; Windows Conversion Copyright 2008 Brian Gladman
;
; This file is part of the MPIR Library.
; The MPIR Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published
; by the Free Software Foundation; either version 2.1 of the License, or (at
; your option) any later version.
; The MPIR Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
; License for more details.
; You should have received a copy of the GNU Lesser General Public License
; along with the MPIR Library; see the file COPYING.LIB. If not, write
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
; Boston, MA 02110-1301, USA.
;
; Calculate src1[size] plus(minus) src2[size] and store the result in
; dst[size]. The return value is the carry bit from the top of the result
; (1 or 0). The _nc version accepts 1 or 0 for an initial carry into the
; low limb of the calculation. Note values other than 1 or 0 here will
; lead to garbage results.
;
; mp_limb_t mpn_add_n(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t)
; mp_limb_t mpn_add_nc(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t, mp_limb_t)
; rax rdi rsi rdx rcx r8
; rax rcx rdx r8 r9 [rsp+40]
%include "yasm_mac.inc"
CPU Athlon64
BITS 64
; Win64 ABI: dst=rcx, src1=rdx, src2=r8, size=r9; the _nc carry-in is at
; [rsp+40] (return address + 32-byte shadow space). Returns carry in rax.
xalign 8
LEAF_PROC mpn_add_nc
mov r10,[rsp+40]                   ; r10 = initial carry (caller must pass 0 or 1)
jmp entry
xalign 8
LEAF_PROC mpn_add_n
xor r10, r10                       ; r10 = initial carry = 0
; common tail: r10 holds the carry-in, r9 the limb count
entry:
mov rax, r9
and rax, 3                         ; rax = size mod 4 (limbs left after the 4-limb blocks)
shr r9, 2                          ; r9 = size / 4 (number of 4-limb blocks)
lea r9,[r10+r9*2]                  ; tuck the carry-in into bit 0 beneath the block count
sar r9, 1                          ; r9 = block count again; CF = carry-in; ZF = (count == 0)
jnz .2
; size < 4: the first limb is handled unconditionally, so size >= 1 is
; assumed (the usual mpn precondition) - TODO confirm against callers.
; Note: dec does not change CF, so the adc carry chain stays intact.
mov r10, [rdx]
adc r10, [r8]
mov [rcx], r10
dec rax
jz .1
mov r10, [rdx+8]
adc r10, [r8+8]
mov [rcx+8], r10
dec rax
jz .1
mov r10, [rdx+16]
adc r10, [r8+16]
mov [rcx+16], r10
dec rax
.1: adc rax, rax                   ; rax is 0 at this point, so rax = final carry (0 or 1)
ret
xalign 8
; main loop: four limbs per iteration; only lea/mov/dec are used for
; pointer/counter bookkeeping since none of them disturbs CF between adcs
.2: mov r10, [rdx]
mov r11, [rdx+8]
lea rdx, [rdx+32]
adc r10, [r8]
adc r11, [r8+8]
lea r8, [r8+32]
mov [rcx], r10
mov [rcx+8], r11
lea rcx, [rcx+32]
mov r10, [rdx-16]
mov r11, [rdx-8]
adc r10, [r8-16]
adc r11, [r8-8]
mov [rcx-16], r10
dec r9
mov [rcx-8], r11
jnz .2
inc rax
dec rax                            ; set ZF from rax (remainder count) without touching CF
jz .3
; handle the remaining 1..3 limbs
mov r10, [rdx]
adc r10, [r8]
mov [rcx], r10
dec rax
jz .3
mov r10, [rdx+8]
adc r10, [r8+8]
mov [rcx+8], r10
dec rax
jz .3
mov r10, [rdx+16]
adc r10, [r8+16]
mov [rcx+16], r10
dec rax
.3: adc rax, rax                   ; rax is 0 here, so rax = final carry (0 or 1)
ret
end

View File

@ -0,0 +1,203 @@
; Copyright 2016 Jens Nurmann and Alexander Kruppa
; This file is part of the MPIR Library.
; The MPIR Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published
; by the Free Software Foundation; either version 2.1 of the License, or (at
; your option) any later version.
; The MPIR Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
; License for more details.
; You should have received a copy of the GNU Lesser General Public License
; along with the MPIR Library; see the file COPYING.LIB. If not, write
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
; Boston, MA 02110-1301, USA.
; mpn_copyd(mp_ptr Op2, mp_srcptr Op1, mp_size_t Size1)
; Linux RDI RSI RDX
; Win7 RCX RDX R8
;
; Description:
; The function copies a given number of limb from source to destination (while
; moving high to low in memory) and hands back the size (in limb) of the
; destination.
;
; Result:
; - Op2[ 0..size-1 ] = Op1[ 0..size-1 ]
; - number of copied limb: range [ 0..max tCounter ]
;
; Caveats:
; - if size 0 is given the content of the destination will remain untouched!
; - if Op1=Op2 no copy is done!
;
; Comments:
; - AVX-based version implemented, tested & benched on 05.01.2016 by jn
; - did some experiments with AVX based version with following results
; - AVX can be faster in L1$-L3$ if destination is aligned on 32 byte
; - AVX is generally faster on small sized operands (<=100 limb) due to the
; start-up overhead of "rep movsq" - however this could also be achieved by
; simple copy loop
; - startup overhead of "rep movsq" with negative direction is 200 cycles!!!
; - negative direction is unfavourable compared to positive "rep movsq" and
; to AVX.
%include 'yasm_mac.inc'
BITS 64
%ifdef USE_WIN64
%define Op2 RCX
%define Op1 RDX
%define Size1 R8
%define Limb R9
%define Offs R10
%else
%define Op2 RDI
%define Op1 RSI
%define Size1 RDX
%define Limb RCX
%define Offs R10
%endif
%define DLimb0 XMM0
%define QLimb0 YMM0
%define QLimb1 YMM1
%define QLimb2 YMM2
%define QLimb3 YMM3
align 32
; rax returns the original Size1 (limb count), per the header comment
LEAF_PROC mpn_copyd
mov RAX, Size1
cmp Op1, Op2
je .Exit ; no copy required =>
or RAX, RAX
je .Exit ; Size=0 =>
lea Op1, [Op1+8*Size1-8]           ; point both pointers at the top limb
lea Op2, [Op2+8*Size1-8]
; align the destination (Op2) to 32 byte: bring Op2 to 24 mod 32 so the
; [Op2-24] stores of the AVX loop below are 32-byte aligned; every
; single-limb copy re-checks Size1 so short operands exit early
test Op2, 8
jne .lCpyDecA32
mov Limb, [Op1]
mov [Op2], Limb
dec Size1
je .Exit
sub Op1, 8
sub Op2, 8
.lCpyDecA32:
test Op2, 16
jnz .lCpyDecAVX
mov Limb, [Op1]
mov [Op2], Limb
dec Size1
je .Exit
mov Limb, [Op1-8]
mov [Op2-8], Limb
dec Size1
je .Exit
sub Op1, 16
sub Op2, 16
.lCpyDecAVX:
mov Offs, 128                      ; bytes consumed per 16-limb iteration
jmp .lCpyDecAVXCheck
; main loop (prefetching disabled; unloaded cache)
; - 0.30 cycles / limb in L1$
; - 0.60 cycles / limb in L2$
; - 0.70-0.90 cycles / limb in L3$
align 16
.lCpyDecAVXLoop:
; 16 limbs per pass: unaligned loads, aligned stores (Op2 aligned above)
vmovdqu QLimb0, [Op1-24]
vmovdqu QLimb1, [Op1-56]
vmovdqu QLimb2, [Op1-88]
vmovdqu QLimb3, [Op1-120]
vmovdqa [Op2-24], QLimb0
vmovdqa [Op2-56], QLimb1
vmovdqa [Op2-88], QLimb2
vmovdqa [Op2-120], QLimb3
sub Op1, Offs
sub Op2, Offs
.lCpyDecAVXCheck:
sub Size1, 16
jnc .lCpyDecAVXLoop
add Size1, 16                      ; undo last subtraction; 0..15 limbs remain
je .Exit ; AVX copied operand fully =>
; copy remaining max. 15 limb (binary decomposition: 8, then 4, 2, 1)
test Size1, 8
je .lCpyDecFour
vmovdqu QLimb0, [Op1-24]
vmovdqu QLimb1, [Op1-56]
vmovdqa [Op2-24], QLimb0
vmovdqa [Op2-56], QLimb1
sub Op1, 64
sub Op2, 64
.lCpyDecFour:
test Size1, 4
je .lCpyDecTwo
vmovdqu QLimb0, [Op1-24]
vmovdqa [Op2-24], QLimb0
sub Op1, 32
sub Op2, 32
.lCpyDecTwo:
test Size1, 2
je .lCpyDecOne
%if 1
; Avoid SSE2 instruction due to stall on Haswell
mov Limb, [Op1]
mov [Op2], Limb
mov Limb, [Op1-8]
mov [Op2-8], Limb
%else
movdqu DLimb0, [Op1-8]
movdqa [Op2-8], DLimb0
%endif
sub Op1, 16
sub Op2, 16
.lCpyDecOne:
test Size1, 1
je .Exit
mov Limb, [Op1]
mov [Op2], Limb
.Exit:
vzeroupper                         ; avoid AVX->SSE transition penalty on return
ret
.end:

View File

@ -0,0 +1,199 @@
; Copyright 2016 Jens Nurmann and Alexander Kruppa
; This file is part of the MPIR Library.
; The MPIR Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published
; by the Free Software Foundation; either version 2.1 of the License, or (at
; your option) any later version.
; The MPIR Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
; License for more details.
; You should have received a copy of the GNU Lesser General Public License
; along with the MPIR Library; see the file COPYING.LIB. If not, write
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
; Boston, MA 02110-1301, USA.
; mpn_copyi(mp_ptr Op2, mp_srcptr Op1, mp_size_t Size1)
; Linux RDI RSI RDX
; Win7 RCX RDX R8
;
; Description:
; The function copies a given number of limb from source to destination (while
; moving low to high in memory) and hands back the size (in limb) of the
; destination.
;
; Result:
; - Op2[ 0..size-1 ] = Op1[ 0..size-1 ]
; - number of copied limb: range [ 0..max tCounter ]
;
; Caveats:
; - if size 0 is given the content of the destination will remain untouched!
; - if Op1=Op2 no copy is done!
;
; Comments:
; - AVX-based version implemented, tested & benched on 05.01.2016 by jn
; - did some experiments with AVX based version with following results
; - AVX can be faster in L1$ (30%), L2$ (10%) if dest. is aligned on 32 byte
; - AVX is generally faster on small sized operands (<=100 limb) due to the
; start-up overhead of "rep movsq" - however this could also be achieved by
; simple copy loop
; - the break-even between AVX and "rep movsq" is around 10,000 limb
; - the prologue & epilogue can still be optimized!
%include 'yasm_mac.inc'
BITS 64
%ifdef USE_WIN64
%define Op2 RCX
%define Op1 RDX
%define Size1 R8
%define Limb R9
%define Offs R10
%else
%define Op2 RDI
%define Op1 RSI
%define Size1 RDX
%define Limb RCX
%define Offs R10
%endif
%define DLimb0 XMM0
%define QLimb0 YMM0
%define QLimb1 YMM1
%define QLimb2 YMM2
%define QLimb3 YMM3
align 32
; rax returns the original Size1 (limb count), per the header comment
LEAF_PROC mpn_copyi
mov RAX, Size1
cmp Op1, Op2
je .Exit ; no copy required =>
or RAX, RAX
je .Exit ; size=0 =>
; align the destination (Op2) to 32 byte: bring Op2 to 0 mod 32 so the
; [Op2] stores of the AVX loop below are 32-byte aligned; every
; single-limb copy re-checks Size1 so short operands exit early
test Op2, 8
je .lCpyIncA32
mov Limb, [Op1]
mov [Op2], Limb
dec Size1
je .Exit
add Op1, 8
add Op2, 8
.lCpyIncA32:
test Op2, 16
je .lCpyIncAVX
mov Limb, [Op1]
mov [Op2], Limb
dec Size1
je .Exit
mov Limb, [Op1+8]
mov [Op2+8], Limb
dec Size1
je .Exit
add Op1, 16
add Op2, 16
.lCpyIncAVX:
mov Offs, 128                      ; bytes consumed per 16-limb iteration
jmp .lCpyIncAVXCheck
; main loop (prefetching disabled; unloaded cache)
; - lCpyInc is slightly slower than lCpyDec through all cache levels?!
; - 0.30 cycles / limb in L1$
; - 0.60 cycles / limb in L2$
; - 0.70-0.90 cycles / limb in L3$
align 16
.lCpyIncAVXLoop:
; 16 limbs per pass: unaligned loads, aligned stores (Op2 aligned above)
vmovdqu QLimb0, [Op1]
vmovdqu QLimb1, [Op1+32]
vmovdqu QLimb2, [Op1+64]
vmovdqu QLimb3, [Op1+96]
vmovdqa [Op2], QLimb0
vmovdqa [Op2+32], QLimb1
vmovdqa [Op2+64], QLimb2
vmovdqa [Op2+96], QLimb3
add Op1, Offs
add Op2, Offs
.lCpyIncAVXCheck:
sub Size1, 16
jnc .lCpyIncAVXLoop
add Size1, 16                      ; undo last subtraction; 0..15 limbs remain
je .Exit ; AVX copied operand fully =>
; copy remaining max. 15 limb (binary decomposition: 8, then 4, 2, 1)
test Size1, 8
je .lCpyIncFour
vmovdqu QLimb0, [Op1]
vmovdqu QLimb1, [Op1+32]
vmovdqa [Op2], QLimb0
vmovdqa [Op2+32], QLimb1
add Op1, 64
add Op2, 64
.lCpyIncFour:
test Size1, 4
je .lCpyIncTwo
vmovdqu QLimb0, [Op1]
vmovdqa [Op2], QLimb0
add Op1, 32
add Op2, 32
.lCpyIncTwo:
test Size1, 2
je .lCpyIncOne
%if 1
; Avoid SSE2 instruction due to stall on Haswell
mov Limb, [Op1]
mov [Op2], Limb
mov Limb, [Op1+8]
mov [Op2+8], Limb
%else
movdqu DLimb0, [Op1]
movdqa [Op2], DLimb0
%endif
add Op1, 16
add Op2, 16
.lCpyIncOne:
test Size1, 1
je .Exit
mov Limb, [Op1]
mov [Op2], Limb
.Exit:
vzeroupper                         ; avoid AVX->SSE transition penalty on return
ret
.end:

View File

@ -0,0 +1,285 @@
; Copyright 2016 Jens Nurmann and Alexander Kruppa
; This file is part of the MPIR Library.
; The MPIR Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published
; by the Free Software Foundation; either version 2.1 of the License, or (at
; your option) any later version.
; The MPIR Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
; License for more details.
; You should have received a copy of the GNU Lesser General Public License
; along with the MPIR Library; see the file COPYING.LIB. If not, write
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
; Boston, MA 02110-1301, USA.
; mp_limb_t mpn_lshift(mp_ptr Op2, mp_srcptr Op1, mp_size_t Size1, unsigned int Shift)
; Linux RAX RDI RSI RDX RCX
; Win7 RAX RCX RDX R8 R9
;
; Description:
; The function shifts Op1 left by Shift bits, stores the result in Op2 (non-
; destructive shl) and hands back the shifted-out most significant bits of Op1.
; The function operates decreasing in memory supporting in-place operation.
;
; Result:
; - Op2[ Size1-1..0 ] := Op1[ Size1-1..0 ] << Shift
; - the return value is Op1[ Size1-1 ] >> ( 64-Shift )
;
; Caveats:
; - caller must ensure that Shift is in [ 1..63 ]!
; - supports both the System V (Linux) and the Win64 calling convention
; - the AVX version uses mnemonics only available on Haswell, Broadwell and
; Skylake cores
; - the behaviour of cache prefetching in combination with AVX shifting seems
; somewhat erratic
; - slight (a few clock cycles) degradation for 1/2 LD1$ sizes
; - slight (a few percent) improvement for full LD1$ sizes
; - substantial (>10%) improvement for 1/2 LD2$ sizes
; - slight (a few percent) improvement for full LD2$ sizes
; - slight (a few percent) degradation for 1/2 LD3$ sizes
; - substantial (around 10%) degradation for full LD3$ sizes
;
; Comments:
; - implemented, tested and benched on 31.03.2016 by jn
; - includes prefetching
; ============================================================================
%include 'yasm_mac.inc'
BITS 64
%ifdef USE_WIN64
%define Op2 R11
%define Op1 RDX
%define Size1 R8
%define Shift RCX
%define Limb1 R9
%define Limb2 R10
%ifdef USE_PREFETCH
%define Offs -512 ; No caller-saves regs left, use immediate
%endif
%define reg_save_list XMM, 6, 7
%else
%define Op2 RDI
%define Op1 RSI
%define Size1 RDX
%define Shift RCX
%define Limb1 R8
%define Limb2 R9
%ifdef USE_PREFETCH
%define OFFS_REG 1
%define Offs R10
%endif
%endif
%define ShlDL0 XMM2 ; Attn: this must match ShlQL0 definition
%define ShrDL0 XMM3 ; Attn: this must match ShrQL0 definition
%define ShlDLCnt XMM6 ; Attn: this must match ShlQlCnt definition
%define ShrDLCnt XMM7 ; Attn: this must match ShrQlCnt definition
%define QLimb0 YMM0
%define QLimb1 YMM1
%define ShlQL0 YMM2
%define ShrQL0 YMM3
%define ShlQL1 YMM4
%define ShrQL1 YMM5
%define ShlQLCnt YMM6
%define ShrQLCnt YMM7
align 32
; Win64: rcx=Op2, rdx=Op1, r8=Size1, r9=Shift; FRAME_PROC saves XMM6/XMM7
; (callee-saved under Win64) via reg_save_list. Returns in rax the bits
; shifted out of the most significant limb. Shift must be in [1..63].
FRAME_PROC mpn_lshift, 0, reg_save_list
%ifdef USE_WIN64
mov r11, rcx                       ; Op2 lives in r11 so rcx is free for the count
mov rcx, r9                        ; shld/shl below need the shift count in CL
%endif
xor EAX, EAX
sub Size1, 1
jc .Exit ; Size1=0 =>
lea Op1, [Op1+8*Size1]             ; point both operands at their top limb
lea Op2, [Op2+8*Size1]
mov Limb1, [Op1]
shld RAX, Limb1, CL                ; rax = top Shift bits of the top limb (return value)
or Size1, Size1
je .lShlEquPost ; Size1=1 =>
%ifdef USE_PREFETCH
%ifdef OFFS_REG
mov Offs, -512                     ; prefetch distance (negative: we walk downwards)
%endif
%endif
cmp Size1, 8
jc .lShlEquFour ; AVX inefficient =>
; first align Op2 to 32 bytes (to 24 mod 32, so that the [Op2-24] stores
; of the AVX loop are 32-byte aligned); Limb1 always holds the limb whose
; store is still pending
test Op2, 8
jne .lShlEquA16
mov Limb2, [Op1-8]
shld Limb1, Limb2, CL
mov [Op2], Limb1
mov Limb1, Limb2
sub Op1, 8
sub Op2, 8
sub Size1, 1
.lShlEquA16:
test Op2, 16
jne .lShlEquAVX
mov Limb2, [Op1-8]
shld Limb1, Limb2, CL
mov [Op2], Limb1
mov Limb1, [Op1-16]
shld Limb2, Limb1, CL
mov [Op2-8], Limb2
sub Op1, 16
sub Op2, 16
sub Size1, 2
.lShlEquAVX:
; initialize AVX shift counters: ShlQLCnt = Shift, ShrQLCnt = 64-Shift;
; RCX holds Shift again after the second neg/and pair
vmovq ShlDLCnt, RCX
neg RCX
and RCX, 63 ; must do, as AVX shifts set result=0 if Shift>63!
vmovq ShrDLCnt, RCX
neg RCX
and RCX, 63 ; must do, as AVX shifts set result=0 if Shift>63!
vpbroadcastq ShlQLCnt, ShlDLCnt
vpbroadcastq ShrQLCnt, ShrDLCnt
; pre-fetch first quad-limb
vmovdqu QLimb0, [Op1-24]
vpsrlvq ShrQL0, QLimb0, ShrQLCnt   ; per-lane: bits that spill into the limb above
vpermq ShrQL0, ShrQL0, 10010011b   ; rotate lanes so each spill faces its target limb
sub Op1, 32
sub Size1, 4
jmp .lShlEquAVXCheck
; main loop (prefetching enabled; unloaded cache)
; - 0.60 cycles per limb in LD1$
; - 0.60-0.70 cycles per limb in LD2$
; - 0.70-0.90 cycles per limb in LD3$
align 16
.lShlEquAVXLoop:
%ifdef USE_PREFETCH
prefetchnta [Op1+Offs]
%endif
; software pipeline: QLimb0 holds the quad loaded last iteration, its
; spill vector ShrQL0 is completed with lane 0 from the next-lower quad
vmovdqu QLimb1, [Op1-24]
vpsllvq ShlQL0, QLimb0, ShlQLCnt
vmovdqu QLimb0, [Op1-56]
vpsrlvq ShrQL1, QLimb1, ShrQLCnt
vpermq ShrQL1, ShrQL1, 10010011b
vpblendd ShrQL0, ShrQL0, ShrQL1, 00000011b
vpor ShlQL0, ShlQL0, ShrQL0
vpsllvq ShlQL1, QLimb1, ShlQLCnt
vpsrlvq ShrQL0, QLimb0, ShrQLCnt
vpermq ShrQL0, ShrQL0, 10010011b
vpblendd ShrQL1, ShrQL1, ShrQL0, 00000011b
vmovdqa [Op2-24], ShlQL0
vpor ShlQL1, ShlQL1, ShrQL1
vmovdqa [Op2-56], ShlQL1
sub Op1, 64
sub Op2, 64
.lShlEquAVXCheck:
sub Size1, 8
jnc .lShlEquAVXLoop
; epilogue of the pipeline: the pending quad QLimb0 still needs the bits
; spilling up from the next lower limb [Op1]
mov Limb1, [Op1]
xor Limb2, Limb2
shld Limb2, Limb1, CL              ; Limb2 = high bits of [Op1] entering the quad
%if 1
vmovq ShlDL0, Limb2                ; ShlDL0 is the XMM view of ShlQL0 (same register)
vpblendd ShrQL0, ShrQL0, ShlQL0, 3 ; splice Limb2 into lane 0 of the spill vector
%else
; I am mixing in a single SSE4.1 instruction into otherwise pure AVX2
; this is generating stalls on Haswell & Broadwell architecture (Agner Fog)
; but it is only executed once and there is no AVX2 based alternative
pinsrq ShrDL0, Limb2, 0 ; SSE4.1
%endif
vpsllvq ShlQL0, QLimb0, ShlQLCnt
vpor ShlQL0, ShlQL0, ShrQL0
vmovdqa [Op2-24], ShlQL0
sub Op2, 32
add Size1, 8
; shift remaining max. 7 limbs with SHLD mnemonic (binary: 4, 2, 1)
.lShlEquFour:
sub Op1, 8
test Size1, 4
je .lShlEquTwo
mov Limb2, [Op1]
shld Limb1, Limb2, CL
mov [Op2], Limb1
mov Limb1, [Op1-8]
shld Limb2, Limb1, CL
mov [Op2-8], Limb2
mov Limb2, [Op1-16]
shld Limb1, Limb2, CL
mov [Op2-16], Limb1
mov Limb1, [Op1-24]
shld Limb2, Limb1, CL
mov [Op2-24], Limb2
sub Op1, 32
sub Op2, 32
.lShlEquTwo:
test Size1, 2
je .lShlEquOne
mov Limb2, [Op1]
shld Limb1, Limb2, CL
mov [Op2], Limb1
mov Limb1, [Op1-8]
shld Limb2, Limb1, CL
mov [Op2-8], Limb2
sub Op1, 16
sub Op2, 16
.lShlEquOne:
test Size1, 1
je .lShlEquPost
mov Limb2, [Op1]
shld Limb1, Limb2, CL
mov [Op2], Limb1
mov Limb1, Limb2
sub Op2, 8
; store least significant result limb (no limb below to shift in)
.lShlEquPost:
shl Limb1, CL
mov [Op2], Limb1
.Exit:
vzeroupper                         ; avoid AVX->SSE transition penalty on return
END_PROC reg_save_list
.end:

View File

@ -0,0 +1,282 @@
; Copyright 2016 Jens Nurmann and Alexander Kruppa
; This file is part of the MPIR Library.
; The MPIR Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published
; by the Free Software Foundation; either version 2.1 of the License, or (at
; your option) any later version.
; The MPIR Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
; License for more details.
; You should have received a copy of the GNU Lesser General Public License
; along with the MPIR Library; see the file COPYING.LIB. If not, write
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
; Boston, MA 02110-1301, USA.
; mp_limb_t mpn_rshift(mp_ptr Op2, mp_srcptr Op1, mp_size_t Size1, unsigned int Shift)
; Linux RAX RDI RSI RDX RCX
; Windows x64 RAX RCX RDX R8 R9
;
; Description:
; The function shifts Op1 right by Shift bits, stores the result in Op2 (non-
; destructive shr) and hands back the shifted-out least significant bits of
; Op1. The function operates increasing in memory supporting in place shifts.
;
; Result:
; - Op2[ Size1-1..0 ] := ( ShrIn:Op1[ Size1-1..0 ] ) >> Shift
; - Op1[ 0 ] << ( 64-Shift )
;
; Caveats:
; - caller must ensure that Shift is in [ 1..63 ]!
; - supports both the System V (Linux) and the Win64 calling convention
; - the AVX version uses mnemonics only available on Haswell, Broadwell and
; Skylake cores
; - the behaviour of cache prefetching in combination with AVX shifting seems
; somewhat erratic
; - slight (a few clock cycles) degradation for 1/2 LD1$ sizes
; - slight (a few percent) improvement for full LD1$ sizes
; - substantial (>10%) improvement for 1/2 LD2$ sizes
; - slight (a few percent) improvement for full LD2$ sizes
; - slight (a few percent) degradation for 1/2 LD3$ sizes
; - substantial (around 10%) degradation for full LD3$ sizes
;
; Comments:
; - implemented, tested and benchmarked on 30.03.2016 by jn
; - includes prefetching
; ============================================================================
%include 'yasm_mac.inc'
BITS 64
%ifdef USE_WIN64
%define Op2 R11
%define Op1 RDX
%define Size1 R8
%define Shift RCX
%define Limb1 R9
%define Limb2 R10
%ifdef USE_PREFETCH
%define Offs -512 ; No caller-saves regs left, use immediate
%endif
%define reg_save_list XMM, 6, 7
%else
%define Op2 RDI
%define Op1 RSI
%define Size1 RDX
%define Shift RCX
%define Limb1 R8
%define Limb2 R9
%ifdef USE_PREFETCH
%define OFFS_REG 1
%define Offs R10
%endif
%endif
%define ShrDL0 XMM2 ; Attn: this must match ShrQL0 definition
%define ShlDL0 XMM3 ; Attn: this must match ShlQL0 definition
%define ShrDLCnt XMM6 ; Attn: this must match ShrQlCnt definition
%define ShlDLCnt XMM7 ; Attn: this must match ShlQlCnt definition
%define QLimb0 YMM0
%define QLimb1 YMM1
%define ShrQL0 YMM2
%define ShlQL0 YMM3
%define ShrQL1 YMM4
%define ShlQL1 YMM5
%define ShrQLCnt YMM6
%define ShlQLCnt YMM7
align 32
; Win64: rcx=Op2, rdx=Op1, r8=Size1, r9=Shift; FRAME_PROC saves XMM6/XMM7
; (callee-saved under Win64) via reg_save_list. Returns in rax the bits
; shifted out of the least significant limb. Shift must be in [1..63].
FRAME_PROC mpn_rshift, 0, reg_save_list
%ifdef USE_WIN64
mov r11, rcx                       ; Op2 lives in r11 so rcx is free for the count
mov rcx, r9                        ; shrd/shr below need the shift count in CL
%endif
xor EAX, EAX
or Size1, Size1
je .Exit
mov Limb1, [Op1]
shrd RAX, Limb1, CL                ; rax = low Shift bits of limb 0, moved to the top (return value)
sub Size1, 1
je .lShrEquPost ; Size1=1 =>
%ifdef USE_PREFETCH
; NOTE(review): under USE_WIN64, Offs is the immediate -512, so this mov
; would not assemble if USE_PREFETCH were defined - confirm; the lshift
; variant guards the equivalent mov with %ifdef OFFS_REG
mov Offs, 512
%endif
cmp Size1, 8
jc .lShrEquFour ; AVX inefficient =>
; first align Op2 to 32 bytes (to 0 mod 32, so that the [Op2] stores of
; the AVX loop are 32-byte aligned); Limb1 always holds the limb whose
; store is still pending
test Op2, 8
je .lShrEquAlign16
mov Limb2, [Op1+8]
shrd Limb1, Limb2, CL
mov [Op2], Limb1
mov Limb1, Limb2
add Op1, 8
add Op2, 8
sub Size1, 1
.lShrEquAlign16:
test Op2, 16
je .lShrEquAVX
mov Limb2, [Op1+8]
shrd Limb1, Limb2, CL
mov [Op2], Limb1
mov Limb1, [Op1+16]
shrd Limb2, Limb1, CL
mov [Op2+8], Limb2
add Op1, 16
add Op2, 16
sub Size1, 2
.lShrEquAVX:
; initialize AVX shift counters: ShrQLCnt = Shift, ShlQLCnt = 64-Shift;
; RCX holds Shift again after the second neg/and pair
vmovq ShrDLCnt, RCX
neg RCX
and RCX, 63 ; must do, as AVX shifts set result=0 if Shift>63!
vmovq ShlDLCnt, RCX
neg RCX
and RCX, 63 ; must do, as AVX shifts set result=0 if Shift>63!
vpbroadcastq ShrQLCnt, ShrDLCnt
vpbroadcastq ShlQLCnt, ShlDLCnt
; pre-fetch first quad-limb
vmovdqu QLimb0, [Op1]
vpsllvq ShlQL0, QLimb0, ShlQLCnt   ; per-lane: bits that spill into the limb below
add Op1, 32
sub Size1, 4
jmp .lShrEquAVXCheck
; main loop (prefetching enabled, unloaded data cache)
; - 0.60 cycles per limb in LD1$
; - 0.60-0.70 cycles per limb in LD2$
; - 0.70-0.90 cycles per limb in LD3$
align 16
.lShrEquAVXLoop:
%ifdef USE_PREFETCH
prefetchnta [Op1+Offs]
%endif
; software pipeline: QLimb0 holds the quad loaded last iteration, its
; spill vector ShlQL0 is completed with lane 0 from the next-higher quad
vmovdqu QLimb1, [Op1]
vpsrlvq ShrQL0, QLimb0, ShrQLCnt
vmovdqu QLimb0, [Op1+32]
vpsllvq ShlQL1, QLimb1, ShlQLCnt
vpblendd ShlQL0, ShlQL0, ShlQL1, 00000011b
vpermq ShlQL0, ShlQL0, 00111001b   ; rotate lanes so each spill faces its target limb
vpor ShrQL0, ShrQL0, ShlQL0
vpsrlvq ShrQL1, QLimb1, ShrQLCnt
vpsllvq ShlQL0, QLimb0, ShlQLCnt
vpblendd ShlQL1, ShlQL1, ShlQL0, 00000011b
vpermq ShlQL1, ShlQL1, 00111001b
vmovdqa [Op2], ShrQL0
vpor ShrQL1, ShrQL1, ShlQL1
vmovdqa [Op2+32], ShrQL1
add Op1, 64
add Op2, 64
.lShrEquAVXCheck:
sub Size1, 8
jnc .lShrEquAVXLoop
; epilogue of the pipeline: the pending quad QLimb0 still needs the bits
; spilling down from the next higher limb [Op1]
mov Limb1, [Op1]
xor Limb2, Limb2
shrd Limb2, Limb1, CL              ; Limb2 = low bits of [Op1] entering the quad
%if 1
vmovq ShrDL0, Limb2                ; ShrDL0 is the XMM view of ShrQL0 (same register)
vpblendd ShlQL0, ShlQL0, ShrQL0, 3 ; splice Limb2 into lane 0 of the spill vector
%else
; I am mixing in a single SSE4.1 instruction into otherwise pure AVX2
; this is generating stalls on Haswell & Broadwell architecture (Agner Fog)
; but it is only executed once and there is no AVX2 based alternative
pinsrq ShlDL0, Limb2, 0 ; SSE4.1
%endif
vpsrlvq ShrQL0, QLimb0, ShrQLCnt
vpermq ShlQL0, ShlQL0, 00111001b
vpor ShrQL0, ShrQL0, ShlQL0
vmovdqa [Op2], ShrQL0
add Op2, 32
add Size1, 8
; shift remaining max. 7 limbs with SHRD mnemonic (binary: 4, 2, 1)
.lShrEquFour:
add Op1, 8
test Size1, 4
je .lShrEquTwo
mov Limb2, [Op1]
shrd Limb1, Limb2, CL
mov [Op2], Limb1
mov Limb1, [Op1+8]
shrd Limb2, Limb1, CL
mov [Op2+8], Limb2
mov Limb2, [Op1+16]
shrd Limb1, Limb2, CL
mov [Op2+16], Limb1
mov Limb1, [Op1+24]
shrd Limb2, Limb1, CL
mov [Op2+24], Limb2
add Op1, 32
add Op2, 32
.lShrEquTwo:
test Size1, 2
je .lShrEquOne
mov Limb2, [Op1]
shrd Limb1, Limb2, CL
mov [Op2], Limb1
mov Limb1, [Op1+8]
shrd Limb2, Limb1, CL
mov [Op2+8], Limb2
add Op1, 16
add Op2, 16
.lShrEquOne:
test Size1, 1
je .lShrEquPost
mov Limb2, [Op1]
shrd Limb1, Limb2, CL
mov [Op2], Limb1
mov Limb1, Limb2
add Op2, 8
; store most significant limb considering shift-in part
.lShrEquPost:
shr Limb1, CL
mov [Op2], Limb1
.Exit:
vzeroupper                         ; avoid AVX->SSE transition penalty on return
END_PROC reg_save_list
.end:

View File

@ -119,24 +119,48 @@
%endif
%rotate 1
%assign gpr_regs 0
%assign stack_slots 0
%assign xmm_seen 0
%if %0 > 2
%rep %0 - 2
push_reg %1
%assign gpr_regs gpr_regs + 1
%ifnum %1
%if xmm_seen == 0
%error Not an XMM register
%else
alloc_stack 16
save_xmm128 XMM%1, 0
%assign stack_slots stack_slots + 2
%endif
%elifid %1
%ifidni XMM, %1
%if stack_slots & 1 == 0
alloc_stack 8
%assign stack_slots stack_slots + 1
%assign xmm_seen 1
%else
%assign xmm_seen 2
%endif
%elif xmm_seen == 0
push_reg %1
%assign stack_slots stack_slots + 1
%else
%error XMM registers must be last in the save list
%endif
%else
%error Bad parameter list
%endif
%rotate 1
%endrep
%endif
%if (gpr_regs & 1) == (var_slots & 1)
%if (stack_slots & 1) == (var_slots & 1)
%assign var_slots var_slots + 1
%endif
%if var_slots > 0
alloc_stack 8 * var_slots
%endif
%assign stack_use 8 * (gpr_regs + var_slots)
%assign stack_use 8 * (stack_slots + var_slots)
END_PROLOGUE
%endmacro
@ -147,7 +171,16 @@
%if %0 > 0
%rep %0
%rotate -1
pop %1
%ifnum %1
movdqa XMM%1, [rsp]
add rsp, 16
%elifidni %1, XMM
%if xmm_seen == 1
add rsp, 8
%endif
%else
pop %1
%endif
%endrep
%endif
ret
@ -156,14 +189,25 @@
%macro END_PROC 0-*
add rsp, 8 * var_slots
%if var_slots
add rsp, 8 * var_slots
%endif
%if %0 > 0
%rep %0
%rotate -1
pop %1
%ifnum %1
movdqa XMM%1, [rsp]
add rsp, 16
%elifidni %1, XMM
%if xmm_seen == 1
add rsp, 8
%endif
%else
pop %1
%endif
%endrep
%endif
ret
ret
ENDPROC_FRAME
%endmacro