add more win64 assembler for haswell
This commit is contained in:
parent
a95556b926
commit
77b483e79f
@ -1,4 +1,4 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Release|x64">
|
||||
@ -51,7 +51,7 @@ prebuild haswell x64 14
|
||||
</Command>
|
||||
</PreBuildEvent>
|
||||
<YASM>
|
||||
<Defines>DLL;USE_WIN64</Defines>
|
||||
<Defines>DLL</Defines>
|
||||
<IncludePaths>..\..\mpn\x86_64w\</IncludePaths>
|
||||
<Debug>true</Debug>
|
||||
<ObjectFile>$(IntDir)mpn\</ObjectFile>
|
||||
@ -62,6 +62,7 @@ prebuild haswell x64 14
|
||||
</ClCompile>
|
||||
<Link>
|
||||
</Link>
|
||||
|
||||
<PostBuildEvent>
|
||||
<Command>cd ..\..\build.vc
|
||||
postbuild "$(TargetPath)" 14
|
||||
@ -75,7 +76,7 @@ prebuild haswell x64 14
|
||||
</Command>
|
||||
</PreBuildEvent>
|
||||
<YASM>
|
||||
<Defines>DLL;USE_WIN64</Defines>
|
||||
<Defines>DLL</Defines>
|
||||
<IncludePaths>..\..\mpn\x86_64w\</IncludePaths>
|
||||
<Debug>true</Debug>
|
||||
<ObjectFile>$(IntDir)mpn\</ObjectFile>
|
||||
@ -86,6 +87,7 @@ prebuild haswell x64 14
|
||||
</ClCompile>
|
||||
<Link>
|
||||
</Link>
|
||||
|
||||
<PostBuildEvent>
|
||||
<Command>cd ..\..\build.vc
|
||||
postbuild "$(TargetPath)" 14
|
||||
@ -464,7 +466,6 @@ postbuild "$(TargetPath)" 14
|
||||
<ClCompile Include="..\..\cxx\osmpz.cc" />
|
||||
<ClCompile Include="..\..\mpn\generic\add.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\add_1.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\add_n.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\addadd_n.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\addmul_1.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\addsub_n.c" />
|
||||
@ -474,8 +475,6 @@ postbuild "$(TargetPath)" 14
|
||||
<ClCompile Include="..\..\mpn\generic\cmp.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\com_n.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\comb_tables.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\copyd.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\copyi.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\dc_bdiv_q.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\dc_bdiv_q_n.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\dc_bdiv_qr.c" />
|
||||
@ -526,7 +525,6 @@ postbuild "$(TargetPath)" 14
|
||||
<ClCompile Include="..\..\mpn\generic\jacobi.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\jacobi_2.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\jacobi_base.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\lshift.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\matrix22_mul.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\matrix22_mul1_inverse_vector.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\mod_1.c" />
|
||||
@ -567,7 +565,6 @@ postbuild "$(TargetPath)" 14
|
||||
<ClCompile Include="..\..\mpn\generic\rsh_divrem_hensel_qr_1.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\rsh_divrem_hensel_qr_1_1.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\rsh_divrem_hensel_qr_1_2.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\rshift.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\sb_bdiv_q.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\sb_bdiv_qr.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\sb_div_q.c" />
|
||||
@ -610,17 +607,22 @@ postbuild "$(TargetPath)" 14
|
||||
<ItemGroup>
|
||||
<YASM Include="..\..\mpn\x86_64w\add_err1_n.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\add_err2_n.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\add_n.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\addmul_2.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\copyd.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\copyi.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\divexact_1.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\divexact_by3c.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\divexact_byfobm1.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\divrem_2.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\divrem_euclidean_qr_1.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\divrem_euclidean_qr_2.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\lshift.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\lshift1.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\modexact_1c_odd.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\mul_2.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\mulmid_basecase.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\rshift.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\rshift1.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\sqr_basecase.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\sub_err1_n.asm" />
|
||||
@ -632,7 +634,7 @@ postbuild "$(TargetPath)" 14
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
<Import Project="..\..\build.vc\vsyasm.targets" />
|
||||
</ImportGroup>
|
||||
<ItemGroup>
|
||||
<ItemGroup>
|
||||
<None Include="..\..\gmp-h.in" />
|
||||
</ItemGroup>
|
||||
</Project>
|
@ -1114,9 +1114,6 @@
|
||||
<ClCompile Include="..\..\mpn\generic\add_1.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\mpn\generic\add_n.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\mpn\generic\addadd_n.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
@ -1144,12 +1141,6 @@
|
||||
<ClCompile Include="..\..\mpn\generic\comb_tables.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\mpn\generic\copyd.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\mpn\generic\copyi.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\mpn\generic\dc_bdiv_q.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
@ -1300,9 +1291,6 @@
|
||||
<ClCompile Include="..\..\mpn\generic\jacobi_base.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\mpn\generic\lshift.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\mpn\generic\matrix22_mul.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
@ -1423,9 +1411,6 @@
|
||||
<ClCompile Include="..\..\mpn\generic\rsh_divrem_hensel_qr_1_2.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\mpn\generic\rshift.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\mpn\generic\sb_bdiv_q.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
@ -1548,9 +1533,18 @@
|
||||
<YASM Include="..\..\mpn\x86_64w\add_err2_n.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\add_n.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\addmul_2.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\copyd.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\copyi.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\divexact_1.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
@ -1569,6 +1563,9 @@
|
||||
<YASM Include="..\..\mpn\x86_64w\divrem_euclidean_qr_2.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\lshift.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\lshift1.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
@ -1581,6 +1578,9 @@
|
||||
<YASM Include="..\..\mpn\x86_64w\mulmid_basecase.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\rshift.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\rshift1.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
|
@ -51,7 +51,8 @@ prebuild haswell x64 14
|
||||
</Command>
|
||||
</PreBuildEvent>
|
||||
<YASM>
|
||||
<Defines>USE_WIN64</Defines>
|
||||
<Defines>
|
||||
</Defines>
|
||||
<IncludePaths>..\..\mpn\x86_64w\</IncludePaths>
|
||||
<Debug>true</Debug>
|
||||
<ObjectFile>$(IntDir)mpn\</ObjectFile>
|
||||
@ -73,7 +74,8 @@ prebuild haswell x64 14
|
||||
</Command>
|
||||
</PreBuildEvent>
|
||||
<YASM>
|
||||
<Defines>USE_WIN64</Defines>
|
||||
<Defines>
|
||||
</Defines>
|
||||
<IncludePaths>..\..\mpn\x86_64w\</IncludePaths>
|
||||
<Debug>true</Debug>
|
||||
<ObjectFile>$(IntDir)mpn\</ObjectFile>
|
||||
@ -449,7 +451,6 @@ postbuild "$(TargetPath)" 14
|
||||
<ClCompile Include="..\..\scanf\vsscanf.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\add.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\add_1.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\add_n.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\addadd_n.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\addmul_1.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\addsub_n.c" />
|
||||
@ -459,8 +460,6 @@ postbuild "$(TargetPath)" 14
|
||||
<ClCompile Include="..\..\mpn\generic\cmp.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\com_n.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\comb_tables.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\copyd.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\copyi.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\dc_bdiv_q.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\dc_bdiv_q_n.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\dc_bdiv_qr.c" />
|
||||
@ -511,7 +510,6 @@ postbuild "$(TargetPath)" 14
|
||||
<ClCompile Include="..\..\mpn\generic\jacobi.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\jacobi_2.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\jacobi_base.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\lshift.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\matrix22_mul.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\matrix22_mul1_inverse_vector.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\mod_1.c" />
|
||||
@ -552,7 +550,6 @@ postbuild "$(TargetPath)" 14
|
||||
<ClCompile Include="..\..\mpn\generic\rsh_divrem_hensel_qr_1.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\rsh_divrem_hensel_qr_1_1.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\rsh_divrem_hensel_qr_1_2.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\rshift.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\sb_bdiv_q.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\sb_bdiv_qr.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\sb_div_q.c" />
|
||||
@ -595,13 +592,29 @@ postbuild "$(TargetPath)" 14
|
||||
<ItemGroup>
|
||||
<YASM Include="..\..\mpn\x86_64w\add_err1_n.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\add_err2_n.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\add_n.asm">
|
||||
<Defines Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">USE_WIN64</Defines>
|
||||
<Defines Condition="'$(Configuration)|$(Platform)'=='Release|x64'">USE_WIN64</Defines>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\addmul_2.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\copyd.asm">
|
||||
<Defines Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">USE_WIN64</Defines>
|
||||
<Defines Condition="'$(Configuration)|$(Platform)'=='Release|x64'">USE_WIN64</Defines>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\copyi.asm">
|
||||
<Defines Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">USE_WIN64</Defines>
|
||||
<Defines Condition="'$(Configuration)|$(Platform)'=='Release|x64'">USE_WIN64</Defines>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\divexact_1.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\divexact_by3c.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\divexact_byfobm1.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\divrem_2.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\divrem_euclidean_qr_1.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\divrem_euclidean_qr_2.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\lshift.asm">
|
||||
<Defines Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">USE_WIN64</Defines>
|
||||
<Defines Condition="'$(Configuration)|$(Platform)'=='Release|x64'">USE_WIN64</Defines>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\lshift1.asm">
|
||||
<Defines Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">USE_WIN64</Defines>
|
||||
<Defines Condition="'$(Configuration)|$(Platform)'=='Release|x64'">USE_WIN64</Defines>
|
||||
@ -609,11 +622,13 @@ postbuild "$(TargetPath)" 14
|
||||
<YASM Include="..\..\mpn\x86_64w\modexact_1c_odd.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\mul_2.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\mulmid_basecase.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\rshift.asm">
|
||||
<Defines Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">USE_WIN64</Defines>
|
||||
<Defines Condition="'$(Configuration)|$(Platform)'=='Release|x64'">USE_WIN64</Defines>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\rshift1.asm">
|
||||
<Defines Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
</Defines>
|
||||
<Defines Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
</Defines>
|
||||
<Defines Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">USE_WIN64</Defines>
|
||||
<Defines Condition="'$(Configuration)|$(Platform)'=='Release|x64'">USE_WIN64</Defines>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\sqr_basecase.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\sub_err1_n.asm" />
|
||||
|
@ -1080,9 +1080,6 @@
|
||||
<ClCompile Include="..\..\mpn\generic\add_1.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\mpn\generic\add_n.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\mpn\generic\addadd_n.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
@ -1110,12 +1107,6 @@
|
||||
<ClCompile Include="..\..\mpn\generic\comb_tables.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\mpn\generic\copyd.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\mpn\generic\copyi.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\mpn\generic\dc_bdiv_q.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
@ -1266,9 +1257,6 @@
|
||||
<ClCompile Include="..\..\mpn\generic\jacobi_base.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\mpn\generic\lshift.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\mpn\generic\matrix22_mul.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
@ -1389,9 +1377,6 @@
|
||||
<ClCompile Include="..\..\mpn\generic\rsh_divrem_hensel_qr_1_2.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\mpn\generic\rshift.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\mpn\generic\sb_bdiv_q.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
@ -1514,9 +1499,18 @@
|
||||
<YASM Include="..\..\mpn\x86_64w\add_err2_n.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\add_n.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\addmul_2.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\copyd.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\copyi.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\divexact_1.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
@ -1535,6 +1529,9 @@
|
||||
<YASM Include="..\..\mpn\x86_64w\divrem_euclidean_qr_2.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\lshift.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\lshift1.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
@ -1547,6 +1544,9 @@
|
||||
<YASM Include="..\..\mpn\x86_64w\mulmid_basecase.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\rshift.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\rshift1.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
|
@ -168,7 +168,7 @@ check_config $(Platform) $(Configuration) 14
|
||||
<DataExecutionPrevention>
|
||||
</DataExecutionPrevention>
|
||||
<TargetMachine>MachineX64</TargetMachine>
|
||||
<GenerateDebugInformation>No</GenerateDebugInformation>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemGroup>
|
||||
|
@ -8,12 +8,14 @@ mpn_divexact_byfobm1
|
||||
mpn_divrem_2
|
||||
mpn_divrem_euclidean_qr_1
|
||||
mpn_divrem_euclidean_qr_2
|
||||
mpn_lshift
|
||||
mpn_lshift1
|
||||
mpn_modexact_1_odd
|
||||
mpn_modexact_1c_odd
|
||||
mpn_mul_2
|
||||
mpn_mulmid_basecase
|
||||
mpn_preinv_divrem_1
|
||||
mpn_rshift
|
||||
mpn_rshift1
|
||||
mpn_sqr_basecase
|
||||
mpn_sub_err1_n
|
||||
|
@ -1,4 +1,4 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="15.0.25914.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Release|x64">
|
||||
@ -51,7 +51,7 @@ prebuild haswell x64 15
|
||||
</Command>
|
||||
</PreBuildEvent>
|
||||
<YASM>
|
||||
<Defines>DLL;USE_WIN64</Defines>
|
||||
<Defines>DLL</Defines>
|
||||
<IncludePaths>..\..\mpn\x86_64w\</IncludePaths>
|
||||
<Debug>true</Debug>
|
||||
<ObjectFile>$(IntDir)mpn\</ObjectFile>
|
||||
@ -62,6 +62,7 @@ prebuild haswell x64 15
|
||||
</ClCompile>
|
||||
<Link>
|
||||
</Link>
|
||||
|
||||
<PostBuildEvent>
|
||||
<Command>cd ..\..\build.vc
|
||||
postbuild "$(TargetPath)" 15
|
||||
@ -75,7 +76,7 @@ prebuild haswell x64 15
|
||||
</Command>
|
||||
</PreBuildEvent>
|
||||
<YASM>
|
||||
<Defines>DLL;USE_WIN64</Defines>
|
||||
<Defines>DLL</Defines>
|
||||
<IncludePaths>..\..\mpn\x86_64w\</IncludePaths>
|
||||
<Debug>true</Debug>
|
||||
<ObjectFile>$(IntDir)mpn\</ObjectFile>
|
||||
@ -86,6 +87,7 @@ prebuild haswell x64 15
|
||||
</ClCompile>
|
||||
<Link>
|
||||
</Link>
|
||||
|
||||
<PostBuildEvent>
|
||||
<Command>cd ..\..\build.vc
|
||||
postbuild "$(TargetPath)" 15
|
||||
@ -464,7 +466,6 @@ postbuild "$(TargetPath)" 15
|
||||
<ClCompile Include="..\..\cxx\osmpz.cc" />
|
||||
<ClCompile Include="..\..\mpn\generic\add.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\add_1.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\add_n.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\addadd_n.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\addmul_1.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\addsub_n.c" />
|
||||
@ -526,7 +527,6 @@ postbuild "$(TargetPath)" 15
|
||||
<ClCompile Include="..\..\mpn\generic\jacobi.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\jacobi_2.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\jacobi_base.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\lshift.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\matrix22_mul.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\matrix22_mul1_inverse_vector.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\mod_1.c" />
|
||||
@ -567,7 +567,6 @@ postbuild "$(TargetPath)" 15
|
||||
<ClCompile Include="..\..\mpn\generic\rsh_divrem_hensel_qr_1.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\rsh_divrem_hensel_qr_1_1.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\rsh_divrem_hensel_qr_1_2.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\rshift.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\sb_bdiv_q.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\sb_bdiv_qr.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\sb_div_q.c" />
|
||||
@ -610,6 +609,7 @@ postbuild "$(TargetPath)" 15
|
||||
<ItemGroup>
|
||||
<YASM Include="..\..\mpn\x86_64w\add_err1_n.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\add_err2_n.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\add_n.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\addmul_2.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\divexact_1.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\divexact_by3c.asm" />
|
||||
@ -617,10 +617,12 @@ postbuild "$(TargetPath)" 15
|
||||
<YASM Include="..\..\mpn\x86_64w\divrem_2.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\divrem_euclidean_qr_1.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\divrem_euclidean_qr_2.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\lshift.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\lshift1.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\modexact_1c_odd.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\mul_2.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\mulmid_basecase.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\rshift.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\rshift1.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\sqr_basecase.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\sub_err1_n.asm" />
|
||||
@ -632,7 +634,7 @@ postbuild "$(TargetPath)" 15
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
<Import Project="..\..\build.vc\vsyasm.targets" />
|
||||
</ImportGroup>
|
||||
<ItemGroup>
|
||||
<ItemGroup>
|
||||
<None Include="..\..\gmp-h.in" />
|
||||
</ItemGroup>
|
||||
</Project>
|
@ -1114,9 +1114,6 @@
|
||||
<ClCompile Include="..\..\mpn\generic\add_1.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\mpn\generic\add_n.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\mpn\generic\addadd_n.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
@ -1300,9 +1297,6 @@
|
||||
<ClCompile Include="..\..\mpn\generic\jacobi_base.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\mpn\generic\lshift.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\mpn\generic\matrix22_mul.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
@ -1423,9 +1417,6 @@
|
||||
<ClCompile Include="..\..\mpn\generic\rsh_divrem_hensel_qr_1_2.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\mpn\generic\rshift.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\mpn\generic\sb_bdiv_q.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
@ -1548,6 +1539,9 @@
|
||||
<YASM Include="..\..\mpn\x86_64w\add_err2_n.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\add_n.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\addmul_2.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
@ -1569,6 +1563,9 @@
|
||||
<YASM Include="..\..\mpn\x86_64w\divrem_euclidean_qr_2.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\lshift.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\lshift1.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
@ -1581,6 +1578,9 @@
|
||||
<YASM Include="..\..\mpn\x86_64w\mulmid_basecase.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\rshift.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\rshift1.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
|
@ -1,4 +1,4 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="15.0.25914.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Release|x64">
|
||||
@ -51,7 +51,7 @@ prebuild haswell x64 15
|
||||
</Command>
|
||||
</PreBuildEvent>
|
||||
<YASM>
|
||||
<Defines>USE_WIN64</Defines>
|
||||
<Defines></Defines>
|
||||
<IncludePaths>..\..\mpn\x86_64w\</IncludePaths>
|
||||
<Debug>true</Debug>
|
||||
<ObjectFile>$(IntDir)mpn\</ObjectFile>
|
||||
@ -60,6 +60,7 @@ prebuild haswell x64 15
|
||||
<AdditionalIncludeDirectories>..\..\</AdditionalIncludeDirectories>
|
||||
<PreprocessorDefinitions>NDEBUG;WIN32;_LIB;HAVE_CONFIG_H;_WIN64;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
</ClCompile>
|
||||
|
||||
<PostBuildEvent>
|
||||
<Command>cd ..\..\build.vc
|
||||
postbuild "$(TargetPath)" 15
|
||||
@ -73,7 +74,7 @@ prebuild haswell x64 15
|
||||
</Command>
|
||||
</PreBuildEvent>
|
||||
<YASM>
|
||||
<Defines>USE_WIN64</Defines>
|
||||
<Defines></Defines>
|
||||
<IncludePaths>..\..\mpn\x86_64w\</IncludePaths>
|
||||
<Debug>true</Debug>
|
||||
<ObjectFile>$(IntDir)mpn\</ObjectFile>
|
||||
@ -82,6 +83,7 @@ prebuild haswell x64 15
|
||||
<AdditionalIncludeDirectories>..\..\</AdditionalIncludeDirectories>
|
||||
<PreprocessorDefinitions>_DEBUG;WIN32;_LIB;HAVE_CONFIG_H;_WIN64;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
</ClCompile>
|
||||
|
||||
<PostBuildEvent>
|
||||
<Command>cd ..\..\build.vc
|
||||
postbuild "$(TargetPath)" 15
|
||||
@ -449,7 +451,6 @@ postbuild "$(TargetPath)" 15
|
||||
<ClCompile Include="..\..\scanf\vsscanf.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\add.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\add_1.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\add_n.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\addadd_n.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\addmul_1.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\addsub_n.c" />
|
||||
@ -511,7 +512,6 @@ postbuild "$(TargetPath)" 15
|
||||
<ClCompile Include="..\..\mpn\generic\jacobi.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\jacobi_2.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\jacobi_base.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\lshift.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\matrix22_mul.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\matrix22_mul1_inverse_vector.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\mod_1.c" />
|
||||
@ -552,7 +552,6 @@ postbuild "$(TargetPath)" 15
|
||||
<ClCompile Include="..\..\mpn\generic\rsh_divrem_hensel_qr_1.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\rsh_divrem_hensel_qr_1_1.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\rsh_divrem_hensel_qr_1_2.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\rshift.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\sb_bdiv_q.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\sb_bdiv_qr.c" />
|
||||
<ClCompile Include="..\..\mpn\generic\sb_div_q.c" />
|
||||
@ -595,6 +594,7 @@ postbuild "$(TargetPath)" 15
|
||||
<ItemGroup>
|
||||
<YASM Include="..\..\mpn\x86_64w\add_err1_n.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\add_err2_n.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\add_n.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\addmul_2.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\divexact_1.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\divexact_by3c.asm" />
|
||||
@ -602,10 +602,12 @@ postbuild "$(TargetPath)" 15
|
||||
<YASM Include="..\..\mpn\x86_64w\divrem_2.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\divrem_euclidean_qr_1.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\divrem_euclidean_qr_2.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\lshift.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\lshift1.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\modexact_1c_odd.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\mul_2.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\mulmid_basecase.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\rshift.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\rshift1.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\sqr_basecase.asm" />
|
||||
<YASM Include="..\..\mpn\x86_64w\sub_err1_n.asm" />
|
||||
@ -617,7 +619,7 @@ postbuild "$(TargetPath)" 15
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
<Import Project="..\..\build.vc\vsyasm.targets" />
|
||||
</ImportGroup>
|
||||
<ItemGroup>
|
||||
<ItemGroup>
|
||||
<None Include="..\..\gmp-h.in" />
|
||||
</ItemGroup>
|
||||
</Project>
|
@ -1080,9 +1080,6 @@
|
||||
<ClCompile Include="..\..\mpn\generic\add_1.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\mpn\generic\add_n.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\mpn\generic\addadd_n.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
@ -1266,9 +1263,6 @@
|
||||
<ClCompile Include="..\..\mpn\generic\jacobi_base.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\mpn\generic\lshift.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\mpn\generic\matrix22_mul.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
@ -1389,9 +1383,6 @@
|
||||
<ClCompile Include="..\..\mpn\generic\rsh_divrem_hensel_qr_1_2.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\mpn\generic\rshift.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\mpn\generic\sb_bdiv_q.c">
|
||||
<Filter>Source Files\mpn</Filter>
|
||||
</ClCompile>
|
||||
@ -1514,6 +1505,9 @@
|
||||
<YASM Include="..\..\mpn\x86_64w\add_err2_n.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\add_n.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\addmul_2.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
@ -1535,6 +1529,9 @@
|
||||
<YASM Include="..\..\mpn\x86_64w\divrem_euclidean_qr_2.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\lshift.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\lshift1.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
@ -1547,6 +1544,9 @@
|
||||
<YASM Include="..\..\mpn\x86_64w\mulmid_basecase.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\rshift.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
<YASM Include="..\..\mpn\x86_64w\haswell\rshift1.asm">
|
||||
<Filter>Source Files\mpn\yasm</Filter>
|
||||
</YASM>
|
||||
|
@ -1,6 +1,7 @@
|
||||
|
||||
Microsoft Visual Studio Solution File, Format Version 12.00
|
||||
# Visual Studio 15
|
||||
VisualStudioVersion = 15.0.25914.0
|
||||
VisualStudioVersion = 15.0
|
||||
MinimumVisualStudioVersion = 10.0.40219.1
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "lib_mpir_k8", "lib_mpir_k8\lib_mpir_k8.vcxproj", "{22D14BC4-ADC6-48C6-8BA1-4AC45799B945}"
|
||||
EndProject
|
||||
@ -44,74 +45,74 @@ Global
|
||||
Release|x64 = Release|x64
|
||||
EndGlobalSection
|
||||
GlobalSection(ProjectConfigurationPlatforms) = postSolution
|
||||
{22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Debug|Win32.ActiveCfg = Debug|x64
|
||||
{22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Release|Win32.ActiveCfg = Release|x64
|
||||
{22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Release|x64.ActiveCfg = Release|x64
|
||||
{F4418981-ABAA-4B7A-9538-3E0166149F28}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{F4418981-ABAA-4B7A-9538-3E0166149F28}.Debug|x64.ActiveCfg = Debug|Win32
|
||||
{22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{F4418981-ABAA-4B7A-9538-3E0166149F28}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{F4418981-ABAA-4B7A-9538-3E0166149F28}.Release|x64.ActiveCfg = Release|Win32
|
||||
{52C97475-88BE-437F-BC53-D3E1205EA08B}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{52C97475-88BE-437F-BC53-D3E1205EA08B}.Debug|x64.ActiveCfg = Debug|Win32
|
||||
{F4418981-ABAA-4B7A-9538-3E0166149F28}.Release|x64.ActiveCfg = Release|x64
|
||||
{F4418981-ABAA-4B7A-9538-3E0166149F28}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{F4418981-ABAA-4B7A-9538-3E0166149F28}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{52C97475-88BE-437F-BC53-D3E1205EA08B}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{52C97475-88BE-437F-BC53-D3E1205EA08B}.Release|x64.ActiveCfg = Release|Win32
|
||||
{F4416466-C0B3-4105-B391-CC77FB7A0231}.Debug|Win32.ActiveCfg = Debug|x64
|
||||
{F4416466-C0B3-4105-B391-CC77FB7A0231}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{F4416466-C0B3-4105-B391-CC77FB7A0231}.Release|Win32.ActiveCfg = Release|x64
|
||||
{52C97475-88BE-437F-BC53-D3E1205EA08B}.Release|x64.ActiveCfg = Release|x64
|
||||
{52C97475-88BE-437F-BC53-D3E1205EA08B}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{52C97475-88BE-437F-BC53-D3E1205EA08B}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{F4416466-C0B3-4105-B391-CC77FB7A0231}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{F4416466-C0B3-4105-B391-CC77FB7A0231}.Release|x64.ActiveCfg = Release|x64
|
||||
{EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{F4416466-C0B3-4105-B391-CC77FB7A0231}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{F4416466-C0B3-4105-B391-CC77FB7A0231}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Release|x64.ActiveCfg = Release|x64
|
||||
{D058893B-87A8-4161-8821-FA5707504B2C}.Debug|Win32.ActiveCfg = Debug|x64
|
||||
{D058893B-87A8-4161-8821-FA5707504B2C}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{D058893B-87A8-4161-8821-FA5707504B2C}.Release|Win32.ActiveCfg = Release|x64
|
||||
{EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{D058893B-87A8-4161-8821-FA5707504B2C}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{D058893B-87A8-4161-8821-FA5707504B2C}.Release|x64.ActiveCfg = Release|x64
|
||||
{4A742B65-9836-4F46-8310-728F046A31C1}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{4A742B65-9836-4F46-8310-728F046A31C1}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{D058893B-87A8-4161-8821-FA5707504B2C}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{D058893B-87A8-4161-8821-FA5707504B2C}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{4A742B65-9836-4F46-8310-728F046A31C1}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{4A742B65-9836-4F46-8310-728F046A31C1}.Release|x64.ActiveCfg = Release|x64
|
||||
{935C1E48-A26F-4E73-B8EA-B163F21E833B}.Debug|Win32.ActiveCfg = Debug|x64
|
||||
{935C1E48-A26F-4E73-B8EA-B163F21E833B}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{935C1E48-A26F-4E73-B8EA-B163F21E833B}.Release|Win32.ActiveCfg = Release|x64
|
||||
{4A742B65-9836-4F46-8310-728F046A31C1}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{4A742B65-9836-4F46-8310-728F046A31C1}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{935C1E48-A26F-4E73-B8EA-B163F21E833B}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{935C1E48-A26F-4E73-B8EA-B163F21E833B}.Release|x64.ActiveCfg = Release|x64
|
||||
{49DD6B75-0009-40DC-BE9E-383C565494C8}.Debug|Win32.ActiveCfg = Debug|x64
|
||||
{49DD6B75-0009-40DC-BE9E-383C565494C8}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{49DD6B75-0009-40DC-BE9E-383C565494C8}.Release|Win32.ActiveCfg = Release|x64
|
||||
{935C1E48-A26F-4E73-B8EA-B163F21E833B}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{935C1E48-A26F-4E73-B8EA-B163F21E833B}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{49DD6B75-0009-40DC-BE9E-383C565494C8}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{49DD6B75-0009-40DC-BE9E-383C565494C8}.Release|x64.ActiveCfg = Release|x64
|
||||
{D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{49DD6B75-0009-40DC-BE9E-383C565494C8}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{49DD6B75-0009-40DC-BE9E-383C565494C8}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Release|x64.ActiveCfg = Release|x64
|
||||
{EC599438-E241-49B5-ACA5-CB897F8041B5}.Debug|Win32.ActiveCfg = Debug|x64
|
||||
{EC599438-E241-49B5-ACA5-CB897F8041B5}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{EC599438-E241-49B5-ACA5-CB897F8041B5}.Release|Win32.ActiveCfg = Release|x64
|
||||
{D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{EC599438-E241-49B5-ACA5-CB897F8041B5}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{EC599438-E241-49B5-ACA5-CB897F8041B5}.Release|x64.ActiveCfg = Release|x64
|
||||
{87821166-8F44-41D1-AE5B-151CD73C96D3}.Debug|Win32.ActiveCfg = Debug|x64
|
||||
{87821166-8F44-41D1-AE5B-151CD73C96D3}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{87821166-8F44-41D1-AE5B-151CD73C96D3}.Release|Win32.ActiveCfg = Release|x64
|
||||
{EC599438-E241-49B5-ACA5-CB897F8041B5}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{EC599438-E241-49B5-ACA5-CB897F8041B5}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{87821166-8F44-41D1-AE5B-151CD73C96D3}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{87821166-8F44-41D1-AE5B-151CD73C96D3}.Release|x64.ActiveCfg = Release|x64
|
||||
{3D01F893-119C-4089-A1EA-5645E8C4F366}.Debug|Win32.ActiveCfg = Debug|x64
|
||||
{3D01F893-119C-4089-A1EA-5645E8C4F366}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{3D01F893-119C-4089-A1EA-5645E8C4F366}.Release|Win32.ActiveCfg = Release|x64
|
||||
{87821166-8F44-41D1-AE5B-151CD73C96D3}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{87821166-8F44-41D1-AE5B-151CD73C96D3}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{3D01F893-119C-4089-A1EA-5645E8C4F366}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{3D01F893-119C-4089-A1EA-5645E8C4F366}.Release|x64.ActiveCfg = Release|x64
|
||||
{5E4AA504-9B77-4907-A93F-982FCED077EC}.Debug|Win32.ActiveCfg = Debug|x64
|
||||
{5E4AA504-9B77-4907-A93F-982FCED077EC}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{5E4AA504-9B77-4907-A93F-982FCED077EC}.Release|Win32.ActiveCfg = Release|x64
|
||||
{3D01F893-119C-4089-A1EA-5645E8C4F366}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{3D01F893-119C-4089-A1EA-5645E8C4F366}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{5E4AA504-9B77-4907-A93F-982FCED077EC}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{5E4AA504-9B77-4907-A93F-982FCED077EC}.Release|x64.ActiveCfg = Release|x64
|
||||
{51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Debug|Win32.ActiveCfg = Debug|x64
|
||||
{51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Release|Win32.ActiveCfg = Release|x64
|
||||
{5E4AA504-9B77-4907-A93F-982FCED077EC}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{5E4AA504-9B77-4907-A93F-982FCED077EC}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Release|x64.ActiveCfg = Release|x64
|
||||
{A6F0116E-48CD-4FD5-8F38-3DF1BE5286AE}.Debug|Win32.ActiveCfg = Debug|x64
|
||||
{A6F0116E-48CD-4FD5-8F38-3DF1BE5286AE}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{A6F0116E-48CD-4FD5-8F38-3DF1BE5286AE}.Release|Win32.ActiveCfg = Release|x64
|
||||
{51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{A6F0116E-48CD-4FD5-8F38-3DF1BE5286AE}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{A6F0116E-48CD-4FD5-8F38-3DF1BE5286AE}.Release|x64.ActiveCfg = Release|x64
|
||||
{A4D87C88-2541-473D-A2F3-CB4F37A3841F}.Debug|Win32.ActiveCfg = Debug|x64
|
||||
{A4D87C88-2541-473D-A2F3-CB4F37A3841F}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{A4D87C88-2541-473D-A2F3-CB4F37A3841F}.Release|Win32.ActiveCfg = Release|x64
|
||||
{A6F0116E-48CD-4FD5-8F38-3DF1BE5286AE}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{A6F0116E-48CD-4FD5-8F38-3DF1BE5286AE}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{A4D87C88-2541-473D-A2F3-CB4F37A3841F}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{A4D87C88-2541-473D-A2F3-CB4F37A3841F}.Release|x64.ActiveCfg = Release|x64
|
||||
{A4D87C88-2541-473D-A2F3-CB4F37A3841F}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{A4D87C88-2541-473D-A2F3-CB4F37A3841F}.Debug|x64.ActiveCfg = Debug|x64
|
||||
EndGlobalSection
|
||||
GlobalSection(SolutionProperties) = preSolution
|
||||
HideSolutionNode = FALSE
|
||||
|
111
mpn/x86_64w/haswell/add_n.asm
Normal file
111
mpn/x86_64w/haswell/add_n.asm
Normal file
@ -0,0 +1,111 @@
|
||||
; PROLOGUE(mpn_add_n)
|
||||
|
||||
; Version 1.0.3.
|
||||
;
|
||||
; Copyright 2008 Jason Moxham
|
||||
;
|
||||
; Windows Conversion Copyright 2008 Brian Gladman
|
||||
;
|
||||
; This file is part of the MPIR Library.
|
||||
; The MPIR Library is free software; you can redistribute it and/or modify
|
||||
; it under the terms of the GNU Lesser General Public License as published
|
||||
; by the Free Software Foundation; either version 2.1 of the License, or (at
|
||||
; your option) any later version.
|
||||
; The MPIR Library is distributed in the hope that it will be useful, but
|
||||
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||||
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||||
; License for more details.
|
||||
; You should have received a copy of the GNU Lesser General Public License
|
||||
; along with the MPIR Library; see the file COPYING.LIB. If not, write
|
||||
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
|
||||
; Boston, MA 02110-1301, USA.
|
||||
;
|
||||
; Calculate src1[size] plus(minus) src2[size] and store the result in
|
||||
; dst[size]. The return value is the carry bit from the top of the result
|
||||
; (1 or 0). The _nc version accepts 1 or 0 for an initial carry into the
|
||||
; low limb of the calculation. Note values other than 1 or 0 here will
|
||||
; lead to garbage results.
|
||||
;
|
||||
; mp_limb_t mpn_add_n(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t)
|
||||
; mp_limb_t mpn_add_nc(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t, mp_limb_t)
|
||||
; rax rdi rsi rdx rcx r8
|
||||
; rax rcx rdx r8 r9 [rsp+40]
|
||||
|
||||
%include "yasm_mac.inc"
|
||||
|
||||
CPU Athlon64
|
||||
BITS 64
|
||||
|
||||
xalign 8
|
||||
; mpn_add_nc: entry point with an explicit carry-in. The carry is read from
; [rsp+40] (the fifth-argument slot shown in the register table of the file
; header) into r10, then control joins the shared body at 'entry'.
LEAF_PROC mpn_add_nc
|
||||
mov r10,[rsp+40] ; r10 = carry-in (the header requires it to be 0 or 1)
|
||||
jmp entry
|
||||
|
||||
xalign 8
|
||||
; mpn_add_n: same body as mpn_add_nc with the carry-in forced to zero.
; Register use in the shared body: rcx = dst, rdx = src1, r8 = src2, r9 = size.
LEAF_PROC mpn_add_n
|
||||
xor r10, r10 ; carry-in = 0
|
||||
entry:
|
||||
mov rax, r9 ; rax = size
|
||||
and rax, 3 ; rax = size mod 4: limbs left for the one-at-a-time tail
|
||||
shr r9, 2 ; r9 = size / 4: iterations of the four-limb loop at .2
|
||||
lea r9,[r10+r9*2] ; r9 = 2*iterations + carry-in (lea does not touch flags)
|
||||
sar r9, 1 ; CF = carry-in, r9 = iterations, ZF set when iterations = 0
|
||||
jnz .2 ; four or more limbs: run the unrolled loop first
|
||||
|
||||
; Fewer than four limbs. mov and dec leave CF unchanged, so the carry set up
; by 'sar' above flows through every adc of this sequence.
mov r10, [rdx]
|
||||
adc r10, [r8]
|
||||
mov [rcx], r10
|
||||
dec rax
|
||||
jz .1
|
||||
mov r10, [rdx+8]
|
||||
adc r10, [r8+8]
|
||||
mov [rcx+8], r10
|
||||
dec rax
|
||||
jz .1
|
||||
mov r10, [rdx+16]
|
||||
adc r10, [r8+16]
|
||||
mov [rcx+16], r10
|
||||
dec rax
|
||||
; rax has been counted down to 0 here (assuming size is at least 1), so the
; adc below turns the final carry flag into the 0/1 return value.
.1: adc rax, rax
|
||||
ret
|
||||
|
||||
xalign 8
|
||||
; Main loop: four limbs per iteration. Pointers are advanced with lea and the
; counter with dec, neither of which modifies CF, so the carry propagates from
; one adc to the next and across iterations.
.2: mov r10, [rdx]
|
||||
mov r11, [rdx+8]
|
||||
lea rdx, [rdx+32]
|
||||
adc r10, [r8]
|
||||
adc r11, [r8+8]
|
||||
lea r8, [r8+32]
|
||||
mov [rcx], r10
|
||||
mov [rcx+8], r11
|
||||
lea rcx, [rcx+32]
|
||||
mov r10, [rdx-16]
|
||||
mov r11, [rdx-8]
|
||||
adc r10, [r8-16]
|
||||
adc r11, [r8-8]
|
||||
mov [rcx-16], r10
|
||||
dec r9
|
||||
mov [rcx-8], r11 ; mov keeps the ZF produced by 'dec r9' for the jnz below
|
||||
jnz .2
|
||||
|
||||
; inc followed by dec leaves rax unchanged and sets ZF from it (size mod 4)
; without modifying CF, which still holds the running carry.
inc rax
|
||||
dec rax
|
||||
jz .3
|
||||
mov r10, [rdx]
|
||||
adc r10, [r8]
|
||||
mov [rcx], r10
|
||||
dec rax
|
||||
jz .3
|
||||
mov r10, [rdx+8]
|
||||
adc r10, [r8+8]
|
||||
mov [rcx+8], r10
|
||||
dec rax
|
||||
jz .3
|
||||
mov r10, [rdx+16]
|
||||
adc r10, [r8+16]
|
||||
mov [rcx+16], r10
|
||||
dec rax
|
||||
; rax is 0 on every path into .3, so this returns the final carry (0 or 1).
.3: adc rax, rax
|
||||
ret
|
||||
|
||||
end
|
203
mpn/x86_64w/haswell/copyd.asm
Normal file
203
mpn/x86_64w/haswell/copyd.asm
Normal file
@ -0,0 +1,203 @@
|
||||
|
||||
; Copyright 2016 Jens Nurmann and Alexander Kruppa
|
||||
|
||||
; This file is part of the MPIR Library.
|
||||
|
||||
; The MPIR Library is free software; you can redistribute it and/or modify
|
||||
; it under the terms of the GNU Lesser General Public License as published
|
||||
; by the Free Software Foundation; either version 2.1 of the License, or (at
|
||||
; your option) any later version.
|
||||
|
||||
; The MPIR Library is distributed in the hope that it will be useful, but
|
||||
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||||
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||||
; License for more details.
|
||||
|
||||
; You should have received a copy of the GNU Lesser General Public License
|
||||
; along with the MPIR Library; see the file COPYING.LIB. If not, write
|
||||
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
|
||||
; Boston, MA 02110-1301, USA.
|
||||
|
||||
|
||||
; mpn_copyd(mp_ptr Op2, mp_srcptr Op1, mp_size_t Size1)
|
||||
; Linux RDI RSI RDX
|
||||
; Win7 RCX RDX R8
|
||||
;
|
||||
; Description:
|
||||
; The function copies a given number of limb from source to destination (while
|
||||
; moving high to low in memory) and hands back the size (in limb) of the
|
||||
; destination.
|
||||
;
|
||||
; Result:
|
||||
; - Op2[ 0..size-1 ] = Op1[ 0..size-1 ]
|
||||
; - number of copied limb: range [ 0..max tCounter ]
|
||||
;
|
||||
; Caveats:
|
||||
; - if size 0 is given the content of the destination will remain untouched!
|
||||
; - if Op1=Op2 no copy is done!
|
||||
;
|
||||
; Comments:
|
||||
; - AVX-based version implemented, tested & benched on 05.01.2016 by jn
|
||||
; - did some experiments with AVX based version with following results
|
||||
; - AVX can be faster in L1$-L3$ if destination is aligned on 32 byte
|
||||
; - AVX is generally faster on small sized operands (<=100 limb) due to
|
||||
; start-up overhead of "rep movsq" - however this could also be achieved by
|
||||
; simple copy loop
|
||||
; - startup overhead of "rep movsq" with negative direction is 200 cycles!!!
|
||||
; - negative direction is unfavourable compared to positive "rep movsq" and
|
||||
; to AVX.
|
||||
|
||||
%include 'yasm_mac.inc'
|
||||
|
||||
BITS 64
|
||||
|
||||
%ifdef USE_WIN64
|
||||
%define Op2 RCX
|
||||
%define Op1 RDX
|
||||
%define Size1 R8
|
||||
%define Limb R9
|
||||
%define Offs R10
|
||||
%else
|
||||
%define Op2 RDI
|
||||
%define Op1 RSI
|
||||
%define Size1 RDX
|
||||
%define Limb RCX
|
||||
%define Offs R10
|
||||
%endif
|
||||
|
||||
%define DLimb0 XMM0
|
||||
%define QLimb0 YMM0
|
||||
%define QLimb1 YMM1
|
||||
%define QLimb2 YMM2
|
||||
%define QLimb3 YMM3
|
||||
|
||||
align 32
|
||||
|
||||
; mpn_copyd: copy Size1 limbs from Op1 to Op2, walking from the highest limb
; down to the lowest. RAX returns the original Size1. Single limbs are copied
; first until the destination is positioned for aligned 32-byte stores, then
; 16 limbs per iteration with AVX, then the remainder by testing the bits of
; the leftover count.
LEAF_PROC mpn_copyd
|
||||
mov RAX, Size1 ; return value = number of limbs requested
|
||||
cmp Op1, Op2
|
||||
je .Exit ; no copy required =>
|
||||
|
||||
or RAX, RAX
|
||||
je .Exit ; Size=0 =>
|
||||
|
||||
lea Op1, [Op1+8*Size1-8] ; Op1 = address of the highest source limb
|
||||
lea Op2, [Op2+8*Size1-8] ; Op2 = address of the highest destination limb
|
||||
|
||||
; align the destination (Op2) to 32 byte
|
||||
; The aligned stores below write to [Op2-24], so Op2 must end up with address
; bits 3 and 4 both set. Bit 3 is handled first: if it is clear, copy one limb.
test Op2, 8
|
||||
jne .lCpyDecA32
|
||||
|
||||
mov Limb, [Op1]
|
||||
mov [Op2], Limb
|
||||
dec Size1
|
||||
je .Exit
|
||||
|
||||
sub Op1, 8
|
||||
sub Op2, 8
|
||||
|
||||
.lCpyDecA32:
|
||||
|
||||
; Bit 4: if it is clear, copy two more limbs (leaving early if the count
; reaches zero after either of them).
test Op2, 16
|
||||
jnz .lCpyDecAVX
|
||||
|
||||
mov Limb, [Op1]
|
||||
mov [Op2], Limb
|
||||
dec Size1
|
||||
je .Exit
|
||||
|
||||
mov Limb, [Op1-8]
|
||||
mov [Op2-8], Limb
|
||||
dec Size1
|
||||
je .Exit
|
||||
|
||||
sub Op1, 16
|
||||
sub Op2, 16
|
||||
|
||||
.lCpyDecAVX:
|
||||
|
||||
mov Offs, 128 ; bytes consumed per main-loop iteration (16 limbs)
|
||||
jmp .lCpyDecAVXCheck
|
||||
|
||||
; main loop (prefetching disabled; unloaded cache)
|
||||
; - 0.30 cycles / limb in L1$
|
||||
; - 0.60 cycles / limb in L2$
|
||||
; - 0.70-0.90 cycles / limb in L3$
|
||||
align 16
|
||||
.lCpyDecAVXLoop:
|
||||
|
||||
; All four 32-byte loads (unaligned source) are issued before the four
; aligned stores.
vmovdqu QLimb0, [Op1-24]
|
||||
vmovdqu QLimb1, [Op1-56]
|
||||
vmovdqu QLimb2, [Op1-88]
|
||||
vmovdqu QLimb3, [Op1-120]
|
||||
vmovdqa [Op2-24], QLimb0
|
||||
vmovdqa [Op2-56], QLimb1
|
||||
vmovdqa [Op2-88], QLimb2
|
||||
vmovdqa [Op2-120], QLimb3
|
||||
|
||||
sub Op1, Offs
|
||||
sub Op2, Offs
|
||||
|
||||
.lCpyDecAVXCheck:
|
||||
|
||||
sub Size1, 16 ; a borrow (CF=1) means fewer than 16 limbs were left
|
||||
jnc .lCpyDecAVXLoop
|
||||
|
||||
add Size1, 16 ; undo the last subtraction: Size1 = remaining limbs (0..15)
|
||||
je .Exit ; AVX copied operand fully =>
|
||||
|
||||
; copy remaining max. 15 limb
|
||||
; Each bit of the remaining count selects one fixed-size step: 8, 4, 2, 1.
test Size1, 8
|
||||
je .lCpyDecFour
|
||||
|
||||
vmovdqu QLimb0, [Op1-24]
|
||||
vmovdqu QLimb1, [Op1-56]
|
||||
vmovdqa [Op2-24], QLimb0
|
||||
vmovdqa [Op2-56], QLimb1
|
||||
|
||||
sub Op1, 64
|
||||
sub Op2, 64
|
||||
|
||||
.lCpyDecFour:
|
||||
|
||||
test Size1, 4
|
||||
je .lCpyDecTwo
|
||||
|
||||
vmovdqu QLimb0, [Op1-24]
|
||||
vmovdqa [Op2-24], QLimb0
|
||||
|
||||
sub Op1, 32
|
||||
sub Op2, 32
|
||||
|
||||
.lCpyDecTwo:
|
||||
|
||||
test Size1, 2
|
||||
je .lCpyDecOne
|
||||
|
||||
; The '%if 1' branch (two scalar moves) is the one that is assembled; the
; SSE2 alternative under '%else' is kept for reference only.
%if 1
|
||||
; Avoid SSE2 instruction due to stall on Haswell
|
||||
mov Limb, [Op1]
|
||||
mov [Op2], Limb
|
||||
mov Limb, [Op1-8]
|
||||
mov [Op2-8], Limb
|
||||
%else
|
||||
movdqu DLimb0, [Op1-8]
|
||||
movdqa [Op2-8], DLimb0
|
||||
%endif
|
||||
|
||||
sub Op1, 16
|
||||
sub Op2, 16
|
||||
|
||||
.lCpyDecOne:
|
||||
|
||||
test Size1, 1
|
||||
je .Exit
|
||||
|
||||
mov Limb, [Op1]
|
||||
mov [Op2], Limb
|
||||
|
||||
.Exit:
|
||||
|
||||
; Executed on every exit path, including those that never touched a YMM
; register; clears the upper halves of the YMM registers before returning.
vzeroupper
|
||||
ret
|
||||
.end:
|
199
mpn/x86_64w/haswell/copyi.asm
Normal file
199
mpn/x86_64w/haswell/copyi.asm
Normal file
@ -0,0 +1,199 @@
|
||||
|
||||
; Copyright 2016 Jens Nurmann and Alexander Kruppa
|
||||
|
||||
; This file is part of the MPIR Library.
|
||||
|
||||
; The MPIR Library is free software; you can redistribute it and/or modify
|
||||
; it under the terms of the GNU Lesser General Public License as published
|
||||
; by the Free Software Foundation; either version 2.1 of the License, or (at
|
||||
; your option) any later version.
|
||||
|
||||
; The MPIR Library is distributed in the hope that it will be useful, but
|
||||
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||||
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||||
; License for more details.
|
||||
|
||||
; You should have received a copy of the GNU Lesser General Public License
|
||||
; along with the MPIR Library; see the file COPYING.LIB. If not, write
|
||||
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
|
||||
; Boston, MA 02110-1301, USA.
|
||||
|
||||
; mpn_copyi(mp_ptr Op2, mp_srcptr Op1, mp_size_t Size1)
|
||||
; Linux RDI RSI RDX
|
||||
; Win7 RCX RDX R8
|
||||
;
|
||||
; Description:
|
||||
; The function copies a given number of limb from source to destination (while
|
||||
; moving low to high in memory) and hands back the size (in limb) of the
|
||||
; destination.
|
||||
;
|
||||
; Result:
|
||||
; - Op2[ 0..size-1 ] = Op1[ 0..size-1 ]
|
||||
; - number of copied limb: range [ 0..max tCounter ]
|
||||
;
|
||||
; Caveats:
|
||||
; - if size 0 is given the content of the destination will remain untouched!
|
||||
; - if Op1=Op2 no copy is done!
|
||||
;
|
||||
; Comments:
|
||||
; - AVX-based version implemented, tested & benched on 05.01.2016 by jn
|
||||
; - did some experiments with AVX based version with following results
|
||||
; - AVX can be faster in L1$ (30%), L2$ (10%) if dest. is aligned on 32 byte
|
||||
; - AVX is generally faster on small sized operands (<=100 limb) due to
|
||||
; start-up overhead of "rep movsq" - however this could also be achieved by
|
||||
; simple copy loop
|
||||
; - the break-even between AVX and "rep movsq" is around 10,000 limb
|
||||
; - the prologue & epilogue can still be optimized!
|
||||
|
||||
%include 'yasm_mac.inc'
|
||||
|
||||
BITS 64
|
||||
|
||||
%ifdef USE_WIN64
|
||||
%define Op2 RCX
|
||||
%define Op1 RDX
|
||||
%define Size1 R8
|
||||
%define Limb R9
|
||||
%define Offs R10
|
||||
%else
|
||||
%define Op2 RDI
|
||||
%define Op1 RSI
|
||||
%define Size1 RDX
|
||||
%define Limb RCX
|
||||
%define Offs R10
|
||||
%endif
|
||||
|
||||
%define DLimb0 XMM0
|
||||
%define QLimb0 YMM0
|
||||
%define QLimb1 YMM1
|
||||
%define QLimb2 YMM2
|
||||
%define QLimb3 YMM3
|
||||
|
||||
align 32
|
||||
|
||||
; mpn_copyi: copy Size1 limbs from Op1 to Op2, walking from the lowest limb
; up to the highest. RAX returns the original Size1. Single limbs are copied
; first until the destination is 32-byte aligned, then 16 limbs per iteration
; with AVX, then the remainder by testing the bits of the leftover count.
LEAF_PROC mpn_copyi
|
||||
mov RAX, Size1 ; return value = number of limbs requested
|
||||
cmp Op1, Op2
|
||||
je .Exit ; no copy required =>
|
||||
|
||||
or RAX, RAX
|
||||
je .Exit ; size=0 =>
|
||||
|
||||
; align the destination (Op2) to 32 byte
|
||||
; The aligned stores below write to [Op2], so address bits 3 and 4 of Op2
; must both be clear. Bit 3 is handled first: if it is set, copy one limb.
test Op2, 8
|
||||
je .lCpyIncA32
|
||||
|
||||
mov Limb, [Op1]
|
||||
mov [Op2], Limb
|
||||
dec Size1
|
||||
je .Exit
|
||||
|
||||
add Op1, 8
|
||||
add Op2, 8
|
||||
|
||||
.lCpyIncA32:
|
||||
|
||||
; Bit 4: if it is set, copy two more limbs (leaving early if the count
; reaches zero after either of them).
test Op2, 16
|
||||
je .lCpyIncAVX
|
||||
|
||||
mov Limb, [Op1]
|
||||
mov [Op2], Limb
|
||||
dec Size1
|
||||
je .Exit
|
||||
|
||||
mov Limb, [Op1+8]
|
||||
mov [Op2+8], Limb
|
||||
dec Size1
|
||||
je .Exit
|
||||
|
||||
add Op1, 16
|
||||
add Op2, 16
|
||||
|
||||
.lCpyIncAVX:
|
||||
|
||||
mov Offs, 128 ; bytes consumed per main-loop iteration (16 limbs)
|
||||
jmp .lCpyIncAVXCheck
|
||||
|
||||
; main loop (prefetching disabled; unloaded cache)
|
||||
; - lCpyInc is slightly slower than lCpyDec through all cache levels?!
|
||||
; - 0.30 cycles / limb in L1$
|
||||
; - 0.60 cycles / limb in L2$
|
||||
; - 0.70-0.90 cycles / limb in L3$
|
||||
align 16
|
||||
.lCpyIncAVXLoop:
|
||||
|
||||
; All four 32-byte loads (unaligned source) are issued before the four
; aligned stores.
vmovdqu QLimb0, [Op1]
|
||||
vmovdqu QLimb1, [Op1+32]
|
||||
vmovdqu QLimb2, [Op1+64]
|
||||
vmovdqu QLimb3, [Op1+96]
|
||||
vmovdqa [Op2], QLimb0
|
||||
vmovdqa [Op2+32], QLimb1
|
||||
vmovdqa [Op2+64], QLimb2
|
||||
vmovdqa [Op2+96], QLimb3
|
||||
|
||||
add Op1, Offs
|
||||
add Op2, Offs
|
||||
|
||||
.lCpyIncAVXCheck:
|
||||
|
||||
sub Size1, 16 ; a borrow (CF=1) means fewer than 16 limbs were left
|
||||
jnc .lCpyIncAVXLoop
|
||||
|
||||
add Size1, 16 ; undo the last subtraction: Size1 = remaining limbs (0..15)
|
||||
je .Exit ; AVX copied operand fully =>
|
||||
|
||||
; copy remaining max. 15 limb
|
||||
; Each bit of the remaining count selects one fixed-size step: 8, 4, 2, 1.
test Size1, 8
|
||||
je .lCpyIncFour
|
||||
|
||||
vmovdqu QLimb0, [Op1]
|
||||
vmovdqu QLimb1, [Op1+32]
|
||||
vmovdqa [Op2], QLimb0
|
||||
vmovdqa [Op2+32], QLimb1
|
||||
|
||||
add Op1, 64
|
||||
add Op2, 64
|
||||
|
||||
.lCpyIncFour:
|
||||
|
||||
test Size1, 4
|
||||
je .lCpyIncTwo
|
||||
|
||||
vmovdqu QLimb0, [Op1]
|
||||
vmovdqa [Op2], QLimb0
|
||||
|
||||
add Op1, 32
|
||||
add Op2, 32
|
||||
|
||||
.lCpyIncTwo:
|
||||
|
||||
test Size1, 2
|
||||
je .lCpyIncOne
|
||||
|
||||
; The '%if 1' branch (two scalar moves) is the one that is assembled; the
; SSE2 alternative under '%else' is kept for reference only.
%if 1
|
||||
; Avoid SSE2 instruction due to stall on Haswell
|
||||
mov Limb, [Op1]
|
||||
mov [Op2], Limb
|
||||
mov Limb, [Op1+8]
|
||||
mov [Op2+8], Limb
|
||||
%else
|
||||
movdqu DLimb0, [Op1]
|
||||
movdqa [Op2], DLimb0
|
||||
%endif
|
||||
|
||||
add Op1, 16
|
||||
add Op2, 16
|
||||
|
||||
.lCpyIncOne:
|
||||
|
||||
test Size1, 1
|
||||
je .Exit
|
||||
|
||||
mov Limb, [Op1]
|
||||
mov [Op2], Limb
|
||||
|
||||
.Exit:
|
||||
|
||||
; Executed on every exit path, including those that never touched a YMM
; register; clears the upper halves of the YMM registers before returning.
vzeroupper
|
||||
ret
|
||||
.end:
|
285
mpn/x86_64w/haswell/lshift.asm
Normal file
285
mpn/x86_64w/haswell/lshift.asm
Normal file
@ -0,0 +1,285 @@
|
||||
|
||||
; Copyright 2016 Jens Nurmann and Alexander Kruppa
|
||||
|
||||
; This file is part of the MPIR Library.
|
||||
|
||||
; The MPIR Library is free software; you can redistribute it and/or modify
|
||||
; it under the terms of the GNU Lesser General Public License as published
|
||||
; by the Free Software Foundation; either version 2.1 of the License, or (at
|
||||
; your option) any later version.
|
||||
|
||||
; The MPIR Library is distributed in the hope that it will be useful, but
|
||||
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||||
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||||
; License for more details.
|
||||
|
||||
; You should have received a copy of the GNU Lesser General Public License
|
||||
; along with the MPIR Library; see the file COPYING.LIB. If not, write
|
||||
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
|
||||
; Boston, MA 02110-1301, USA.
|
||||
|
||||
; mp_limb_t mpn_lshift(mp_ptr Op2, mp_srcptr Op1, mp_size_t Size1, unsigned int Shift)
|
||||
; Linux RAX RDI RSI RDX RCX
|
||||
; Win7 RAX RCX RDX R8 R9
|
||||
;
|
||||
; Description:
|
||||
; The function shifts Op1 left by n bit, stores the result in Op2 (non-
|
||||
; destructive shl) and hands back the shifted-out most significant bits of Op1.
|
||||
; The function operates decreasing in memory supporting in-place operation.
|
||||
;
|
||||
; Result:
|
||||
; - Op2[ Size1-1..0 ] := ( Op1[ Size1-1..0 ]:ShlIn ) << Shift
|
||||
; - Op1[ Size1-1 ] >> ( 64-Shift )
|
||||
;
|
||||
; Caveats:
|
||||
; - caller must ensure that Shift is in [ 1..63 ]!
|
||||
; - register mappings for Linux64 and Win64 are both defined below (Win64 is selected by USE_WIN64)
|
||||
; - the AVX version uses mnemonics only available on Haswell, Broadwell and
|
||||
; Skylake cores
|
||||
; - the behaviour of cache prefetching in combination with AVX shifting seems
|
||||
; somewhat erratic
|
||||
; - slight (a few clock cycles) degradation for 1/2 LD1$ sizes
|
||||
; - slight (a few percent) improvement for full LD1$ sizes
|
||||
; - substantial (>10%) improvement for 1/2 LD2$ sizes
|
||||
; - slight (a few percent) improvement for full LD2$ sizes
|
||||
; - slight (a few percent) degradation for 1/2 LD3$ sizes
|
||||
; - substantial (around 10%) degradation for full LD3$ sizes
|
||||
;
|
||||
; Comments:
|
||||
; - implemented, tested and benched on 31.03.2016 by jn
|
||||
; - includes prefetching
|
||||
; ============================================================================
|
||||
|
||||
%include 'yasm_mac.inc'
|
||||
|
||||
BITS 64
|
||||
|
||||
%ifdef USE_WIN64
|
||||
%define Op2 R11
|
||||
%define Op1 RDX
|
||||
%define Size1 R8
|
||||
%define Shift RCX
|
||||
%define Limb1 R9
|
||||
%define Limb2 R10
|
||||
%ifdef USE_PREFETCH
|
||||
%define Offs -512 ; No caller-saves regs left, use immediate
|
||||
%endif
|
||||
%define reg_save_list XMM, 6, 7
|
||||
%else
|
||||
%define Op2 RDI
|
||||
%define Op1 RSI
|
||||
%define Size1 RDX
|
||||
%define Shift RCX
|
||||
%define Limb1 R8
|
||||
%define Limb2 R9
|
||||
%ifdef USE_PREFETCH
|
||||
%define OFFS_REG 1
|
||||
%define Offs R10
|
||||
%endif
|
||||
%endif
|
||||
|
||||
%define ShlDL0 XMM2 ; Attn: this must match ShlQL0 definition
|
||||
%define ShrDL0 XMM3 ; Attn: this must match ShrQL0 definition
|
||||
%define ShlDLCnt XMM6 ; Attn: this must match ShlQlCnt definition
|
||||
%define ShrDLCnt XMM7 ; Attn: this must match ShrQlCnt definition
|
||||
|
||||
%define QLimb0 YMM0
|
||||
%define QLimb1 YMM1
|
||||
%define ShlQL0 YMM2
|
||||
%define ShrQL0 YMM3
|
||||
%define ShlQL1 YMM4
|
||||
%define ShrQL1 YMM5
|
||||
%define ShlQLCnt YMM6
|
||||
%define ShrQLCnt YMM7
|
||||
|
||||
align 32
|
||||
; mpn_lshift: shift the Size1 limbs at Op1 left by Shift bits (count in CL)
; and store them at Op2, working from the most significant limb downwards.
; RAX returns the bits shifted out of the top limb.
; reg_save_list (XMM6, XMM7 in the USE_WIN64 mapping) is handed to the
; FRAME_PROC / END_PROC macros; those are the registers used here for the
; broadcast shift counts. NOTE(review): presumably the macros save and restore
; them - confirm against yasm_mac.inc.
FRAME_PROC mpn_lshift, 0, reg_save_list
|
||||
; Win64 entry shuffle: the destination pointer arrives in RCX and the shift
; count in R9. Move the pointer to R11 (Op2) and the count to RCX so that CL
; can drive shld/shl; R9 is reused as Limb1 afterwards.
%ifdef USE_WIN64
|
||||
mov r11, rcx
|
||||
mov rcx, r9
|
||||
%endif
|
||||
xor EAX, EAX ; RAX = 0: result for Size1=0 and zero source for shld below
|
||||
sub Size1, 1 ; Size1 now counts the limbs below the top one
|
||||
jc .Exit ; Size1=0 =>
|
||||
|
||||
lea Op1, [Op1+8*Size1] ; Op1 = address of the most significant source limb
|
||||
lea Op2, [Op2+8*Size1] ; Op2 = address of the most significant destination limb
|
||||
|
||||
mov Limb1, [Op1]
|
||||
shld RAX, Limb1, CL ; RAX = top Shift bits of the top limb (return value)
|
||||
|
||||
or Size1, Size1
|
||||
je .lShlEquPost ; Size1=1 =>
|
||||
|
||||
; Only when prefetching is enabled and Offs is a register (non-Win64 mapping)
; does the prefetch distance need to be loaded; on Win64 Offs is an immediate.
%ifdef USE_PREFETCH
|
||||
%ifdef OFFS_REG
|
||||
mov Offs, -512
|
||||
%endif
|
||||
%endif
|
||||
|
||||
cmp Size1, 8
|
||||
jc .lShlEquFour ; AVX inefficient =>
|
||||
|
||||
; first align Op2 to 32 bytes
|
||||
; The aligned stores below write to [Op2-24], so Op2 must end up with address
; bits 3 and 4 both set. Throughout the scalar code Limb1/Limb2 alternate as
; "current limb" and "next lower limb"; shld fills the current limb from the
; right with the top bits of the next lower one.
test Op2, 8
|
||||
jne .lShlEquA16
|
||||
|
||||
mov Limb2, [Op1-8]
|
||||
shld Limb1, Limb2, CL
|
||||
mov [Op2], Limb1
|
||||
mov Limb1, Limb2
|
||||
|
||||
sub Op1, 8
|
||||
sub Op2, 8
|
||||
sub Size1, 1
|
||||
|
||||
.lShlEquA16:
|
||||
|
||||
test Op2, 16
|
||||
jne .lShlEquAVX
|
||||
|
||||
mov Limb2, [Op1-8]
|
||||
shld Limb1, Limb2, CL
|
||||
mov [Op2], Limb1
|
||||
mov Limb1, [Op1-16]
|
||||
shld Limb2, Limb1, CL
|
||||
mov [Op2-8], Limb2
|
||||
|
||||
sub Op1, 16
|
||||
sub Op2, 16
|
||||
sub Size1, 2
|
||||
|
||||
.lShlEquAVX:
|
||||
|
||||
; initialize AVX shift counter
|
||||
; ShlQLCnt = Shift in every lane; ShrQLCnt = (-Shift) and 63, which equals
; 64-Shift for Shift in 1..63. The second neg/and pair restores RCX to Shift.
vmovq ShlDLCnt, RCX
|
||||
neg RCX
|
||||
and RCX, 63 ; must do, as AVX shifts set result=0 if Shift>63!
|
||||
vmovq ShrDLCnt, RCX
|
||||
neg RCX
|
||||
and RCX, 63 ; must do, as AVX shifts set result=0 if Shift>63!
|
||||
vpbroadcastq ShlQLCnt, ShlDLCnt
|
||||
vpbroadcastq ShrQLCnt, ShrDLCnt
|
||||
|
||||
; pre-fetch first quad-limb
|
||||
; ShrQL0 holds, per lane, the bits that spill into the next higher limb.
; vpermq with 10010011b rotates the lanes up by one (lane i takes lane i-1,
; lane 0 takes lane 3) so each spill lines up with the limb that receives it.
vmovdqu QLimb0, [Op1-24]
|
||||
vpsrlvq ShrQL0, QLimb0, ShrQLCnt
|
||||
vpermq ShrQL0, ShrQL0, 10010011b
|
||||
|
||||
sub Op1, 32
|
||||
sub Size1, 4
|
||||
jmp .lShlEquAVXCheck
|
||||
|
||||
; main loop (prefetching enabled; unloaded cache)
|
||||
; - 0.60 cycles per limb in LD1$
|
||||
; - 0.60-0.70 cycles per limb in LD2$
|
||||
; - 0.70-0.90 cycles per limb in LD3$
|
||||
align 16
|
||||
.lShlEquAVXLoop:
|
||||
|
||||
%ifdef USE_PREFETCH
|
||||
prefetchnta [Op1+Offs]
|
||||
%endif
|
||||
|
||||
; Two quad-limbs per iteration, software-pipelined: the quad loaded into
; QLimb0 at the end of one iteration is finished in the next. vpblendd with
; mask 00000011b replaces lane 0 of the rotated spill vector with lane 0 of
; the next lower quad's rotated spill, i.e. the bits coming from that quad's
; top limb.
vmovdqu QLimb1, [Op1-24]
|
||||
vpsllvq ShlQL0, QLimb0, ShlQLCnt
|
||||
vmovdqu QLimb0, [Op1-56]
|
||||
vpsrlvq ShrQL1, QLimb1, ShrQLCnt
|
||||
vpermq ShrQL1, ShrQL1, 10010011b
|
||||
vpblendd ShrQL0, ShrQL0, ShrQL1, 00000011b
|
||||
vpor ShlQL0, ShlQL0, ShrQL0
|
||||
vpsllvq ShlQL1, QLimb1, ShlQLCnt
|
||||
vpsrlvq ShrQL0, QLimb0, ShrQLCnt
|
||||
vpermq ShrQL0, ShrQL0, 10010011b
|
||||
vpblendd ShrQL1, ShrQL1, ShrQL0, 00000011b
|
||||
vmovdqa [Op2-24], ShlQL0
|
||||
vpor ShlQL1, ShlQL1, ShrQL1
|
||||
vmovdqa [Op2-56], ShlQL1
|
||||
|
||||
sub Op1, 64
|
||||
sub Op2, 64
|
||||
|
||||
.lShlEquAVXCheck:
|
||||
|
||||
sub Size1, 8 ; a borrow (CF=1) means fewer than 8 limbs remain for the loop
|
||||
jnc .lShlEquAVXLoop
|
||||
|
||||
; Finish the quad still pending in QLimb0: its lowest lane needs the top
; Shift bits of the next lower limb, computed here with scalar shld into
; Limb2 and merged into lane 0 of the spill vector.
mov Limb1, [Op1]
|
||||
xor Limb2, Limb2
|
||||
shld Limb2, Limb1, CL
|
||||
%if 1
|
||||
vmovq ShlDL0, Limb2
|
||||
vpblendd ShrQL0, ShrQL0, ShlQL0, 3
|
||||
%else
|
||||
; I am mixing in a single SSE4.1 instruction into otherwise pure AVX2
|
||||
; this is generating stalls on Haswell & Broadwell architecture (Agner Fog)
|
||||
; but it is only executed once and there is no AVX2 based alternative
|
||||
pinsrq ShrDL0, Limb2, 0 ; SSE4.1
|
||||
%endif
|
||||
vpsllvq ShlQL0, QLimb0, ShlQLCnt
|
||||
vpor ShlQL0, ShlQL0, ShrQL0
|
||||
vmovdqa [Op2-24], ShlQL0
|
||||
|
||||
sub Op2, 32
|
||||
add Size1, 8 ; undo the last subtraction of the loop check
|
||||
|
||||
; shift remaining max. 7 limbs with SHLD mnemonic
|
||||
; Limb1 holds the current (not yet stored) limb; after the sub below Op1
; addresses the next lower one. Bits 4, 2 and 1 of Size1 select the steps.
.lShlEquFour:
|
||||
|
||||
sub Op1, 8
|
||||
test Size1, 4
|
||||
je .lShlEquTwo
|
||||
|
||||
mov Limb2, [Op1]
|
||||
shld Limb1, Limb2, CL
|
||||
mov [Op2], Limb1
|
||||
mov Limb1, [Op1-8]
|
||||
shld Limb2, Limb1, CL
|
||||
mov [Op2-8], Limb2
|
||||
mov Limb2, [Op1-16]
|
||||
shld Limb1, Limb2, CL
|
||||
mov [Op2-16], Limb1
|
||||
mov Limb1, [Op1-24]
|
||||
shld Limb2, Limb1, CL
|
||||
mov [Op2-24], Limb2
|
||||
|
||||
sub Op1, 32
|
||||
sub Op2, 32
|
||||
|
||||
.lShlEquTwo:
|
||||
|
||||
test Size1, 2
|
||||
je .lShlEquOne
|
||||
|
||||
mov Limb2, [Op1]
|
||||
shld Limb1, Limb2, CL
|
||||
mov [Op2], Limb1
|
||||
mov Limb1, [Op1-8]
|
||||
shld Limb2, Limb1, CL
|
||||
mov [Op2-8], Limb2
|
||||
|
||||
sub Op1, 16
|
||||
sub Op2, 16
|
||||
|
||||
.lShlEquOne:
|
||||
|
||||
test Size1, 1
|
||||
je .lShlEquPost
|
||||
|
||||
mov Limb2, [Op1]
|
||||
shld Limb1, Limb2, CL
|
||||
mov [Op2], Limb1
|
||||
mov Limb1, Limb2
|
||||
|
||||
sub Op2, 8
|
||||
|
||||
.lShlEquPost:
|
||||
|
||||
; Lowest limb: nothing below it, so a plain shl shifts zeros in on the right.
shl Limb1, CL
|
||||
mov [Op2], Limb1
|
||||
|
||||
.Exit:
|
||||
|
||||
; Executed on every exit path, including the Size1=0 and scalar-only paths.
vzeroupper
|
||||
END_PROC reg_save_list
|
||||
.end:
|
282
mpn/x86_64w/haswell/rshift.asm
Normal file
282
mpn/x86_64w/haswell/rshift.asm
Normal file
@ -0,0 +1,282 @@
|
||||
|
||||
; Copyright 2016 Jens Nurmann and Alexander Kruppa
|
||||
|
||||
; This file is part of the MPIR Library.
|
||||
|
||||
; The MPIR Library is free software; you can redistribute it and/or modify
|
||||
; it under the terms of the GNU Lesser General Public License as published
|
||||
; by the Free Software Foundation; either version 2.1 of the License, or (at
|
||||
; your option) any later version.
|
||||
|
||||
; The MPIR Library is distributed in the hope that it will be useful, but
|
||||
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||||
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||||
; License for more details.
|
||||
|
||||
; You should have received a copy of the GNU Lesser General Public License
|
||||
; along with the MPIR Library; see the file COPYING.LIB. If not, write
|
||||
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
|
||||
; Boston, MA 02110-1301, USA.
|
||||
|
||||
; mp_limb_t mpn_rshift(mp_ptr Op2, mp_srcptr Op1, mp_size_t Size1, unsigned int Shift)
|
||||
; Linux RAX RDI RSI RDX RCX
|
||||
; Windows x64 RAX RCX RDX R8 R9
|
||||
;
|
||||
; Description:
|
||||
; The function shifts Op1 right by Shift bits, stores the result in Op2 (non-
|
||||
; destructive shr) and hands back the shifted-out least significant bits of
|
||||
; Op1. The function processes the limbs at increasing memory addresses, which supports in-place shifts.
|
||||
;
|
||||
; Result:
|
||||
; - Op2[ Size1-1..0 ] := ( ShrIn:Op1[ Size1-1..0 ] ) >> Shift
|
||||
; - Op1[ 0 ] << ( 64-Shift )
|
||||
;
|
||||
; Caveats:
|
||||
; - caller must ensure that Shift is in [ 1..63 ]!
|
||||
; - register assignments exist for both Linux64 and Windows x64; the Windows x64 variant is selected by defining USE_WIN64
|
||||
; - the AVX version uses mnemonics only available on Haswell, Broadwell and
|
||||
; Skylake cores
|
||||
; - the behaviour of cache prefetching in combination with AVX shifting seems
|
||||
; somewhat erratic
|
||||
; - slight (a few clock cycles) degradation for 1/2 LD1$ sizes
|
||||
; - slight (a few percent) improvement for full LD1$ sizes
|
||||
; - substantial (>10%) improvement for 1/2 LD2$ sizes
|
||||
; - slight (a few percent) improvement for full LD2$ sizes
|
||||
; - slight (a few percent) degradation for 1/2 LD3$ sizes
|
||||
; - substantial (around 10%) degradation for full LD3$ sizes
|
||||
;
|
||||
; Comments:
|
||||
; - implemented, tested and benchmarked on 30.03.2016 by jn
|
||||
; - includes prefetching
|
||||
; ============================================================================
|
||||
|
||||
%include 'yasm_mac.inc'
|
||||
|
||||
BITS 64
|
||||
|
||||
; ----------------------------------------------------------------------------
; Register assignment
;
; Win64 passes (Op2, Op1, Size1, Shift) in RCX, RDX, R8, R9. The prologue of
; mpn_rshift moves Op2 to R11 and Shift to RCX so that the count sits in CL.
; The other branch uses the argument registers RDI, RSI, RDX, RCX directly.
; ----------------------------------------------------------------------------
%ifdef USE_WIN64
    %define Op2     R11
    %define Op1     RDX
    %define Size1   R8
    %define Shift   RCX
    %define Limb1   R9
    %define Limb2   R10
  %ifdef USE_PREFETCH
    ; No scratch register left, so the prefetch distance is an immediate.
    ; It must be positive: this routine walks upwards through memory, and
    ; the register variant below uses +512 as well.
    %define Offs    512
  %endif
    ; XMM6/XMM7 carry the shift counts, so they go into the save list that
    ; FRAME_PROC / END_PROC spill and restore.
    %define reg_save_list XMM, 6, 7
%else
    %define Op2     RDI
    %define Op1     RSI
    %define Size1   RDX
    %define Shift   RCX
    %define Limb1   R8
    %define Limb2   R9
  %ifdef USE_PREFETCH
    %define OFFS_REG 1          ; Offs is a register and needs to be loaded
    %define Offs    R10
  %endif
%endif

%define ShrDL0      XMM2        ; Attn: this must match ShrQL0 definition
%define ShlDL0      XMM3        ; Attn: this must match ShlQL0 definition
%define ShrDLCnt    XMM6        ; Attn: this must match ShrQLCnt definition
%define ShlDLCnt    XMM7        ; Attn: this must match ShlQLCnt definition

%define QLimb0      YMM0
%define QLimb1      YMM1
%define ShrQL0      YMM2
%define ShlQL0      YMM3
%define ShrQL1      YMM4
%define ShlQL1      YMM5
%define ShrQLCnt    YMM6
%define ShlQLCnt    YMM7

    align   32

; ----------------------------------------------------------------------------
; mp_limb_t mpn_rshift(mp_ptr Op2, mp_srcptr Op1, mp_size_t Size1,
;                      unsigned int Shift)
;
; Shifts the Size1-limb number at Op1 right by Shift bits (1..63), writes the
; result to Op2 and returns Op1[0] << (64-Shift) in RAX. Returns 0 and writes
; nothing if Size1 is 0.
;
; Invariant of the main body: Limb1 holds the limb at the current read
; position and Size1 counts the limbs that follow it.
; ----------------------------------------------------------------------------
FRAME_PROC mpn_rshift, 0, reg_save_list
%ifdef USE_WIN64
    mov     r11, rcx            ; Op2 -> R11, frees RCX for the shift count
    mov     rcx, r9             ; Shift -> RCX (CL)
%endif
    xor     EAX, EAX
    or      Size1, Size1
    je      .Exit               ; nothing to do for Size1 = 0

    mov     Limb1, [Op1]
    shrd    RAX, Limb1, CL      ; return value: bits shifted out of Op1[0]

    sub     Size1, 1
    je      .lShrEquPost        ; Size1=1 =>

%ifdef USE_PREFETCH
  %ifdef OFFS_REG
    ; Only load Offs when it is a register. In the Win64 build Offs is an
    ; immediate, and "mov <imm>, 512" would not assemble.
    mov     Offs, 512
  %endif
%endif

    cmp     Size1, 8
    jc      .lShrEquFour        ; AVX inefficient =>

    ; first align Op2 to 32 bytes (so the loop can store with vmovdqa)
    test    Op2, 8
    je      .lShrEquAlign16

    mov     Limb2, [Op1+8]
    shrd    Limb1, Limb2, CL
    mov     [Op2], Limb1
    mov     Limb1, Limb2

    add     Op1, 8
    add     Op2, 8
    sub     Size1, 1

.lShrEquAlign16:

    test    Op2, 16
    je      .lShrEquAVX

    mov     Limb2, [Op1+8]
    shrd    Limb1, Limb2, CL
    mov     [Op2], Limb1
    mov     Limb1, [Op1+16]
    shrd    Limb2, Limb1, CL
    mov     [Op2+8], Limb2

    add     Op1, 16
    add     Op2, 16
    sub     Size1, 2

.lShrEquAVX:

    ; initialize AVX shift counter
    ; Shift is a 32-bit argument, so bits 63..32 of RCX are not guaranteed to
    ; be zero. Clear them before the count is copied into a vector register:
    ; the variable AVX2 shifts use the whole 64-bit count and return 0 for
    ; any value above 63. The scalar shifts only look at CL and are unaffected.
    and     ECX, 63
    vmovq   ShrDLCnt, RCX
    neg     RCX
    and     RCX, 63             ; must do, as AVX shifts set result=0 if Shift>63!
    vmovq   ShlDLCnt, RCX       ; complementary count 64-Shift
    neg     RCX
    and     RCX, 63             ; must do, as AVX shifts set result=0 if Shift>63!
                                ; (RCX is back to Shift for the scalar tail)
    vpbroadcastq ShrQLCnt, ShrDLCnt
    vpbroadcastq ShlQLCnt, ShlDLCnt

    ; pre-fetch first quad-limb
    vmovdqu QLimb0, [Op1]
    vpsllvq ShlQL0, QLimb0, ShlQLCnt

    add     Op1, 32
    sub     Size1, 4
    jmp     .lShrEquAVXCheck

    ; main loop (prefetching enabled, unloaded data cache)
    ; - 0.60      cycles per limb in LD1$
    ; - 0.60-0.70 cycles per limb in LD2$
    ; - 0.70-0.90 cycles per limb in LD3$
    ;
    ; Each iteration handles two quad-limbs. For one quad the right-shifted
    ; limbs are ORed with the left-shifted limbs of the next higher position:
    ; vpblendd (mask 00000011b) replaces the low qword of the left-shifted
    ; quad with the low qword of the following quad, and vpermq (00111001b)
    ; rotates the qwords down by one so that element i holds limb i+1.
    align   16
.lShrEquAVXLoop:

%ifdef USE_PREFETCH
    prefetchnta [Op1+Offs]
%endif

    vmovdqu   QLimb1, [Op1]
    vpsrlvq   ShrQL0, QLimb0, ShrQLCnt
    vmovdqu   QLimb0, [Op1+32]
    vpsllvq   ShlQL1, QLimb1, ShlQLCnt
    vpblendd  ShlQL0, ShlQL0, ShlQL1, 00000011b
    vpermq    ShlQL0, ShlQL0, 00111001b
    vpor      ShrQL0, ShrQL0, ShlQL0
    vpsrlvq   ShrQL1, QLimb1, ShrQLCnt
    vpsllvq   ShlQL0, QLimb0, ShlQLCnt
    vpblendd  ShlQL1, ShlQL1, ShlQL0, 00000011b
    vpermq    ShlQL1, ShlQL1, 00111001b
    vmovdqa   [Op2], ShrQL0
    vpor      ShrQL1, ShrQL1, ShlQL1
    vmovdqa   [Op2+32], ShrQL1

    add     Op1, 64
    add     Op2, 64

.lShrEquAVXCheck:

    sub     Size1, 8
    jnc     .lShrEquAVXLoop

    ; Flush the quad-limb still pending in QLimb0. Its top limb needs the
    ; low bits of the next source limb, which is fetched with scalar code.
    mov     Limb1, [Op1]
    xor     Limb2, Limb2
    shrd    Limb2, Limb1, CL    ; Limb2 = Limb1 << (64-Shift)
%if 1
    vmovq   ShrDL0, Limb2
    vpblendd ShlQL0, ShlQL0, ShrQL0, 3
%else
    ; I am mixing in a single SSE4.1 instruction into otherwise pure AVX2
    ; this is generating stalls on Haswell & Broadwell architecture (Agner Fog)
    ; but it is only executed once and there is no AVX2 based alternative
    pinsrq  ShlDL0, Limb2, 0    ; SSE4.1
%endif
    vpsrlvq ShrQL0, QLimb0, ShrQLCnt
    vpermq  ShlQL0, ShlQL0, 00111001b
    vpor    ShrQL0, ShrQL0, ShlQL0
    vmovdqa [Op2], ShrQL0

    add     Op2, 32
    add     Size1, 8            ; undo the last loop-counter subtraction

    ; shift remaining max. 7 limbs with SHRD mnemonic
.lShrEquFour:

    add     Op1, 8
    test    Size1, 4
    je      .lShrEquTwo

    mov     Limb2, [Op1]
    shrd    Limb1, Limb2, CL
    mov     [Op2], Limb1
    mov     Limb1, [Op1+8]
    shrd    Limb2, Limb1, CL
    mov     [Op2+8], Limb2
    mov     Limb2, [Op1+16]
    shrd    Limb1, Limb2, CL
    mov     [Op2+16], Limb1
    mov     Limb1, [Op1+24]
    shrd    Limb2, Limb1, CL
    mov     [Op2+24], Limb2

    add     Op1, 32
    add     Op2, 32

.lShrEquTwo:

    test    Size1, 2
    je      .lShrEquOne

    mov     Limb2, [Op1]
    shrd    Limb1, Limb2, CL
    mov     [Op2], Limb1
    mov     Limb1, [Op1+8]
    shrd    Limb2, Limb1, CL
    mov     [Op2+8], Limb2

    add     Op1, 16
    add     Op2, 16

.lShrEquOne:

    test    Size1, 1
    je      .lShrEquPost

    mov     Limb2, [Op1]
    shrd    Limb1, Limb2, CL
    mov     [Op2], Limb1
    mov     Limb1, Limb2

    add     Op2, 8

    ; store most significant limb considering shift-in part
.lShrEquPost:

    shr     Limb1, CL           ; zeros are shifted in from the top
    mov     [Op2], Limb1

.Exit:

    vzeroupper
END_PROC reg_save_list
|
||||
.end:
|
@ -119,24 +119,48 @@
|
||||
%endif
|
||||
%rotate 1
|
||||
|
||||
%assign gpr_regs 0
|
||||
%assign stack_slots 0
|
||||
%assign xmm_seen 0
|
||||
%if %0 > 2
|
||||
%rep %0 - 2
|
||||
%ifnum %1
|
||||
%if xmm_seen == 0
|
||||
%error Not an XMM register
|
||||
%else
|
||||
alloc_stack 16
|
||||
save_xmm128 XMM%1, 0
|
||||
%assign stack_slots stack_slots + 2
|
||||
%endif
|
||||
%elifid %1
|
||||
%ifidni XMM, %1
|
||||
%if stack_slots & 1 == 0
|
||||
alloc_stack 8
|
||||
%assign stack_slots stack_slots + 1
|
||||
%assign xmm_seen 1
|
||||
%else
|
||||
%assign xmm_seen 2
|
||||
%endif
|
||||
%elif xmm_seen == 0
|
||||
push_reg %1
|
||||
%assign gpr_regs gpr_regs + 1
|
||||
%assign stack_slots stack_slots + 1
|
||||
%else
|
||||
%error XMM registers must be last in the save list
|
||||
%endif
|
||||
%else
|
||||
%error Bad parameter list
|
||||
%endif
|
||||
%rotate 1
|
||||
%endrep
|
||||
%endif
|
||||
|
||||
%if (gpr_regs & 1) == (var_slots & 1)
|
||||
%if (stack_slots & 1) == (var_slots & 1)
|
||||
%assign var_slots var_slots + 1
|
||||
%endif
|
||||
|
||||
%if var_slots > 0
|
||||
alloc_stack 8 * var_slots
|
||||
%endif
|
||||
%assign stack_use 8 * (gpr_regs + var_slots)
|
||||
|
||||
%assign stack_use 8 * (stack_slots + var_slots)
|
||||
END_PROLOGUE
|
||||
|
||||
%endmacro
|
||||
@ -147,7 +171,16 @@
|
||||
%if %0 > 0
|
||||
%rep %0
|
||||
%rotate -1
|
||||
%ifnum %1
|
||||
movdqa XMM%1, [rsp]
|
||||
add rsp, 16
|
||||
%elifidni %1, XMM
|
||||
%if xmm_seen == 1
|
||||
add rsp, 8
|
||||
%endif
|
||||
%else
|
||||
pop %1
|
||||
%endif
|
||||
%endrep
|
||||
%endif
|
||||
ret
|
||||
@ -156,11 +189,22 @@
|
||||
|
||||
%macro END_PROC 0-*
|
||||
|
||||
%if var_slots
|
||||
add rsp, 8 * var_slots
|
||||
%endif
|
||||
%if %0 > 0
|
||||
%rep %0
|
||||
%rotate -1
|
||||
%ifnum %1
|
||||
movdqa XMM%1, [rsp]
|
||||
add rsp, 16
|
||||
%elifidni %1, XMM
|
||||
%if xmm_seen == 1
|
||||
add rsp, 8
|
||||
%endif
|
||||
%else
|
||||
pop %1
|
||||
%endif
|
||||
%endrep
|
||||
%endif
|
||||
ret
|
||||
|
Loading…
Reference in New Issue
Block a user