add latest skylake AVX code to Windows

This commit is contained in:
Brian Gladman 2017-01-18 13:09:22 +00:00
parent 721da455a0
commit 3ce4ca48e3
4 changed files with 515 additions and 67 deletions

View File

@ -1,6 +1,7 @@
Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 14
VisualStudioVersion = 14.0.25420.1
VisualStudioVersion = 14.0.24720.0
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "lib_mpir_k8", "lib_mpir_k8\lib_mpir_k8.vcxproj", "{22D14BC4-ADC6-48C6-8BA1-4AC45799B945}"
EndProject
@ -56,98 +57,98 @@ Global
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Debug|Win32.ActiveCfg = Debug|x64
{22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Debug|x64.ActiveCfg = Debug|x64
{22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Release|Win32.ActiveCfg = Release|x64
{22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Release|Win32.ActiveCfg = Release|Win32
{22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Release|x64.ActiveCfg = Release|x64
{F4418981-ABAA-4B7A-9538-3E0166149F28}.Debug|Win32.ActiveCfg = Debug|Win32
{F4418981-ABAA-4B7A-9538-3E0166149F28}.Debug|x64.ActiveCfg = Debug|Win32
{22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Debug|Win32.ActiveCfg = Debug|Win32
{22D14BC4-ADC6-48C6-8BA1-4AC45799B945}.Debug|x64.ActiveCfg = Debug|x64
{F4418981-ABAA-4B7A-9538-3E0166149F28}.Release|Win32.ActiveCfg = Release|Win32
{F4418981-ABAA-4B7A-9538-3E0166149F28}.Release|x64.ActiveCfg = Release|Win32
{52C97475-88BE-437F-BC53-D3E1205EA08B}.Debug|Win32.ActiveCfg = Debug|Win32
{52C97475-88BE-437F-BC53-D3E1205EA08B}.Debug|x64.ActiveCfg = Debug|Win32
{F4418981-ABAA-4B7A-9538-3E0166149F28}.Release|x64.ActiveCfg = Release|x64
{F4418981-ABAA-4B7A-9538-3E0166149F28}.Debug|Win32.ActiveCfg = Debug|Win32
{F4418981-ABAA-4B7A-9538-3E0166149F28}.Debug|x64.ActiveCfg = Debug|x64
{52C97475-88BE-437F-BC53-D3E1205EA08B}.Release|Win32.ActiveCfg = Release|Win32
{52C97475-88BE-437F-BC53-D3E1205EA08B}.Release|x64.ActiveCfg = Release|Win32
{F4416466-C0B3-4105-B391-CC77FB7A0231}.Debug|Win32.ActiveCfg = Debug|x64
{F4416466-C0B3-4105-B391-CC77FB7A0231}.Debug|x64.ActiveCfg = Debug|x64
{F4416466-C0B3-4105-B391-CC77FB7A0231}.Release|Win32.ActiveCfg = Release|x64
{52C97475-88BE-437F-BC53-D3E1205EA08B}.Release|x64.ActiveCfg = Release|x64
{52C97475-88BE-437F-BC53-D3E1205EA08B}.Debug|Win32.ActiveCfg = Debug|Win32
{52C97475-88BE-437F-BC53-D3E1205EA08B}.Debug|x64.ActiveCfg = Debug|x64
{F4416466-C0B3-4105-B391-CC77FB7A0231}.Release|Win32.ActiveCfg = Release|Win32
{F4416466-C0B3-4105-B391-CC77FB7A0231}.Release|x64.ActiveCfg = Release|x64
{EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Debug|Win32.ActiveCfg = Debug|Win32
{EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Debug|x64.ActiveCfg = Debug|x64
{F4416466-C0B3-4105-B391-CC77FB7A0231}.Debug|Win32.ActiveCfg = Debug|Win32
{F4416466-C0B3-4105-B391-CC77FB7A0231}.Debug|x64.ActiveCfg = Debug|x64
{EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Release|Win32.ActiveCfg = Release|Win32
{EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Release|x64.ActiveCfg = Release|x64
{D058893B-87A8-4161-8821-FA5707504B2C}.Debug|Win32.ActiveCfg = Debug|x64
{D058893B-87A8-4161-8821-FA5707504B2C}.Debug|x64.ActiveCfg = Debug|x64
{D058893B-87A8-4161-8821-FA5707504B2C}.Release|Win32.ActiveCfg = Release|x64
{EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Debug|Win32.ActiveCfg = Debug|Win32
{EF286BFC-F78C-45A4-A818-C6C4CCB96D57}.Debug|x64.ActiveCfg = Debug|x64
{D058893B-87A8-4161-8821-FA5707504B2C}.Release|Win32.ActiveCfg = Release|Win32
{D058893B-87A8-4161-8821-FA5707504B2C}.Release|x64.ActiveCfg = Release|x64
{4A742B65-9836-4F46-8310-728F046A31C1}.Debug|Win32.ActiveCfg = Debug|Win32
{4A742B65-9836-4F46-8310-728F046A31C1}.Debug|x64.ActiveCfg = Debug|x64
{D058893B-87A8-4161-8821-FA5707504B2C}.Debug|Win32.ActiveCfg = Debug|Win32
{D058893B-87A8-4161-8821-FA5707504B2C}.Debug|x64.ActiveCfg = Debug|x64
{4A742B65-9836-4F46-8310-728F046A31C1}.Release|Win32.ActiveCfg = Release|Win32
{4A742B65-9836-4F46-8310-728F046A31C1}.Release|x64.ActiveCfg = Release|x64
{935C1E48-A26F-4E73-B8EA-B163F21E833B}.Debug|Win32.ActiveCfg = Debug|x64
{935C1E48-A26F-4E73-B8EA-B163F21E833B}.Debug|x64.ActiveCfg = Debug|x64
{935C1E48-A26F-4E73-B8EA-B163F21E833B}.Release|Win32.ActiveCfg = Release|x64
{4A742B65-9836-4F46-8310-728F046A31C1}.Debug|Win32.ActiveCfg = Debug|Win32
{4A742B65-9836-4F46-8310-728F046A31C1}.Debug|x64.ActiveCfg = Debug|x64
{935C1E48-A26F-4E73-B8EA-B163F21E833B}.Release|Win32.ActiveCfg = Release|Win32
{935C1E48-A26F-4E73-B8EA-B163F21E833B}.Release|x64.ActiveCfg = Release|x64
{49DD6B75-0009-40DC-BE9E-383C565494C8}.Debug|Win32.ActiveCfg = Debug|x64
{49DD6B75-0009-40DC-BE9E-383C565494C8}.Debug|x64.ActiveCfg = Debug|x64
{49DD6B75-0009-40DC-BE9E-383C565494C8}.Release|Win32.ActiveCfg = Release|x64
{935C1E48-A26F-4E73-B8EA-B163F21E833B}.Debug|Win32.ActiveCfg = Debug|Win32
{935C1E48-A26F-4E73-B8EA-B163F21E833B}.Debug|x64.ActiveCfg = Debug|x64
{49DD6B75-0009-40DC-BE9E-383C565494C8}.Release|Win32.ActiveCfg = Release|Win32
{49DD6B75-0009-40DC-BE9E-383C565494C8}.Release|x64.ActiveCfg = Release|x64
{D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Debug|Win32.ActiveCfg = Debug|Win32
{D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Debug|x64.ActiveCfg = Debug|x64
{49DD6B75-0009-40DC-BE9E-383C565494C8}.Debug|Win32.ActiveCfg = Debug|Win32
{49DD6B75-0009-40DC-BE9E-383C565494C8}.Debug|x64.ActiveCfg = Debug|x64
{D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Release|Win32.ActiveCfg = Release|Win32
{D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Release|x64.ActiveCfg = Release|x64
{EC599438-E241-49B5-ACA5-CB897F8041B5}.Debug|Win32.ActiveCfg = Debug|x64
{EC599438-E241-49B5-ACA5-CB897F8041B5}.Debug|x64.ActiveCfg = Debug|x64
{EC599438-E241-49B5-ACA5-CB897F8041B5}.Release|Win32.ActiveCfg = Release|x64
{D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Debug|Win32.ActiveCfg = Debug|Win32
{D35A0A8E-F81A-4F97-828E-8CAB4B9DBFFC}.Debug|x64.ActiveCfg = Debug|x64
{EC599438-E241-49B5-ACA5-CB897F8041B5}.Release|Win32.ActiveCfg = Release|Win32
{EC599438-E241-49B5-ACA5-CB897F8041B5}.Release|x64.ActiveCfg = Release|x64
{87821166-8F44-41D1-AE5B-151CD73C96D3}.Debug|Win32.ActiveCfg = Debug|x64
{87821166-8F44-41D1-AE5B-151CD73C96D3}.Debug|x64.ActiveCfg = Debug|x64
{87821166-8F44-41D1-AE5B-151CD73C96D3}.Release|Win32.ActiveCfg = Release|x64
{EC599438-E241-49B5-ACA5-CB897F8041B5}.Debug|Win32.ActiveCfg = Debug|Win32
{EC599438-E241-49B5-ACA5-CB897F8041B5}.Debug|x64.ActiveCfg = Debug|x64
{87821166-8F44-41D1-AE5B-151CD73C96D3}.Release|Win32.ActiveCfg = Release|Win32
{87821166-8F44-41D1-AE5B-151CD73C96D3}.Release|x64.ActiveCfg = Release|x64
{3D01F893-119C-4089-A1EA-5645E8C4F366}.Debug|Win32.ActiveCfg = Debug|x64
{3D01F893-119C-4089-A1EA-5645E8C4F366}.Debug|x64.ActiveCfg = Debug|x64
{3D01F893-119C-4089-A1EA-5645E8C4F366}.Release|Win32.ActiveCfg = Release|x64
{87821166-8F44-41D1-AE5B-151CD73C96D3}.Debug|Win32.ActiveCfg = Debug|Win32
{87821166-8F44-41D1-AE5B-151CD73C96D3}.Debug|x64.ActiveCfg = Debug|x64
{3D01F893-119C-4089-A1EA-5645E8C4F366}.Release|Win32.ActiveCfg = Release|Win32
{3D01F893-119C-4089-A1EA-5645E8C4F366}.Release|x64.ActiveCfg = Release|x64
{5E4AA504-9B77-4907-A93F-982FCED077EC}.Debug|Win32.ActiveCfg = Debug|x64
{5E4AA504-9B77-4907-A93F-982FCED077EC}.Debug|x64.ActiveCfg = Debug|x64
{5E4AA504-9B77-4907-A93F-982FCED077EC}.Release|Win32.ActiveCfg = Release|x64
{3D01F893-119C-4089-A1EA-5645E8C4F366}.Debug|Win32.ActiveCfg = Debug|Win32
{3D01F893-119C-4089-A1EA-5645E8C4F366}.Debug|x64.ActiveCfg = Debug|x64
{5E4AA504-9B77-4907-A93F-982FCED077EC}.Release|Win32.ActiveCfg = Release|Win32
{5E4AA504-9B77-4907-A93F-982FCED077EC}.Release|x64.ActiveCfg = Release|x64
{51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Debug|Win32.ActiveCfg = Debug|x64
{51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Debug|x64.ActiveCfg = Debug|x64
{51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Release|Win32.ActiveCfg = Release|x64
{5E4AA504-9B77-4907-A93F-982FCED077EC}.Debug|Win32.ActiveCfg = Debug|Win32
{5E4AA504-9B77-4907-A93F-982FCED077EC}.Debug|x64.ActiveCfg = Debug|x64
{51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Release|Win32.ActiveCfg = Release|Win32
{51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Release|x64.ActiveCfg = Release|x64
{6CA24950-D41E-4264-94FE-92AFB04CFD2F}.Debug|Win32.ActiveCfg = Debug|x64
{6CA24950-D41E-4264-94FE-92AFB04CFD2F}.Debug|x64.ActiveCfg = Debug|x64
{6CA24950-D41E-4264-94FE-92AFB04CFD2F}.Release|Win32.ActiveCfg = Release|x64
{51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Debug|Win32.ActiveCfg = Debug|Win32
{51944AE9-3706-4F91-82FA-3AD733EE0FAA}.Debug|x64.ActiveCfg = Debug|x64
{6CA24950-D41E-4264-94FE-92AFB04CFD2F}.Release|Win32.ActiveCfg = Release|Win32
{6CA24950-D41E-4264-94FE-92AFB04CFD2F}.Release|x64.ActiveCfg = Release|x64
{43CB4B7D-F204-4C79-8585-556B95C8C120}.Debug|Win32.ActiveCfg = Debug|x64
{43CB4B7D-F204-4C79-8585-556B95C8C120}.Debug|x64.ActiveCfg = Debug|x64
{43CB4B7D-F204-4C79-8585-556B95C8C120}.Release|Win32.ActiveCfg = Release|x64
{6CA24950-D41E-4264-94FE-92AFB04CFD2F}.Debug|Win32.ActiveCfg = Debug|Win32
{6CA24950-D41E-4264-94FE-92AFB04CFD2F}.Debug|x64.ActiveCfg = Debug|x64
{43CB4B7D-F204-4C79-8585-556B95C8C120}.Release|Win32.ActiveCfg = Release|Win32
{43CB4B7D-F204-4C79-8585-556B95C8C120}.Release|x64.ActiveCfg = Release|x64
{1BE624B2-5FEA-4DA6-A7FA-9A1340E3361F}.Debug|Win32.ActiveCfg = Debug|x64
{1BE624B2-5FEA-4DA6-A7FA-9A1340E3361F}.Debug|x64.ActiveCfg = Debug|x64
{1BE624B2-5FEA-4DA6-A7FA-9A1340E3361F}.Release|Win32.ActiveCfg = Release|x64
{43CB4B7D-F204-4C79-8585-556B95C8C120}.Debug|Win32.ActiveCfg = Debug|Win32
{43CB4B7D-F204-4C79-8585-556B95C8C120}.Debug|x64.ActiveCfg = Debug|x64
{1BE624B2-5FEA-4DA6-A7FA-9A1340E3361F}.Release|Win32.ActiveCfg = Release|Win32
{1BE624B2-5FEA-4DA6-A7FA-9A1340E3361F}.Release|x64.ActiveCfg = Release|x64
{5811A327-3992-4365-95CC-47CB0F9532A5}.Debug|Win32.ActiveCfg = Debug|x64
{5811A327-3992-4365-95CC-47CB0F9532A5}.Debug|x64.ActiveCfg = Debug|x64
{5811A327-3992-4365-95CC-47CB0F9532A5}.Release|Win32.ActiveCfg = Release|x64
{1BE624B2-5FEA-4DA6-A7FA-9A1340E3361F}.Debug|Win32.ActiveCfg = Debug|Win32
{1BE624B2-5FEA-4DA6-A7FA-9A1340E3361F}.Debug|x64.ActiveCfg = Debug|x64
{5811A327-3992-4365-95CC-47CB0F9532A5}.Release|Win32.ActiveCfg = Release|Win32
{5811A327-3992-4365-95CC-47CB0F9532A5}.Release|x64.ActiveCfg = Release|x64
{58ED6FD2-E37D-4364-9428-6314426FFCC2}.Debug|Win32.ActiveCfg = Debug|x64
{58ED6FD2-E37D-4364-9428-6314426FFCC2}.Debug|x64.ActiveCfg = Debug|x64
{58ED6FD2-E37D-4364-9428-6314426FFCC2}.Release|Win32.ActiveCfg = Release|x64
{5811A327-3992-4365-95CC-47CB0F9532A5}.Debug|Win32.ActiveCfg = Debug|Win32
{5811A327-3992-4365-95CC-47CB0F9532A5}.Debug|x64.ActiveCfg = Debug|x64
{58ED6FD2-E37D-4364-9428-6314426FFCC2}.Release|Win32.ActiveCfg = Release|Win32
{58ED6FD2-E37D-4364-9428-6314426FFCC2}.Release|x64.ActiveCfg = Release|x64
{1B807FDE-B444-4AD4-8A2A-814D52124585}.Debug|Win32.ActiveCfg = Debug|x64
{1B807FDE-B444-4AD4-8A2A-814D52124585}.Debug|x64.ActiveCfg = Debug|x64
{1B807FDE-B444-4AD4-8A2A-814D52124585}.Release|Win32.ActiveCfg = Release|x64
{58ED6FD2-E37D-4364-9428-6314426FFCC2}.Debug|Win32.ActiveCfg = Debug|Win32
{58ED6FD2-E37D-4364-9428-6314426FFCC2}.Debug|x64.ActiveCfg = Debug|x64
{1B807FDE-B444-4AD4-8A2A-814D52124585}.Release|Win32.ActiveCfg = Release|Win32
{1B807FDE-B444-4AD4-8A2A-814D52124585}.Release|x64.ActiveCfg = Release|x64
{DF571850-CFB2-4F13-9F15-DDA6B9D0210E}.Debug|Win32.ActiveCfg = Debug|x64
{DF571850-CFB2-4F13-9F15-DDA6B9D0210E}.Debug|x64.ActiveCfg = Debug|x64
{DF571850-CFB2-4F13-9F15-DDA6B9D0210E}.Release|Win32.ActiveCfg = Release|x64
{1B807FDE-B444-4AD4-8A2A-814D52124585}.Debug|Win32.ActiveCfg = Debug|Win32
{1B807FDE-B444-4AD4-8A2A-814D52124585}.Debug|x64.ActiveCfg = Debug|x64
{DF571850-CFB2-4F13-9F15-DDA6B9D0210E}.Release|Win32.ActiveCfg = Release|Win32
{DF571850-CFB2-4F13-9F15-DDA6B9D0210E}.Release|x64.ActiveCfg = Release|x64
{09A387AF-18EA-40EB-AD27-9DF346740987}.Debug|Win32.ActiveCfg = Debug|x64
{09A387AF-18EA-40EB-AD27-9DF346740987}.Debug|x64.ActiveCfg = Debug|x64
{09A387AF-18EA-40EB-AD27-9DF346740987}.Release|Win32.ActiveCfg = Release|x64
{DF571850-CFB2-4F13-9F15-DDA6B9D0210E}.Debug|Win32.ActiveCfg = Debug|Win32
{DF571850-CFB2-4F13-9F15-DDA6B9D0210E}.Debug|x64.ActiveCfg = Debug|x64
{09A387AF-18EA-40EB-AD27-9DF346740987}.Release|Win32.ActiveCfg = Release|Win32
{09A387AF-18EA-40EB-AD27-9DF346740987}.Release|x64.ActiveCfg = Release|x64
{09A387AF-18EA-40EB-AD27-9DF346740987}.Debug|Win32.ActiveCfg = Debug|Win32
{09A387AF-18EA-40EB-AD27-9DF346740987}.Debug|x64.ActiveCfg = Debug|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE

View File

@ -0,0 +1,126 @@
; AMD64 mpn_add_err1_n
; Copyright 2017 Alexander Kruppa
; This file is part of the MPIR Library.
; The MPIR Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published
; by the Free Software Foundation; either version 2.1 of the License, or (at
; your option) any later version.
; The MPIR Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
; License for more details.
; You should have received a copy of the GNU Lesser General Public License
; along with the MPIR Library; see the file COPYING.LIB. If not, write
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
; Boston, MA 02110-1301, USA.
; Register names in the description below follow the Linux argument order.
; (rdi,r9) = (rsi,r9)+(rdx,r9)+CyIn
; rax = carry
; (rcx,2) = rev(r8,r9) \dot (carry,r9) where carry is the sequence
; of carries from the addition of (rsi,r9)+(rdx,r9)
; mp_limb_t mpn_add_err1_n(mp_ptr, mp_ptr, mp_ptr, mp_ptr, mp_ptr, mp_size_t, mp_limb_t)
; mp_limb_t mpn_sub_err1_n(mp_ptr, mp_ptr, mp_ptr, mp_ptr, mp_ptr, mp_size_t, mp_limb_t)
; Linux:   rax rdi rsi rdx rcx r8       r9       8(rsp)
; Windows: rax rcx rdx r8  r9  [rsp+40] [rsp+48] [rsp+56]
%include 'yasm_mac.inc'
; Register roles for DO_LIMB and mpn_add_err1_n.
;
;   SumP      destination limbs (written at [SumP + 8*i])
;   Inp1P     first source operand
;   Inp2P     second source operand
;   EP        receives the two-limb error sum (E0 at [EP], E1 at [EP+8])
;   YP        points at the LAST limb of the Y vector; Y is read downwards
;   Size      limb count, later the number of 8-limb groups
;   SizeRest  limb count modulo 8 (limbs left for the tail code)
;   LIMB0     scratch limb; also holds the carry-in during the prologue and
;             the return value at exit
;   E0, E1    low and high limb of the running error sum
;   Zero      constant 0
;   Dummy     incremented only for the side effect on OF (see DO_LIMB)
;
; The carry-in is read straight from the Windows stack slot in the prologue,
; so no symbolic name for its location is defined here.
%define SumP      rdi
%define Inp1P     rsi
%define Inp2P     rdx
%define EP        r11
%define SizeRest  rcx
%define YP        r8
%define Size      r9
%define LIMB0     rax
%define E0        r12
%define E1        r13
%define Zero      r14
%define Dummy     rbx

; Registers written by this file that the Windows x64 ABI treats as
; callee-saved; handed to FRAME_PROC / END_PROC.
%define reg_save_list rsi, rdi, rbx, r12, r13, r14

align 32
BITS 64
; DO_LIMB i
;
; Handles limb i, counted from the current positions of the pointers:
;   [SumP + 8*i] = [Inp1P + 8*i] + [Inp2P + 8*i] + CF   (new carry left in CF)
;   E1:E0       += [YP - 8*i] if that addition produced a carry, else += 0
;
; The error sum is accumulated with adox, which carries through OF, so the
; CF produced by adc survives until the adc of the next limb. None of the
; instructions between the two adc's modifies CF.
; Clobbers LIMB0 and OF; increments Dummy.
%macro DO_LIMB 1
mov LIMB0, [Inp1P + %1*8]
adc LIMB0, [Inp2P + %1*8]
mov [SumP + %1*8], LIMB0
; Y is walked backwards: limb i of the sum pairs with [YP - 8*i].
mov LIMB0, [YP - %1*8]
; No carry out of this limb => contribute 0 instead of the Y limb.
cmovnc LIMB0, Zero
; inc keeps CF and leaves OF = 0, except when Dummy steps from
; 0x7FFFFFFFFFFFFFFF to 0x8000000000000000 (signed overflow sets OF).
inc Dummy ; OF = 0
adox E0, LIMB0
; Propagate the carry out of E0 (held in OF) into the high limb.
adox E1, Zero
%endmacro
; mpn_add_err1_n -- Windows x64 entry point.
;
; Adds the Size limbs at Inp1P and Inp2P plus a carry-in (bit 0 of the seventh
; argument), stores the sum at SumP and returns the carry-out (0 or 1) in rax.
; For every limb whose addition produced a carry, the matching limb of Y
; (taken in reverse order, starting from its last limb) is added to a two-limb
; accumulator that is finally stored at [EP] (low) and [EP+8] (high).
;
; FRAME_PROC, END_PROC and stack_use come from yasm_mac.inc, which is not part
; of this file; presumably they save/restore reg_save_list, define stack_use
; as the number of bytes pushed, and emit the return -- confirm there.
        FRAME_PROC mpn_add_err1_n, 0, reg_save_list

        ; Move the Windows arguments into the registers used by the body.
        mov     rdi, rcx                        ; SumP
        mov     rsi, rdx                        ; Inp1P
        mov     rdx, r8                         ; Inp2P
        mov     r11, r9                         ; EP
        mov     r8, [rsp + stack_use + 40]      ; YP
        mov     r9, [rsp + stack_use + 48]      ; Size
        mov     LIMB0, [rsp + stack_use + 56]   ; carry-in

        mov     SizeRest, Size
        lea     YP, [YP + Size*8 - 8]           ; YP -> last limb of Y
        and     SizeRest, 7                     ; limbs left after the 8-limb groups
        xor     Zero, Zero
        ; DO_LIMB relies on "inc Dummy" to clear OF. Start Dummy at zero so the
        ; increment can never step over 0x7FFFFFFFFFFFFFFF and set OF instead.
        ; Must happen before the flag setup below because xor changes CF and ZF.
        xor     Dummy, Dummy
        mov     E0, Zero
        mov     E1, Zero

        ; Flag setup: shr leaves ZF = (no 8-limb group), bt then loads the
        ; carry-in into CF. The jz below tests the ZF produced by shr, which
        ; relies on bt not modifying ZF.
        shr     Size, 3
        bt      LIMB0, 0
        jz      .testrest

        ; Main loop: eight limbs per pass. lea and dec leave CF alone, so the
        ; carry chain runs unbroken from one pass to the next.
        align   16
.loop:
        DO_LIMB 0
        DO_LIMB 1
        DO_LIMB 2
        DO_LIMB 3
        DO_LIMB 4
        DO_LIMB 5
        DO_LIMB 6
        DO_LIMB 7
        lea     Inp1P, [Inp1P+64]
        lea     Inp2P, [Inp2P+64]
        lea     SumP, [SumP+64]
        lea     YP, [YP-64]
        dec     Size
        jne     .loop

.testrest:
        ; inc/dec pair: sets ZF from SizeRest without touching CF.
        inc     SizeRest
        dec     SizeRest
        jz      .exit

        ; Tail: up to seven limbs, handled four at a time.
.rest:
        DO_LIMB 0
        dec     SizeRest
        jz      .exit
        DO_LIMB 1
        dec     SizeRest
        jz      .exit
        DO_LIMB 2
        dec     SizeRest
        jz      .exit
        DO_LIMB 3
        dec     SizeRest
        jz      .exit
        lea     Inp1P, [Inp1P+32]
        lea     Inp2P, [Inp2P+32]
        lea     SumP, [SumP+32]
        lea     YP, [YP-32]
        jmp     .rest

.exit:
        mov     rax, Zero                       ; mov keeps CF intact
        setc    al                              ; rax = carry-out
        mov     [EP], E0
        mov     [EP+8], E1
        END_PROC reg_save_list

View File

@ -0,0 +1,196 @@
; AMD64 mpn_mul_1
; Copyright 2016 Jens Nurmann and Alexander Kruppa
; This file is part of the MPIR Library.
; The MPIR Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published
; by the Free Software Foundation; either version 2.1 of the License, or (at
; your option) any later version.
; The MPIR Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
; License for more details.
; You should have received a copy of the GNU Lesser General Public License
; along with the MPIR Library; see the file COPYING.LIB. If not, write
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
; Boston, MA 02110-1301, USA.
; Register names in the description below follow the Linux argument order.
; (rdi,rdx) = rcx*(rsi,rdx)
; rax = high word of product
; mp_limb_t mpn_mul_1(mp_ptr, mp_ptr, mp_size_t, mp_limb_t)
; mp_limb_t mpn_mul_1c(mp_ptr, mp_ptr, mp_size_t, mp_limb_t, mp_limb_t)
; Linux:   rax rdi rsi rdx rcx r8
; Windows: rax rcx rdx r8  r9  [rsp+40]
%include 'yasm_mac.inc'
BITS 64

; Internal register allocation. The prologue of mpn_mul_1 copies the Windows
; argument registers into this layout before any of these names is used.
;
;   RP        destination limbs
;   S1P       source limbs
;   Size      limb counter
;   S2        multiplier limb; rdx is the implicit source operand of mulx
;   MulLoN    low  half of product N of the current group of four
;   MulHiN    high half of product N of the current group of four
%define RP      rdi
%define S1P     rsi
%define Size    rcx
%define S2      rdx

%define MulLo0  r8
%define MulHi0  r9
%define MulLo1  r10
%define MulHi1  r11
%define MulLo2  r12
%define MulHi2  r13
%define MulLo3  r14
%define MulHi3  rbx

; Registers written by this file that the Windows x64 ABI treats as
; callee-saved; handed to FRAME_PROC / END_PROC.
%define reg_save_list rsi, rdi, rbx, r12, r13, r14

align 32
; mpn_mul_1 -- Windows x64 entry point.
;
; Multiplies the Size limbs at S1P by the single limb S2, stores the low Size
; limbs of the product at RP and returns the most significant limb in rax.
;
; Limbs are processed in groups of four. A finished group is kept in
; MulLo0..MulLo3 with its outgoing high limb in MulHi3 and is only written to
; memory by the code that handles the following group (or by .Store).
;
; FRAME_PROC and END_PROC come from yasm_mac.inc, which is not part of this
; file; presumably they save/restore reg_save_list and emit the return --
; confirm there.
        FRAME_PROC mpn_mul_1, 0, reg_save_list

        ; Move the Windows arguments into the registers used by the body.
        mov     rdi, rcx                ; RP
        mov     rsi, rdx                ; S1P
        mov     rcx, r8                 ; Size
        mov     rdx, r9                 ; S2

        xor     MulHi3, MulHi3          ; no high limb pending yet
        sub     Size, 4
        jc      .Post                   ; fewer than 4 limbs: tail code only =>

        ; prepare a quadlimb for main-loop entry
        mulx    MulHi0, MulLo0, [S1P]
        mulx    MulHi1, MulLo1, [S1P+8]
        mulx    MulHi2, MulLo2, [S1P+16]
        mulx    MulHi3, MulLo3, [S1P+24]
        add     S1P, 32
        add     MulLo1, MulHi0
        adc     MulLo2, MulHi1
        adc     MulLo3, MulHi2
        adc     MulHi3, 0
        jmp     .Check                  ; enter main loop =>

        ; main loop (unloaded operands)
        ; - 1.25 cycles per limb in L1D$
        ; - 1.25 cycles per limb in L2D$
        ; - 1.60-1.72 cycles per limb in L3D$
        ;
        ; Each pass stores the pending quad, multiplies eight further limbs,
        ; stores the first four of them and leaves the last four pending.
        ; mulx and mov do not touch the flags, so the add/adc sequence below is
        ; one unbroken carry chain across all eight limbs.
        align   32
.Loop:
        mov     [RP], MulLo0
        mov     [RP+8], MulLo1
        mov     [RP+16], MulLo2
        mov     [RP+24], MulLo3
        mulx    MulHi0, MulLo0, [S1P]
        mulx    MulHi1, MulLo1, [S1P+8]
        mulx    MulHi2, MulLo2, [S1P+16]
        add     MulLo0, MulHi3          ; add high limb of the previous quad
        mov     [RP+32], MulLo0
        adc     MulLo1, MulHi0
        mov     [RP+40], MulLo1
        adc     MulLo2, MulHi1
        mov     [RP+48], MulLo2
        mulx    MulHi3, MulLo3, [S1P+24]
        mulx    MulHi0, MulLo0, [S1P+32]
        mulx    MulHi1, MulLo1, [S1P+40]
        adc     MulLo3, MulHi2          ; carry continues into the next quad
        adc     MulLo0, MulHi3
        adc     MulLo1, MulHi0
        mulx    MulHi2, MulLo2, [S1P+48]
        adc     MulLo2, MulHi1
        mov     [RP+56], MulLo3
        mulx    MulHi3, MulLo3, [S1P+56]
        adc     MulLo3, MulHi2
        adc     MulHi3, 0
        add     S1P, 64
        add     RP, 64
.Check:
        sub     Size, 8
        jnc     .Loop

        ; core loop roll-out 8 can generate dangling quad-limb
        ; Size is negative here; bit 2 is set exactly when 4..7 limbs remain.
        test    Size, 4
        je      .Store                  ; no dangling quad-limb =>

        ; Store the pending quad while multiplying the dangling one.
        mov     [RP], MulLo0
        mulx    MulHi0, MulLo0, [S1P]
        mov     [RP+8], MulLo1
        mulx    MulHi1, MulLo1, [S1P+8]
        mov     [RP+16], MulLo2
        mulx    MulHi2, MulLo2, [S1P+16]
        add     MulLo0, MulHi3
        mov     [RP+24], MulLo3
        mulx    MulHi3, MulLo3, [S1P+24]
        adc     MulLo1, MulHi0
        adc     MulLo2, MulHi1
        adc     MulLo3, MulHi2
        adc     MulHi3, 0
        add     S1P, 32
        add     RP, 32

        ; store remaining quad-limb from main loop
.Store:
        mov     [RP], MulLo0
        mov     [RP+8], MulLo1
        mov     [RP+16], MulLo2
        mov     [RP+24], MulLo3
        add     RP, 32

        ; handle final 0-3 single limb of S1P
        ; MulHi3 holds the high limb carried in from the limbs already done
        ; (zero when the function was entered with fewer than 4 limbs).
.Post:
        and     Size, 3
        je      .Post0
        cmp     Size, 2
        ja      .Post3
        je      .Post2

.Post1:
        mulx    MulHi0, MulLo0, [S1P]
        add     MulLo0, MulHi3
        adc     MulHi0, 0
        mov     [RP], MulLo0
        mov     rax, MulHi0
        jmp     .Exit

.Post2:
        mulx    MulHi0, MulLo0, [S1P]
        mulx    MulHi1, MulLo1, [S1P+8]
        add     MulLo0, MulHi3
        adc     MulLo1, MulHi0
        adc     MulHi1, 0
        mov     [RP], MulLo0
        mov     [RP+8], MulLo1
        mov     rax, MulHi1
        jmp     .Exit

.Post3:
        mulx    MulHi0, MulLo0, [S1P]
        mulx    MulHi1, MulLo1, [S1P+8]
        mulx    MulHi2, MulLo2, [S1P+16]
        add     MulLo0, MulHi3
        adc     MulLo1, MulHi0
        adc     MulLo2, MulHi1
        adc     MulHi2, 0
        mov     [RP], MulLo0
        mov     [RP+8], MulLo1
        mov     [RP+16], MulLo2
        mov     rax, MulHi2
        jmp     .Exit

.Post0:
        mov     rax, MulHi3

.Exit:
        END_PROC reg_save_list

View File

@ -0,0 +1,125 @@
; AMD64 mpn_sub_err1_n
; Copyright 2017 Alexander Kruppa
; This file is part of the MPIR Library.
; The MPIR Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published
; by the Free Software Foundation; either version 2.1 of the License, or (at
; your option) any later version.
; The MPIR Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
; License for more details.
; You should have received a copy of the GNU Lesser General Public License
; along with the MPIR Library; see the file COPYING.LIB. If not, write
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
; Boston, MA 02110-1301, USA.
; Register names in the description below follow the Linux argument order.
; (rdi,r9) = (rsi,r9)-(rdx,r9)-BwIn
; rax = borrow
; (rcx,2) = rev(r8,r9) \dot (borrow,r9) where borrow is the sequence
; of borrows from the subtraction of (rsi,r9)-(rdx,r9)
; mp_limb_t mpn_add_err1_n(mp_ptr, mp_ptr, mp_ptr, mp_ptr, mp_ptr, mp_size_t, mp_limb_t)
; mp_limb_t mpn_sub_err1_n(mp_ptr, mp_ptr, mp_ptr, mp_ptr, mp_ptr, mp_size_t, mp_limb_t)
; Linux:   rax rdi rsi rdx rcx r8       r9       8(rsp)
; Windows: rax rcx rdx r8  r9  [rsp+40] [rsp+48] [rsp+56]
%include 'yasm_mac.inc'
; Register roles for DO_LIMB and mpn_sub_err1_n.
;
;   SumP      destination limbs (written at [SumP + 8*i])
;   Inp1P     minuend limbs
;   Inp2P     subtrahend limbs
;   EP        receives the two-limb error sum (E0 at [EP], E1 at [EP+8])
;   YP        points at the LAST limb of the Y vector; Y is read downwards
;   Size      limb count, later the number of 8-limb groups
;   SizeRest  limb count modulo 8 (limbs left for the tail code)
;   LIMB0     scratch limb; also holds the borrow-in during the prologue and
;             the return value at exit
;   E0, E1    low and high limb of the running error sum
;   Zero      constant 0
;   Dummy     incremented only for the side effect on OF (see DO_LIMB)
%define SumP      rdi
%define Inp1P     rsi
%define Inp2P     rdx
%define EP        r11
%define YP        r8
%define Size      r9
%define SizeRest  rcx

%define LIMB0     rax
%define E0        r12
%define E1        r13
%define Zero      r14
%define Dummy     rbx

; Registers written by this file that the Windows x64 ABI treats as
; callee-saved; handed to FRAME_PROC / END_PROC.
%define reg_save_list rsi, rdi, rbx, r12, r13, r14

align 32
BITS 64
; DO_LIMB i
;
; Handles limb i, counted from the current positions of the pointers:
;   [SumP + 8*i] = [Inp1P + 8*i] - [Inp2P + 8*i] - CF   (new borrow left in CF)
;   E1:E0       += [YP - 8*i] if that subtraction produced a borrow, else += 0
;
; The error sum is accumulated with adox, which carries through OF, so the
; CF produced by sbb survives until the sbb of the next limb. None of the
; instructions between the two sbb's modifies CF.
; Clobbers LIMB0 and OF; increments Dummy.
%macro DO_LIMB 1
mov LIMB0, [Inp1P + %1*8]
sbb LIMB0, [Inp2P + %1*8]
mov [SumP + %1*8], LIMB0
; Y is walked backwards: limb i of the difference pairs with [YP - 8*i].
mov LIMB0, [YP - %1*8]
; No borrow out of this limb => contribute 0 instead of the Y limb.
cmovnc LIMB0, Zero
; inc keeps CF and leaves OF = 0, except when Dummy steps from
; 0x7FFFFFFFFFFFFFFF to 0x8000000000000000 (signed overflow sets OF).
inc Dummy ; OF = 0
adox E0, LIMB0
; Propagate the carry out of E0 (held in OF) into the high limb.
adox E1, Zero
%endmacro
; mpn_sub_err1_n -- Windows x64 entry point.
;
; Subtracts the Size limbs at Inp2P and a borrow-in (bit 0 of the seventh
; argument) from the Size limbs at Inp1P, stores the difference at SumP and
; returns the borrow-out (0 or 1) in rax.
; For every limb whose subtraction produced a borrow, the matching limb of Y
; (taken in reverse order, starting from its last limb) is added to a two-limb
; accumulator that is finally stored at [EP] (low) and [EP+8] (high).
;
; FRAME_PROC, END_PROC and stack_use come from yasm_mac.inc, which is not part
; of this file; presumably they save/restore reg_save_list, define stack_use
; as the number of bytes pushed, and emit the return -- confirm there.
        FRAME_PROC mpn_sub_err1_n, 0, reg_save_list

        ; Move the Windows arguments into the registers used by the body.
        mov     rdi, rcx                        ; SumP
        mov     rsi, rdx                        ; Inp1P
        mov     rdx, r8                         ; Inp2P
        mov     r11, r9                         ; EP
        mov     r8, [rsp + stack_use + 40]      ; YP
        mov     r9, [rsp + stack_use + 48]      ; Size
        mov     LIMB0, [rsp + stack_use + 56]   ; borrow-in

        mov     SizeRest, Size
        lea     YP, [YP + Size*8 - 8]           ; YP -> last limb of Y
        and     SizeRest, 7                     ; limbs left after the 8-limb groups
        xor     Zero, Zero
        ; DO_LIMB relies on "inc Dummy" to clear OF. Start Dummy at zero so the
        ; increment can never step over 0x7FFFFFFFFFFFFFFF and set OF instead.
        ; Must happen before the flag setup below because xor changes CF and ZF.
        xor     Dummy, Dummy
        mov     E0, Zero
        mov     E1, Zero

        ; Flag setup: shr leaves ZF = (no 8-limb group), bt then loads the
        ; borrow-in into CF. The jz below tests the ZF produced by shr, which
        ; relies on bt not modifying ZF.
        shr     Size, 3
        bt      LIMB0, 0
        jz      .testrest

        ; Main loop: eight limbs per pass. lea and dec leave CF alone, so the
        ; borrow chain runs unbroken from one pass to the next.
        align   16
.loop:
        DO_LIMB 0
        DO_LIMB 1
        DO_LIMB 2
        DO_LIMB 3
        DO_LIMB 4
        DO_LIMB 5
        DO_LIMB 6
        DO_LIMB 7
        lea     Inp1P, [Inp1P+64]
        lea     Inp2P, [Inp2P+64]
        lea     SumP, [SumP+64]
        lea     YP, [YP-64]
        dec     Size
        jne     .loop

.testrest:
        ; inc/dec pair: sets ZF from SizeRest without touching CF.
        inc     SizeRest
        dec     SizeRest
        jz      .exit

        ; Tail: up to seven limbs, handled four at a time.
.rest:
        DO_LIMB 0
        dec     SizeRest
        jz      .exit
        DO_LIMB 1
        dec     SizeRest
        jz      .exit
        DO_LIMB 2
        dec     SizeRest
        jz      .exit
        DO_LIMB 3
        dec     SizeRest
        jz      .exit
        lea     Inp1P, [Inp1P+32]
        lea     Inp2P, [Inp2P+32]
        lea     SumP, [SumP+32]
        lea     YP, [YP-32]
        jmp     .rest

.exit:
        mov     rax, Zero                       ; mov keeps CF intact
        setc    al                              ; rax = borrow-out
        mov     [EP], E0
        mov     [EP+8], E1
        END_PROC reg_save_list