Add mod_1_3 assembler to Windows

gladman 2009-10-05 08:29:15 +00:00
parent 4f0a431658
commit 343fe6ce88
3 changed files with 204 additions and 14 deletions


@ -1164,6 +1164,7 @@
>
<FileConfiguration
Name="Debug|x64"
ExcludedFromBuild="true"
>
<Tool
Name="VCCLCompilerTool"
@ -1172,6 +1173,7 @@
</FileConfiguration>
<FileConfiguration
Name="Release|x64"
ExcludedFromBuild="true"
>
<Tool
Name="VCCLCompilerTool"
@ -2526,6 +2528,10 @@
RelativePath="..\..\mpn\x86_64w\amd64\mod_1_2.asm"
>
</File>
<File
RelativePath="..\..\mpn\x86_64w\amd64\mod_1_3.asm"
>
</File>
<File
RelativePath="..\..\mpn\x86_64w\mode1o.asm"
>


@ -3059,6 +3059,7 @@
>
<FileConfiguration
Name="Debug|x64"
ExcludedFromBuild="true"
>
<Tool
Name="VCCLCompilerTool"
@ -3067,6 +3068,7 @@
</FileConfiguration>
<FileConfiguration
Name="Release|x64"
ExcludedFromBuild="true"
>
<Tool
Name="VCCLCompilerTool"
@ -4466,6 +4468,10 @@
RelativePath="..\..\mpn\x86_64w\amd64\mod_1_2.asm"
>
</File>
<File
RelativePath="..\..\mpn\x86_64w\amd64\mod_1_3.asm"
>
</File>
<File
RelativePath="..\..\mpn\x86_64w\mode1o.asm"
>
@ -4513,24 +4519,10 @@
<File
RelativePath="..\..\mpn\x86_64w\amd64\rsh_divrem_hensel_qr_1_1.asm"
>
<FileConfiguration
Name="Release|x64"
>
<Tool
Name="YASM"
/>
</FileConfiguration>
</File>
<File
RelativePath="..\..\mpn\x86_64w\amd64\rsh_divrem_hensel_qr_1_2.asm"
>
<FileConfiguration
Name="Release|x64"
>
<Tool
Name="YASM"
/>
</FileConfiguration>
</File>
<File
RelativePath="..\..\mpn\x86_64w\amd64\rshift.asm"


@ -0,0 +1,192 @@
; Copyright 2009 Jason Moxham
;
; Windows Conversion Copyright 2008 Brian Gladman
;
; This file is part of the MPIR Library.
;
; The MPIR Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published
; by the Free Software Foundation; either version 2.1 of the License, or (at
; your option) any later version.
;
; The MPIR Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
; License for more details.
;
; You should have received a copy of the GNU Lesser General Public License
; along with the MPIR Library; see the file COPYING.LIB. If not, write
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
; Boston, MA 02110-1301, USA.
;
; mp_limb_t mpn_mod_1_3(mp_ptr, mp_ptr, mp_size_t, mp_ptr)
; linux        rax       rdi     rsi     rdx        rcx
; win64        rax       rcx     rdx     r8d        r9
; (rdi,2) = not fully reduced remainder of (rsi,rdx) / divisor, and top limb < d
; where (rcx,4) contains B^i % divisor
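; the routine folds three source limbs per loop pass into a two limb partial
; remainder using these precomputed powers of B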
%include '..\yasm_mac.inc'
CPU Athlon64
BITS 64
%define reg_save_list rsi, rdi, r12, r13, r14, r15
FRAME_PROC mpn_mod_1_3, 0, reg_save_list
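; FRAME_PROC and END_PROC (from yasm_mac.inc) wrap the routine in a win64
; prologue and epilogue that save and restore the registers in reg_save_list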
mov rsi, rdx
movsxd rdi, r8d
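; // shuffle the win64 arguments: rsi = source limbs (from rdx), rdi = limb count
; // (sign extended from r8d); rcx keeps the two limb destination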
mov r15, [rsi+rdi*8-8]
mov r14, [rsi+rdi*8-16]
mov rax, [rsi+rdi*8-32]
mov r12, [rsi+rdi*8-40]
mov r8, [r9]
mov r10, [r9+16]
mov r11, [r9+24]
mov r9, [r9+8]
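; // load the four precomputed values (B^1, B^2, B^3, B^4 mod divisor, per the
; // comment above) into r8, r9, r10, r11; r9 is filled last as it held the table pointer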
sub rdi, 8
jc L_skiplp
xalign 16
; // pending high to low: r15 r14 [rsi+rdi*8+40] [rsi+rdi*8+32]=rax [rsi+rdi*8+24]=r12
L_lp:
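; // one pass: r13:r12 = r12 + rax*r8 + (next limb)*r9 + r14*r10, then
; // r15:r14 = r13:r12 + r15*r11, and r12, rax are reloaded for the next pass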
mul r8
add r12, rax
mov rax, [rsi+rdi*8+40]
mov r13, 0
adc r13, rdx
mul r9
add r12, rax
nop
adc r13, rdx
mov rax, r10
mul r14
add r12, rax
adc r13, rdx
mov r14, r12
mov rax, r11
mul r15
add r14, rax
mov r12, [rsi+rdi*8+0]
mov r15, r13
mov rax, [rsi+rdi*8+8]
adc r15, rdx
sub rdi, 3
jnc L_lp
L_skiplp:
; // we have already loaded the next two limbs (r12 and rax)
; // but because they are out of order we may have to do 3 limbs minimum
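; // rdi is now -1, -2 or -3: three, two or one more source limbs respectively
; // remain to be fetched (L_case3, L_case2, L_case1)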
cmp rdi, -2
jl L_case1
je L_case2
L_case3:
; // three more limbs is 5 limbs
; // pending high to low: r15 r14 [rsi+rdi*8+40] [rsi+rdi*8+32]=rax [rsi+rdi*8+24]=r12
mul r8
add r12, rax
mov rax, [rsi+rdi*8+40]
mov r13, 0
adc r13, rdx
mul r9
add r12, rax
nop
adc r13, rdx
mov rax, r10
mul r14
add r12, rax
adc r13, rdx
mov r14, r12
mov rax, r11
mul r15
add r14, rax
mov r12, [rsi+rdi*8+8]
mov r15, r13
mov rax, [rsi+rdi*8+16]
adc r15, rdx
; // r15 r14 rax r12
mov r13, 0
mul r8
add r12, rax
adc r13, rdx
mov rax, r9
mul r14
add r12, rax
adc r13, rdx
mov rax, r10
mul r15
add r12, rax
adc r13, rdx
; // r13 r12
mov rax, r8
mul r13
jmp L_xit
xalign 16
L_case2:
; // two more limbs is 4 limbs
; // pending high to low: r15 r14 [rsi+rdi*8+40] [rsi+rdi*8+32]=rax [rsi+rdi*8+24]=r12
mul r8
add r12, rax
mov rax, [rsi+rdi*8+40]
mov r13, 0
adc r13, rdx
mul r9
add r12, rax
nop
adc r13, rdx
mov rax, r10
mul r14
add r12, rax
adc r13, rdx
mov r14, r12
mov rax, r11
mul r15
add r14, rax
mov r12, [rsi+rdi*8+16]
mov r15, r13
adc r15, rdx
; // r15 r14 r12
mov r13, 0
mov rax, r8
mul r14
add r12, rax
adc r13, rdx
mov rax, r9
mul r15
add r12, rax
adc r13, rdx
; // r13 r12
mov rax, r8
mul r13
jmp L_xit
xalign 16
L_case1:
; // one more limb is 3 limbs
; // pending high to low: r15 r14 [rsi+rdi*8+40] [rsi+rdi*8+32]=rax [rsi+rdi*8+24]=r12
mul r8
add r12, rax
mov rax, [rsi+rdi*8+40]
mov r13, 0
adc r13, rdx
mul r9
add r12, rax
nop
adc r13, rdx
mov rax, r10
mul r14
add r12, rax
adc r13, rdx
mov rax, r11
mul r15
add r12, rax
mov r15, r13
adc r15, rdx
mov rax, r8
mul r15
L_xit:
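; // add the low half of the final product, carry into its high half, and store
; // the two limb remainder at the destination (rcx)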
add r12, rax
adc rdx, 0
mov [rcx], r12
mov [rcx+8], rdx
END_PROC reg_save_list
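
For reference, here is a minimal C sketch of the folding scheme the routine above implements. It assumes 64-bit limbs, a compiler that provides unsigned __int128, and a four-entry table holding B^1..B^4 mod divisor, which is how the header comment describes the last argument and how the code uses it. The name mod_1_3_fold and its signature are illustrative only (not MPIR API), and for brevity it only handles n >= 5 with n % 3 == 2, where the assembler instead has three separate tail cases.

#include <stdint.h>

typedef uint64_t limb;                        /* one 64-bit limb, B = 2^64 */

/* Fold the n-limb number a[] into a two-limb value r[1]:r[0] congruent to it
   modulo the divisor, given t[j] = B^(j+1) mod divisor for j = 0..3.
   Sketch only: requires n >= 5 and n % 3 == 2. */
static void mod_1_3_fold(limb r[2], const limb *a, long n, const limb t[4])
{
    unsigned __int128 acc;
    limb p1 = a[n - 1];                       /* two-limb partial remainder  */
    limb p0 = a[n - 2];                       /* (high, low)                 */
    long k;

    for (k = n - 5; k >= 0; k -= 3) {         /* absorb three limbs per pass */
        acc  = (unsigned __int128)a[k];                /* relative weight 1   */
        acc += (unsigned __int128)a[k + 1] * t[0];     /* * (B   mod divisor) */
        acc += (unsigned __int128)a[k + 2] * t[1];     /* * (B^2 mod divisor) */
        acc += (unsigned __int128)p0       * t[2];     /* * (B^3 mod divisor) */
        acc += (unsigned __int128)p1       * t[3];     /* * (B^4 mod divisor) */
        p0 = (limb)acc;
        p1 = (limb)(acc >> 64);
    }

    /* The remaining high limb p1 carries weight B, so one more multiply by
       (B mod divisor) leaves a two-limb result whose top limb is below the
       divisor (the "not fully reduced remainder" that the routine stores). */
    acc = (unsigned __int128)p1 * t[0] + p0;
    r[0] = (limb)acc;
    r[1] = (limb)(acc >> 64);
}

As in the assembler, the two-limb accumulations rely on the divisor being small enough that none of the sums can overflow; the MPIR code that selects mpn_mod_1_3 is expected to guarantee this.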