;
; AMD64 mpn_mulmid_basecase
;
; Based on mul_basecase.asm from GMP 4.3.1, modifications are copyright
; (C) 2009, David Harvey. The original mul_basecase.asm was released under
; LGPLv3+, license terms reproduced below. These modifications are hereby
; released under the same terms.
;
; Windows Conversion Copyright 2010 Dr B R Gladman
;
; Contributed to the GNU project by Torbjorn Granlund and David Harvey.
;
; Copyright 2008 Free Software Foundation, Inc.
;
; This file is part of the GNU MP Library.
;
; The GNU MP Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published
; by the Free Software Foundation; either version 3 of the License, or (at
; your option) any later version.
;
; The GNU MP Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
; License for more details.
;
; You should have received a copy of the GNU Lesser General Public License
; along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
;
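; void mpn_mulmid_basecase(mp_ptr, mp_ptr, mp_size_t, mp_ptr, mp_size_t)
; rax                         rdi     rsi        rdx     rcx         r8
; rax                         rcx     rdx         r8      r9   [rsp+40]
;
; Each register is listed under the argument it carries.  First row: the
; registers the code body reads its arguments from.  Second row: where the
; same arguments arrive under the Win64 calling convention (the fifth one
; is on the stack).
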
%define reg_save_list rbx, rsi, rdi, rbp, r12, r13, r14, r15

%include 'yasm_mac.inc'

;-----------------------------------------------------------------------
; mpn_mulmid_basecase(rp, up, un, vp, vn)
;
; WIN64_GCC_PROC / WIN64_GCC_END, xalign and TEXT come from yasm_mac.inc,
; which is not visible here.  NOTE(review): the PROC macro is presumed to
; move the five Win64 arguments into the registers below and to save and
; restore everything in reg_save_list -- confirm against yasm_mac.inc.
;
; Arguments as the body sees them:
;   rdi  rp   result pointer.  Advanced by `row` limbs during set-up, so
;             the row is addressed with negative indices and the two top
;             limbs are [rdi] and [rdi+8].
;   rsi  up   first source operand
;   rdx  un   limb count of up (used during set-up only)
;   rcx  vp   second source operand (copied to r15; rcx is then scratch)
;   r8   vn   limb count of vp; counts down as limbs of vp are consumed
;
;   row = un + 1 - vn.   row + 2 result limbs are written.
;
; Working registers:
;   r12, r9             multiplier limbs [r15] and [r15+8]
;   r11                 inner-loop index
;   r13                 -row (rounded down to a multiple of 4 on the
;                       row-wise path)
;   r14                 address at which the next pass is entered
;   r15                 current position in vp
;   rbx, rcx, rbp, r10  rotating accumulator limbs
;   rax, rdx            mul operand / product
;
; Two strategies, selected on `row`:
;   row >= 4  row-wise.  A first pass multiplies by one limb of vp (vn
;             odd, .0) or two limbs (vn even, .10) and STORES the result;
;             every later pass (.20-.28) multiplies by two more limbs and
;             ADDS into the result.  All loops are unrolled four times;
;             row mod 4 picks the loop entry point and, via r14, the
;             matching entry of the following passes.
;   row <  4  column-wise (.29).  Each result limb is produced by summing
;             one column of products into a three-limb accumulator.
;
; The many `mov reg, 0` are deliberate: several sit between an add/adc
; pair where the carry flag is live, and `xor reg, reg` would destroy it.
; Do not "optimise" them.
;-----------------------------------------------------------------------

        BITS    64

        TEXT

        xalign  16
        WIN64_GCC_PROC mpn_mulmid_basecase, 5, frame
        mov     r15, rcx                ; r15 = vp, frees rcx for the accumulator

        lea     r13, [rdx+1]
        sub     r13, r8                 ; r13 = row = un + 1 - vn

        lea     rdi, [rdi+r13*8]        ; rdi -> limb just past the row

        cmp     r13, 4
        jc      .29                     ; row < 4: column-wise path

        lea     rsi, [rsi+rdx*8]        ; rsi = up + un limbs (row-wise path only)

        test    r8, 1
        jz      .10                     ; vn even: first pass uses two limbs

; ---- first pass, vn odd: row * [r15], result stored -------------------
.0:     mov     ebx, r13d               ; keep row for the mod-4 dispatch

        neg     r13
        mov     rax, [rsi+r13*8]
        mov     r12, [r15]
        mul     r12                     ; first product of the pass

        and     r13, -4                 ; -row rounded down to a multiple of 4
        mov     r11, r13

        and     ebx, 3                  ; row mod 4 selects the loop entry
        jz      .2
        cmp     ebx, 2
        jc      .3
        jz      .4

.1:     mov     r10, rax                ; row mod 4 == 3
        mov     rbx, rdx
        lea     r14, [rel .23]
        jmp     .8

        xalign  16
.2:     mov     rbp, rax                ; row mod 4 == 0 (rbx is already 0 here)
        mov     r10, rdx
        lea     r14, [rel .20]
        jmp     .7

        xalign  16
.3:     add     r11, 4                  ; row mod 4 == 1
        mov     rcx, rax
        mov     rbp, rdx
        mov     r10d, 0
        mov     rax, [rsi+r11*8]
        lea     r14, [rel .21]
        jmp     .6

        xalign  16
.4:     mov     rbx, rax                ; row mod 4 == 2
        mov     rcx, rdx
        mov     rax, [rsi+r11*8+24]
        mov     ebp, 0
        mov     r10d, 0
        lea     r14, [rel .22]
        jmp     .9

        ; one-limb multiply loop, four limbs per iteration
        xalign  16
.5:     mov     [rdi+r11*8-16], rbx
        add     rcx, rax
        mov     rax, [rsi+r11*8]
        adc     rbp, rdx
.6:     mov     ebx, 0
        mul     r12
        mov     [rdi+r11*8-8], rcx
        add     rbp, rax
        adc     r10, rdx
.7:     mov     rax, [rsi+r11*8+8]
        mul     r12
        mov     [rdi+r11*8], rbp
        add     r10, rax
        adc     rbx, rdx
.8:     mov     rax, [rsi+r11*8+16]
        mul     r12
        mov     [rdi+r11*8+8], r10
        mov     ebp, 0
        mov     r10, rbp                ; r10 = 0
        add     rbx, rax
        mov     rax, [rsi+r11*8+24]
        mov     rcx, rbp                ; rcx = 0; mov keeps CF from the add
        adc     rcx, rdx
.9:     mul     r12
        add     r11, 4
        js      .5                      ; loop while the index is negative
        mov     [rdi-16], rbx
        add     rcx, rax
        mov     [rdi-8], rcx
        mov     [rdi+8], rbp            ; rbp is 0 here: clears the top limb
        adc     rbp, rdx                ; rbp = high product + carry
        mov     [rdi], rbp
        dec     r8                      ; one limb of vp consumed
        jz      .40
        lea     rsi, [rsi-8]            ; set up the next (two-limb) pass
        lea     r15, [r15+8]
        mov     r11, r13
        mov     r12, [r15]
        mov     r9, [r15+8]
        jmp     r14

; ---- first pass, vn even: row * ([r15], [r15+8]), result stored --------
        xalign  16
.10:    mov     ebx, r13d               ; keep row for the mod-4 dispatch

        neg     r13
        mov     rax, [rsi+r13*8-8]
        mov     r12, [r15]
        mov     r9, [r15+8]
        mul     r9                      ; first product of the pass
        and     r13, -4                 ; -row rounded down to a multiple of 4
        mov     r11, r13
        and     ebx, 3                  ; row mod 4 selects the loop entry
        jz      .12
        cmp     ebx, 2
        jc      .13
        jz      .14
.11:    mov     rcx, rax                ; row mod 4 == 3
        mov     rbp, rdx
        lea     r14, [rel .23]
        jmp     .17

        xalign  16
.12:    mov     rbx, rax                ; row mod 4 == 0
        mov     rcx, rdx
        lea     r14, [rel .20]
        jmp     .16

        xalign  16
.13:    mov     r10, rax                ; row mod 4 == 1
        mov     rbx, rdx
        mov     ecx, 0
        lea     r14, [rel .21]
        jmp     .19

        xalign  16
.14:    mov     rbp, rax                ; row mod 4 == 2
        mov     r10, rdx
        mov     ebx, 0
        mov     rax, [rsi+r11*8+16]
        lea     r14, [rel .22]
        jmp     .18

        ; two-limb multiply loop, four limbs per iteration
        xalign  16
.15:    mov     rax, [rsi+r11*8-8]
        mul     r9
        add     rbx, rax
        adc     rcx, rdx
.16:    mov     ebp, 0
        mov     rax, [rsi+r11*8]
        mul     r12
        add     rbx, rax
        mov     rax, [rsi+r11*8]
        adc     rcx, rdx
        adc     ebp, 0
        mul     r9
        add     rcx, rax
        mov     [rdi+r11*8], rbx
        adc     rbp, rdx
.17:    mov     rax, [rsi+r11*8+8]
        mul     r12
        mov     r10d, 0
        add     rcx, rax
        adc     rbp, rdx
        mov     ebx, 0                  ; mov, not xor: CF is live here
        adc     r10d, 0
        mov     rax, [rsi+r11*8+8]
        mov     [rdi+r11*8+8], rcx
        mul     r9
        add     rbp, rax
        mov     rax, [rsi+r11*8+16]
        adc     r10, rdx
.18:    mov     ecx, 0
        mul     r12
        add     rbp, rax
        mov     rax, [rsi+r11*8+16]
        adc     r10, rdx
        adc     ebx, 0
        mul     r9
        add     r10, rax
        mov     [rdi+r11*8+16], rbp
        adc     rbx, rdx
.19:    mov     rax, [rsi+r11*8+24]
        mul     r12
        add     r10, rax
        adc     rbx, rdx
        adc     ecx, 0
        add     r11, 4
        mov     [rdi+r11*8-8], r10      ; mov leaves ZF from the add intact
        jnz     .15
        mov     [rdi], rbx              ; two top limbs
        mov     [rdi+8], rcx
        sub     r8, 2                   ; two limbs of vp consumed
        jz      .40
        lea     r15, [r15+16]           ; set up the next pass
        lea     rsi, [rsi-16]
        mov     r11, r13
        mov     r12, [r15]
        mov     r9, [r15+8]
        jmp     r14

; ---- later passes: row * ([r15], [r15+8]) added into the result --------
; r14 points at one of the four prologues below, chosen by row mod 4 in
; the first pass; every later pass re-enters at the same one.
        xalign  16
.20:    mov     rax, [rsi+r11*8-8]      ; row mod 4 == 0
        mul     r9
        mov     rcx, rax
        mov     rbp, rdx
        mov     r10d, 0
        jmp     .25

        xalign  16
.21:    mov     rax, [rsi+r11*8+16]     ; row mod 4 == 1
        mul     r9
        mov     rbx, rax
        mov     rcx, rdx
        mov     ebp, 0
        mov     rax, [rsi+r11*8+24]
        jmp     .28

        xalign  16
.22:    mov     rax, [rsi+r11*8+8]      ; row mod 4 == 2
        mul     r9
        mov     r10, rax
        mov     rbx, rdx
        mov     ecx, 0
        jmp     .27

        xalign  16
.23:    mov     rax, [rsi+r11*8]        ; row mod 4 == 3
        mul     r9
        mov     rbp, rax
        mov     r10, rdx
        mov     ebx, 0
        mov     ecx, 0
        jmp     .26

        ; two-limb multiply-accumulate loop, four limbs per iteration
        xalign  16
.24:    mov     r10d, 0
        add     rbx, rax
        mov     rax, [rsi+r11*8-8]
        adc     rcx, rdx
        adc     ebp, 0
        mul     r9
        add     [rdi+r11*8-8], rbx      ; accumulate into the result ...
        adc     rcx, rax                ; ... and carry on into the next limb
        adc     rbp, rdx
.25:    mov     rax, [rsi+r11*8]
        mul     r12
        add     rcx, rax
        mov     rax, [rsi+r11*8]
        adc     rbp, rdx
        adc     r10d, 0
        mul     r9
        add     [rdi+r11*8], rcx
        mov     ecx, 0                  ; mov, not xor: CF is live here
        adc     rbp, rax
        mov     ebx, 0                  ; mov, not xor: CF is live here
        adc     r10, rdx
.26:    mov     rax, [rsi+r11*8+8]
        mul     r12
        add     rbp, rax
        mov     rax, [rsi+r11*8+8]
        adc     r10, rdx
        adc     ebx, 0
        mul     r9
        add     [rdi+r11*8+8], rbp
        adc     r10, rax
        adc     rbx, rdx
.27:    mov     rax, [rsi+r11*8+16]
        mul     r12
        add     r10, rax
        mov     rax, [rsi+r11*8+16]
        adc     rbx, rdx
        adc     ecx, 0
        mul     r9
        add     [rdi+r11*8+16], r10
        nop                             ; < not translated > -- kept from the conversion; it sits inside the
                                        ; add/adc chain and changes no state (purpose not recorded here)
        adc     rbx, rax
        mov     ebp, 0                  ; mov, not xor: CF is live here
        mov     rax, [rsi+r11*8+24]
        adc     rcx, rdx
.28:    mul     r12
        add     r11, 4
        jnz     .24
        add     rbx, rax
        adc     rcx, rdx
        adc     ebp, 0
        add     [rdi-8], rbx            ; fold the tail into the top three limbs
        adc     [rdi], rcx
        adc     [rdi+8], rbp
        sub     r8, 2                   ; two more limbs of vp consumed
        jz      .40
        lea     r15, [r15+16]           ; set up the next pass
        lea     rsi, [rsi-16]
        mov     r11, r13
        mov     r12, [r15]
        mov     r9, [r15+8]
        jmp     r14

; ---- row < 4: column-wise --------------------------------------------
; rbx:rcx:rbp is a three-limb column accumulator.  r8 is rounded up to a
; multiple of 4 and r15 adjusted to match, so that the four-way unrolled
; inner loop can be entered part-way through its first iteration.  r14
; holds the address each further column restarts from.
        xalign  16
.29:    xor     ebx, ebx
        xor     ecx, ecx
        xor     ebp, ebp
        neg     r13                     ; r13 = -row, counts up to 0
        mov     eax, r8d
        and     eax, 3                  ; vn mod 4 selects the loop entry
        jz      .31
        cmp     eax, 2
        jc      .32
        jz      .33
.30:    lea     r15, [r15-8]            ; vn mod 4 == 3
        mov     r10, r15
        add     r8, 1
        mov     r11, r8
        lea     r14, [rel .36]
        jmp     .36
.31:    mov     r10, r15                ; vn mod 4 == 0
        mov     r11, r8
        lea     r14, [rel .diag0_reload] ; was [rip+0]: restart at the reload below
.diag0_reload:
        mov     rax, [rsi+r11*8-8]
        jmp     .35
.32:    lea     r15, [r15+8]            ; vn mod 4 == 1
        mov     r10, r15
        add     r8, 3
        mov     r11, r8
        lea     r14, [rel .diag1_reload] ; was [rip+0]: restart at the reload below
.diag1_reload:
        mov     rax, [r10-8]
        jmp     .38
.33:    lea     r15, [r15-16]           ; vn mod 4 == 2
        mov     r10, r15
        add     r8, 2
        mov     r11, r8
        lea     r14, [rel .diag2_reload] ; was [rip+0]: restart at the reload below
.diag2_reload:
        mov     rax, [r10+16]
        jmp     .37

        ; column loop: four products per iteration, r10 walks vp upwards
        ; while the rsi index walks downwards
        xalign  16
.34:    add     rbx, rax
        adc     rcx, rdx
        mov     rax, [rsi+r11*8-8]      ; mov keeps CF for the adc below
        adc     rbp, 0
.35:    mul     qword [r10]
        add     rbx, rax
        adc     rcx, rdx
        adc     rbp, 0
.36:    mov     rax, [rsi+r11*8-16]
        mul     qword [r10+8]
        add     rbx, rax
        mov     rax, [r10+16]
        adc     rcx, rdx
        adc     rbp, 0
.37:    mul     qword [rsi+r11*8-24]
        add     rbx, rax
        mov     rax, [r10+24]
        adc     rcx, rdx
        lea     r10, [r10+32]           ; lea keeps CF for the adc below
        adc     rbp, 0
.38:    mul     qword [rsi+r11*8-32]
        sub     r11, 4
        jnz     .34
        add     rbx, rax
        adc     rcx, rdx
        adc     rbp, 0
        mov     [rdi+r13*8], rbx        ; low limb of the column is final
        inc     r13
        jz      .39                     ; all `row` columns done
        mov     r11, r8                 ; restart the inner loop ...
        mov     r10, r15
        lea     rsi, [rsi+8]            ; ... one limb further along up
        mov     rbx, rcx                ; shift the accumulator down one limb
        mov     rcx, rbp
        xor     ebp, ebp
        jmp     r14
.39:    mov     [rdi], rcx              ; two top limbs
        mov     [rdi+8], rbp
.40:
        WIN64_GCC_END

        end