; mpir/mpn/x86_64w/mulmid_basecase.asm
;
; 422 lines
; 8.3 KiB
; NASM

;
; AMD64 mpn_mulmid_basecase
;
; Based on mul_basecase.asm from GMP 4.3.1, modifications are copyright
; (C) 2009, David Harvey. The original mul_basecase.asm was released under
; LGPLv3+, license terms reproduced below. These modifications are hereby
; released under the same terms.
;
; Windows Conversion Copyright 2010 Dr B R Gladman
;
; Contributed to the GNU project by Torbjorn Granlund and David Harvey.
;
; Copyright 2008 Free Software Foundation, Inc.
;
; This file is part of the GNU MP Library.
;
; The GNU MP Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published
; by the Free Software Foundation; either version 3 of the License, or (at
; your option) any later version.
;
; The GNU MP Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
; License for more details.
;
; You should have received a copy of the GNU Lesser General Public License
; along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
;
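; void mpn_mulmid_basecase(mp_ptr, mp_ptr, mp_size_t, mp_ptr, mp_size_t)
;
; Register assignment, listed as: return value, then arguments 1 to 5.
;   registers read by the code below:  rax   rdi  rsi  rdx  rcx  r8
;   Win64 registers on entry:          rax   rcx  rdx  r8   r9   [rsp+40]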
; Registers written by this routine that the Win64 ABI requires a callee to
; preserve.
; NOTE(review): presumably read by the WIN64_GCC_PROC / WIN64_GCC_END macros
; (defined in yasm_mac.inc, not visible here) to emit the save and restore
; code - confirm against the macro file.
%define reg_save_list rbx, rsi, rdi, rbp, r12, r13, r14, r15
%include 'yasm_mac.inc'
BITS 64
TEXT
xalign 16
; Entry point. From here on the code reads its five arguments from
; rdi, rsi, rdx, rcx and r8 (arguments 1 to 5 in that order).
; NOTE(review): the macro presumably moves the Win64 incoming arguments into
; those registers; the second macro parameter (5) looks like the argument
; count - confirm in yasm_mac.inc.
; NOTE(review): by GMP naming these are presumably rp (result), up and un
; (first operand and its limb count), vp and vn (second operand and its limb
; count); the prototype comment only gives the types.
WIN64_GCC_PROC mpn_mulmid_basecase, 5, frame
; rcx is reused as an accumulator limb later on, so argument 4 lives in r15.
mov r15, rcx
; r13 = rdx + 1 - r8
lea r13, [rdx+1]
sub r13, r8
; Advance rdi by r13 limbs (8 bytes each). Both paths below address the
; result relative to this advanced pointer, using negative limb indices and
; finishing with stores at [rdi] and [rdi+8].
lea rdi, [rdi+r13*8]
; r13 < 4 (unsigned): take the separate code path at .29.
cmp r13, 4
jc .29
; Advance rsi by rdx limbs; the row loops index it with negative offsets.
; (The .29 path is entered before this adjustment and uses rsi unmodified.)
lea rsi, [rsi+rdx*8]
; Even r8: start with the two-multiplier pass at .10.
; Odd r8: fall through to .0, which consumes one multiplier limb first.
test r8, 1
jz .10
; ---------------------------------------------------------------------------
; Single-multiplier first pass (entered when r8 is odd).
;
; Multiplies a run of limbs read through rsi by the one limb r12 = [r15] and
; STORES (does not add) the product limbs through rdi. The loop .5-.9 is
; unrolled four times; r13 mod 4 selects which of the four entry points
; (.6, .7, .8, .9) is used for the first, partial trip.
;
; Register use in this block:
;   r12                 multiplier limb
;   r11                 negative limb index, stepped by 4 up to zero
;   r13                 start value of r11, kept for later passes
;   rbx, rcx, rbp, r10  rotating product / carry limbs
;   r14                 address of the entry point (.20-.23) that later passes
;                       must use; chosen to match r13 mod 4
; ---------------------------------------------------------------------------
; ebx = low 32 bits of r13 (still positive here), used below for r13 mod 4.
.0: mov ebx, r13d
; r13 becomes a negative limb index.
neg r13
; First product: rdx:rax = [rsi+r13*8] * [r15].
mov rax, [rsi+r13*8]
mov r12, [r15]
mul r12
; Round the negative index down to a multiple of 4 and use it as loop index.
and r13, -4
mov r11, r13
; Dispatch on r13 mod 4: 0 -> .2, 1 -> .3, 2 -> .4, 3 -> falls into .1.
; A zero result here also leaves rbx == 0, which .2 relies on.
and ebx, 3
jz .2
cmp ebx, 2
jc .3
jz .4
; r13 mod 4 == 3: first product goes to r10 (low) and rbx (high).
.1: mov r10, rax
mov rbx, rdx
lea r14, [rel .23]
jmp .8
xalign 16
; r13 mod 4 == 0: first product goes to rbp (low) and r10 (high).
; rbx is already zero from the "and ebx, 3" above.
.2: mov rbp, rax
mov r10, rdx
lea r14, [rel .20]
jmp .7
xalign 16
; r13 mod 4 == 1: step the index to the next group of four, then enter at .6
; with the next source limb already loaded in rax.
.3: add r11, 4
mov rcx, rax
mov rbp, rdx
mov r10d, 0
mov rax, [rsi+r11*8]
lea r14, [rel .21]
jmp .6
xalign 16
; r13 mod 4 == 2: first product goes to rbx (low) and rcx (high); enter at .9
; with the next source limb already loaded in rax.
.4: mov rbx, rax
mov rcx, rdx
mov rax, [rsi+r11*8+24]
mov ebp, 0
mov r10d, 0
lea r14, [rel .22]
jmp .9
xalign 16
; Main loop: each of the four steps stores one finished limb and folds the
; product rdx:rax of the next source limb into the two following carry limbs.
.5: mov [rdi+r11*8-16], rbx
add rcx, rax
mov rax, [rsi+r11*8]
adc rbp, rdx
.6: mov ebx, 0
mul r12
mov [rdi+r11*8-8], rcx
add rbp, rax
adc r10, rdx
.7: mov rax, [rsi+r11*8+8]
mul r12
mov [rdi+r11*8], rbp
add r10, rax
adc rbx, rdx
.8: mov rax, [rsi+r11*8+16]
mul r12
mov [rdi+r11*8+8], r10
; Zero rbp, r10 and rcx. None of the mov instructions between the add and the
; adc below changes the carry flag.
mov ebp, 0
mov r10, rbp
add rbx, rax
mov rax, [rsi+r11*8+24]
mov rcx, rbp
adc rcx, rdx
.9: mul r12
; Loop while the index is still negative after stepping by 4.
add r11, 4
js .5
; Wind-down: flush the limbs still held in registers.
mov [rdi-16], rbx
add rcx, rax
mov [rdi-8], rcx
; rbp is zero here (cleared at .8, or in .4 on the direct entry), so this
; writes a zero limb at [rdi+8]; the mov leaves the carry from the add intact.
mov [rdi+8], rbp
adc rbp, rdx
mov [rdi], rbp
; One multiplier limb has been consumed; finish if none are left.
dec r8
jz .40
; Otherwise move rsi back one limb and r15 forward one limb, reset the loop
; index, load the next two multiplier limbs into r12 and r9, and continue at
; the entry point recorded in r14.
lea rsi, [rsi-8]
lea r15, [r15+8]
mov r11, r13
mov r12, [r15]
mov r9, [r15+8]
jmp r14
xalign 16
; ---------------------------------------------------------------------------
; Two-multiplier first pass (entered when r8 is even).
;
; Same structure as the single-multiplier pass, but every source limb is
; multiplied by two multiplier limbs, r12 = [r15] and r9 = [r15+8], and the
; combined result is STORED (not added) through rdi. The loop .15-.19 is
; unrolled four times; r13 mod 4 selects the entry point (.16, .17, .18, .19).
;
; Register use in this block:
;   r12, r9             the two multiplier limbs
;   r11                 negative limb index, stepped by 4 up to zero
;   r13                 start value of r11, kept for later passes
;   rbx, rcx, rbp, r10  rotating three-limb accumulator window
;   r14                 address of the entry point (.20-.23) for later passes
; ---------------------------------------------------------------------------
; ebx = low 32 bits of r13 (still positive here), used below for r13 mod 4.
.10:mov ebx, r13d
; r13 becomes a negative limb index.
neg r13
; First product: rdx:rax = [rsi+r13*8-8] * [r15+8].
mov rax, [rsi+r13*8-8]
mov r12, [r15]
mov r9, [r15+8]
mul r9
; Round the negative index down to a multiple of 4 and use it as loop index.
and r13, -4
mov r11, r13
; Dispatch on r13 mod 4: 0 -> .12, 1 -> .13, 2 -> .14, 3 -> falls into .11.
and ebx, 3
jz .12
cmp ebx, 2
jc .13
jz .14
; r13 mod 4 == 3: first product goes to rcx (low) and rbp (high).
.11:mov rcx, rax
mov rbp, rdx
lea r14, [rel .23]
jmp .17
xalign 16
; r13 mod 4 == 0: first product goes to rbx (low) and rcx (high).
.12:mov rbx, rax
mov rcx, rdx
lea r14, [rel .20]
jmp .16
xalign 16
; r13 mod 4 == 1: first product goes to r10 (low) and rbx (high).
.13:mov r10, rax
mov rbx, rdx
mov ecx, 0
lea r14, [rel .21]
jmp .19
xalign 16
; r13 mod 4 == 2: first product goes to rbp (low) and r10 (high); enter at .18
; with the next source limb already loaded in rax.
.14:mov rbp, rax
mov r10, rdx
mov ebx, 0
mov rax, [rsi+r11*8+16]
lea r14, [rel .22]
jmp .18
xalign 16
; Main loop. Each source limb is multiplied by r12 and by r9; the r12 product
; completes the current output limb and the r9 product starts the next one.
.15:mov rax, [rsi+r11*8-8]
mul r9
add rbx, rax
adc rcx, rdx
.16:mov ebp, 0
mov rax, [rsi+r11*8]
mul r12
add rbx, rax
mov rax, [rsi+r11*8]
adc rcx, rdx
adc ebp, 0
mul r9
add rcx, rax
mov [rdi+r11*8], rbx
adc rbp, rdx
.17:mov rax, [rsi+r11*8+8]
mul r12
mov r10d, 0
add rcx, rax
adc rbp, rdx
; mov (rather than xor) clears rbx without disturbing the carry that the
; following adc still has to pick up.
mov ebx, 0
adc r10d, 0
mov rax, [rsi+r11*8+8]
mov [rdi+r11*8+8], rcx
mul r9
add rbp, rax
mov rax, [rsi+r11*8+16]
adc r10, rdx
.18:mov ecx, 0
mul r12
add rbp, rax
mov rax, [rsi+r11*8+16]
adc r10, rdx
adc ebx, 0
mul r9
add r10, rax
mov [rdi+r11*8+16], rbp
adc rbx, rdx
.19:mov rax, [rsi+r11*8+24]
mul r12
add r10, rax
adc rbx, rdx
adc ecx, 0
; Step the index; the store below is a mov and so keeps the zero flag from
; this add for the loop branch.
add r11, 4
mov [rdi+r11*8-8], r10
jnz .15
; Wind-down: store the two limbs still held in registers.
mov [rdi], rbx
mov [rdi+8], rcx
; Two multiplier limbs have been consumed; finish if none are left.
sub r8, 2
jz .40
; Otherwise move r15 forward two limbs and rsi back two limbs, reset the loop
; index, load the next multiplier pair, and continue at the entry point
; recorded in r14.
lea r15, [r15+16]
lea rsi, [rsi-16]
mov r11, r13
mov r12, [r15]
mov r9, [r15+8]
jmp r14
xalign 16
; ---------------------------------------------------------------------------
; Two-multiplier accumulate pass.
;
; Reached only through "jmp r14" after an earlier pass. Like the
; two-multiplier first pass, every source limb is multiplied by r12 and r9,
; but the results are ADDED into the limbs already stored through rdi.
; .20, .21, .22 and .23 are the four entry points (one per value of
; r13 mod 4, as selected when r14 was set); each forms the first r9 product
; and jumps into the matching position of the unrolled loop .24-.28.
;
; On entry: r11 = r13 (negative index, multiple of 4), r12 and r9 hold the
; current multiplier pair, rsi and r15 have already been adjusted.
; ---------------------------------------------------------------------------
; Entry for r13 mod 4 == 0.
.20:mov rax, [rsi+r11*8-8]
mul r9
mov rcx, rax
mov rbp, rdx
mov r10d, 0
jmp .25
xalign 16
; Entry for r13 mod 4 == 1; leaves the next source limb loaded in rax.
.21:mov rax, [rsi+r11*8+16]
mul r9
mov rbx, rax
mov rcx, rdx
mov ebp, 0
mov rax, [rsi+r11*8+24]
jmp .28
xalign 16
; Entry for r13 mod 4 == 2.
.22:mov rax, [rsi+r11*8+8]
mul r9
mov r10, rax
mov rbx, rdx
mov ecx, 0
jmp .27
xalign 16
; Entry for r13 mod 4 == 3.
.23:mov rax, [rsi+r11*8]
mul r9
mov rbp, rax
mov r10, rdx
mov ebx, 0
mov ecx, 0
jmp .26
xalign 16
; Main loop. Each step finishes one limb with the r12 product, adds it into
; memory, and carries straight on into the r9 product of the same source
; limb (the memory add's carry is consumed by the adc that follows it).
.24:mov r10d, 0
add rbx, rax
mov rax, [rsi+r11*8-8]
adc rcx, rdx
adc ebp, 0
mul r9
add [rdi+r11*8-8], rbx
adc rcx, rax
adc rbp, rdx
.25:mov rax, [rsi+r11*8]
mul r12
add rcx, rax
mov rax, [rsi+r11*8]
adc rbp, rdx
adc r10d, 0
mul r9
add [rdi+r11*8], rcx
; mov (rather than xor) clears rcx and rbx so that the carry from the memory
; add above survives until the adc instructions.
mov ecx, 0
adc rbp, rax
mov ebx, 0
adc r10, rdx
.26:mov rax, [rsi+r11*8+8]
mul r12
add rbp, rax
mov rax, [rsi+r11*8+8]
adc r10, rdx
adc ebx, 0
mul r9
add [rdi+r11*8+8], rbp
adc r10, rax
adc rbx, rdx
.27:mov rax, [rsi+r11*8+16]
mul r12
add r10, rax
mov rax, [rsi+r11*8+16]
adc rbx, rdx
adc ecx, 0
mul r9
add [rdi+r11*8+16], r10
nop ; < not translated >
adc rbx, rax
mov ebp, 0
mov rax, [rsi+r11*8+24]
adc rcx, rdx
.28:mul r12
; Loop until the index reaches zero.
add r11, 4
jnz .24
; Wind-down: fold in the last product and add the three limbs still held in
; registers into the top of the result.
add rbx, rax
adc rcx, rdx
adc ebp, 0
add [rdi-8], rbx
adc [rdi], rcx
adc [rdi+8], rbp
; Two more multiplier limbs have been consumed; finish if none are left.
sub r8, 2
jz .40
; Otherwise step to the next multiplier pair and run this pass again through
; the same entry point.
lea r15, [r15+16]
lea rsi, [rsi-16]
mov r11, r13
mov r12, [r15]
mov r9, [r15+8]
jmp r14
xalign 16
; ---------------------------------------------------------------------------
; Path taken when r13 < 4.
;
; Produces the result one limb at a time. For each output limb, an inner loop
; (.34-.38, unrolled four times) sums products of limbs read upward through
; r10 with limbs read downward through rsi into the three-limb accumulator
; rbp:rcx:rbx. The low limb is then stored and the accumulator shifted down
; by one limb for the next output position.
;
; Register use in this block:
;   rbx, rcx, rbp  accumulator, low to high
;   r13            negative output index, counts up to zero
;   r10            walking pointer, restarted from r15 for each output limb
;   r11            inner loop counter, restarted from r8 for each output limb
;   r14            address to re-enter the inner loop for the next output limb
; rsi is used as passed in (the advance by rdx limbs is not done on this path).
; ---------------------------------------------------------------------------
.29:xor ebx, ebx
xor ecx, ecx
xor ebp, ebp
neg r13
; Dispatch on r8 mod 4: 0 -> .31, 1 -> .32, 2 -> .33, 3 -> falls into .30.
; Each case biases r15 and rounds r8 up to a multiple of 4 so that the
; unrolled loop can be entered part-way through its first trip.
mov eax, r8d
and eax, 3
jz .31
cmp eax, 2
jc .32
jz .33
; r8 mod 4 == 3: enter at .36. That step loads rax itself, so .36 can also
; serve directly as the re-entry address.
.30:lea r15, [r15-8]
mov r10, r15
add r8, 1
mov r11, r8
lea r14, [rel .36]
jmp .36
; r8 mod 4 == 0: enter at .35.
; "lea r14, [rip+0]" takes the address of the instruction that follows it, so
; a later "jmp r14" re-runs the rax load and the jump below.
.31:mov r10, r15
mov r11, r8
lea r14, [rip+0]
mov rax, [rsi+r11*8-8]
jmp .35
; r8 mod 4 == 1: enter at .38 (same re-entry technique as .31).
.32:lea r15, [r15+8]
mov r10, r15
add r8, 3
mov r11, r8
lea r14, [rip+0]
mov rax, [r10-8]
jmp .38
; r8 mod 4 == 2: enter at .37 (same re-entry technique as .31).
.33:lea r15, [r15-16]
mov r10, r15
add r8, 2
mov r11, r8
lea r14, [rip+0]
mov rax, [r10+16]
jmp .37
xalign 16
; Inner loop: four multiply-accumulate steps per trip. The product formed at
; .38 is added at the top of the next trip (.34) or after the loop exits.
.34:add rbx, rax
adc rcx, rdx
mov rax, [rsi+r11*8-8]
adc rbp, 0
.35:mul qword [r10]
add rbx, rax
adc rcx, rdx
adc rbp, 0
.36:mov rax, [rsi+r11*8-16]
mul qword [r10+8]
add rbx, rax
mov rax, [r10+16]
adc rcx, rdx
adc rbp, 0
.37:mul qword [rsi+r11*8-24]
add rbx, rax
mov rax, [r10+24]
adc rcx, rdx
; lea advances r10 by four limbs without touching the carry that the adc
; below consumes.
lea r10, [r10+32]
adc rbp, 0
.38:mul qword [rsi+r11*8-32]
sub r11, 4
jnz .34
; Add the last product and store the finished low limb.
add rbx, rax
adc rcx, rdx
adc rbp, 0
mov [rdi+r13*8], rbx
; Advance the output index; when it reaches zero only the two upper
; accumulator limbs remain to be written.
inc r13
jz .39
; Set up the next output limb: restart the counter and walking pointer,
; move rsi forward one limb, shift the accumulator down by one limb, and
; re-enter the inner loop.
mov r11, r8
mov r10, r15
lea rsi, [rsi+8]
mov rbx, rcx
mov rcx, rbp
xor ebp, ebp
jmp r14
; Store the remaining two accumulator limbs.
.39:mov [rdi], rcx
mov [rdi+8], rbp
; Common exit for every path.
; NOTE(review): WIN64_GCC_END presumably restores the registers named in
; reg_save_list and returns - confirm in yasm_mac.inc.
.40:
WIN64_GCC_END
end