; AMD64 mpn_mul_basecase optimised for Intel Haswell. ; Contributed to the GNU project by Torbjörn Granlund. ; Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. ; This file is part of the GNU MP Library. ; ; The GNU MP Library is free software; you can redistribute it and/or modify ; it under the terms of either: ; ; * the GNU Lesser General Public License as published by the Free ; Software Foundation; either version 3 of the License, or (at your ; option) any later version. ; ; or ; ; * the GNU General Public License as published by the Free Software ; Foundation; either version 2 of the License, or (at your option) any ; later version. ; ; or both in parallel, as here. ; ; The GNU MP Library is distributed in the hope that it will be useful, but ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ; for more details. ; ; You should have received copies of the GNU General Public License and the ; GNU Lesser General Public License along with the GNU MP Library. If not, ; see https://www.gnu.org/licenses/. 
;
; mp_limb_t mpn_mul_basecase(mp_ptr, mp_ptr, mp_size_t, mp_ptr, mp_size_t)
;   rax                      rdi     rsi     rdx        rcx     r8
;   rax                      rcx     rdx     r8         r9      [rsp+40]
;
; The first register row is the layout the body works with (see the %define
; aliases below); the second row is where the arguments arrive on entry and
; is what the five moves at the top of the procedure translate from.
;
; Purpose: multiply the limb vector at up (un_param limbs) by the limb vector
; at vp (vn limbs, arriving in r8 after the entry moves) and store the product
; limbs at rp.
;
; Structure:
;   1. vn odd  -> one "mul_1" pass  (up * vp[0])      writes rp
;      vn even -> one "mul_2" pass  (up * vp[0..1])   writes rp
;   2. every remaining pair of vp limbs is handled by an "addmul_2" pass
;      (.Louter) that adds up * vp[i..i+1] into the limbs already at rp.
;
; All three inner loops are unrolled four ways and are entered at one of four
; labels chosen from the low two bits of un, so no separate tail code is
; needed.
;
; Syntax: Intel operand order (yasm).  "mulx hi, lo, src" computes rdx * src
; and does not modify any flag, which is why carries stay live across it.
;
; NOTE(review): the code has no path for vn == 0 or un == 0; presumably the
; caller guarantees un >= 1 and vn >= 1 -- confirm against callers.
; NOTE(review): FRAME_PROC, END_PROC and stack_use are defined in
; yasm_mac.inc (not visible here); presumably FRAME_PROC saves the registers
; in reg_save_list and stack_use is the distance from the current rsp back to
; the rsp at entry -- confirm.

%include 'yasm_mac.inc'

; Registers handed to FRAME_PROC / END_PROC.
%define reg_save_list rbx, rbp, rsi, rdi, r12, r13, r14, r15

; Register roles used by the body.
;   rp        destination pointer
;   up        first source pointer
;   un_param  length of up as passed in (rdx is needed by mulx, so it is
;             copied out immediately)
;   vp        second source pointer
;   un        NEGATED length of up; un8 is its low byte
;   w0..w3    rotating temporaries for product halves
;   n         unrolled-loop counter
; Unaliased registers: r8 = vn (later vp[1] in the addmul passes),
; r9 / r14 / r15 = extra temporaries or multiplier limbs, rax = scratch.
%define rp rdi
%define up rsi
%define un_param rdx
%define vp rcx
%define un rbx
%define un8 bl
%define w0 r10
%define w1 r11
%define w2 r12
%define w3 r13
%define n rbp

BITS 64
align 16
FRAME_PROC mpn_mul_basecase, 0, reg_save_list

; Move the incoming arguments (rcx, rdx, r8, r9, stack) into the register
; layout described by the aliases above.  Order matters: each source is read
; before it is overwritten.
        mov     rdi, rcx
        mov     rsi, rdx
        mov     rdx, r8
        mov     rcx, r9
        mov     r8, [rsp+stack_use+40]          ; r8 = vn (fifth argument)

        mov     un, rdx
        neg     un                              ; un = -un_param
        mov     n, rdx
        sar     n, 2                            ; n = un_param / 4 (mul_1 loop count)

        test    r8b, 1
        jz      .Ldo_mul_2                      ; vn even: start with a two-limb pass

;-----------------------------------------------------------------------
; mul_1 pass: rp[] = up[] * vp[0].
; rdx holds the multiplier limb for every mulx below.  The four setup
; variants compute the first one to three products and bias rp so that the
; first store executed inside the shared loop body lands on rp[0].
; Because un is negated, bit 1 of un8 is set for un_param = 1 or 2 (mod 4)
; and clear for 0 or 3 (mod 4).
;-----------------------------------------------------------------------
        mov     rdx, [vp]
.Ldo_mul_1:
        test    un8, 1
        jnz     .Lm1x1                          ; odd length
.Lm1x0:
        test    un8, 2
        jnz     .Lm110
.Lm100:                                         ; un_param = 0 (mod 4)
        mulx    w2, r14, [up]
        mulx    w3, w1, [up+8]
        lea     rp, [rp-24]                     ; first store is [rp+24] at .Lm1l0
        jmp     .Lm1l0                          ; CF is clear from the test above
.Lm110:                                         ; un_param = 2 (mod 4)
        mulx    r9, w3, [up]
        mulx    r14, w1, [up+8]
        lea     rp, [rp-8]                      ; first store is [rp+8]
        test    n, n                            ; also clears CF for the adc that follows
        jz      .Lcj2                           ; only two limbs: straight to the tail
        mulx    w2, w0, [up+16]
        lea     up, [up+16]
        jmp     .Lm1l2
.Lm1x1:
        test    un8, 2
        jz      .Lm111
.Lm101:                                         ; un_param = 1 (mod 4)
        mulx    r14, r9, [up]
        lea     rp, [rp-16]                     ; first store is [rp+16]
        test    n, n                            ; also clears CF
        jz      .Lcj1                           ; single limb: straight to the tail
        mulx    w2, w0, [up+8]
        lea     up, [up+8]
        jmp     .Lm1l1
.Lm111:                                         ; un_param = 3 (mod 4)
        mulx    w3, w2, [up]
        mulx    r9, w0, [up+8]
        mulx    r14, w1, [up+16]
        lea     up, [up+24]
        test    n, n
        jnz     .Lgt3
        add     w3, w0                          ; starts the carry chain
        jmp     .Lcj3
.Lgt3:
        add     w3, w0                          ; starts the carry chain
        jmp     .Lm1l3

; Main mul_1 loop: four limbs of up per iteration.  CF carries the running
; adc chain from one step to the next; mulx, mov, lea and dec all leave CF
; unchanged, so the chain survives the loop-control instructions.
        align   32
.Lm1tp:
        lea     rp, [rp+32]
.Lm1l3:
        mov     [rp], w2
        mulx    w2, w0, [up]
.Lm1l2:
        mov     [rp+8], w3
        adc     r9, w1
.Lm1l1:
        adc     r14, w0
        mov     [rp+16], r9
        mulx    w3, w1, [up+8]
.Lm1l0:
        mov     [rp+24], r14
        mulx    r9, w0, [up+16]
        adc     w2, w1
        mulx    r14, w1, [up+24]
        adc     w3, w0
        lea     up, [up+32]
        dec     n                               ; sets ZF for jnz, keeps CF
        jnz     .Lm1tp

; mul_1 tail: flush the limbs still held in registers and fold the final
; carry into the top limb.
.Lm1ed:
        lea     rp, [rp+32]
.Lcj3:
        mov     [rp], w2
.Lcj2:
        mov     [rp+8], w3
        adc     r9, w1
.Lcj1:
        mov     [rp+16], r9
        adc     r14, 0
        mov     [rp+24], r14

        dec     r8d                             ; one limb of vp consumed
        jz      .Lret5                          ; vn was 1: finished
        lea     vp, [vp+8]
        lea     rp, [rp+32]                     ; do_addmul rewinds rp by un limbs from here
        jmp     .Ldo_addmul

;-----------------------------------------------------------------------
; mul_2 pass: rp[] = up[] * (vp[0], vp[1]).
; r9 = vp[0], r14 = vp[1]; rdx is reloaded with successive limbs of up.
; Here n is derived from the NEGATED length and counts upwards to zero.
; The jz after each setup mulx still sees ZF from the preceding test,
; because mov and mulx do not touch flags; the same test leaves CF clear
; for the first adc executed inside the loop body.
;-----------------------------------------------------------------------
.Ldo_mul_2:
        mov     r9, [vp]
        mov     r14, [vp+8]
        lea     n, [un]
        sar     n, 2
        test    un8, 1
        jnz     .Lm2x1
.Lm2x0:                                         ; even length
        xor     w0, w0
        test    un8, 2
        mov     rdx, [up]
        mulx    w1, w2, r9
        jz      .Lm2l0
.Lm210:
        lea     rp, [rp-16]                     ; first store is [rp+16]
        lea     up, [up-16]                     ; so [up+24] addresses the second limb
        jmp     .Lm2l2
.Lm2x1:                                         ; odd length
        xor     w2, w2
        test    un8, 2
        mov     rdx, [up]
        mulx    w3, w0, r9
        jz      .Lm211
.Lm201:
        lea     rp, [rp-24]                     ; first store is [rp+24]
        lea     up, [up+8]
        jmp     .Lm2l1
.Lm211:
        lea     rp, [rp-8]                      ; first store is [rp+8]
        lea     up, [up-8]                      ; so [up+16] addresses the second limb
        jmp     .Lm2l3

; Main mul_2 loop: four limbs of up per iteration.  Each step multiplies the
; current up limb (in rdx) by r14, loads the next up limb, multiplies it by
; r9, and stores one finished result limb.
        align   16
.Lm2tp:
        mulx    w0, rax, r14
        add     w2, rax
        mov     rdx, [up]
        mulx    w1, rax, r9
        adc     w0, 0
        add     w2, rax
        adc     w1, 0
        add     w2, w3
.Lm2l0:
        mov     [rp], w2
        adc     w1, 0
        mulx    w2, rax, r14
        add     w0, rax
        mov     rdx, [up+8]
        adc     w2, 0
        mulx    w3, rax, r9
        add     w0, rax
        adc     w3, 0
        add     w0, w1
.Lm2l3:
        mov     [rp+8], w0
        adc     w3, 0
        mulx    w0, rax, r14
        add     w2, rax
        mov     rdx, [up+16]
        mulx    w1, rax, r9
        adc     w0, 0
        add     w2, rax
        adc     w1, 0
        add     w2, w3
.Lm2l2:
        mov     [rp+16], w2
        adc     w1, 0
        mulx    w2, rax, r14
        add     w0, rax
        mov     rdx, [up+24]
        adc     w2, 0
        mulx    w3, rax, r9
        add     w0, rax
        adc     w3, 0
        add     w0, w1
        lea     up, [up+32]
.Lm2l1:
        mov     [rp+24], w0
        adc     w3, 0
        inc     n                               ; sets ZF for jnz; lea below keeps it
        lea     rp, [rp+32]
        jnz     .Lm2tp

; mul_2 tail: last up limb times r14 produces the two top result limbs.
.Lm2ed:
        mulx    rax, rdx, r14
        add     w2, rdx
        adc     rax, 0
        add     w2, w3
        mov     [rp], w2
        adc     rax, 0
        mov     [rp+8], rax

        add     r8d, -2                         ; two limbs of vp consumed
        jz      .Lret5                          ; vn was 2: finished
        lea     vp, [vp+16]
        lea     rp, [rp+16]

;-----------------------------------------------------------------------
; addmul_2 passes: for each remaining pair of vp limbs,
;   rp[] += up[] * (vp[0], vp[1]).
; The remaining vn count is parked in a stack slot because r8 is reused for
; vp[1].  NOTE(review): [rsp+stack_use+8] is presumably a caller-provided
; home slot above the return address -- confirm against yasm_mac.inc.
; un is negative, so adding un*8 rewinds rp and up by un_param limbs.
;-----------------------------------------------------------------------
.Ldo_addmul:
        mov     [rsp+stack_use+8], r8
        lea     rp, [un*8+rp]
        lea     up, [un*8+up]

.Louter:
        mov     r9, [vp]                        ; r9 = low multiplier limb
        mov     r8, [vp+8]                      ; r8 = high multiplier limb
        lea     n, [un+2]
        sar     n, 2                            ; negative loop counter, counts up to zero
        mov     rdx, [up]
        test    un8, 1
        jnz     .Lbx1

; Setup for even length.  r14 / r15 hold the rp limbs being accumulated.
.Lbx0:
        mov     r14, [rp]
        mov     r15, [rp+8]
        mulx    w1, rax, r9
        add     r14, rax
        mulx    w2, rax, r8
        adc     w1, 0
        mov     [rp], r14
        add     r15, rax
        adc     w2, 0
        mov     rdx, [up+8]
        test    un8, 2
        jnz     .Lb10
.Lb00:
        lea     up, [up+16]
        lea     rp, [rp+16]
        jmp     .Llo0
.Lb10:
        mov     r14, [rp+16]
        lea     up, [up+32]
        mulx    w3, rax, r9
        jmp     .Llo2

; Setup for odd length (r15 / r14 roles are swapped relative to .Lbx0).
.Lbx1:
        mov     r15, [rp]
        mov     r14, [rp+8]
        mulx    w3, rax, r9
        add     r15, rax
        adc     w3, 0
        mulx    w0, rax, r8
        add     r14, rax
        adc     w0, 0
        mov     rdx, [up+8]
        mov     [rp], r15
        mulx    w1, rax, r9
        test    un8, 2
        jz      .Lb11
.Lb01:
        mov     r15, [rp+16]
        lea     rp, [rp+24]
        lea     up, [up+24]
        jmp     .Llo1
.Lb11:
        lea     rp, [rp+8]
        lea     up, [up+8]
        jmp     .Llo3

; Main addmul_2 loop: four limbs of up per iteration.  rax receives the low
; half of every product, w0..w3 the high halves; r14 and r15 alternate as the
; accumulator for the rp limb currently being finished.
        align   16
.Ltop:
        mulx    w3, rax, r9
        add     r15, w0
        adc     w2, 0
.Llo2:
        add     r15, rax
        adc     w3, 0
        mulx    w0, rax, r8
        add     r14, rax
        adc     w0, 0
        lea     rp, [rp+32]
        add     r15, w1
        mov     rdx, [up-16]
        mov     [rp-24], r15
        adc     w3, 0
        add     r14, w2
        mov     r15, [rp-8]
        mulx    w1, rax, r9
        adc     w0, 0
.Llo1:
        add     r14, rax
        mulx    w2, rax, r8
        adc     w1, 0
        add     r14, w3
        mov     [rp-16], r14
        adc     w1, 0
        add     r15, rax
        adc     w2, 0
        add     r15, w0
        mov     rdx, [up-8]
        adc     w2, 0
.Llo0:
        mulx    w3, rax, r9
        add     r15, rax
        adc     w3, 0
        mov     r14, [rp]
        mulx    w0, rax, r8
        add     r14, rax
        adc     w0, 0
        add     r15, w1
        mov     [rp-8], r15
        adc     w3, 0
        mov     rdx, [up]
        add     r14, w2
        mulx    w1, rax, r9
        adc     w0, 0
.Llo3:
        add     r14, rax
        adc     w1, 0
        mulx    w2, rax, r8
        add     r14, w3
        mov     r15, [rp+8]
        mov     [rp], r14
        mov     r14, [rp+16]
        adc     w1, 0
        add     r15, rax
        adc     w2, 0
        mov     rdx, [up+8]
        lea     up, [up+32]
        inc     n                               ; sets ZF for jnz; lea above it does not matter
        jnz     .Ltop

; addmul_2 tail: finish the last accumulated limb and store the two new top
; limbs produced by this pass.
.Lend:
        mulx    w3, rax, r9
        add     r15, w0
        adc     w2, 0
        add     r15, rax
        adc     w3, 0
        mulx    rax, rdx, r8
        add     r15, w1
        mov     [rp+8], r15
        adc     w3, 0
        add     rdx, w2
        adc     rax, 0
        add     rdx, w3
        mov     [rp+16], rdx
        adc     rax, 0
        mov     [rp+24], rax

; Outer-loop control.  The three lea instructions do not modify flags, so
; jnz still tests the result of the add on the saved count.
        add     DWORD [rsp+stack_use+8], -2     ; two more limbs of vp consumed
        lea     vp, [vp+16]
        lea     up, [up+un*8-16]                ; rewind up to its first limb
        lea     rp, [rp+un*8+32]                ; rewind rp, then step one limb pair forward
        jnz     .Louter

; Reload the counter slot into rax; its low 32 bits are zero here because
; the loop above only exits when the add produced zero.
; NOTE(review): the paths that jump straight to .Lret5 do not load rax, so
; the returned value is presumably not meaningful to callers -- confirm.
        mov     rax, [rsp+stack_use+8]
.Lret5:
END_PROC reg_save_list