dnl AMD64 mpn_mulmid_basecase dnl Based on mul_basecase.asm from GMP 4.3.1, modifications are copyright dnl (C) 2009, David Harvey. The original mul_basecase.asm was released under dnl LGPLv3+, license terms reproduced below. These modifications are hereby dnl released under the same terms. dnl ========= Original license terms: dnl Contributed to the GNU project by Torbjorn Granlund and David Harvey. dnl Copyright 2008 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. dnl The GNU MP Library is free software; you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public License as published dnl by the Free Software Foundation; either version 3 of the License, or (at dnl your option) any later version. dnl The GNU MP Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public dnl License for more details. dnl You should have received a copy of the GNU Lesser General Public License dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. dnl ========= end license terms include(`../config.m4') C cycles/limb C K8,K9: 2.375 (2.5 when un - vn is "small") C K10: ? C P4: ? C P6-15: ? C INPUT PARAMETERS define(`rp', `%rdi') define(`up', `%rsi') define(`un_param',`%rdx') define(`vp_param',`%rcx') define(`vn', `%r8') define(`vn32', `%r8d') define(`v0', `%r12') define(`v1', `%r9') define(`w0', `%rbx') define(`w1', `%rcx') define(`w2', `%rbp') define(`w3', `%r10') define(`w032', `%ebx') define(`w132', `%ecx') define(`w232', `%ebp') define(`w332', `%r10d') define(`n', `%r11') define(`outer_addr', `%r14') define(`un', `%r13') define(`un32',`%r13d') define(`vp', `%r15') define(`vp_inner', `%r10') ASM_START() TEXT ALIGN(16) PROLOGUE(mpn_mulmid_basecase) push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 mov vp_param, vp C use un for row length (= un_param - vn + 1) lea 1(un_param), un sub vn, un lea (rp,un,8), rp cmp $4, un C FIXME: needs tuning jc L(diagonal) lea (up,un_param,8), up test $1, vn jz L(mul_2) C =========================================================== C mul_1 for vp[0] if vn is odd L(mul_1): mov un32, w032 neg un mov (up,un,8), %rax mov (vp), v0 mul v0 and $-4, un C round down to multiple of 4 mov un, n and $3, w032 jz L(mul_1_prologue_0) cmp $2, w032 jc L(mul_1_prologue_1) jz L(mul_1_prologue_2) L(mul_1_prologue_3): mov %rax, w3 mov %rdx, w0 lea L(addmul_prologue_3)(%rip), outer_addr jmp L(mul_1_entry_3) ALIGN(16) L(mul_1_prologue_0): mov %rax, w2 mov %rdx, w3 C note already w0 == 0 lea L(addmul_prologue_0)(%rip), outer_addr jmp L(mul_1_entry_0) ALIGN(16) L(mul_1_prologue_1): add $4, n mov %rax, w1 mov %rdx, w2 mov $0, w332 mov (up,n,8), %rax lea L(addmul_prologue_1)(%rip), outer_addr jmp L(mul_1_entry_1) ALIGN(16) L(mul_1_prologue_2): mov %rax, w0 mov %rdx, w1 mov 24(up,n,8), %rax mov $0, w232 mov $0, w332 lea L(addmul_prologue_2)(%rip), outer_addr jmp L(mul_1_entry_2) C this loop is 10 c/loop = 2.5 c/l on K8 ALIGN(16) L(mul_1_top): mov w0, -16(rp,n,8) add %rax, w1 mov (up,n,8), %rax adc %rdx, w2 L(mul_1_entry_1): mov $0, w032 mul v0 mov w1, -8(rp,n,8) add %rax, w2 adc %rdx, w3 L(mul_1_entry_0): mov 8(up,n,8), %rax mul v0 mov w2, (rp,n,8) add %rax, w3 adc %rdx, w0 L(mul_1_entry_3): mov 16(up,n,8), %rax mul v0 mov w3, 8(rp,n,8) mov $0, w232 C zero mov w2, w3 C zero add %rax, w0 mov 24(up,n,8), %rax mov w2, w1 C zero adc %rdx, w1 L(mul_1_entry_2): mul v0 add $4, n js L(mul_1_top) mov w0, -16(rp) add %rax, w1 mov w1, -8(rp) mov w2, 8(rp) C zero last limb of output adc %rdx, w2 mov w2, (rp) dec vn jz L(ret) lea -8(up), up lea 8(vp), vp mov un, n mov (vp), v0 mov 8(vp), v1 jmp *outer_addr C =========================================================== C mul_2 for vp[0], vp[1] if vn is even ALIGN(16) L(mul_2): mov un32, w032 neg un mov -8(up,un,8), %rax mov (vp), v0 mov 8(vp), v1 mul v1 and $-4, un C round down to multiple of 4 mov un, n and $3, w032 jz L(mul_2_prologue_0) cmp $2, w032 jc L(mul_2_prologue_1) jz L(mul_2_prologue_2) L(mul_2_prologue_3): mov %rax, w1 mov %rdx, w2 lea L(addmul_prologue_3)(%rip), outer_addr jmp L(mul_2_entry_3) ALIGN(16) L(mul_2_prologue_0): mov %rax, w0 mov %rdx, w1 lea L(addmul_prologue_0)(%rip), outer_addr jmp L(mul_2_entry_0) ALIGN(16) L(mul_2_prologue_1): mov %rax, w3 mov %rdx, w0 mov $0, w132 lea L(addmul_prologue_1)(%rip), outer_addr jmp L(mul_2_entry_1) ALIGN(16) L(mul_2_prologue_2): mov %rax, w2 mov %rdx, w3 mov $0, w032 mov 16(up,n,8), %rax lea L(addmul_prologue_2)(%rip), outer_addr jmp L(mul_2_entry_2) C this loop is 18 c/loop = 2.25 c/l on K8 ALIGN(16) L(mul_2_top): mov -8(up,n,8), %rax mul v1 add %rax, w0 adc %rdx, w1 L(mul_2_entry_0): mov $0, w232 mov (up,n,8), %rax mul v0 add %rax, w0 mov (up,n,8), %rax adc %rdx, w1 adc $0, w232 mul v1 add %rax, w1 mov w0, (rp,n,8) adc %rdx, w2 L(mul_2_entry_3): mov 8(up,n,8), %rax mul v0 mov $0, w332 add %rax, w1 adc %rdx, w2 mov $0, w032 adc $0, w332 mov 8(up,n,8), %rax mov w1, 8(rp,n,8) mul v1 add %rax, w2 mov 16(up,n,8), %rax adc %rdx, w3 L(mul_2_entry_2): mov $0, w132 mul v0 add %rax, w2 mov 16(up,n,8), %rax adc %rdx, w3 adc $0, w032 mul v1 add %rax, w3 mov w2, 16(rp,n,8) adc %rdx, w0 L(mul_2_entry_1): mov 24(up,n,8), %rax mul v0 add %rax, w3 adc %rdx, w0 adc $0, w132 add $4, n mov w3, -8(rp,n,8) jnz L(mul_2_top) mov w0, (rp) mov w1, 8(rp) sub $2, vn jz L(ret) lea 16(vp), vp lea -16(up), up mov un, n mov (vp), v0 mov 8(vp), v1 jmp *outer_addr C =========================================================== C addmul_2 for remaining vp's ALIGN(16) L(addmul_prologue_0): mov -8(up,n,8), %rax mul v1 mov %rax, w1 mov %rdx, w2 mov $0, w332 jmp L(addmul_entry_0) ALIGN(16) L(addmul_prologue_1): mov 16(up,n,8), %rax mul v1 mov %rax, w0 mov %rdx, w1 mov $0, w232 mov 24(up,n,8), %rax jmp L(addmul_entry_1) ALIGN(16) L(addmul_prologue_2): mov 8(up,n,8), %rax mul v1 mov %rax, w3 mov %rdx, w0 mov $0, w132 jmp L(addmul_entry_2) ALIGN(16) L(addmul_prologue_3): mov (up,n,8), %rax mul v1 mov %rax, w2 mov %rdx, w3 mov $0, w032 mov $0, w132 jmp L(addmul_entry_3) C this loop is 19 c/loop = 2.375 c/l on K8 ALIGN(16) L(addmul_top): mov $0, w332 add %rax, w0 mov -8(up,n,8), %rax adc %rdx, w1 adc $0, w232 mul v1 add w0, -8(rp,n,8) adc %rax, w1 adc %rdx, w2 L(addmul_entry_0): mov (up,n,8), %rax mul v0 add %rax, w1 mov (up,n,8), %rax adc %rdx, w2 adc $0, w332 mul v1 add w1, (rp,n,8) mov $0, w132 adc %rax, w2 mov $0, w032 adc %rdx, w3 L(addmul_entry_3): mov 8(up,n,8), %rax mul v0 add %rax, w2 mov 8(up,n,8), %rax adc %rdx, w3 adc $0, w032 mul v1 add w2, 8(rp,n,8) adc %rax, w3 adc %rdx, w0 L(addmul_entry_2): mov 16(up,n,8), %rax mul v0 add %rax, w3 mov 16(up,n,8), %rax adc %rdx, w0 adc $0, w132 mul v1 add w3, 16(rp,n,8) nop C don't ask... adc %rax, w0 mov $0, w232 mov 24(up,n,8), %rax adc %rdx, w1 L(addmul_entry_1): mul v0 add $4, n jnz L(addmul_top) add %rax, w0 adc %rdx, w1 adc $0, w232 add w0, -8(rp) adc w1, (rp) adc w2, 8(rp) sub $2, vn jz L(ret) lea 16(vp), vp lea -16(up), up mov un, n mov (vp), v0 mov 8(vp), v1 jmp *outer_addr C =========================================================== C accumulate along diagonals if un - vn is small ALIGN(16) L(diagonal): xor w032, w032 xor w132, w132 xor w232, w232 neg un mov vn32, %eax and $3, %eax jz L(diag_prologue_0) cmp $2, %eax jc L(diag_prologue_1) jz L(diag_prologue_2) L(diag_prologue_3): lea -8(vp), vp mov vp, vp_inner add $1, vn mov vn, n lea L(diag_entry_3)(%rip), outer_addr jmp L(diag_entry_3) L(diag_prologue_0): mov vp, vp_inner mov vn, n lea 0(%rip), outer_addr mov -8(up,n,8), %rax jmp L(diag_entry_0) L(diag_prologue_1): lea 8(vp), vp mov vp, vp_inner add $3, vn mov vn, n lea 0(%rip), outer_addr mov -8(vp_inner), %rax jmp L(diag_entry_1) L(diag_prologue_2): lea -16(vp), vp mov vp, vp_inner add $2, vn mov vn, n lea 0(%rip), outer_addr mov 16(vp_inner), %rax jmp L(diag_entry_2) C this loop is 10 c/loop = 2.5 c/l on K8 ALIGN(16) L(diag_top): add %rax, w0 adc %rdx, w1 mov -8(up,n,8), %rax adc $0, w2 L(diag_entry_0): mulq (vp_inner) add %rax, w0 adc %rdx, w1 adc $0, w2 L(diag_entry_3): mov -16(up,n,8), %rax mulq 8(vp_inner) add %rax, w0 mov 16(vp_inner), %rax adc %rdx, w1 adc $0, w2 L(diag_entry_2): mulq -24(up,n,8) add %rax, w0 mov 24(vp_inner), %rax adc %rdx, w1 lea 32(vp_inner), vp_inner adc $0, w2 L(diag_entry_1): mulq -32(up,n,8) sub $4, n jnz L(diag_top) add %rax, w0 adc %rdx, w1 adc $0, w2 mov w0, (rp,un,8) inc un jz L(diag_end) mov vn, n mov vp, vp_inner lea 8(up), up mov w1, w0 mov w2, w1 xor w232, w232 jmp *outer_addr L(diag_end): mov w1, (rp) mov w2, 8(rp) L(ret): pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx ret EPILOGUE()