mpir/mpn/x86_64/sqr_basecase.asm

dnl  AMD64 mpn_sqr_basecase.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2008, 2009, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C The inner loops of this code are the result of running a code generation and
C optimization tool suite written by David Harvey and Torbjorn Granlund.

C NOTES
C   * There is a major stupidity in that we call mpn_mul_1 initially, for a
C     large trip count.  Instead, we should follow the generic/sqr_basecase.c
C     code which uses addmul_2s from the start, conditionally leaving a 1x1
C     multiply to the end.  (In assembly code, one would stop invoking
C     addmul_2s loops when perhaps 3x2s respectively a 2x2s remains.)
C   * Another stupidity is in the sqr_diag_addlsh1 code.  It does not need to
C     save/restore carry, instead it can propagate into the high product word.
C   * Align more labels, should shave off a few cycles.
C   * We can safely use 32-bit size operations, since operands with (2^32)
C     limbs will lead to non-termination in practice.
C   * The jump table could probably be optimized, at least for non-pic.
C   * The special code for n <= 4 was quickly written.  It is probably too
C     large and unnecessarily slow.
C   * Consider combining small cases code so that the n=k-1 code jumps into the
C     middle of the n=k code.
C   * Avoid saving registers for small cases code.
C   * Needed variables:
C    n   r11  input size
C    i   r8   work left, initially n-3, n-4 or n-5 (depends on n mod 4)
C    j   r9   inner loop count
C        r15  unused
C    v0  r13
C    v1  r14
C    rp  rdi
C    up  rsi
C    w0  rbx
C    w1  rcx
C    w2  rbp
C    w3  r10
C    tp  r12
C    lo  rax
C    hi  rdx
C        rsp

C INPUT PARAMETERS
define(`rp',	  `%rdi')
define(`up',	  `%rsi')
define(`n_param', `%rdx')
define(`n_param32', `%edx')

define(`n',	`%r11')
define(`n32',	`%r11d')
define(`tp',	`%r12')
define(`i',	`%r8')
define(`i32',	`%r8d')
define(`j',	`%r9')
define(`j8',	`%r9b')
define(`j32',	`%r9d')
define(`v0',	`%r13')
define(`v1',	`%r14')
define(`w0',	`%rbx')
define(`w032',	`%ebx')
define(`w1',	`%rcx')
define(`w132',	`%ecx')
define(`w2',	`%rbp')
define(`w232',	`%ebp')
define(`w3',	`%r10')
define(`w332',	`%r10d')

ASM_START()
	TEXT
	ALIGN(16)
C mpn_sqr_basecase entry.  Reads rp (rdi), up (rsi) and n (rdx, only the low
C 32 bits are looked at).  Saves rbx, rbp, r12, r13, r14 in a 40-byte frame
C and dispatches through a jump table: n = 1..4 have straight-line code,
C n > 4 goes to one of four entry points selected by n mod 4.
PROLOGUE(mpn_sqr_basecase)
	mov	n_param32, %ecx		C ecx = n, reduced to n mod 4 below
	mov	n_param32, n32		C free original n register (rdx)

	add	$-40, %rsp		C five 8-byte save slots

	and	$3, %ecx		C ecx = n mod 4
	cmp	$4, n_param32		C flags are consumed by the cmovg below
	lea	4(%rcx), %r8		C r8 = 4 + n mod 4 (lea leaves flags alone)

C Save slots: rbx at 32, rbp at 24, r12 at 16, r13 at 8, r14 at 0.  This is
C the order the pops at the end of the function expect.
	mov	%rbx, 32(%rsp)
	mov	%rbp, 24(%rsp)
	mov	%r12, 16(%rsp)
	mov	%r13, 8(%rsp)
	mov	%r14, (%rsp)

	cmovg	%r8, %rcx		C n > 4 (signed 32-bit): index 4..7

	lea	tab(%rip), %rax
C With PIC the entries are read as signed 32-bit values and added to the
C table address; otherwise they are read as 8-byte jump targets.
ifdef(`PIC',
`	movslq	(%rax,%rcx,4), %r10
	add	%r10, %rax
	jmp	*%rax
',`
	jmp	*(%rax,%rcx,8)
')
C JUMPTABSECT and JMPENT are defined outside this file; presumably they
C select the section for the table and emit one entry each - TODO confirm.
	JUMPTABSECT
	ALIGN(8)
tab:	JMPENT(	l4, tab)		C index 0: n = 4 (n = 0 would also land here)
	JMPENT(	l1, tab)		C index 1: n = 1
	JMPENT(	l2, tab)		C index 2: n = 2
	JMPENT(	l3, tab)		C index 3: n = 3
	JMPENT(	l0m4, tab)		C index 4: n > 4, n mod 4 = 0
	JMPENT(	l1m4, tab)		C index 5: n > 4, n mod 4 = 1
	JMPENT(	l2m4, tab)		C index 6: n > 4, n mod 4 = 2
	JMPENT(	l3m4, tab)		C index 7: n > 4, n mod 4 = 3
	TEXT

C n = 1: rp[0..1] = up[0]^2.  Nothing on this path has modified a saved
C register, so the frame is dropped without reloading anything.
l1:	mov	(up), %rax
	mul	%rax			C rdx:rax = up[0]^2
	add	$40, %rsp		C release the save area
	mov	%rax, (rp)
	mov	%rdx, 8(rp)
	ret

C n = 2: rp[0..3] = u0^2 + 2*u0*u1*B + u1^2*B^2, where B is one limb.
C rp[0] is stored directly; r9:r10:r11 start as the three upper limbs of the
C two squares and the cross product u0*u1 is added into them twice.
C Only rax, rdx and r8..r11 are written, so no saved register is reloaded.
l2:	mov	(up), %rax
	mov	%rax, %r8		C r8 = u0
	mul	%rax			C u0^2
	mov	8(up), %r11		C r11 = u1
	mov	%rax, (rp)		C rp[0] = low limb of u0^2
	mov	%r11, %rax
	mov	%rdx, %r9		C r9 = high limb of u0^2
	mul	%rax			C u1^2
	add	$40, %rsp		C release the save area
	mov	%rax, %r10		C r10 = low limb of u1^2
	mov	%r11, %rax
	mov	%rdx, %r11		C r11 = high limb of u1^2
	mul	%r8			C rdx:rax = u1 * u0
	xor	%r8, %r8		C r8 = 0, used only to propagate carries
	add	%rax, %r9		C first addition of the cross product
	adc	%rdx, %r10
	adc	%r8, %r11
	add	%rax, %r9		C second addition of the cross product
	mov	%r9, 8(rp)
	adc	%rdx, %r10
	mov	%r10, 16(rp)
	adc	%r8, %r11
	mov	%r11, 24(rp)
	ret

C n = 3.  First the three squares u0^2, u1^2, u2^2 are stored at rp[0..5].
C Then the cross products u1*u0, u2*u0 and u1*u2 are summed in registers,
C the sum is doubled, and the doubled sum is added into rp[1..5].
C Only rax, rcx, rdx and r8..r11 are written, so no saved register is
C reloaded.
l3:	mov	(up), %rax
	mov	%rax, %r10		C r10 = u0
	mul	%rax
	mov	8(up), %r11		C r11 = u1
	mov	%rax, (rp)		C rp[0..1] = u0^2
	mov	%r11, %rax
	mov	%rdx, 8(rp)
	mul	%rax
	mov	16(up), %rcx		C rcx = u2
	mov	%rax, 16(rp)		C rp[2..3] = u1^2
	mov	%rcx, %rax
	mov	%rdx, 24(rp)
	mul	%rax
	mov	%rax, 32(rp)		C rp[4..5] = u2^2
	mov	%rdx, 40(rp)

	mov	%r11, %rax
	mul	%r10			C u1 * u0
	mov	%rax, %r8
	mov	%rcx, %rax
	mov	%rdx, %r9
	mul	%r10			C u2 * u0
	xor	%r10, %r10		C u0 is no longer needed; r10 = 0
	add	%rax, %r9
	mov	%r11, %rax
	mov	%r10, %r11		C r11 = 0; mov keeps the carry of the add
	adc	%rdx, %r10

	mul	%rcx			C u1 * u2
	add	$40, %rsp		C release the save area
	add	%rax, %r10
	adc	%r11, %rdx		C cross sum is now r8, r9, r10, rdx
	add	%r8, %r8		C double the cross sum ...
	adc	%r9, %r9
	adc	%r10, %r10
	adc	%rdx, %rdx
	adc	%r11, %r11		C ... r11 = 0 + 0 + carry out
	add	%r8, 8(rp)		C add the doubled sum onto the squares
	adc	%r9, 16(rp)
	adc	%r10, 24(rp)
	adc	%rdx, 32(rp)
	adc	%r11, 40(rp)
	ret

C n = 4.  Same scheme as the n = 3 code: the four squares go to rp[0..7],
C the six cross products are summed in r8, r9, r10, r11, rcx, rdx, the sum is
C doubled and added into rp[1..7].
C rbx is used for u1 here, so it is the one saved register that must be
C restored: rsp is advanced by 32 only, which leaves it at the slot where
C rbx was saved, and the later pop both restores rbx and frees the slot.
l4:	mov	(up), %rax
	mov	%rax, %r11		C r11 = u0
	mul	%rax
	mov	8(up), %rbx		C rbx = u1
	mov	%rax, (rp)		C rp[0..1] = u0^2
	mov	%rbx, %rax
	mov	%rdx, 8(rp)
	mul	%rax
	mov	%rax, 16(rp)		C rp[2..3] = u1^2
	mov	%rdx, 24(rp)
	mov	16(up), %rax
	mul	%rax
	mov	%rax, 32(rp)		C rp[4..5] = u2^2
	mov	%rdx, 40(rp)
	mov	24(up), %rax
	mul	%rax
	mov	%rax, 48(rp)		C rp[6..7] = u3^2
	mov	%rbx, %rax
	mov	%rdx, 56(rp)

	mul	%r11			C u1 * u0
	add	$32, %rsp		C rsp now points at the saved rbx
	mov	%rax, %r8
	mov	%rdx, %r9
	mov	16(up), %rax
	mul	%r11			C u2 * u0
	xor	%r10, %r10
	add	%rax, %r9
	adc	%rdx, %r10
	mov	24(up), %rax
	mul	%r11			C u3 * u0
	xor	%r11, %r11		C u0 is no longer needed; r11 = 0
	add	%rax, %r10
	adc	%rdx, %r11
	mov	16(up), %rax
	mul	%rbx			C u2 * u1
	xor	%rcx, %rcx
	add	%rax, %r10
	adc	%rdx, %r11
	adc	$0, %rcx
	mov	24(up), %rax
	mul	%rbx			C u3 * u1, last use of rbx
	pop	%rbx			C restore rbx and free the final slot
	add	%rax, %r11
	adc	%rdx, %rcx
	mov	16(up), %rdx
	mov	24(up), %rax
	mul	%rdx			C u3 * u2
	add	%rax, %rcx
	adc	$0, %rdx

	add	%r8, %r8		C double the cross sum
	adc	%r9, %r9
	adc	%r10, %r10
	adc	%r11, %r11
	adc	%rcx, %rcx
	mov	$0, %eax		C mov, not xor: the carry must survive
	adc	%rdx, %rdx

	adc	%rax, %rax		C rax = carry out of the doubling
	add	%r8, 8(rp)		C add the doubled sum onto the squares
	adc	%r9, 16(rp)
	adc	%r10, 24(rp)
	adc	%r11, 32(rp)
	adc	%rcx, 40(rp)
	adc	%rdx, 48(rp)
	adc	%rax, 56(rp)
	ret


C Entry for n > 4 with n mod 4 = 0.
C Multiplies the limbs up[1..n-1] by u0 and stores the n-limb result at
C rp[1..n].  The loop handles four limbs per pass with w0..w3 rotating as
C accumulators; it is entered in the middle, at lL3, after u0*u1 has been
C formed.  Control then continues at dowhile with i = n - 4.
l0m4:
	lea	-16(rp,n,8), tp		C point tp in middle of result operand
	mov	(up), v0		C v0 = u0
	mov	8(up), %rax		C rax = u1
	lea	(up,n,8), up		C point up at end of input operand

	lea	-4(n), i		C i = n - 4
C Function mpn_mul_1_m3(tp, up - i, i, up[-i - 1])
	xor	j32, j32
	sub	n, j			C j = -n

	mul	v0			C u0 * u1
	xor	w232, w232
	mov	%rax, w0
	mov	16(up,j,8), %rax	C u2
	mov	%rdx, w3
	jmp	lL3

	ALIGN(16)
mul_1_m3_top:
	add	%rax, w2
	mov	w3, (tp,j,8)
	mov	(up,j,8), %rax
	adc	%rdx, w1
	xor	w032, w032
	mul	v0
	xor	w332, w332
	mov	w2, 8(tp,j,8)
	add	%rax, w1
	adc	%rdx, w0
	mov	8(up,j,8), %rax
	mov	w1, 16(tp,j,8)
	xor	w232, w232
	mul	v0
	add	%rax, w0
	mov	16(up,j,8), %rax
	adc	%rdx, w3
lL3:	xor	w132, w132
	mul	v0
	add	%rax, w3
	mov	24(up,j,8), %rax
	adc	%rdx, w2
	mov	w0, 24(tp,j,8)		C first time through this is rp[1]
	mul	v0
	add	$4, j
	js	mul_1_m3_top		C loop while j is still negative

C Wind down: fold in the last product and store the top three limbs.
	add	%rax, w2
	mov	w3, (tp)
	adc	%rdx, w1
	mov	w2, 8(tp)
	mov	w1, 16(tp)

	lea	eval(2*8)(tp), tp	C tp += 2
	lea	-8(up), up		C up -= 1 limb
	jmp	dowhile


C Entry for n > 4 with n mod 4 = 1.
C Handles the two multipliers v0 = u0 and v1 = u1 in one pass.  u0*u1 is
C formed up front (low limb stored to rp[1]); the loop, entered at lm0, then
C accumulates u0*up[k] and u1*up[k] for the limbs from up[2] onwards and
C stores result limbs from rp[2] up to rp[n+1].  Control then continues at
C dowhile_end with i = n - 3.
l1m4:
	lea	8(rp,n,8), tp		C point tp in middle of result operand
	mov	(up), v0		C u0
	mov	8(up), %rax		C u1
	lea	8(up,n,8), up		C point up at end of input operand

	lea	-3(n), i		C i = n - 3
C Function mpn_mul_2s_m0(tp, up - i, i, up - i - 1)
	lea	-3(n), j
	neg	j			C j = -(n - 3)

	mov	%rax, v1		C u1
	mul	v0			C u0 * u1
	mov	%rdx, w1
	xor	w232, w232
	mov	%rax, 8(rp)		C rp[1] = low limb of u0 * u1
	jmp	lm0

	ALIGN(16)
mul_2_m0_top:
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	-24(up,j,8), %rax
	mov	$0, w232
	mul	v0
	add	%rax, w0
	mov	-24(up,j,8), %rax
	adc	%rdx, w1
	adc	$0, w232
	mul	v1			C v1 * u0
	add	%rax, w1
	mov	w0, -24(tp,j,8)
	adc	%rdx, w2
lm0:	mov	-16(up,j,8), %rax	C u2, u6 ...
	mul	v0			C u0 * u2
	mov	$0, w332
	add	%rax, w1
	adc	%rdx, w2
	mov	-16(up,j,8), %rax
	adc	$0, w332
	mov	$0, w032
	mov	w1, -16(tp,j,8)
	mul	v1
	add	%rax, w2
	mov	-8(up,j,8), %rax
	adc	%rdx, w3
	mov	$0, w132
	mul	v0
	add	%rax, w2
	mov	-8(up,j,8), %rax
	adc	%rdx, w3
	adc	$0, w032
	mul	v1
	add	%rax, w3
	mov	w2, -8(tp,j,8)
	adc	%rdx, w0
C lm2x is not referenced anywhere in the code visible here.
lm2x:	mov	(up,j,8), %rax
	mul	v0
	add	%rax, w3
	adc	%rdx, w0
	adc	$0, w132
	add	$4, j
	mov	-32(up,j,8), %rax	C same limb again (j has advanced by 4)
	mov	w3, -32(tp,j,8)
	js	mul_2_m0_top		C mov does not alter the sign flag of the add

C Wind down: last limb times v1, then store the top two limbs.
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	w0, -8(tp)
	mov	w1, (tp)

	lea	-16(up), up		C up -= 2 limbs
	lea	eval(3*8-24)(tp), tp	C displacement evaluates to 0: tp unchanged
	jmp	dowhile_end


C Entry for n > 4 with n mod 4 = 2.
C Same single-multiplier row as l0m4 (up[1..n-1] times u0, stored at
C rp[1..n]), but the 4-way loop is entered at lL1, a different phase, because
C of the different n mod 4.  Control then continues at dowhile_mid with
C i = n - 4.
l2m4:
	lea	-16(rp,n,8), tp		C point tp in middle of result operand
	mov	(up), v0		C v0 = u0
	mov	8(up), %rax		C rax = u1
	lea	(up,n,8), up		C point up at end of input operand

	lea	-4(n), i		C i = n - 4
C Function mpn_mul_1_m1(tp, up - (i - 1), i - 1, up[-i])
	lea	-2(n), j
	neg	j			C j = -(n - 2)

	mul	v0			C u0 * u1
	mov	%rax, w2
	mov	(up,j,8), %rax		C u2
	mov	%rdx, w1
	jmp	lL1

	ALIGN(16)
mul_1_m1_top:
	add	%rax, w2
	mov	w3, (tp,j,8)
	mov	(up,j,8), %rax
	adc	%rdx, w1
lL1:	xor	w032, w032
	mul	v0
	xor	w332, w332
	mov	w2, 8(tp,j,8)		C first time through this is rp[1]
	add	%rax, w1
	adc	%rdx, w0
	mov	8(up,j,8), %rax
	mov	w1, 16(tp,j,8)
	xor	w232, w232
	mul	v0
	add	%rax, w0
	mov	16(up,j,8), %rax
	adc	%rdx, w3
	xor	w132, w132
	mul	v0
	add	%rax, w3
	mov	24(up,j,8), %rax
	adc	%rdx, w2
	mov	w0, 24(tp,j,8)
	mul	v0
	add	$4, j
	js	mul_1_m1_top		C loop while j is still negative

C Wind down: fold in the last product and store the top three limbs.
	add	%rax, w2
	mov	w3, (tp)
	adc	%rdx, w1
	mov	w2, 8(tp)
	mov	w1, 16(tp)

	lea	eval(2*8)(tp), tp	C tp += 2
	lea	-8(up), up		C up -= 1 limb
	jmp	dowhile_mid


C Entry for n > 4 with n mod 4 = 3.
C Same two-multiplier pass as l1m4 (v0 = u0, v1 = u1, low limb of u0*u1
C stored to rp[1], result limbs stored from rp[2] up to rp[n+1]), but the
C loop is entered at lm2, a different phase.  Control then continues at
C dowhile_mid with i = n - 5; tp is not adjusted on this path.
l3m4:
	lea	8(rp,n,8), tp		C point tp in middle of result operand
	mov	(up), v0		C u0
	mov	8(up), %rax		C u1
	lea	8(up,n,8), up		C point up at end of input operand

	lea	-5(n), i		C i = n - 5
C Function mpn_mul_2s_m2(tp, up - i + 1, i - 1, up - i)
	lea	-1(n), j
	neg	j			C j = -(n - 1)

	mov	%rax, v1		C u1
	mul	v0			C u0 * u1
	mov	%rdx, w3
	xor	w032, w032
	xor	w132, w132
	mov	%rax, 8(rp)		C rp[1] = low limb of u0 * u1
	jmp	lm2

	ALIGN(16)
mul_2_m2_top:
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	-24(up,j,8), %rax
	mov	$0, w232
	mul	v0
	add	%rax, w0
	mov	-24(up,j,8), %rax
	adc	%rdx, w1
	adc	$0, w232
	mul	v1			C v1 * u0
	add	%rax, w1
	mov	w0, -24(tp,j,8)
	adc	%rdx, w2
	mov	-16(up,j,8), %rax
	mul	v0
	mov	$0, w332
	add	%rax, w1
	adc	%rdx, w2
	mov	-16(up,j,8), %rax
	adc	$0, w332
	mov	$0, w032
	mov	w1, -16(tp,j,8)
	mul	v1
	add	%rax, w2
	mov	-8(up,j,8), %rax
	adc	%rdx, w3
	mov	$0, w132
	mul	v0
	add	%rax, w2
	mov	-8(up,j,8), %rax
	adc	%rdx, w3
	adc	$0, w032
	mul	v1
	add	%rax, w3
	mov	w2, -8(tp,j,8)
	adc	%rdx, w0
lm2:	mov	(up,j,8), %rax		C first time through this is u2
	mul	v0
	add	%rax, w3
	adc	%rdx, w0
	adc	$0, w132
	add	$4, j
	mov	-32(up,j,8), %rax	C same limb again (j has advanced by 4)
	mov	w3, -32(tp,j,8)
	js	mul_2_m2_top		C mov does not alter the sign flag of the add

C Wind down: last limb times v1, then store the top two limbs.
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	w0, -8(tp)
	mov	w1, (tp)

	lea	-16(up), up		C up -= 2 limbs
	jmp	dowhile_mid

C Outer loop, first half.  Takes two consecutive operand limbs as v0 and v1,
C adds v0*v1 once into the result, then for each following limb adds its
C products with v0 and with v1 into the result limbs already in memory
C (add to memory, where the first-row code above used plain stores).
C The inner loop is entered at lam2.  Afterwards tp advances by two limbs,
C i drops by 2, and execution falls through into dowhile_mid.
dowhile:
C Function mpn_addmul_2s_m2(tp, up - (i - 1), i - 1, up - i)
	lea	4(i), j
	neg	j			C j = -(i + 4)

	mov	16(up,j,8), v0
	mov	24(up,j,8), v1		C the limb right after v0
	mov	24(up,j,8), %rax
	mul	v0			C v0 * v1
	xor	w332, w332
	add	%rax, 24(tp,j,8)
	adc	%rdx, w3
	xor	w032, w032
	xor	w132, w132
	jmp	lam2

	ALIGN(16)
addmul_2_m2_top:
	add	w3, (tp,j,8)
	adc	%rax, w0
	mov	8(up,j,8), %rax
	adc	%rdx, w1
	mov	$0, w232
	mul	v0
	add	%rax, w0
	mov	8(up,j,8), %rax
	adc	%rdx, w1
	adc	$0, w232
	mul	v1				C v1 * u0
	add	w0, 8(tp,j,8)
	adc	%rax, w1
	adc	%rdx, w2
	mov	16(up,j,8), %rax
	mov	$0, w332
	mul	v0				C v0 * u1
	add	%rax, w1
	mov	16(up,j,8), %rax
	adc	%rdx, w2
	adc	$0, w332
	mul	v1				C v1 * u1
	add	w1, 16(tp,j,8)
	adc	%rax, w2
	mov	24(up,j,8), %rax
	adc	%rdx, w3
	mul	v0
	mov	$0, w032
	add	%rax, w2
	adc	%rdx, w3
	mov	$0, w132
	mov	24(up,j,8), %rax
	adc	$0, w032
	mul	v1
	add	w2, 24(tp,j,8)
	adc	%rax, w3
	adc	%rdx, w0
lam2:	mov	32(up,j,8), %rax
	mul	v0
	add	%rax, w3
	mov	32(up,j,8), %rax
	adc	%rdx, w0
	adc	$0, w132
	mul	v1
	add	$4, j
	js	addmul_2_m2_top		C loop while j is still negative

C Wind down: one more add into memory, then two new top limbs are stored.
	add	w3, (tp)
	adc	%rax, w0
	adc	%rdx, w1
	mov	w0, 8(tp)
	mov	w1, 16(tp)

	lea	eval(2*8)(tp), tp	C tp += 2

	add	$-2, i32		C i -= 2

C Outer loop, second half.  Same operation as the code at dowhile (two
C consecutive limbs v0 and v1, v0*v1 added once, then both multiplied by each
C following limb and added into memory), but with j = -(i + 2) and the inner
C loop entered at l20, a different phase of the 4-way unrolling.
C Afterwards tp advances by two limbs and execution falls into dowhile_end.
dowhile_mid:
C Function mpn_addmul_2s_m0(tp, up - (i - 1), i - 1, up - i)
	lea	2(i), j
	neg	j			C j = -(i + 2)

	mov	(up,j,8), v0
	mov	8(up,j,8), v1		C the limb right after v0
	mov	8(up,j,8), %rax
	mul	v0			C v0 * v1
	xor	w132, w132
	add	%rax, 8(tp,j,8)
	adc	%rdx, w1
	xor	w232, w232
	jmp	l20

	ALIGN(16)
addmul_2_m0_top:
	add	w3, (tp,j,8)
	adc	%rax, w0
	mov	8(up,j,8), %rax
	adc	%rdx, w1
	mov	$0, w232
	mul	v0
	add	%rax, w0
	mov	8(up,j,8), %rax
	adc	%rdx, w1
	adc	$0, w232
	mul	v1				C v1 * u0
	add	w0, 8(tp,j,8)
	adc	%rax, w1
	adc	%rdx, w2
l20:	mov	16(up,j,8), %rax
	mov	$0, w332
	mul	v0				C v0 * u1
	add	%rax, w1
	mov	16(up,j,8), %rax
	adc	%rdx, w2
	adc	$0, w332
	mul	v1				C v1 * u1
	add	w1, 16(tp,j,8)
	adc	%rax, w2
	mov	24(up,j,8), %rax
	adc	%rdx, w3
	mul	v0
	mov	$0, w032
	add	%rax, w2
	adc	%rdx, w3
	mov	$0, w132
	mov	24(up,j,8), %rax
	adc	$0, w032
	mul	v1
	add	w2, 24(tp,j,8)
	adc	%rax, w3
	adc	%rdx, w0
	mov	32(up,j,8), %rax
	mul	v0
	add	%rax, w3
	mov	32(up,j,8), %rax
	adc	%rdx, w0
	adc	$0, w132
	mul	v1
	add	$4, j
	js	addmul_2_m0_top		C loop while j is still negative

C Wind down: one more add into memory, then two new top limbs are stored.
	add	w3, (tp)
	adc	%rax, w0
	adc	%rdx, w1
	mov	w0, 8(tp)
	mov	w1, 16(tp)

	lea	eval(2*8)(tp), tp	C tp += 2
C Bottom of the outer loop: i drops by 2 and the loop repeats from dowhile
C until i reaches zero.  What follows is the last, loop-free step: with
C v0 = up[-2] and v1 = up[-1], it adds v0*v1 at tp[-1], then v0*up[0] and
C v1*up[0] above it, and stores the two top limbs at tp[1] and tp[2].
dowhile_end:

	add	$-2, i32		C i -= 2
	jne	dowhile			C zero flag comes from the add above

C Function mpn_addmul_2s_2
	mov	-16(up), v0
	mov	-8(up), v1
	mov	-8(up), %rax
	mul	v0			C v0 * v1
	xor	w332, w332
	add	%rax, -8(tp)
	adc	%rdx, w3
	xor	w032, w032
	xor	w132, w132
	mov	(up), %rax
	mul	v0			C v0 * up[0]
	add	%rax, w3
	mov	(up), %rax
	adc	%rdx, w0
	mul	v1			C v1 * up[0]
	add	w3, (tp)
	adc	%rax, w0
	adc	%rdx, w1
	mov	w0, 8(tp)
	mov	w1, 16(tp)

C Function mpn_sqr_diag_addlsh1
C Final pass over the result.  For each operand limb, its square is formed
C and two result limbs are produced: the limbs already in rp (from rp[1] on)
C are doubled with adc reg,reg and the square is added on top.  rp[0] is the
C plain low limb of the first square.
C j indexes result limbs, so the operand is addressed with scale 4: (up,j,4)
C moves one operand limb for every two steps of j.
C Two carries are in flight at the same time.  One is kept in CF; the other
C is parked in ebx or ebp as 0 or -1 by sbb reg,reg and turned back into CF
C by add reg,reg.
C The loop body handles two operand limbs per pass and is entered at ld0 or
C ld1 depending on the parity of n.  The last two operand limbs are handled
C after the loop, then the saved registers are restored.
	lea	-4(n,n), j		C j = 2n - 4

	mov	8(rp), %r11		C r11 = rp[1]
	lea	-8(up), up		C up -= 1 limb
	lea	(rp,j,8), rp		C rp += 2n - 4 limbs
	neg	j			C j = -(2n - 4)
	mov	(up,j,4), %rax
	mul	%rax			C square of the first operand limb
	test	$2, j8			C bit 1 of j is set exactly when n is odd
	jnz	odd

evn:	add	%r11, %r11		C double rp[1]
	sbb	%ebx, %ebx		C save CF
	add	%rdx, %r11		C add high limb of the square; CF goes to ld0
	mov	%rax, (rp,j,8)		C low limb of the square is result limb 0
	jmp	ld0

odd:	add	%r11, %r11		C double rp[1]
	sbb	%ebp, %ebp		C save CF
	add	%rdx, %r11		C add high limb of the square; CF goes to ld1
	mov	%rax, (rp,j,8)		C low limb of the square is result limb 0
	lea	-2(j), j		C bias j for the ld1 entry; lea keeps CF
	jmp	ld1

	ALIGN(16)
top:	mov	(up,j,4), %rax
	mul	%rax
	add	%ebp, %ebp		C restore carry
	adc	%rax, %r10
	adc	%rdx, %r11
	mov	%r10, (rp,j,8)
ld0:	mov	%r11, 8(rp,j,8)
	mov	16(rp,j,8), %r10
	adc	%r10, %r10		C double, taking in the carry still in CF
	mov	24(rp,j,8), %r11
	adc	%r11, %r11
	nop				C no effect on state; presumably padding
	sbb	%ebp, %ebp		C save CF
	mov	8(up,j,4), %rax
	mul	%rax
	add	%ebx, %ebx		C restore carry
	adc	%rax, %r10
	adc	%rdx, %r11
	mov	%r10, 16(rp,j,8)
ld1:	mov	%r11, 24(rp,j,8)
	mov	32(rp,j,8), %r10
	adc	%r10, %r10		C double, taking in the carry still in CF
	mov	40(rp,j,8), %r11
	adc	%r11, %r11
	sbb	%ebx, %ebx		C save CF
	add	$4, j
	js	top			C loop while j is still negative

C Tail: the last two operand limbs, at (up) and 8(up).  The top result limb
C at 24(rp) is written from rdx alone; it is not read first.
	mov	(up), %rax
	mul	%rax
	add	%ebp, %ebp		C restore carry
	adc	%rax, %r10
	adc	%rdx, %r11
	mov	%r10, (rp)
	mov	%r11, 8(rp)
	mov	16(rp), %r10
	adc	%r10, %r10
	sbb	%ebp, %ebp		C save CF
	neg	%ebp			C rbp = 0 or 1, added into the top limb below
	mov	8(up), %rax
	mul	%rax
	add	%ebx, %ebx		C restore carry
	adc	%rax, %r10
	adc	%rbp, %rdx
	mov	%r10, 16(rp)
	mov	%rdx, 24(rp)

C Restore the registers saved at function entry, lowest slot first.
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	ret
EPILOGUE()
Use GMP version of sqr_basecase on AMD K8/K10/etc and Intel Core2/Penryn/etc. 2014-02-17 11:43:45 -05:00			`dnl AMD64 mpn_sqr_basecase.`

			`dnl Contributed to the GNU project by Torbjorn Granlund.`

			`dnl Copyright 2008, 2009, 2011, 2012 Free Software Foundation, Inc.`

			`dnl This file is part of the GNU MP Library.`

			`dnl The GNU MP Library is free software; you can redistribute it and/or modify`
			`dnl it under the terms of the GNU Lesser General Public License as published`
			`dnl by the Free Software Foundation; either version 3 of the License, or (at`
			`dnl your option) any later version.`

			`dnl The GNU MP Library is distributed in the hope that it will be useful, but`
			`dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY`
			`dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public`
			`dnl License for more details.`

			`dnl You should have received a copy of the GNU Lesser General Public License`
			`dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.`

			include(`../config.m4')

			`C The inner loops of this code are the result of running a code generation and`
			`C optimization tool suite written by David Harvey and Torbjorn Granlund.`

			`C NOTES`
			`C * There is a major stupidity in that we call mpn_mul_1 initially, for a`
			`C large trip count. Instead, we should follow the generic/sqr_basecase.c`
			`C code which uses addmul_2s from the start, conditionally leaving a 1x1`
			`C multiply to the end. (In assembly code, one would stop invoking`
			`C addmul_2s loops when perhaps 3x2s respectively a 2x2s remains.)`
			`C * Another stupidity is in the sqr_diag_addlsh1 code. It does not need to`
			`C save/restore carry, instead it can propagate into the high product word.`
			`C * Align more labels, should shave off a few cycles.`
			`C * We can safely use 32-bit size operations, since operands with (2^32)`
			`C limbs will lead to non-termination in practice.`
			`C * The jump table could probably be optimized, at least for non-pic.`
			`C * The special code for n <= 4 was quickly written. It is probably too`
			`C large and unnecessarily slow.`
			`C * Consider combining small cases code so that the n=k-1 code jumps into the`
			`C middle of the n=k code.`
			`C * Avoid saving registers for small cases code.`
			`C * Needed variables:`
			`C n r11 input size`
			`C i r8 work left, initially n`
			`C j r9 inner loop count`
			`C r15 unused`
			`C v0 r13`
			`C v1 r14`
			`C rp rdi`
			`C up rsi`
			`C w0 rbx`
			`C w1 rcx`
			`C w2 rbp`
			`C w3 r10`
			`C tp r12`
			`C lo rax`
			`C hi rdx`
			`C rsp`

			`C INPUT PARAMETERS`
			define(`rp', `%rdi')
			define(`up', `%rsi')
			define(`n_param', `%rdx')
			define(`n_param32', `%edx')

			define(`n', `%r11')
			define(`n32', `%r11d')
			define(`tp', `%r12')
			define(`i', `%r8')
			define(`i32', `%r8d')
			define(`j', `%r9')
			define(`j8', `%r9b')
			define(`j32', `%r9d')
			define(`v0', `%r13')
			define(`v1', `%r14')
			define(`w0', `%rbx')
			define(`w032', `%ebx')
			define(`w1', `%rcx')
			define(`w132', `%ecx')
			define(`w2', `%rbp')
			define(`w232', `%ebp')
			define(`w3', `%r10')
			define(`w332', `%r10d')

			`ASM_START()`
			`TEXT`
			`ALIGN(16)`
			`PROLOGUE(mpn_sqr_basecase)`
			`mov n_param32, %ecx`
			`mov n_param32, n32 C free original n register (rdx)`

			`add $-40, %rsp`

			`and $3, %ecx`
			`cmp $4, n_param32`
			`lea 4(%rcx), %r8`

			`mov %rbx, 32(%rsp)`
			`mov %rbp, 24(%rsp)`
			`mov %r12, 16(%rsp)`
			`mov %r13, 8(%rsp)`
			`mov %r14, (%rsp)`

			`cmovg %r8, %rcx`

			`lea tab(%rip), %rax`
			ifdef(`PIC',
			` movslq (%rax,%rcx,4), %r10
			`add %r10, %rax`
			`jmp *%rax`
			',`
			`jmp *(%rax,%rcx,8)`
			`')`
			`JUMPTABSECT`
			`ALIGN(8)`
			`tab: JMPENT( l4, tab)`
			`JMPENT( l1, tab)`
			`JMPENT( l2, tab)`
			`JMPENT( l3, tab)`
			`JMPENT( l0m4, tab)`
			`JMPENT( l1m4, tab)`
			`JMPENT( l2m4, tab)`
			`JMPENT( l3m4, tab)`
			`TEXT`

			`l1: mov (up), %rax`
			`mul %rax`
			`add $40, %rsp`
			`mov %rax, (rp)`
			`mov %rdx, 8(rp)`
			`ret`

			`l2: mov (up), %rax`
			`mov %rax, %r8`
			`mul %rax`
			`mov 8(up), %r11`
			`mov %rax, (rp)`
			`mov %r11, %rax`
			`mov %rdx, %r9`
			`mul %rax`
			`add $40, %rsp`
			`mov %rax, %r10`
			`mov %r11, %rax`
			`mov %rdx, %r11`
			`mul %r8`
			`xor %r8, %r8`
			`add %rax, %r9`
			`adc %rdx, %r10`
			`adc %r8, %r11`
			`add %rax, %r9`
			`mov %r9, 8(rp)`
			`adc %rdx, %r10`
			`mov %r10, 16(rp)`
			`adc %r8, %r11`
			`mov %r11, 24(rp)`
			`ret`

			`l3: mov (up), %rax`
			`mov %rax, %r10`
			`mul %rax`
			`mov 8(up), %r11`
			`mov %rax, (rp)`
			`mov %r11, %rax`
			`mov %rdx, 8(rp)`
			`mul %rax`
			`mov 16(up), %rcx`
			`mov %rax, 16(rp)`
			`mov %rcx, %rax`
			`mov %rdx, 24(rp)`
			`mul %rax`
			`mov %rax, 32(rp)`
			`mov %rdx, 40(rp)`

			`mov %r11, %rax`
			`mul %r10`
			`mov %rax, %r8`
			`mov %rcx, %rax`
			`mov %rdx, %r9`
			`mul %r10`
			`xor %r10, %r10`
			`add %rax, %r9`
			`mov %r11, %rax`
			`mov %r10, %r11`
			`adc %rdx, %r10`

			`mul %rcx`
			`add $40, %rsp`
			`add %rax, %r10`
			`adc %r11, %rdx`
			`add %r8, %r8`
			`adc %r9, %r9`
			`adc %r10, %r10`
			`adc %rdx, %rdx`
			`adc %r11, %r11`
			`add %r8, 8(rp)`
			`adc %r9, 16(rp)`
			`adc %r10, 24(rp)`
			`adc %rdx, 32(rp)`
			`adc %r11, 40(rp)`
			`ret`

			`l4: mov (up), %rax`
			`mov %rax, %r11`
			`mul %rax`
			`mov 8(up), %rbx`
			`mov %rax, (rp)`
			`mov %rbx, %rax`
			`mov %rdx, 8(rp)`
			`mul %rax`
			`mov %rax, 16(rp)`
			`mov %rdx, 24(rp)`
			`mov 16(up), %rax`
			`mul %rax`
			`mov %rax, 32(rp)`
			`mov %rdx, 40(rp)`
			`mov 24(up), %rax`
			`mul %rax`
			`mov %rax, 48(rp)`
			`mov %rbx, %rax`
			`mov %rdx, 56(rp)`

			`mul %r11`
			`add $32, %rsp`
			`mov %rax, %r8`
			`mov %rdx, %r9`
			`mov 16(up), %rax`
			`mul %r11`
			`xor %r10, %r10`
			`add %rax, %r9`
			`adc %rdx, %r10`
			`mov 24(up), %rax`
			`mul %r11`
			`xor %r11, %r11`
			`add %rax, %r10`
			`adc %rdx, %r11`
			`mov 16(up), %rax`
			`mul %rbx`
			`xor %rcx, %rcx`
			`add %rax, %r10`
			`adc %rdx, %r11`
			`adc $0, %rcx`
			`mov 24(up), %rax`
			`mul %rbx`
			`pop %rbx`
			`add %rax, %r11`
			`adc %rdx, %rcx`
			`mov 16(up), %rdx`
			`mov 24(up), %rax`
			`mul %rdx`
			`add %rax, %rcx`
			`adc $0, %rdx`

			`add %r8, %r8`
			`adc %r9, %r9`
			`adc %r10, %r10`
			`adc %r11, %r11`
			`adc %rcx, %rcx`
			`mov $0, %eax`
			`adc %rdx, %rdx`

			`adc %rax, %rax`
			`add %r8, 8(rp)`
			`adc %r9, 16(rp)`
			`adc %r10, 24(rp)`
			`adc %r11, 32(rp)`
			`adc %rcx, 40(rp)`
			`adc %rdx, 48(rp)`
			`adc %rax, 56(rp)`
			`ret`


			`l0m4:`
			`lea -16(rp,n,8), tp C point tp in middle of result operand`
			`mov (up), v0`
			`mov 8(up), %rax`
			`lea (up,n,8), up C point up at end of input operand`

			`lea -4(n), i`
			`C Function mpn_mul_1_m3(tp, up - i, i, up[-i - 1])`
			`xor j32, j32`
			`sub n, j`

			`mul v0`
			`xor w232, w232`
			`mov %rax, w0`
			`mov 16(up,j,8), %rax`
			`mov %rdx, w3`
			`jmp lL3`

			`ALIGN(16)`
			`mul_1_m3_top:`
			`add %rax, w2`
			`mov w3, (tp,j,8)`
			`mov (up,j,8), %rax`
			`adc %rdx, w1`
			`xor w032, w032`
			`mul v0`
			`xor w332, w332`
			`mov w2, 8(tp,j,8)`
			`add %rax, w1`
			`adc %rdx, w0`
			`mov 8(up,j,8), %rax`
			`mov w1, 16(tp,j,8)`
			`xor w232, w232`
			`mul v0`
			`add %rax, w0`
			`mov 16(up,j,8), %rax`
			`adc %rdx, w3`
			`lL3: xor w132, w132`
			`mul v0`
			`add %rax, w3`
			`mov 24(up,j,8), %rax`
			`adc %rdx, w2`
			`mov w0, 24(tp,j,8)`
			`mul v0`
			`add $4, j`
			`js mul_1_m3_top`

			`add %rax, w2`
			`mov w3, (tp)`
			`adc %rdx, w1`
			`mov w2, 8(tp)`
			`mov w1, 16(tp)`

			`lea eval(2*8)(tp), tp C tp += 2`
			`lea -8(up), up`
			`jmp dowhile`


			`l1m4:`
			`lea 8(rp,n,8), tp C point tp in middle of result operand`
			`mov (up), v0 C u0`
			`mov 8(up), %rax C u1`
			`lea 8(up,n,8), up C point up at end of input operand`

			`lea -3(n), i`
			`C Function mpn_mul_2s_m0(tp, up - i, i, up - i - 1)`
			`lea -3(n), j`
			`neg j`

			`mov %rax, v1 C u1`
			`mul v0 C u0 * u1`
			`mov %rdx, w1`
			`xor w232, w232`
			`mov %rax, 8(rp)`
			`jmp lm0`

			`ALIGN(16)`
			`mul_2_m0_top:`
			`mul v1`
			`add %rax, w0`
			`adc %rdx, w1`
			`mov -24(up,j,8), %rax`
			`mov $0, w232`
			`mul v0`
			`add %rax, w0`
			`mov -24(up,j,8), %rax`
			`adc %rdx, w1`
			`adc $0, w232`
			`mul v1 C v1 * u0`
			`add %rax, w1`
			`mov w0, -24(tp,j,8)`
			`adc %rdx, w2`
			`lm0: mov -16(up,j,8), %rax C u2, u6 ...`
			`mul v0 C u0 * u2`
			`mov $0, w332`
			`add %rax, w1`
			`adc %rdx, w2`
			`mov -16(up,j,8), %rax`
			`adc $0, w332`
			`mov $0, w032`
			`mov w1, -16(tp,j,8)`
			`mul v1`
			`add %rax, w2`
			`mov -8(up,j,8), %rax`
			`adc %rdx, w3`
			`mov $0, w132`
			`mul v0`
			`add %rax, w2`
			`mov -8(up,j,8), %rax`
			`adc %rdx, w3`
			`adc $0, w032`
			`mul v1`
			`add %rax, w3`
			`mov w2, -8(tp,j,8)`
			`adc %rdx, w0`
			`lm2x: mov (up,j,8), %rax`
			`mul v0`
			`add %rax, w3`
			`adc %rdx, w0`
			`adc $0, w132`
			`add $4, j`
			`mov -32(up,j,8), %rax`
			`mov w3, -32(tp,j,8)`
			`js mul_2_m0_top`

			`mul v1`
			`add %rax, w0`
			`adc %rdx, w1`
			`mov w0, -8(tp)`
			`mov w1, (tp)`

			`lea -16(up), up`
			`lea eval(3*8-24)(tp), tp C tp += 3`
			`jmp dowhile_end`


			`l2m4:`
			`lea -16(rp,n,8), tp C point tp in middle of result operand`
			`mov (up), v0`
			`mov 8(up), %rax`
			`lea (up,n,8), up C point up at end of input operand`

			`lea -4(n), i`
			`C Function mpn_mul_1_m1(tp, up - (i - 1), i - 1, up[-i])`
			`lea -2(n), j`
			`neg j`

			`mul v0`
			`mov %rax, w2`
			`mov (up,j,8), %rax`
			`mov %rdx, w1`
			`jmp lL1`

			`ALIGN(16)`
			`mul_1_m1_top:`
			`add %rax, w2`
			`mov w3, (tp,j,8)`
			`mov (up,j,8), %rax`
			`adc %rdx, w1`
			`lL1: xor w032, w032`
			`mul v0`
			`xor w332, w332`
			`mov w2, 8(tp,j,8)`
			`add %rax, w1`
			`adc %rdx, w0`
			`mov 8(up,j,8), %rax`
			`mov w1, 16(tp,j,8)`
			`xor w232, w232`
			`mul v0`
			`add %rax, w0`
			`mov 16(up,j,8), %rax`
			`adc %rdx, w3`
			`xor w132, w132`
			`mul v0`
			`add %rax, w3`
			`mov 24(up,j,8), %rax`
			`adc %rdx, w2`
			`mov w0, 24(tp,j,8)`
			`mul v0`
			`add $4, j`
			`js mul_1_m1_top`

			`add %rax, w2`
			`mov w3, (tp)`
			`adc %rdx, w1`
			`mov w2, 8(tp)`
			`mov w1, 16(tp)`

			`lea eval(2*8)(tp), tp C tp += 2`
			`lea -8(up), up`
			`jmp dowhile_mid`


			`l3m4:`
			`lea 8(rp,n,8), tp C point tp in middle of result operand`
			`mov (up), v0 C u0`
			`mov 8(up), %rax C u1`
			`lea 8(up,n,8), up C point up at end of input operand`

			`lea -5(n), i`
			`C Function mpn_mul_2s_m2(tp, up - i + 1, i - 1, up - i)`
			`lea -1(n), j`
			`neg j`

			`mov %rax, v1 C u1`
			`mul v0 C u0 * u1`
			`mov %rdx, w3`
			`xor w032, w032`
			`xor w132, w132`
			`mov %rax, 8(rp)`
			`jmp lm2`

			`ALIGN(16)`
			`mul_2_m2_top:`
			`mul v1`
			`add %rax, w0`
			`adc %rdx, w1`
			`mov -24(up,j,8), %rax`
			`mov $0, w232`
			`mul v0`
			`add %rax, w0`
			`mov -24(up,j,8), %rax`
			`adc %rdx, w1`
			`adc $0, w232`
			`mul v1 C v1 * u0`
			`add %rax, w1`
			`mov w0, -24(tp,j,8)`
			`adc %rdx, w2`
			`mov -16(up,j,8), %rax`
			`mul v0`
			`mov $0, w332`
			`add %rax, w1`
			`adc %rdx, w2`
			`mov -16(up,j,8), %rax`
			`adc $0, w332`
			`mov $0, w032`
			`mov w1, -16(tp,j,8)`
			`mul v1`
			`add %rax, w2`
			`mov -8(up,j,8), %rax`
			`adc %rdx, w3`
			`mov $0, w132`
			`mul v0`
			`add %rax, w2`
			`mov -8(up,j,8), %rax`
			`adc %rdx, w3`
			`adc $0, w032`
			`mul v1`
			`add %rax, w3`
			`mov w2, -8(tp,j,8)`
			`adc %rdx, w0`
			`lm2: mov (up,j,8), %rax`
			`mul v0`
			`add %rax, w3`
			`adc %rdx, w0`
			`adc $0, w132`
			`add $4, j`
			`mov -32(up,j,8), %rax`
			`mov w3, -32(tp,j,8)`
			`js mul_2_m2_top`

			`mul v1`
			`add %rax, w0`
			`adc %rdx, w1`
			`mov w0, -8(tp)`
			`mov w1, (tp)`

			`lea -16(up), up`
			`jmp dowhile_mid`

			`dowhile:`
			`C Function mpn_addmul_2s_m2(tp, up - (i - 1), i - 1, up - i)`
			`lea 4(i), j`
			`neg j`

			`mov 16(up,j,8), v0`
			`mov 24(up,j,8), v1`
			`mov 24(up,j,8), %rax`
			`mul v0`
			`xor w332, w332`
			`add %rax, 24(tp,j,8)`
			`adc %rdx, w3`
			`xor w032, w032`
			`xor w132, w132`
			`jmp lam2`

			`ALIGN(16)`
			`addmul_2_m2_top:`
			`add w3, (tp,j,8)`
			`adc %rax, w0`
			`mov 8(up,j,8), %rax`
			`adc %rdx, w1`
			`mov $0, w232`
			`mul v0`
			`add %rax, w0`
			`mov 8(up,j,8), %rax`
			`adc %rdx, w1`
			`adc $0, w232`
			`mul v1 C v1 * u0`
			`add w0, 8(tp,j,8)`
			`adc %rax, w1`
			`adc %rdx, w2`
			`mov 16(up,j,8), %rax`
			`mov $0, w332`
			`mul v0 C v0 * u1`
			`add %rax, w1`
			`mov 16(up,j,8), %rax`
			`adc %rdx, w2`
			`adc $0, w332`
			`mul v1 C v1 * u1`
			`add w1, 16(tp,j,8)`
			`adc %rax, w2`
			`mov 24(up,j,8), %rax`
			`adc %rdx, w3`
			`mul v0`
			`mov $0, w032`
			`add %rax, w2`
			`adc %rdx, w3`
			`mov $0, w132`
			`mov 24(up,j,8), %rax`
			`adc $0, w032`
			`mul v1`
			`add w2, 24(tp,j,8)`
			`adc %rax, w3`
			`adc %rdx, w0`
			`lam2: mov 32(up,j,8), %rax`
			`mul v0`
			`add %rax, w3`
			`mov 32(up,j,8), %rax`
			`adc %rdx, w0`
			`adc $0, w132`
			`mul v1`
			`add $4, j`
			`js addmul_2_m2_top`

			`add w3, (tp)`
			`adc %rax, w0`
			`adc %rdx, w1`
			`mov w0, 8(tp)`
			`mov w1, 16(tp)`

			`lea eval(2*8)(tp), tp C tp += 2`

			`add $-2, i32 C i -= 2`

			`dowhile_mid:`
			`C Function mpn_addmul_2s_m0(tp, up - (i - 1), i - 1, up - i)`
			`lea 2(i), j`
			`neg j`

			`mov (up,j,8), v0`
			`mov 8(up,j,8), v1`
			`mov 8(up,j,8), %rax`
			`mul v0`
			`xor w132, w132`
			`add %rax, 8(tp,j,8)`
			`adc %rdx, w1`
			`xor w232, w232`
			`jmp l20`

			`ALIGN(16)`
			`addmul_2_m0_top:`
			`add w3, (tp,j,8)`
			`adc %rax, w0`
			`mov 8(up,j,8), %rax`
			`adc %rdx, w1`
			`mov $0, w232`
			`mul v0`
			`add %rax, w0`
			`mov 8(up,j,8), %rax`
			`adc %rdx, w1`
			`adc $0, w232`
			`mul v1 C v1 * u0`
			`add w0, 8(tp,j,8)`
			`adc %rax, w1`
			`adc %rdx, w2`
			`l20: mov 16(up,j,8), %rax`
			`mov $0, w332`
			`mul v0 C v0 * u1`
			`add %rax, w1`
			`mov 16(up,j,8), %rax`
			`adc %rdx, w2`
			`adc $0, w332`
			`mul v1 C v1 * u1`
			`add w1, 16(tp,j,8)`
			`adc %rax, w2`
			`mov 24(up,j,8), %rax`
			`adc %rdx, w3`
			`mul v0`
			`mov $0, w032`
			`add %rax, w2`
			`adc %rdx, w3`
			`mov $0, w132`
			`mov 24(up,j,8), %rax`
			`adc $0, w032`
			`mul v1`
			`add w2, 24(tp,j,8)`
			`adc %rax, w3`
			`adc %rdx, w0`
			`mov 32(up,j,8), %rax`
			`mul v0`
			`add %rax, w3`
			`mov 32(up,j,8), %rax`
			`adc %rdx, w0`
			`adc $0, w132`
			`mul v1`
			`add $4, j`
			`js addmul_2_m0_top`

			`add w3, (tp)`
			`adc %rax, w0`
			`adc %rdx, w1`
			`mov w0, 8(tp)`
			`mov w1, 16(tp)`

			`lea eval(2*8)(tp), tp C tp += 2`
			`dowhile_end:`

			`add $-2, i32 C i -= 2`
			`jne dowhile`

			`C Function mpn_addmul_2s_2`
			`mov -16(up), v0`
			`mov -8(up), v1`
			`mov -8(up), %rax`
			`mul v0`
			`xor w332, w332`
			`add %rax, -8(tp)`
			`adc %rdx, w3`
			`xor w032, w032`
			`xor w132, w132`
			`mov (up), %rax`
			`mul v0`
			`add %rax, w3`
			`mov (up), %rax`
			`adc %rdx, w0`
			`mul v1`
			`add w3, (tp)`
			`adc %rax, w0`
			`adc %rdx, w1`
			`mov w0, 8(tp)`
			`mov w1, 16(tp)`

			`C Function mpn_sqr_diag_addlsh1`
			`lea -4(n,n), j`

			`mov 8(rp), %r11`
			`lea -8(up), up`
			`lea (rp,j,8), rp`
			`neg j`
			`mov (up,j,4), %rax`
			`mul %rax`
			`test $2, j8`
			`jnz odd`

			`evn: add %r11, %r11`
			`sbb %ebx, %ebx C save CF`
			`add %rdx, %r11`
			`mov %rax, (rp,j,8)`
			`jmp ld0`

			`odd: add %r11, %r11`
			`sbb %ebp, %ebp C save CF`
			`add %rdx, %r11`
			`mov %rax, (rp,j,8)`
			`lea -2(j), j`
			`jmp ld1`

			`ALIGN(16)`
			`top: mov (up,j,4), %rax`
			`mul %rax`
			`add %ebp, %ebp C restore carry`
			`adc %rax, %r10`
			`adc %rdx, %r11`
			`mov %r10, (rp,j,8)`
			`ld0: mov %r11, 8(rp,j,8)`
			`mov 16(rp,j,8), %r10`
			`adc %r10, %r10`
			`mov 24(rp,j,8), %r11`
			`adc %r11, %r11`
			`nop`
			`sbb %ebp, %ebp C save CF`
			`mov 8(up,j,4), %rax`
			`mul %rax`
			`add %ebx, %ebx C restore carry`
			`adc %rax, %r10`
			`adc %rdx, %r11`
			`mov %r10, 16(rp,j,8)`
			`ld1: mov %r11, 24(rp,j,8)`
			`mov 32(rp,j,8), %r10`
			`adc %r10, %r10`
			`mov 40(rp,j,8), %r11`
			`adc %r11, %r11`
			`sbb %ebx, %ebx C save CF`
			`add $4, j`
			`js top`

			`mov (up), %rax`
			`mul %rax`
			`add %ebp, %ebp C restore carry`
			`adc %rax, %r10`
			`adc %rdx, %r11`
			`mov %r10, (rp)`
			`mov %r11, 8(rp)`
			`mov 16(rp), %r10`
			`adc %r10, %r10`
			`sbb %ebp, %ebp C save CF`
			`neg %ebp`
			`mov 8(up), %rax`
			`mul %rax`
			`add %ebx, %ebx C restore carry`
			`adc %rax, %r10`
			`adc %rbp, %rdx`
			`mov %r10, 16(rp)`
			`mov %rdx, 24(rp)`

			`pop %r14`
			`pop %r13`
			`pop %r12`
			`pop %rbp`
			`pop %rbx`
			`ret`
			`EPILOGUE()`