; AMD K8 mpn_mul_basecase -- multiply two mpn numbers.
;
; This file is an adaptation of the similar file in the k7 directory.
; Adapted by P. Gaudry in April 2005.
; Here is the copyright of the original k7 version:
;
; Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
;
; This file is part of the GNU MP Library.
;
; The GNU MP Library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public License as
; published by the Free Software Foundation; either version 2.1 of the
; License, or (at your option) any later version.
;
; The GNU MP Library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with the GNU MP Library; see the file COPYING.LIB.  If
; not, write to the Free Software Foundation, Inc., 59 Temple Place -
; Suite 330, Boston, MA 02111-1307, USA.


; void mpn_mul_basecase (mp_ptr wp,
;                        mp_srcptr xp, mp_size_t xsize,
;                        mp_srcptr yp, mp_size_t ysize);
;
; Calculate xp,xsize multiplied by yp,ysize, storing the result in
; wp,xsize+ysize.
;
; This routine is essentially the same as mpn/generic/mul_basecase.c, but
; it's faster because it does most of the mpn_addmul_1() startup
; calculations only once.  The saving is 15-25% on typical sizes coming
; from the Karatsuba multiply code.
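
; For reference, a rough C sketch of the basecase algorithm implemented
; below (after mpn/generic/mul_basecase.c; a minimal sketch, not the
; exact GMP source):
;
;     void mul_basecase (mp_ptr wp, mp_srcptr xp, mp_size_t xsize,
;                        mp_srcptr yp, mp_size_t ysize)
;     {
;         /* first row: wp[0..xsize] = xp * y0, mpn_mul_1 style */
;         wp[xsize] = mpn_mul_1 (wp, xp, xsize, yp[0]);
;         /* remaining rows: accumulate xp * yp[i], shifted i limbs up */
;         for (mp_size_t i = 1; i < ysize; i++)
;             wp[xsize + i] = mpn_addmul_1 (wp + i, xp, xsize, yp[i]);
;     }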

%include '../yasm_mac.inc'

%define UNROLL_LOG2       4
%define UNROLL_COUNT      (1 << UNROLL_LOG2)
%define UNROLL_MASK       (UNROLL_COUNT - 1)
%define UNROLL_BYTES      (8 * UNROLL_COUNT)
%define UNROLL_THRESHOLD  5

; incoming arguments (x86-64 SysV calling convention)
%define ysize  r8
%define yp     rcx
%define xsize  rdx
%define xp     rsi
%define wp     rdi
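
; With UNROLL_LOG2 = 4 these work out to UNROLL_COUNT = 16 limbs per
; unrolled pass, UNROLL_MASK = 15 and UNROLL_BYTES = 128; the unrolled
; addmul path below is taken when xsize >= UNROLL_THRESHOLD = 5 limbs.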


bits 64
section .text

align 32

G_EXPORT __gmpn_mul_basecase

G_LABEL __gmpn_mul_basecase
    cmp     xsize, 2
    ja      .xsize_more_than_two
    je      .two_by_something

    ; one limb by one limb

    mov     rax, [yp]       ; rax <- y0
    mul     qword [xp]      ; [ax:dx] <- y0*x0

    mov     [wp], rax
    mov     [wp+8], rdx
    ret


;-----------------------------------------------------------------------------

.two_by_something:

    ; xsize = 2, hence rdx is free for use

    dec     ysize           ; ysize--

    mov     r9, [yp]        ; r9 <- y0
    mov     rax, [xp]       ; rax <- x0
    jnz     .two_by_two


    ; two limbs by one limb

    mul     r9              ; [ax:dx] <- x0*y0

    mov     [wp], rax       ; w0 <- low_prod
    mov     rax, [xp+8]     ; rax <- x1 (rsi is now free)
    mov     rsi, rdx        ; rsi <- carry

    mul     r9              ; [ax:dx] <- x1*y0

    add     rsi, rax        ; rsi <- ax+carry ( --> carry flag)
    mov     [wp+8], rsi     ; w1 <- rsi

    adc     rdx, 0          ; dx <- dx+carry
    mov     [wp+16], rdx    ; w2 <- dx

    ret


; -----------------------------------------------------------------------------

align 16

.two_by_two:
    ; register usage:
    ;   rax  x0              r8
    ;   rbx  **untouched**   r9   y0
    ;   rcx  yp              r10-11
    ;   rdx
    ;   rsi  xp
    ;   rdi  wp
    ;   rbp
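
    ; Schoolbook 2x2: with B = 2^64,
    ;   x*y = x0*y0 + (x1*y0 + x0*y1)*B + x1*y1*B^2
    ; so w0 takes the low half of x0*y0, w1 accumulates the two cross
    ; products plus carries, and w2/w3 take x1*y1 plus carries.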

    mul     r9              ; [ax:dx] <- x0*y0

    mov     r10, rdx        ; r10 <- carry for w1

    mov     [wp], rax       ; w0 <- ax
    mov     rax, [xp+8]     ; ax <- x1

    mul     r9              ; [ax:dx] <- x1*y0

    add     r10, rax        ; r10 <- r10 + ax for w1

    adc     rdx, 0          ; dx <- carry for w2
    mov     rcx, [yp+8]     ; cx <- y1
    mov     [wp+8], r10     ; w1 <- r10

    mov     rax, [xp+8]     ; ax <- x1
    mov     r10, rdx        ; carry, for w2

    mul     rcx             ; [ax:dx] <- x1*y1

    add     r10, rax        ; r10 <- for w2

    adc     rdx, 0          ; for w3
    mov     rax, [xp]       ; x0

    mov     rsi, rdx        ; carry, for w3

    mul     rcx             ; x0*y1

    add     [wp+8], rax     ; w1 += ax
    adc     r10, rdx        ; for w2
    mov     [wp+16], r10    ; w2 <- r10

    adc     rsi, 0
    mov     [wp+24], rsi    ; w3 <- carry in rsi

    ret


; -----------------------------------------------------------------------------

align 16

.xsize_more_than_two:

    ; The first limb of yp is processed with a simple mpn_mul_1 style loop
    ; inlined.  Unrolling this doesn't seem worthwhile since it's only run
    ; once (whereas the addmul below is run ysize-1 times).  A call to the
    ; actual mpn_mul_1 would be slowed down by the call and by parameter
    ; pushing and popping, and doesn't seem likely to be worthwhile on the
    ; typical 13-26 limb operations the Karatsuba code calls here with.

    ; register usage:
    ;   rax        r8   ysize
    ;   rbx
    ;   rcx  yp
    ;   rdx  xsize
    ;   rsi  xp
    ;   rdi  wp
    ;   rbp

%define YSIZE  r8   ; already there
%define YP     r9   ; init: rcx
%define XSIZE  r10  ; init: rdx
%define XP     r11  ; init: rsi
%define WP     r12  ; init: rdi -- r12 must be saved!

; FRAME doesn't carry on from previous, no pushes yet here

%define SAVE_RBX  [rsp+16]
%define SAVE_R12  [rsp+8]
%define SAVE_RBP  [rsp]

    sub     rsp, 24

    mov     SAVE_RBX, rbx
    mov     SAVE_R12, r12
    mov     SAVE_RBP, rbp

    mov     YP, rcx
    mov     XSIZE, rdx
    mov     XP, rsi
    mov     WP, rdi

    mov     rbp, [YP]           ; multiplier y0
    mov     rcx, XSIZE

    xor     rbx, rbx            ; initial carry
    lea     rsi, [XP+XSIZE*8]   ; xp end

    lea     rdi, [WP+XSIZE*8]   ; wp end of mul1
    neg     rcx                 ; loop counter, negative


.mul1:
    ; rax  scratch
    ; rbx  carry
    ; rcx  counter, negative
    ; rdx  scratch
    ; rsi  xp end
    ; rdi  wp end of mul1
    ; rbp  multiplier

    mov     rax, [rsi+rcx*8]

    mul     rbp

    add     rax, rbx
    mov     [rdi+rcx*8], rax
    mov     rbx, 0

    adc     rbx, rdx
    inc     rcx
    jnz     .mul1
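
    ; In C terms (a minimal sketch; xp_end/wp_end are illustrative names
    ; for the rsi/rdi end pointers, y0 for the rbp multiplier):
    ;
    ;     mp_limb_t carry = 0;
    ;     for (mp_size_t i = -xsize; i != 0; i++) {
    ;         unsigned __int128 p = (unsigned __int128)xp_end[i] * y0 + carry;
    ;         wp_end[i] = (mp_limb_t)p;
    ;         carry = (mp_limb_t)(p >> 64);
    ;     }
    ;     /* the final carry in rbx is stored to wp_end[0] just below */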

    mov     rdx, YSIZE
    mov     rcx, XSIZE

    mov     [rdi], rbx      ; final carry
    dec     rdx

    jnz     .ysize_more_than_one


    ; ysize was 1, we are done

    mov     rbx, SAVE_RBX
    mov     rbp, SAVE_RBP
    mov     r12, SAVE_R12
    add     rsp, 24

    ret


.ysize_more_than_one:
    cmp     rcx, UNROLL_THRESHOLD
    mov     rax, YP

    jae     .unroll


; -----------------------------------------------------------------------------
; simple addmul looping
;
    ; rax  yp
    ; rbx
    ; rcx  xsize
    ; rdx  ysize-1
    ; rsi  xp end
    ; rdi  wp end of mul1
    ; rbp

    lea     rbp, [rax+rdx*8+8]  ; yp end
    neg     rcx
    neg     rdx

    mov     rax, [rsi+rcx*8]    ; xp low limb
    mov     YSIZE, rdx          ; -(ysize-1)
    inc     rcx

    xor     rbx, rbx            ; initial carry
    mov     XSIZE, rcx          ; -(xsize-1)
    mov     YP, rbp

    mov     rbp, [rbp+rdx*8]    ; yp second lowest limb - multiplier
    jmp     .simple_outer_entry


; TODO: check what offset this falls at; should it be aligned?
.simple_outer_top:
    ; rbp  ysize counter, negative

    mov     rdx, YP
    mov     rcx, XSIZE          ; -(xsize-1)
    xor     rbx, rbx            ; carry

    mov     YSIZE, rbp
    add     rdi, 8              ; next position in wp

    mov     rbp, [rdx+rbp*8]    ; yp limb - multiplier
    mov     rax, [rsi+rcx*8-8]  ; xp low limb


.simple_outer_entry:

.simple_inner:
    ; rax  xp limb
    ; rbx  carry limb
    ; rcx  loop counter (negative)
    ; rdx  scratch
    ; rsi  xp end
    ; rdi  wp end
    ; rbp  multiplier

    mul     rbp

    add     rbx, rax
    adc     rdx, 0

    add     [rdi+rcx*8], rbx
    mov     rax, [rsi+rcx*8]
    adc     rdx, 0

    inc     rcx
    mov     rbx, rdx
    jnz     .simple_inner

    ; last limb of the row, done outside the loop

    mul     rbp

    mov     rbp, YSIZE
    add     rbx, rax

    adc     rdx, 0
    add     [rdi], rbx

    adc     rdx, 0
    inc     rbp                 ; increment ysize counter

    mov     [rdi+8], rdx        ; final carry of the row
    jnz     .simple_outer_top


    mov     rbx, SAVE_RBX
    mov     rbp, SAVE_RBP
    mov     r12, SAVE_R12
    add     rsp, 24

    ret
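
    ; One outer row in C terms (a minimal sketch; xp_end/wp_row/m are
    ; illustrative names for rsi, the row's wp position and the rbp
    ; multiplier):
    ;
    ;     mp_limb_t carry = 0;
    ;     for (mp_size_t i = -(xsize-1); i != 0; i++) {   /* .simple_inner */
    ;         unsigned __int128 p = (unsigned __int128)xp_end[i-1] * m + carry;
    ;         mp_limb_t lo = (mp_limb_t)p;
    ;         carry = (mp_limb_t)(p >> 64);
    ;         wp_row[i] += lo;
    ;         carry += (wp_row[i] < lo);   /* carry out of the add */
    ;     }
    ;     /* the last xp limb is handled after the loop, then the row's
    ;        final carry is stored one limb higher */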


; -----------------------------------------------------------------------------
;
; The unrolled loop is the same as in mpn_addmul_1(); see that code for some
; comments.
;
; VAR_ADJUST is the negative of how many limbs the lea instructions in the
; inner loop increment xp and wp by.  This is used to adjust xp and wp back,
; and is right-shifted to give an initial VAR_COUNTER at the top of the
; outer loop.
;
; VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
; up to -1, inclusive.
;
; VAR_JMP is the computed jump into the unrolled loop.
;
; VAR_XP_LOW is the least significant limb of xp, which is needed at the
; start of the unrolled loop.
;
; YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
; inclusive.
;
; YP is offset appropriately so that the YSIZE counter can be added to give
; the location of the next limb of yp, which is the multiplier in the
; unrolled loop.
;
; The trick with VAR_ADJUST means it's only necessary to do one fetch in
; the outer loop to take care of xp, wp and the inner loop counter.

%define VAR_COUNTER      [rsp+16]
%define VAR_ADJUST       [rsp+8]
%define VAR_XP_LOW       [rsp]
%define VAR_EXTRA_SPACE  24
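
; Worked example (an illustration, assuming UNROLL_COUNT = 16): for
; xsize = 20 the code below computes rbx = (xsize-1)+UNROLL_COUNT-1 = 34,
; masks it down to a multiple of 16 (32) and negates, so VAR_ADJUST = -32
; and VAR_COUNTER starts at -32 >> 4 = -2.  The computed jump enters the
; 16-limb body 13 limbs in, so the first pass covers the odd
; (xsize-1) mod 16 = 3 limbs and the second a full 16; together with the
; low limb done at .unroll_outer_entry that is the whole 20-limb row.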

.unroll:
    ; rax  yp
    ; rbx
    ; rcx  xsize
    ; rdx  ysize-1
    ; rsi  xp end
    ; rdi  wp end of mul1
    ; rbp

    mov     rsi, XP             ; from here, XP not used
    mov     rbp, [rax+8]        ; multiplier (yp second limb)
    lea     rax, [rax+rdx*8+8]  ; yp adjusted for ysize indexing

    mov     rdi, WP
    mov     YP, rax
    neg     rdx

    ; From here, only YP and YSIZE are used.
    ; Hence r10, r11, r12 are free for use.

    mov     YSIZE, rdx
    lea     rbx, [rcx+UNROLL_COUNT-2]   ; (xsize-1)+UNROLL_COUNT-1
    dec     rcx                         ; xsize-1

    mov     rax, [rsi]                  ; xp low limb
    and     rbx, -UNROLL_MASK-1         ; round down to multiple of UNROLL_COUNT
    neg     rcx

    sub     rsp, VAR_EXTRA_SPACE

    neg     rbx
    and     rcx, UNROLL_MASK
    mov     r12, rcx                    ; for later parity test

    mov     VAR_ADJUST, rbx
    mov     rdx, rcx
    mov     r10, rcx
    shl     rcx, 4                      ; 16 * leftover count
    shl     r10, 3                      ;  8 * leftover count

    sar     rbx, UNROLL_LOG2            ; initial VAR_COUNTER

    ; 24 = 16+8 code bytes per limb
%ifdef PIC

    call    .pic_calc

.unroll_here:
..@unroll_here1:

%else
    lea     rcx, [rcx + r10 + ..@unroll_entry1]
%endif

    neg     rdx

    mov     VAR_XP_LOW, rax
    mov     XP, rcx             ; XP reused to hold VAR_JMP
    lea     rdi, [rdi+rdx*8+8]  ; wp and xp, adjusted for unrolling,
    lea     rsi, [rsi+rdx*8+8]  ; and starting at the second limb
    jmp     .unroll_outer_entry


%ifdef PIC

.pic_calc:
    ; VAR_JMP = unroll_entry + 24*(leftover limbs), computed PIC-style
    ; from the return address on the stack (which is .unroll_here)
    lea     rcx, [rcx+r10]
    add     rcx, ..@unroll_entry1 - ..@unroll_here1
    add     rcx, [rsp]
    ret

%endif


; --------------------------------------------------------------------------

align 32

.unroll_outer_top:
    ; rbp  ysize counter, negative

    mov     rbx, VAR_ADJUST
    mov     rdx, YP

    mov     rax, VAR_XP_LOW
    mov     YSIZE, rbp          ; store incremented ysize counter

    lea     rdi, [rdi+rbx*8+8]
    lea     rsi, [rsi+rbx*8]
    sar     rbx, UNROLL_LOG2

    mov     rbp, [rdx+rbp*8]    ; yp next multiplier
    mov     rcx, XP

.unroll_outer_entry:
    mul     rbp

    mov     rcx, r12            ; leftover count, for parity test
    test    cl, 1               ; and clear carry flag
    mov     VAR_COUNTER, rbx
    mov     rbx, 0

    mov     rcx, 0
    cmovz   rcx, rax            ; rax into low carry, zero into high carry limb
    cmovnz  rbx, rax

    ; Extra fetch of VAR_JMP is bad, but registers are tight
    ; TODO: we have more registers now!
    jmp     XP                  ; computed jump into the unrolled loop


; -----------------------------------------------------------------------------

align 32

.unroll_top:
    ; rax  xp limb
    ; rbx  carry high
    ; rcx  carry low
    ; rdx  scratch
    ; rsi  xp+8
    ; rdi  wp
    ; rbp  yp multiplier limb
    ;
    ; VAR_COUNTER  loop counter, negative
    ;
    ; 24 bytes of code per limb

.unroll_entry:
..@unroll_entry1:

%define CHUNK_COUNT 2
%assign i 0
%rep UNROLL_COUNT / CHUNK_COUNT
    %define disp0 8 * i * CHUNK_COUNT

    ; first limb of the chunk: rcx holds the finished low carry,
    ; rbx accumulates the next limb
    adc     rbx, rdx
    mov     rax, [byte rsi+disp0]

    mul     rbp

    add     [byte rdi+disp0], rcx
    mov     rcx, 0

    adc     rbx, rax


    ; second limb: the roles of rbx and rcx swap
    adc     rcx, rdx
    mov     rax, [byte rsi+disp0+8]

    mul     rbp

    add     [byte rdi+disp0+8], rbx
    mov     rbx, 0

    adc     rcx, rax

    %assign i i + 1
%endrep

    inc     qword VAR_COUNTER   ; inc does not clobber the carry flag
    lea     rsi, [rsi+UNROLL_BYTES]
    lea     rdi, [rdi+UNROLL_BYTES]

    jnz     .unroll_top
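
    ; Shape of one 2-limb chunk above: the mul result (rax:rdx) is
    ; pipelined across chunk boundaries and the carry alternates between
    ; rcx and rbx, so each limb costs one store-add plus two adcs:
    ;
    ;     rbx += rdx + CF          ; high half of previous product
    ;     rax  = xp[j] ; mul rbp   ; next product
    ;     wp[j] += rcx ; rcx = 0   ; retire low carry, sets CF
    ;     rbx += rax + CF          ; fold in new low half
    ;
    ; then the same with rbx and rcx swapped for xp[j+1].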


    ; rax
    ; rbx  zero
    ; rcx  low carry
    ; rdx  high carry
    ; rsi
    ; rdi  wp, pointing at second last limb
    ; rbp
    ;
    ; carry flag to be added to high

    mov     rbp, YSIZE
    adc     rdx, 0
    add     [rdi], rcx

    adc     rdx, 0
    inc     rbp                 ; increment ysize counter

    mov     [rdi+8], rdx        ; most significant limb of the row
    jnz     .unroll_outer_top

    add     rsp, VAR_EXTRA_SPACE    ; pop the VAR_* space

    mov     rbp, SAVE_RBP
    mov     r12, SAVE_R12
    mov     rbx, SAVE_RBX
    add     rsp, 24

    ret

end