mpir/mpn/x86_64/haswell/karasub.as
2017-01-11 20:26:09 +01:00

370 lines
9.4 KiB
ActionScript

; mpn_karasub
; Copyright 2011,2012 The Code Cavern
; This file is part of the MPIR Library.
; The MPIR Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published
; by the Free Software Foundation; either version 2.1 of the License, or (at
; your option) any later version.
; The MPIR Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
; License for more details.
; You should have received a copy of the GNU Lesser General Public License
; along with the MPIR Library; see the file COPYING.LIB. If not, write
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
; Boston, MA 02110-1301, USA.
; void mpn_karasub(mp_ptr, mp_ptr, mp_size_t)
; rax rdi rsi rdx
; rax rcx rdx r8
;
; Karasuba Multiplication - split x and y into two equal length halves so
; that x = xh.B + xl and y = yh.B + yl. Then their product is:
;
; x.y = xh.yh.B^2 + (xh.yl + xl.yh).B + xl.yl
; = xh.yh.B^2 + (xh.yh + xl.yl - {xh - xl}.{yh - yl}).B + xl.yl
;
; If the length of the elements is m (about n / 2), the output length is 4 * m
; as illustrated below. The middle two blocks involve three additions and one
; subtraction:
;
; -------------------- rp
; | |-->
; | A:xl.yl[lo] | |
; | | | (xh - xl).(yh - yl)
; -------------------- | -------------------- tp
; <-- | |<--< <-- | |
; | | B:xl.yl[hi] | | | E:[lo] |
; | | | | | |
; | -------------------- | --------------------
; >--> | |--> <-- | |
; |\___ | C:xh.yh[lo] | ____/ | F:[hi] |
; | | | | |
; | -------------------- --------------------
; <-- | |
; | D:xh.yh[hi] |
; | |
; --------------------
;
; Let n2 = floor(n/2), n3 = n - n2, i.e., either n3 = n2 or n3 = n2 + 1.
; The sizes of the blocks are: {rp, 2*n2} = xl.yl, {rp + 2*n2, 2*n3} = xh.yh,
; {tp, 2*n3} = (xh - xl).(yh - yl).
; We arrange sizes so that A, B, C, and E are of length n2, and D and F
; are of length 2*n3-n2. I.e.,
; A = {rp, n2}, B = {rp + n2, n2}, C = {rp + 2*n2, n2}
; D = {rp + 3*n2, 2*n3-n2}, E = {tp, n2}, F = {tp + n2, 2*n3-n2}.
;
; To avoid overwriting B before it is used, we need to do two operations
; in parallel:
;
; (1) B = B + C + A - E = (B + C) + A - E
; (2) C = C + B + D - F = (B + C) + D - F
;
; The final carry from (1) has to be propagated into C and and, D the final
; carry from (2) has to be propagated into D. When the number of input limbs
; is odd, some extra operations have to be undertaken.
%include 'yasm_mac.inc'
BITS 64
%define TP rsi
%define RP rdi
%define A_P rdi
%define B_P rbx
%define C_P rcx
%define D_P rdx
%define E_P rsi
%define F_P rbp
GLOBAL_FUNC mpn_karasub
; requires n>=8
push rbp
push rbx
push r12
push r13
push r14
push r15
push rdx
; n is rdx and put it on the stack
and rdx, -2 ; rdx = 2*n2
shl rdx, 2 ; rdx = 8*n2
lea B_P, [RP + rdx]
lea C_P, [RP + rdx*2]
lea F_P, [E_P + rdx]
lea rdx, [rdx*2 + rdx]
lea D_P, [RP + rdx]
mov rax, B_P
sub rax, 3*8
mov [rsp-8], rax ; for testing end of main loop
; eax contains the carrys
xor eax, eax
mov r11, rax
align 16
.Lp: bt r11, 3
mov r8, [B_P] ; r8 = B[i]
adc r8, [C_P] ; r8 = B[i] + C[i]
mov r9, [B_P + 8] ; r9 = B[i+1]
adc r9, [C_P + 8] ; r9 = B[i+1] + C[i+1]
mov r10, [B_P + 16] ; r10 = B[i+2]
adc r10, [C_P + 16] ; r10 = B[i+2] + C[i+2]
mov r11, [B_P + 24] ; r11 = B[i+3]
adc r11, [C_P + 24] ; r11 = B[i+3] + C[i+3]
mov r12, rax
adc eax, eax
bt r12, 3
mov r12, r8 ; r12 = B[i] + C[i]
mov r13, r9 ; r13 = B[i+1] + C[i+1]
mov r14, r10 ; r14 = B[i+2] + C[i+2]
mov r15, r11 ; r15 = B[i+3] + C[i+3]
adc r8, [A_P] ; r8 = B[i] + C[i] + A[i]
adc r9, [A_P + 8] ; r9 = B[i+1] + C[i+1] + A[i+1]
adc r10, [A_P + 16] ; r10 = B[i+2] + C[i+2] + A[i+2]
adc r11, [A_P + 24] ; r11 = B[i+3] + C[i+3] + A[i+3]
adc eax, eax
bt eax, 4 ; FIXME: can we break the dependency chain here?
sbb r8, [E_P] ; r8 = B[i] + C[i] + A[i] - E[i]
sbb r9, [E_P + 8] ; r9 = B[i+1] + C[i+1] + A[i+1] - E[i+1]
sbb r10, [E_P + 16] ; r10 = B[i+2] + C[i+2] + A[i+2] - E[i+2]
sbb r11, [E_P + 24] ; r11 = B[i+3] + C[i+3] + A[i+3] - E[i+3]
mov [B_P], r8 ; B[i] = B[i] + C[i] + A[i] - E[i]
mov [B_P + 8], r9 ; B[i+1] = B[i+1] + C[i+1] + A[i+1] - E[i+1]
mov [B_P + 16], r10 ; B[i+2] = B[i+2] + C[i+2] + A[i+2] - E[i+2]
mov [B_P + 24], r11 ; B[i+3] = B[i+3] + C[i+3] + A[i+3] - E[i+3]
mov r11, rax
adc eax, eax
bt r11, 3
adc r12, [D_P] ; r12 = B[i] + C[i] + D[i]
adc r13, [D_P + 8] ; r13 = B[i+1] + C[i+1] + D[i+1]
adc r14, [D_P + 16] ; r14 = B[i+2] + C[i+2] + D[i+2]
adc r15, [D_P + 24] ; r15 = B[i+3] + C[i+3] + D[i+3]
mov r11, rax
adc eax, eax
bt r11, 3
sbb r12, [F_P] ; r12 = B[i] + C[i] + D[i] - F[i]
sbb r13, [F_P + 8] ; r13 = B[i+1] + C[i+1] + D[i+1] - F[i+1]
sbb r14, [F_P + 16] ; r14 = B[i+2] + C[i+2] + D[i+2] - F[i+2]
sbb r15, [F_P + 24] ; r15 = B[i+3] + C[i+3] + D[i+3] - F[i+3]
mov r11, rax
adc eax, eax
mov [C_P], r12 ; C[i] = B[i] + C[i] + D[i] - F[i]
mov [C_P + 8], r13 ; C[i+1] = B[i+1] + C[i+1] + D[i+1] - F[i+1]
mov [C_P + 16], r14 ; C[i+2] = B[i+2] + C[i+2] + D[i+2] - F[i+2]
mov [C_P + 24], r15 ; C[i+3] = B[i+3] + C[i+3] + D[i+3] - F[i+3]
lea A_P, [A_P + 4*8]
lea B_P, [B_P + 4*8]
lea C_P, [C_P + 4*8]
lea D_P, [D_P + 4*8]
lea E_P, [E_P + 4*8]
lea F_P, [F_P + 4*8]
mov r9, A_P
sub r9, [rsp-8] ; r9 = A_P - (orig_B_P - 3*8) = A_P - orig_B_P + 24
jc .Lp ; If A_P < orig_B_P - 3*8, then do another 4 words
; Bits of eax contain carries of:
; 0 1 2 3 4
; (B+C+D)-F (B+C)+D (B+C+A)-E (B+C)+A B+C
jz .Lcase3 ; If A_P = orig_B_P - 3*8, then do remaining 3 words
; Difference is 8, 16, or 24, corresponding to 2, 1, or 0 words left to do, resp.
cmp r9, 16
jb .Lcase2 ; r9 = 8 : 2 words
je .Lcase1 ; r9 = 16 : 1 word
jmp .Lcase0 ; r9 = 24 : 0 words
.Lcase3:
bt r11, 3
mov r8, [B_P]
adc r8, [C_P]
mov r9, [B_P + 8]
adc r9, [C_P + 8]
mov r10, [B_P + 16]
adc r10, [C_P + 16]
mov r11, rax
adc eax, eax
bt r11, 3
mov r12, r8
mov r13, r9
mov r14, r10
adc r8, [A_P]
adc r9, [A_P + 8]
adc r10, [A_P + 16]
mov r11, rax
adc eax, eax
bt r11, 3
sbb r8, [E_P]
sbb r9, [E_P + 8]
sbb r10, [E_P + 16]
mov [B_P], r8
mov [B_P + 8], r9
mov [B_P + 16], r10
mov r11, rax
adc eax, eax
bt r11, 3
adc r12, [D_P]
adc r13, [D_P + 8]
adc r14, [D_P + 16]
mov r11, rax
adc eax, eax
bt r11, 3
sbb r12, [F_P]
sbb r13, [F_P + 8]
sbb r14, [F_P + 16]
adc eax, eax
mov [C_P], r12
mov [C_P + 8], r13
mov [C_P + 16], r14
lea A_P, [A_P + 3*8]
lea B_P, [B_P + 3*8]
lea C_P, [C_P + 3*8]
lea D_P, [D_P + 3*8]
lea E_P, [E_P + 3*8]
lea F_P, [F_P + 3*8]
jmp .Lfin
.Lcase2:
bt r11, 3
mov r8, [B_P]
adc r8, [C_P]
mov r9, [B_P + 8]
adc r9, [C_P + 8]
mov r11, rax
adc eax, eax
bt r11, 3
mov r12, r8
mov r13, r9
adc r8, [A_P]
adc r9, [A_P + 8]
mov r11, rax
adc eax, eax
bt r11, 3
sbb r8, [E_P]
sbb r9, [E_P + 8]
mov r11, rax
adc eax, eax
bt r11, 3
adc r12, [D_P]
adc r13, [D_P + 8]
mov r11, rax
adc eax, eax
bt r11, 3
mov [B_P], r8
mov [B_P + 8], r9
sbb r12, [F_P]
sbb r13, [F_P + 8]
adc eax, eax
mov [C_P], r12
mov [C_P + 8], r13
lea A_P, [A_P + 2*8]
lea B_P, [B_P + 2*8]
lea C_P, [C_P + 2*8]
lea D_P, [D_P + 2*8]
lea E_P, [E_P + 2*8]
lea F_P, [F_P + 2*8]
jmp .Lfin
.Lcase1:
bt r11, 3
mov r8, [B_P]
adc r8, [C_P]
mov r11, rax
adc eax, eax
bt r11, 3
mov r12, r8
adc r8, [A_P]
mov r11, rax
adc eax, eax
bt r11, 3
sbb r8, [E_P]
mov r11, rax
adc eax, eax
bt r11, 3
adc r12, [D_P]
mov r11, rax
adc eax, eax
bt r11, 3
mov [B_P], r8
sbb r12, [F_P]
adc eax, eax
mov [C_P], r12
lea A_P, [A_P + 1*8]
lea B_P, [B_P + 1*8]
lea C_P, [C_P + 1*8]
lea D_P, [D_P + 1*8]
lea E_P, [E_P + 1*8]
lea F_P, [F_P + 1*8]
.Lfin:
; Now A_P, ..., F_P point at A[n2]=B[0], B[n2]=C[0], C[n2]=D[0],
; D[n2], E[n2]=F[0], F[n2], resp.
.Lcase0:
; store top two words of D as carrys could change them
pop r15
bt r15, 0
jnc .Lskipload
mov r12, [D_P] ; load D[n2]
mov r13, [D_P + 8] ; load D[n2 + 1]
; the two carrys from 2nd to 3rd
.Lskipload:
xor r8, r8
mov r11, r8 ; r11 is constant 0 now
bt eax, 4 ; carry of B+C
adc r8, r8
mov r9, r8
bt eax, 3 ; carry of (B+C)+A
lea r10, [B_P + 8]
adc [B_P], r8 ; B_P points at B[n2] = C[0]
.L2: adc qword [r10], r11
lea r10, [r10 + 8]
jc .L2
; the two carrys from 3rd to 4th
lea r10, [C_P + 8]
bt eax, 1 ; carry of (B+C)+D
adc [C_P], r9
.L3: adc qword [r10], r11
lea r10, [r10 + 8]
jc .L3
; now the borrow from 2nd to 3rd
mov r10, B_P
bt eax, 2 ; borrow of (B+C+A)-E
.L1: sbb qword [r10], r11
lea r10, [r10 + 8]
jc .L1
; borrow from 3rd to 4th
bt eax, 0 ; borrow of (B+C+D)-F
mov r10, C_P
.L4: sbb qword [r10], r11
lea r10, [r10 + 8]
jc .L4
; if odd then do next two
bt r15, 0
jnc .Lnotodd
sub r12, [F_P] ; r12 contains D[n2]
sbb r13, [F_P + 8] ; r13 contains D[n2+1]
sbb rax, rax
add [C_P], r12
adc [C_P + 8], r13
adc rax, 0 ; rax is -1, 0, or 1
.L7: add [C_P + 16], rax
adc rax, 0
lea C_P, [C_P + 8]
sar rax, 1
jnz .L7
.Lnotodd:
pop r15
pop r14
pop r13
pop r12
pop rbx
pop rbp
ret