; mpir/mpn/x86_64w/amd64/mul_basecase.asm

;  YASM translation of code provided by P. Gaudry for AMD64, converted
;  by Brian Gladman.
;
;  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
;
;  This file is part of the GNU MP Library.
;
;  The GNU MP Library is free software; you can redistribute it and/or
;  modify it under the terms of the GNU Lesser General Public License as
;  published by the Free Software Foundation; either version 2.1 of the
;  License, or (at your option) any later version.
;
;  The GNU MP Library is distributed in the hope that it will be useful,
;  but WITHOUT ANY WARRANTY; without even the implied warranty of
;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;  Lesser General Public License for more details.
;
;  You should have received a copy of the GNU Lesser General Public
;  License along with the GNU MP Library; see the file COPYING.LIB.  If
;  not, write to the Free Software Foundation, Inc., 59 Temple Place -
;  Suite 330, Boston, MA 02111-1307, USA.
;
;  Adapted by Brian Gladman for AMD64 using the Microsoft VC++ v8 64-bit
;  compiler and the YASM assembler.

; AMD64 mpn_mul_basecase -- multiply two mpn numbers.
;
;  Calling interface:
;
; void __gmpn_mul_basecase(
;     mp_ptr rp,          rcx
;     mp_srcptr xp,       rdx
;     mp_size_t xn,        r8
;     mp_srcptr yp,        r9
;     mp_size_t yn [rsp+0x28]    as a *** 32-bit *** word
; )
;
; Multiply xp[xn] by yp[yn] and write the result to rp[xn+yn], with xn >= yn
; on entry.
;
; This is an SEH Frame Function with a leaf prologue

%include "..\x86_64_asm.inc"

; Register list handed to the prologue/epilogue macros used by the
; xn > 2 path.
%define reg_save_list       rbx, rsi, rdi, rbp, r12

; Unrolling parameters for the long-operand inner loop:
;   UNROLL_COUNT      limbs handled by one pass of the unrolled body
;   UNROLL_MASK       mask for "limb count mod UNROLL_COUNT"
;   UNROLL_BYTES      bytes of x[]/r[] consumed by one pass
;   UNROLL_THRESHOLD  smallest xn for which the unrolled loop is used
%define UNROLL_LOG2         4
%define UNROLL_COUNT        (1 << UNROLL_LOG2)
%define UNROLL_MASK         (UNROLL_COUNT - 1)
%define UNROLL_BYTES        (8 * UNROLL_COUNT)
%define UNROLL_THRESHOLD    5

   bits 64
   section .text

; Offsets from rsp of the local variables used by the unrolled loop:
;   v_par  number of limb slots of the unrolled body skipped on the
;          first pass of each row
;   v_adj  negated x[] limb span covered by the unrolled passes of a row
;   v_xlo  saved copy of x[0]
; v_len is defined but not referenced in this file.
%define v_par   16
%define v_adj    8
%define v_xlo    0
%define v_len   24

; Register roles.  x_ptr is a copy of the incoming rdx, which has to be
; moved out of the way because mul writes rdx.
%define  r_ptr rcx
%define  x_ptr r11
%define  x_len  r8
%define  y_ptr  r9
%define  y_len r10

; Aliases used only inside the unrolled loop, once x_len and x_ptr have
; been copied elsewhere.
%define v_ctr   r8      ; x_len reused
%define v_jmp  r11      ; x_ptr reused

   global   __gmpn_mul_basecase

%ifdef DLL
   export   __gmpn_mul_basecase
%endif

; Entry point.  Fetches y[0], widens the 32-bit xn and dispatches on it:
;   xn == 2 -> mul_2_by_n,  xn > 2 -> mul_m_by_n,  anything smaller is
; treated as the 1 x 1 case and finished right here.
__gmpn_mul_basecase:
    mov     rax,[y_ptr]         ; rax = y[0]; all three paths start from it
    movsxd  x_len,r8d           ; xn is taken from the low 32 bits of r8
    cmp     x_len,2
    je      mul_2_by_n          ; xn == 2
    ja      mul_m_by_n          ; xn  > 2 (unsigned compare)
    mul     qword [rdx]         ; rdx:rax = y[0] * x[0]  (rdx is still xp)
    mov     [r_ptr],rax         ; r[0] = low limb
    mov     [r_ptr+8],rdx       ; r[1] = high limb
    ret

; xn == 2.  On entry rax = y[0] and rdx = xp.  yn is read from the stack
; argument slot; yn == 1 is finished here, any other value is handed to
; mul_2_by_2.
mul_2_by_n:
    mov     x_ptr,rdx               ; keep xp: mul overwrites rdx
    movsxd  y_len,dword[rsp+0x28]   ; yn is passed as a 32-bit value
    dec     y_len                   ; y_len = yn - 1
    jnz     mul_2_by_2              ; yn != 1
    ; 2 x 1 product -- y_len is zero from this point on
    mov     r8,rax                  ; r8 = y[0] (x_len is no longer needed)
    mov     rax,[x_ptr]
    mul     r8                      ; rdx:rax = x[0] * y[0]
    mov     [r_ptr],rax             ; r[0]
    mov     r9,rdx                  ; r9 = carry limb (y_ptr no longer needed)
    mov     rax,[x_ptr+8]
    mul     r8                      ; rdx:rax = x[1] * y[0]
    add     r9,rax
    adc     rdx,y_len               ; y_len == 0, so this adds only the carry
    mov     [r_ptr+8],r9            ; r[1]
    mov     [r_ptr+16],rdx          ; r[2]
    ret

; 2 x 2 case: r[0..3] = x[0..1] * y[0..1], built from four 64x64 products.
; On entry rax = y[0], x_ptr (r11) = xp and y_ptr (r9) = yp.  r8 and r10
; are reused as scratch; r11 is reused as soon as x[1] has been loaded,
; so the order of the x[] loads and the r11 write below matters.
mul_2_by_2:                 ; r8 (x_len) and r10 (y_len) free
    mov     r10,[x_ptr]     ; x[0]
    mul     r10             ; y[0] * x[0]
    mov     [r_ptr],rax     ; r[0] = low limb
    mov     r8,rdx          ; cry = { 0, r8 }
    mov     rax,[y_ptr+8]   ; y[1]
    mul     r10             ; y[1] * x[0]
    add     r8,rax          ; accumulate into the r[1] column
    adc     rdx,byte 0
    mov     r10,[x_ptr+8]   ; x[1] - r11 (x_ptr) now free
    mov     r11,rdx         ; cry = { r11, r8 }
    mov     rax,[y_ptr]     ; y[0]
    mul     r10             ; y[0] * x[1]
    add     r8,rax          ; finish the r[1] column
    adc     r11,rdx         ; r11 = r[2] column so far, CF = its carry out
    mov     [r_ptr+8],r8    ; r[1]  (mov leaves CF untouched)
    mov     r8,dword 0      ; zero with mov, not xor, to keep CF alive
    adc     r8,r8           ; cry = { r8, r11 }
    mov     rax,[y_ptr+8]   ; y[1]
    mul     r10             ; x[1] * y[1]
    add     rax,r11         ; add the r[2] column
    adc     rdx,r8          ; add the carry limb into the top word
    mov     [r_ptr+16],rax  ; r[2]
    mov     [r_ptr+24],rdx  ; r[3]
    ret

; do the first row, x[] * y[0], separately: its limbs can simply be stored

; General case, xn > 2.  On entry rax = y[0], rdx = xp, r8 = xn (x_len),
; rcx = rp and r9 = yp.  yn is read from the caller's stack slot before
; the prologue macro is expanded (NOTE(review): the macro comes from
; x86_64_asm.inc and presumably moves rsp -- confirm), and is sign-extended
; like the other size loads in this file (xn at entry, yn in mul_2_by_n).
;
; First row: r[0..xn] = x[0..xn-1] * y[0].  Nothing has been written to
; r[] yet, so each limb is stored rather than added.  rsi/rdi point just
; past the end of x[] / the first xn limbs of r[], and r12 runs from -xn
; up to 0.
mul_m_by_n:
    movsxd  r10,dword[rsp+0x28] ; load as a 32-bit integer

prologue fmul_m_by_n, reg_save_list, 3
    mov     x_ptr,rdx
    mov     r12,x_len
    mov     rbp,rax             ; y[0] -> rbp
    xor     rbx,rbx             ; for carry
    lea     rsi,[x_ptr+r12*8]   ; past end of x[]
    lea     rdi,[r_ptr+r12*8]   ; past end of r[]
    neg     r12
.0: mov     rax,[rsi+r12*8]     ; x[n]
    mul     rbp                 ; x[n] * y[0]
    add     rax,rbx             ; add carry from previous round
    mov     [rdi+r12*8],rax     ; store r[n]
    mov     rbx,dword 0         ; zero with mov so CF from the add survives
    adc     rbx,rdx             ; next carry = high limb + CF
    inc     r12                 ; next iteration
    jnz     .0
    mov     [rdi],rbx           ; store final carry limb in r[xn]
    mov     rdx,y_len
    dec     rdx                 ; rdx = yn - 1 = rows still to do
    jz      L_exit              ; done if y_len is 1

; More rows to do; rdx = yn - 1 here.  Operands with xn >= UNROLL_THRESHOLD
; go to the unrolled loop, shorter ones use the simple loop below.
;
; Simple loop: for each remaining y[j] (j = 1 .. yn-1) add x[] * y[j] into
; r[] starting at r[j].  rsi points past the end of x[], rdi past the end
; of the limbs of r[] being added to in the current row (advanced by one
; limb per row), y_ptr past the end of y[].  rcx and y_len are negative
; counters that run up to zero.
.1: cmp     x_len,UNROLL_THRESHOLD  ; unroll if many loops
    jae     L_unroll
    lea     y_ptr,[y_ptr+rdx*8+8]   ; pointer past the last limb of y[]
    neg     x_len                   ; negative counter for x[n]
    neg     rdx                     ; negative counter for y[n]
    mov     rax,[rsi+x_len*8]       ; x[0] -> rax
    mov     y_len,rdx               ; now -(y_len - 1)
    inc     x_len                   ; negative counter for x[1]
    xor     rbx,rbx                 ; for carry
    mov     rcx,x_len               ; now -(x_len - 1) -> rcx (was r_ptr)
    mov     rbp,[y_ptr+rdx*8]       ; y[1] -> rbp
    jmp     .3
.2: mov     rcx,x_len               ; restore x[] counter
    xor     rbx,rbx                 ; clear carry
    add     rdi,8                   ; increase end of r[] pointer
    mov     rbp,[y_ptr+y_len*8]     ; y[n] -> rbp
    mov     rax,[rsi+rcx*8-8]       ; x[0] -> rax
.3: mul     rbp                     ; x[m] * y[n]
    add     rbx,rax                 ; add carry
    adc     rdx,byte 0
    add     [rdi+rcx*8],rbx         ; add into r[]
    mov     rax,[rsi+rcx*8]         ; next x[m] ->rax
    adc     rdx,byte 0              ; add carry to rdx
    inc     rcx                     ; go to next limb of x[]
    mov     rbx,rdx                 ; move carry into rbx
    jnz     .3                      ; loop until only the last limb is left
    mul     rbp                     ; do last limb
    add     rbx,rax                 ; propagate carry
    adc     rdx,byte  0
    add     [rdi],rbx               ; add into r[]
    adc     rdx,byte 0              ; add in any carry
    inc     y_len                   ; sets ZF for the jnz below
    mov     [rdi+8],rdx             ; move (not add) carry into r[]
    jnz     .2                      ; go to next limb of y[]
    jmp     L_exit

; Unrolled path, used when xn >= UNROLL_THRESHOLD and more than one limb
; of y[] remains.  On entry rdx = yn - 1; r_ptr, x_ptr, x_len and y_ptr
; still hold their original values.  The row x[] * y[0] is already in r[].
;
; For every remaining y[j] the product x[0] * y[j] is formed at .3 and the
; other xn - 1 limbs are handled by the unrolled body at .4, which holds
; UNROLL_COUNT one-limb steps.  The first pass of each row enters the body
; part-way through (computed jump) so that the limb count need not be a
; multiple of UNROLL_COUNT; rsi and rdi are biased downwards to match.
L_unroll:
    mov     rdi,r_ptr
    mov     rcx,x_len
    mov     rsi,x_ptr
    mov     rbp,[y_ptr+8]               ; y[1]
    lea     y_ptr,[y_ptr+rdx*8+8]       ; pointer past the last limb of y[]
    neg     rdx
    mov     y_len,rdx                   ; negative row counter, -(yn - 1)
    lea     rbx,[UNROLL_COUNT-2+rcx]
    dec     rcx                         ; xn - 1 limbs go through the body
    mov     rax,[rsi]          ; x[0]
    and     rbx,-UNROLL_MASK-1          ; xn - 1 rounded up to UNROLL_COUNT
    neg     rcx
    neg     rbx
    and     rcx,UNROLL_MASK             ; steps to skip on the first pass
    mov     [rsp+v_par],rcx
    mov     [rsp+v_adj],rbx
    mov     rdx,rcx
    shl     rcx,3
    lea     rcx,[rcx+rcx*2]             ; skipped steps * 24 bytes of code
    lea     v_jmp,[rel .4]
    lea     v_jmp,[v_jmp+rcx]           ; entry address inside the body
    neg     rdx
    mov     [rsp+v_xlo],rax             ; keep x[0] for the following rows
    lea     rdi,[rdi+rdx*8+8]           ; bias r[] pointer for skipped steps
    lea     rsi,[rsi+rdx*8+8]           ; bias x[] pointer likewise
    jmp     .3
; Start of every row after the first: rewind the pointers, moving r[] one
; limb further on than in the previous row, and fetch the next y[] limb.
.2: mov     rbx,[rsp+v_adj]
    mov     rax,[rsp+v_xlo]             ; x[0]
    lea     rdi,[rdi+rbx*8+8]
    lea     rsi,[rsi+rbx*8]
    mov     rbp,[y_ptr+y_len*8]
.3: mul     rbp                         ; x[0] * y[j]
    sar     rbx,UNROLL_LOG2             ; negative count of body passes
    mov     rcx,[rsp+v_par]
    mov     v_ctr,rbx
    test    cl,1            ; low word of product + carry
    mov     rbx,dword 0     ; is in rcx on even rounds and
    mov     rcx,dword 0     ; rbx on odd rounds - we must
    cmovz   rcx,rax         ; put low word of first product
    cmovnz  rbx,rax         ; in the right register here
    jmp     v_jmp
; Unrolled body.  rcx and rbx alternate as the "low word + carry"
; accumulator; rdx carries the high word into the next step.  The entry
; offset computed above assumes 24 bytes of code per one-limb step, which
; is presumably why the displacements are forced to `byte` and the
; registers are cleared with `mov reg,dword 0` (which also leaves the
; carry flag intact) -- keep the encodings unchanged.
.4:
%define CHUNK_COUNT  2
%assign i 0
%rep UNROLL_COUNT / CHUNK_COUNT
%define  disp0 8 * i * CHUNK_COUNT

    mov     rax,[byte rsi+disp0]
    adc     rbx,rdx
    mul     rbp
    add     [byte rdi+disp0],rcx
    mov     rcx,dword 0
    adc     rbx,rax
    mov     rax,[byte rsi+disp0+8]
    adc     rcx,rdx
    mul     rbp
    add     [byte rdi+disp0+8],rbx
    mov     rbx,dword 0
    adc     rcx,rax

%assign  i i + 1
%endrep

    inc     v_ctr                       ; inc keeps CF, sets ZF for the jnz
    lea     rsi,[UNROLL_BYTES+rsi]      ; lea leaves the flags alone
    lea     rdi,[UNROLL_BYTES+rdi]
    jnz     .4

    adc     rdx,byte 0                  ; fold the pending carry into the high word
    add     [rdi],rcx                   ; add last low word into r[]
    adc     rdx,byte 0
    inc     y_len                       ; sets ZF for the jnz below
    mov     [rdi+8],rdx                 ; store (not add) the top limb of the row
    jnz     .2                          ; next limb of y[]
; Common exit for the xn > 2 paths; pairs with the prologue macro used at
; fmul_m_by_n (same register list and local count).
L_exit:
    epilogue reg_save_list, 3

    end