;  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
;
;  This file is part of the GNU MP Library.
;
;  The GNU MP Library is free software; you can redistribute it and/or
;  modify it under the terms of the GNU Lesser General Public License as
;  published by the Free Software Foundation; either version 2.1 of the
;  License, or (at your option) any later version.
;
;  The GNU MP Library is distributed in the hope that it will be useful,
;  but WITHOUT ANY WARRANTY; without even the implied warranty of
;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;  Lesser General Public License for more details.
;
;  You should have received a copy of the GNU Lesser General Public
;  License along with the GNU MP Library; see the file COPYING.LIB.  If
;  not, write to the Free Software Foundation, Inc., 59 Temple Place -
;  Suite 330, Boston, MA 02111-1307, USA.
;
;  Adapted by Brian Gladman AMD64 using the Microsoft VC++ v8 64-bit
;  compiler and the YASM assembler.

;  AMD64 mpn_mul_1 -- mpn by limb multiply.
;
;  Calling interface:
;  mp_limb_t mpn_mul_1 (
;     mp_ptr dst,                  rcx
;     mp_srcptr src,               rdx
;     mp_size_t size,               r8
;     mp_limb_t multiplier          r9
;  )
;
;  mp_limb_t mpn_mul_1c (
;     mp_ptr dst,                  rcx
;     mp_srcptr src,               rdx
;     mp_size_t size,               r8
;     mp_limb_t multiplier,         r9
;     mp_limb_t carry       [rsp+0x28]
;  )
;
; Multiply src[size] by mult and store the result in dst[size].  Return the
; carry limb from the top of the result.
;
; mpn_mul_1c() accepts an initial carry for the calculation, it's added into
; the low limb of the destination.
;
; Maximum possible UNROLL_COUNT with the current code is 64.
;
;  This is an SEH Leaf Function (no unwind support needed)

%if 1

%define dst         rcx
%define len          r8
%define mlt          r9
%define cry  [rsp+0x28]
%define src         r10     ; from rdx on input

%define UNROLL_LOG2     4
%define UNROLL_COUNT    (1 << UNROLL_LOG2)
%define UNROLL_MASK     (UNROLL_COUNT - 1)
%define UNROLL_BYTES    8 * UNROLL_COUNT

%if UNROLL_BYTES >= 256
%error unroll count is too large
%elif UNROLL_BYTES >= 128
%define off 128
%else
%define off 0
%endif

%define UNROLL_THRESHOLD   7

    bits    64
    section .text

    global  __gmpn_mul_1
    global  __gmpn_mul_1c

%ifdef DLL
    export  __gmpn_mul_1
    export  __gmpn_mul_1c
%endif

__gmpn_mul_1c:
    mov     r11,[rsp+0x28]
    jmp     start

__gmpn_mul_1:
    xor     r11,r11

start:
    movsxd  len,r8d
    mov     src,rdx
    cmp     len,UNROLL_THRESHOLD
    jae     .1
    lea     src,[src+len*8]
    lea     dst,[dst+len*8]
    neg     len
.0: mov     rax,[src+len*8]
    mul     mlt
    add     rax,r11
    mov     r11,dword 0
    adc     r11,rdx
    mov     [dst+len*8],rax
    inc     len
    jnz     .0
    mov     rax,r11
    ret

; The mov to load the next source limb is done well ahead of the mul, this
; is necessary for full speed.  It leads to one limb handled separately
; after the loop.
;
; When unrolling to 32 or more, an offset of +4 is used on the src pointer,
; to avoid having an 0x80 displacement in the code for the last limb in the
; unrolled loop.  This is for a fair comparison between 16 and 32 unrolling.

.1: lea     rax,[len-2]
    dec     len
    neg     len
    shr     rax,UNROLL_LOG2
    and     len,UNROLL_MASK
    mov     [rsp+0x08],rax      ; loop count in shadow space
    mov     rdx,len
    shl     rdx,4
    lea     rax,[rel .3]
    lea     rdx,[rdx+len*4]
    lea     rdx,[rdx+rax]
    mov     rax,[src]
    neg     len
    lea     src,[src+len*8+off]
    lea     dst,[dst+len*8+off]
    xor     len,len             ; len now zero
    jmp     rdx

.3:
%assign i 0
%rep  UNROLL_COUNT
%define disp   8 * i - off

    mul     mlt                 ; 20 bytes per block
    add     r11,rax
    mov     rax,[byte src+disp+8]
    mov     [byte dst+disp],r11
    mov     r11,len
    adc     r11,rdx

%assign i i + 1
%endrep

    dec     dword [rsp+0x08]
    lea     src,[src+UNROLL_BYTES]
    lea     dst,[dst+UNROLL_BYTES]
    jns     .3
    mul     mlt
    add     r11,rax
    mov     rax,len
    mov     [dst-off],r11
    adc     rax,rdx
    ret

%else

    bits    64
    section .text
    global  __gmpn_mul_1
    global  __gmpn_mul_1c

%ifdef DLL
    export  __gmpn_mul_1
    export  __gmpn_mul_1c
%endif
__gmpn_mul_1c:
    mov     r11, [rsp+0x28]
    jmp     start

    align   16
    nop
    nop
__gmpn_mul_1:
    xor     r11, r11
start:
    lea     r10, [rdx+8*r8]
    lea     rcx, [rcx+8*r8]
    neg     r8
.1: mov     rax, [r10+8*r8]
    mul     r9
    add     rax, r11
    mov     r11d, 0
    adc     r11, rdx
    mov     [rcx+8*r8], rax
    inc     r8
    jne     .1
    mov     rax, r11
    ret

%endif

    end