mpir/mpn/x86_64w/amd64/mul_basecase.asm

;  YASM translation of code provided by P. Gaudry for AMD64, converted
;  by Brian Gladman.
;
;  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
;
;  This file is part of the GNU MP Library.
;
;  The GNU MP Library is free software; you can redistribute it and/or
;  modify it under the terms of the GNU Lesser General Public License as
;  published by the Free Software Foundation; either version 2.1 of the
;  License, or (at your option) any later version.
;
;  The GNU MP Library is distributed in the hope that it will be useful,
;  but WITHOUT ANY WARRANTY; without even the implied warranty of
;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;  Lesser General Public License for more details.
;
;  You should have received a copy of the GNU Lesser General Public
;  License along with the GNU MP Library; see the file COPYING.LIB.  If
;  not, write to the Free Software Foundation, Inc., 59 Temple Place -
;  Suite 330, Boston, MA 02111-1307, USA.
;
;  Adapted by Brian Gladman for AMD64 using the Microsoft VC++ v8 64-bit
;  compiler and the YASM assembler.

; AMD64 mpn_mul_basecase -- multiply two mpn numbers.
;
;  Calling interface:
;
; void __gmpn_mul_basecase(
;     mp_ptr rp,          rcx
;     mp_srcptr xp,       rdx
;     mp_size_t xn,        r8
;     mp_srcptr yp,        r9
;     mp_size_t yn [rsp+0x28]    as a *** 32-bit *** word
; )
;
; Multiply xp[xn] by yp[yn] and write the result to rp[xn+yn], with xn >= yn
; on entry.
;
; This is an SEH Frame Function with a leaf prologue

%include "..\x86_64_asm.inc"

%define reg_save_list       rbx, rsi, rdi, rbp, r12

%define UNROLL_LOG2         4
%define UNROLL_COUNT        (1 << UNROLL_LOG2)
%define UNROLL_MASK         (UNROLL_COUNT - 1)
%define UNROLL_BYTES        (8 * UNROLL_COUNT)
%define UNROLL_THRESHOLD    5

   bits 64
   section .text

%define v_par   16
%define v_adj    8
%define v_xlo    0
%define v_len   24

%define  r_ptr rcx
%define  x_ptr r11
%define  x_len  r8
%define  y_ptr  r9
%define  y_len r10

%define v_ctr   r8      ; x_len reused
%define v_jmp  r11      ; x_ptr reused

   global   __gmpn_mul_basecase

%ifdef DLL
   export   __gmpn_mul_basecase
%endif

__gmpn_mul_basecase:
    ; Entry point: dispatch on xn.  y[0] is fetched into rax up front because
    ; every path below starts by multiplying with it.
    mov     rax, [y_ptr]            ; rax = y[0]
    movsxd  x_len, r8d              ; xn arrives as a 32-bit value; widen it
    cmp     x_len, 2
    je      mul_2_by_n              ; xn == 2
    ja      mul_m_by_n              ; xn  > 2 (unsigned compare)

    ; xn < 2: single-limb product r[1]:r[0] = x[0] * y[0]
    mul     qword [rdx]             ; rdx still holds xp here
    mov     [r_ptr], rax            ; low limb
    mov     [r_ptr + 8], rdx        ; high limb
    ret

mul_2_by_n:
    ; xn == 2.  On entry rax = y[0], rdx = xp.  yn is the fifth argument and
    ; is read from the stack as a signed 32-bit value.
    mov     x_ptr, rdx                  ; mul overwrites rdx, so move xp away
    movsxd  y_len, dword [rsp + 0x28]
    dec     y_len                       ; y_len = yn - 1
    jnz     mul_2_by_2                  ; yn != 1: take the 2x2 path

    ; yn == 1: r[2]:r[1]:r[0] = (x[1]:x[0]) * y[0]
    mov     r8, rax                     ; r8 = y[0] (x_len no longer needed)
    mov     rax, [x_ptr]
    mul     r8                          ; x[0] * y[0]
    mov     r9, rdx                     ; r9 = high limb (y_ptr no longer needed)
    mov     [r_ptr], rax
    mov     rax, [x_ptr + 8]
    mul     r8                          ; x[1] * y[0]
    add     rax, r9                     ; low limb + previous high limb
    mov     [r_ptr + 8], rax            ; mov leaves CF intact for the adc
    adc     rdx, byte 0                 ; absorb the carry into the top limb
    mov     [r_ptr + 16], rdx
    ret

mul_2_by_2:
    ; 2x2 schoolbook product.  On entry rax = y[0]; r8 and r10 are free and
    ; x_ptr (r11) becomes free once x[1] has been loaded.
    mov     r10, [x_ptr]            ; r10 = x[0]
    mul     r10                     ; x[0] * y[0]
    mov     r8, rdx                 ; r8 = running limb for r[1]
    mov     [r_ptr], rax            ; r[0] is final

    mov     rax, [y_ptr + 8]
    mul     r10                     ; x[0] * y[1]
    add     r8, rax
    adc     rdx, byte 0
    mov     r10, [x_ptr + 8]        ; r10 = x[1]; must precede the write to r11
    mov     r11, rdx                ; r11 = running limb for r[2]

    mov     rax, [y_ptr]
    mul     r10                     ; x[1] * y[0]
    add     r8, rax
    adc     r11, rdx
    mov     [r_ptr + 8], r8         ; r[1] is final
    mov     r8d, 0                  ; zero via mov so CF from the adc survives
    adc     r8, byte 0              ; r8 = carry out of the r[2] limb

    mov     rax, [y_ptr + 8]
    mul     r10                     ; x[1] * y[1]
    add     r11, rax
    adc     r8, rdx
    mov     [r_ptr + 16], r11
    mov     [r_ptr + 24], r8
    ret

; First pass: multiply x[] by y[0].  Nothing has been accumulated in r[] yet,
; so these products are stored rather than added.

mul_m_by_n:
    ; General path, reached from the entry code when xn > 2.  rax already
    ; holds y[0], rcx = rp, rdx = xp, r8 = xn, r9 = yp.
    ; Note: this 32-bit load zero-extends yn into r10, whereas mul_2_by_n
    ; sign-extends it with movsxd; the two agree for non-negative yn.
    mov     r10d, dword[rsp+0x28]   ; load as a 32-bit integer

; NOTE(review): yn is fetched before the prologue, presumably because the
; prologue macro (from x86_64_asm.inc) moves rsp - confirm.  The trailing 3
; presumably reserves the three stack slots addressed below as [rsp+v_xlo],
; [rsp+v_adj] and [rsp+v_par] - confirm against the macro definition.
prologue fmul_m_by_n, reg_save_list, 3
    ; First pass: r[0..xn-1] = low limbs of x[] * y[0] plus carries, and
    ; r[xn] = final carry limb.  Results are stored, not added.
    mov     x_ptr,rdx           ; mul clobbers rdx, so keep xp in x_ptr
    mov     r12,x_len
    mov     rbp,rax             ; y[0] -> rbp
    xor     rbx,rbx             ; for carry
    lea     rsi,[x_ptr+r12*8]   ; past end of x[]
    lea     rdi,[r_ptr+r12*8]   ; &r[xn]
    neg     r12                 ; r12 = -xn, counts up towards zero
.0: mov     rax,[rsi+r12*8]     ; x[n]
    mul     rbp                 ; x[n] * y[0]
    add     rax,rbx             ; add carry from previous round
    mov     [rdi+r12*8],rax     ; store r[n]
    mov     rbx,dword 0         ; zero with mov (not xor) so CF from the add survives
    adc     rbx,rdx             ; next carry = high limb + CF
    inc     r12                 ; next iteration
    jnz     .0
    mov     [rdi],rbx           ; store final carry limb in r[xn]
    mov     rdx,y_len           ; done if y_len is 1
    dec     rdx                 ; rdx = yn - 1 = limbs of y[] still to process
    jnz     .1                  ; more to do
    jmp     L_exit

.1: cmp     x_len,UNROLL_THRESHOLD  ; unroll if many loops
    jae     L_unroll
    ; Simple (not unrolled) path: for each remaining limb of y[], add
    ; x[] * y[j] into r[] one limb further up than the previous pass.
    lea     y_ptr,[y_ptr+rdx*8+8]   ; y_ptr = &y[yn], one past the last limb of y[]
    neg     x_len                   ; negative counter for x[n]
    neg     rdx                     ; negative counter for y[n]
    mov     rax,[rsi+x_len*8]       ; x[0] -> rax
    mov     y_len,rdx               ; now -(y_len - 1)
    inc     x_len                   ; negative counter for x[1]
    xor     rbx,rbx                 ; for carry
    mov     rcx,x_len               ; now -(x_len - 1) -> rcx (was r_ptr)
    mov     rbp,[y_ptr+rdx*8]       ; y[1] -> rbp
    jmp     .3
.2: mov     rcx,x_len               ; restore x[] counter
    xor     rbx,rbx                 ; clear carry
    add     rdi,8                   ; increase end of r[] pointer
    mov     rbp,[y_ptr+y_len*8]     ; y[n] -> rbp
    mov     rax,[rsi+rcx*8-8]       ; x[0] -> rax
.3: mul     rbp                     ; x[m] * y[n]
    add     rbx,rax                 ; add carry
    adc     rdx,byte 0
    add     [rdi+rcx*8],rbx         ; add into r[]
    mov     rax,[rsi+rcx*8]         ; next x[m] ->rax
    adc     rdx,byte 0              ; add carry to rdx
    inc     rcx                     ; go to next limb of x[]
    mov     rbx,rdx                 ; move carry into rbx (mov keeps ZF from the inc)
    jnz     .3                      ; loop until rcx reaches zero
    mul     rbp                     ; do last limb
    add     rbx,rax                 ; propagate carry
    adc     rdx,byte  0
    add     [rdi],rbx               ; add into r[]
    adc     rdx,byte 0              ; add in any carry
    inc     y_len
    mov     [rdi+8],rdx             ; move (not add) carry into r[]
    jnz     .2                      ; go to next limb of y[] (ZF is from the inc)
    jmp     L_exit

    ; Unrolled path (xn >= UNROLL_THRESHOLD).  The inner loop at .4 handles
    ; UNROLL_COUNT limbs per pass and is entered part-way through on the
    ; first pass of each row:
    ;   v_par = (-(xn - 1)) & UNROLL_MASK   limb groups skipped on entry
    ;   v_adj = -((xn + UNROLL_COUNT - 2) & ~UNROLL_MASK)
    ;   v_xlo = x[0]
    ; The entry address is .4 + 24 * v_par, so every single-limb instruction
    ; group in the %rep body has to assemble to exactly 24 bytes.  The forced
    ; `byte` displacements and the `mov reg,dword 0` zeroing look chosen for
    ; that (and mov, unlike xor, leaves CF alone) - re-check the encodings
    ; before changing any instruction in that body.
L_unroll:
    mov     rdi,r_ptr
    mov     rcx,x_len
    mov     rsi,x_ptr
    mov     rbp,[y_ptr+8]           ; y[1]
    lea     y_ptr,[y_ptr+rdx*8+8]   ; y_ptr = &y[yn]
    neg     rdx
    mov     y_len,rdx               ; y_len = -(yn - 1), counts up to zero
    lea     rbx,[UNROLL_COUNT-2+rcx]
    dec     rcx
    mov     rax,[rsi]          ; x[0]
    and     rbx,-UNROLL_MASK-1      ; round down to a multiple of UNROLL_COUNT
    neg     rcx
    neg     rbx
    and     rcx,UNROLL_MASK
    mov     [rsp+v_par],rcx
    mov     [rsp+v_adj],rbx
    mov     rdx,rcx
    shl     rcx,3
    lea     rcx,[rcx+rcx*2]         ; rcx = 24 * v_par
    lea     v_jmp,[rel .4]
    lea     v_jmp,[v_jmp+rcx]       ; v_jmp = entry point inside the unrolled body
    neg     rdx
    mov     [rsp+v_xlo],rax
    lea     rdi,[rdi+rdx*8+8]       ; bias rdi so [rdi + 8*v_par] is r[1]
    lea     rsi,[rsi+rdx*8+8]       ; bias rsi so [rsi + 8*v_par] is x[1]
    jmp     .3
    ; Start of each later row: reload v_adj and x[0], undo the advance made
    ; by the unrolled loop and move the r[] window up by one limb.
.2: mov     rbx,[rsp+v_adj]
    mov     rax,[rsp+v_xlo]
    lea     rdi,[rdi+rbx*8+8]
    lea     rsi,[rsi+rbx*8]
    mov     rbp,[y_ptr+y_len*8]     ; next limb of y[]
.3: mul     rbp                     ; x[0] * y[n]
    sar     rbx,UNROLL_LOG2         ; v_adj / UNROLL_COUNT = -(passes through .4)
    mov     rcx,[rsp+v_par]
    mov     v_ctr,rbx
    ; test sets ZF from the parity of v_par and clears CF; the two mov
    ; instructions that follow do not alter flags.
    test    cl,1            ; low word of product + carry
    mov     rbx,dword 0     ; is in rcx on even rounds and
    mov     rcx,dword 0     ; rbx on odd rounds - we must
    cmovz   rcx,rax         ; put low word of first product
    cmovnz  rbx,rax         ; in the right register here
    jmp     v_jmp
.4:
%define CHUNK_COUNT  2
%assign i 0
%rep UNROLL_COUNT / CHUNK_COUNT
%define  disp0 8 * i * CHUNK_COUNT

    ; even limb: pending low limb is in rcx, the next one is built in rbx
    mov     rax,[byte rsi+disp0]
    adc     rbx,rdx
    mul     rbp
    add     [byte rdi+disp0],rcx
    mov     rcx,dword 0
    adc     rbx,rax
    ; odd limb: roles of rbx and rcx swapped
    mov     rax,[byte rsi+disp0+8]
    adc     rcx,rdx
    mul     rbp
    add     [byte rdi+disp0+8],rbx
    mov     rbx,dword 0
    adc     rcx,rax

%assign  i i + 1
%endrep

    ; inc and lea do not modify CF, so the carry chain continues into the
    ; next pass; jnz tests ZF from the inc.
    inc     v_ctr
    lea     rsi,[UNROLL_BYTES+rsi]
    lea     rdi,[UNROLL_BYTES+rdi]
    jnz     .4

    adc     rdx,byte 0              ; fold the pending carry into the high limb
    add     [rdi],rcx               ; add last low limb into r[]
    adc     rdx,byte 0
    inc     y_len
    mov     [rdi+8],rdx             ; store (not add) the top limb of this row
    jnz     .2                      ; next limb of y[] (ZF is from the inc)
L_exit:
    epilogue reg_save_list, 3

    end
Update some text in Windows x64 assembler files (no code changes). 2008-12-24 03:43:03 -05:00			`; YASM translation of code provided by P. Gaudry for AMD64, converted`
			`; by Brian Gladman.`
			`;`
Set native line endings for all .c, .h, as, .asm, .s, .in, .m4, .cc, am 2008-06-25 03:33:36 -04:00			`; Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.`
			`;`
			`; This file is part of the GNU MP Library.`
			`;`
			`; The GNU MP Library is free software; you can redistribute it and/or`
			`; modify it under the terms of the GNU Lesser General Public License as`
			`; published by the Free Software Foundation; either version 2.1 of the`
			`; License, or (at your option) any later version.`
			`;`
			`; The GNU MP Library is distributed in the hope that it will be useful,`
			`; but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`; Lesser General Public License for more details.`
			`;`
			`; You should have received a copy of the GNU Lesser General Public`
			`; License along with the GNU MP Library; see the file COPYING.LIB. If`
			`; not, write to the Free Software Foundation, Inc., 59 Temple Place -`
			`; Suite 330, Boston, MA 02111-1307, USA.`
			`;`
			`; Adapted by Brian Gladman AMD64 using the Microsoft VC++ v8 64-bit`
			`; compiler and the YASM assembler.`

			`; AMD64 mpn_mul_basecase -- multiply two mpn numbers.`
			`;`
			`; Calling interface:`
			`;`
			`; void __gmpn_mul_basecase(`
			`; mp_ptr rp, rcx`
			`; mp_srcptr xp, rdx`
			`; mp_size_t xn, r8`
			`; mp_srcptr yp, r9`
			`; mp_size_t yn [rsp+0x28] as a * 32-bit * word`
			`; )`
			`;`
			`; Multiply xp[xn] by yp[yn] and write the result to rp[un+vn] with xn >= yn on`
			`; entry.`
			`;`
			`; This is an SEH Frame Function with a leaf prologue`

			`%include "..\x86_64_asm.inc"`

			`%define reg_save_list rbx, rsi, rdi, rbp, r12`

			`%define UNROLL_LOG2 4`
			`%define UNROLL_COUNT (1 << UNROLL_LOG2)`
			`%define UNROLL_MASK (UNROLL_COUNT - 1)`
			`%define UNROLL_BYTES (8 * UNROLL_COUNT)`
			`%define UNROLL_THRESHOLD 5`

			`bits 64`
			`section .text`

			`%define v_par 16`
			`%define v_adj 8`
			`%define v_xlo 0`
			`%define v_len 24`

			`%define r_ptr rcx`
			`%define x_ptr r11`
			`%define x_len r8`
			`%define y_ptr r9`
			`%define y_len r10`

			`%define v_ctr r8 ; x_len reused`
			`%define v_jmp r11 ; x_ptr reused`

			`global __gmpn_mul_basecase`

			`%ifdef DLL`
			`export __gmpn_mul_basecase`
			`%endif`

			`__gmpn_mul_basecase:`
			`movsxd x_len,r8d`
			`mov rax,[y_ptr]`
			`cmp x_len,2`
			`ja mul_m_by_n`
			`je mul_2_by_n`
			`mul qword [rdx]`
			`mov [r_ptr],rax`
			`mov [r_ptr+8],rdx`
			`ret`

			`mul_2_by_n:`
			`movsxd r10,dword[rsp+0x28] ; load as a 32-bit integer`
			`mov x_ptr,rdx`
			`dec qword y_len`
			`jnz mul_2_by_2`
			`mov r8,rax ; y[0] -> r8 (was x_len)`
			`mov rax,[x_ptr]`
			`mul r8`
			`mov [r_ptr],rax`
			`mov rax,[x_ptr+8]`
			`mov r9,rdx ; carry -> r9 (was y_ptr)`
			`mul r8`
			`add r9,rax`
			`mov [r_ptr+8],r9`
			`adc rdx,y_len ; note: r10 = 0 (was y_len)`
			`mov [r_ptr+16],rdx`
			`ret`

			`mul_2_by_2: ; r8 (x_len) and r10 (y_len) free`
			`mov r10,[x_ptr] ; x[0]`
			`mul r10 ; y[0] * x[0]`
			`mov [r_ptr],rax`
			`mov r8,rdx ; cry = { 0, r8 }`
			`mov rax,[y_ptr+8] ; y[1]`
			`mul r10 ; y[1] * x[0]`
			`add r8,rax`
			`adc rdx,byte 0`
			`mov r10,[x_ptr+8] ; x[1] - r11 (x_ptr) now free`
			`mov r11,rdx ; cry = { r11, r8 }`
			`mov rax,[y_ptr] ; y[0]`
			`mul r10 ; y[0] * x[1]`
			`add r8,rax`
			`adc r11,rdx`
			`mov [r_ptr+8],r8`
			`mov r8,dword 0`
			`adc r8,r8 ; cry = { r8, r11 }`
			`mov rax,[y_ptr+8] ; y[1]`
			`mul r10 ; x[1] * y[1]`
			`add rax,r11`
			`adc rdx,r8`
			`mov [r_ptr+16],rax`
			`mov [r_ptr+24],rdx`
			`ret`

			`; do first multiply of y[0] * x[n] as it can simply be stored`

			`mul_m_by_n:`
			`mov r10d, dword[rsp+0x28] ; load as a 32-bit integer`

			`prologue fmul_m_by_n, reg_save_list, 3`
			`mov x_ptr,rdx`
			`mov r12,x_len`
			`mov rbp,rax ; y[0] -> rbp`
			`xor rbx,rbx ; for carry`
			`lea rsi,[x_ptr+r12*8] ; past end of x[]`
			`lea rdi,[r_ptr+r12*8] ; past end of r[]`
			`neg r12`
			`.0: mov rax,[rsi+r12*8] ; x[n]`
			`mul rbp ; x[n] * y[0]`
			`add rax,rbx ; add carry from previous round`
			`mov [rdi+r12*8],rax ; store r[n]`
			`mov rbx,dword 0 ; propagate carry`
			`adc rbx,rdx`
			`inc r12 ; next iteration`
			`jnz .0`
			`mov [rdi],rbx ; store final digit in carry`
			`mov rdx,y_len ; done if y_len is 1`
			`dec rdx`
			`jnz .1 ; more to do`
			`jmp L_exit`

			`.1: cmp x_len,UNROLL_THRESHOLD ; unroll if many loops`
			`jae L_unroll`
			`lea y_ptr,[y_ptr+rdx*8+8] ; pointer to end limb of y[]`
			`neg x_len ; negative counter for x[n]`
			`neg rdx ; negative counter for y[n]`
			`mov rax,[rsi+x_len*8] ; x[0] -> rax`
			`mov y_len,rdx ; now -(y_len - 1)`
			`inc x_len ; negative counter for x[1]`
			`xor rbx,rbx ; for carry`
			`mov rcx,x_len ; now -(x_len - 1) -> rcx (was r_ptr)`
			`mov rbp,[y_ptr+rdx*8] ; y[n] -> rbp`
			`jmp .3`
			`.2: mov rcx,x_len ; restore x[] counter`
			`xor rbx,rbx ; clear carry`
			`add rdi,8 ; increase end of r[] pointer`
			`mov rbp,[y_ptr+y_len*8] ; y[n] -> rbp`
			`mov rax,[rsi+rcx*8-8] ; x[m] -> rax`
			`.3: mul rbp ; x[m] * y[n]`
			`add rbx,rax ; add carry`
			`adc rdx,byte 0`
			`add [rdi+rcx*8],rbx ; add into r[]`
			`mov rax,[rsi+rcx*8] ; next x[m] ->rax`
			`adc rdx,byte 0 ; add carry to rdx`
			`inc rcx ; got to next limb of x[]`
			`mov rbx,rdx ; move carry into rbx`
			`jnz .3 ; got to next limb of x[]`
			`mul rbp ; do last limb`
			`add rbx,rax ; propagate carry`
			`adc rdx,byte 0`
			`add [rdi],rbx ; add into r[]`
			`adc rdx,byte 0 ; add add in any carry`
			`inc y_len`
			`mov [rdi+8],rdx ; move (not add) carry into r[]`
			`jnz .2 ; go to next limb of y[]`
			`jmp L_exit`

			`L_unroll:`
			`mov rdi,r_ptr`
			`mov rcx,x_len`
			`mov rsi,x_ptr`
			`mov rbp,[y_ptr+8]`
			`lea y_ptr,[y_ptr+rdx*8+8]`
			`neg rdx`
			`mov y_len,rdx`
			`lea rbx,[UNROLL_COUNT-2+rcx]`
			`dec rcx`
			`mov rax,[rsi] ; x[0]`
			`and rbx,-UNROLL_MASK-1`
			`neg rcx`
			`neg rbx`
			`and rcx,UNROLL_MASK`
			`mov [rsp+v_par],rcx`
			`mov [rsp+v_adj],rbx`
			`mov rdx,rcx`
			`shl rcx,3`
			`lea rcx,[rcx+rcx*2]`
			`lea v_jmp,[rel .4]`
			`lea v_jmp,[v_jmp+rcx]`
			`neg rdx`
			`mov [rsp+v_xlo],rax`
			`lea rdi,[rdi+rdx*8+8]`
			`lea rsi,[rsi+rdx*8+8]`
			`jmp .3`
			`.2: mov rbx,[rsp+v_adj]`
			`mov rax,[rsp+v_xlo]`
			`lea rdi,[rdi+rbx*8+8]`
			`lea rsi,[rsi+rbx*8]`
			`mov rbp,[y_ptr+y_len*8]`
			`.3: mul rbp`
			`sar rbx,UNROLL_LOG2`
			`mov rcx,[rsp+v_par]`
			`mov v_ctr,rbx`
			`test cl,1 ; low word of product + carry`
			`mov rbx,dword 0 ; is in rcx on even rounds and`
			`mov rcx,dword 0 ; rbx on odd rounds - we must`
			`cmovz rcx,rax ; put low word of first product`
			`cmovnz rbx,rax ; in the right register here`
			`jmp v_jmp`
			`.4:`
			`%define CHUNK_COUNT 2`
			`%assign i 0`
			`%rep UNROLL_COUNT / CHUNK_COUNT`
			`%define disp0 8 * i * CHUNK_COUNT`

			`mov rax,[byte rsi+disp0]`
			`adc rbx,rdx`
			`mul rbp`
			`add [byte rdi+disp0],rcx`
			`mov rcx,dword 0`
			`adc rbx,rax`
			`mov rax,[byte rsi+disp0+8]`
			`adc rcx,rdx`
			`mul rbp`
			`add [byte rdi+disp0+8],rbx`
			`mov rbx,dword 0`
			`adc rcx,rax`

			`%assign i i + 1`
			`%endrep`

			`inc v_ctr`
			`lea rsi,[UNROLL_BYTES+rsi]`
			`lea rdi,[UNROLL_BYTES+rdi]`
			`jnz .4`

			`adc rdx,byte 0`
			`add [rdi],rcx`
			`adc rdx,byte 0`
			`inc y_len`
			`mov [rdi+8],rdx`
			`jnz .2`
			`L_exit:`
			`epilogue reg_save_list, 3`

			`end`