mpir/mpn/x86_64i/amd64/mul_1.asm

; Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
;
; This file is part of the GNU MP Library.
;
; The GNU MP Library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public License as
; published by the Free Software Foundation; either version 2.1 of the
; License, or (at your option) any later version.
;
; The GNU MP Library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with the GNU MP Library; see the file COPYING.LIB. If
; not, write to the Free Software Foundation, Inc., 59 Temple Place -
; Suite 330, Boston, MA 02111-1307, USA.
;
; Adapted for AMD64 by Brian Gladman, using the Microsoft VC++ v8 64-bit
; compiler and the YASM assembler.
; AMD64 mpn_mul_1 -- mpn by limb multiply.
;
; Calling interface:
;   mp_limb_t mpn_mul_1 (
;       mp_ptr    dst,           rcx
;       mp_srcptr src,           rdx
;       mp_size_t size,          r8
;       mp_limb_t multiplier     r9
;   )
;
;   mp_limb_t mpn_mul_1c (
;       mp_ptr    dst,           rcx
;       mp_srcptr src,           rdx
;       mp_size_t size,          r8
;       mp_limb_t multiplier,    r9
;       mp_limb_t carry          [rsp+0x28]
;   )
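;
; (Microsoft x64 calling convention: the first four integer arguments are
; passed in rcx, rdx, r8 and r9; the fifth argument of mpn_mul_1c() sits on
; the stack at [rsp+0x28], above the 0x20 bytes of shadow space and the
; 8-byte return address.)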
;
; Multiply the size limbs at src by the multiplier, store the low size limbs
; of the result at dst, and return the carry limb from the top of the result.
;
; mpn_mul_1c() accepts an initial carry for the calculation; it is added
; into the low limb of the destination.
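;
; For reference, a rough C equivalent of the routine (a sketch only, not
; part of the build, assuming 64-bit limbs and a compiler that provides
; unsigned __int128):
;
;   mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
;                         mp_limb_t multiplier, mp_limb_t carry)
;   {
;       mp_size_t i;
;       for (i = 0; i < size; i++)
;       {
;           unsigned __int128 p = (unsigned __int128) src[i] * multiplier
;                                                   + carry;
;           dst[i] = (mp_limb_t) p;           /* low limb of the product   */
;           carry = (mp_limb_t) (p >> 64);    /* high limb carries onward  */
;       }
;       return carry;                         /* carry out of the top limb */
;   }
;
;   mpn_mul_1() is identical with an initial carry of zero.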
;
; Maximum possible UNROLL_COUNT with the current code is 16: with 8-byte
; limbs the check below rejects UNROLL_BYTES of 256 or more, since the
; unrolled loop relies on signed byte displacements.
;
; This is an SEH leaf function (no unwind support needed: it makes no calls,
; does not modify rsp and uses no nonvolatile registers).
%if 1 ; select the unrolled implementation; set to 0 for the simple loop below
%define dst rcx
%define len r8
%define mlt r9
%define cry [rsp+0x28]
%define src r10 ; from rdx on input
%define UNROLL_LOG2 4
%define UNROLL_COUNT (1 << UNROLL_LOG2)
%define UNROLL_MASK (UNROLL_COUNT - 1)
%define UNROLL_BYTES 8 * UNROLL_COUNT
%if UNROLL_BYTES >= 256
%error unroll count is too large
%elif UNROLL_BYTES >= 128
%define off 128
%else
%define off 0
%endif
%define UNROLL_THRESHOLD 7
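;
; With the settings above, UNROLL_COUNT = 16, UNROLL_BYTES = 128 and
; off = 128, so the displacements in the unrolled loop below run from
; 8*0 - 128 = -128 for the first store up to 8*15 - 128 + 8 = 0 for the
; last load, all of which fit in a signed byte.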
bits 64
section .text
global __gmpn_mul_1
global __gmpn_mul_1c
%ifdef DLL
export __gmpn_mul_1
export __gmpn_mul_1c
%endif
__gmpn_mul_1c:
mov r11,[rsp+0x28]          ; initial carry (fifth argument, on the stack)
jmp start
__gmpn_mul_1:
xor r11,r11                 ; no initial carry
start:
movsxd len,r8d              ; sign-extend the 32-bit size argument
mov src,rdx                 ; move src out of rdx, which mul overwrites
cmp len,UNROLL_THRESHOLD
jae .1                      ; sizes of UNROLL_THRESHOLD or more use the unrolled loop
lea src,[src+len*8]         ; point past the ends of src and dst and
lea dst,[dst+len*8]         ; count up from -size towards zero
neg len
.0: mov rax,[src+len*8]
mul mlt                     ; rdx:rax = src limb * multiplier
add rax,r11                 ; add the carry-in
mov r11,dword 0             ; clear r11 without disturbing the carry flag
adc r11,rdx                 ; carry-out for the next iteration
mov [dst+len*8],rax
inc len
jnz .0
mov rax,r11                 ; return the final carry
ret
; The mov to load the next source limb is done well ahead of the mul; this
; is necessary for full speed. It leads to one limb being handled separately
; after the loop.
;
; When unrolling to 32 or more, an offset on the src pointer would be needed
; to avoid a 0x80 displacement in the code for the last limb of the unrolled
; loop; with the present maximum of 16 this does not arise.
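;
; Entry into the unrolled loop is by computed jump: each block emitted by
; the %rep below assembles to exactly 20 bytes (the byte displacements fix
; the encoding), so the entry address is .3 plus 20 times the entry block
; index. The shl by 4 produces 16*len and the following lea adds a further
; 4*len to make the required 20*len.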
.1: lea rax,[len-2]
dec len
neg len
shr rax,UNROLL_LOG2         ; number of unrolled passes, less one
and len,UNROLL_MASK         ; entry block index into the unrolled loop
mov [rsp+0x08],rax          ; loop count in shadow space
mov rdx,len
shl rdx,4                   ; 16 * len
lea rax,[rel .3]
lea rdx,[rdx+len*4]         ; plus 4 * len gives 20 * len, the entry offset
lea rdx,[rdx+rax]           ; absolute entry address in the unrolled loop
mov rax,[src]               ; first src limb, loaded ahead of its mul
neg len
lea src,[src+len*8+off]     ; bias the pointers so the entry block
lea dst,[dst+len*8+off]     ; addresses the first limbs
xor len,len                 ; len now zero
jmp rdx
.3:
%assign i 0
%rep UNROLL_COUNT
%define disp 8 * i - off
mul mlt                     ; 20 bytes per block
add r11,rax
mov rax,[byte src+disp+8]   ; next limb, loaded well ahead of its mul
mov [byte dst+disp],r11
mov r11,len                 ; len is zero: clear r11 without touching carry
adc r11,rdx
%assign i i + 1
%endrep
dec dword [rsp+0x08]        ; one pass done; counter in shadow space
lea src,[src+UNROLL_BYTES]
lea dst,[dst+UNROLL_BYTES]
jns .3
mul mlt                     ; last limb, loaded at the bottom of the loop
add r11,rax
mov rax,len                 ; len is still zero
mov [dst-off],r11
adc rax,rdx                 ; return the final carry in rax
ret
%else
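; Alternative simple (non-unrolled) implementation, assembled only if the
; %if 1 at the top of the file is changed to %if 0; kept for reference.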
bits 64
section .text
global __gmpn_mul_1
global __gmpn_mul_1c
%ifdef DLL
export __gmpn_mul_1
export __gmpn_mul_1c
%endif
__gmpn_mul_1c:
mov r11, [rsp+0x28]         ; initial carry (fifth argument, on the stack)
jmp start
align 16
nop
nop
__gmpn_mul_1:
xor r11, r11                ; no initial carry
start:
lea r10, [rdx+8*r8]         ; point past the end of src
lea rcx, [rcx+8*r8]         ; point past the end of dst
neg r8                      ; count up from -size towards zero
.1: mov rax, [r10+8*r8]
mul r9                      ; rdx:rax = src limb * multiplier
add rax, r11                ; add the carry-in
mov r11d, 0                 ; clear r11 without disturbing the carry flag
adc r11, rdx                ; carry-out for the next iteration
mov [rcx+8*r8], rax
inc r8
jne .1
mov rax, r11                ; return the final carry
ret
%endif
end