; mpir/mpn/x86_64/haswell/rshift1.as
;
; 262 lines
; 6.6 KiB
; ActionScript

; Copyright 2016 Jens Nurmann and Alexander Kruppa
; This file is part of the MPIR Library.
; The MPIR Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published
; by the Free Software Foundation; either version 2.1 of the License, or (at
; your option) any later version.
; The MPIR Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
; License for more details.
; You should have received a copy of the GNU Lesser General Public License
; along with the MPIR Library; see the file COPYING.LIB. If not, write
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
; Boston, MA 02110-1301, USA.
; mp_limb_t mpn_rshift1(mp_ptr Op2, mp_srcptr Op1, mp_size_t Size1)
;        return  Op2  Op1  Size1
; Linux  RAX     RDI  RSI  RDX
; Win7   RAX     RCX  RDX  R8
;
; Description:
; The function shifts Op1 right by one bit, stores the result in Op2 (non-
; destructive shr) and returns the shifted-out least significant bit of Op1
; in bit 63 of the result (zero if Size1 is zero).
; The function works through memory in increasing address order, which
; supports in-place shifts.
;
; Caveats:
; - the AVX version uses mnemonics only available on Haswell, Broadwell and
;   Skylake cores
; - the behaviour of cache prefetching in combination with AVX shifting seems
;   somewhat erratic:
;     - slight (a few clock cycles) degradation for 1/2 LD1$ sizes
;     - slight (a few percent) improvement for full LD1$ sizes
;     - substantial (>10%) improvement for 1/2 LD2$ sizes
;     - slight (a few percent) improvement for full LD2$ sizes
;     - slight (a few percent) degradation for 1/2 LD3$ sizes
;     - substantial (around 10%) degradation for full LD3$ sizes
;
; Comments:
; - AVX based version implemented, tested & benched on 21.02.2016 by jn
; - includes cache prefetching
%include 'yasm_mac.inc'
BITS 64
; Register roles.  Only the argument registers and the two scalar scratch
; limbs depend on the calling convention; every register named here is
; volatile (caller-saved) in the respective ABI, so nothing has to be saved.
%ifdef USE_WIN64
%define Op2     RCX             ; destination limb pointer
%define Op1     RDX             ; source limb pointer
%define Size1   R8              ; limb count
%define Limb1   R9              ; scalar scratch limb
%define Limb2   R10             ; scalar scratch limb
%else
%define Op2     RDI             ; destination limb pointer
%define Op1     RSI             ; source limb pointer
%define Size1   RDX             ; limb count
%define Limb1   R8              ; scalar scratch limb
%define Limb2   R9              ; scalar scratch limb
%endif

; Definitions shared by both calling conventions.
%define Offs    512             ; prefetch displacement in bytes; a direct
                                ; definition, so no further register is used
%define QLimb0  YMM0            ; quad-limb input, even half of the loop
%define QLimb1  YMM1            ; quad-limb input, odd half of the loop
%define ShrQL0  YMM2            ; QLimb0 shifted right by 1
%define ShlQL0  YMM3            ; QLimb0 shifted left by 63
%define ShrQL1  YMM4            ; QLimb1 shifted right by 1
%define ShlQL1  YMM5            ; QLimb1 shifted left by 63
%define ShrDL0  XMM2            ; Attn: must be the low half of ShrQL0
%define ShlDL0  XMM3            ; Attn: must be the low half of ShlQL0
; ---------------------------------------------------------------------------
; mp_limb_t mpn_rshift1(mp_ptr Op2, mp_srcptr Op1, mp_size_t Size1)
;
; In:    Op2   = destination limb pointer
;        Op1   = source limb pointer
;        Size1 = number of limbs
; Out:   RAX   = bit 0 of the first source limb, moved to bit 63
;                (0 when Size1 is zero; nothing is stored in that case)
; Clobb: Op2, Op1, Size1, Limb1, Limb2, YMM0-YMM5, flags
;
; Scalar code invariant: one limb is always "pending" - loaded into Limb1
; (Limb1 and Limb2 alternate inside the unrolled pieces) but not yet stored,
; because its top bit comes from the next limb.  Size1 counts the limbs that
; still have to be loaded after the pending one.
; ---------------------------------------------------------------------------
    align       32
GLOBAL_FUNC mpn_rshift1
    xor         EAX, EAX                ; result for Size1 = 0
    test        Size1, Size1            ; zero check without writing Size1
    je          .Exit

    mov         RAX, [Op1]
    mov         Limb1, RAX              ; pending limb = Op1[0]
    shl         RAX, 63                 ; shifted-out bit -> bit 63 of result

    sub         Size1, 1                ; Size1 = limbs left after pending one
    je          .lShr1EquPost           ; Size1=1 =>

    ; The AVX path consumes up to 3 limbs for alignment, 3 for the first
    ; quad-limb and 1 after the loop, so 8 remaining limbs are always enough.
    cmp         Size1, 8
    jb          .lShr1EquFour           ; AVX inefficient =>

    ; First align Op2 to 32 bytes for the aligned vector stores below.  Only
    ; address bits 3 and 4 are examined, i.e. Op2 is taken to be 8-byte
    ; aligned on entry.
    test        Op2, 8
    je          .lShr1EquAlign16

    mov         Limb2, [Op1+8]
    shrd        Limb1, Limb2, 1
    mov         [Op2], Limb1
    mov         Limb1, Limb2            ; new pending limb
    add         Op1, 8
    add         Op2, 8
    sub         Size1, 1

.lShr1EquAlign16:
    test        Op2, 16
    je          .lShr1EquAVX

    mov         Limb2, [Op1+8]
    shrd        Limb1, Limb2, 1
    mov         [Op2], Limb1
    mov         Limb1, [Op1+16]
    shrd        Limb2, Limb1, 1
    mov         [Op2+8], Limb2
    add         Op1, 16
    add         Op2, 16
    sub         Size1, 2

.lShr1EquAVX:
    ; Pre-fetch first quad-limb: the pending limb (still at [Op1]) plus the
    ; next three.
    vmovdqu     QLimb0, [Op1]
    vpsllq      ShlQL0, QLimb0, 63
    add         Op1, 32
    sub         Size1, 4                ; now Size1 = (limbs not yet loaded) - 1
    jmp         .lShr1EquAVXCheck

    ; main loop (prefetching enabled, unloaded data cache)
    ; - 0.60      cycles per limb in LD1$
    ; - 0.60-0.75 cycles per limb in LD2$
    ; - 0.75-1.00 cycles per limb in LD3$
    ;
    ; Each pass stores two quad-limbs.  For a quad-limb Q followed by N:
    ;   Shl    = Q << 63 per limb (bit 0 of every limb moved to bit 63)
    ;   blend  : low qword of Shl replaced by low qword of (N << 63)
    ;   permq  : qwords rotated down by one -> [Q1, Q2, Q3, N0] << 63
    ;   result = (Q >> 1) | rotated Shl
    align       16
.lShr1EquAVXLoop:
%ifdef USE_PREFETCH
    prefetchnta [Op1+Offs]
%endif
    vmovdqu     QLimb1, [Op1]
    vpsrlq      ShrQL0, QLimb0, 1
    vmovdqu     QLimb0, [Op1+32]
    vpsllq      ShlQL1, QLimb1, 63
    vpblendd    ShlQL0, ShlQL0, ShlQL1, 00000011b
    vpermq      ShlQL0, ShlQL0, 00111001b
    vpor        ShrQL0, ShrQL0, ShlQL0
    vpsrlq      ShrQL1, QLimb1, 1
    vpsllq      ShlQL0, QLimb0, 63
    vpblendd    ShlQL1, ShlQL1, ShlQL0, 00000011b
    vpermq      ShlQL1, ShlQL1, 00111001b
    vmovdqa     [Op2], ShrQL0
    vpor        ShrQL1, ShrQL1, ShlQL1
    vmovdqa     [Op2+32], ShrQL1
    add         Op1, 64
    add         Op2, 64

.lShr1EquAVXCheck:
    sub         Size1, 8                ; another 8 limbs beyond the spare one?
    jae         .lShr1EquAVXLoop        ; no borrow => run the loop body

    ; One quad-limb is still held in QLimb0.  The limb at [Op1] supplies its
    ; top bit and becomes the pending limb of the scalar tail.
    mov         Limb2, [Op1]
    mov         Limb1, Limb2
    shl         Limb2, 63

    ; Insert the bit into the low qword of ShlQL0 with pure AVX mnemonics;
    ; ShrQL0 is free as a temporary because it is recomputed right below.
    ; The original source also carried a disabled single-instruction variant,
    ; 'pinsrq ShlDL0, Limb2, 0' (SSE4.1), annotated as generating stalls on
    ; Haswell & Broadwell (Agner Fog) when mixed into otherwise pure AVX2.
    vmovq       ShrDL0, Limb2
    vpblendd    ShlQL0, ShlQL0, ShrQL0, 00000011b

    vpsrlq      ShrQL0, QLimb0, 1
    vpermq      ShlQL0, ShlQL0, 00111001b
    vpor        ShrQL0, ShrQL0, ShlQL0
    vmovdqa     [Op2], ShrQL0
    add         Op2, 32
    add         Size1, 8                ; undo the loop-exit sub: Size1 is
                                        ; again "limbs left after pending one"

    ; shift remaining max. 7 limbs with SHRD mnemonic
.lShr1EquFour:
    add         Op1, 8                  ; Op1 -> limb after the pending one
    test        Size1, 4
    je          .lShr1EquTwo

    mov         Limb2, [Op1]
    shrd        Limb1, Limb2, 1
    mov         [Op2], Limb1
    mov         Limb1, [Op1+8]
    shrd        Limb2, Limb1, 1
    mov         [Op2+8], Limb2
    mov         Limb2, [Op1+16]
    shrd        Limb1, Limb2, 1
    mov         [Op2+16], Limb1
    mov         Limb1, [Op1+24]
    shrd        Limb2, Limb1, 1
    mov         [Op2+24], Limb2
    add         Op1, 32
    add         Op2, 32

.lShr1EquTwo:
    test        Size1, 2
    je          .lShr1EquOne

    mov         Limb2, [Op1]
    shrd        Limb1, Limb2, 1
    mov         [Op2], Limb1
    mov         Limb1, [Op1+8]
    shrd        Limb2, Limb1, 1
    mov         [Op2+8], Limb2
    add         Op1, 16
    add         Op2, 16

.lShr1EquOne:
    test        Size1, 1
    je          .lShr1EquPost

    mov         Limb2, [Op1]
    shrd        Limb1, Limb2, 1
    mov         [Op2], Limb1
    mov         Limb1, Limb2
    add         Op2, 8

.lShr1EquPost:
    shr         Limb1, 1                ; last limb: zero enters at the top
    mov         [Op2], Limb1

.Exit:
    vzeroupper                          ; executed on every path, including
                                        ; the scalar-only ones
    ret
.end: