Correct errors in the Broadwell assembler code for Windows x64 for mpn_and_n, mpn_andn_n and mpn_iorn_n

This commit is contained in:
Brian Gladman 2019-05-02 22:12:12 +01:00
parent 331789ca2f
commit ed5498adeb
3 changed files with 48 additions and 46 deletions

View File

@ -63,8 +63,10 @@
%define Src2P R8
%define Size R9
%define SizeD R9D
%define SizeB R9B
%define Count RAX
%define CountD EAX
%define CountB AL
%define Limb0 R10
%define Limb0D R10D
%define QLimb0 YMM0

View File

@ -16,7 +16,7 @@
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
; Boston, MA 02110-1301, USA.
; (rdi,rcx) = not(rsi,rcx) and (rdx,rcx)
; (rdi,rcx) = (rsi,rcx) and not (rdx,rcx)
; There is no initial pointer alignment lead in code below. The argument
; why not is based on some statistical reasoning and measurement points.
@ -89,21 +89,21 @@ LEAF_PROC mpn_andn_n
.Loop:
vmovdqu QLimb0, [Src1P]
vpandn QLimb0, QLimb0, [Src2P]
vmovdqu QLimb0, [Src2P]
vpandn QLimb0, QLimb0, [Src1P]
vmovdqu [ResP], QLimb0
vmovdqu QLimb0, [Src1P+32]
vpandn QLimb0, QLimb0, [Src2P+32]
vmovdqu QLimb0, [Src2P+32]
vpandn QLimb0, QLimb0, [Src1P+32]
vmovdqu [ResP+32], QLimb0
vmovdqu QLimb0, [Src1P+64]
vpandn QLimb0, QLimb0, [Src2P+64]
vmovdqu QLimb0, [Src2P+64]
vpandn QLimb0, QLimb0, [Src1P+64]
vmovdqu [ResP+64], QLimb0
vmovdqu QLimb0, [Src1P+96]
vpandn QLimb0, QLimb0, [Src2P+96]
vmovdqu QLimb0, [Src2P+96]
vpandn QLimb0, QLimb0, [Src1P+96]
vmovdqu [ResP+96], QLimb0
lea Src1P, [Src1P+Limb0]
lea Src2P, [Src2P+Limb0]
lea Src1P, [Src1P+Limb0]
lea ResP, [ResP+Limb0]
add Size, 4
@ -120,28 +120,28 @@ LEAF_PROC mpn_andn_n
.PostAVX3:
add Limb0, 32
vmovdqu QLimb0, [Src1P+64]
vpandn QLimb0, QLimb0, [Src2P+64]
vmovdqu QLimb0, [Src2P+64]
vpandn QLimb0, QLimb0, [Src1P+64]
vmovdqu [ResP+64], QLimb0
.PostAVX2:
add Limb0, 32
vmovdqu QLimb0, [Src1P+32]
vpandn QLimb0, QLimb0, [Src2P+32]
vmovdqu QLimb0, [Src2P+32]
vpandn QLimb0, QLimb0, [Src1P+32]
vmovdqu [ResP+32], QLimb0
.PostAVX1:
add Limb0, 32
vmovdqu QLimb0, [Src1P]
vpandn QLimb0, QLimb0, [Src2P]
vmovdqu QLimb0, [Src2P]
vpandn QLimb0, QLimb0, [Src1P]
vmovdqu [ResP], QLimb0
.PostAVX0:
add Src1P, Limb0
add Src2P, Limb0
add Src1P, Limb0
add ResP, Limb0
add Count, 4
@ -154,20 +154,20 @@ LEAF_PROC mpn_andn_n
.PostGPR3:
mov Limb0, [Src1P+16]
andn Limb0, Limb0, [Src2P+16]
mov Limb0, [Src2P+16]
andn Limb0, Limb0, [Src1P+16]
mov [ResP+16], Limb0
.PostGPR2:
mov Limb0, [Src1P+8]
andn Limb0, Limb0, [Src2P+8]
mov Limb0, [Src2P+8]
andn Limb0, Limb0, [Src1P+8]
mov [ResP+8], Limb0
.PostGPR1:
mov Limb0, [Src1P]
andn Limb0, Limb0, [Src2P]
mov Limb0, [Src2P]
andn Limb0, Limb0, [Src1P]
mov [ResP], Limb0
.Exit:

View File

@ -16,7 +16,7 @@
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
; Boston, MA 02110-1301, USA.
; (rdi,rcx) = not(rsi,rcx) and (rdx,rcx)
; (rdi,rcx) = (rsi,rcx) or not (rdx,rcx)
; There is no initial pointer alignment lead in code below. The argument
; why not is based on some statistical reasoning and measurement points.
@ -92,21 +92,21 @@ LEAF_PROC mpn_iorn_n
.Loop:
vpxor QLimb0, QLimb1, [Src1P]
vpor QLimb0, QLimb0, [Src2P]
vpxor QLimb0, QLimb1, [Src2P]
vpor QLimb0, QLimb0, [Src1P]
vmovdqu [ResP], QLimb0
vpxor QLimb0, QLimb1, [Src1P+32]
vpor QLimb0, QLimb0, [Src2P+32]
vpxor QLimb0, QLimb1, [Src2P+32]
vpor QLimb0, QLimb0, [Src1P+32]
vmovdqu [ResP+32], QLimb0
vpxor QLimb0, QLimb1, [Src1P+64]
vpor QLimb0, QLimb0, [Src2P+64]
vpxor QLimb0, QLimb1, [Src2P+64]
vpor QLimb0, QLimb0, [Src1P+64]
vmovdqu [ResP+64], QLimb0
vpxor QLimb0, QLimb1, [Src1P+96]
vpor QLimb0, QLimb0, [Src2P+96]
vpxor QLimb0, QLimb1, [Src2P+96]
vpor QLimb0, QLimb0, [Src1P+96]
vmovdqu [ResP+96], QLimb0
lea Src1P, [Src1P+Limb0]
lea Src2P, [Src2P+Limb0]
lea Src1P, [Src1P+Limb0]
lea ResP, [ResP+Limb0]
add Size, 4
@ -123,28 +123,28 @@ LEAF_PROC mpn_iorn_n
.PostAVX3:
add Limb0, 32
vpxor QLimb0, QLimb1, [Src1P+64]
vpor QLimb0, QLimb0, [Src2P+64]
vpxor QLimb0, QLimb1, [Src2P+64]
vpor QLimb0, QLimb0, [Src1P+64]
vmovdqu [ResP+64], QLimb0
.PostAVX2:
add Limb0, 32
vpxor QLimb0, QLimb1, [Src1P+32]
vpor QLimb0, QLimb0, [Src2P+32]
vpxor QLimb0, QLimb1, [Src2P+32]
vpor QLimb0, QLimb0, [Src1P+32]
vmovdqu [ResP+32], QLimb0
.PostAVX1:
add Limb0, 32
vpxor QLimb0, QLimb1, [Src1P]
vpor QLimb0, QLimb0, [Src2P]
vpxor QLimb0, QLimb1, [Src2P]
vpor QLimb0, QLimb0, [Src1P]
vmovdqu [ResP], QLimb0
.PostAVX0:
add Src1P, Limb0
add Src2P, Limb0
add Src1P, Limb0
add ResP, Limb0
add Count, 4
@ -157,23 +157,23 @@ LEAF_PROC mpn_iorn_n
.PostGPR3:
mov Limb0, [Src1P+16]
mov Limb0, [Src2P+16]
not Limb0
or Limb0, [Src2P+16]
or Limb0, [Src1P+16]
mov [ResP+16], Limb0
.PostGPR2:
mov Limb0, [Src1P+8]
mov Limb0, [Src2P+8]
not Limb0
or Limb0, [Src2P+8]
or Limb0, [Src1P+8]
mov [ResP+8], Limb0
.PostGPR1:
mov Limb0, [Src1P]
mov Limb0, [Src2P]
not Limb0
or Limb0, [Src2P]
or Limb0, [Src1P]
mov [ResP], Limb0
.Exit: