From c81c5c1010482d247b7263844e02f7371fbd2565 Mon Sep 17 00:00:00 2001
From: Frank Denis
Date: Wed, 7 May 2014 21:16:19 -0700
Subject: [PATCH] Sync stream_salsa20_amd64_xmm6.S with SUPERCOP 2014-04-25

---
 .../amd64_xmm6/stream_salsa20_amd64_xmm6.S | 299 +++++++++---------
 1 file changed, 142 insertions(+), 157 deletions(-)

diff --git a/src/libsodium/crypto_stream/salsa20/amd64_xmm6/stream_salsa20_amd64_xmm6.S b/src/libsodium/crypto_stream/salsa20/amd64_xmm6/stream_salsa20_amd64_xmm6.S
index 940d38ca..8e29a506 100644
--- a/src/libsodium/crypto_stream/salsa20/amd64_xmm6/stream_salsa20_amd64_xmm6.S
+++ b/src/libsodium/crypto_stream/salsa20/amd64_xmm6/stream_salsa20_amd64_xmm6.S
@@ -13,15 +13,15 @@ crypto_stream_salsa20:
 _crypto_stream_salsa20:
 mov %rsp,%r11
 and $31,%r11
-add $480,%r11
+add $512,%r11
 sub %r11,%rsp
-movq %r11,352(%rsp)
-movq %r12,360(%rsp)
-movq %r13,368(%rsp)
-movq %r14,376(%rsp)
-movq %r15,384(%rsp)
-movq %rbx,392(%rsp)
-movq %rbp,400(%rsp)
+movq %r11,416(%rsp)
+movq %r12,424(%rsp)
+movq %r13,432(%rsp)
+movq %r14,440(%rsp)
+movq %r15,448(%rsp)
+movq %rbx,456(%rsp)
+movq %rbp,464(%rsp)
 mov %rsi,%r9
 mov %rdi,%rdi
 mov %rdi,%rsi
@@ -29,7 +29,6 @@ mov %rdx,%rdx
 mov %rcx,%r10
 cmp $0,%r9
 jbe ._done
-
 mov $0,%rax
 mov %r9,%rcx
 rep stosb
@@ -47,17 +46,18 @@ jmp ._start
 #endif
 crypto_stream_salsa20_xor:
 _crypto_stream_salsa20_xor:
+
 mov %rsp,%r11
 and $31,%r11
-add $480,%r11
+add $512,%r11
 sub %r11,%rsp
-movq %r11,352(%rsp)
-movq %r12,360(%rsp)
-movq %r13,368(%rsp)
-movq %r14,376(%rsp)
-movq %r15,384(%rsp)
-movq %rbx,392(%rsp)
-movq %rbp,400(%rsp)
+movq %r11,416(%rsp)
+movq %r12,424(%rsp)
+movq %r13,432(%rsp)
+movq %r14,440(%rsp)
+movq %r15,448(%rsp)
+movq %rbx,456(%rsp)
+movq %rbp,464(%rsp)
 mov %rdi,%rdi
 mov %rsi,%rsi
 mov %rdx,%r9
@@ -71,125 +71,119 @@ movl 20(%r10),%ecx
 movl 0(%r10),%r8d
 movl 0(%rdx),%eax
 movl 16(%r10),%r11d
-movl %ecx,0(%rsp)
-movl %r8d,4+0(%rsp)
-movl %eax,8+0(%rsp)
-movl %r11d,12+0(%rsp)
+movl %ecx,64(%rsp)
+movl %r8d,4+64(%rsp)
+movl %eax,8+64(%rsp)
+movl %r11d,12+64(%rsp)
 mov $0,%rcx
 movl 24(%r10),%r8d
 movl 4(%r10),%eax
 movl 4(%rdx),%edx
-movl %ecx,16(%rsp)
-movl %r8d,4+16(%rsp)
-movl %eax,8+16(%rsp)
-movl %edx,12+16(%rsp)
+movq %rcx,472(%rsp)
+movl %ecx,80(%rsp)
+movl %r8d,4+80(%rsp)
+movl %eax,8+80(%rsp)
+movl %edx,12+80(%rsp)
 movl 12(%r10),%edx
 mov $0,%rcx
 movl 28(%r10),%r8d
 movl 8(%r10),%eax
-movl %edx,32(%rsp)
-movl %ecx,4+32(%rsp)
-movl %r8d,8+32(%rsp)
-movl %eax,12+32(%rsp)
+movl %edx,96(%rsp)
+movl %ecx,4+96(%rsp)
+movl %r8d,8+96(%rsp)
+movl %eax,12+96(%rsp)
 mov $1634760805,%rdx
 mov $857760878,%rcx
 mov $2036477234,%r8
 mov $1797285236,%rax
-movl %edx,48(%rsp)
-movl %ecx,4+48(%rsp)
-movl %r8d,8+48(%rsp)
-movl %eax,12+48(%rsp)
+movl %edx,112(%rsp)
+movl %ecx,4+112(%rsp)
+movl %r8d,8+112(%rsp)
+movl %eax,12+112(%rsp)
 cmp $256,%r9
 jb ._bytesbetween1and255
-
-movdqa 48(%rsp),%xmm0
+movdqa 112(%rsp),%xmm0
 pshufd $0x55,%xmm0,%xmm1
 pshufd $0xaa,%xmm0,%xmm2
 pshufd $0xff,%xmm0,%xmm3
 pshufd $0x00,%xmm0,%xmm0
-movdqa %xmm1,64(%rsp)
-movdqa %xmm2,80(%rsp)
-movdqa %xmm3,96(%rsp)
-movdqa %xmm0,112(%rsp)
-movdqa 0(%rsp),%xmm0
-pshufd $0xaa,%xmm0,%xmm1
-pshufd $0xff,%xmm0,%xmm2
-pshufd $0x00,%xmm0,%xmm3
-pshufd $0x55,%xmm0,%xmm0
 movdqa %xmm1,128(%rsp)
 movdqa %xmm2,144(%rsp)
 movdqa %xmm3,160(%rsp)
 movdqa %xmm0,176(%rsp)
-movdqa 16(%rsp),%xmm0
+movdqa 64(%rsp),%xmm0
+pshufd $0xaa,%xmm0,%xmm1
+pshufd $0xff,%xmm0,%xmm2
+pshufd $0x00,%xmm0,%xmm3
+pshufd $0x55,%xmm0,%xmm0
+movdqa %xmm1,192(%rsp)
+movdqa %xmm2,208(%rsp)
+movdqa %xmm3,224(%rsp)
+movdqa %xmm0,240(%rsp)
+movdqa 80(%rsp),%xmm0
 pshufd $0xff,%xmm0,%xmm1
 pshufd $0x55,%xmm0,%xmm2
 pshufd $0xaa,%xmm0,%xmm0
-movdqa %xmm1,192(%rsp)
-movdqa %xmm2,208(%rsp)
-movdqa %xmm0,224(%rsp)
-movdqa 32(%rsp),%xmm0
+movdqa %xmm1,256(%rsp)
+movdqa %xmm2,272(%rsp)
+movdqa %xmm0,288(%rsp)
+movdqa 96(%rsp),%xmm0
 pshufd $0x00,%xmm0,%xmm1
 pshufd $0xaa,%xmm0,%xmm2
 pshufd $0xff,%xmm0,%xmm0
-movdqa %xmm1,240(%rsp)
-movdqa %xmm2,256(%rsp)
-movdqa %xmm0,272(%rsp)
+movdqa %xmm1,304(%rsp)
+movdqa %xmm2,320(%rsp)
+movdqa %xmm0,336(%rsp)
 ._bytesatleast256:
-movl 16(%rsp),%edx
-movl 4+32(%rsp),%ecx
-movl %edx,288(%rsp)
-movl %ecx,304(%rsp)
-add $1,%rdx
-shl $32,%rcx
-add %rcx,%rdx
+movq 472(%rsp),%rdx
 mov %rdx,%rcx
 shr $32,%rcx
-movl %edx,4+288(%rsp)
-movl %ecx,4+304(%rsp)
+movl %edx,352(%rsp)
+movl %ecx,368(%rsp)
 add $1,%rdx
-shl $32,%rcx
-add %rcx,%rdx
 mov %rdx,%rcx
 shr $32,%rcx
-movl %edx,8+288(%rsp)
-movl %ecx,8+304(%rsp)
+movl %edx,4+352(%rsp)
+movl %ecx,4+368(%rsp)
 add $1,%rdx
-shl $32,%rcx
-add %rcx,%rdx
 mov %rdx,%rcx
 shr $32,%rcx
-movl %edx,12+288(%rsp)
-movl %ecx,12+304(%rsp)
+movl %edx,8+352(%rsp)
+movl %ecx,8+368(%rsp)
 add $1,%rdx
-shl $32,%rcx
-add %rcx,%rdx
 mov %rdx,%rcx
 shr $32,%rcx
-movl %edx,16(%rsp)
-movl %ecx,4+32(%rsp)
-movq %r9,408(%rsp)
+movl %edx,12+352(%rsp)
+movl %ecx,12+368(%rsp)
+add $1,%rdx
+mov %rdx,%rcx
+shr $32,%rcx
+movl %edx,80(%rsp)
+movl %ecx,4+96(%rsp)
+movq %rdx,472(%rsp)
+movq %r9,480(%rsp)
 mov $20,%rdx
-movdqa 64(%rsp),%xmm0
-movdqa 80(%rsp),%xmm1
-movdqa 96(%rsp),%xmm2
-movdqa 256(%rsp),%xmm3
-movdqa 272(%rsp),%xmm4
-movdqa 128(%rsp),%xmm5
-movdqa 144(%rsp),%xmm6
-movdqa 176(%rsp),%xmm7
-movdqa 192(%rsp),%xmm8
-movdqa 208(%rsp),%xmm9
-movdqa 224(%rsp),%xmm10
-movdqa 304(%rsp),%xmm11
-movdqa 112(%rsp),%xmm12
-movdqa 160(%rsp),%xmm13
-movdqa 240(%rsp),%xmm14
-movdqa 288(%rsp),%xmm15
+movdqa 128(%rsp),%xmm0
+movdqa 144(%rsp),%xmm1
+movdqa 160(%rsp),%xmm2
+movdqa 320(%rsp),%xmm3
+movdqa 336(%rsp),%xmm4
+movdqa 192(%rsp),%xmm5
+movdqa 208(%rsp),%xmm6
+movdqa 240(%rsp),%xmm7
+movdqa 256(%rsp),%xmm8
+movdqa 272(%rsp),%xmm9
+movdqa 288(%rsp),%xmm10
+movdqa 368(%rsp),%xmm11
+movdqa 176(%rsp),%xmm12
+movdqa 224(%rsp),%xmm13
+movdqa 304(%rsp),%xmm14
+movdqa 352(%rsp),%xmm15
 ._mainloop1:
-movdqa %xmm1,320(%rsp)
-movdqa %xmm2,336(%rsp)
+movdqa %xmm1,384(%rsp)
+movdqa %xmm2,400(%rsp)
 movdqa %xmm13,%xmm1
 paddd %xmm12,%xmm1
 movdqa %xmm1,%xmm2
@@ -239,8 +233,8 @@ pslld $18,%xmm1
 pxor %xmm1,%xmm12
 psrld $14,%xmm2
 pxor %xmm2,%xmm12
-movdqa 320(%rsp),%xmm1
-movdqa %xmm12,320(%rsp)
+movdqa 384(%rsp),%xmm1
+movdqa %xmm12,384(%rsp)
 movdqa %xmm9,%xmm2
 paddd %xmm7,%xmm2
 movdqa %xmm2,%xmm12
@@ -255,8 +249,8 @@ pslld $7,%xmm2
 pxor %xmm2,%xmm3
 psrld $25,%xmm12
 pxor %xmm12,%xmm3
-movdqa 336(%rsp),%xmm2
-movdqa %xmm0,336(%rsp)
+movdqa 400(%rsp),%xmm2
+movdqa %xmm0,400(%rsp)
 movdqa %xmm6,%xmm0
 paddd %xmm2,%xmm0
 movdqa %xmm0,%xmm12
@@ -299,8 +293,8 @@ pslld $18,%xmm0
 pxor %xmm0,%xmm1
 psrld $14,%xmm12
 pxor %xmm12,%xmm1
-movdqa 320(%rsp),%xmm0
-movdqa %xmm1,320(%rsp)
+movdqa 384(%rsp),%xmm0
+movdqa %xmm1,384(%rsp)
 movdqa %xmm4,%xmm1
 paddd %xmm0,%xmm1
 movdqa %xmm1,%xmm12
@@ -315,8 +309,8 @@ pslld $18,%xmm1
 pxor %xmm1,%xmm2
 psrld $14,%xmm12
 pxor %xmm12,%xmm2
-movdqa 336(%rsp),%xmm12
-movdqa %xmm2,336(%rsp)
+movdqa 400(%rsp),%xmm12
+movdqa %xmm2,400(%rsp)
 movdqa %xmm14,%xmm1
 paddd %xmm12,%xmm1
 movdqa %xmm1,%xmm2
@@ -359,8 +353,8 @@ pslld $18,%xmm1
 pxor %xmm1,%xmm0
 psrld $14,%xmm2
 pxor %xmm2,%xmm0
-movdqa 320(%rsp),%xmm1
-movdqa %xmm0,320(%rsp)
+movdqa 384(%rsp),%xmm1
+movdqa %xmm0,384(%rsp)
 movdqa %xmm8,%xmm0
 paddd %xmm14,%xmm0
 movdqa %xmm0,%xmm2
@@ -375,8 +369,8 @@ pslld $7,%xmm0
 pxor %xmm0,%xmm6
 psrld $25,%xmm2
 pxor %xmm2,%xmm6
-movdqa 336(%rsp),%xmm2
-movdqa %xmm12,336(%rsp)
+movdqa 400(%rsp),%xmm2
+movdqa %xmm12,400(%rsp)
 movdqa %xmm3,%xmm0
 paddd %xmm2,%xmm0
 movdqa %xmm0,%xmm12
@@ -426,15 +420,14 @@ pslld $18,%xmm0
 pxor %xmm0,%xmm2
 psrld $14,%xmm12
 pxor %xmm12,%xmm2
-movdqa 320(%rsp),%xmm12
-movdqa 336(%rsp),%xmm0
+movdqa 384(%rsp),%xmm12
+movdqa 400(%rsp),%xmm0
 sub $2,%rdx
 ja ._mainloop1
-
-paddd 112(%rsp),%xmm12
-paddd 176(%rsp),%xmm7
-paddd 224(%rsp),%xmm10
-paddd 272(%rsp),%xmm4
+paddd 176(%rsp),%xmm12
+paddd 240(%rsp),%xmm7
+paddd 288(%rsp),%xmm10
+paddd 336(%rsp),%xmm4
 movd %xmm12,%rdx
 movd %xmm7,%rcx
 movd %xmm10,%r8
@@ -495,10 +488,10 @@ movl %edx,192(%rdi)
 movl %ecx,196(%rdi)
 movl %r8d,200(%rdi)
 movl %r9d,204(%rdi)
-paddd 240(%rsp),%xmm14
-paddd 64(%rsp),%xmm0
-paddd 128(%rsp),%xmm5
-paddd 192(%rsp),%xmm8
+paddd 304(%rsp),%xmm14
+paddd 128(%rsp),%xmm0
+paddd 192(%rsp),%xmm5
+paddd 256(%rsp),%xmm8
 movd %xmm14,%rdx
 movd %xmm0,%rcx
 movd %xmm5,%r8
@@ -559,10 +552,10 @@ movl %edx,208(%rdi)
 movl %ecx,212(%rdi)
 movl %r8d,216(%rdi)
 movl %r9d,220(%rdi)
-paddd 288(%rsp),%xmm15
-paddd 304(%rsp),%xmm11
-paddd 80(%rsp),%xmm1
-paddd 144(%rsp),%xmm6
+paddd 352(%rsp),%xmm15
+paddd 368(%rsp),%xmm11
+paddd 144(%rsp),%xmm1
+paddd 208(%rsp),%xmm6
 movd %xmm15,%rdx
 movd %xmm11,%rcx
 movd %xmm1,%r8
@@ -623,10 +616,10 @@ movl %edx,224(%rdi)
 movl %ecx,228(%rdi)
 movl %r8d,232(%rdi)
 movl %r9d,236(%rdi)
-paddd 160(%rsp),%xmm13
-paddd 208(%rsp),%xmm9
-paddd 256(%rsp),%xmm3
-paddd 96(%rsp),%xmm2
+paddd 224(%rsp),%xmm13
+paddd 272(%rsp),%xmm9
+paddd 320(%rsp),%xmm3
+paddd 160(%rsp),%xmm2
 movd %xmm13,%rdx
 movd %xmm9,%rcx
 movd %xmm3,%r8
@@ -687,33 +680,31 @@ movl %edx,240(%rdi)
 movl %ecx,244(%rdi)
 movl %r8d,248(%rdi)
 movl %r9d,252(%rdi)
-movq 408(%rsp),%r9
+movq 480(%rsp),%r9
 sub $256,%r9
 add $256,%rsi
 add $256,%rdi
 cmp $256,%r9
 jae ._bytesatleast256
-
 cmp $0,%r9
 jbe ._done
 ._bytesbetween1and255:
 cmp $64,%r9
 jae ._nocopy
-
 mov %rdi,%rdx
-leaq 416(%rsp),%rdi
+leaq 0(%rsp),%rdi
 mov %r9,%rcx
 rep movsb
-leaq 416(%rsp),%rdi
-leaq 416(%rsp),%rsi
+leaq 0(%rsp),%rdi
+leaq 0(%rsp),%rsi
 ._nocopy:
-movq %r9,408(%rsp)
-movdqa 48(%rsp),%xmm0
-movdqa 0(%rsp),%xmm1
-movdqa 16(%rsp),%xmm2
-movdqa 32(%rsp),%xmm3
+movq %r9,480(%rsp)
+movdqa 112(%rsp),%xmm0
+movdqa 64(%rsp),%xmm1
+movdqa 80(%rsp),%xmm2
+movdqa 96(%rsp),%xmm3
 movdqa %xmm1,%xmm4
 mov $20,%rcx
@@ -845,11 +836,10 @@ pxor %xmm5,%xmm0
 pshufd $0x39,%xmm3,%xmm3
 pxor %xmm6,%xmm0
 ja ._mainloop2
-
-paddd 48(%rsp),%xmm0
-paddd 0(%rsp),%xmm1
-paddd 16(%rsp),%xmm2
-paddd 32(%rsp),%xmm3
+paddd 112(%rsp),%xmm0
+paddd 64(%rsp),%xmm1
+paddd 80(%rsp),%xmm2
+paddd 96(%rsp),%xmm3
 movd %xmm0,%rcx
 movd %xmm1,%r8
 movd %xmm2,%r9
@@ -910,22 +900,17 @@ movl %ecx,60(%rdi)
 movl %r8d,44(%rdi)
 movl %r9d,28(%rdi)
 movl %eax,12(%rdi)
-movq 408(%rsp),%r9
-movl 16(%rsp),%ecx
-movl 4+32(%rsp),%r8d
+movq 480(%rsp),%r9
+movq 472(%rsp),%rcx
 add $1,%rcx
-shl $32,%r8
-add %r8,%rcx
 mov %rcx,%r8
 shr $32,%r8
-movl %ecx,16(%rsp)
-movl %r8d,4+32(%rsp)
+movl %ecx,80(%rsp)
+movl %r8d,4+96(%rsp)
+movq %rcx,472(%rsp)
 cmp $64,%r9
-
 ja ._bytesatleast65
-
 jae ._bytesatleast64
-
 mov %rdi,%rsi
 mov %rdx,%rdi
 mov %r9,%rcx
@@ -933,16 +918,16 @@ rep movsb
 ._bytesatleast64:
 ._done:
-movq 352(%rsp),%r11
-movq 360(%rsp),%r12
-movq 368(%rsp),%r13
-movq 376(%rsp),%r14
-movq 384(%rsp),%r15
-movq 392(%rsp),%rbx
-movq 400(%rsp),%rbp
+movq 416(%rsp),%r11
+movq 424(%rsp),%r12
+movq 432(%rsp),%r13
+movq 440(%rsp),%r14
+movq 448(%rsp),%r15
+movq 456(%rsp),%rbx
+movq 464(%rsp),%rbp
 add %r11,%rsp
 xor %rax,%rax
-xor %rdx,%rdx
+mov %rsi,%rdx
 ret
 ._bytesatleast65: