Sync stream_salsa20_amd64_xmm6.S with SUPERCOP 2014-04-25
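
This sync enlarges the scratch frame from 480 to 512 bytes and shifts the
whole stack layout: the partial-block buffer moves down to 0(%rsp), the
16-word Salsa20 state to 64..112(%rsp), the broadcast copies used by the
four-block SIMD path to 128..368(%rsp), and the callee-saved registers to
416..464(%rsp). It also keeps the 64-bit block counter as a single
quadword at 472(%rsp) instead of reassembling it from two 32-bit state
words around every increment. A minimal C model of that counter change
(helper names are illustrative, not part of the assembly):

    #include <stdint.h>

    /* Old scheme: rebuild the 64-bit counter from the two 32-bit state
     * words (shl $32 / add), bump it, then split it back again. */
    static uint64_t bump_split(uint32_t *lo, uint32_t *hi)
    {
        uint64_t ctr = ((uint64_t) *hi << 32) | *lo;

        ctr += 1;
        *lo = (uint32_t) ctr;
        *hi = (uint32_t) (ctr >> 32);
        return ctr;
    }

    /* New scheme: carry the counter as one 64-bit value (the assembly
     * parks it in a spare stack slot) and only derive the 32-bit halves
     * that the state block actually stores. */
    static uint64_t bump_wide(uint64_t *ctr, uint32_t *lo, uint32_t *hi)
    {
        *ctr += 1;
        *lo = (uint32_t) *ctr;
        *hi = (uint32_t) (*ctr >> 32);
        return *ctr;
    }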

Frank Denis 2014-05-07 21:16:19 -07:00
parent 353f296429
commit c81c5c1010


@@ -13,15 +13,15 @@ crypto_stream_salsa20:
 _crypto_stream_salsa20:
 mov %rsp,%r11
 and $31,%r11
-add $480,%r11
+add $512,%r11
 sub %r11,%rsp
-movq %r11,352(%rsp)
-movq %r12,360(%rsp)
-movq %r13,368(%rsp)
-movq %r14,376(%rsp)
-movq %r15,384(%rsp)
-movq %rbx,392(%rsp)
-movq %rbp,400(%rsp)
+movq %r11,416(%rsp)
+movq %r12,424(%rsp)
+movq %r13,432(%rsp)
+movq %r14,440(%rsp)
+movq %r15,448(%rsp)
+movq %rbx,456(%rsp)
+movq %rbp,464(%rsp)
 mov %rsi,%r9
 mov %rdi,%rdi
 mov %rdi,%rsi
@@ -29,7 +29,6 @@ mov %rdx,%rdx
 mov %rcx,%r10
 cmp $0,%r9
 jbe ._done
-mov $0,%rax
 mov %r9,%rcx
 rep stosb
@@ -47,17 +46,18 @@ jmp ._start
 #endif
 crypto_stream_salsa20_xor:
 _crypto_stream_salsa20_xor:
 mov %rsp,%r11
 and $31,%r11
-add $480,%r11
+add $512,%r11
 sub %r11,%rsp
-movq %r11,352(%rsp)
-movq %r12,360(%rsp)
-movq %r13,368(%rsp)
-movq %r14,376(%rsp)
-movq %r15,384(%rsp)
-movq %rbx,392(%rsp)
-movq %rbp,400(%rsp)
+movq %r11,416(%rsp)
+movq %r12,424(%rsp)
+movq %r13,432(%rsp)
+movq %r14,440(%rsp)
+movq %r15,448(%rsp)
+movq %rbx,456(%rsp)
+movq %rbp,464(%rsp)
 mov %rdi,%rdi
 mov %rsi,%rsi
 mov %rdx,%r9
@@ -71,125 +71,119 @@ movl 20(%r10),%ecx
 movl 0(%r10),%r8d
 movl 0(%rdx),%eax
 movl 16(%r10),%r11d
-movl %ecx,0(%rsp)
-movl %r8d,4+0(%rsp)
-movl %eax,8+0(%rsp)
-movl %r11d,12+0(%rsp)
+movl %ecx,64(%rsp)
+movl %r8d,4+64(%rsp)
+movl %eax,8+64(%rsp)
+movl %r11d,12+64(%rsp)
 mov $0,%rcx
 movl 24(%r10),%r8d
 movl 4(%r10),%eax
 movl 4(%rdx),%edx
-movl %ecx,16(%rsp)
-movl %r8d,4+16(%rsp)
-movl %eax,8+16(%rsp)
-movl %edx,12+16(%rsp)
+movq %rcx,472(%rsp)
+movl %ecx,80(%rsp)
+movl %r8d,4+80(%rsp)
+movl %eax,8+80(%rsp)
+movl %edx,12+80(%rsp)
 movl 12(%r10),%edx
 mov $0,%rcx
 movl 28(%r10),%r8d
 movl 8(%r10),%eax
-movl %edx,32(%rsp)
-movl %ecx,4+32(%rsp)
-movl %r8d,8+32(%rsp)
-movl %eax,12+32(%rsp)
+movl %edx,96(%rsp)
+movl %ecx,4+96(%rsp)
+movl %r8d,8+96(%rsp)
+movl %eax,12+96(%rsp)
 mov $1634760805,%rdx
 mov $857760878,%rcx
 mov $2036477234,%r8
 mov $1797285236,%rax
-movl %edx,48(%rsp)
-movl %ecx,4+48(%rsp)
-movl %r8d,8+48(%rsp)
-movl %eax,12+48(%rsp)
+movl %edx,112(%rsp)
+movl %ecx,4+112(%rsp)
+movl %r8d,8+112(%rsp)
+movl %eax,12+112(%rsp)
 cmp $256,%r9
 jb ._bytesbetween1and255
-movdqa 48(%rsp),%xmm0
+movdqa 112(%rsp),%xmm0
 pshufd $0x55,%xmm0,%xmm1
 pshufd $0xaa,%xmm0,%xmm2
 pshufd $0xff,%xmm0,%xmm3
 pshufd $0x00,%xmm0,%xmm0
-movdqa %xmm1,64(%rsp)
-movdqa %xmm2,80(%rsp)
-movdqa %xmm3,96(%rsp)
-movdqa %xmm0,112(%rsp)
-movdqa 0(%rsp),%xmm0
-pshufd $0xaa,%xmm0,%xmm1
-pshufd $0xff,%xmm0,%xmm2
-pshufd $0x00,%xmm0,%xmm3
-pshufd $0x55,%xmm0,%xmm0
 movdqa %xmm1,128(%rsp)
 movdqa %xmm2,144(%rsp)
 movdqa %xmm3,160(%rsp)
 movdqa %xmm0,176(%rsp)
-movdqa 16(%rsp),%xmm0
+movdqa 64(%rsp),%xmm0
+pshufd $0xaa,%xmm0,%xmm1
+pshufd $0xff,%xmm0,%xmm2
+pshufd $0x00,%xmm0,%xmm3
+pshufd $0x55,%xmm0,%xmm0
+movdqa %xmm1,192(%rsp)
+movdqa %xmm2,208(%rsp)
+movdqa %xmm3,224(%rsp)
+movdqa %xmm0,240(%rsp)
+movdqa 80(%rsp),%xmm0
 pshufd $0xff,%xmm0,%xmm1
 pshufd $0x55,%xmm0,%xmm2
 pshufd $0xaa,%xmm0,%xmm0
-movdqa %xmm1,192(%rsp)
-movdqa %xmm2,208(%rsp)
-movdqa %xmm0,224(%rsp)
-movdqa 32(%rsp),%xmm0
+movdqa %xmm1,256(%rsp)
+movdqa %xmm2,272(%rsp)
+movdqa %xmm0,288(%rsp)
+movdqa 96(%rsp),%xmm0
 pshufd $0x00,%xmm0,%xmm1
 pshufd $0xaa,%xmm0,%xmm2
 pshufd $0xff,%xmm0,%xmm0
-movdqa %xmm1,240(%rsp)
-movdqa %xmm2,256(%rsp)
-movdqa %xmm0,272(%rsp)
+movdqa %xmm1,304(%rsp)
+movdqa %xmm2,320(%rsp)
+movdqa %xmm0,336(%rsp)
 ._bytesatleast256:
-movl 16(%rsp),%edx
-movl 4+32(%rsp),%ecx
-movl %edx,288(%rsp)
-movl %ecx,304(%rsp)
-add $1,%rdx
-shl $32,%rcx
-add %rcx,%rdx
+movq 472(%rsp),%rdx
 mov %rdx,%rcx
 shr $32,%rcx
-movl %edx,4+288(%rsp)
-movl %ecx,4+304(%rsp)
+movl %edx,352(%rsp)
+movl %ecx,368(%rsp)
 add $1,%rdx
-shl $32,%rcx
-add %rcx,%rdx
 mov %rdx,%rcx
 shr $32,%rcx
-movl %edx,8+288(%rsp)
-movl %ecx,8+304(%rsp)
+movl %edx,4+352(%rsp)
+movl %ecx,4+368(%rsp)
 add $1,%rdx
-shl $32,%rcx
-add %rcx,%rdx
 mov %rdx,%rcx
 shr $32,%rcx
-movl %edx,12+288(%rsp)
-movl %ecx,12+304(%rsp)
+movl %edx,8+352(%rsp)
+movl %ecx,8+368(%rsp)
 add $1,%rdx
-shl $32,%rcx
-add %rcx,%rdx
 mov %rdx,%rcx
 shr $32,%rcx
-movl %edx,16(%rsp)
-movl %ecx,4+32(%rsp)
-movq %r9,408(%rsp)
+movl %edx,12+352(%rsp)
+movl %ecx,12+368(%rsp)
+add $1,%rdx
+mov %rdx,%rcx
+shr $32,%rcx
+movl %edx,80(%rsp)
+movl %ecx,4+96(%rsp)
+movq %rdx,472(%rsp)
+movq %r9,480(%rsp)
 mov $20,%rdx
-movdqa 64(%rsp),%xmm0
-movdqa 80(%rsp),%xmm1
-movdqa 96(%rsp),%xmm2
-movdqa 256(%rsp),%xmm3
-movdqa 272(%rsp),%xmm4
-movdqa 128(%rsp),%xmm5
-movdqa 144(%rsp),%xmm6
-movdqa 176(%rsp),%xmm7
-movdqa 192(%rsp),%xmm8
-movdqa 208(%rsp),%xmm9
-movdqa 224(%rsp),%xmm10
-movdqa 304(%rsp),%xmm11
-movdqa 112(%rsp),%xmm12
-movdqa 160(%rsp),%xmm13
-movdqa 240(%rsp),%xmm14
-movdqa 288(%rsp),%xmm15
+movdqa 128(%rsp),%xmm0
+movdqa 144(%rsp),%xmm1
+movdqa 160(%rsp),%xmm2
+movdqa 320(%rsp),%xmm3
+movdqa 336(%rsp),%xmm4
+movdqa 192(%rsp),%xmm5
+movdqa 208(%rsp),%xmm6
+movdqa 240(%rsp),%xmm7
+movdqa 256(%rsp),%xmm8
+movdqa 272(%rsp),%xmm9
+movdqa 288(%rsp),%xmm10
+movdqa 368(%rsp),%xmm11
+movdqa 176(%rsp),%xmm12
+movdqa 224(%rsp),%xmm13
+movdqa 304(%rsp),%xmm14
+movdqa 352(%rsp),%xmm15
 ._mainloop1:
-movdqa %xmm1,320(%rsp)
-movdqa %xmm2,336(%rsp)
+movdqa %xmm1,384(%rsp)
+movdqa %xmm2,400(%rsp)
 movdqa %xmm13,%xmm1
 paddd %xmm12,%xmm1
 movdqa %xmm1,%xmm2
@@ -239,8 +233,8 @@ pslld $18,%xmm1
 pxor %xmm1,%xmm12
 psrld $14,%xmm2
 pxor %xmm2,%xmm12
-movdqa 320(%rsp),%xmm1
-movdqa %xmm12,320(%rsp)
+movdqa 384(%rsp),%xmm1
+movdqa %xmm12,384(%rsp)
 movdqa %xmm9,%xmm2
 paddd %xmm7,%xmm2
 movdqa %xmm2,%xmm12
@@ -255,8 +249,8 @@ pslld $7,%xmm2
 pxor %xmm2,%xmm3
 psrld $25,%xmm12
 pxor %xmm12,%xmm3
-movdqa 336(%rsp),%xmm2
-movdqa %xmm0,336(%rsp)
+movdqa 400(%rsp),%xmm2
+movdqa %xmm0,400(%rsp)
 movdqa %xmm6,%xmm0
 paddd %xmm2,%xmm0
 movdqa %xmm0,%xmm12
@@ -299,8 +293,8 @@ pslld $18,%xmm0
 pxor %xmm0,%xmm1
 psrld $14,%xmm12
 pxor %xmm12,%xmm1
-movdqa 320(%rsp),%xmm0
-movdqa %xmm1,320(%rsp)
+movdqa 384(%rsp),%xmm0
+movdqa %xmm1,384(%rsp)
 movdqa %xmm4,%xmm1
 paddd %xmm0,%xmm1
 movdqa %xmm1,%xmm12
@@ -315,8 +309,8 @@ pslld $18,%xmm1
 pxor %xmm1,%xmm2
 psrld $14,%xmm12
 pxor %xmm12,%xmm2
-movdqa 336(%rsp),%xmm12
-movdqa %xmm2,336(%rsp)
+movdqa 400(%rsp),%xmm12
+movdqa %xmm2,400(%rsp)
 movdqa %xmm14,%xmm1
 paddd %xmm12,%xmm1
 movdqa %xmm1,%xmm2
@@ -359,8 +353,8 @@ pslld $18,%xmm1
 pxor %xmm1,%xmm0
 psrld $14,%xmm2
 pxor %xmm2,%xmm0
-movdqa 320(%rsp),%xmm1
-movdqa %xmm0,320(%rsp)
+movdqa 384(%rsp),%xmm1
+movdqa %xmm0,384(%rsp)
 movdqa %xmm8,%xmm0
 paddd %xmm14,%xmm0
 movdqa %xmm0,%xmm2
@@ -375,8 +369,8 @@ pslld $7,%xmm0
 pxor %xmm0,%xmm6
 psrld $25,%xmm2
 pxor %xmm2,%xmm6
-movdqa 336(%rsp),%xmm2
-movdqa %xmm12,336(%rsp)
+movdqa 400(%rsp),%xmm2
+movdqa %xmm12,400(%rsp)
 movdqa %xmm3,%xmm0
 paddd %xmm2,%xmm0
 movdqa %xmm0,%xmm12
@@ -426,15 +420,14 @@ pslld $18,%xmm0
 pxor %xmm0,%xmm2
 psrld $14,%xmm12
 pxor %xmm12,%xmm2
-movdqa 320(%rsp),%xmm12
-movdqa 336(%rsp),%xmm0
+movdqa 384(%rsp),%xmm12
+movdqa 400(%rsp),%xmm0
 sub $2,%rdx
 ja ._mainloop1
-paddd 112(%rsp),%xmm12
-paddd 176(%rsp),%xmm7
-paddd 224(%rsp),%xmm10
-paddd 272(%rsp),%xmm4
+paddd 176(%rsp),%xmm12
+paddd 240(%rsp),%xmm7
+paddd 288(%rsp),%xmm10
+paddd 336(%rsp),%xmm4
 movd %xmm12,%rdx
 movd %xmm7,%rcx
 movd %xmm10,%r8
@@ -495,10 +488,10 @@ movl %edx,192(%rdi)
 movl %ecx,196(%rdi)
 movl %r8d,200(%rdi)
 movl %r9d,204(%rdi)
-paddd 240(%rsp),%xmm14
-paddd 64(%rsp),%xmm0
-paddd 128(%rsp),%xmm5
-paddd 192(%rsp),%xmm8
+paddd 304(%rsp),%xmm14
+paddd 128(%rsp),%xmm0
+paddd 192(%rsp),%xmm5
+paddd 256(%rsp),%xmm8
 movd %xmm14,%rdx
 movd %xmm0,%rcx
 movd %xmm5,%r8
@@ -559,10 +552,10 @@ movl %edx,208(%rdi)
 movl %ecx,212(%rdi)
 movl %r8d,216(%rdi)
 movl %r9d,220(%rdi)
-paddd 288(%rsp),%xmm15
-paddd 304(%rsp),%xmm11
-paddd 80(%rsp),%xmm1
-paddd 144(%rsp),%xmm6
+paddd 352(%rsp),%xmm15
+paddd 368(%rsp),%xmm11
+paddd 144(%rsp),%xmm1
+paddd 208(%rsp),%xmm6
 movd %xmm15,%rdx
 movd %xmm11,%rcx
 movd %xmm1,%r8
@@ -623,10 +616,10 @@ movl %edx,224(%rdi)
 movl %ecx,228(%rdi)
 movl %r8d,232(%rdi)
 movl %r9d,236(%rdi)
-paddd 160(%rsp),%xmm13
-paddd 208(%rsp),%xmm9
-paddd 256(%rsp),%xmm3
-paddd 96(%rsp),%xmm2
+paddd 224(%rsp),%xmm13
+paddd 272(%rsp),%xmm9
+paddd 320(%rsp),%xmm3
+paddd 160(%rsp),%xmm2
 movd %xmm13,%rdx
 movd %xmm9,%rcx
 movd %xmm3,%r8
@@ -687,33 +680,31 @@ movl %edx,240(%rdi)
 movl %ecx,244(%rdi)
 movl %r8d,248(%rdi)
 movl %r9d,252(%rdi)
-movq 408(%rsp),%r9
+movq 480(%rsp),%r9
 sub $256,%r9
 add $256,%rsi
 add $256,%rdi
 cmp $256,%r9
 jae ._bytesatleast256
 cmp $0,%r9
 jbe ._done
 ._bytesbetween1and255:
 cmp $64,%r9
 jae ._nocopy
 mov %rdi,%rdx
-leaq 416(%rsp),%rdi
+leaq 0(%rsp),%rdi
 mov %r9,%rcx
 rep movsb
-leaq 416(%rsp),%rdi
-leaq 416(%rsp),%rsi
+leaq 0(%rsp),%rdi
+leaq 0(%rsp),%rsi
 ._nocopy:
-movq %r9,408(%rsp)
-movdqa 48(%rsp),%xmm0
-movdqa 0(%rsp),%xmm1
-movdqa 16(%rsp),%xmm2
-movdqa 32(%rsp),%xmm3
+movq %r9,480(%rsp)
+movdqa 112(%rsp),%xmm0
+movdqa 64(%rsp),%xmm1
+movdqa 80(%rsp),%xmm2
+movdqa 96(%rsp),%xmm3
 movdqa %xmm1,%xmm4
 mov $20,%rcx
@@ -845,11 +836,10 @@ pxor %xmm5,%xmm0
 pshufd $0x39,%xmm3,%xmm3
 pxor %xmm6,%xmm0
 ja ._mainloop2
-paddd 48(%rsp),%xmm0
-paddd 0(%rsp),%xmm1
-paddd 16(%rsp),%xmm2
-paddd 32(%rsp),%xmm3
+paddd 112(%rsp),%xmm0
+paddd 64(%rsp),%xmm1
+paddd 80(%rsp),%xmm2
+paddd 96(%rsp),%xmm3
 movd %xmm0,%rcx
 movd %xmm1,%r8
 movd %xmm2,%r9
@@ -910,22 +900,17 @@ movl %ecx,60(%rdi)
 movl %r8d,44(%rdi)
 movl %r9d,28(%rdi)
 movl %eax,12(%rdi)
-movq 408(%rsp),%r9
-movl 16(%rsp),%ecx
-movl 4+32(%rsp),%r8d
+movq 480(%rsp),%r9
+movq 472(%rsp),%rcx
 add $1,%rcx
-shl $32,%r8
-add %r8,%rcx
 mov %rcx,%r8
 shr $32,%r8
-movl %ecx,16(%rsp)
-movl %r8d,4+32(%rsp)
+movl %ecx,80(%rsp)
+movl %r8d,4+96(%rsp)
+movq %rcx,472(%rsp)
 cmp $64,%r9
 ja ._bytesatleast65
 jae ._bytesatleast64
 mov %rdi,%rsi
 mov %rdx,%rdi
 mov %r9,%rcx
@@ -933,16 +918,16 @@ rep movsb
 ._bytesatleast64:
 ._done:
-movq 352(%rsp),%r11
-movq 360(%rsp),%r12
-movq 368(%rsp),%r13
-movq 376(%rsp),%r14
-movq 384(%rsp),%r15
-movq 392(%rsp),%rbx
-movq 400(%rsp),%rbp
+movq 416(%rsp),%r11
+movq 424(%rsp),%r12
+movq 432(%rsp),%r13
+movq 440(%rsp),%r14
+movq 448(%rsp),%r15
+movq 456(%rsp),%rbx
+movq 464(%rsp),%rbp
 add %r11,%rsp
 xor %rax,%rax
-xor %rdx,%rdx
+mov %rsi,%rdx
 ret
 ._bytesatleast65:
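
For reference, these routines back libsodium's crypto_stream_salsa20 API.
A minimal sketch of driving the _xor variant from C (error handling
trimmed, buffer names illustrative):

    #include <string.h>
    #include <sodium.h>

    int main(void)
    {
        unsigned char key[crypto_stream_salsa20_KEYBYTES];
        unsigned char nonce[crypto_stream_salsa20_NONCEBYTES];
        unsigned char msg[] = "attack at dawn";
        unsigned char ct[sizeof msg];
        unsigned char pt[sizeof msg];

        if (sodium_init() < 0) {
            return 1;
        }
        randombytes_buf(key, sizeof key);
        randombytes_buf(nonce, sizeof nonce);

        /* Encrypt: XOR the message with the Salsa20 keystream. */
        crypto_stream_salsa20_xor(ct, msg, sizeof msg, nonce, key);

        /* Decrypt: XOR the ciphertext with the same keystream. */
        crypto_stream_salsa20_xor(pt, ct, sizeof ct, nonce, key);

        return memcmp(pt, msg, sizeof msg) != 0;
    }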