From 808d1807dcc00e156f1a4b2dae2fe59d2e4b2e32 Mon Sep 17 00:00:00 2001 From: Frank Denis Date: Sat, 27 Apr 2013 10:21:32 -0700 Subject: [PATCH] Add optimized salsa20 amd64 assembly implementation --- .gitignore | 1 - configure.ac | 19 + src/libsodium/Makefile.am | 14 +- .../crypto_stream/salsa20/amd64_xmm6/api.h | 1 + .../amd64_xmm6/stream_salsa20_amd64_xmm6.s | 947 ++++++++++++++++++ ...{stream_salsa20.c => stream_salsa20_ref.c} | 4 + .../ref/{xor_salsa20.c => xor_salsa20_ref.c} | 4 + .../sodium/crypto_scalarmult_curve25519.h.in | 4 +- .../include/sodium/crypto_stream_salsa20.h | 14 +- .../include/sodium/crypto_stream_salsa20.h.in | 16 +- 10 files changed, 1010 insertions(+), 14 deletions(-) create mode 100644 src/libsodium/crypto_stream/salsa20/amd64_xmm6/api.h create mode 100644 src/libsodium/crypto_stream/salsa20/amd64_xmm6/stream_salsa20_amd64_xmm6.s rename src/libsodium/crypto_stream/salsa20/ref/{stream_salsa20.c => stream_salsa20_ref.c} (96%) rename src/libsodium/crypto_stream/salsa20/ref/{xor_salsa20.c => xor_salsa20_ref.c} (96%) diff --git a/.gitignore b/.gitignore index 922c80e1..332f8242 100644 --- a/.gitignore +++ b/.gitignore @@ -5,7 +5,6 @@ *.log *.o *.plist -*.s *.scan *.status *.tar.* diff --git a/configure.ac b/configure.ac index ff68b9a5..31e9392e 100644 --- a/configure.ac +++ b/configure.ac @@ -34,6 +34,7 @@ AC_SUBST(DLL_VERSION) LX_CFLAGS=${CFLAGS-NONE} AC_PROG_CC_C99 +AM_PROG_AS AC_USE_SYSTEM_EXTENSIONS CPPFLAGS="$CPPFLAGS -D_FORTIFY_SOURCE=2" @@ -184,6 +185,24 @@ AC_C_BIGENDIAN( AC_MSG_WARN([universal endianess]) ) +AC_MSG_CHECKING(if we can assemble basic amd64 code) +HAVE_AMD64_ASM_V=0 +AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ +]], [[ +#if defined(__amd64) || defined(__amd64__) || defined(__x86_64__) +/* neat */ +#else +# error !amd64 +#endif +__asm__("pxor %xmm12,%xmm6"); +]])], +[AC_MSG_RESULT(yes) + AC_DEFINE([HAVE_AMD64_ASM], [1], [basic amd64 code can be assembled]) + HAVE_AMD64_ASM_V=1], +[AC_MSG_RESULT(no)]) +AM_CONDITIONAL([HAVE_AMD64_ASM], [test $HAVE_AMD64_ASM_V = 1]) +AC_SUBST(HAVE_AMD64_ASM_V) + AC_MSG_CHECKING(for 128-bit arithmetic) HAVE_TI_MODE_V=0 AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ diff --git a/src/libsodium/Makefile.am b/src/libsodium/Makefile.am index 3a2adcb5..da5900ef 100644 --- a/src/libsodium/Makefile.am +++ b/src/libsodium/Makefile.am @@ -137,9 +137,6 @@ libsodium_la_SOURCES = \ crypto_stream/aes256estream/hongjun/aes256.h \ crypto_stream/aes256estream/hongjun/api.h \ crypto_stream/aes256estream/hongjun/ecrypt-sync.h \ - crypto_stream/salsa20/ref/api.h \ - crypto_stream/salsa20/ref/stream_salsa20.c \ - crypto_stream/salsa20/ref/xor_salsa20.c \ crypto_stream/salsa2012/ref/api.h \ crypto_stream/salsa2012/ref/stream_salsa2012.c \ crypto_stream/salsa2012/ref/xor_salsa2012.c \ @@ -173,6 +170,17 @@ libsodium_la_SOURCES += \ crypto_scalarmult/curve25519/ref/smult_curve25519_ref.c endif +if HAVE_AMD64_ASM +libsodium_la_SOURCES += \ + crypto_stream/salsa20/amd64_xmm6/api.h \ + crypto_stream/salsa20/amd64_xmm6/stream_salsa20_amd64_xmm6.s +else +libsodium_la_SOURCES += \ + crypto_stream/salsa20/ref/api.h \ + crypto_stream/salsa20/ref/stream_salsa20_ref.c \ + crypto_stream/salsa20/ref/xor_salsa20_ref.c +endif + libsodium_la_LDFLAGS = \ $(AM_LDFLAGS) \ -export-dynamic \ diff --git a/src/libsodium/crypto_stream/salsa20/amd64_xmm6/api.h b/src/libsodium/crypto_stream/salsa20/amd64_xmm6/api.h new file mode 100644 index 00000000..037fb59d --- /dev/null +++ b/src/libsodium/crypto_stream/salsa20/amd64_xmm6/api.h @@ -0,0 +1 @@ +#include "crypto_stream_salsa20.h" diff --git a/src/libsodium/crypto_stream/salsa20/amd64_xmm6/stream_salsa20_amd64_xmm6.s b/src/libsodium/crypto_stream/salsa20/amd64_xmm6/stream_salsa20_amd64_xmm6.s new file mode 100644 index 00000000..9fb04066 --- /dev/null +++ b/src/libsodium/crypto_stream/salsa20/amd64_xmm6/stream_salsa20_amd64_xmm6.s @@ -0,0 +1,947 @@ + +#if defined(__amd64) || defined(__amd64__) || defined(__x86_64__) + +.text +.p2align 5 + +.globl _crypto_stream_salsa20 +.globl crypto_stream_salsa20 +_crypto_stream_salsa20: +crypto_stream_salsa20: +mov %rsp,%r11 +and $31,%r11 +add $480,%r11 +sub %r11,%rsp +movq %r11,352(%rsp) +movq %r12,360(%rsp) +movq %r13,368(%rsp) +movq %r14,376(%rsp) +movq %r15,384(%rsp) +movq %rbx,392(%rsp) +movq %rbp,400(%rsp) +mov %rsi,%r9 +mov %rdi,%rdi +mov %rdi,%rsi +mov %rdx,%rdx +mov %rcx,%r10 +cmp $0,%r9 +jbe ._done + +mov $0,%rax +mov %r9,%rcx +rep stosb +sub %r9,%rdi +jmp ._start + +.text +.p2align 5 + +.globl _crypto_stream_salsa20_xor +.globl crypto_stream_salsa20_xor +_crypto_stream_salsa20_xor: +crypto_stream_salsa20_xor: +mov %rsp,%r11 +and $31,%r11 +add $480,%r11 +sub %r11,%rsp +movq %r11,352(%rsp) +movq %r12,360(%rsp) +movq %r13,368(%rsp) +movq %r14,376(%rsp) +movq %r15,384(%rsp) +movq %rbx,392(%rsp) +movq %rbp,400(%rsp) +mov %rdi,%rdi +mov %rsi,%rsi +mov %rdx,%r9 +mov %rcx,%rdx +mov %r8,%r10 +cmp $0,%r9 +jbe ._done + +._start: +movl 20(%r10),%ecx +movl 0(%r10),%r8d +movl 0(%rdx),%eax +movl 16(%r10),%r11d +movl %ecx,0(%rsp) +movl %r8d,4+0(%rsp) +movl %eax,8+0(%rsp) +movl %r11d,12+0(%rsp) +mov $0,%rcx +movl 24(%r10),%r8d +movl 4(%r10),%eax +movl 4(%rdx),%edx +movl %ecx,16(%rsp) +movl %r8d,4+16(%rsp) +movl %eax,8+16(%rsp) +movl %edx,12+16(%rsp) +movl 12(%r10),%edx +mov $0,%rcx +movl 28(%r10),%r8d +movl 8(%r10),%eax +movl %edx,32(%rsp) +movl %ecx,4+32(%rsp) +movl %r8d,8+32(%rsp) +movl %eax,12+32(%rsp) +mov $1634760805,%rdx +mov $857760878,%rcx +mov $2036477234,%r8 +mov $1797285236,%rax +movl %edx,48(%rsp) +movl %ecx,4+48(%rsp) +movl %r8d,8+48(%rsp) +movl %eax,12+48(%rsp) +cmp $256,%r9 +jb ._bytesbetween1and255 + +movdqa 48(%rsp),%xmm0 +pshufd $0x55,%xmm0,%xmm1 +pshufd $0xaa,%xmm0,%xmm2 +pshufd $0xff,%xmm0,%xmm3 +pshufd $0x00,%xmm0,%xmm0 +movdqa %xmm1,64(%rsp) +movdqa %xmm2,80(%rsp) +movdqa %xmm3,96(%rsp) +movdqa %xmm0,112(%rsp) +movdqa 0(%rsp),%xmm0 +pshufd $0xaa,%xmm0,%xmm1 +pshufd $0xff,%xmm0,%xmm2 +pshufd $0x00,%xmm0,%xmm3 +pshufd $0x55,%xmm0,%xmm0 +movdqa %xmm1,128(%rsp) +movdqa %xmm2,144(%rsp) +movdqa %xmm3,160(%rsp) +movdqa %xmm0,176(%rsp) +movdqa 16(%rsp),%xmm0 +pshufd $0xff,%xmm0,%xmm1 +pshufd $0x55,%xmm0,%xmm2 +pshufd $0xaa,%xmm0,%xmm0 +movdqa %xmm1,192(%rsp) +movdqa %xmm2,208(%rsp) +movdqa %xmm0,224(%rsp) +movdqa 32(%rsp),%xmm0 +pshufd $0x00,%xmm0,%xmm1 +pshufd $0xaa,%xmm0,%xmm2 +pshufd $0xff,%xmm0,%xmm0 +movdqa %xmm1,240(%rsp) +movdqa %xmm2,256(%rsp) +movdqa %xmm0,272(%rsp) + +._bytesatleast256: +movl 16(%rsp),%edx +movl 4+32(%rsp),%ecx +movl %edx,288(%rsp) +movl %ecx,304(%rsp) +add $1,%rdx +shl $32,%rcx +add %rcx,%rdx +mov %rdx,%rcx +shr $32,%rcx +movl %edx,4+288(%rsp) +movl %ecx,4+304(%rsp) +add $1,%rdx +shl $32,%rcx +add %rcx,%rdx +mov %rdx,%rcx +shr $32,%rcx +movl %edx,8+288(%rsp) +movl %ecx,8+304(%rsp) +add $1,%rdx +shl $32,%rcx +add %rcx,%rdx +mov %rdx,%rcx +shr $32,%rcx +movl %edx,12+288(%rsp) +movl %ecx,12+304(%rsp) +add $1,%rdx +shl $32,%rcx +add %rcx,%rdx +mov %rdx,%rcx +shr $32,%rcx +movl %edx,16(%rsp) +movl %ecx,4+32(%rsp) +movq %r9,408(%rsp) +mov $20,%rdx +movdqa 64(%rsp),%xmm0 +movdqa 80(%rsp),%xmm1 +movdqa 96(%rsp),%xmm2 +movdqa 256(%rsp),%xmm3 +movdqa 272(%rsp),%xmm4 +movdqa 128(%rsp),%xmm5 +movdqa 144(%rsp),%xmm6 +movdqa 176(%rsp),%xmm7 +movdqa 192(%rsp),%xmm8 +movdqa 208(%rsp),%xmm9 +movdqa 224(%rsp),%xmm10 +movdqa 304(%rsp),%xmm11 +movdqa 112(%rsp),%xmm12 +movdqa 160(%rsp),%xmm13 +movdqa 240(%rsp),%xmm14 +movdqa 288(%rsp),%xmm15 + +._mainloop1: +movdqa %xmm1,320(%rsp) +movdqa %xmm2,336(%rsp) +movdqa %xmm13,%xmm1 +paddd %xmm12,%xmm1 +movdqa %xmm1,%xmm2 +pslld $7,%xmm1 +pxor %xmm1,%xmm14 +psrld $25,%xmm2 +pxor %xmm2,%xmm14 +movdqa %xmm7,%xmm1 +paddd %xmm0,%xmm1 +movdqa %xmm1,%xmm2 +pslld $7,%xmm1 +pxor %xmm1,%xmm11 +psrld $25,%xmm2 +pxor %xmm2,%xmm11 +movdqa %xmm12,%xmm1 +paddd %xmm14,%xmm1 +movdqa %xmm1,%xmm2 +pslld $9,%xmm1 +pxor %xmm1,%xmm15 +psrld $23,%xmm2 +pxor %xmm2,%xmm15 +movdqa %xmm0,%xmm1 +paddd %xmm11,%xmm1 +movdqa %xmm1,%xmm2 +pslld $9,%xmm1 +pxor %xmm1,%xmm9 +psrld $23,%xmm2 +pxor %xmm2,%xmm9 +movdqa %xmm14,%xmm1 +paddd %xmm15,%xmm1 +movdqa %xmm1,%xmm2 +pslld $13,%xmm1 +pxor %xmm1,%xmm13 +psrld $19,%xmm2 +pxor %xmm2,%xmm13 +movdqa %xmm11,%xmm1 +paddd %xmm9,%xmm1 +movdqa %xmm1,%xmm2 +pslld $13,%xmm1 +pxor %xmm1,%xmm7 +psrld $19,%xmm2 +pxor %xmm2,%xmm7 +movdqa %xmm15,%xmm1 +paddd %xmm13,%xmm1 +movdqa %xmm1,%xmm2 +pslld $18,%xmm1 +pxor %xmm1,%xmm12 +psrld $14,%xmm2 +pxor %xmm2,%xmm12 +movdqa 320(%rsp),%xmm1 +movdqa %xmm12,320(%rsp) +movdqa %xmm9,%xmm2 +paddd %xmm7,%xmm2 +movdqa %xmm2,%xmm12 +pslld $18,%xmm2 +pxor %xmm2,%xmm0 +psrld $14,%xmm12 +pxor %xmm12,%xmm0 +movdqa %xmm5,%xmm2 +paddd %xmm1,%xmm2 +movdqa %xmm2,%xmm12 +pslld $7,%xmm2 +pxor %xmm2,%xmm3 +psrld $25,%xmm12 +pxor %xmm12,%xmm3 +movdqa 336(%rsp),%xmm2 +movdqa %xmm0,336(%rsp) +movdqa %xmm6,%xmm0 +paddd %xmm2,%xmm0 +movdqa %xmm0,%xmm12 +pslld $7,%xmm0 +pxor %xmm0,%xmm4 +psrld $25,%xmm12 +pxor %xmm12,%xmm4 +movdqa %xmm1,%xmm0 +paddd %xmm3,%xmm0 +movdqa %xmm0,%xmm12 +pslld $9,%xmm0 +pxor %xmm0,%xmm10 +psrld $23,%xmm12 +pxor %xmm12,%xmm10 +movdqa %xmm2,%xmm0 +paddd %xmm4,%xmm0 +movdqa %xmm0,%xmm12 +pslld $9,%xmm0 +pxor %xmm0,%xmm8 +psrld $23,%xmm12 +pxor %xmm12,%xmm8 +movdqa %xmm3,%xmm0 +paddd %xmm10,%xmm0 +movdqa %xmm0,%xmm12 +pslld $13,%xmm0 +pxor %xmm0,%xmm5 +psrld $19,%xmm12 +pxor %xmm12,%xmm5 +movdqa %xmm4,%xmm0 +paddd %xmm8,%xmm0 +movdqa %xmm0,%xmm12 +pslld $13,%xmm0 +pxor %xmm0,%xmm6 +psrld $19,%xmm12 +pxor %xmm12,%xmm6 +movdqa %xmm10,%xmm0 +paddd %xmm5,%xmm0 +movdqa %xmm0,%xmm12 +pslld $18,%xmm0 +pxor %xmm0,%xmm1 +psrld $14,%xmm12 +pxor %xmm12,%xmm1 +movdqa 320(%rsp),%xmm0 +movdqa %xmm1,320(%rsp) +movdqa %xmm4,%xmm1 +paddd %xmm0,%xmm1 +movdqa %xmm1,%xmm12 +pslld $7,%xmm1 +pxor %xmm1,%xmm7 +psrld $25,%xmm12 +pxor %xmm12,%xmm7 +movdqa %xmm8,%xmm1 +paddd %xmm6,%xmm1 +movdqa %xmm1,%xmm12 +pslld $18,%xmm1 +pxor %xmm1,%xmm2 +psrld $14,%xmm12 +pxor %xmm12,%xmm2 +movdqa 336(%rsp),%xmm12 +movdqa %xmm2,336(%rsp) +movdqa %xmm14,%xmm1 +paddd %xmm12,%xmm1 +movdqa %xmm1,%xmm2 +pslld $7,%xmm1 +pxor %xmm1,%xmm5 +psrld $25,%xmm2 +pxor %xmm2,%xmm5 +movdqa %xmm0,%xmm1 +paddd %xmm7,%xmm1 +movdqa %xmm1,%xmm2 +pslld $9,%xmm1 +pxor %xmm1,%xmm10 +psrld $23,%xmm2 +pxor %xmm2,%xmm10 +movdqa %xmm12,%xmm1 +paddd %xmm5,%xmm1 +movdqa %xmm1,%xmm2 +pslld $9,%xmm1 +pxor %xmm1,%xmm8 +psrld $23,%xmm2 +pxor %xmm2,%xmm8 +movdqa %xmm7,%xmm1 +paddd %xmm10,%xmm1 +movdqa %xmm1,%xmm2 +pslld $13,%xmm1 +pxor %xmm1,%xmm4 +psrld $19,%xmm2 +pxor %xmm2,%xmm4 +movdqa %xmm5,%xmm1 +paddd %xmm8,%xmm1 +movdqa %xmm1,%xmm2 +pslld $13,%xmm1 +pxor %xmm1,%xmm14 +psrld $19,%xmm2 +pxor %xmm2,%xmm14 +movdqa %xmm10,%xmm1 +paddd %xmm4,%xmm1 +movdqa %xmm1,%xmm2 +pslld $18,%xmm1 +pxor %xmm1,%xmm0 +psrld $14,%xmm2 +pxor %xmm2,%xmm0 +movdqa 320(%rsp),%xmm1 +movdqa %xmm0,320(%rsp) +movdqa %xmm8,%xmm0 +paddd %xmm14,%xmm0 +movdqa %xmm0,%xmm2 +pslld $18,%xmm0 +pxor %xmm0,%xmm12 +psrld $14,%xmm2 +pxor %xmm2,%xmm12 +movdqa %xmm11,%xmm0 +paddd %xmm1,%xmm0 +movdqa %xmm0,%xmm2 +pslld $7,%xmm0 +pxor %xmm0,%xmm6 +psrld $25,%xmm2 +pxor %xmm2,%xmm6 +movdqa 336(%rsp),%xmm2 +movdqa %xmm12,336(%rsp) +movdqa %xmm3,%xmm0 +paddd %xmm2,%xmm0 +movdqa %xmm0,%xmm12 +pslld $7,%xmm0 +pxor %xmm0,%xmm13 +psrld $25,%xmm12 +pxor %xmm12,%xmm13 +movdqa %xmm1,%xmm0 +paddd %xmm6,%xmm0 +movdqa %xmm0,%xmm12 +pslld $9,%xmm0 +pxor %xmm0,%xmm15 +psrld $23,%xmm12 +pxor %xmm12,%xmm15 +movdqa %xmm2,%xmm0 +paddd %xmm13,%xmm0 +movdqa %xmm0,%xmm12 +pslld $9,%xmm0 +pxor %xmm0,%xmm9 +psrld $23,%xmm12 +pxor %xmm12,%xmm9 +movdqa %xmm6,%xmm0 +paddd %xmm15,%xmm0 +movdqa %xmm0,%xmm12 +pslld $13,%xmm0 +pxor %xmm0,%xmm11 +psrld $19,%xmm12 +pxor %xmm12,%xmm11 +movdqa %xmm13,%xmm0 +paddd %xmm9,%xmm0 +movdqa %xmm0,%xmm12 +pslld $13,%xmm0 +pxor %xmm0,%xmm3 +psrld $19,%xmm12 +pxor %xmm12,%xmm3 +movdqa %xmm15,%xmm0 +paddd %xmm11,%xmm0 +movdqa %xmm0,%xmm12 +pslld $18,%xmm0 +pxor %xmm0,%xmm1 +psrld $14,%xmm12 +pxor %xmm12,%xmm1 +movdqa %xmm9,%xmm0 +paddd %xmm3,%xmm0 +movdqa %xmm0,%xmm12 +pslld $18,%xmm0 +pxor %xmm0,%xmm2 +psrld $14,%xmm12 +pxor %xmm12,%xmm2 +movdqa 320(%rsp),%xmm12 +movdqa 336(%rsp),%xmm0 +sub $2,%rdx +ja ._mainloop1 + +paddd 112(%rsp),%xmm12 +paddd 176(%rsp),%xmm7 +paddd 224(%rsp),%xmm10 +paddd 272(%rsp),%xmm4 +movd %xmm12,%rdx +movd %xmm7,%rcx +movd %xmm10,%r8 +movd %xmm4,%r9 +pshufd $0x39,%xmm12,%xmm12 +pshufd $0x39,%xmm7,%xmm7 +pshufd $0x39,%xmm10,%xmm10 +pshufd $0x39,%xmm4,%xmm4 +xorl 0(%rsi),%edx +xorl 4(%rsi),%ecx +xorl 8(%rsi),%r8d +xorl 12(%rsi),%r9d +movl %edx,0(%rdi) +movl %ecx,4(%rdi) +movl %r8d,8(%rdi) +movl %r9d,12(%rdi) +movd %xmm12,%rdx +movd %xmm7,%rcx +movd %xmm10,%r8 +movd %xmm4,%r9 +pshufd $0x39,%xmm12,%xmm12 +pshufd $0x39,%xmm7,%xmm7 +pshufd $0x39,%xmm10,%xmm10 +pshufd $0x39,%xmm4,%xmm4 +xorl 64(%rsi),%edx +xorl 68(%rsi),%ecx +xorl 72(%rsi),%r8d +xorl 76(%rsi),%r9d +movl %edx,64(%rdi) +movl %ecx,68(%rdi) +movl %r8d,72(%rdi) +movl %r9d,76(%rdi) +movd %xmm12,%rdx +movd %xmm7,%rcx +movd %xmm10,%r8 +movd %xmm4,%r9 +pshufd $0x39,%xmm12,%xmm12 +pshufd $0x39,%xmm7,%xmm7 +pshufd $0x39,%xmm10,%xmm10 +pshufd $0x39,%xmm4,%xmm4 +xorl 128(%rsi),%edx +xorl 132(%rsi),%ecx +xorl 136(%rsi),%r8d +xorl 140(%rsi),%r9d +movl %edx,128(%rdi) +movl %ecx,132(%rdi) +movl %r8d,136(%rdi) +movl %r9d,140(%rdi) +movd %xmm12,%rdx +movd %xmm7,%rcx +movd %xmm10,%r8 +movd %xmm4,%r9 +xorl 192(%rsi),%edx +xorl 196(%rsi),%ecx +xorl 200(%rsi),%r8d +xorl 204(%rsi),%r9d +movl %edx,192(%rdi) +movl %ecx,196(%rdi) +movl %r8d,200(%rdi) +movl %r9d,204(%rdi) +paddd 240(%rsp),%xmm14 +paddd 64(%rsp),%xmm0 +paddd 128(%rsp),%xmm5 +paddd 192(%rsp),%xmm8 +movd %xmm14,%rdx +movd %xmm0,%rcx +movd %xmm5,%r8 +movd %xmm8,%r9 +pshufd $0x39,%xmm14,%xmm14 +pshufd $0x39,%xmm0,%xmm0 +pshufd $0x39,%xmm5,%xmm5 +pshufd $0x39,%xmm8,%xmm8 +xorl 16(%rsi),%edx +xorl 20(%rsi),%ecx +xorl 24(%rsi),%r8d +xorl 28(%rsi),%r9d +movl %edx,16(%rdi) +movl %ecx,20(%rdi) +movl %r8d,24(%rdi) +movl %r9d,28(%rdi) +movd %xmm14,%rdx +movd %xmm0,%rcx +movd %xmm5,%r8 +movd %xmm8,%r9 +pshufd $0x39,%xmm14,%xmm14 +pshufd $0x39,%xmm0,%xmm0 +pshufd $0x39,%xmm5,%xmm5 +pshufd $0x39,%xmm8,%xmm8 +xorl 80(%rsi),%edx +xorl 84(%rsi),%ecx +xorl 88(%rsi),%r8d +xorl 92(%rsi),%r9d +movl %edx,80(%rdi) +movl %ecx,84(%rdi) +movl %r8d,88(%rdi) +movl %r9d,92(%rdi) +movd %xmm14,%rdx +movd %xmm0,%rcx +movd %xmm5,%r8 +movd %xmm8,%r9 +pshufd $0x39,%xmm14,%xmm14 +pshufd $0x39,%xmm0,%xmm0 +pshufd $0x39,%xmm5,%xmm5 +pshufd $0x39,%xmm8,%xmm8 +xorl 144(%rsi),%edx +xorl 148(%rsi),%ecx +xorl 152(%rsi),%r8d +xorl 156(%rsi),%r9d +movl %edx,144(%rdi) +movl %ecx,148(%rdi) +movl %r8d,152(%rdi) +movl %r9d,156(%rdi) +movd %xmm14,%rdx +movd %xmm0,%rcx +movd %xmm5,%r8 +movd %xmm8,%r9 +xorl 208(%rsi),%edx +xorl 212(%rsi),%ecx +xorl 216(%rsi),%r8d +xorl 220(%rsi),%r9d +movl %edx,208(%rdi) +movl %ecx,212(%rdi) +movl %r8d,216(%rdi) +movl %r9d,220(%rdi) +paddd 288(%rsp),%xmm15 +paddd 304(%rsp),%xmm11 +paddd 80(%rsp),%xmm1 +paddd 144(%rsp),%xmm6 +movd %xmm15,%rdx +movd %xmm11,%rcx +movd %xmm1,%r8 +movd %xmm6,%r9 +pshufd $0x39,%xmm15,%xmm15 +pshufd $0x39,%xmm11,%xmm11 +pshufd $0x39,%xmm1,%xmm1 +pshufd $0x39,%xmm6,%xmm6 +xorl 32(%rsi),%edx +xorl 36(%rsi),%ecx +xorl 40(%rsi),%r8d +xorl 44(%rsi),%r9d +movl %edx,32(%rdi) +movl %ecx,36(%rdi) +movl %r8d,40(%rdi) +movl %r9d,44(%rdi) +movd %xmm15,%rdx +movd %xmm11,%rcx +movd %xmm1,%r8 +movd %xmm6,%r9 +pshufd $0x39,%xmm15,%xmm15 +pshufd $0x39,%xmm11,%xmm11 +pshufd $0x39,%xmm1,%xmm1 +pshufd $0x39,%xmm6,%xmm6 +xorl 96(%rsi),%edx +xorl 100(%rsi),%ecx +xorl 104(%rsi),%r8d +xorl 108(%rsi),%r9d +movl %edx,96(%rdi) +movl %ecx,100(%rdi) +movl %r8d,104(%rdi) +movl %r9d,108(%rdi) +movd %xmm15,%rdx +movd %xmm11,%rcx +movd %xmm1,%r8 +movd %xmm6,%r9 +pshufd $0x39,%xmm15,%xmm15 +pshufd $0x39,%xmm11,%xmm11 +pshufd $0x39,%xmm1,%xmm1 +pshufd $0x39,%xmm6,%xmm6 +xorl 160(%rsi),%edx +xorl 164(%rsi),%ecx +xorl 168(%rsi),%r8d +xorl 172(%rsi),%r9d +movl %edx,160(%rdi) +movl %ecx,164(%rdi) +movl %r8d,168(%rdi) +movl %r9d,172(%rdi) +movd %xmm15,%rdx +movd %xmm11,%rcx +movd %xmm1,%r8 +movd %xmm6,%r9 +xorl 224(%rsi),%edx +xorl 228(%rsi),%ecx +xorl 232(%rsi),%r8d +xorl 236(%rsi),%r9d +movl %edx,224(%rdi) +movl %ecx,228(%rdi) +movl %r8d,232(%rdi) +movl %r9d,236(%rdi) +paddd 160(%rsp),%xmm13 +paddd 208(%rsp),%xmm9 +paddd 256(%rsp),%xmm3 +paddd 96(%rsp),%xmm2 +movd %xmm13,%rdx +movd %xmm9,%rcx +movd %xmm3,%r8 +movd %xmm2,%r9 +pshufd $0x39,%xmm13,%xmm13 +pshufd $0x39,%xmm9,%xmm9 +pshufd $0x39,%xmm3,%xmm3 +pshufd $0x39,%xmm2,%xmm2 +xorl 48(%rsi),%edx +xorl 52(%rsi),%ecx +xorl 56(%rsi),%r8d +xorl 60(%rsi),%r9d +movl %edx,48(%rdi) +movl %ecx,52(%rdi) +movl %r8d,56(%rdi) +movl %r9d,60(%rdi) +movd %xmm13,%rdx +movd %xmm9,%rcx +movd %xmm3,%r8 +movd %xmm2,%r9 +pshufd $0x39,%xmm13,%xmm13 +pshufd $0x39,%xmm9,%xmm9 +pshufd $0x39,%xmm3,%xmm3 +pshufd $0x39,%xmm2,%xmm2 +xorl 112(%rsi),%edx +xorl 116(%rsi),%ecx +xorl 120(%rsi),%r8d +xorl 124(%rsi),%r9d +movl %edx,112(%rdi) +movl %ecx,116(%rdi) +movl %r8d,120(%rdi) +movl %r9d,124(%rdi) +movd %xmm13,%rdx +movd %xmm9,%rcx +movd %xmm3,%r8 +movd %xmm2,%r9 +pshufd $0x39,%xmm13,%xmm13 +pshufd $0x39,%xmm9,%xmm9 +pshufd $0x39,%xmm3,%xmm3 +pshufd $0x39,%xmm2,%xmm2 +xorl 176(%rsi),%edx +xorl 180(%rsi),%ecx +xorl 184(%rsi),%r8d +xorl 188(%rsi),%r9d +movl %edx,176(%rdi) +movl %ecx,180(%rdi) +movl %r8d,184(%rdi) +movl %r9d,188(%rdi) +movd %xmm13,%rdx +movd %xmm9,%rcx +movd %xmm3,%r8 +movd %xmm2,%r9 +xorl 240(%rsi),%edx +xorl 244(%rsi),%ecx +xorl 248(%rsi),%r8d +xorl 252(%rsi),%r9d +movl %edx,240(%rdi) +movl %ecx,244(%rdi) +movl %r8d,248(%rdi) +movl %r9d,252(%rdi) +movq 408(%rsp),%r9 +sub $256,%r9 +add $256,%rsi +add $256,%rdi +cmp $256,%r9 +jae ._bytesatleast256 + +cmp $0,%r9 +jbe ._done + +._bytesbetween1and255: +cmp $64,%r9 +jae ._nocopy + +mov %rdi,%rdx +leaq 416(%rsp),%rdi +mov %r9,%rcx +rep movsb +leaq 416(%rsp),%rdi +leaq 416(%rsp),%rsi + +._nocopy: +movq %r9,408(%rsp) +movdqa 48(%rsp),%xmm0 +movdqa 0(%rsp),%xmm1 +movdqa 16(%rsp),%xmm2 +movdqa 32(%rsp),%xmm3 +movdqa %xmm1,%xmm4 +mov $20,%rcx + +._mainloop2: +paddd %xmm0,%xmm4 +movdqa %xmm0,%xmm5 +movdqa %xmm4,%xmm6 +pslld $7,%xmm4 +psrld $25,%xmm6 +pxor %xmm4,%xmm3 +pxor %xmm6,%xmm3 +paddd %xmm3,%xmm5 +movdqa %xmm3,%xmm4 +movdqa %xmm5,%xmm6 +pslld $9,%xmm5 +psrld $23,%xmm6 +pxor %xmm5,%xmm2 +pshufd $0x93,%xmm3,%xmm3 +pxor %xmm6,%xmm2 +paddd %xmm2,%xmm4 +movdqa %xmm2,%xmm5 +movdqa %xmm4,%xmm6 +pslld $13,%xmm4 +psrld $19,%xmm6 +pxor %xmm4,%xmm1 +pshufd $0x4e,%xmm2,%xmm2 +pxor %xmm6,%xmm1 +paddd %xmm1,%xmm5 +movdqa %xmm3,%xmm4 +movdqa %xmm5,%xmm6 +pslld $18,%xmm5 +psrld $14,%xmm6 +pxor %xmm5,%xmm0 +pshufd $0x39,%xmm1,%xmm1 +pxor %xmm6,%xmm0 +paddd %xmm0,%xmm4 +movdqa %xmm0,%xmm5 +movdqa %xmm4,%xmm6 +pslld $7,%xmm4 +psrld $25,%xmm6 +pxor %xmm4,%xmm1 +pxor %xmm6,%xmm1 +paddd %xmm1,%xmm5 +movdqa %xmm1,%xmm4 +movdqa %xmm5,%xmm6 +pslld $9,%xmm5 +psrld $23,%xmm6 +pxor %xmm5,%xmm2 +pshufd $0x93,%xmm1,%xmm1 +pxor %xmm6,%xmm2 +paddd %xmm2,%xmm4 +movdqa %xmm2,%xmm5 +movdqa %xmm4,%xmm6 +pslld $13,%xmm4 +psrld $19,%xmm6 +pxor %xmm4,%xmm3 +pshufd $0x4e,%xmm2,%xmm2 +pxor %xmm6,%xmm3 +paddd %xmm3,%xmm5 +movdqa %xmm1,%xmm4 +movdqa %xmm5,%xmm6 +pslld $18,%xmm5 +psrld $14,%xmm6 +pxor %xmm5,%xmm0 +pshufd $0x39,%xmm3,%xmm3 +pxor %xmm6,%xmm0 +paddd %xmm0,%xmm4 +movdqa %xmm0,%xmm5 +movdqa %xmm4,%xmm6 +pslld $7,%xmm4 +psrld $25,%xmm6 +pxor %xmm4,%xmm3 +pxor %xmm6,%xmm3 +paddd %xmm3,%xmm5 +movdqa %xmm3,%xmm4 +movdqa %xmm5,%xmm6 +pslld $9,%xmm5 +psrld $23,%xmm6 +pxor %xmm5,%xmm2 +pshufd $0x93,%xmm3,%xmm3 +pxor %xmm6,%xmm2 +paddd %xmm2,%xmm4 +movdqa %xmm2,%xmm5 +movdqa %xmm4,%xmm6 +pslld $13,%xmm4 +psrld $19,%xmm6 +pxor %xmm4,%xmm1 +pshufd $0x4e,%xmm2,%xmm2 +pxor %xmm6,%xmm1 +paddd %xmm1,%xmm5 +movdqa %xmm3,%xmm4 +movdqa %xmm5,%xmm6 +pslld $18,%xmm5 +psrld $14,%xmm6 +pxor %xmm5,%xmm0 +pshufd $0x39,%xmm1,%xmm1 +pxor %xmm6,%xmm0 +paddd %xmm0,%xmm4 +movdqa %xmm0,%xmm5 +movdqa %xmm4,%xmm6 +pslld $7,%xmm4 +psrld $25,%xmm6 +pxor %xmm4,%xmm1 +pxor %xmm6,%xmm1 +paddd %xmm1,%xmm5 +movdqa %xmm1,%xmm4 +movdqa %xmm5,%xmm6 +pslld $9,%xmm5 +psrld $23,%xmm6 +pxor %xmm5,%xmm2 +pshufd $0x93,%xmm1,%xmm1 +pxor %xmm6,%xmm2 +paddd %xmm2,%xmm4 +movdqa %xmm2,%xmm5 +movdqa %xmm4,%xmm6 +pslld $13,%xmm4 +psrld $19,%xmm6 +pxor %xmm4,%xmm3 +pshufd $0x4e,%xmm2,%xmm2 +pxor %xmm6,%xmm3 +sub $4,%rcx +paddd %xmm3,%xmm5 +movdqa %xmm1,%xmm4 +movdqa %xmm5,%xmm6 +pslld $18,%xmm5 +pxor %xmm7,%xmm7 +psrld $14,%xmm6 +pxor %xmm5,%xmm0 +pshufd $0x39,%xmm3,%xmm3 +pxor %xmm6,%xmm0 +ja ._mainloop2 + +paddd 48(%rsp),%xmm0 +paddd 0(%rsp),%xmm1 +paddd 16(%rsp),%xmm2 +paddd 32(%rsp),%xmm3 +movd %xmm0,%rcx +movd %xmm1,%r8 +movd %xmm2,%r9 +movd %xmm3,%rax +pshufd $0x39,%xmm0,%xmm0 +pshufd $0x39,%xmm1,%xmm1 +pshufd $0x39,%xmm2,%xmm2 +pshufd $0x39,%xmm3,%xmm3 +xorl 0(%rsi),%ecx +xorl 48(%rsi),%r8d +xorl 32(%rsi),%r9d +xorl 16(%rsi),%eax +movl %ecx,0(%rdi) +movl %r8d,48(%rdi) +movl %r9d,32(%rdi) +movl %eax,16(%rdi) +movd %xmm0,%rcx +movd %xmm1,%r8 +movd %xmm2,%r9 +movd %xmm3,%rax +pshufd $0x39,%xmm0,%xmm0 +pshufd $0x39,%xmm1,%xmm1 +pshufd $0x39,%xmm2,%xmm2 +pshufd $0x39,%xmm3,%xmm3 +xorl 20(%rsi),%ecx +xorl 4(%rsi),%r8d +xorl 52(%rsi),%r9d +xorl 36(%rsi),%eax +movl %ecx,20(%rdi) +movl %r8d,4(%rdi) +movl %r9d,52(%rdi) +movl %eax,36(%rdi) +movd %xmm0,%rcx +movd %xmm1,%r8 +movd %xmm2,%r9 +movd %xmm3,%rax +pshufd $0x39,%xmm0,%xmm0 +pshufd $0x39,%xmm1,%xmm1 +pshufd $0x39,%xmm2,%xmm2 +pshufd $0x39,%xmm3,%xmm3 +xorl 40(%rsi),%ecx +xorl 24(%rsi),%r8d +xorl 8(%rsi),%r9d +xorl 56(%rsi),%eax +movl %ecx,40(%rdi) +movl %r8d,24(%rdi) +movl %r9d,8(%rdi) +movl %eax,56(%rdi) +movd %xmm0,%rcx +movd %xmm1,%r8 +movd %xmm2,%r9 +movd %xmm3,%rax +xorl 60(%rsi),%ecx +xorl 44(%rsi),%r8d +xorl 28(%rsi),%r9d +xorl 12(%rsi),%eax +movl %ecx,60(%rdi) +movl %r8d,44(%rdi) +movl %r9d,28(%rdi) +movl %eax,12(%rdi) +movq 408(%rsp),%r9 +movl 16(%rsp),%ecx +movl 4+32(%rsp),%r8d +add $1,%rcx +shl $32,%r8 +add %r8,%rcx +mov %rcx,%r8 +shr $32,%r8 +movl %ecx,16(%rsp) +movl %r8d,4+32(%rsp) +cmp $64,%r9 + +ja ._bytesatleast65 + +jae ._bytesatleast64 + +mov %rdi,%rsi +mov %rdx,%rdi +mov %r9,%rcx +rep movsb + +._bytesatleast64: +._done: +movq 352(%rsp),%r11 +movq 360(%rsp),%r12 +movq 368(%rsp),%r13 +movq 376(%rsp),%r14 +movq 384(%rsp),%r15 +movq 392(%rsp),%rbx +movq 400(%rsp),%rbp +add %r11,%rsp +xor %rax,%rax +xor %rdx,%rdx +ret + +._bytesatleast65: +sub $64,%r9 +add $64,%rdi +add $64,%rsi +jmp ._bytesbetween1and255 + +#endif diff --git a/src/libsodium/crypto_stream/salsa20/ref/stream_salsa20.c b/src/libsodium/crypto_stream/salsa20/ref/stream_salsa20_ref.c similarity index 96% rename from src/libsodium/crypto_stream/salsa20/ref/stream_salsa20.c rename to src/libsodium/crypto_stream/salsa20/ref/stream_salsa20_ref.c index 9f5087ce..66015e32 100644 --- a/src/libsodium/crypto_stream/salsa20/ref/stream_salsa20.c +++ b/src/libsodium/crypto_stream/salsa20/ref/stream_salsa20_ref.c @@ -7,6 +7,8 @@ Public domain. #include "api.h" #include "crypto_core_salsa20.h" +#ifndef HAVE_AMD64_ASM + typedef unsigned int uint32; static const unsigned char sigma[16] = { @@ -49,3 +51,5 @@ int crypto_stream( } return 0; } + +#endif diff --git a/src/libsodium/crypto_stream/salsa20/ref/xor_salsa20.c b/src/libsodium/crypto_stream/salsa20/ref/xor_salsa20_ref.c similarity index 96% rename from src/libsodium/crypto_stream/salsa20/ref/xor_salsa20.c rename to src/libsodium/crypto_stream/salsa20/ref/xor_salsa20_ref.c index 925f0572..7d728b27 100644 --- a/src/libsodium/crypto_stream/salsa20/ref/xor_salsa20.c +++ b/src/libsodium/crypto_stream/salsa20/ref/xor_salsa20_ref.c @@ -7,6 +7,8 @@ Public domain. #include "api.h" #include "crypto_core_salsa20.h" +#ifndef HAVE_AMD64_ASM + typedef unsigned int uint32; static const unsigned char sigma[16] = { @@ -52,3 +54,5 @@ int crypto_stream_xor( } return 0; } + +#endif diff --git a/src/libsodium/include/sodium/crypto_scalarmult_curve25519.h.in b/src/libsodium/include/sodium/crypto_scalarmult_curve25519.h.in index 6d9c035e..e87657ff 100644 --- a/src/libsodium/include/sodium/crypto_scalarmult_curve25519.h.in +++ b/src/libsodium/include/sodium/crypto_scalarmult_curve25519.h.in @@ -2,7 +2,9 @@ #define crypto_scalarmult_curve25519_H #if @HAVE_TI_MODE_V@ -# define SODIUM_HAVE_TI_MODE +# ifndef SODIUM_HAVE_TI_MODE +# define SODIUM_HAVE_TI_MODE +# endif #endif #include "export.h" diff --git a/src/libsodium/include/sodium/crypto_stream_salsa20.h b/src/libsodium/include/sodium/crypto_stream_salsa20.h index b80a4327..f3e1091a 100644 --- a/src/libsodium/include/sodium/crypto_stream_salsa20.h +++ b/src/libsodium/include/sodium/crypto_stream_salsa20.h @@ -9,6 +9,12 @@ * the crypto_box functions. */ +#if 1 +# ifndef SODIUM_HAVE_AMD64_ASM +# define SODIUM_HAVE_AMD64_ASM +# endif +#endif + #include "export.h" #define crypto_stream_salsa20_KEYBYTES 32U @@ -24,7 +30,10 @@ int crypto_stream_salsa20(unsigned char *,unsigned long long,const unsigned char SODIUM_EXPORT int crypto_stream_salsa20_xor(unsigned char *,const unsigned char *,unsigned long long,const unsigned char *,const unsigned char *); -#if 1 +#ifdef SODIUM_HAVE_AMD64_ASM +# define crypto_stream_salsa20_amd64_xmm6 crypto_stream_salsa20 +# define crypto_stream_salsa20_amd64_xmm6_xor crypto_stream_salsa20_xor +#else # define crypto_stream_salsa20_ref crypto_stream_salsa20 # define crypto_stream_salsa20_ref_xor crypto_stream_salsa20_xor #endif @@ -33,7 +42,4 @@ int crypto_stream_salsa20_xor(unsigned char *,const unsigned char *,unsigned lon } #endif -#define crypto_stream_salsa20_ref crypto_stream_salsa20 -#define crypto_stream_salsa20_ref_xor crypto_stream_salsa20_xor - #endif diff --git a/src/libsodium/include/sodium/crypto_stream_salsa20.h.in b/src/libsodium/include/sodium/crypto_stream_salsa20.h.in index 211d8d86..9f8e9df2 100644 --- a/src/libsodium/include/sodium/crypto_stream_salsa20.h.in +++ b/src/libsodium/include/sodium/crypto_stream_salsa20.h.in @@ -9,6 +9,12 @@ * the crypto_box functions. */ +#if @HAVE_AMD64_ASM_V@ +# ifndef SODIUM_HAVE_AMD64_ASM +# define SODIUM_HAVE_AMD64_ASM +# endif +#endif + #include "export.h" #define crypto_stream_salsa20_KEYBYTES 32U @@ -24,16 +30,16 @@ int crypto_stream_salsa20(unsigned char *,unsigned long long,const unsigned char SODIUM_EXPORT int crypto_stream_salsa20_xor(unsigned char *,const unsigned char *,unsigned long long,const unsigned char *,const unsigned char *); -#if 1 +#ifdef SODIUM_HAVE_AMD64_ASM +# define crypto_stream_salsa20_amd64_xmm6 crypto_stream_salsa20 +# define crypto_stream_salsa20_amd64_xmm6_xor crypto_stream_salsa20_xor +#else # define crypto_stream_salsa20_ref crypto_stream_salsa20 # define crypto_stream_salsa20_ref_xor crypto_stream_salsa20_xor -#endif +#endif #ifdef __cplusplus } #endif -#define crypto_stream_salsa20_ref crypto_stream_salsa20 -#define crypto_stream_salsa20_ref_xor crypto_stream_salsa20_xor - #endif