diff --git a/configure.ac b/configure.ac index 343d7017..f136d13f 100644 --- a/configure.ac +++ b/configure.ac @@ -102,6 +102,7 @@ AC_ARG_ENABLE(minimal, [ AS_IF([test "x$enableval" = "xyes"], [ enable_minimal="yes" + AC_DEFINE([MINIMAL], [1], [Define for a minimal build, without deprecated functions and functions that high-level APIs depend on]) ], [ enable_minimal="no" ]) diff --git a/src/libsodium/Makefile.am b/src/libsodium/Makefile.am index 83c53e83..9ad50ada 100644 --- a/src/libsodium/Makefile.am +++ b/src/libsodium/Makefile.am @@ -16,8 +16,7 @@ libsodium_la_SOURCES = \ crypto_core/curve25519/ref10/curve25519_ref10.c \ crypto_core/hsalsa20/ref2/core_hsalsa20_ref2.c \ crypto_core/hsalsa20/core_hsalsa20.c \ - crypto_core/salsa20/ref/core_salsa20_ref.c \ - crypto_core/salsa20/core_salsa20.c \ + crypto_core/salsa/ref/core_salsa_ref.c \ crypto_generichash/crypto_generichash.c \ crypto_generichash/blake2b/generichash_blake2.c \ crypto_generichash/blake2b/ref/blake2.h \ @@ -81,6 +80,7 @@ libsodium_la_SOURCES = \ crypto_stream/chacha20/ref/stream_chacha20_ref.c \ crypto_stream/crypto_stream.c \ crypto_stream/salsa20/stream_salsa20.c \ + crypto_stream/salsa20/ref/stream_salsa20_ref.c \ crypto_stream/xsalsa20/stream_xsalsa20.c \ crypto_verify/sodium/verify.c \ include/sodium/private/common.h \ @@ -141,24 +141,11 @@ libsodium_la_SOURCES += \ crypto_scalarmult/curve25519/sandy2x/sandy2x.S endif -if HAVE_AMD64_ASM -libsodium_la_SOURCES += \ - crypto_stream/salsa20/amd64_xmm6/stream_salsa20_amd64_xmm6.S -else -libsodium_la_SOURCES += \ - crypto_stream/salsa20/ref/stream_salsa20_ref.c \ - crypto_stream/salsa20/ref/xor_salsa20_ref.c -endif - if !MINIMAL libsodium_la_SOURCES += \ crypto_aead/xchacha20poly1305/sodium/aead_xchacha20poly1305.c \ crypto_box/curve25519xchacha20poly1305/box_curve25519xchacha20poly1305.c \ crypto_core/hchacha20/core_hchacha20.c \ - crypto_core/salsa2012/core_salsa2012.c \ - crypto_core/salsa2012/ref/core_salsa2012_ref.c \ - 
crypto_core/salsa208/core_salsa208.c \
-	crypto_core/salsa208/ref/core_salsa208_ref.c \
 	crypto_secretbox/xchacha20poly1305/secretbox_xchacha20poly1305.c \
 	crypto_shorthash/siphash24/shorthash_siphashx24.c \
 	crypto_shorthash/siphash24/ref/shorthash_siphashx24_ref.c \
@@ -174,10 +161,8 @@ libsodium_la_SOURCES += \
 	crypto_stream/aes128ctr/nacl/xor_afternm_aes128ctr.c \
 	crypto_stream/aes128ctr/stream_aes128ctr.c \
 	crypto_stream/salsa2012/ref/stream_salsa2012_ref.c \
-	crypto_stream/salsa2012/ref/xor_salsa2012.c \
 	crypto_stream/salsa2012/stream_salsa2012.c \
 	crypto_stream/salsa208/ref/stream_salsa208_ref.c \
-	crypto_stream/salsa208/ref/xor_salsa208.c \
 	crypto_stream/salsa208/stream_salsa208.c \
 	crypto_stream/xchacha20/stream_xchacha20.c
 endif
@@ -219,7 +204,10 @@ libsse2_la_CPPFLAGS = $(libsodium_la_CPPFLAGS) \
 libsse2_la_SOURCES = \
 	crypto_pwhash/scryptsalsa208sha256/sse/pwhash_scryptsalsa208sha256_sse.c \
 	crypto_onetimeauth/poly1305/sse2/poly1305_sse2.c \
-	crypto_onetimeauth/poly1305/sse2/poly1305_sse2.h
+	crypto_onetimeauth/poly1305/sse2/poly1305_sse2.h \
+	crypto_stream/salsa20/xmm6int/stream_salsa20_xmm6int.c \
+	crypto_stream/salsa20/xmm6int/u1.h \
+	crypto_stream/salsa20/xmm6int/u4.h
 
 libssse3_la_LDFLAGS = $(libsodium_la_LDFLAGS)
 libssse3_la_CPPFLAGS = $(libsodium_la_CPPFLAGS) \
diff --git a/src/libsodium/crypto_core/salsa/ref/core_salsa_ref.c b/src/libsodium/crypto_core/salsa/ref/core_salsa_ref.c
new file mode 100644
index 00000000..656593d1
--- /dev/null
+++ b/src/libsodium/crypto_core/salsa/ref/core_salsa_ref.c
@@ -0,0 +1,213 @@
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "crypto_core_salsa20.h"
+#include "crypto_core_salsa2012.h"
+#include "crypto_core_salsa208.h"
+#include "private/common.h"
+
+/*
+ * Salsa core shared by the 20-, 12- and 8-round variants.
+ *
+ * Loads the 16-word state with LOAD32_LE from 16 bytes of `in`, 32 bytes of
+ * `k` and 16 bytes of `c` (when `c` is NULL the fixed words below, ASCII
+ * "expand 32-byte k", are used instead), applies `rounds` rounds, two per
+ * loop iteration, adds the initial state back word by word and writes the
+ * 64-byte result to `out` with STORE32_LE.
+ */
+static void
+crypto_core_salsa(unsigned char *out, const unsigned char *in,
+                  const unsigned char *k, const unsigned char *c,
+                  const int rounds)
+{
+    uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14,
+        x15;
+    uint32_t j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14,
+        j15;
+    int i;
+
+    /* Seed both the working words (x) and their saved copies (j): the final
+     * feed-forward adds j to x, so j0/j5/j10/j15 must be defined even when
+     * the caller passes c == NULL. */
+    j0  = x0  = 0x61707865;
+    j5  = x5  = 0x3320646e;
+    j10 = x10 = 0x79622d32;
+    j15 = x15 = 0x6b206574;
+    if (c != NULL) {
+        j0  = x0  = LOAD32_LE(c + 0);
+        j5  = x5  = LOAD32_LE(c + 4);
+        j10 = x10 = LOAD32_LE(c + 8);
+        j15 = x15 = LOAD32_LE(c + 12);
+    }
+    j1  = x1  = LOAD32_LE(k + 0);
+    j2  = x2  = LOAD32_LE(k + 4);
+    j3  = x3  = LOAD32_LE(k + 8);
+    j4  = x4  = LOAD32_LE(k + 12);
+    j11 = x11 = LOAD32_LE(k + 16);
+    j12 = x12 = LOAD32_LE(k + 20);
+    j13 = x13 = LOAD32_LE(k + 24);
+    j14 = x14 = LOAD32_LE(k + 28);
+
+    j6 = x6 = LOAD32_LE(in + 0);
+    j7 = x7 = LOAD32_LE(in + 4);
+    j8 = x8 = LOAD32_LE(in + 8);
+    j9 = x9 = LOAD32_LE(in + 12);
+
+    for (i = 0; i < rounds; i += 2) {
+        /* column round */
+        x4 ^= ROTL32(x0 + x12, 7);
+        x8 ^= ROTL32(x4 + x0, 9);
+        x12 ^= ROTL32(x8 + x4, 13);
+        x0 ^= ROTL32(x12 + x8, 18);
+        x9 ^= ROTL32(x5 + x1, 7);
+        x13 ^= ROTL32(x9 + x5, 9);
+        x1 ^= ROTL32(x13 + x9, 13);
+        x5 ^= ROTL32(x1 + x13, 18);
+        x14 ^= ROTL32(x10 + x6, 7);
+        x2 ^= ROTL32(x14 + x10, 9);
+        x6 ^= ROTL32(x2 + x14, 13);
+        x10 ^= ROTL32(x6 + x2, 18);
+        x3 ^= ROTL32(x15 + x11, 7);
+        x7 ^= ROTL32(x3 + x15, 9);
+        x11 ^= ROTL32(x7 + x3, 13);
+        x15 ^= ROTL32(x11 + x7, 18);
+        /* row round */
+        x1 ^= ROTL32(x0 + x3, 7);
+        x2 ^= ROTL32(x1 + x0, 9);
+        x3 ^= ROTL32(x2 + x1, 13);
+        x0 ^= ROTL32(x3 + x2, 18);
+        x6 ^= ROTL32(x5 + x4, 7);
+        x7 ^= ROTL32(x6 + x5, 9);
+        x4 ^= ROTL32(x7 + x6, 13);
+        x5 ^= ROTL32(x4 + x7, 18);
+        x11 ^= ROTL32(x10 + x9, 7);
+        x8 ^= ROTL32(x11 + x10, 9);
+        x9 ^= ROTL32(x8 + x11, 13);
+        x10 ^= ROTL32(x9 + x8, 18);
+        x12 ^= ROTL32(x15 + x14, 7);
+        x13 ^= ROTL32(x12 + x15, 9);
+        x14 ^= ROTL32(x13 + x12, 13);
+        x15 ^= ROTL32(x14 + x13, 18);
+    }
+    /* feed-forward: add the initial state to the permuted state */
+    STORE32_LE(out + 0, x0 + j0);
+    STORE32_LE(out + 4, x1 + j1);
+    STORE32_LE(out + 8, x2 + j2);
+    STORE32_LE(out + 12, x3 + j3);
+    STORE32_LE(out + 16, x4 + j4);
+    STORE32_LE(out + 20, x5 + j5);
+    STORE32_LE(out + 24, x6 + j6);
+    STORE32_LE(out + 28, x7 + j7);
+    STORE32_LE(out + 32, x8 + j8);
+    STORE32_LE(out + 36, x9 + j9);
+    STORE32_LE(out + 40, x10 + j10);
+    STORE32_LE(out + 44, x11 + j11);
+    STORE32_LE(out + 48, x12 + j12);
+    STORE32_LE(out + 52, x13 + j13);
+    STORE32_LE(out + 56, x14 + j14);
+    STORE32_LE(out + 60, x15 + j15);
+}
+
+/* Salsa core with 20 rounds; always returns 0. */
+int
+crypto_core_salsa20(unsigned char *out, const unsigned char *in,
+                    const unsigned char *k, const unsigned char *c)
+{
+    crypto_core_salsa(out, in, k, c, 20);
+    return 0;
+}
+
+size_t
+crypto_core_salsa20_outputbytes(void)
+{
+    return crypto_core_salsa20_OUTPUTBYTES;
+}
+
+size_t
+crypto_core_salsa20_inputbytes(void)
+{
+    return crypto_core_salsa20_INPUTBYTES;
+}
+
+size_t
+crypto_core_salsa20_keybytes(void)
+{
+    return crypto_core_salsa20_KEYBYTES;
+}
+
+size_t
+crypto_core_salsa20_constbytes(void)
+{
+    return crypto_core_salsa20_CONSTBYTES;
+}
+
+#ifndef MINIMAL
+
+/* Salsa core with 12 rounds; always returns 0. */
+int
+crypto_core_salsa2012(unsigned char *out, const unsigned char *in,
+                      const unsigned char *k, const unsigned char *c)
+{
+    crypto_core_salsa(out, in, k, c, 12);
+    return 0;
+}
+
+size_t
+crypto_core_salsa2012_outputbytes(void)
+{
+    return crypto_core_salsa2012_OUTPUTBYTES;
+}
+
+size_t
+crypto_core_salsa2012_inputbytes(void)
+{
+    return crypto_core_salsa2012_INPUTBYTES;
+}
+
+size_t
+crypto_core_salsa2012_keybytes(void)
+{
+    return crypto_core_salsa2012_KEYBYTES;
+}
+
+size_t
+crypto_core_salsa2012_constbytes(void)
+{
+    return crypto_core_salsa2012_CONSTBYTES;
+}
+
+/* Salsa core with 8 rounds; always returns 0. */
+int
+crypto_core_salsa208(unsigned char *out, const unsigned char *in,
+                     const unsigned char *k, const unsigned char *c)
+{
+    crypto_core_salsa(out, in, k, c, 8);
+    return 0;
+}
+
+size_t
+crypto_core_salsa208_outputbytes(void)
+{
+    return crypto_core_salsa208_OUTPUTBYTES;
+}
+
+size_t
+crypto_core_salsa208_inputbytes(void)
+{
+    return crypto_core_salsa208_INPUTBYTES;
+}
+
+size_t
+crypto_core_salsa208_keybytes(void)
+{
+    return crypto_core_salsa208_KEYBYTES;
+}
+
+size_t
+crypto_core_salsa208_constbytes(void)
+{
+    return crypto_core_salsa208_CONSTBYTES;
+}
+
+#endif
diff --git a/src/libsodium/crypto_core/salsa20/core_salsa20.c
b/src/libsodium/crypto_core/salsa20/core_salsa20.c deleted file mode 100644 index 910b4619..00000000 --- a/src/libsodium/crypto_core/salsa20/core_salsa20.c +++ /dev/null @@ -1,21 +0,0 @@ -#include "crypto_core_salsa20.h" - -size_t -crypto_core_salsa20_outputbytes(void) { - return crypto_core_salsa20_OUTPUTBYTES; -} - -size_t -crypto_core_salsa20_inputbytes(void) { - return crypto_core_salsa20_INPUTBYTES; -} - -size_t -crypto_core_salsa20_keybytes(void) { - return crypto_core_salsa20_KEYBYTES; -} - -size_t -crypto_core_salsa20_constbytes(void) { - return crypto_core_salsa20_CONSTBYTES; -} diff --git a/src/libsodium/crypto_core/salsa20/ref/core_salsa20_ref.c b/src/libsodium/crypto_core/salsa20/ref/core_salsa20_ref.c deleted file mode 100644 index 378b6dac..00000000 --- a/src/libsodium/crypto_core/salsa20/ref/core_salsa20_ref.c +++ /dev/null @@ -1,122 +0,0 @@ -/* -version 20080912 -D. J. Bernstein -Public domain. -*/ - -#include -#include - -#include "crypto_core_salsa20.h" -#include "private/common.h" - -#define ROUNDS 20 -#define U32C(v) (v##U) - -int -crypto_core_salsa20(unsigned char *out, - const unsigned char *in, - const unsigned char *k, - const unsigned char *c) -{ - uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, - x9, x10, x11, x12, x13, x14, x15; - uint32_t j0, j1, j2, j3, j4, j5, j6, j7, j8, - j9, j10, j11, j12, j13, j14, j15; - int i; - - if (c == NULL) { - j0 = x0 = U32C(0x61707865); - j5 = x5 = U32C(0x3320646e); - j10 = x10 = U32C(0x79622d32); - j15 = x15 = U32C(0x6b206574); - } else { - j0 = x0 = LOAD32_LE(c + 0); - j5 = x5 = LOAD32_LE(c + 4); - j10 = x10 = LOAD32_LE(c + 8); - j15 = x15 = LOAD32_LE(c + 12); - } - j1 = x1 = LOAD32_LE(k + 0); - j2 = x2 = LOAD32_LE(k + 4); - j3 = x3 = LOAD32_LE(k + 8); - j4 = x4 = LOAD32_LE(k + 12); - j6 = x6 = LOAD32_LE(in + 0); - j7 = x7 = LOAD32_LE(in + 4); - j8 = x8 = LOAD32_LE(in + 8); - j9 = x9 = LOAD32_LE(in + 12); - j11 = x11 = LOAD32_LE(k + 16); - j12 = x12 = LOAD32_LE(k + 20); - j13 = x13 = LOAD32_LE(k + 24); - 
j14 = x14 = LOAD32_LE(k + 28); - - for (i = ROUNDS; i > 0; i -= 2) { - x4 ^= ROTL32(x0 + x12, 7); - x8 ^= ROTL32(x4 + x0, 9); - x12 ^= ROTL32(x8 + x4, 13); - x0 ^= ROTL32(x12 + x8, 18); - x9 ^= ROTL32(x5 + x1, 7); - x13 ^= ROTL32(x9 + x5, 9); - x1 ^= ROTL32(x13 + x9, 13); - x5 ^= ROTL32(x1 + x13, 18); - x14 ^= ROTL32(x10 + x6, 7); - x2 ^= ROTL32(x14 + x10, 9); - x6 ^= ROTL32(x2 + x14, 13); - x10 ^= ROTL32(x6 + x2, 18); - x3 ^= ROTL32(x15 + x11, 7); - x7 ^= ROTL32(x3 + x15, 9); - x11 ^= ROTL32(x7 + x3, 13); - x15 ^= ROTL32(x11 + x7, 18); - x1 ^= ROTL32(x0 + x3, 7); - x2 ^= ROTL32(x1 + x0, 9); - x3 ^= ROTL32(x2 + x1, 13); - x0 ^= ROTL32(x3 + x2, 18); - x6 ^= ROTL32(x5 + x4, 7); - x7 ^= ROTL32(x6 + x5, 9); - x4 ^= ROTL32(x7 + x6, 13); - x5 ^= ROTL32(x4 + x7, 18); - x11 ^= ROTL32(x10 + x9, 7); - x8 ^= ROTL32(x11 + x10, 9); - x9 ^= ROTL32(x8 + x11, 13); - x10 ^= ROTL32(x9 + x8, 18); - x12 ^= ROTL32(x15 + x14, 7); - x13 ^= ROTL32(x12 + x15, 9); - x14 ^= ROTL32(x13 + x12, 13); - x15 ^= ROTL32(x14 + x13, 18); - } - - x0 += j0; - x1 += j1; - x2 += j2; - x3 += j3; - x4 += j4; - x5 += j5; - x6 += j6; - x7 += j7; - x8 += j8; - x9 += j9; - x10 += j10; - x11 += j11; - x12 += j12; - x13 += j13; - x14 += j14; - x15 += j15; - - STORE32_LE(out + 0, x0); - STORE32_LE(out + 4, x1); - STORE32_LE(out + 8, x2); - STORE32_LE(out + 12, x3); - STORE32_LE(out + 16, x4); - STORE32_LE(out + 20, x5); - STORE32_LE(out + 24, x6); - STORE32_LE(out + 28, x7); - STORE32_LE(out + 32, x8); - STORE32_LE(out + 36, x9); - STORE32_LE(out + 40, x10); - STORE32_LE(out + 44, x11); - STORE32_LE(out + 48, x12); - STORE32_LE(out + 52, x13); - STORE32_LE(out + 56, x14); - STORE32_LE(out + 60, x15); - - return 0; -} diff --git a/src/libsodium/crypto_core/salsa2012/core_salsa2012.c b/src/libsodium/crypto_core/salsa2012/core_salsa2012.c deleted file mode 100644 index e49a81e7..00000000 --- a/src/libsodium/crypto_core/salsa2012/core_salsa2012.c +++ /dev/null @@ -1,21 +0,0 @@ -#include "crypto_core_salsa2012.h" - 
-size_t -crypto_core_salsa2012_outputbytes(void) { - return crypto_core_salsa2012_OUTPUTBYTES; -} - -size_t -crypto_core_salsa2012_inputbytes(void) { - return crypto_core_salsa2012_INPUTBYTES; -} - -size_t -crypto_core_salsa2012_keybytes(void) { - return crypto_core_salsa2012_KEYBYTES; -} - -size_t -crypto_core_salsa2012_constbytes(void) { - return crypto_core_salsa2012_CONSTBYTES; -} diff --git a/src/libsodium/crypto_core/salsa2012/ref/core_salsa2012_ref.c b/src/libsodium/crypto_core/salsa2012/ref/core_salsa2012_ref.c deleted file mode 100644 index 9c66a066..00000000 --- a/src/libsodium/crypto_core/salsa2012/ref/core_salsa2012_ref.c +++ /dev/null @@ -1,122 +0,0 @@ -/* -version 20080913 -D. J. Bernstein -Public domain. -*/ - -#include -#include - -#include "crypto_core_salsa2012.h" -#include "private/common.h" - -#define ROUNDS 12 -#define U32C(v) (v##U) - -int -crypto_core_salsa2012(unsigned char *out, - const unsigned char *in, - const unsigned char *k, - const unsigned char *c) -{ - uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, - x9, x10, x11, x12, x13, x14, x15; - uint32_t j0, j1, j2, j3, j4, j5, j6, j7, j8, - j9, j10, j11, j12, j13, j14, j15; - int i; - - if (c == NULL) { - j0 = x0 = U32C(0x61707865); - j5 = x5 = U32C(0x3320646e); - j10 = x10 = U32C(0x79622d32); - j15 = x15 = U32C(0x6b206574); - } else { - j0 = x0 = LOAD32_LE(c + 0); - j5 = x5 = LOAD32_LE(c + 4); - j10 = x10 = LOAD32_LE(c + 8); - j15 = x15 = LOAD32_LE(c + 12); - } - j1 = x1 = LOAD32_LE(k + 0); - j2 = x2 = LOAD32_LE(k + 4); - j3 = x3 = LOAD32_LE(k + 8); - j4 = x4 = LOAD32_LE(k + 12); - j6 = x6 = LOAD32_LE(in + 0); - j7 = x7 = LOAD32_LE(in + 4); - j8 = x8 = LOAD32_LE(in + 8); - j9 = x9 = LOAD32_LE(in + 12); - j11 = x11 = LOAD32_LE(k + 16); - j12 = x12 = LOAD32_LE(k + 20); - j13 = x13 = LOAD32_LE(k + 24); - j14 = x14 = LOAD32_LE(k + 28); - - for (i = ROUNDS; i > 0; i -= 2) { - x4 ^= ROTL32(x0 + x12, 7); - x8 ^= ROTL32(x4 + x0, 9); - x12 ^= ROTL32(x8 + x4, 13); - x0 ^= ROTL32(x12 + x8, 18); - x9 ^= 
ROTL32(x5 + x1, 7); - x13 ^= ROTL32(x9 + x5, 9); - x1 ^= ROTL32(x13 + x9, 13); - x5 ^= ROTL32(x1 + x13, 18); - x14 ^= ROTL32(x10 + x6, 7); - x2 ^= ROTL32(x14 + x10, 9); - x6 ^= ROTL32(x2 + x14, 13); - x10 ^= ROTL32(x6 + x2, 18); - x3 ^= ROTL32(x15 + x11, 7); - x7 ^= ROTL32(x3 + x15, 9); - x11 ^= ROTL32(x7 + x3, 13); - x15 ^= ROTL32(x11 + x7, 18); - x1 ^= ROTL32(x0 + x3, 7); - x2 ^= ROTL32(x1 + x0, 9); - x3 ^= ROTL32(x2 + x1, 13); - x0 ^= ROTL32(x3 + x2, 18); - x6 ^= ROTL32(x5 + x4, 7); - x7 ^= ROTL32(x6 + x5, 9); - x4 ^= ROTL32(x7 + x6, 13); - x5 ^= ROTL32(x4 + x7, 18); - x11 ^= ROTL32(x10 + x9, 7); - x8 ^= ROTL32(x11 + x10, 9); - x9 ^= ROTL32(x8 + x11, 13); - x10 ^= ROTL32(x9 + x8, 18); - x12 ^= ROTL32(x15 + x14, 7); - x13 ^= ROTL32(x12 + x15, 9); - x14 ^= ROTL32(x13 + x12, 13); - x15 ^= ROTL32(x14 + x13, 18); - } - - x0 += j0; - x1 += j1; - x2 += j2; - x3 += j3; - x4 += j4; - x5 += j5; - x6 += j6; - x7 += j7; - x8 += j8; - x9 += j9; - x10 += j10; - x11 += j11; - x12 += j12; - x13 += j13; - x14 += j14; - x15 += j15; - - STORE32_LE(out + 0, x0); - STORE32_LE(out + 4, x1); - STORE32_LE(out + 8, x2); - STORE32_LE(out + 12, x3); - STORE32_LE(out + 16, x4); - STORE32_LE(out + 20, x5); - STORE32_LE(out + 24, x6); - STORE32_LE(out + 28, x7); - STORE32_LE(out + 32, x8); - STORE32_LE(out + 36, x9); - STORE32_LE(out + 40, x10); - STORE32_LE(out + 44, x11); - STORE32_LE(out + 48, x12); - STORE32_LE(out + 52, x13); - STORE32_LE(out + 56, x14); - STORE32_LE(out + 60, x15); - - return 0; -} diff --git a/src/libsodium/crypto_core/salsa208/core_salsa208.c b/src/libsodium/crypto_core/salsa208/core_salsa208.c deleted file mode 100644 index 72c336c4..00000000 --- a/src/libsodium/crypto_core/salsa208/core_salsa208.c +++ /dev/null @@ -1,21 +0,0 @@ -#include "crypto_core_salsa208.h" - -size_t -crypto_core_salsa208_outputbytes(void) { - return crypto_core_salsa208_OUTPUTBYTES; -} - -size_t -crypto_core_salsa208_inputbytes(void) { - return crypto_core_salsa208_INPUTBYTES; -} - -size_t 
-crypto_core_salsa208_keybytes(void) { - return crypto_core_salsa208_KEYBYTES; -} - -size_t -crypto_core_salsa208_constbytes(void) { - return crypto_core_salsa208_CONSTBYTES; -} diff --git a/src/libsodium/crypto_core/salsa208/ref/core_salsa208_ref.c b/src/libsodium/crypto_core/salsa208/ref/core_salsa208_ref.c deleted file mode 100644 index e2c2ee34..00000000 --- a/src/libsodium/crypto_core/salsa208/ref/core_salsa208_ref.c +++ /dev/null @@ -1,122 +0,0 @@ -/* -version 20080913 -D. J. Bernstein -Public domain. -*/ - -#include -#include - -#include "crypto_core_salsa208.h" -#include "private/common.h" - -#define ROUNDS 8 -#define U32C(v) (v##U) - -int -crypto_core_salsa208(unsigned char *out, - const unsigned char *in, - const unsigned char *k, - const unsigned char *c) -{ - uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, - x9, x10, x11, x12, x13, x14, x15; - uint32_t j0, j1, j2, j3, j4, j5, j6, j7, j8, - j9, j10, j11, j12, j13, j14, j15; - int i; - - if (c == NULL) { - j0 = x0 = U32C(0x61707865); - j5 = x5 = U32C(0x3320646e); - j10 = x10 = U32C(0x79622d32); - j15 = x15 = U32C(0x6b206574); - } else { - j0 = x0 = LOAD32_LE(c + 0); - j5 = x5 = LOAD32_LE(c + 4); - j10 = x10 = LOAD32_LE(c + 8); - j15 = x15 = LOAD32_LE(c + 12); - } - j1 = x1 = LOAD32_LE(k + 0); - j2 = x2 = LOAD32_LE(k + 4); - j3 = x3 = LOAD32_LE(k + 8); - j4 = x4 = LOAD32_LE(k + 12); - j6 = x6 = LOAD32_LE(in + 0); - j7 = x7 = LOAD32_LE(in + 4); - j8 = x8 = LOAD32_LE(in + 8); - j9 = x9 = LOAD32_LE(in + 12); - j11 = x11 = LOAD32_LE(k + 16); - j12 = x12 = LOAD32_LE(k + 20); - j13 = x13 = LOAD32_LE(k + 24); - j14 = x14 = LOAD32_LE(k + 28); - - for (i = ROUNDS; i > 0; i -= 2) { - x4 ^= ROTL32(x0 + x12, 7); - x8 ^= ROTL32(x4 + x0, 9); - x12 ^= ROTL32(x8 + x4, 13); - x0 ^= ROTL32(x12 + x8, 18); - x9 ^= ROTL32(x5 + x1, 7); - x13 ^= ROTL32(x9 + x5, 9); - x1 ^= ROTL32(x13 + x9, 13); - x5 ^= ROTL32(x1 + x13, 18); - x14 ^= ROTL32(x10 + x6, 7); - x2 ^= ROTL32(x14 + x10, 9); - x6 ^= ROTL32(x2 + x14, 13); - x10 ^= ROTL32(x6 
+ x2, 18); - x3 ^= ROTL32(x15 + x11, 7); - x7 ^= ROTL32(x3 + x15, 9); - x11 ^= ROTL32(x7 + x3, 13); - x15 ^= ROTL32(x11 + x7, 18); - x1 ^= ROTL32(x0 + x3, 7); - x2 ^= ROTL32(x1 + x0, 9); - x3 ^= ROTL32(x2 + x1, 13); - x0 ^= ROTL32(x3 + x2, 18); - x6 ^= ROTL32(x5 + x4, 7); - x7 ^= ROTL32(x6 + x5, 9); - x4 ^= ROTL32(x7 + x6, 13); - x5 ^= ROTL32(x4 + x7, 18); - x11 ^= ROTL32(x10 + x9, 7); - x8 ^= ROTL32(x11 + x10, 9); - x9 ^= ROTL32(x8 + x11, 13); - x10 ^= ROTL32(x9 + x8, 18); - x12 ^= ROTL32(x15 + x14, 7); - x13 ^= ROTL32(x12 + x15, 9); - x14 ^= ROTL32(x13 + x12, 13); - x15 ^= ROTL32(x14 + x13, 18); - } - - x0 += j0; - x1 += j1; - x2 += j2; - x3 += j3; - x4 += j4; - x5 += j5; - x6 += j6; - x7 += j7; - x8 += j8; - x9 += j9; - x10 += j10; - x11 += j11; - x12 += j12; - x13 += j13; - x14 += j14; - x15 += j15; - - STORE32_LE(out + 0, x0); - STORE32_LE(out + 4, x1); - STORE32_LE(out + 8, x2); - STORE32_LE(out + 12, x3); - STORE32_LE(out + 16, x4); - STORE32_LE(out + 20, x5); - STORE32_LE(out + 24, x6); - STORE32_LE(out + 28, x7); - STORE32_LE(out + 32, x8); - STORE32_LE(out + 36, x9); - STORE32_LE(out + 40, x10); - STORE32_LE(out + 44, x11); - STORE32_LE(out + 48, x12); - STORE32_LE(out + 52, x13); - STORE32_LE(out + 56, x14); - STORE32_LE(out + 60, x15); - - return 0; -} diff --git a/src/libsodium/crypto_stream/chacha20/ref/stream_chacha20_ref.c b/src/libsodium/crypto_stream/chacha20/ref/stream_chacha20_ref.c index b8cc6baa..afb384f5 100644 --- a/src/libsodium/crypto_stream/chacha20/ref/stream_chacha20_ref.c +++ b/src/libsodium/crypto_stream/chacha20/ref/stream_chacha20_ref.c @@ -77,8 +77,8 @@ chacha_ietf_ivsetup(chacha_ctx *ctx, const uint8_t *iv, const uint8_t *counter) } static void -chacha_encrypt_bytes(chacha_ctx *ctx, const uint8_t *m, uint8_t *c, - unsigned long long bytes) +chacha20_encrypt_bytes(chacha_ctx *ctx, const uint8_t *m, uint8_t *c, + unsigned long long bytes) { uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; @@ -235,7 
+235,7 @@ stream_ref(unsigned char *c, unsigned long long clen, const unsigned char *n, chacha_keysetup(&ctx, k); chacha_ivsetup(&ctx, n, NULL); memset(c, 0, clen); - chacha_encrypt_bytes(&ctx, c, c, clen); + chacha20_encrypt_bytes(&ctx, c, c, clen); sodium_memzero(&ctx, sizeof ctx); return 0; @@ -254,7 +254,7 @@ stream_ietf_ref(unsigned char *c, unsigned long long clen, chacha_keysetup(&ctx, k); chacha_ietf_ivsetup(&ctx, n, NULL); memset(c, 0, clen); - chacha_encrypt_bytes(&ctx, c, c, clen); + chacha20_encrypt_bytes(&ctx, c, c, clen); sodium_memzero(&ctx, sizeof ctx); return 0; @@ -279,7 +279,7 @@ stream_ref_xor_ic(unsigned char *c, const unsigned char *m, STORE32_LE(&ic_bytes[4], ic_high); chacha_keysetup(&ctx, k); chacha_ivsetup(&ctx, n, ic_bytes); - chacha_encrypt_bytes(&ctx, m, c, mlen); + chacha20_encrypt_bytes(&ctx, m, c, mlen); sodium_memzero(&ctx, sizeof ctx); return 0; @@ -299,7 +299,7 @@ stream_ietf_ref_xor_ic(unsigned char *c, const unsigned char *m, STORE32_LE(ic_bytes, ic); chacha_keysetup(&ctx, k); chacha_ietf_ivsetup(&ctx, n, ic_bytes); - chacha_encrypt_bytes(&ctx, m, c, mlen); + chacha20_encrypt_bytes(&ctx, m, c, mlen); sodium_memzero(&ctx, sizeof ctx); return 0; diff --git a/src/libsodium/crypto_stream/chacha20/vec/stream_chacha20_vec.c b/src/libsodium/crypto_stream/chacha20/vec/stream_chacha20_vec.c index f09944ef..734c9b0c 100644 --- a/src/libsodium/crypto_stream/chacha20/vec/stream_chacha20_vec.c +++ b/src/libsodium/crypto_stream/chacha20/vec/stream_chacha20_vec.c @@ -129,8 +129,8 @@ chacha_keysetup(chacha_ctx *ctx, const uint8_t *k) } static void -chacha_encrypt_bytes(chacha_ctx *ctx, const uint8_t *in, uint8_t *out, - unsigned long long inlen) +chacha20_encrypt_bytes(chacha_ctx *ctx, const uint8_t *in, uint8_t *out, + unsigned long long inlen) { CRYPTO_ALIGN(16) unsigned chacha_const[] = { 0x61707865, 0x3320646E, 0x79622D32, @@ -258,7 +258,7 @@ stream_vec(unsigned char *c, unsigned long long clen, const unsigned char *n, chacha_keysetup(&ctx, 
k); chacha_ivsetup(&ctx, n, 0ULL); memset(c, 0, clen); - chacha_encrypt_bytes(&ctx, c, c, clen); + chacha20_encrypt_bytes(&ctx, c, c, clen); sodium_memzero(&ctx, sizeof ctx); return 0; @@ -277,7 +277,7 @@ stream_ietf_vec(unsigned char *c, unsigned long long clen, chacha_keysetup(&ctx, k); chacha_ietf_ivsetup(&ctx, n, 0ULL); memset(c, 0, clen); - chacha_encrypt_bytes(&ctx, c, c, clen); + chacha20_encrypt_bytes(&ctx, c, c, clen); sodium_memzero(&ctx, sizeof ctx); return 0; @@ -295,7 +295,7 @@ stream_vec_xor_ic(unsigned char *c, const unsigned char *m, } chacha_keysetup(&ctx, k); chacha_ivsetup(&ctx, n, ic); - chacha_encrypt_bytes(&ctx, m, c, mlen); + chacha20_encrypt_bytes(&ctx, m, c, mlen); sodium_memzero(&ctx, sizeof ctx); return 0; @@ -313,7 +313,7 @@ stream_ietf_vec_xor_ic(unsigned char *c, const unsigned char *m, } chacha_keysetup(&ctx, k); chacha_ietf_ivsetup(&ctx, n, ic); - chacha_encrypt_bytes(&ctx, m, c, mlen); + chacha20_encrypt_bytes(&ctx, m, c, mlen); sodium_memzero(&ctx, sizeof ctx); return 0; diff --git a/src/libsodium/crypto_stream/salsa20/amd64_xmm6/stream_salsa20_amd64_xmm6.S b/src/libsodium/crypto_stream/salsa20/amd64_xmm6/stream_salsa20_amd64_xmm6.S deleted file mode 100644 index 1c7850a5..00000000 --- a/src/libsodium/crypto_stream/salsa20/amd64_xmm6/stream_salsa20_amd64_xmm6.S +++ /dev/null @@ -1,952 +0,0 @@ -#ifdef HAVE_AMD64_ASM - -.text -.p2align 5 - -.globl crypto_stream_salsa20 -.globl _crypto_stream_salsa20 -#ifdef __ELF__ -.type crypto_stream_salsa20, @function -.type _crypto_stream_salsa20, @function -#endif -crypto_stream_salsa20: -_crypto_stream_salsa20: -mov %rsp,%r11 -and $31,%r11 -add $512,%r11 -sub %r11,%rsp -movq %r11,416(%rsp) -movq %r12,424(%rsp) -movq %r13,432(%rsp) -movq %r14,440(%rsp) -movq %r15,448(%rsp) -movq %rbx,456(%rsp) -movq %rbp,464(%rsp) -mov %rsi,%r9 -mov %rdi,%rdi -mov %rdi,%rsi -mov %rdx,%rdx -mov %rcx,%r10 -cmp $0,%r9 -jbe ._done -mov $0,%rax -mov %r9,%rcx -rep stosb -sub %r9,%rdi -movq $0,472(%rsp) -jmp ._start - 
-.text -.p2align 5 - -.globl crypto_stream_salsa20_xor_ic -.globl _crypto_stream_salsa20_xor_ic -#ifdef __ELF__ -.type crypto_stream_salsa20_xor_ic, @function -.type _crypto_stream_salsa20_xor_ic, @function -#endif -crypto_stream_salsa20_xor_ic: -_crypto_stream_salsa20_xor_ic: - -mov %rsp,%r11 -and $31,%r11 -add $512,%r11 -sub %r11,%rsp -movq %r11,416(%rsp) -movq %r12,424(%rsp) -movq %r13,432(%rsp) -movq %r14,440(%rsp) -movq %r15,448(%rsp) -movq %rbx,456(%rsp) -movq %rbp,464(%rsp) -mov %rdi,%rdi -mov %rsi,%rsi -mov %r9,%r10 -movq %r8,472(%rsp) -mov %rdx,%r9 -mov %rcx,%rdx -cmp $0,%r9 -jbe ._done - -._start: -movl 20(%r10),%ecx -movl 0(%r10),%r8d -movl 0(%rdx),%eax -movl 16(%r10),%r11d -movl %ecx,64(%rsp) -movl %r8d,4+64(%rsp) -movl %eax,8+64(%rsp) -movl %r11d,12+64(%rsp) -movl 24(%r10),%r8d -movl 4(%r10),%eax -movl 4(%rdx),%edx -movq 472(%rsp),%rcx -movl %ecx,80(%rsp) -movl %r8d,4+80(%rsp) -movl %eax,8+80(%rsp) -movl %edx,12+80(%rsp) -movl 12(%r10),%edx -shr $32,%rcx -movl 28(%r10),%r8d -movl 8(%r10),%eax -movl %edx,96(%rsp) -movl %ecx,4+96(%rsp) -movl %r8d,8+96(%rsp) -movl %eax,12+96(%rsp) -mov $1634760805,%rdx -mov $857760878,%rcx -mov $2036477234,%r8 -mov $1797285236,%rax -movl %edx,112(%rsp) -movl %ecx,4+112(%rsp) -movl %r8d,8+112(%rsp) -movl %eax,12+112(%rsp) -cmp $256,%r9 -jb ._bytesbetween1and255 -movdqa 112(%rsp),%xmm0 -pshufd $0x55,%xmm0,%xmm1 -pshufd $0xaa,%xmm0,%xmm2 -pshufd $0xff,%xmm0,%xmm3 -pshufd $0x00,%xmm0,%xmm0 -movdqa %xmm1,128(%rsp) -movdqa %xmm2,144(%rsp) -movdqa %xmm3,160(%rsp) -movdqa %xmm0,176(%rsp) -movdqa 64(%rsp),%xmm0 -pshufd $0xaa,%xmm0,%xmm1 -pshufd $0xff,%xmm0,%xmm2 -pshufd $0x00,%xmm0,%xmm3 -pshufd $0x55,%xmm0,%xmm0 -movdqa %xmm1,192(%rsp) -movdqa %xmm2,208(%rsp) -movdqa %xmm3,224(%rsp) -movdqa %xmm0,240(%rsp) -movdqa 80(%rsp),%xmm0 -pshufd $0xff,%xmm0,%xmm1 -pshufd $0x55,%xmm0,%xmm2 -pshufd $0xaa,%xmm0,%xmm0 -movdqa %xmm1,256(%rsp) -movdqa %xmm2,272(%rsp) -movdqa %xmm0,288(%rsp) -movdqa 96(%rsp),%xmm0 -pshufd $0x00,%xmm0,%xmm1 
-pshufd $0xaa,%xmm0,%xmm2 -pshufd $0xff,%xmm0,%xmm0 -movdqa %xmm1,304(%rsp) -movdqa %xmm2,320(%rsp) -movdqa %xmm0,336(%rsp) - -.p2align 4 -._bytesatleast256: -movq 472(%rsp),%rdx -mov %rdx,%rcx -shr $32,%rcx -movl %edx,352(%rsp) -movl %ecx,368(%rsp) -add $1,%rdx -mov %rdx,%rcx -shr $32,%rcx -movl %edx,4+352(%rsp) -movl %ecx,4+368(%rsp) -add $1,%rdx -mov %rdx,%rcx -shr $32,%rcx -movl %edx,8+352(%rsp) -movl %ecx,8+368(%rsp) -add $1,%rdx -mov %rdx,%rcx -shr $32,%rcx -movl %edx,12+352(%rsp) -movl %ecx,12+368(%rsp) -add $1,%rdx -mov %rdx,%rcx -shr $32,%rcx -movl %edx,80(%rsp) -movl %ecx,4+96(%rsp) -movq %rdx,472(%rsp) -movq %r9,480(%rsp) -mov $20,%rdx -movdqa 128(%rsp),%xmm0 -movdqa 144(%rsp),%xmm1 -movdqa 160(%rsp),%xmm2 -movdqa 320(%rsp),%xmm3 -movdqa 336(%rsp),%xmm4 -movdqa 192(%rsp),%xmm5 -movdqa 208(%rsp),%xmm6 -movdqa 240(%rsp),%xmm7 -movdqa 256(%rsp),%xmm8 -movdqa 272(%rsp),%xmm9 -movdqa 288(%rsp),%xmm10 -movdqa 368(%rsp),%xmm11 -movdqa 176(%rsp),%xmm12 -movdqa 224(%rsp),%xmm13 -movdqa 304(%rsp),%xmm14 -movdqa 352(%rsp),%xmm15 - -.p2align 4 -._mainloop1: -movdqa %xmm1,384(%rsp) -movdqa %xmm2,400(%rsp) -movdqa %xmm13,%xmm1 -paddd %xmm12,%xmm1 -movdqa %xmm1,%xmm2 -pslld $7,%xmm1 -pxor %xmm1,%xmm14 -psrld $25,%xmm2 -pxor %xmm2,%xmm14 -movdqa %xmm7,%xmm1 -paddd %xmm0,%xmm1 -movdqa %xmm1,%xmm2 -pslld $7,%xmm1 -pxor %xmm1,%xmm11 -psrld $25,%xmm2 -pxor %xmm2,%xmm11 -movdqa %xmm12,%xmm1 -paddd %xmm14,%xmm1 -movdqa %xmm1,%xmm2 -pslld $9,%xmm1 -pxor %xmm1,%xmm15 -psrld $23,%xmm2 -pxor %xmm2,%xmm15 -movdqa %xmm0,%xmm1 -paddd %xmm11,%xmm1 -movdqa %xmm1,%xmm2 -pslld $9,%xmm1 -pxor %xmm1,%xmm9 -psrld $23,%xmm2 -pxor %xmm2,%xmm9 -movdqa %xmm14,%xmm1 -paddd %xmm15,%xmm1 -movdqa %xmm1,%xmm2 -pslld $13,%xmm1 -pxor %xmm1,%xmm13 -psrld $19,%xmm2 -pxor %xmm2,%xmm13 -movdqa %xmm11,%xmm1 -paddd %xmm9,%xmm1 -movdqa %xmm1,%xmm2 -pslld $13,%xmm1 -pxor %xmm1,%xmm7 -psrld $19,%xmm2 -pxor %xmm2,%xmm7 -movdqa %xmm15,%xmm1 -paddd %xmm13,%xmm1 -movdqa %xmm1,%xmm2 -pslld $18,%xmm1 -pxor 
%xmm1,%xmm12 -psrld $14,%xmm2 -pxor %xmm2,%xmm12 -movdqa 384(%rsp),%xmm1 -movdqa %xmm12,384(%rsp) -movdqa %xmm9,%xmm2 -paddd %xmm7,%xmm2 -movdqa %xmm2,%xmm12 -pslld $18,%xmm2 -pxor %xmm2,%xmm0 -psrld $14,%xmm12 -pxor %xmm12,%xmm0 -movdqa %xmm5,%xmm2 -paddd %xmm1,%xmm2 -movdqa %xmm2,%xmm12 -pslld $7,%xmm2 -pxor %xmm2,%xmm3 -psrld $25,%xmm12 -pxor %xmm12,%xmm3 -movdqa 400(%rsp),%xmm2 -movdqa %xmm0,400(%rsp) -movdqa %xmm6,%xmm0 -paddd %xmm2,%xmm0 -movdqa %xmm0,%xmm12 -pslld $7,%xmm0 -pxor %xmm0,%xmm4 -psrld $25,%xmm12 -pxor %xmm12,%xmm4 -movdqa %xmm1,%xmm0 -paddd %xmm3,%xmm0 -movdqa %xmm0,%xmm12 -pslld $9,%xmm0 -pxor %xmm0,%xmm10 -psrld $23,%xmm12 -pxor %xmm12,%xmm10 -movdqa %xmm2,%xmm0 -paddd %xmm4,%xmm0 -movdqa %xmm0,%xmm12 -pslld $9,%xmm0 -pxor %xmm0,%xmm8 -psrld $23,%xmm12 -pxor %xmm12,%xmm8 -movdqa %xmm3,%xmm0 -paddd %xmm10,%xmm0 -movdqa %xmm0,%xmm12 -pslld $13,%xmm0 -pxor %xmm0,%xmm5 -psrld $19,%xmm12 -pxor %xmm12,%xmm5 -movdqa %xmm4,%xmm0 -paddd %xmm8,%xmm0 -movdqa %xmm0,%xmm12 -pslld $13,%xmm0 -pxor %xmm0,%xmm6 -psrld $19,%xmm12 -pxor %xmm12,%xmm6 -movdqa %xmm10,%xmm0 -paddd %xmm5,%xmm0 -movdqa %xmm0,%xmm12 -pslld $18,%xmm0 -pxor %xmm0,%xmm1 -psrld $14,%xmm12 -pxor %xmm12,%xmm1 -movdqa 384(%rsp),%xmm0 -movdqa %xmm1,384(%rsp) -movdqa %xmm4,%xmm1 -paddd %xmm0,%xmm1 -movdqa %xmm1,%xmm12 -pslld $7,%xmm1 -pxor %xmm1,%xmm7 -psrld $25,%xmm12 -pxor %xmm12,%xmm7 -movdqa %xmm8,%xmm1 -paddd %xmm6,%xmm1 -movdqa %xmm1,%xmm12 -pslld $18,%xmm1 -pxor %xmm1,%xmm2 -psrld $14,%xmm12 -pxor %xmm12,%xmm2 -movdqa 400(%rsp),%xmm12 -movdqa %xmm2,400(%rsp) -movdqa %xmm14,%xmm1 -paddd %xmm12,%xmm1 -movdqa %xmm1,%xmm2 -pslld $7,%xmm1 -pxor %xmm1,%xmm5 -psrld $25,%xmm2 -pxor %xmm2,%xmm5 -movdqa %xmm0,%xmm1 -paddd %xmm7,%xmm1 -movdqa %xmm1,%xmm2 -pslld $9,%xmm1 -pxor %xmm1,%xmm10 -psrld $23,%xmm2 -pxor %xmm2,%xmm10 -movdqa %xmm12,%xmm1 -paddd %xmm5,%xmm1 -movdqa %xmm1,%xmm2 -pslld $9,%xmm1 -pxor %xmm1,%xmm8 -psrld $23,%xmm2 -pxor %xmm2,%xmm8 -movdqa %xmm7,%xmm1 -paddd %xmm10,%xmm1 -movdqa 
%xmm1,%xmm2 -pslld $13,%xmm1 -pxor %xmm1,%xmm4 -psrld $19,%xmm2 -pxor %xmm2,%xmm4 -movdqa %xmm5,%xmm1 -paddd %xmm8,%xmm1 -movdqa %xmm1,%xmm2 -pslld $13,%xmm1 -pxor %xmm1,%xmm14 -psrld $19,%xmm2 -pxor %xmm2,%xmm14 -movdqa %xmm10,%xmm1 -paddd %xmm4,%xmm1 -movdqa %xmm1,%xmm2 -pslld $18,%xmm1 -pxor %xmm1,%xmm0 -psrld $14,%xmm2 -pxor %xmm2,%xmm0 -movdqa 384(%rsp),%xmm1 -movdqa %xmm0,384(%rsp) -movdqa %xmm8,%xmm0 -paddd %xmm14,%xmm0 -movdqa %xmm0,%xmm2 -pslld $18,%xmm0 -pxor %xmm0,%xmm12 -psrld $14,%xmm2 -pxor %xmm2,%xmm12 -movdqa %xmm11,%xmm0 -paddd %xmm1,%xmm0 -movdqa %xmm0,%xmm2 -pslld $7,%xmm0 -pxor %xmm0,%xmm6 -psrld $25,%xmm2 -pxor %xmm2,%xmm6 -movdqa 400(%rsp),%xmm2 -movdqa %xmm12,400(%rsp) -movdqa %xmm3,%xmm0 -paddd %xmm2,%xmm0 -movdqa %xmm0,%xmm12 -pslld $7,%xmm0 -pxor %xmm0,%xmm13 -psrld $25,%xmm12 -pxor %xmm12,%xmm13 -movdqa %xmm1,%xmm0 -paddd %xmm6,%xmm0 -movdqa %xmm0,%xmm12 -pslld $9,%xmm0 -pxor %xmm0,%xmm15 -psrld $23,%xmm12 -pxor %xmm12,%xmm15 -movdqa %xmm2,%xmm0 -paddd %xmm13,%xmm0 -movdqa %xmm0,%xmm12 -pslld $9,%xmm0 -pxor %xmm0,%xmm9 -psrld $23,%xmm12 -pxor %xmm12,%xmm9 -movdqa %xmm6,%xmm0 -paddd %xmm15,%xmm0 -movdqa %xmm0,%xmm12 -pslld $13,%xmm0 -pxor %xmm0,%xmm11 -psrld $19,%xmm12 -pxor %xmm12,%xmm11 -movdqa %xmm13,%xmm0 -paddd %xmm9,%xmm0 -movdqa %xmm0,%xmm12 -pslld $13,%xmm0 -pxor %xmm0,%xmm3 -psrld $19,%xmm12 -pxor %xmm12,%xmm3 -movdqa %xmm15,%xmm0 -paddd %xmm11,%xmm0 -movdqa %xmm0,%xmm12 -pslld $18,%xmm0 -pxor %xmm0,%xmm1 -psrld $14,%xmm12 -pxor %xmm12,%xmm1 -movdqa %xmm9,%xmm0 -paddd %xmm3,%xmm0 -movdqa %xmm0,%xmm12 -pslld $18,%xmm0 -pxor %xmm0,%xmm2 -psrld $14,%xmm12 -pxor %xmm12,%xmm2 -movdqa 384(%rsp),%xmm12 -movdqa 400(%rsp),%xmm0 -sub $2,%rdx -ja ._mainloop1 - -paddd 176(%rsp),%xmm12 -paddd 240(%rsp),%xmm7 -paddd 288(%rsp),%xmm10 -paddd 336(%rsp),%xmm4 -movd %xmm12,%rdx -movd %xmm7,%rcx -movd %xmm10,%r8 -movd %xmm4,%r9 -pshufd $0x39,%xmm12,%xmm12 -pshufd $0x39,%xmm7,%xmm7 -pshufd $0x39,%xmm10,%xmm10 -pshufd $0x39,%xmm4,%xmm4 -xorl 
0(%rsi),%edx -xorl 4(%rsi),%ecx -xorl 8(%rsi),%r8d -xorl 12(%rsi),%r9d -movl %edx,0(%rdi) -movl %ecx,4(%rdi) -movl %r8d,8(%rdi) -movl %r9d,12(%rdi) -movd %xmm12,%rdx -movd %xmm7,%rcx -movd %xmm10,%r8 -movd %xmm4,%r9 -pshufd $0x39,%xmm12,%xmm12 -pshufd $0x39,%xmm7,%xmm7 -pshufd $0x39,%xmm10,%xmm10 -pshufd $0x39,%xmm4,%xmm4 -xorl 64(%rsi),%edx -xorl 68(%rsi),%ecx -xorl 72(%rsi),%r8d -xorl 76(%rsi),%r9d -movl %edx,64(%rdi) -movl %ecx,68(%rdi) -movl %r8d,72(%rdi) -movl %r9d,76(%rdi) -movd %xmm12,%rdx -movd %xmm7,%rcx -movd %xmm10,%r8 -movd %xmm4,%r9 -pshufd $0x39,%xmm12,%xmm12 -pshufd $0x39,%xmm7,%xmm7 -pshufd $0x39,%xmm10,%xmm10 -pshufd $0x39,%xmm4,%xmm4 -xorl 128(%rsi),%edx -xorl 132(%rsi),%ecx -xorl 136(%rsi),%r8d -xorl 140(%rsi),%r9d -movl %edx,128(%rdi) -movl %ecx,132(%rdi) -movl %r8d,136(%rdi) -movl %r9d,140(%rdi) -movd %xmm12,%rdx -movd %xmm7,%rcx -movd %xmm10,%r8 -movd %xmm4,%r9 -xorl 192(%rsi),%edx -xorl 196(%rsi),%ecx -xorl 200(%rsi),%r8d -xorl 204(%rsi),%r9d -movl %edx,192(%rdi) -movl %ecx,196(%rdi) -movl %r8d,200(%rdi) -movl %r9d,204(%rdi) -paddd 304(%rsp),%xmm14 -paddd 128(%rsp),%xmm0 -paddd 192(%rsp),%xmm5 -paddd 256(%rsp),%xmm8 -movd %xmm14,%rdx -movd %xmm0,%rcx -movd %xmm5,%r8 -movd %xmm8,%r9 -pshufd $0x39,%xmm14,%xmm14 -pshufd $0x39,%xmm0,%xmm0 -pshufd $0x39,%xmm5,%xmm5 -pshufd $0x39,%xmm8,%xmm8 -xorl 16(%rsi),%edx -xorl 20(%rsi),%ecx -xorl 24(%rsi),%r8d -xorl 28(%rsi),%r9d -movl %edx,16(%rdi) -movl %ecx,20(%rdi) -movl %r8d,24(%rdi) -movl %r9d,28(%rdi) -movd %xmm14,%rdx -movd %xmm0,%rcx -movd %xmm5,%r8 -movd %xmm8,%r9 -pshufd $0x39,%xmm14,%xmm14 -pshufd $0x39,%xmm0,%xmm0 -pshufd $0x39,%xmm5,%xmm5 -pshufd $0x39,%xmm8,%xmm8 -xorl 80(%rsi),%edx -xorl 84(%rsi),%ecx -xorl 88(%rsi),%r8d -xorl 92(%rsi),%r9d -movl %edx,80(%rdi) -movl %ecx,84(%rdi) -movl %r8d,88(%rdi) -movl %r9d,92(%rdi) -movd %xmm14,%rdx -movd %xmm0,%rcx -movd %xmm5,%r8 -movd %xmm8,%r9 -pshufd $0x39,%xmm14,%xmm14 -pshufd $0x39,%xmm0,%xmm0 -pshufd $0x39,%xmm5,%xmm5 -pshufd $0x39,%xmm8,%xmm8 
-xorl 144(%rsi),%edx -xorl 148(%rsi),%ecx -xorl 152(%rsi),%r8d -xorl 156(%rsi),%r9d -movl %edx,144(%rdi) -movl %ecx,148(%rdi) -movl %r8d,152(%rdi) -movl %r9d,156(%rdi) -movd %xmm14,%rdx -movd %xmm0,%rcx -movd %xmm5,%r8 -movd %xmm8,%r9 -xorl 208(%rsi),%edx -xorl 212(%rsi),%ecx -xorl 216(%rsi),%r8d -xorl 220(%rsi),%r9d -movl %edx,208(%rdi) -movl %ecx,212(%rdi) -movl %r8d,216(%rdi) -movl %r9d,220(%rdi) -paddd 352(%rsp),%xmm15 -paddd 368(%rsp),%xmm11 -paddd 144(%rsp),%xmm1 -paddd 208(%rsp),%xmm6 -movd %xmm15,%rdx -movd %xmm11,%rcx -movd %xmm1,%r8 -movd %xmm6,%r9 -pshufd $0x39,%xmm15,%xmm15 -pshufd $0x39,%xmm11,%xmm11 -pshufd $0x39,%xmm1,%xmm1 -pshufd $0x39,%xmm6,%xmm6 -xorl 32(%rsi),%edx -xorl 36(%rsi),%ecx -xorl 40(%rsi),%r8d -xorl 44(%rsi),%r9d -movl %edx,32(%rdi) -movl %ecx,36(%rdi) -movl %r8d,40(%rdi) -movl %r9d,44(%rdi) -movd %xmm15,%rdx -movd %xmm11,%rcx -movd %xmm1,%r8 -movd %xmm6,%r9 -pshufd $0x39,%xmm15,%xmm15 -pshufd $0x39,%xmm11,%xmm11 -pshufd $0x39,%xmm1,%xmm1 -pshufd $0x39,%xmm6,%xmm6 -xorl 96(%rsi),%edx -xorl 100(%rsi),%ecx -xorl 104(%rsi),%r8d -xorl 108(%rsi),%r9d -movl %edx,96(%rdi) -movl %ecx,100(%rdi) -movl %r8d,104(%rdi) -movl %r9d,108(%rdi) -movd %xmm15,%rdx -movd %xmm11,%rcx -movd %xmm1,%r8 -movd %xmm6,%r9 -pshufd $0x39,%xmm15,%xmm15 -pshufd $0x39,%xmm11,%xmm11 -pshufd $0x39,%xmm1,%xmm1 -pshufd $0x39,%xmm6,%xmm6 -xorl 160(%rsi),%edx -xorl 164(%rsi),%ecx -xorl 168(%rsi),%r8d -xorl 172(%rsi),%r9d -movl %edx,160(%rdi) -movl %ecx,164(%rdi) -movl %r8d,168(%rdi) -movl %r9d,172(%rdi) -movd %xmm15,%rdx -movd %xmm11,%rcx -movd %xmm1,%r8 -movd %xmm6,%r9 -xorl 224(%rsi),%edx -xorl 228(%rsi),%ecx -xorl 232(%rsi),%r8d -xorl 236(%rsi),%r9d -movl %edx,224(%rdi) -movl %ecx,228(%rdi) -movl %r8d,232(%rdi) -movl %r9d,236(%rdi) -paddd 224(%rsp),%xmm13 -paddd 272(%rsp),%xmm9 -paddd 320(%rsp),%xmm3 -paddd 160(%rsp),%xmm2 -movd %xmm13,%rdx -movd %xmm9,%rcx -movd %xmm3,%r8 -movd %xmm2,%r9 -pshufd $0x39,%xmm13,%xmm13 -pshufd $0x39,%xmm9,%xmm9 -pshufd $0x39,%xmm3,%xmm3 
-pshufd $0x39,%xmm2,%xmm2 -xorl 48(%rsi),%edx -xorl 52(%rsi),%ecx -xorl 56(%rsi),%r8d -xorl 60(%rsi),%r9d -movl %edx,48(%rdi) -movl %ecx,52(%rdi) -movl %r8d,56(%rdi) -movl %r9d,60(%rdi) -movd %xmm13,%rdx -movd %xmm9,%rcx -movd %xmm3,%r8 -movd %xmm2,%r9 -pshufd $0x39,%xmm13,%xmm13 -pshufd $0x39,%xmm9,%xmm9 -pshufd $0x39,%xmm3,%xmm3 -pshufd $0x39,%xmm2,%xmm2 -xorl 112(%rsi),%edx -xorl 116(%rsi),%ecx -xorl 120(%rsi),%r8d -xorl 124(%rsi),%r9d -movl %edx,112(%rdi) -movl %ecx,116(%rdi) -movl %r8d,120(%rdi) -movl %r9d,124(%rdi) -movd %xmm13,%rdx -movd %xmm9,%rcx -movd %xmm3,%r8 -movd %xmm2,%r9 -pshufd $0x39,%xmm13,%xmm13 -pshufd $0x39,%xmm9,%xmm9 -pshufd $0x39,%xmm3,%xmm3 -pshufd $0x39,%xmm2,%xmm2 -xorl 176(%rsi),%edx -xorl 180(%rsi),%ecx -xorl 184(%rsi),%r8d -xorl 188(%rsi),%r9d -movl %edx,176(%rdi) -movl %ecx,180(%rdi) -movl %r8d,184(%rdi) -movl %r9d,188(%rdi) -movd %xmm13,%rdx -movd %xmm9,%rcx -movd %xmm3,%r8 -movd %xmm2,%r9 -xorl 240(%rsi),%edx -xorl 244(%rsi),%ecx -xorl 248(%rsi),%r8d -xorl 252(%rsi),%r9d -movl %edx,240(%rdi) -movl %ecx,244(%rdi) -movl %r8d,248(%rdi) -movl %r9d,252(%rdi) -movq 480(%rsp),%r9 -sub $256,%r9 -add $256,%rsi -add $256,%rdi -cmp $256,%r9 -jae ._bytesatleast256 - -cmp $0,%r9 -jbe ._done - -._bytesbetween1and255: -cmp $64,%r9 -jae ._nocopy - -mov %rdi,%rdx -leaq 0(%rsp),%rdi -mov %r9,%rcx -rep movsb -leaq 0(%rsp),%rdi -leaq 0(%rsp),%rsi - -._nocopy: -movq %r9,480(%rsp) -movdqa 112(%rsp),%xmm0 -movdqa 64(%rsp),%xmm1 -movdqa 80(%rsp),%xmm2 -movdqa 96(%rsp),%xmm3 -movdqa %xmm1,%xmm4 -mov $20,%rcx - -.p2align 4 -._mainloop2: -paddd %xmm0,%xmm4 -movdqa %xmm0,%xmm5 -movdqa %xmm4,%xmm6 -pslld $7,%xmm4 -psrld $25,%xmm6 -pxor %xmm4,%xmm3 -pxor %xmm6,%xmm3 -paddd %xmm3,%xmm5 -movdqa %xmm3,%xmm4 -movdqa %xmm5,%xmm6 -pslld $9,%xmm5 -psrld $23,%xmm6 -pxor %xmm5,%xmm2 -pshufd $0x93,%xmm3,%xmm3 -pxor %xmm6,%xmm2 -paddd %xmm2,%xmm4 -movdqa %xmm2,%xmm5 -movdqa %xmm4,%xmm6 -pslld $13,%xmm4 -psrld $19,%xmm6 -pxor %xmm4,%xmm1 -pshufd $0x4e,%xmm2,%xmm2 -pxor 
%xmm6,%xmm1 -paddd %xmm1,%xmm5 -movdqa %xmm3,%xmm4 -movdqa %xmm5,%xmm6 -pslld $18,%xmm5 -psrld $14,%xmm6 -pxor %xmm5,%xmm0 -pshufd $0x39,%xmm1,%xmm1 -pxor %xmm6,%xmm0 -paddd %xmm0,%xmm4 -movdqa %xmm0,%xmm5 -movdqa %xmm4,%xmm6 -pslld $7,%xmm4 -psrld $25,%xmm6 -pxor %xmm4,%xmm1 -pxor %xmm6,%xmm1 -paddd %xmm1,%xmm5 -movdqa %xmm1,%xmm4 -movdqa %xmm5,%xmm6 -pslld $9,%xmm5 -psrld $23,%xmm6 -pxor %xmm5,%xmm2 -pshufd $0x93,%xmm1,%xmm1 -pxor %xmm6,%xmm2 -paddd %xmm2,%xmm4 -movdqa %xmm2,%xmm5 -movdqa %xmm4,%xmm6 -pslld $13,%xmm4 -psrld $19,%xmm6 -pxor %xmm4,%xmm3 -pshufd $0x4e,%xmm2,%xmm2 -pxor %xmm6,%xmm3 -paddd %xmm3,%xmm5 -movdqa %xmm1,%xmm4 -movdqa %xmm5,%xmm6 -pslld $18,%xmm5 -psrld $14,%xmm6 -pxor %xmm5,%xmm0 -pshufd $0x39,%xmm3,%xmm3 -pxor %xmm6,%xmm0 -paddd %xmm0,%xmm4 -movdqa %xmm0,%xmm5 -movdqa %xmm4,%xmm6 -pslld $7,%xmm4 -psrld $25,%xmm6 -pxor %xmm4,%xmm3 -pxor %xmm6,%xmm3 -paddd %xmm3,%xmm5 -movdqa %xmm3,%xmm4 -movdqa %xmm5,%xmm6 -pslld $9,%xmm5 -psrld $23,%xmm6 -pxor %xmm5,%xmm2 -pshufd $0x93,%xmm3,%xmm3 -pxor %xmm6,%xmm2 -paddd %xmm2,%xmm4 -movdqa %xmm2,%xmm5 -movdqa %xmm4,%xmm6 -pslld $13,%xmm4 -psrld $19,%xmm6 -pxor %xmm4,%xmm1 -pshufd $0x4e,%xmm2,%xmm2 -pxor %xmm6,%xmm1 -paddd %xmm1,%xmm5 -movdqa %xmm3,%xmm4 -movdqa %xmm5,%xmm6 -pslld $18,%xmm5 -psrld $14,%xmm6 -pxor %xmm5,%xmm0 -pshufd $0x39,%xmm1,%xmm1 -pxor %xmm6,%xmm0 -paddd %xmm0,%xmm4 -movdqa %xmm0,%xmm5 -movdqa %xmm4,%xmm6 -pslld $7,%xmm4 -psrld $25,%xmm6 -pxor %xmm4,%xmm1 -pxor %xmm6,%xmm1 -paddd %xmm1,%xmm5 -movdqa %xmm1,%xmm4 -movdqa %xmm5,%xmm6 -pslld $9,%xmm5 -psrld $23,%xmm6 -pxor %xmm5,%xmm2 -pshufd $0x93,%xmm1,%xmm1 -pxor %xmm6,%xmm2 -paddd %xmm2,%xmm4 -movdqa %xmm2,%xmm5 -movdqa %xmm4,%xmm6 -pslld $13,%xmm4 -psrld $19,%xmm6 -pxor %xmm4,%xmm3 -pshufd $0x4e,%xmm2,%xmm2 -pxor %xmm6,%xmm3 -sub $4,%rcx -paddd %xmm3,%xmm5 -movdqa %xmm1,%xmm4 -movdqa %xmm5,%xmm6 -pslld $18,%xmm5 -pxor %xmm7,%xmm7 -psrld $14,%xmm6 -pxor %xmm5,%xmm0 -pshufd $0x39,%xmm3,%xmm3 -pxor %xmm6,%xmm0 -ja ._mainloop2 - -paddd 
112(%rsp),%xmm0 -paddd 64(%rsp),%xmm1 -paddd 80(%rsp),%xmm2 -paddd 96(%rsp),%xmm3 -movd %xmm0,%rcx -movd %xmm1,%r8 -movd %xmm2,%r9 -movd %xmm3,%rax -pshufd $0x39,%xmm0,%xmm0 -pshufd $0x39,%xmm1,%xmm1 -pshufd $0x39,%xmm2,%xmm2 -pshufd $0x39,%xmm3,%xmm3 -xorl 0(%rsi),%ecx -xorl 48(%rsi),%r8d -xorl 32(%rsi),%r9d -xorl 16(%rsi),%eax -movl %ecx,0(%rdi) -movl %r8d,48(%rdi) -movl %r9d,32(%rdi) -movl %eax,16(%rdi) -movd %xmm0,%rcx -movd %xmm1,%r8 -movd %xmm2,%r9 -movd %xmm3,%rax -pshufd $0x39,%xmm0,%xmm0 -pshufd $0x39,%xmm1,%xmm1 -pshufd $0x39,%xmm2,%xmm2 -pshufd $0x39,%xmm3,%xmm3 -xorl 20(%rsi),%ecx -xorl 4(%rsi),%r8d -xorl 52(%rsi),%r9d -xorl 36(%rsi),%eax -movl %ecx,20(%rdi) -movl %r8d,4(%rdi) -movl %r9d,52(%rdi) -movl %eax,36(%rdi) -movd %xmm0,%rcx -movd %xmm1,%r8 -movd %xmm2,%r9 -movd %xmm3,%rax -pshufd $0x39,%xmm0,%xmm0 -pshufd $0x39,%xmm1,%xmm1 -pshufd $0x39,%xmm2,%xmm2 -pshufd $0x39,%xmm3,%xmm3 -xorl 40(%rsi),%ecx -xorl 24(%rsi),%r8d -xorl 8(%rsi),%r9d -xorl 56(%rsi),%eax -movl %ecx,40(%rdi) -movl %r8d,24(%rdi) -movl %r9d,8(%rdi) -movl %eax,56(%rdi) -movd %xmm0,%rcx -movd %xmm1,%r8 -movd %xmm2,%r9 -movd %xmm3,%rax -xorl 60(%rsi),%ecx -xorl 44(%rsi),%r8d -xorl 28(%rsi),%r9d -xorl 12(%rsi),%eax -movl %ecx,60(%rdi) -movl %r8d,44(%rdi) -movl %r9d,28(%rdi) -movl %eax,12(%rdi) -movq 480(%rsp),%r9 -movq 472(%rsp),%rcx -add $1,%rcx -mov %rcx,%r8 -shr $32,%r8 -movl %ecx,80(%rsp) -movl %r8d,4+96(%rsp) -movq %rcx,472(%rsp) -cmp $64,%r9 -ja ._bytesatleast65 -jae ._bytesatleast64 - -mov %rdi,%rsi -mov %rdx,%rdi -mov %r9,%rcx -rep movsb - -._bytesatleast64: -._done: -movq 416(%rsp),%r11 -movq 424(%rsp),%r12 -movq 432(%rsp),%r13 -movq 440(%rsp),%r14 -movq 448(%rsp),%r15 -movq 456(%rsp),%rbx -movq 464(%rsp),%rbp -add %r11,%rsp -xor %rax,%rax -mov %rsi,%rdx -ret - -._bytesatleast65: -sub $64,%r9 -add $64,%rdi -add $64,%rsi -jmp ._bytesbetween1and255 - -#endif - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif diff --git 
a/src/libsodium/crypto_stream/salsa20/ref/stream_salsa20_ref.c b/src/libsodium/crypto_stream/salsa20/ref/stream_salsa20_ref.c index 8d1000c1..008d8daf 100644 --- a/src/libsodium/crypto_stream/salsa20/ref/stream_salsa20_ref.c +++ b/src/libsodium/crypto_stream/salsa20/ref/stream_salsa20_ref.c @@ -4,15 +4,17 @@ D. J. Bernstein Public domain. */ +#include + #include "crypto_core_salsa20.h" #include "crypto_stream_salsa20.h" #include "utils.h" #ifndef HAVE_AMD64_ASM -int -crypto_stream_salsa20(unsigned char *c, unsigned long long clen, - const unsigned char *n, const unsigned char *k) +static int +stream_ref(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k) { unsigned char in[16]; unsigned char block[64]; @@ -34,18 +36,15 @@ crypto_stream_salsa20(unsigned char *c, unsigned long long clen, } while (clen >= 64) { crypto_core_salsa20(c, in, kcopy, NULL); - u = 1; for (i = 8; i < 16; ++i) { u += (unsigned int)in[i]; in[i] = u; u >>= 8; } - clen -= 64; c += 64; } - if (clen) { crypto_core_salsa20(block, in, kcopy, NULL); for (i = 0; i < (unsigned int)clen; ++i) { @@ -58,4 +57,61 @@ crypto_stream_salsa20(unsigned char *c, unsigned long long clen, return 0; } +static int +stream_ref_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, + uint64_t ic, const unsigned char *k) +{ + unsigned char in[16]; + unsigned char block[64]; + unsigned char kcopy[32]; + unsigned int i; + unsigned int u; + + if (!mlen) { + return 0; + } + for (i = 0; i < 32; ++i) { + kcopy[i] = k[i]; + } + for (i = 0; i < 8; ++i) { + in[i] = n[i]; + } + for (i = 8; i < 16; ++i) { + in[i] = (unsigned char)(ic & 0xff); + ic >>= 8; + } + while (mlen >= 64) { + crypto_core_salsa20(block, in, kcopy, NULL); + for (i = 0; i < 64; ++i) { + c[i] = m[i] ^ block[i]; + } + u = 1; + for (i = 8; i < 16; ++i) { + u += (unsigned int)in[i]; + in[i] = u; + u >>= 8; + } + mlen -= 64; + c += 64; + m += 64; + } + if (mlen) { + 
crypto_core_salsa20(block, in, kcopy, NULL); + for (i = 0; i < (unsigned int)mlen; ++i) { + c[i] = m[i] ^ block[i]; + } + } + sodium_memzero(block, sizeof block); + sodium_memzero(kcopy, sizeof kcopy); + + return 0; +} + +struct crypto_stream_salsa20_implementation + crypto_stream_salsa20_ref_implementation = { + SODIUM_C99(.stream =) stream_ref, + SODIUM_C99(.stream_xor_ic =) stream_ref_xor_ic, + }; + #endif diff --git a/src/libsodium/crypto_stream/salsa20/ref/stream_salsa20_ref.h b/src/libsodium/crypto_stream/salsa20/ref/stream_salsa20_ref.h new file mode 100644 index 00000000..180f0b66 --- /dev/null +++ b/src/libsodium/crypto_stream/salsa20/ref/stream_salsa20_ref.h @@ -0,0 +1,16 @@ + +#include + +#include "../stream_salsa20.h" +#include "crypto_stream_salsa20.h" + +extern struct crypto_stream_salsa20_implementation + crypto_stream_salsa20_ref_implementation; + +int crypto_stream_salsa20_ref(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k); + +int crypto_stream_salsa20_ref_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, + const unsigned char *n, uint64_t ic, + const unsigned char *k); diff --git a/src/libsodium/crypto_stream/salsa20/ref/xor_salsa20_ref.c b/src/libsodium/crypto_stream/salsa20/ref/xor_salsa20_ref.c deleted file mode 100644 index 03e08d62..00000000 --- a/src/libsodium/crypto_stream/salsa20/ref/xor_salsa20_ref.c +++ /dev/null @@ -1,69 +0,0 @@ -/* -version 20140420 -D. J. Bernstein -Public domain. 
-*/ - -#include - -#include "crypto_core_salsa20.h" -#include "crypto_stream_salsa20.h" -#include "utils.h" - -#ifndef HAVE_AMD64_ASM - -int -crypto_stream_salsa20_xor_ic(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, - uint64_t ic, const unsigned char *k) -{ - unsigned char in[16]; - unsigned char block[64]; - unsigned char kcopy[32]; - unsigned int i; - unsigned int u; - - if (!mlen) { - return 0; - } - for (i = 0; i < 32; ++i) { - kcopy[i] = k[i]; - } - for (i = 0; i < 8; ++i) { - in[i] = n[i]; - } - for (i = 8; i < 16; ++i) { - in[i] = (unsigned char)(ic & 0xff); - ic >>= 8; - } - - while (mlen >= 64) { - crypto_core_salsa20(block, in, kcopy, NULL); - for (i = 0; i < 64; ++i) { - c[i] = m[i] ^ block[i]; - } - u = 1; - for (i = 8; i < 16; ++i) { - u += (unsigned int)in[i]; - in[i] = u; - u >>= 8; - } - - mlen -= 64; - c += 64; - m += 64; - } - - if (mlen) { - crypto_core_salsa20(block, in, kcopy, NULL); - for (i = 0; i < (unsigned int)mlen; ++i) { - c[i] = m[i] ^ block[i]; - } - } - sodium_memzero(block, sizeof block); - sodium_memzero(kcopy, sizeof kcopy); - - return 0; -} - -#endif diff --git a/src/libsodium/crypto_stream/salsa20/stream_salsa20.c b/src/libsodium/crypto_stream/salsa20/stream_salsa20.c index ae951a0a..97b3a3c2 100644 --- a/src/libsodium/crypto_stream/salsa20/stream_salsa20.c +++ b/src/libsodium/crypto_stream/salsa20/stream_salsa20.c @@ -1,5 +1,19 @@ #include "crypto_stream_salsa20.h" +#include "stream_salsa20.h" #include "randombytes.h" +#include "runtime.h" +#include "ref/stream_salsa20_ref.h" +#ifdef HAVE_EMMINTRIN_H +# include "xmm6int/stream_salsa20_xmm6int.h" +#endif + +#if defined(HAVE_EMMINTRIN_H) && defined(__x86_64__) +static const crypto_stream_salsa20_implementation *implementation = + &crypto_stream_salsa20_xmm6int_implementation; +#else +static const crypto_stream_salsa20_implementation *implementation = + &crypto_stream_salsa20_ref_implementation; +#endif size_t 
crypto_stream_salsa20_keybytes(void) @@ -13,12 +27,28 @@ crypto_stream_salsa20_noncebytes(void) return crypto_stream_salsa20_NONCEBYTES; } +int +crypto_stream_salsa20(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k) +{ + return implementation->stream(c, clen, n, k); +} + +int +crypto_stream_salsa20_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, + const unsigned char *n, uint64_t ic, + const unsigned char *k) +{ + return implementation->stream_xor_ic(c, m, mlen, n, ic, k); +} + int crypto_stream_salsa20_xor(unsigned char *c, const unsigned char *m, unsigned long long mlen, const unsigned char *n, const unsigned char *k) { - return crypto_stream_salsa20_xor_ic(c, m, mlen, n, 0U, k); + return implementation->stream_xor_ic(c, m, mlen, n, 0U, k); } void @@ -26,3 +56,20 @@ crypto_stream_salsa20_keygen(unsigned char k[crypto_stream_salsa20_KEYBYTES]) { randombytes_buf(k, crypto_stream_salsa20_KEYBYTES); } + +int +_crypto_stream_salsa20_pick_best_implementation(void) +{ +#if defined(HAVE_EMMINTRIN_H) && defined(__x86_64__) + implementation = &crypto_stream_salsa20_xmm6int_implementation; +#else + implementation = &crypto_stream_salsa20_ref_implementation; +#endif + +#ifdef HAVE_EMMINTRIN_H + if (sodium_runtime_has_sse2()) { + implementation = &crypto_stream_salsa20_xmm6int_implementation; + } +#endif + return 0; +} diff --git a/src/libsodium/crypto_stream/salsa20/stream_salsa20.h b/src/libsodium/crypto_stream/salsa20/stream_salsa20.h new file mode 100644 index 00000000..1949d381 --- /dev/null +++ b/src/libsodium/crypto_stream/salsa20/stream_salsa20.h @@ -0,0 +1,16 @@ + +#ifndef stream_salsa20_H +#define stream_salsa20_H + +#include + +typedef struct crypto_stream_salsa20_implementation { + int (*stream)(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k); + int (*stream_xor_ic)(unsigned char *c, const unsigned char *m, + unsigned long long mlen, + const 
unsigned char *n, uint64_t ic, + const unsigned char *k); +} crypto_stream_salsa20_implementation; + +#endif diff --git a/src/libsodium/crypto_stream/salsa20/xmm6int/stream_salsa20_xmm6int.c b/src/libsodium/crypto_stream/salsa20/xmm6int/stream_salsa20_xmm6int.c new file mode 100644 index 00000000..c3ae9d35 --- /dev/null +++ b/src/libsodium/crypto_stream/salsa20/xmm6int/stream_salsa20_xmm6int.c @@ -0,0 +1,185 @@ + +#include +#include +#include + +#if defined(HAVE_EMMINTRIN_H) || \ + (defined(_MSC_VER) && \ + (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86))) +# ifdef __GNUC__ +# pragma GCC target("sse2") +# endif +# include +#endif + +#include "crypto_stream_salsa20.h" +#include "private/common.h" +#include "utils.h" + +#include "../stream_salsa20.h" +#include "stream_salsa20_xmm6int.h" + + +#define ROUNDS 20 + +typedef struct salsa_ctx { + uint32_t input[16]; +} salsa_ctx; + +static const int TR[16] = { + 0, 5, 10, 15, 12, 1, 6, 11, 8, 13, 2, 7, 4, 9, 14, 3 +}; + +static void +salsa20_wordtobyte_tr(uint8_t output[64], const uint32_t input[16]) +{ + uint32_t x[16]; + int i; + + for (i = 0; i < 16; i++) { + x[TR[i]] = input[TR[i]]; + } + for (i = 20; i > 0; i -= 2) { + x[TR[4]] ^= ROTL32(x[TR[0]] + x[TR[12]], 7); + x[TR[8]] ^= ROTL32(x[TR[4]] + x[TR[0]], 9); + x[TR[12]] ^= ROTL32(x[TR[8]] + x[TR[4]], 13); + x[TR[0]] ^= ROTL32(x[TR[12]] + x[TR[8]], 18); + x[TR[9]] ^= ROTL32(x[TR[5]] + x[TR[1]], 7); + x[TR[13]] ^= ROTL32(x[TR[9]] + x[TR[5]], 9); + x[TR[1]] ^= ROTL32(x[TR[13]] + x[TR[9]], 13); + x[TR[5]] ^= ROTL32(x[TR[1]] + x[TR[13]], 18); + x[TR[14]] ^= ROTL32(x[TR[10]] + x[TR[6]], 7); + x[TR[2]] ^= ROTL32(x[TR[14]] + x[TR[10]], 9); + x[TR[6]] ^= ROTL32(x[TR[2]] + x[TR[14]], 13); + x[TR[10]] ^= ROTL32(x[TR[6]] + x[TR[2]], 18); + x[TR[3]] ^= ROTL32(x[TR[15]] + x[TR[11]], 7); + x[TR[7]] ^= ROTL32(x[TR[3]] + x[TR[15]], 9); + x[TR[11]] ^= ROTL32(x[TR[7]] + x[TR[3]], 13); + x[TR[15]] ^= ROTL32(x[TR[11]] + x[TR[7]], 18); + x[TR[1]] ^= ROTL32(x[TR[0]] + x[TR[3]], 
7); + x[TR[2]] ^= ROTL32(x[TR[1]] + x[TR[0]], 9); + x[TR[3]] ^= ROTL32(x[TR[2]] + x[TR[1]], 13); + x[TR[0]] ^= ROTL32(x[TR[3]] + x[TR[2]], 18); + x[TR[6]] ^= ROTL32(x[TR[5]] + x[TR[4]], 7); + x[TR[7]] ^= ROTL32(x[TR[6]] + x[TR[5]], 9); + x[TR[4]] ^= ROTL32(x[TR[7]] + x[TR[6]], 13); + x[TR[5]] ^= ROTL32(x[TR[4]] + x[TR[7]], 18); + x[TR[11]] ^= ROTL32(x[TR[10]] + x[TR[9]], 7); + x[TR[8]] ^= ROTL32(x[TR[11]] + x[TR[10]], 9); + x[TR[9]] ^= ROTL32(x[TR[8]] + x[TR[11]], 13); + x[TR[10]] ^= ROTL32(x[TR[9]] + x[TR[8]], 18); + x[TR[12]] ^= ROTL32(x[TR[15]] + x[TR[14]], 7); + x[TR[13]] ^= ROTL32(x[TR[12]] + x[TR[15]], 9); + x[TR[14]] ^= ROTL32(x[TR[13]] + x[TR[12]], 13); + x[TR[15]] ^= ROTL32(x[TR[14]] + x[TR[13]], 18); + } + for (i = 0; i < 16; i++) { + x[TR[i]] += input[TR[i]]; + } + for (i = 0; i < 16; i++) { + STORE32_LE(output + 4 * i, x[TR[i]]); + } +} + +static void +salsa_keysetup(salsa_ctx *ctx, const uint8_t *k) +{ + ctx->input[TR[1]] = LOAD32_LE(k + 0); + ctx->input[TR[2]] = LOAD32_LE(k + 4); + ctx->input[TR[3]] = LOAD32_LE(k + 8); + ctx->input[TR[4]] = LOAD32_LE(k + 12); + ctx->input[TR[11]] = LOAD32_LE(k + 16); + ctx->input[TR[12]] = LOAD32_LE(k + 20); + ctx->input[TR[13]] = LOAD32_LE(k + 24); + ctx->input[TR[14]] = LOAD32_LE(k + 28); + ctx->input[TR[0]] = 0x61707865; + ctx->input[TR[5]] = 0x3320646e; + ctx->input[TR[10]] = 0x79622d32; + ctx->input[TR[15]] = 0x6b206574; +} + +static void +salsa_ivsetup(salsa_ctx *ctx, const uint8_t *iv, const uint8_t *counter) +{ + ctx->input[TR[6]] = LOAD32_LE(iv + 0); + ctx->input[TR[7]] = LOAD32_LE(iv + 4); + ctx->input[TR[8]] = counter == NULL ? 0 : LOAD32_LE(counter + 0); + ctx->input[TR[9]] = counter == NULL ? 
0 : LOAD32_LE(counter + 4); +} + +static void +salsa20_encrypt_bytes(salsa_ctx *ctx, const uint8_t *m, uint8_t *c, + unsigned long long bytes) +{ + uint8_t partialblock[64]; + uint32_t * const x = &ctx->input[0]; + int i; + + if (!bytes) { + return; /* LCOV_EXCL_LINE */ + } + if (bytes > 64ULL * (1ULL << 32) - 64ULL) { + abort(); + } + +#include "u4.h" +#include "u1.h" + + if (!bytes) { + return; + } + salsa20_wordtobyte_tr(partialblock, x); + for (i = 0; i < bytes; i++) { + c[i] = m[i] ^ partialblock[i]; + } +} + +static int +stream_ref(unsigned char *c, unsigned long long clen, const unsigned char *n, + const unsigned char *k) +{ + struct salsa_ctx ctx; + + if (!clen) { + return 0; + } + COMPILER_ASSERT(crypto_stream_salsa20_KEYBYTES == 256 / 8); + salsa_keysetup(&ctx, k); + salsa_ivsetup(&ctx, n, NULL); + memset(c, 0, clen); + salsa20_encrypt_bytes(&ctx, c, c, clen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +static int +stream_ref_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, uint64_t ic, + const unsigned char *k) +{ + struct salsa_ctx ctx; + uint8_t ic_bytes[8]; + uint32_t ic_high; + uint32_t ic_low; + + if (!mlen) { + return 0; + } + ic_high = (uint32_t) (ic >> 32); + ic_low = (uint32_t) (ic); + STORE32_LE(&ic_bytes[0], ic_low); + STORE32_LE(&ic_bytes[4], ic_high); + salsa_keysetup(&ctx, k); + salsa_ivsetup(&ctx, n, ic_bytes); + salsa20_encrypt_bytes(&ctx, m, c, mlen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +struct crypto_stream_salsa20_implementation + crypto_stream_salsa20_xmm6int_implementation = { + SODIUM_C99(.stream =) stream_ref, + SODIUM_C99(.stream_xor_ic =) stream_ref_xor_ic + }; diff --git a/src/libsodium/crypto_stream/salsa20/xmm6int/stream_salsa20_xmm6int.h b/src/libsodium/crypto_stream/salsa20/xmm6int/stream_salsa20_xmm6int.h new file mode 100644 index 00000000..5beb2db4 --- /dev/null +++ b/src/libsodium/crypto_stream/salsa20/xmm6int/stream_salsa20_xmm6int.h @@ 
-0,0 +1,16 @@ + +#include + +#include "../stream_salsa20.h" +#include "crypto_stream_salsa20.h" + +extern struct crypto_stream_salsa20_implementation + crypto_stream_salsa20_xmm6int_implementation; + +int crypto_stream_salsa20_xmm6int(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k); + +int crypto_stream_salsa20_xmm6int_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, + const unsigned char *n, uint64_t ic, + const unsigned char *k); diff --git a/src/libsodium/crypto_stream/salsa20/xmm6int/u1.h b/src/libsodium/crypto_stream/salsa20/xmm6int/u1.h new file mode 100644 index 00000000..aba7e913 --- /dev/null +++ b/src/libsodium/crypto_stream/salsa20/xmm6int/u1.h @@ -0,0 +1,220 @@ +while (bytes >= 64) { + __m128i diag0 = _mm_loadu_si128((__m128i *) (x + 0)); + __m128i diag1 = _mm_loadu_si128((__m128i *) (x + 4)); + __m128i diag2 = _mm_loadu_si128((__m128i *) (x + 8)); + __m128i diag3 = _mm_loadu_si128((__m128i *) (x + 12)); + __m128i a0; + __m128i a1; + __m128i a2; + __m128i a3; + __m128i a4; + __m128i a5; + __m128i a6; + __m128i a7; + __m128i b0; + __m128i b1; + __m128i b2; + __m128i b3; + __m128i b4; + __m128i b5; + __m128i b6; + __m128i b7; + + uint32_t in8; + uint32_t in9; + + a0 = diag1; + for (i = 0; i < 20; i += 4) { + a0 = _mm_add_epi32(a0, diag0); + a1 = diag0; + b0 = a0; + a0 = _mm_slli_epi32(a0, 7); + b0 = _mm_srli_epi32(b0, 25); + diag3 = _mm_xor_si128(diag3, a0); + + diag3 = _mm_xor_si128(diag3, b0); + + a1 = _mm_add_epi32(a1, diag3); + a2 = diag3; + b1 = a1; + a1 = _mm_slli_epi32(a1, 9); + b1 = _mm_srli_epi32(b1, 23); + diag2 = _mm_xor_si128(diag2, a1); + diag3 = _mm_shuffle_epi32(diag3, 0x93); + diag2 = _mm_xor_si128(diag2, b1); + + a2 = _mm_add_epi32(a2, diag2); + a3 = diag2; + b2 = a2; + a2 = _mm_slli_epi32(a2, 13); + b2 = _mm_srli_epi32(b2, 19); + diag1 = _mm_xor_si128(diag1, a2); + diag2 = _mm_shuffle_epi32(diag2, 0x4e); + diag1 = _mm_xor_si128(diag1, b2); + + a3 = _mm_add_epi32(a3, 
diag1); + a4 = diag3; + b3 = a3; + a3 = _mm_slli_epi32(a3, 18); + b3 = _mm_srli_epi32(b3, 14); + diag0 = _mm_xor_si128(diag0, a3); + diag1 = _mm_shuffle_epi32(diag1, 0x39); + diag0 = _mm_xor_si128(diag0, b3); + + a4 = _mm_add_epi32(a4, diag0); + a5 = diag0; + b4 = a4; + a4 = _mm_slli_epi32(a4, 7); + b4 = _mm_srli_epi32(b4, 25); + diag1 = _mm_xor_si128(diag1, a4); + + diag1 = _mm_xor_si128(diag1, b4); + + a5 = _mm_add_epi32(a5, diag1); + a6 = diag1; + b5 = a5; + a5 = _mm_slli_epi32(a5, 9); + b5 = _mm_srli_epi32(b5, 23); + diag2 = _mm_xor_si128(diag2, a5); + diag1 = _mm_shuffle_epi32(diag1, 0x93); + diag2 = _mm_xor_si128(diag2, b5); + + a6 = _mm_add_epi32(a6, diag2); + a7 = diag2; + b6 = a6; + a6 = _mm_slli_epi32(a6, 13); + b6 = _mm_srli_epi32(b6, 19); + diag3 = _mm_xor_si128(diag3, a6); + diag2 = _mm_shuffle_epi32(diag2, 0x4e); + diag3 = _mm_xor_si128(diag3, b6); + + a7 = _mm_add_epi32(a7, diag3); + a0 = diag1; + b7 = a7; + a7 = _mm_slli_epi32(a7, 18); + b7 = _mm_srli_epi32(b7, 14); + diag0 = _mm_xor_si128(diag0, a7); + diag3 = _mm_shuffle_epi32(diag3, 0x39); + diag0 = _mm_xor_si128(diag0, b7); + + a0 = _mm_add_epi32(a0, diag0); + a1 = diag0; + b0 = a0; + a0 = _mm_slli_epi32(a0, 7); + b0 = _mm_srli_epi32(b0, 25); + diag3 = _mm_xor_si128(diag3, a0); + + diag3 = _mm_xor_si128(diag3, b0); + + a1 = _mm_add_epi32(a1, diag3); + a2 = diag3; + b1 = a1; + a1 = _mm_slli_epi32(a1, 9); + b1 = _mm_srli_epi32(b1, 23); + diag2 = _mm_xor_si128(diag2, a1); + diag3 = _mm_shuffle_epi32(diag3, 0x93); + diag2 = _mm_xor_si128(diag2, b1); + + a2 = _mm_add_epi32(a2, diag2); + a3 = diag2; + b2 = a2; + a2 = _mm_slli_epi32(a2, 13); + b2 = _mm_srli_epi32(b2, 19); + diag1 = _mm_xor_si128(diag1, a2); + diag2 = _mm_shuffle_epi32(diag2, 0x4e); + diag1 = _mm_xor_si128(diag1, b2); + + a3 = _mm_add_epi32(a3, diag1); + a4 = diag3; + b3 = a3; + a3 = _mm_slli_epi32(a3, 18); + b3 = _mm_srli_epi32(b3, 14); + diag0 = _mm_xor_si128(diag0, a3); + diag1 = _mm_shuffle_epi32(diag1, 0x39); + diag0 = 
_mm_xor_si128(diag0, b3); + + a4 = _mm_add_epi32(a4, diag0); + a5 = diag0; + b4 = a4; + a4 = _mm_slli_epi32(a4, 7); + b4 = _mm_srli_epi32(b4, 25); + diag1 = _mm_xor_si128(diag1, a4); + + diag1 = _mm_xor_si128(diag1, b4); + + a5 = _mm_add_epi32(a5, diag1); + a6 = diag1; + b5 = a5; + a5 = _mm_slli_epi32(a5, 9); + b5 = _mm_srli_epi32(b5, 23); + diag2 = _mm_xor_si128(diag2, a5); + diag1 = _mm_shuffle_epi32(diag1, 0x93); + diag2 = _mm_xor_si128(diag2, b5); + + a6 = _mm_add_epi32(a6, diag2); + a7 = diag2; + b6 = a6; + a6 = _mm_slli_epi32(a6, 13); + b6 = _mm_srli_epi32(b6, 19); + diag3 = _mm_xor_si128(diag3, a6); + diag2 = _mm_shuffle_epi32(diag2, 0x4e); + diag3 = _mm_xor_si128(diag3, b6); + + a7 = _mm_add_epi32(a7, diag3); + a0 = diag1; + b7 = a7; + a7 = _mm_slli_epi32(a7, 18); + b7 = _mm_srli_epi32(b7, 14); + diag0 = _mm_xor_si128(diag0, a7); + diag3 = _mm_shuffle_epi32(diag3, 0x39); + diag0 = _mm_xor_si128(diag0, b7); + } + + diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((__m128i *) (x + 0))); + diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((__m128i *) (x + 4))); + diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((__m128i *) (x + 8))); + diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((__m128i *) (x + 12))); + +#define ONEQUAD_SHUFFLE(A, B, C, D) \ + do { \ + uint32_t in##A = _mm_cvtsi128_si32(diag0); \ + uint32_t in##B = _mm_cvtsi128_si32(diag1); \ + uint32_t in##C = _mm_cvtsi128_si32(diag2); \ + uint32_t in##D = _mm_cvtsi128_si32(diag3); \ + diag0 = _mm_shuffle_epi32(diag0, 0x39); \ + diag1 = _mm_shuffle_epi32(diag1, 0x39); \ + diag2 = _mm_shuffle_epi32(diag2, 0x39); \ + diag3 = _mm_shuffle_epi32(diag3, 0x39); \ + in##A ^= *(uint32_t *) (m + (A * 4)); \ + in##B ^= *(uint32_t *) (m + (B * 4)); \ + in##C ^= *(uint32_t *) (m + (C * 4)); \ + in##D ^= *(uint32_t *) (m + (D * 4)); \ + *(uint32_t *) (c + (A * 4)) = in##A; \ + *(uint32_t *) (c + (B * 4)) = in##B; \ + *(uint32_t *) (c + (C * 4)) = in##C; \ + *(uint32_t *) (c + (D * 4)) = in##D; \ + } while (0) + +#define ONEQUAD(A, B, 
C, D) ONEQUAD_SHUFFLE(A, B, C, D) + + ONEQUAD(0, 12, 8, 4); + ONEQUAD(5, 1, 13, 9); + ONEQUAD(10, 6, 2, 14); + ONEQUAD(15, 11, 7, 3); + +#undef ONEQUAD +#undef ONEQUAD_SHUFFLE + + in8 = x[8]; + in9 = x[13]; + in8++; + if (in8 == 0) { + in9++; + } + x[8] = in8; + x[13] = in9; + + c += 64; + m += 64; + bytes -= 64; +} diff --git a/src/libsodium/crypto_stream/salsa20/xmm6int/u4.h b/src/libsodium/crypto_stream/salsa20/xmm6int/u4.h new file mode 100644 index 00000000..78b5be92 --- /dev/null +++ b/src/libsodium/crypto_stream/salsa20/xmm6int/u4.h @@ -0,0 +1,532 @@ +if (bytes >= 256) { + __m128i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, + y15; + __m128i z0, z1, z2, z3, z4, z5, z6, z7, z8, z9, z10, z11, z12, z13, z14, + z15; + __m128i orig0, orig1, orig2, orig3, orig4, orig5, orig6, orig7, orig8, + orig9, orig10, orig11, orig12, orig13, orig14, orig15; + + uint32_t in8; + uint32_t in9; + + /* element broadcast immediate for _mm_shuffle_epi32 are in order: + 0x00, 0x55, 0xaa, 0xff */ + z0 = _mm_loadu_si128((__m128i *) (x + 0)); + z5 = _mm_shuffle_epi32(z0, 0x55); + z10 = _mm_shuffle_epi32(z0, 0xaa); + z15 = _mm_shuffle_epi32(z0, 0xff); + z0 = _mm_shuffle_epi32(z0, 0x00); + z1 = _mm_loadu_si128((__m128i *) (x + 4)); + z6 = _mm_shuffle_epi32(z1, 0xaa); + z11 = _mm_shuffle_epi32(z1, 0xff); + z12 = _mm_shuffle_epi32(z1, 0x00); + z1 = _mm_shuffle_epi32(z1, 0x55); + z2 = _mm_loadu_si128((__m128i *) (x + 8)); + z7 = _mm_shuffle_epi32(z2, 0xff); + z13 = _mm_shuffle_epi32(z2, 0x55); + z2 = _mm_shuffle_epi32(z2, 0xaa); + /* no z8 -> first half of the nonce, will fill later */ + z3 = _mm_loadu_si128((__m128i *) (x + 12)); + z4 = _mm_shuffle_epi32(z3, 0x00); + z14 = _mm_shuffle_epi32(z3, 0xaa); + z3 = _mm_shuffle_epi32(z3, 0xff); + /* no z9 -> second half of the nonce, will fill later */ + orig0 = z0; + orig1 = z1; + orig2 = z2; + orig3 = z3; + orig4 = z4; + orig5 = z5; + orig6 = z6; + orig7 = z7; + orig10 = z10; + orig11 = z11; + orig12 = z12; + orig13 = z13; + 
orig14 = z14; + orig15 = z15; + + while (bytes >= 256) { + /* vector implementation for z8 and z9 */ + /* not sure if it helps for only 4 blocks */ + const __m128i addv8 = _mm_set_epi64x(1, 0); + const __m128i addv9 = _mm_set_epi64x(3, 2); + __m128i t8, t9; + uint64_t in89; + + in8 = x[8]; + in9 = x[13]; // see arrays above for the address translation + in89 = ((uint64_t) in8) | (((uint64_t) in9) << 32); + t8 = _mm_set1_epi64x(in89); + t9 = _mm_set1_epi64x(in89); + + z8 = _mm_add_epi64(addv8, t8); + z9 = _mm_add_epi64(addv9, t9); + + t8 = _mm_unpacklo_epi32(z8, z9); + t9 = _mm_unpackhi_epi32(z8, z9); + + z8 = _mm_unpacklo_epi32(t8, t9); + z9 = _mm_unpackhi_epi32(t8, t9); + + orig8 = z8; + orig9 = z9; + + in89 += 4; + + x[8] = in89 & 0xFFFFFFFF; + x[13] = (in89 >> 32) & 0xFFFFFFFF; + + z5 = orig5; + z10 = orig10; + z15 = orig15; + z14 = orig14; + z3 = orig3; + z6 = orig6; + z11 = orig11; + z1 = orig1; + + z7 = orig7; + z13 = orig13; + z2 = orig2; + z9 = orig9; + z0 = orig0; + z12 = orig12; + z4 = orig4; + z8 = orig8; + + for (i = 0; i < 20; i += 2) { + /* the inner loop is a direct translation (regexp search/replace) + * from the amd64-xmm6 ASM */ + __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, + r14, r15; + + y4 = z12; + y4 = _mm_add_epi32(y4, z0); + r4 = y4; + y4 = _mm_slli_epi32(y4, 7); + z4 = _mm_xor_si128(z4, y4); + r4 = _mm_srli_epi32(r4, 25); + z4 = _mm_xor_si128(z4, r4); + + y9 = z1; + y9 = _mm_add_epi32(y9, z5); + r9 = y9; + y9 = _mm_slli_epi32(y9, 7); + z9 = _mm_xor_si128(z9, y9); + r9 = _mm_srli_epi32(r9, 25); + z9 = _mm_xor_si128(z9, r9); + + y8 = z0; + y8 = _mm_add_epi32(y8, z4); + r8 = y8; + y8 = _mm_slli_epi32(y8, 9); + z8 = _mm_xor_si128(z8, y8); + r8 = _mm_srli_epi32(r8, 23); + z8 = _mm_xor_si128(z8, r8); + + y13 = z5; + y13 = _mm_add_epi32(y13, z9); + r13 = y13; + y13 = _mm_slli_epi32(y13, 9); + z13 = _mm_xor_si128(z13, y13); + r13 = _mm_srli_epi32(r13, 23); + z13 = _mm_xor_si128(z13, r13); + + y12 = z4; + y12 = 
_mm_add_epi32(y12, z8); + r12 = y12; + y12 = _mm_slli_epi32(y12, 13); + z12 = _mm_xor_si128(z12, y12); + r12 = _mm_srli_epi32(r12, 19); + z12 = _mm_xor_si128(z12, r12); + + y1 = z9; + y1 = _mm_add_epi32(y1, z13); + r1 = y1; + y1 = _mm_slli_epi32(y1, 13); + z1 = _mm_xor_si128(z1, y1); + r1 = _mm_srli_epi32(r1, 19); + z1 = _mm_xor_si128(z1, r1); + + y0 = z8; + y0 = _mm_add_epi32(y0, z12); + r0 = y0; + y0 = _mm_slli_epi32(y0, 18); + z0 = _mm_xor_si128(z0, y0); + r0 = _mm_srli_epi32(r0, 14); + z0 = _mm_xor_si128(z0, r0); + + y5 = z13; + y5 = _mm_add_epi32(y5, z1); + r5 = y5; + y5 = _mm_slli_epi32(y5, 18); + z5 = _mm_xor_si128(z5, y5); + r5 = _mm_srli_epi32(r5, 14); + z5 = _mm_xor_si128(z5, r5); + + y14 = z6; + y14 = _mm_add_epi32(y14, z10); + r14 = y14; + y14 = _mm_slli_epi32(y14, 7); + z14 = _mm_xor_si128(z14, y14); + r14 = _mm_srli_epi32(r14, 25); + z14 = _mm_xor_si128(z14, r14); + + y3 = z11; + y3 = _mm_add_epi32(y3, z15); + r3 = y3; + y3 = _mm_slli_epi32(y3, 7); + z3 = _mm_xor_si128(z3, y3); + r3 = _mm_srli_epi32(r3, 25); + z3 = _mm_xor_si128(z3, r3); + + y2 = z10; + y2 = _mm_add_epi32(y2, z14); + r2 = y2; + y2 = _mm_slli_epi32(y2, 9); + z2 = _mm_xor_si128(z2, y2); + r2 = _mm_srli_epi32(r2, 23); + z2 = _mm_xor_si128(z2, r2); + + y7 = z15; + y7 = _mm_add_epi32(y7, z3); + r7 = y7; + y7 = _mm_slli_epi32(y7, 9); + z7 = _mm_xor_si128(z7, y7); + r7 = _mm_srli_epi32(r7, 23); + z7 = _mm_xor_si128(z7, r7); + + y6 = z14; + y6 = _mm_add_epi32(y6, z2); + r6 = y6; + y6 = _mm_slli_epi32(y6, 13); + z6 = _mm_xor_si128(z6, y6); + r6 = _mm_srli_epi32(r6, 19); + z6 = _mm_xor_si128(z6, r6); + + y11 = z3; + y11 = _mm_add_epi32(y11, z7); + r11 = y11; + y11 = _mm_slli_epi32(y11, 13); + z11 = _mm_xor_si128(z11, y11); + r11 = _mm_srli_epi32(r11, 19); + z11 = _mm_xor_si128(z11, r11); + + y10 = z2; + y10 = _mm_add_epi32(y10, z6); + r10 = y10; + y10 = _mm_slli_epi32(y10, 18); + z10 = _mm_xor_si128(z10, y10); + r10 = _mm_srli_epi32(r10, 14); + z10 = _mm_xor_si128(z10, r10); + + y1 = z3; + y1 = 
_mm_add_epi32(y1, z0); + r1 = y1; + y1 = _mm_slli_epi32(y1, 7); + z1 = _mm_xor_si128(z1, y1); + r1 = _mm_srli_epi32(r1, 25); + z1 = _mm_xor_si128(z1, r1); + + y15 = z7; + y15 = _mm_add_epi32(y15, z11); + r15 = y15; + y15 = _mm_slli_epi32(y15, 18); + z15 = _mm_xor_si128(z15, y15); + r15 = _mm_srli_epi32(r15, 14); + z15 = _mm_xor_si128(z15, r15); + + y6 = z4; + y6 = _mm_add_epi32(y6, z5); + r6 = y6; + y6 = _mm_slli_epi32(y6, 7); + z6 = _mm_xor_si128(z6, y6); + r6 = _mm_srli_epi32(r6, 25); + z6 = _mm_xor_si128(z6, r6); + + y2 = z0; + y2 = _mm_add_epi32(y2, z1); + r2 = y2; + y2 = _mm_slli_epi32(y2, 9); + z2 = _mm_xor_si128(z2, y2); + r2 = _mm_srli_epi32(r2, 23); + z2 = _mm_xor_si128(z2, r2); + + y7 = z5; + y7 = _mm_add_epi32(y7, z6); + r7 = y7; + y7 = _mm_slli_epi32(y7, 9); + z7 = _mm_xor_si128(z7, y7); + r7 = _mm_srli_epi32(r7, 23); + z7 = _mm_xor_si128(z7, r7); + + y3 = z1; + y3 = _mm_add_epi32(y3, z2); + r3 = y3; + y3 = _mm_slli_epi32(y3, 13); + z3 = _mm_xor_si128(z3, y3); + r3 = _mm_srli_epi32(r3, 19); + z3 = _mm_xor_si128(z3, r3); + + y4 = z6; + y4 = _mm_add_epi32(y4, z7); + r4 = y4; + y4 = _mm_slli_epi32(y4, 13); + z4 = _mm_xor_si128(z4, y4); + r4 = _mm_srli_epi32(r4, 19); + z4 = _mm_xor_si128(z4, r4); + + y0 = z2; + y0 = _mm_add_epi32(y0, z3); + r0 = y0; + y0 = _mm_slli_epi32(y0, 18); + z0 = _mm_xor_si128(z0, y0); + r0 = _mm_srli_epi32(r0, 14); + z0 = _mm_xor_si128(z0, r0); + + y5 = z7; + y5 = _mm_add_epi32(y5, z4); + r5 = y5; + y5 = _mm_slli_epi32(y5, 18); + z5 = _mm_xor_si128(z5, y5); + r5 = _mm_srli_epi32(r5, 14); + z5 = _mm_xor_si128(z5, r5); + + y11 = z9; + y11 = _mm_add_epi32(y11, z10); + r11 = y11; + y11 = _mm_slli_epi32(y11, 7); + z11 = _mm_xor_si128(z11, y11); + r11 = _mm_srli_epi32(r11, 25); + z11 = _mm_xor_si128(z11, r11); + + y12 = z14; + y12 = _mm_add_epi32(y12, z15); + r12 = y12; + y12 = _mm_slli_epi32(y12, 7); + z12 = _mm_xor_si128(z12, y12); + r12 = _mm_srli_epi32(r12, 25); + z12 = _mm_xor_si128(z12, r12); + + y8 = z10; + y8 = _mm_add_epi32(y8, 
z11); + r8 = y8; + y8 = _mm_slli_epi32(y8, 9); + z8 = _mm_xor_si128(z8, y8); + r8 = _mm_srli_epi32(r8, 23); + z8 = _mm_xor_si128(z8, r8); + + y13 = z15; + y13 = _mm_add_epi32(y13, z12); + r13 = y13; + y13 = _mm_slli_epi32(y13, 9); + z13 = _mm_xor_si128(z13, y13); + r13 = _mm_srli_epi32(r13, 23); + z13 = _mm_xor_si128(z13, r13); + + y9 = z11; + y9 = _mm_add_epi32(y9, z8); + r9 = y9; + y9 = _mm_slli_epi32(y9, 13); + z9 = _mm_xor_si128(z9, y9); + r9 = _mm_srli_epi32(r9, 19); + z9 = _mm_xor_si128(z9, r9); + + y14 = z12; + y14 = _mm_add_epi32(y14, z13); + r14 = y14; + y14 = _mm_slli_epi32(y14, 13); + z14 = _mm_xor_si128(z14, y14); + r14 = _mm_srli_epi32(r14, 19); + z14 = _mm_xor_si128(z14, r14); + + y10 = z8; + y10 = _mm_add_epi32(y10, z9); + r10 = y10; + y10 = _mm_slli_epi32(y10, 18); + z10 = _mm_xor_si128(z10, y10); + r10 = _mm_srli_epi32(r10, 14); + z10 = _mm_xor_si128(z10, r10); + + y15 = z13; + y15 = _mm_add_epi32(y15, z14); + r15 = y15; + y15 = _mm_slli_epi32(y15, 18); + z15 = _mm_xor_si128(z15, y15); + r15 = _mm_srli_epi32(r15, 14); + z15 = _mm_xor_si128(z15, r15); + } + +/* store data ; this macro replicates the original amd64-xmm6 code */ +#define ONEQUAD_SHUFFLE(A, B, C, D) \ + z##A = _mm_add_epi32(z##A, orig##A); \ + z##B = _mm_add_epi32(z##B, orig##B); \ + z##C = _mm_add_epi32(z##C, orig##C); \ + z##D = _mm_add_epi32(z##D, orig##D); \ + in##A = _mm_cvtsi128_si32(z##A); \ + in##B = _mm_cvtsi128_si32(z##B); \ + in##C = _mm_cvtsi128_si32(z##C); \ + in##D = _mm_cvtsi128_si32(z##D); \ + z##A = _mm_shuffle_epi32(z##A, 0x39); \ + z##B = _mm_shuffle_epi32(z##B, 0x39); \ + z##C = _mm_shuffle_epi32(z##C, 0x39); \ + z##D = _mm_shuffle_epi32(z##D, 0x39); \ + in##A ^= *(uint32_t *) (m + 0); \ + in##B ^= *(uint32_t *) (m + 4); \ + in##C ^= *(uint32_t *) (m + 8); \ + in##D ^= *(uint32_t *) (m + 12); \ + *(uint32_t *) (c + 0) = in##A; \ + *(uint32_t *) (c + 4) = in##B; \ + *(uint32_t *) (c + 8) = in##C; \ + *(uint32_t *) (c + 12) = in##D; \ + in##A = 
_mm_cvtsi128_si32(z##A); \ + in##B = _mm_cvtsi128_si32(z##B); \ + in##C = _mm_cvtsi128_si32(z##C); \ + in##D = _mm_cvtsi128_si32(z##D); \ + z##A = _mm_shuffle_epi32(z##A, 0x39); \ + z##B = _mm_shuffle_epi32(z##B, 0x39); \ + z##C = _mm_shuffle_epi32(z##C, 0x39); \ + z##D = _mm_shuffle_epi32(z##D, 0x39); \ + in##A ^= *(uint32_t *) (m + 64); \ + in##B ^= *(uint32_t *) (m + 68); \ + in##C ^= *(uint32_t *) (m + 72); \ + in##D ^= *(uint32_t *) (m + 76); \ + *(uint32_t *) (c + 64) = in##A; \ + *(uint32_t *) (c + 68) = in##B; \ + *(uint32_t *) (c + 72) = in##C; \ + *(uint32_t *) (c + 76) = in##D; \ + in##A = _mm_cvtsi128_si32(z##A); \ + in##B = _mm_cvtsi128_si32(z##B); \ + in##C = _mm_cvtsi128_si32(z##C); \ + in##D = _mm_cvtsi128_si32(z##D); \ + z##A = _mm_shuffle_epi32(z##A, 0x39); \ + z##B = _mm_shuffle_epi32(z##B, 0x39); \ + z##C = _mm_shuffle_epi32(z##C, 0x39); \ + z##D = _mm_shuffle_epi32(z##D, 0x39); \ + in##A ^= *(uint32_t *) (m + 128); \ + in##B ^= *(uint32_t *) (m + 132); \ + in##C ^= *(uint32_t *) (m + 136); \ + in##D ^= *(uint32_t *) (m + 140); \ + *(uint32_t *) (c + 128) = in##A; \ + *(uint32_t *) (c + 132) = in##B; \ + *(uint32_t *) (c + 136) = in##C; \ + *(uint32_t *) (c + 140) = in##D; \ + in##A = _mm_cvtsi128_si32(z##A); \ + in##B = _mm_cvtsi128_si32(z##B); \ + in##C = _mm_cvtsi128_si32(z##C); \ + in##D = _mm_cvtsi128_si32(z##D); \ + in##A ^= *(uint32_t *) (m + 192); \ + in##B ^= *(uint32_t *) (m + 196); \ + in##C ^= *(uint32_t *) (m + 200); \ + in##D ^= *(uint32_t *) (m + 204); \ + *(uint32_t *) (c + 192) = in##A; \ + *(uint32_t *) (c + 196) = in##B; \ + *(uint32_t *) (c + 200) = in##C; \ + *(uint32_t *) (c + 204) = in##D + +/* store data ; this macro replaces shuffle+mov by a direct extract; not much + * difference */ +#define ONEQUAD_EXTRACT(A, B, C, D) \ + z##A = _mm_add_epi32(z##A, orig##A); \ + z##B = _mm_add_epi32(z##B, orig##B); \ + z##C = _mm_add_epi32(z##C, orig##C); \ + z##D = _mm_add_epi32(z##D, orig##D); \ + in##A = _mm_cvtsi128_si32(z##A); \ + 
in##B = _mm_cvtsi128_si32(z##B); \ + in##C = _mm_cvtsi128_si32(z##C); \ + in##D = _mm_cvtsi128_si32(z##D); \ + in##A ^= *(uint32_t *) (m + 0); \ + in##B ^= *(uint32_t *) (m + 4); \ + in##C ^= *(uint32_t *) (m + 8); \ + in##D ^= *(uint32_t *) (m + 12); \ + *(uint32_t *) (c + 0) = in##A; \ + *(uint32_t *) (c + 4) = in##B; \ + *(uint32_t *) (c + 8) = in##C; \ + *(uint32_t *) (c + 12) = in##D; \ + in##A = _mm_extract_epi32(z##A, 1); \ + in##B = _mm_extract_epi32(z##B, 1); \ + in##C = _mm_extract_epi32(z##C, 1); \ + in##D = _mm_extract_epi32(z##D, 1); \ + in##A ^= *(uint32_t *) (m + 64); \ + in##B ^= *(uint32_t *) (m + 68); \ + in##C ^= *(uint32_t *) (m + 72); \ + in##D ^= *(uint32_t *) (m + 76); \ + *(uint32_t *) (c + 64) = in##A; \ + *(uint32_t *) (c + 68) = in##B; \ + *(uint32_t *) (c + 72) = in##C; \ + *(uint32_t *) (c + 76) = in##D; \ + in##A = _mm_extract_epi32(z##A, 2); \ + in##B = _mm_extract_epi32(z##B, 2); \ + in##C = _mm_extract_epi32(z##C, 2); \ + in##D = _mm_extract_epi32(z##D, 2); \ + in##A ^= *(uint32_t *) (m + 128); \ + in##B ^= *(uint32_t *) (m + 132); \ + in##C ^= *(uint32_t *) (m + 136); \ + in##D ^= *(uint32_t *) (m + 140); \ + *(uint32_t *) (c + 128) = in##A; \ + *(uint32_t *) (c + 132) = in##B; \ + *(uint32_t *) (c + 136) = in##C; \ + *(uint32_t *) (c + 140) = in##D; \ + in##A = _mm_extract_epi32(z##A, 3); \ + in##B = _mm_extract_epi32(z##B, 3); \ + in##C = _mm_extract_epi32(z##C, 3); \ + in##D = _mm_extract_epi32(z##D, 3); \ + in##A ^= *(uint32_t *) (m + 192); \ + in##B ^= *(uint32_t *) (m + 196); \ + in##C ^= *(uint32_t *) (m + 200); \ + in##D ^= *(uint32_t *) (m + 204); \ + *(uint32_t *) (c + 192) = in##A; \ + *(uint32_t *) (c + 196) = in##B; \ + *(uint32_t *) (c + 200) = in##C; \ + *(uint32_t *) (c + 204) = in##D + +/* store data ; this macro first transpose data in-registers, and then store + * them in memory. much faster with icc. 
*/ +#define ONEQUAD_TRANSPOSE(A, B, C, D) \ + z##A = _mm_add_epi32(z##A, orig##A); \ + z##B = _mm_add_epi32(z##B, orig##B); \ + z##C = _mm_add_epi32(z##C, orig##C); \ + z##D = _mm_add_epi32(z##D, orig##D); \ + y##A = _mm_unpacklo_epi32(z##A, z##B); \ + y##B = _mm_unpacklo_epi32(z##C, z##D); \ + y##C = _mm_unpackhi_epi32(z##A, z##B); \ + y##D = _mm_unpackhi_epi32(z##C, z##D); \ + z##A = _mm_unpacklo_epi64(y##A, y##B); \ + z##B = _mm_unpackhi_epi64(y##A, y##B); \ + z##C = _mm_unpacklo_epi64(y##C, y##D); \ + z##D = _mm_unpackhi_epi64(y##C, y##D); \ + y##A = _mm_xor_si128(z##A, _mm_loadu_si128((__m128i *) (m + 0))); \ + _mm_storeu_si128((__m128i *) (c + 0), y##A); \ + y##B = _mm_xor_si128(z##B, _mm_loadu_si128((__m128i *) (m + 64))); \ + _mm_storeu_si128((__m128i *) (c + 64), y##B); \ + y##C = _mm_xor_si128(z##C, _mm_loadu_si128((__m128i *) (m + 128))); \ + _mm_storeu_si128((__m128i *) (c + 128), y##C); \ + y##D = _mm_xor_si128(z##D, _mm_loadu_si128((__m128i *) (m + 192))); \ + _mm_storeu_si128((__m128i *) (c + 192), y##D) + +#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D) + + ONEQUAD(0, 1, 2, 3); + m += 16; + c += 16; + ONEQUAD(4, 5, 6, 7); + m += 16; + c += 16; + ONEQUAD(8, 9, 10, 11); + m += 16; + c += 16; + ONEQUAD(12, 13, 14, 15); + m -= 48; + c -= 48; + +#undef ONEQUAD +#undef ONEQUAD_TRANSPOSE +#undef ONEQUAD_EXTRACT +#undef ONEQUAD_SHUFFLE + + bytes -= 256; + c += 256; + m += 256; + } +} diff --git a/src/libsodium/crypto_stream/salsa2012/ref/stream_salsa2012_ref.c b/src/libsodium/crypto_stream/salsa2012/ref/stream_salsa2012_ref.c index 64ee11bb..bfdfeedb 100644 --- a/src/libsodium/crypto_stream/salsa2012/ref/stream_salsa2012_ref.c +++ b/src/libsodium/crypto_stream/salsa2012/ref/stream_salsa2012_ref.c @@ -4,6 +4,8 @@ D. J. Bernstein Public domain. 
*/ +#include + #include "crypto_core_salsa2012.h" #include "crypto_stream_salsa2012.h" #include "utils.h" @@ -32,18 +34,15 @@ crypto_stream_salsa2012(unsigned char *c, unsigned long long clen, } while (clen >= 64) { crypto_core_salsa2012(c, in, kcopy, NULL); - u = 1; for (i = 8; i < 16; ++i) { u += (unsigned int)in[i]; in[i] = u; u >>= 8; } - clen -= 64; c += 64; } - if (clen) { crypto_core_salsa2012(block, in, kcopy, NULL); for (i = 0; i < (unsigned int)clen; ++i) { @@ -55,3 +54,53 @@ crypto_stream_salsa2012(unsigned char *c, unsigned long long clen, return 0; } + /* crypto_stream_salsa2012_xor: writes c[j] = m[j] ^ ks[j] for j in [0, mlen), where ks is the concatenation of successive 64-byte outputs of crypto_core_salsa2012(). Each call is given a 16-byte input (n[0..7] followed by an 8-byte block counter that starts at zero, least-significant byte first) and a local copy of k[0..31]. Returns 0 in every case, and returns at once without touching c when mlen is 0. block and kcopy are passed to sodium_memzero() before the normal return. */ +int +crypto_stream_salsa2012_xor(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, + const unsigned char *k) +{ + unsigned char in[16]; /* bytes 0..7: copy of n, bytes 8..15: block counter */ + unsigned char block[64]; /* receives one 64-byte output of crypto_core_salsa2012() */ + unsigned char kcopy[32]; /* local copy of the first 32 bytes of k */ + unsigned int i; + unsigned int u; + + if (!mlen) { + return 0; + } + for (i = 0; i < 32; ++i) { + kcopy[i] = k[i]; + } + for (i = 0; i < 8; ++i) { + in[i] = n[i]; + } + for (i = 8; i < 16; ++i) { + in[i] = 0; + } + while (mlen >= 64) { + crypto_core_salsa2012(block, in, kcopy, NULL); + for (i = 0; i < 64; ++i) { + c[i] = m[i] ^ block[i]; + } + u = 1; /* add one to the counter held in in[8..15]; u carries into the next byte */ + for (i = 8; i < 16; ++i) { + u += (unsigned int)in[i]; + in[i] = u; + u >>= 8; + } + mlen -= 64; + c += 64; + m += 64; + } + if (mlen) { /* trailing partial block: fewer than 64 bytes remain, only mlen bytes of c are written */ + crypto_core_salsa2012(block, in, kcopy, NULL); + for (i = 0; i < (unsigned int)mlen; ++i) { + c[i] = m[i] ^ block[i]; + } + } + sodium_memzero(block, sizeof block); + sodium_memzero(kcopy, sizeof kcopy); + + return 0; +} diff --git a/src/libsodium/crypto_stream/salsa2012/ref/xor_salsa2012.c b/src/libsodium/crypto_stream/salsa2012/ref/xor_salsa2012.c index ef620324..bfdfeedb 100644 --- a/src/libsodium/crypto_stream/salsa2012/ref/xor_salsa2012.c +++ b/src/libsodium/crypto_stream/salsa2012/ref/xor_salsa2012.c @@ -4,10 +4,57 @@ D. J. Bernstein Public domain.
*/ +#include + #include "crypto_core_salsa2012.h" #include "crypto_stream_salsa2012.h" #include "utils.h" +/* crypto_stream_salsa2012: fills c[0..clen) with the concatenation of successive 64-byte outputs of crypto_core_salsa2012(). Each call is given a 16-byte input (n[0..7] followed by an 8-byte block counter that starts at zero, least-significant byte first) and a local copy of k[0..31]. Returns 0 in every case, and returns at once without touching c when clen is 0. block and kcopy are passed to sodium_memzero() before the normal return. */ int +crypto_stream_salsa2012(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k) +{ + unsigned char in[16]; /* bytes 0..7: copy of n, bytes 8..15: block counter */ + unsigned char block[64]; /* scratch for the final partial block only */ + unsigned char kcopy[32]; /* local copy of the first 32 bytes of k */ + unsigned int i; + unsigned int u; + + if (!clen) { + return 0; + } + for (i = 0; i < 32; ++i) { + kcopy[i] = k[i]; + } + for (i = 0; i < 8; ++i) { + in[i] = n[i]; + } + for (i = 8; i < 16; ++i) { + in[i] = 0; + } + while (clen >= 64) { + crypto_core_salsa2012(c, in, kcopy, NULL); /* whole blocks are produced directly in the caller's buffer */ + u = 1; /* add one to the counter held in in[8..15]; u carries into the next byte */ + for (i = 8; i < 16; ++i) { + u += (unsigned int)in[i]; + in[i] = u; + u >>= 8; + } + clen -= 64; + c += 64; + } + if (clen) { /* trailing partial block: generate into block, copy only clen (< 64) bytes */ + crypto_core_salsa2012(block, in, kcopy, NULL); + for (i = 0; i < (unsigned int)clen; ++i) { + c[i] = block[i]; + } + } + sodium_memzero(block, sizeof block); + sodium_memzero(kcopy, sizeof kcopy); + + return 0; +} + int crypto_stream_salsa2012_xor(unsigned char *c, const unsigned char *m, unsigned long long mlen, const unsigned char *n, @@ -46,7 +93,6 @@ crypto_stream_salsa2012_xor(unsigned char *c, const unsigned char *m, c += 64; m += 64; } - if (mlen) { crypto_core_salsa2012(block, in, kcopy, NULL); for (i = 0; i < (unsigned int)mlen; ++i) { diff --git a/src/libsodium/crypto_stream/salsa208/ref/stream_salsa208_ref.c b/src/libsodium/crypto_stream/salsa208/ref/stream_salsa208_ref.c index b0e9d857..7ec0c4e7 100644 --- a/src/libsodium/crypto_stream/salsa208/ref/stream_salsa208_ref.c +++ b/src/libsodium/crypto_stream/salsa208/ref/stream_salsa208_ref.c @@ -4,6 +4,8 @@ D. J. Bernstein Public domain.
*/ +#include + #include "crypto_core_salsa208.h" #include "crypto_stream_salsa208.h" #include "utils.h" @@ -32,7 +34,6 @@ crypto_stream_salsa208(unsigned char *c, unsigned long long clen, } while (clen >= 64) { crypto_core_salsa208(c, in, kcopy, NULL); - u = 1; for (i = 8; i < 16; ++i) { u += (unsigned int)in[i]; @@ -42,7 +43,6 @@ crypto_stream_salsa208(unsigned char *c, unsigned long long clen, clen -= 64; c += 64; } - if (clen) { crypto_core_salsa208(block, in, kcopy, NULL); for (i = 0; i < (unsigned int)clen; ++i) { @@ -54,3 +54,53 @@ crypto_stream_salsa208(unsigned char *c, unsigned long long clen, return 0; } + /* crypto_stream_salsa208_xor: writes c[j] = m[j] ^ ks[j] for j in [0, mlen), where ks is the concatenation of successive 64-byte outputs of crypto_core_salsa208(). Each call is given a 16-byte input (n[0..7] followed by an 8-byte block counter that starts at zero, least-significant byte first) and a local copy of k[0..31]. Returns 0 in every case, and returns at once without touching c when mlen is 0. block and kcopy are passed to sodium_memzero() before the normal return. */ +int +crypto_stream_salsa208_xor(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, + const unsigned char *k) +{ + unsigned char in[16]; /* bytes 0..7: copy of n, bytes 8..15: block counter */ + unsigned char block[64]; /* receives one 64-byte output of crypto_core_salsa208() */ + unsigned char kcopy[32]; /* local copy of the first 32 bytes of k */ + unsigned int i; + unsigned int u; + + if (!mlen) { + return 0; + } + for (i = 0; i < 32; ++i) { + kcopy[i] = k[i]; + } + for (i = 0; i < 8; ++i) { + in[i] = n[i]; + } + for (i = 8; i < 16; ++i) { + in[i] = 0; + } + while (mlen >= 64) { + crypto_core_salsa208(block, in, kcopy, NULL); + for (i = 0; i < 64; ++i) { + c[i] = m[i] ^ block[i]; + } + u = 1; /* add one to the counter held in in[8..15]; u carries into the next byte */ + for (i = 8; i < 16; ++i) { + u += (unsigned int)in[i]; + in[i] = u; + u >>= 8; + } + mlen -= 64; + c += 64; + m += 64; + } + if (mlen) { /* trailing partial block: fewer than 64 bytes remain, only mlen bytes of c are written */ + crypto_core_salsa208(block, in, kcopy, NULL); + for (i = 0; i < (unsigned int)mlen; ++i) { + c[i] = m[i] ^ block[i]; + } + } + sodium_memzero(block, sizeof block); + sodium_memzero(kcopy, sizeof kcopy); + + return 0; +} diff --git a/src/libsodium/crypto_stream/salsa208/ref/xor_salsa208.c b/src/libsodium/crypto_stream/salsa208/ref/xor_salsa208.c deleted file mode 100644 index fe84bc4e..00000000 --- a/src/libsodium/crypto_stream/salsa208/ref/xor_salsa208.c +++ /dev/null @@ -1,60 +0,0 @@ -/* -version 20140420 -D. J. Bernstein -Public domain.
-*/ - -#include "crypto_core_salsa208.h" -#include "crypto_stream_salsa208.h" -#include "utils.h" - -int -crypto_stream_salsa208_xor(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, - const unsigned char *k) -{ - unsigned char in[16]; - unsigned char block[64]; - unsigned char kcopy[32]; - unsigned int i; - unsigned int u; - - if (!mlen) { - return 0; - } - for (i = 0; i < 32; ++i) { - kcopy[i] = k[i]; - } - for (i = 0; i < 8; ++i) { - in[i] = n[i]; - } - for (i = 8; i < 16; ++i) { - in[i] = 0; - } - while (mlen >= 64) { - crypto_core_salsa208(block, in, kcopy, NULL); - for (i = 0; i < 64; ++i) { - c[i] = m[i] ^ block[i]; - } - u = 1; - for (i = 8; i < 16; ++i) { - u += (unsigned int)in[i]; - in[i] = u; - u >>= 8; - } - mlen -= 64; - c += 64; - m += 64; - } - - if (mlen) { - crypto_core_salsa208(block, in, kcopy, NULL); - for (i = 0; i < (unsigned int)mlen; ++i) { - c[i] = m[i] ^ block[i]; - } - } - sodium_memzero(block, sizeof block); - sodium_memzero(kcopy, sizeof kcopy); - - return 0; -} diff --git a/src/libsodium/include/sodium/crypto_stream_salsa20.h b/src/libsodium/include/sodium/crypto_stream_salsa20.h index 961e5c1c..741140eb 100644 --- a/src/libsodium/include/sodium/crypto_stream_salsa20.h +++ b/src/libsodium/include/sodium/crypto_stream_salsa20.h @@ -46,6 +46,10 @@ int crypto_stream_salsa20_xor_ic(unsigned char *c, const unsigned char *m, SODIUM_EXPORT void crypto_stream_salsa20_keygen(unsigned char k[crypto_stream_salsa20_KEYBYTES]); +/* ------------------------------------------------------------------------- */ + +int _crypto_stream_salsa20_pick_best_implementation(void); + #ifdef __cplusplus } #endif diff --git a/src/libsodium/sodium/core.c b/src/libsodium/sodium/core.c index e5064cf5..99fd5b4a 100644 --- a/src/libsodium/sodium/core.c +++ b/src/libsodium/sodium/core.c @@ -55,6 +55,7 @@ sodium_init(void) _crypto_onetimeauth_poly1305_pick_best_implementation(); 
_crypto_scalarmult_curve25519_pick_best_implementation(); _crypto_stream_chacha20_pick_best_implementation(); + _crypto_stream_salsa20_pick_best_implementation(); initialized = 1; if (sodium_crit_leave() != 0) { return -1;