diff --git a/AUTHORS b/AUTHORS index b1179c52..0c46c75b 100644 --- a/AUTHORS +++ b/AUTHORS @@ -104,7 +104,8 @@ crypto_stream/aes128ctr Peter Schwabe crypto_stream/chacha20/ref Daniel J. Bernstein -crypto_stream/chacha20/vec Ted Krovetz +crypto_stream/chacha20/dolbeau Romain Dolbeau + Daniel J. Bernstein crypto_stream/salsa20/ref Daniel J. Bernstein diff --git a/src/libsodium/Makefile.am b/src/libsodium/Makefile.am index 49cc9699..2ecdb4d3 100644 --- a/src/libsodium/Makefile.am +++ b/src/libsodium/Makefile.am @@ -221,8 +221,11 @@ libssse3_la_SOURCES = \ crypto_generichash/blake2b/ref/blake2b-compress-ssse3.h \ crypto_pwhash/argon2/argon2-fill-block-ssse3.c \ crypto_pwhash/argon2/blamka-round-ssse3.h \ - crypto_stream/chacha20/vec/chacha20_vec.h \ - crypto_stream/chacha20/vec/chacha20_vec.c + crypto_stream/chacha20/dolbeau/chacha20_dolbeau-ssse3.c \ + crypto_stream/chacha20/dolbeau/chacha20_dolbeau-ssse3.h \ + crypto_stream/chacha20/dolbeau/u0.h \ + crypto_stream/chacha20/dolbeau/u1.h \ + crypto_stream/chacha20/dolbeau/u4.h libsse41_la_LDFLAGS = $(libsodium_la_LDFLAGS) libsse41_la_CPPFLAGS = $(libsodium_la_CPPFLAGS) \ @@ -239,9 +242,6 @@ libavx2_la_SOURCES = \ crypto_generichash/blake2b/ref/blake2b-compress-avx2.h \ crypto_stream/chacha20/dolbeau/chacha20_dolbeau-avx2.c \ crypto_stream/chacha20/dolbeau/chacha20_dolbeau-avx2.h \ - crypto_stream/chacha20/dolbeau/u0.h \ - crypto_stream/chacha20/dolbeau/u1.h \ - crypto_stream/chacha20/dolbeau/u4.h \ crypto_stream/chacha20/dolbeau/u8.h \ crypto_stream/salsa20/xmm6int/salsa20_xmm6int-avx2.c \ crypto_stream/salsa20/xmm6int/salsa20_xmm6int-avx2.h \ diff --git a/src/libsodium/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-ssse3.c b/src/libsodium/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-ssse3.c new file mode 100644 index 00000000..a9e728c5 --- /dev/null +++ b/src/libsodium/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-ssse3.c @@ -0,0 +1,172 @@ + +#include +#include +#include + +#include "crypto_stream_chacha20.h" +#include "private/common.h" +#include "utils.h" + +#if defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) + +# ifdef __GNUC__ +# pragma GCC target("sse2") +# pragma GCC target("ssse3") +# endif + +# include +# include + +# include "../stream_chacha20.h" +# include "chacha20_dolbeau-ssse3.h" + +# define ROUNDS 20 + +typedef struct chacha_ctx { + uint32_t input[16]; +} chacha_ctx; + +static void +chacha_keysetup(chacha_ctx *ctx, const uint8_t *k) +{ + ctx->input[0] = 0x61707865; + ctx->input[1] = 0x3320646e; + ctx->input[2] = 0x79622d32; + ctx->input[3] = 0x6b206574; + ctx->input[4] = LOAD32_LE(k + 0); + ctx->input[5] = LOAD32_LE(k + 4); + ctx->input[6] = LOAD32_LE(k + 8); + ctx->input[7] = LOAD32_LE(k + 12); + ctx->input[8] = LOAD32_LE(k + 16); + ctx->input[9] = LOAD32_LE(k + 20); + ctx->input[10] = LOAD32_LE(k + 24); + ctx->input[11] = LOAD32_LE(k + 28); +} + +static void +chacha_ivsetup(chacha_ctx *ctx, const uint8_t *iv, const uint8_t *counter) +{ + ctx->input[12] = counter == NULL ? 0 : LOAD32_LE(counter + 0); + ctx->input[13] = counter == NULL ? 0 : LOAD32_LE(counter + 4); + ctx->input[14] = LOAD32_LE(iv + 0); + ctx->input[15] = LOAD32_LE(iv + 4); +} + +static void +chacha_ietf_ivsetup(chacha_ctx *ctx, const uint8_t *iv, const uint8_t *counter) +{ + ctx->input[12] = counter == NULL ? 0 : LOAD32_LE(counter); + ctx->input[13] = LOAD32_LE(iv + 0); + ctx->input[14] = LOAD32_LE(iv + 4); + ctx->input[15] = LOAD32_LE(iv + 8); +} + +static void +chacha20_encrypt_bytes(chacha_ctx *ctx, const uint8_t *m, uint8_t *c, + unsigned long long bytes) +{ + uint32_t * const x = &ctx->input[0]; + + if (!bytes) { + return; /* LCOV_EXCL_LINE */ + } + if (bytes > 64ULL * (1ULL << 32) - 64ULL) { + abort(); + } +# include "u4.h" +# include "u1.h" +# include "u0.h" +} + +static int +stream_ref(unsigned char *c, unsigned long long clen, const unsigned char *n, + const unsigned char *k) +{ + struct chacha_ctx ctx; + + if (!clen) { + return 0; + } + COMPILER_ASSERT(crypto_stream_chacha20_KEYBYTES == 256 / 8); + chacha_keysetup(&ctx, k); + chacha_ivsetup(&ctx, n, NULL); + memset(c, 0, clen); + chacha20_encrypt_bytes(&ctx, c, c, clen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +static int +stream_ietf_ref(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k) +{ + struct chacha_ctx ctx; + + if (!clen) { + return 0; + } + COMPILER_ASSERT(crypto_stream_chacha20_KEYBYTES == 256 / 8); + chacha_keysetup(&ctx, k); + chacha_ietf_ivsetup(&ctx, n, NULL); + memset(c, 0, clen); + chacha20_encrypt_bytes(&ctx, c, c, clen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +static int +stream_ref_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, uint64_t ic, + const unsigned char *k) +{ + struct chacha_ctx ctx; + uint8_t ic_bytes[8]; + uint32_t ic_high; + uint32_t ic_low; + + if (!mlen) { + return 0; + } + ic_high = (uint32_t) (ic >> 32); + ic_low = (uint32_t) ic; + STORE32_LE(&ic_bytes[0], ic_low); + STORE32_LE(&ic_bytes[4], ic_high); + chacha_keysetup(&ctx, k); + chacha_ivsetup(&ctx, n, ic_bytes); + chacha20_encrypt_bytes(&ctx, m, c, mlen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +static int +stream_ietf_ref_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, + uint32_t ic, const unsigned char *k) +{ + struct chacha_ctx ctx; + uint8_t ic_bytes[4]; + + if (!mlen) { + return 0; + } + STORE32_LE(ic_bytes, ic); + chacha_keysetup(&ctx, k); + chacha_ietf_ivsetup(&ctx, n, ic_bytes); + chacha20_encrypt_bytes(&ctx, m, c, mlen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +struct crypto_stream_chacha20_implementation + crypto_stream_chacha20_dolbeau_ssse3_implementation = { + SODIUM_C99(.stream =) stream_ref, + SODIUM_C99(.stream_ietf =) stream_ietf_ref, + SODIUM_C99(.stream_xor_ic =) stream_ref_xor_ic, + SODIUM_C99(.stream_ietf_xor_ic =) stream_ietf_ref_xor_ic + }; + +#endif diff --git a/src/libsodium/crypto_stream/chacha20/vec/chacha20_vec.h b/src/libsodium/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-ssse3.h similarity index 71% rename from src/libsodium/crypto_stream/chacha20/vec/chacha20_vec.h rename to src/libsodium/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-ssse3.h index 1bbe37c7..d67630f6 100644 --- a/src/libsodium/crypto_stream/chacha20/vec/chacha20_vec.h +++ b/src/libsodium/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-ssse3.h @@ -1,8 +1,8 @@ #include -#include "crypto_stream_chacha20.h" #include "../stream_chacha20.h" +#include "crypto_stream_chacha20.h" extern struct crypto_stream_chacha20_implementation - crypto_stream_chacha20_vec_implementation; + crypto_stream_chacha20_dolbeau_ssse3_implementation; diff --git a/src/libsodium/crypto_stream/chacha20/stream_chacha20.c b/src/libsodium/crypto_stream/chacha20/stream_chacha20.c index b9d30f18..a48d344c 100644 --- a/src/libsodium/crypto_stream/chacha20/stream_chacha20.c +++ b/src/libsodium/crypto_stream/chacha20/stream_chacha20.c @@ -7,8 +7,8 @@ defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H) # include "dolbeau/chacha20_dolbeau-avx2.h" #endif -#if (defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) && defined(__GNUC__)) -# include "vec/chacha20_vec.h" +#if defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) +# include "dolbeau/chacha20_dolbeau-ssse3.h" #endif static const crypto_stream_chacha20_implementation *implementation = @@ -105,9 +105,9 @@ _crypto_stream_chacha20_pick_best_implementation(void) return 0; } #endif -#if (defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) && defined(__GNUC__)) +#if defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) if (sodium_runtime_has_ssse3()) { - implementation = &crypto_stream_chacha20_vec_implementation; + implementation = &crypto_stream_chacha20_dolbeau_ssse3_implementation; return 0; } #endif diff --git a/src/libsodium/crypto_stream/chacha20/vec/chacha20_vec.c b/src/libsodium/crypto_stream/chacha20/vec/chacha20_vec.c deleted file mode 100644 index 52b6ce64..00000000 --- a/src/libsodium/crypto_stream/chacha20/vec/chacha20_vec.c +++ /dev/null @@ -1,330 +0,0 @@ - -#include -#include -#include - -#include "crypto_stream_chacha20.h" -#include "export.h" -#include "private/common.h" -#include "utils.h" - -#include "../stream_chacha20.h" -#include "chacha20_vec.h" - -#if (defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) && \ - defined(__GNUC__)) - -#pragma GCC target("sse2") -#pragma GCC target("ssse3") - -#define CHACHA_RNDS 20 - -typedef unsigned int vec __attribute__((vector_size(16))); - -#include -#include - -#if __clang__ -#define VBPI 4 -#else -#define VBPI 3 -#endif -#define ONE (vec) _mm_set_epi32(0, 0, 0, 1) -#define LOAD(m) (vec) _mm_loadu_si128((const __m128i *) (const void *) (m)) -#define LOAD_ALIGNED(m) \ - (vec) _mm_load_si128((const __m128i *) (const void *) (m)) -#define STORE(m, r) _mm_storeu_si128((__m128i *) (void *) (m), (__m128i)(r)) -#define ROTV1(x) (vec) _mm_shuffle_epi32((__m128i) x, _MM_SHUFFLE(0, 3, 2, 1)) -#define ROTV2(x) (vec) _mm_shuffle_epi32((__m128i) x, _MM_SHUFFLE(1, 0, 3, 2)) -#define ROTV3(x) (vec) _mm_shuffle_epi32((__m128i) x, _MM_SHUFFLE(2, 1, 0, 3)) -#define ROTW7(x) \ - (vec)(_mm_slli_epi32((__m128i) x, 7) ^ _mm_srli_epi32((__m128i) x, 25)) -#define ROTW12(x) \ - (vec)(_mm_slli_epi32((__m128i) x, 12) ^ _mm_srli_epi32((__m128i) x, 20)) -#define ROTW8(x) \ - (vec)(_mm_slli_epi32((__m128i) x, 8) ^ _mm_srli_epi32((__m128i) x, 24)) -#define ROTW16(x) \ - (vec)(_mm_slli_epi32((__m128i) x, 16) ^ _mm_srli_epi32((__m128i) x, 16)) - -#ifndef REVV_BE -#define REVV_BE(x) (x) -#endif - -#define BPI (VBPI + 0) /* Blocks computed per loop iteration */ - -#define DQROUND_VECTORS(a, b, c, d) \ - a += b; \ - d ^= a; \ - d = ROTW16(d); \ - c += d; \ - b ^= c; \ - b = ROTW12(b); \ - a += b; \ - d ^= a; \ - d = ROTW8(d); \ - c += d; \ - b ^= c; \ - b = ROTW7(b); \ - b = ROTV1(b); \ - c = ROTV2(c); \ - d = ROTV3(d); \ - a += b; \ - d ^= a; \ - d = ROTW16(d); \ - c += d; \ - b ^= c; \ - b = ROTW12(b); \ - a += b; \ - d ^= a; \ - d = ROTW8(d); \ - c += d; \ - b ^= c; \ - b = ROTW7(b); \ - b = ROTV3(b); \ - c = ROTV2(c); \ - d = ROTV1(d); - -#define WRITE_XOR(in, op, d, v0, v1, v2, v3) \ - STORE(op + d + 0, LOAD(in + d + 0) ^ REVV_BE(v0)); \ - STORE(op + d + 4, LOAD(in + d + 4) ^ REVV_BE(v1)); \ - STORE(op + d + 8, LOAD(in + d + 8) ^ REVV_BE(v2)); \ - STORE(op + d + 12, LOAD(in + d + 12) ^ REVV_BE(v3)); - -struct chacha_ctx { - vec s1; - vec s2; - vec s3; -}; - -typedef struct chacha_ctx chacha_ctx; - -static void -chacha_ivsetup(chacha_ctx *ctx, const uint8_t *iv, uint64_t ic) -{ - uint32_t iv_low; - uint32_t iv_high; - - memcpy(&iv_low, iv, 4); - memcpy(&iv_high, iv + 4, 4); - { - const vec s3 = { (uint32_t) ic, (uint32_t)(ic >> 32), iv_low, iv_high }; - ctx->s3 = s3; - } -} - -static void -chacha_ietf_ivsetup(chacha_ctx *ctx, const uint8_t *iv, uint32_t ic) -{ - const vec s3 = { ic, ((const uint32_t *) (const void *) iv)[0], - ((const uint32_t *) (const void *) iv)[1], - ((const uint32_t *) (const void *) iv)[2] }; - ctx->s3 = s3; -} - -static void -chacha_keysetup(chacha_ctx *ctx, const uint8_t *k) -{ - ctx->s1 = LOAD(k); - ctx->s2 = LOAD(k + 16); -} - -static void -chacha20_encrypt_bytes(chacha_ctx *ctx, const uint8_t *in, uint8_t *out, - unsigned long long inlen) -{ - CRYPTO_ALIGN(16) - unsigned chacha_const[] = { 0x61707865, 0x3320646E, 0x79622D32, - 0x6B206574 }; - uint32_t * op = (uint32_t *) (void *) out; - const uint32_t * ip = (const uint32_t *) (const void *) in; - vec s0, s1, s2, s3; - unsigned long long iters; - unsigned long long i; - - if (inlen > 64ULL * (1ULL << 32) - 64ULL) { - abort(); /* LCOV_EXCL_LINE */ - } - s0 = LOAD_ALIGNED(chacha_const); - s1 = ctx->s1; - s2 = ctx->s2; - s3 = ctx->s3; - - for (iters = 0; iters < inlen / (BPI * 64); iters++) { -#if VBPI > 2 - vec v8, v9, v10, v11; -#endif -#if VBPI > 3 - vec v12, v13, v14, v15; -#endif - vec v0, v1, v2, v3, v4, v5, v6, v7; - v4 = v0 = s0; - v5 = v1 = s1; - v6 = v2 = s2; - v3 = s3; - v7 = v3 + ONE; -#if VBPI > 2 - v8 = v4; - v9 = v5; - v10 = v6; - v11 = v7 + ONE; -#endif -#if VBPI > 3 - v12 = v8; - v13 = v9; - v14 = v10; - v15 = v11 + ONE; -#endif - for (i = CHACHA_RNDS / 2; i; i--) { - DQROUND_VECTORS(v0, v1, v2, v3) - DQROUND_VECTORS(v4, v5, v6, v7) -#if VBPI > 2 - DQROUND_VECTORS(v8, v9, v10, v11) -#endif -#if VBPI > 3 - DQROUND_VECTORS(v12, v13, v14, v15) -#endif - } - - WRITE_XOR(ip, op, 0, v0 + s0, v1 + s1, v2 + s2, v3 + s3) - s3 += ONE; - WRITE_XOR(ip, op, 16, v4 + s0, v5 + s1, v6 + s2, v7 + s3) - s3 += ONE; -#if VBPI > 2 - WRITE_XOR(ip, op, 32, v8 + s0, v9 + s1, v10 + s2, v11 + s3) - s3 += ONE; -#endif -#if VBPI > 3 - WRITE_XOR(ip, op, 48, v12 + s0, v13 + s1, v14 + s2, v15 + s3) - s3 += ONE; -#endif - ip += VBPI * 16; - op += VBPI * 16; - } - - for (iters = inlen % (BPI * 64) / 64; iters != 0; iters--) { - vec v0 = s0, v1 = s1, v2 = s2, v3 = s3; - for (i = CHACHA_RNDS / 2; i; i--) { - DQROUND_VECTORS(v0, v1, v2, v3); - } - WRITE_XOR(ip, op, 0, v0 + s0, v1 + s1, v2 + s2, v3 + s3) - s3 += ONE; - ip += 16; - op += 16; - } - - inlen = inlen % 64; - if (inlen) { - CRYPTO_ALIGN(16) vec buf[4]; - vec v0, v1, v2, v3; - v0 = s0; - v1 = s1; - v2 = s2; - v3 = s3; - for (i = CHACHA_RNDS / 2; i; i--) { - DQROUND_VECTORS(v0, v1, v2, v3); - } - if (inlen >= 16) { - STORE(op + 0, LOAD(ip + 0) ^ REVV_BE(v0 + s0)); - if (inlen >= 32) { - STORE(op + 4, LOAD(ip + 4) ^ REVV_BE(v1 + s1)); - if (inlen >= 48) { - STORE(op + 8, LOAD(ip + 8) ^ REVV_BE(v2 + s2)); - buf[3] = REVV_BE(v3 + s3); - } else { - buf[2] = REVV_BE(v2 + s2); - } - } else { - buf[1] = REVV_BE(v1 + s1); - } - } else { - buf[0] = REVV_BE(v0 + s0); - } - for (i = inlen & ~15ULL; i < inlen; i++) { - ((char *) op)[i] = ((const char *) ip)[i] ^ ((char *) buf)[i]; - } - } -} - -static int -stream_vec(unsigned char *c, unsigned long long clen, const unsigned char *n, - const unsigned char *k) -{ - struct chacha_ctx ctx; - - if (!clen) { - return 0; - } - COMPILER_ASSERT(crypto_stream_chacha20_KEYBYTES == 256 / 8); - chacha_keysetup(&ctx, k); - chacha_ivsetup(&ctx, n, 0ULL); - memset(c, 0, clen); - chacha20_encrypt_bytes(&ctx, c, c, clen); - sodium_memzero(&ctx, sizeof ctx); - - return 0; -} - -static int -stream_ietf_vec(unsigned char *c, unsigned long long clen, - const unsigned char *n, const unsigned char *k) -{ - struct chacha_ctx ctx; - - if (!clen) { - return 0; - } - COMPILER_ASSERT(crypto_stream_chacha20_KEYBYTES == 256 / 8); - chacha_keysetup(&ctx, k); - chacha_ietf_ivsetup(&ctx, n, 0ULL); - memset(c, 0, clen); - chacha20_encrypt_bytes(&ctx, c, c, clen); - sodium_memzero(&ctx, sizeof ctx); - - return 0; -} - -static int -stream_vec_xor_ic(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, uint64_t ic, - const unsigned char *k) -{ - struct chacha_ctx ctx; - - if (!mlen) { - return 0; - } - chacha_keysetup(&ctx, k); - chacha_ivsetup(&ctx, n, ic); - chacha20_encrypt_bytes(&ctx, m, c, mlen); - sodium_memzero(&ctx, sizeof ctx); - - return 0; -} - -static int -stream_ietf_vec_xor_ic(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, - uint32_t ic, const unsigned char *k) -{ - struct chacha_ctx ctx; - - if (!mlen) { - return 0; - } - chacha_keysetup(&ctx, k); - chacha_ietf_ivsetup(&ctx, n, ic); - chacha20_encrypt_bytes(&ctx, m, c, mlen); - sodium_memzero(&ctx, sizeof ctx); - - return 0; -} - -struct crypto_stream_chacha20_implementation - crypto_stream_chacha20_vec_implementation = { - SODIUM_C99(.stream =) stream_vec, - SODIUM_C99(.stream_ietf =) stream_ietf_vec, - SODIUM_C99(.stream_xor_ic =) stream_vec_xor_ic, - SODIUM_C99(.stream_ietf_xor_ic =) stream_ietf_vec_xor_ic - }; - -#endif