From e8e5d2fc18da56e7628c0a537697d33307efccaf Mon Sep 17 00:00:00 2001
From: Frank Denis
Date: Wed, 7 Oct 2015 23:07:19 +0200
Subject: [PATCH 01/25] Add crypto_aead_aes256gcm_aesni_*

Requires a CPU with aesni and pclmulqdq.

This is a private branch for a reason. It is not going to be merged as-is.
---
 configure.ac                                    |   7 +
 src/libsodium/Makefile.am                       |   1 +
 .../aes256gcm/aesni/aead_aes256gcm_aesni.c      | 444 ++++++++++++++++++
 src/libsodium/include/Makefile.am               |   1 +
 src/libsodium/include/sodium.h                  |   1 +
 .../sodium/crypto_aead_aes256gcm_aesni.h        |  44 ++
 6 files changed, 498 insertions(+)
 create mode 100644 src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c
 create mode 100644 src/libsodium/include/sodium/crypto_aead_aes256gcm_aesni.h

diff --git a/configure.ac b/configure.ac
index 4d85a4e2..14b9cb73 100644
--- a/configure.ac
+++ b/configure.ac
@@ -211,6 +211,13 @@ AX_CHECK_COMPILE_FLAG([-Wwrite-strings], [CFLAGS="$CFLAGS -Wwrite-strings"])
 AX_CHECK_COMPILE_FLAG([-Wdiv-by-zero], [CFLAGS="$CFLAGS -Wdiv-by-zero"])
 AX_CHECK_COMPILE_FLAG([-Wsometimes-uninitialized], [CFLAGS="$CFLAGS -Wsometimes-uninitialized"])
 
+AX_CHECK_COMPILE_FLAG([$CFLAGS -mmmx], [CFLAGS="$CFLAGS -mmmx"])
+AX_CHECK_COMPILE_FLAG([$CFLAGS -msse], [CFLAGS="$CFLAGS -msse"])
+AX_CHECK_COMPILE_FLAG([$CFLAGS -msse2], [CFLAGS="$CFLAGS -msse2"])
+AX_CHECK_COMPILE_FLAG([$CFLAGS -msse3], [CFLAGS="$CFLAGS -msse3"])
+AX_CHECK_COMPILE_FLAG([$CFLAGS -maes], [CFLAGS="$CFLAGS -maes"])
+AX_CHECK_COMPILE_FLAG([$CFLAGS -mpclmul], [CFLAGS="$CFLAGS -mpclmul"])
+
 AC_ARG_VAR([CWFLAGS], [define to compilation flags for generating extra warnings])
 
 AX_CHECK_COMPILE_FLAG([$CWFLAGS -Wall], [CWFLAGS="$CWFLAGS -Wall"])

diff --git a/src/libsodium/Makefile.am b/src/libsodium/Makefile.am
index 55c0dffd..c7748234 100644
--- a/src/libsodium/Makefile.am
+++ b/src/libsodium/Makefile.am
@@ -2,6 +2,7 @@ lib_LTLIBRARIES = \
 	libsodium.la
 
 libsodium_la_SOURCES = \
+	crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c \
 	crypto_aead/chacha20poly1305/sodium/aead_chacha20poly1305.c \
 	crypto_auth/crypto_auth.c \
 	crypto_auth/hmacsha256/auth_hmacsha256_api.c \

diff --git a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c
new file mode 100644
index 00000000..a5fab086
--- /dev/null
+++ b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c
@@ -0,0 +1,444 @@
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <immintrin.h>
+
+#include "crypto_aead_aes256gcm_aesni.h"
+#include "utils.h"
+
+#define AES_BLOCKSIZE 16
+#define AES_MAXROUNDS 14
+#define GMAC_BLOCKSIZE 16
+
+#if defined(_MSC_VER)
+# define CRYPTO_ALIGN(x) __declspec(align(x))
+#else
+# define CRYPTO_ALIGN(x) __attribute__((aligned(x)))
+#endif
+
+typedef CRYPTO_ALIGN(128) struct ghash {
+    unsigned char initial_state[GMAC_BLOCKSIZE];
+    unsigned char state[GMAC_BLOCKSIZE];
+    unsigned char subkey[GMAC_BLOCKSIZE];
+} ghash;
+
+typedef CRYPTO_ALIGN(128) struct context {
+    __m128i ekey[AES_MAXROUNDS + 1];
+    ghash   ghash;
+} context;
+
+static inline void
+_u64_be_from_ull(unsigned char out[8U], unsigned long long x)
+{
+    out[7] = (unsigned char) (x & 0xff); x >>= 8;
+    out[6] = (unsigned char) (x & 0xff); x >>= 8;
+    out[5] = (unsigned char) (x & 0xff); x >>= 8;
+    out[4] = (unsigned char) (x & 0xff); x >>= 8;
+    out[3] = (unsigned char) (x & 0xff); x >>= 8;
+    out[2] = (unsigned char) (x & 0xff); x >>= 8;
+    out[1] = (unsigned char) (x & 0xff); x >>= 8;
+    out[0] = (unsigned char) (x & 0xff);
+}
+
+#define KEY_EXPANSION_A \
+    tmp1 = _mm_shuffle_epi32(tmp1, 255);
\ + tmp3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), \ + _mm_castsi128_ps(tmp0), 16)); \ + tmp0 = _mm_xor_si128(tmp0, tmp3); \ + tmp3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), \ + _mm_castsi128_ps(tmp0), 140)); \ + tmp0 = _mm_xor_si128(_mm_xor_si128(tmp0, tmp3), tmp1); \ + _mm_store_si128((void *) key_ptr, tmp0); \ + key_ptr++ + +#define KEY_EXPANSION_B \ + tmp1 = _mm_shuffle_epi32(tmp1, 170); \ + tmp3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), \ + _mm_castsi128_ps(tmp2), 16)); \ + tmp2 = _mm_xor_si128(tmp2, tmp3); \ + tmp3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), \ + _mm_castsi128_ps(tmp2), 140)); \ + tmp2 = _mm_xor_si128(_mm_xor_si128(tmp2, tmp3), tmp1); \ + _mm_store_si128((void *) key_ptr, tmp2); \ + key_ptr++ + +#define KEY_EXPANSION_BLOCK_1(rcon) \ + tmp1 = _mm_aeskeygenassist_si128(tmp0, (rcon)); KEY_EXPANSION_A + +#define KEY_EXPANSION_BLOCK_2(rcon) \ + tmp1 = _mm_aeskeygenassist_si128(tmp2, rcon); KEY_EXPANSION_A; \ + tmp1 = _mm_aeskeygenassist_si128(tmp0, rcon); KEY_EXPANSION_B + +#define KEY_EXPANSION_LAST(rcon) \ + tmp1 = _mm_aeskeygenassist_si128(tmp2, 64); KEY_EXPANSION_A; + +static void +_key_setup(context * const ctx, const unsigned char *key) +{ + __m128i *key_ptr = &ctx->ekey[0]; + __m128i tmp0 = _mm_loadu_si128((const void *) key); + __m128i tmp1, tmp2; + __m128i tmp3 = _mm_setzero_si128(); + + *key_ptr++ = tmp0; + switch (crypto_aead_aes256gcm_KEYBYTES) { + case 16: + KEY_EXPANSION_BLOCK_1(1); + KEY_EXPANSION_BLOCK_1(2); + KEY_EXPANSION_BLOCK_1(4); + KEY_EXPANSION_BLOCK_1(8); + KEY_EXPANSION_BLOCK_1(16); + KEY_EXPANSION_BLOCK_1(32); + KEY_EXPANSION_BLOCK_1(64); + KEY_EXPANSION_BLOCK_1(128); + KEY_EXPANSION_BLOCK_1(27); + KEY_EXPANSION_BLOCK_1(54); + break; + case 32: + tmp2 = _mm_loadu_si128((const void *) (key + 16)); + _mm_store_si128((void *) key_ptr, tmp2); + key_ptr++; + KEY_EXPANSION_BLOCK_2(1); + KEY_EXPANSION_BLOCK_2(2); + KEY_EXPANSION_BLOCK_2(4); + KEY_EXPANSION_BLOCK_2(8); + KEY_EXPANSION_BLOCK_2(16); + KEY_EXPANSION_BLOCK_2(32); + KEY_EXPANSION_LAST(64); + break; + default: + abort(); + } +} + +#define AESNI_INC \ + ctr = _mm_add_epi64(ctr, inc); \ + ctr_low++; \ + if (ctr_low == 0U) { \ + inc = _mm_slli_si128(inc, 8); \ + ctr = _mm_add_epi64(ctr, inc); \ + inc = _mm_srli_si128(inc, 8); \ + } \ + iv = ctr; \ + iv = _mm_shuffle_epi8(iv, mask) + +static const unsigned char +swap_mask[] = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; + +static void +_aes_ctr(context *ctx, unsigned char *dst, const unsigned char *src, size_t len, + unsigned char *ivc_block) +{ + __m128i *key_ptr; + __m128i ctr; + __m128i in1, in2, in3, in4; + __m128i iv; + __m128i key; + __m128i mask; + __m128i state1, state2, state3, state4; + + iv = ctr = _mm_loadu_si128((void *) ivc_block); + iv = _mm_slli_si128(iv, 8); + mask = _mm_loadu_si128((const void *) swap_mask); + ctr = _mm_shuffle_epi8(ctr, mask); + uint32_t ctr_low = (uint32_t) 1; + __m128i inc = _mm_cvtsi32_si128((int32_t) ctr_low); + ctr_low = (uint32_t) _mm_cvtsi128_si32(ctr); + +#define ENC4(offset) \ + key = _mm_loadu_si128((void *) (key_ptr + (offset))); \ + state1 = _mm_aesenc_si128(state1, key); \ + state2 = _mm_aesenc_si128(state2, key); \ + state3 = _mm_aesenc_si128(state3, key); \ + state4 = _mm_aesenc_si128(state4, key) + +#define ENC4_LAST(offset) \ + key = _mm_loadu_si128((void *) (key_ptr + (offset))); \ + state1 = _mm_aesenclast_si128(state1, key); \ + state2 = _mm_aesenclast_si128(state2, key); \ + state3 = _mm_aesenclast_si128(state3, key); 
\ + state4 = _mm_aesenclast_si128(state4, key) + + while (len >= 64) { + AESNI_INC; + state1 = iv; + in1 = _mm_loadu_si128((const void *) src); + AESNI_INC; + state2 = iv; + in2 = _mm_loadu_si128((const void *) (src + 16)); + AESNI_INC; + state3 = iv; + in3 = _mm_loadu_si128((const void *) (src + 32)); + AESNI_INC; + state4 = iv; + in4 = _mm_loadu_si128((const void *) (src + 48)); + key = _mm_loadu_si128((void *) &ctx->ekey[0]); + key_ptr = &ctx->ekey[0]; + state1 = _mm_xor_si128(state1, key); + state2 = _mm_xor_si128(state2, key); + state3 = _mm_xor_si128(state3, key); + state4 = _mm_xor_si128(state4, key); + key_ptr += 3; + switch (crypto_aead_aes256gcm_KEYBYTES) { + case 32: + key_ptr += 4; + ENC4(-6); + ENC4(-5); + case 16: + ENC4(-4); + ENC4(-3); + ENC4(-2); + ENC4(-1); + ENC4(0); + ENC4(1); + ENC4(2); + ENC4(3); + ENC4(4); + ENC4(5); + ENC4(6); + ENC4_LAST(7); + break; + default: + abort(); + } + state1 = _mm_xor_si128(state1, in1); + _mm_storeu_si128((void *) dst, state1); + state2 = _mm_xor_si128(state2, in2); + _mm_storeu_si128((void *) (dst + 16), state2); + state3 = _mm_xor_si128(state3, in3); + _mm_storeu_si128((void *) (dst + 32), state3); + state4 = _mm_xor_si128(state4, in4); + _mm_storeu_si128((void *) (dst + 48), state4); + len -= 64; + src += 64; + dst += 64; + } + +#define ENC(offset) \ + key = _mm_loadu_si128((void *) (key_ptr + (offset))); \ + state1 = _mm_aesenc_si128(state1, key) + +#define ENC_LAST(offset) \ + key = _mm_loadu_si128((void *) (key_ptr + (offset))); \ + state1 = _mm_aesenclast_si128(state1, key); + +#define ENC_ONE \ + key_ptr = &ctx->ekey[0]; \ + key = _mm_loadu_si128((void *) &ctx->ekey[0]); \ + state1 = _mm_xor_si128(state1, key); \ + key_ptr += 3; \ + switch (crypto_aead_aes256gcm_KEYBYTES) { \ + case 32: \ + key_ptr += 4; \ + ENC(-6); \ + ENC(-5); \ + ENC(-4); \ + ENC(-3); \ + case 16: \ + ENC(-2); \ + ENC(-1); \ + ENC(0); \ + ENC(1); \ + ENC(2); \ + ENC(3); \ + ENC(4); \ + ENC(5); \ + ENC(6); \ + ENC_LAST(7); \ + break; \ + default: \ + abort(); \ + } + while (len >= 16) { + AESNI_INC; + state1 = iv; + in1 = _mm_loadu_si128((const void *) src); + ENC_ONE; + state1 = _mm_xor_si128(state1, in1); + _mm_storeu_si128((void *) dst, state1); + len -= 16; + src += 16; + dst += 16; + } + if (len > 0) { + unsigned char padded[16]; + memset(padded, 0, sizeof padded); + memcpy(padded, src, len); + src = padded; + AESNI_INC; + state1 = iv; + in1 = _mm_loadu_si128((const void *) src); + ENC_ONE; + state1 = _mm_xor_si128(state1, in1); + _mm_storeu_si128((void *) padded, state1); + memcpy(dst, padded, len); + } + _mm_storel_epi64((void *) ivc_block, iv); +} + +static void +_aes_enc_one(context *ctx, unsigned char *dst, unsigned char *src) +{ + __m128i *key_ptr; + __m128i key; + __m128i state1 = _mm_loadu_si128((const void *) src); + + ENC_ONE; + _mm_storeu_si128((void *) dst, state1); +} + +#define GMAC_UPDATE \ + tmp2 = _mm_loadu_si128((const void *) src); \ + tmp2 = _mm_shuffle_epi8(tmp2, mask); \ + tmp0 = _mm_xor_si128(tmp6, tmp2); \ + \ + tmp4 = _mm_xor_si128(_mm_clmulepi64_si128(tmp0, tmp1, 0x10), \ + _mm_clmulepi64_si128(tmp0, tmp1, 0x01)); \ + tmp5 = _mm_slli_si128(tmp4, 8); \ + tmp3 = _mm_xor_si128(_mm_clmulepi64_si128(tmp0, tmp1, 0x00), tmp5); \ + tmp6 = _mm_xor_si128(_mm_clmulepi64_si128(tmp0, tmp1, 0x11), \ + _mm_srli_si128(tmp4, 8)); \ + tmp7 = _mm_srli_epi32(tmp3, 31); \ + tmp8 = _mm_srli_epi32(tmp6, 31); \ + tmp3 = _mm_slli_epi32(tmp3, 1); \ + tmp6 = _mm_slli_epi32(tmp6, 1); \ + tmp8 = _mm_slli_si128(tmp8, 4); \ + tmp9 = _mm_srli_si128(tmp7, 12); \ 
+ tmp7 = _mm_slli_si128(tmp7, 4); \ + tmp3 = _mm_or_si128(tmp3, tmp7); \ + tmp6 = _mm_or_si128(_mm_or_si128(tmp6, tmp8), tmp9); \ + \ + tmp8 = _mm_slli_epi32(tmp3, 30); \ + tmp9 = _mm_slli_epi32(tmp3, 25); \ + tmp7 = _mm_xor_si128(_mm_xor_si128(_mm_slli_epi32(tmp3, 31), tmp8), tmp9); \ + tmp3 = _mm_xor_si128(tmp3, _mm_slli_si128(tmp7, 12)); \ + \ + tmp4 = _mm_srli_epi32(tmp3, 2); \ + tmp5 = _mm_srli_epi32(tmp3, 7); \ + tmp6 = _mm_xor_si128(tmp6, _mm_xor_si128(tmp3, \ + _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_srli_epi32(tmp3, 1), tmp4), tmp5), \ + _mm_srli_si128(tmp7, 4)))) + +static void +_gmac_update(ghash *ghash, const unsigned char *src, size_t len) +{ + __m128i mask = _mm_loadu_si128((const void *) swap_mask); + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9; + + tmp1 = _mm_shuffle_epi8(_mm_loadu_si128((void *) ghash->subkey), mask); + tmp6 = _mm_shuffle_epi8(_mm_loadu_si128((void *) ghash->initial_state), mask); + while (len >= 16) { + GMAC_UPDATE; + len -= 16; + src += 16; + } + if (len > 0) { + unsigned char padded[16]; + memset(padded, 0, sizeof padded); + memcpy(padded, src, len); + src = padded; + GMAC_UPDATE; + } + tmp6 = _mm_shuffle_epi8(tmp6, mask); + _mm_storeu_si128((void *) ghash->state, tmp6); + _mm_storeu_si128((void *) ghash->initial_state, tmp6); +} + +static void +_gmac_final(context *ctx, unsigned char *tag, unsigned char *ivc_block, unsigned char *hashstate) +{ + __m128i *key_ptr; + __m128i in; + __m128i key; + __m128i state1 = _mm_loadu_si128((const void *) ivc_block); + + ENC_ONE; + in = _mm_loadu_si128((void *) hashstate); + state1 = _mm_xor_si128(state1, in); + _mm_storeu_si128((void *) tag, state1); +} + +int +crypto_aead_aes256gcm_aesni_encrypt(unsigned char *c, + unsigned long long *clen_p, + const unsigned char *m, + unsigned long long mlen, + const unsigned char *ad, + unsigned long long adlen, + const unsigned char *nsec, + const unsigned char *npub, + const unsigned char *k) +{ + context ctx; + unsigned char *mac; + unsigned char ivc_block[AES_BLOCKSIZE]; + + (void) nsec; + memset(&ctx, 0, sizeof ctx); + memset(ivc_block, 0, sizeof ivc_block); + memcpy(ivc_block, npub, crypto_aead_aes256gcm_NPUBBYTES); + ivc_block[AES_BLOCKSIZE - 1U] = 1U; + _key_setup(&ctx, k); + _aes_enc_one(&ctx, ctx.ghash.subkey, ctx.ghash.subkey); + _gmac_update(&ctx.ghash, ad, adlen); + _aes_ctr(&ctx, c, m, mlen, ivc_block); + _gmac_update(&ctx.ghash, c, mlen); + mac = c + mlen; + _u64_be_from_ull(mac, adlen * 8ULL); + _u64_be_from_ull(mac + 8U, mlen * 8ULL); + _gmac_update(&ctx.ghash, mac, GMAC_BLOCKSIZE); + _gmac_final(&ctx, mac, ivc_block, ctx.ghash.state); + sodium_memzero(&ctx, sizeof ctx); + if (clen_p != NULL) { + *clen_p = mlen + crypto_aead_aes256gcm_ABYTES; + } + return 0; +} + +int crypto_aead_aes256gcm_aesni_decrypt(unsigned char *m, + unsigned long long *mlen_p, + unsigned char *nsec, + const unsigned char *c, + unsigned long long clen, + const unsigned char *ad, + unsigned long long adlen, + const unsigned char *npub, + const unsigned char *k) +{ + context ctx; + unsigned char mac[GMAC_BLOCKSIZE]; + unsigned char ivc_block[AES_BLOCKSIZE]; + size_t mlen; + + (void) nsec; + if (mlen_p != NULL) { + *mlen_p = 0; + } + if (clen < crypto_aead_aes256gcm_ABYTES) { + return -1; + } + mlen = clen - crypto_aead_aes256gcm_ABYTES; + memset(&ctx, 0, sizeof ctx); + memset(ivc_block, 0, sizeof ivc_block); + memcpy(ivc_block, npub, crypto_aead_aes256gcm_NPUBBYTES); + ivc_block[AES_BLOCKSIZE - 1U] = 1U; + _key_setup(&ctx, k); + _aes_enc_one(&ctx, 
ctx.ghash.subkey, ctx.ghash.subkey); + _gmac_update(&ctx.ghash, ad, adlen); + _gmac_update(&ctx.ghash, c, mlen); + _u64_be_from_ull(mac, adlen * 8ULL); + _u64_be_from_ull(mac + 8U, mlen * 8ULL); + _gmac_update(&ctx.ghash, mac, GMAC_BLOCKSIZE); + _gmac_final(&ctx, mac, ivc_block, ctx.ghash.state); + if (sodium_memcmp(c + mlen, mac, crypto_aead_aes256gcm_ABYTES) != 0) { + sodium_memzero(&ctx, sizeof ctx); + return -1; + } + _aes_ctr(&ctx, m, c, mlen, ivc_block); + sodium_memzero(&ctx, sizeof ctx); + if (mlen_p != NULL) { + *mlen_p = mlen; + } + return 0; +} diff --git a/src/libsodium/include/Makefile.am b/src/libsodium/include/Makefile.am index 77e01eac..854e35fa 100644 --- a/src/libsodium/include/Makefile.am +++ b/src/libsodium/include/Makefile.am @@ -2,6 +2,7 @@ SODIUM_EXPORT = \ sodium.h \ sodium/core.h \ + sodium/crypto_aead_aes256gcm_aesni.h \ sodium/crypto_aead_chacha20poly1305.h \ sodium/crypto_auth.h \ sodium/crypto_auth_hmacsha256.h \ diff --git a/src/libsodium/include/sodium.h b/src/libsodium/include/sodium.h index 97280f93..b615cad8 100644 --- a/src/libsodium/include/sodium.h +++ b/src/libsodium/include/sodium.h @@ -3,6 +3,7 @@ #define sodium_H #include "sodium/core.h" +#include "sodium/crypto_aead_aes256gcm_aesni.h" #include "sodium/crypto_aead_chacha20poly1305.h" #include "sodium/crypto_auth.h" #include "sodium/crypto_auth_hmacsha256.h" diff --git a/src/libsodium/include/sodium/crypto_aead_aes256gcm_aesni.h b/src/libsodium/include/sodium/crypto_aead_aes256gcm_aesni.h new file mode 100644 index 00000000..b5a49cb3 --- /dev/null +++ b/src/libsodium/include/sodium/crypto_aead_aes256gcm_aesni.h @@ -0,0 +1,44 @@ +#ifndef crypto_aead_aes256gcm_aesni_H +#define crypto_aead_aes256gcm_aesni_H + +#include +#include "export.h" + +#ifdef __cplusplus +# if __GNUC__ +# pragma GCC diagnostic ignored "-Wlong-long" +# endif +extern "C" { +#endif + +#define crypto_aead_aes256gcm_KEYBYTES 32U +#define crypto_aead_aes256gcm_NSECBYTES 0U +#define crypto_aead_aes256gcm_NPUBBYTES 12U +#define crypto_aead_aes256gcm_ABYTES 16U + +SODIUM_EXPORT +int crypto_aead_aes256gcm_aesni_encrypt(unsigned char *c, + unsigned long long *clen_p, + const unsigned char *m, + unsigned long long mlen, + const unsigned char *ad, + unsigned long long adlen, + const unsigned char *nsec, + const unsigned char *npub, + const unsigned char *k); + +SODIUM_EXPORT +int crypto_aead_aes256gcm_aesni_decrypt(unsigned char *m, + unsigned long long *mlen_p, + unsigned char *nsec, + const unsigned char *c, + unsigned long long clen, + const unsigned char *ad, + unsigned long long adlen, + const unsigned char *npub, + const unsigned char *k); +#ifdef __cplusplus +} +#endif + +#endif From 571bfc99c8ca2ec676604330a8baec5673de39bf Mon Sep 17 00:00:00 2001 From: Frank Denis Date: Wed, 7 Oct 2015 23:26:13 +0200 Subject: [PATCH 02/25] Check for ssse3 presence --- configure.ac | 1 + 1 file changed, 1 insertion(+) diff --git a/configure.ac b/configure.ac index 14b9cb73..78da398c 100644 --- a/configure.ac +++ b/configure.ac @@ -215,6 +215,7 @@ AX_CHECK_COMPILE_FLAG([$CFLAGS -mmmx], [CFLAGS="$CFLAGS -mmmx"]) AX_CHECK_COMPILE_FLAG([$CFLAGS -msse], [CFLAGS="$CFLAGS -msse"]) AX_CHECK_COMPILE_FLAG([$CFLAGS -msse2], [CFLAGS="$CFLAGS -msse2"]) AX_CHECK_COMPILE_FLAG([$CFLAGS -msse3], [CFLAGS="$CFLAGS -msse3"]) +AX_CHECK_COMPILE_FLAG([$CFLAGS -mssse3], [CFLAGS="$CFLAGS -mssse3"]) AX_CHECK_COMPILE_FLAG([$CFLAGS -maes], [CFLAGS="$CFLAGS -maes"]) AX_CHECK_COMPILE_FLAG([$CFLAGS -mpclmul], [CFLAGS="$CFLAGS -mpclmul"]) From 
396e16880d3940ea6564169ed13a52de899875b2 Mon Sep 17 00:00:00 2001 From: Frank Denis Date: Fri, 9 Oct 2015 08:50:49 +0200 Subject: [PATCH 03/25] Move CRYPTO_ALIGN to sodium/export.h --- src/libsodium/include/sodium/crypto_generichash_blake2b.h | 6 ------ src/libsodium/include/sodium/export.h | 8 ++++++++ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/libsodium/include/sodium/crypto_generichash_blake2b.h b/src/libsodium/include/sodium/crypto_generichash_blake2b.h index cddfdaa6..a88a59ec 100644 --- a/src/libsodium/include/sodium/crypto_generichash_blake2b.h +++ b/src/libsodium/include/sodium/crypto_generichash_blake2b.h @@ -7,12 +7,6 @@ #include "export.h" -#if defined(_MSC_VER) -# define CRYPTO_ALIGN(x) __declspec(align(x)) -#else -# define CRYPTO_ALIGN(x) __attribute__((aligned(x))) -#endif - #ifdef __cplusplus # if __GNUC__ # pragma GCC diagnostic ignored "-Wlong-long" diff --git a/src/libsodium/include/sodium/export.h b/src/libsodium/include/sodium/export.h index 53fcd7b5..aeb6d0e2 100644 --- a/src/libsodium/include/sodium/export.h +++ b/src/libsodium/include/sodium/export.h @@ -29,4 +29,12 @@ # endif #endif +#ifndef CRYPTO_ALIGN +# if defined(_MSC_VER) +# define CRYPTO_ALIGN(x) __declspec(align(x)) +# else +# define CRYPTO_ALIGN(x) __attribute__((aligned(x))) +# endif +#endif + #endif From 96d4494f2fc381f8537e10a6ea6ab9dbc83b10c3 Mon Sep 17 00:00:00 2001 From: Frank Denis Date: Fri, 9 Oct 2015 09:25:01 +0200 Subject: [PATCH 04/25] Add crypto_aead_aes256gcm_aesni_{beforenm|*_afternm} --- .../aes256gcm/aesni/aead_aes256gcm_aesni.c | 204 +++++++++++------- .../sodium/crypto_aead_aes256gcm_aesni.h | 28 +++ 2 files changed, 149 insertions(+), 83 deletions(-) diff --git a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c index a5fab086..e9f38187 100644 --- a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c +++ b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c @@ -11,21 +11,11 @@ #define AES_MAXROUNDS 14 #define GMAC_BLOCKSIZE 16 -#if defined(_MSC_VER) -# define CRYPTO_ALIGN(x) __declspec(align(x)) -#else -# define CRYPTO_ALIGN(x) __attribute__((aligned(x))) -#endif - -typedef CRYPTO_ALIGN(128) struct ghash { +typedef CRYPTO_ALIGN(128) struct context { + __m128i ekey[AES_MAXROUNDS + 1]; unsigned char initial_state[GMAC_BLOCKSIZE]; unsigned char state[GMAC_BLOCKSIZE]; unsigned char subkey[GMAC_BLOCKSIZE]; -} ghash; - -typedef CRYPTO_ALIGN(128) struct context { - __m128i ekey[AES_MAXROUNDS + 1]; - ghash ghash; } context; static inline void @@ -321,13 +311,13 @@ _aes_enc_one(context *ctx, unsigned char *dst, unsigned char *src) _mm_srli_si128(tmp7, 4)))) static void -_gmac_update(ghash *ghash, const unsigned char *src, size_t len) +_gmac_update(context *ctx, const unsigned char *src, size_t len) { __m128i mask = _mm_loadu_si128((const void *) swap_mask); __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9; - tmp1 = _mm_shuffle_epi8(_mm_loadu_si128((void *) ghash->subkey), mask); - tmp6 = _mm_shuffle_epi8(_mm_loadu_si128((void *) ghash->initial_state), mask); + tmp1 = _mm_shuffle_epi8(_mm_loadu_si128((void *) ctx->subkey), mask); + tmp6 = _mm_shuffle_epi8(_mm_loadu_si128((void *) ctx->initial_state), mask); while (len >= 16) { GMAC_UPDATE; len -= 16; @@ -341,8 +331,8 @@ _gmac_update(ghash *ghash, const unsigned char *src, size_t len) GMAC_UPDATE; } tmp6 = _mm_shuffle_epi8(tmp6, mask); - _mm_storeu_si128((void *) ghash->state, tmp6); - 
_mm_storeu_si128((void *) ghash->initial_state, tmp6); + _mm_storeu_si128((void *) ctx->state, tmp6); + _mm_storeu_si128((void *) ctx->initial_state, tmp6); } static void @@ -359,6 +349,100 @@ _gmac_final(context *ctx, unsigned char *tag, unsigned char *ivc_block, unsigned _mm_storeu_si128((void *) tag, state1); } +int +crypto_aead_aes256gcm_aesni_beforenm(crypto_aead_aes256gcm_aesni_state *ctx_, + const unsigned char *k) +{ + context *ctx = (context *) ctx_; + + (void) sizeof(int[(sizeof *ctx_) >= (sizeof *ctx) ? 1 : -1]); + memset(ctx, 0, sizeof *ctx); + _key_setup(ctx, k); + _aes_enc_one(ctx, ctx->subkey, ctx->subkey); + + return 0; +} + +int +crypto_aead_aes256gcm_aesni_encrypt_afternm(unsigned char *c, + unsigned long long *clen_p, + const unsigned char *m, + unsigned long long mlen, + const unsigned char *ad, + unsigned long long adlen, + const unsigned char *nsec, + const unsigned char *npub, + crypto_aead_aes256gcm_aesni_state *ctx_) +{ + context *ctx = (context *) ctx_; + unsigned char *mac; + unsigned char ivc_block[AES_BLOCKSIZE]; + + (void) nsec; + memset(ivc_block, 0, sizeof ivc_block); + memcpy(ivc_block, npub, crypto_aead_aes256gcm_NPUBBYTES); + ivc_block[AES_BLOCKSIZE - 1U] = 1U; + _gmac_update(ctx, ad, adlen); + _aes_ctr(ctx, c, m, mlen, ivc_block); + _gmac_update(ctx, c, mlen); + mac = c + mlen; + _u64_be_from_ull(mac, adlen * 8ULL); + _u64_be_from_ull(mac + 8U, mlen * 8ULL); + _gmac_update(ctx, mac, GMAC_BLOCKSIZE); + _gmac_final(ctx, mac, ivc_block, ctx->state); + sodium_memzero(ctx, sizeof *ctx); + if (clen_p != NULL) { + *clen_p = mlen + crypto_aead_aes256gcm_ABYTES; + } + return 0; +} + +int +crypto_aead_aes256gcm_aesni_decrypt_afternm(unsigned char *m, + unsigned long long *mlen_p, + unsigned char *nsec, + const unsigned char *c, + unsigned long long clen, + const unsigned char *ad, + unsigned long long adlen, + const unsigned char *npub, + crypto_aead_aes256gcm_aesni_state *ctx_) +{ + context *ctx = (context *) ctx_; + unsigned char mac[GMAC_BLOCKSIZE]; + unsigned char ivc_block[AES_BLOCKSIZE]; + size_t mlen; + + (void) nsec; + if (mlen_p != NULL) { + *mlen_p = 0; + } + if (clen < crypto_aead_aes256gcm_ABYTES) { + return -1; + } + mlen = clen - crypto_aead_aes256gcm_ABYTES; + memset(ivc_block, 0, sizeof ivc_block); + memcpy(ivc_block, npub, crypto_aead_aes256gcm_NPUBBYTES); + ivc_block[AES_BLOCKSIZE - 1U] = 1U; + + _gmac_update(ctx, ad, adlen); + _gmac_update(ctx, c, mlen); + _u64_be_from_ull(mac, adlen * 8ULL); + _u64_be_from_ull(mac + 8U, mlen * 8ULL); + _gmac_update(ctx, mac, GMAC_BLOCKSIZE); + _gmac_final(ctx, mac, ivc_block, ctx->state); + if (sodium_memcmp(c + mlen, mac, crypto_aead_aes256gcm_ABYTES) != 0) { + sodium_memzero(ctx, sizeof *ctx); + return -1; + } + _aes_ctr(ctx, m, c, mlen, ivc_block); + sodium_memzero(ctx, sizeof *ctx); + if (mlen_p != NULL) { + *mlen_p = mlen; + } + return 0; +} + int crypto_aead_aes256gcm_aesni_encrypt(unsigned char *c, unsigned long long *clen_p, @@ -370,75 +454,29 @@ crypto_aead_aes256gcm_aesni_encrypt(unsigned char *c, const unsigned char *npub, const unsigned char *k) { - context ctx; - unsigned char *mac; - unsigned char ivc_block[AES_BLOCKSIZE]; + crypto_aead_aes256gcm_aesni_state ctx; - (void) nsec; - memset(&ctx, 0, sizeof ctx); - memset(ivc_block, 0, sizeof ivc_block); - memcpy(ivc_block, npub, crypto_aead_aes256gcm_NPUBBYTES); - ivc_block[AES_BLOCKSIZE - 1U] = 1U; - _key_setup(&ctx, k); - _aes_enc_one(&ctx, ctx.ghash.subkey, ctx.ghash.subkey); - _gmac_update(&ctx.ghash, ad, adlen); - _aes_ctr(&ctx, c, m, mlen, 
ivc_block); - _gmac_update(&ctx.ghash, c, mlen); - mac = c + mlen; - _u64_be_from_ull(mac, adlen * 8ULL); - _u64_be_from_ull(mac + 8U, mlen * 8ULL); - _gmac_update(&ctx.ghash, mac, GMAC_BLOCKSIZE); - _gmac_final(&ctx, mac, ivc_block, ctx.ghash.state); - sodium_memzero(&ctx, sizeof ctx); - if (clen_p != NULL) { - *clen_p = mlen + crypto_aead_aes256gcm_ABYTES; - } - return 0; + crypto_aead_aes256gcm_aesni_beforenm(&ctx, k); + + return crypto_aead_aes256gcm_aesni_encrypt_afternm + (c, clen_p, m, mlen, ad, adlen, nsec, npub, &ctx); } -int crypto_aead_aes256gcm_aesni_decrypt(unsigned char *m, - unsigned long long *mlen_p, - unsigned char *nsec, - const unsigned char *c, - unsigned long long clen, - const unsigned char *ad, - unsigned long long adlen, - const unsigned char *npub, - const unsigned char *k) +int +crypto_aead_aes256gcm_aesni_decrypt(unsigned char *m, + unsigned long long *mlen_p, + unsigned char *nsec, + const unsigned char *c, + unsigned long long clen, + const unsigned char *ad, + unsigned long long adlen, + const unsigned char *npub, + const unsigned char *k) { - context ctx; - unsigned char mac[GMAC_BLOCKSIZE]; - unsigned char ivc_block[AES_BLOCKSIZE]; - size_t mlen; + crypto_aead_aes256gcm_aesni_state ctx; - (void) nsec; - if (mlen_p != NULL) { - *mlen_p = 0; - } - if (clen < crypto_aead_aes256gcm_ABYTES) { - return -1; - } - mlen = clen - crypto_aead_aes256gcm_ABYTES; - memset(&ctx, 0, sizeof ctx); - memset(ivc_block, 0, sizeof ivc_block); - memcpy(ivc_block, npub, crypto_aead_aes256gcm_NPUBBYTES); - ivc_block[AES_BLOCKSIZE - 1U] = 1U; - _key_setup(&ctx, k); - _aes_enc_one(&ctx, ctx.ghash.subkey, ctx.ghash.subkey); - _gmac_update(&ctx.ghash, ad, adlen); - _gmac_update(&ctx.ghash, c, mlen); - _u64_be_from_ull(mac, adlen * 8ULL); - _u64_be_from_ull(mac + 8U, mlen * 8ULL); - _gmac_update(&ctx.ghash, mac, GMAC_BLOCKSIZE); - _gmac_final(&ctx, mac, ivc_block, ctx.ghash.state); - if (sodium_memcmp(c + mlen, mac, crypto_aead_aes256gcm_ABYTES) != 0) { - sodium_memzero(&ctx, sizeof ctx); - return -1; - } - _aes_ctr(&ctx, m, c, mlen, ivc_block); - sodium_memzero(&ctx, sizeof ctx); - if (mlen_p != NULL) { - *mlen_p = mlen; - } - return 0; + crypto_aead_aes256gcm_aesni_beforenm(&ctx, k); + + return crypto_aead_aes256gcm_aesni_decrypt_afternm + (m, mlen_p, nsec, c, clen, ad, adlen, npub, &ctx); } diff --git a/src/libsodium/include/sodium/crypto_aead_aes256gcm_aesni.h b/src/libsodium/include/sodium/crypto_aead_aes256gcm_aesni.h index b5a49cb3..fca611d9 100644 --- a/src/libsodium/include/sodium/crypto_aead_aes256gcm_aesni.h +++ b/src/libsodium/include/sodium/crypto_aead_aes256gcm_aesni.h @@ -16,6 +16,8 @@ extern "C" { #define crypto_aead_aes256gcm_NPUBBYTES 12U #define crypto_aead_aes256gcm_ABYTES 16U +typedef CRYPTO_ALIGN(128) unsigned char crypto_aead_aes256gcm_aesni_state[384]; + SODIUM_EXPORT int crypto_aead_aes256gcm_aesni_encrypt(unsigned char *c, unsigned long long *clen_p, @@ -37,6 +39,32 @@ int crypto_aead_aes256gcm_aesni_decrypt(unsigned char *m, unsigned long long adlen, const unsigned char *npub, const unsigned char *k); + +SODIUM_EXPORT +int crypto_aead_aes256gcm_aesni_beforenm(crypto_aead_aes256gcm_aesni_state *ctx_, + const unsigned char *k); + +SODIUM_EXPORT +int crypto_aead_aes256gcm_aesni_encrypt_afternm(unsigned char *c, + unsigned long long *clen_p, + const unsigned char *m, + unsigned long long mlen, + const unsigned char *ad, + unsigned long long adlen, + const unsigned char *nsec, + const unsigned char *npub, + crypto_aead_aes256gcm_aesni_state *ctx_); + +SODIUM_EXPORT 
+int crypto_aead_aes256gcm_aesni_decrypt_afternm(unsigned char *m, + unsigned long long *mlen_p, + unsigned char *nsec, + const unsigned char *c, + unsigned long long clen, + const unsigned char *ad, + unsigned long long adlen, + const unsigned char *npub, + crypto_aead_aes256gcm_aesni_state *ctx_); #ifdef __cplusplus } #endif From 41c296fcf89de6a495a91186ab7e073eae1f2e94 Mon Sep 17 00:00:00 2001 From: Frank Denis Date: Fri, 9 Oct 2015 09:43:03 +0200 Subject: [PATCH 05/25] Make the state const in *_afternm() --- .../aes256gcm/aesni/aead_aes256gcm_aesni.c | 48 ++++++++++--------- .../sodium/crypto_aead_aes256gcm_aesni.h | 4 +- 2 files changed, 28 insertions(+), 24 deletions(-) diff --git a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c index e9f38187..b1d879e9 100644 --- a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c +++ b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c @@ -372,25 +372,27 @@ crypto_aead_aes256gcm_aesni_encrypt_afternm(unsigned char *c, unsigned long long adlen, const unsigned char *nsec, const unsigned char *npub, - crypto_aead_aes256gcm_aesni_state *ctx_) + const crypto_aead_aes256gcm_aesni_state *ctx_) { - context *ctx = (context *) ctx_; - unsigned char *mac; - unsigned char ivc_block[AES_BLOCKSIZE]; + context ctx; + unsigned char *mac; + unsigned char ivc_block[AES_BLOCKSIZE]; (void) nsec; + (void) sizeof(int[(sizeof *ctx_) >= (sizeof ctx) ? 1 : -1]); + memcpy(&ctx, ctx_, sizeof ctx); memset(ivc_block, 0, sizeof ivc_block); memcpy(ivc_block, npub, crypto_aead_aes256gcm_NPUBBYTES); ivc_block[AES_BLOCKSIZE - 1U] = 1U; - _gmac_update(ctx, ad, adlen); - _aes_ctr(ctx, c, m, mlen, ivc_block); - _gmac_update(ctx, c, mlen); + _gmac_update(&ctx, ad, adlen); + _aes_ctr(&ctx, c, m, mlen, ivc_block); + _gmac_update(&ctx, c, mlen); mac = c + mlen; _u64_be_from_ull(mac, adlen * 8ULL); _u64_be_from_ull(mac + 8U, mlen * 8ULL); - _gmac_update(ctx, mac, GMAC_BLOCKSIZE); - _gmac_final(ctx, mac, ivc_block, ctx->state); - sodium_memzero(ctx, sizeof *ctx); + _gmac_update(&ctx, mac, GMAC_BLOCKSIZE); + _gmac_final(&ctx, mac, ivc_block, ctx.state); + sodium_memzero(&ctx, sizeof ctx); if (clen_p != NULL) { *clen_p = mlen + crypto_aead_aes256gcm_ABYTES; } @@ -406,12 +408,12 @@ crypto_aead_aes256gcm_aesni_decrypt_afternm(unsigned char *m, const unsigned char *ad, unsigned long long adlen, const unsigned char *npub, - crypto_aead_aes256gcm_aesni_state *ctx_) + const crypto_aead_aes256gcm_aesni_state *ctx_) { - context *ctx = (context *) ctx_; - unsigned char mac[GMAC_BLOCKSIZE]; - unsigned char ivc_block[AES_BLOCKSIZE]; - size_t mlen; + context ctx; + unsigned char mac[GMAC_BLOCKSIZE]; + unsigned char ivc_block[AES_BLOCKSIZE]; + size_t mlen; (void) nsec; if (mlen_p != NULL) { @@ -425,18 +427,20 @@ crypto_aead_aes256gcm_aesni_decrypt_afternm(unsigned char *m, memcpy(ivc_block, npub, crypto_aead_aes256gcm_NPUBBYTES); ivc_block[AES_BLOCKSIZE - 1U] = 1U; - _gmac_update(ctx, ad, adlen); - _gmac_update(ctx, c, mlen); + (void) sizeof(int[(sizeof *ctx_) >= (sizeof ctx) ? 
1 : -1]); + memcpy(&ctx, ctx_, sizeof ctx); + _gmac_update(&ctx, ad, adlen); + _gmac_update(&ctx, c, mlen); _u64_be_from_ull(mac, adlen * 8ULL); _u64_be_from_ull(mac + 8U, mlen * 8ULL); - _gmac_update(ctx, mac, GMAC_BLOCKSIZE); - _gmac_final(ctx, mac, ivc_block, ctx->state); + _gmac_update(&ctx, mac, GMAC_BLOCKSIZE); + _gmac_final(&ctx, mac, ivc_block, ctx.state); if (sodium_memcmp(c + mlen, mac, crypto_aead_aes256gcm_ABYTES) != 0) { - sodium_memzero(ctx, sizeof *ctx); + sodium_memzero(&ctx, sizeof ctx); return -1; } - _aes_ctr(ctx, m, c, mlen, ivc_block); - sodium_memzero(ctx, sizeof *ctx); + _aes_ctr(&ctx, m, c, mlen, ivc_block); + sodium_memzero(&ctx, sizeof ctx); if (mlen_p != NULL) { *mlen_p = mlen; } diff --git a/src/libsodium/include/sodium/crypto_aead_aes256gcm_aesni.h b/src/libsodium/include/sodium/crypto_aead_aes256gcm_aesni.h index fca611d9..d6ba98ec 100644 --- a/src/libsodium/include/sodium/crypto_aead_aes256gcm_aesni.h +++ b/src/libsodium/include/sodium/crypto_aead_aes256gcm_aesni.h @@ -53,7 +53,7 @@ int crypto_aead_aes256gcm_aesni_encrypt_afternm(unsigned char *c, unsigned long long adlen, const unsigned char *nsec, const unsigned char *npub, - crypto_aead_aes256gcm_aesni_state *ctx_); + const crypto_aead_aes256gcm_aesni_state *ctx_); SODIUM_EXPORT int crypto_aead_aes256gcm_aesni_decrypt_afternm(unsigned char *m, @@ -64,7 +64,7 @@ int crypto_aead_aes256gcm_aesni_decrypt_afternm(unsigned char *m, const unsigned char *ad, unsigned long long adlen, const unsigned char *npub, - crypto_aead_aes256gcm_aesni_state *ctx_); + const crypto_aead_aes256gcm_aesni_state *ctx_); #ifdef __cplusplus } #endif From ef1417bc2f6976dd1976d487f807c2cf7967bea1 Mon Sep 17 00:00:00 2001 From: Frank Denis Date: Fri, 9 Oct 2015 09:48:34 +0200 Subject: [PATCH 06/25] Explicit cast --- .../crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c index b1d879e9..925a1fda 100644 --- a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c +++ b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c @@ -463,7 +463,8 @@ crypto_aead_aes256gcm_aesni_encrypt(unsigned char *c, crypto_aead_aes256gcm_aesni_beforenm(&ctx, k); return crypto_aead_aes256gcm_aesni_encrypt_afternm - (c, clen_p, m, mlen, ad, adlen, nsec, npub, &ctx); + (c, clen_p, m, mlen, ad, adlen, nsec, npub, + (const crypto_aead_aes256gcm_aesni_state *) &ctx); } int @@ -482,5 +483,6 @@ crypto_aead_aes256gcm_aesni_decrypt(unsigned char *m, crypto_aead_aes256gcm_aesni_beforenm(&ctx, k); return crypto_aead_aes256gcm_aesni_decrypt_afternm - (m, mlen_p, nsec, c, clen, ad, adlen, npub, &ctx); + (m, mlen_p, nsec, c, clen, ad, adlen, npub, + (const crypto_aead_aes256gcm_aesni_state *) &ctx); } From ab2e86748efdab910646ed21171f8f8d8c2b3096 Mon Sep 17 00:00:00 2001 From: Frank Denis Date: Sat, 10 Oct 2015 10:49:54 +0200 Subject: [PATCH 07/25] Replace the aes256gcm implementation with Romain Dolbeau's implementation which is slightly faster than mine. Reimplement features from the previous implementation: add batch mode and use two passes in the decryption function in order to check the tag before decrypting. 
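
The batched interface splits the key schedule out of the per-message
cost. A minimal usage sketch with the names from the updated header
(error handling omitted):

    crypto_aead_aes256gcm_aesni_state st;
    unsigned long long                clen;

    crypto_aead_aes256gcm_aesni_beforenm(&st, k);  /* expand k, derive H once */
    crypto_aead_aes256gcm_aesni_encrypt_afternm(c, &clen, m, mlen,
                                                ad, adlen, NULL, npub, &st);

For decryption, the tag is now recomputed over the received ciphertext
first, and the CTR pass only runs if sodium_memcmp() accepts it.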
---
 AUTHORS                                        |    3 +
 configure.ac                                   |    9 +-
 .../aes256gcm/aesni/aead_aes256gcm_aesni.c     | 1058 +++++++++++------
 .../sodium/crypto_aead_aes256gcm_aesni.h       |   21 +-
 src/libsodium/include/sodium/export.h          |    2 +-
 5 files changed, 706 insertions(+), 387 deletions(-)

diff --git a/AUTHORS b/AUTHORS
index 6208f0ec..0622fafa 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -32,6 +32,9 @@ scrypt                                 Colin Percival
 Implementors
 ============
+crypto_aead/aes256gcm/aesni            Romain Dolbeau
+                                       Frank Denis
+
 crypto_aead/chacha20poly1305           Frank Denis
 
 crypto_box/curve25519xsalsa20poly1305  Daniel J. Bernstein

diff --git a/configure.ac b/configure.ac
index 78da398c..c2427d5b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -211,13 +211,8 @@ AX_CHECK_COMPILE_FLAG([-Wwrite-strings], [CFLAGS="$CFLAGS -Wwrite-strings"])
 AX_CHECK_COMPILE_FLAG([-Wdiv-by-zero], [CFLAGS="$CFLAGS -Wdiv-by-zero"])
 AX_CHECK_COMPILE_FLAG([-Wsometimes-uninitialized], [CFLAGS="$CFLAGS -Wsometimes-uninitialized"])
 
-AX_CHECK_COMPILE_FLAG([$CFLAGS -mmmx], [CFLAGS="$CFLAGS -mmmx"])
-AX_CHECK_COMPILE_FLAG([$CFLAGS -msse], [CFLAGS="$CFLAGS -msse"])
-AX_CHECK_COMPILE_FLAG([$CFLAGS -msse2], [CFLAGS="$CFLAGS -msse2"])
-AX_CHECK_COMPILE_FLAG([$CFLAGS -msse3], [CFLAGS="$CFLAGS -msse3"])
-AX_CHECK_COMPILE_FLAG([$CFLAGS -mssse3], [CFLAGS="$CFLAGS -mssse3"])
-AX_CHECK_COMPILE_FLAG([$CFLAGS -maes], [CFLAGS="$CFLAGS -maes"])
-AX_CHECK_COMPILE_FLAG([$CFLAGS -mpclmul], [CFLAGS="$CFLAGS -mpclmul"])
+AC_MSG_CHECKING([if we can compile for westmere])
+AX_CHECK_COMPILE_FLAG([-march=westmere $CFLAGS], [CFLAGS="-march=westmere $CFLAGS"])
 
 AC_ARG_VAR([CWFLAGS], [define to compilation flags for generating extra warnings])

diff --git a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c
index 925a1fda..4643b0ba 100644
--- a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c
+++ b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c
@@ -1,446 +1,726 @@
+/*
+ * AES256-GCM, based on original code by Romain Dolbeau
+ */
+
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
 #include <immintrin.h>
 
 #include "crypto_aead_aes256gcm_aesni.h"
+#include "export.h"
 #include "utils.h"
 
-#define AES_BLOCKSIZE 16
-#define AES_MAXROUNDS 14
-#define GMAC_BLOCKSIZE 16
+#if defined(__INTEL_COMPILER) || defined(_bswap64)
+#elif defined(_MSC_VER)
+# define _bswap64(a) _byteswap_uint64(a)
+#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))
+# define _bswap64(a) __builtin_bswap64(a)
+#else
+static inline uint64_t
+_bswap64(const uint64_t x)
+{
+    return
+        ((x << 56) & 0xFF00000000000000UL) | ((x << 40) & 0x00FF000000000000UL) |
+        ((x << 24) & 0x0000FF0000000000UL) | ((x <<  8) & 0x000000FF00000000UL) |
+        ((x >>  8) & 0x00000000FF000000UL) | ((x >> 24) & 0x0000000000FF0000UL) |
+        ((x >> 40) & 0x000000000000FF00UL) | ((x >> 56) & 0x00000000000000FFUL);
+}
+#endif
 
-typedef CRYPTO_ALIGN(128) struct context {
-    __m128i ekey[AES_MAXROUNDS + 1];
-    unsigned char initial_state[GMAC_BLOCKSIZE];
-    unsigned char state[GMAC_BLOCKSIZE];
-    unsigned char subkey[GMAC_BLOCKSIZE];
+typedef struct context {
+    CRYPTO_ALIGN(16) unsigned char H[16];
+    __m128i rkeys[16];
 } context;
 
 static inline void
-_u64_be_from_ull(unsigned char out[8U], unsigned long long x)
+aesni_key256_expand(const unsigned char *key, __m128 rkeys[16])
 {
-    out[7] = (unsigned char) (x & 0xff); x >>= 8;
-    out[6] = (unsigned char) (x & 0xff); x >>= 8;
-    out[5] = (unsigned char) (x & 0xff); x >>= 8;
-    out[4] = (unsigned char) (x & 0xff); x >>= 8;
-    out[3] = (unsigned char) (x & 0xff); x >>= 8;
-    out[2] = (unsigned char) (x & 0xff); x >>= 8;
-    out[1] = (unsigned char) (x & 0xff); x >>= 8;
-    out[0] = (unsigned char) (x & 0xff);
+    __m128 key0 = _mm_loadu_ps((const float *) (key + 0));
+    __m128 key1 = _mm_loadu_ps((const float *) (key + 16));
+    __m128 temp0, temp1, temp2, temp4;
+    int    idx = 0;
+
+    rkeys[idx++] = key0;
+    temp0 = key0;
+    temp2 = key1;
+    temp4 = _mm_setzero_ps();
+
+/* why single precision floating-point rather than integer instructions?
+   because _mm_shuffle_ps takes two inputs, while _mm_shuffle_epi32 only
+   takes one - it doesn't perform the same computation...
+   _mm_shuffle_ps takes the lower 64 bits of the result from the first
+   operand, and the higher 64 bits of the result from the second operand
+   (in both cases, all four input floats are accessible).
+   I don't like the non-orthogonal naming scheme :-(
+
+   This is all strongly inspired by the openssl assembly code.
+ */
+#define BLOCK1(IMM) \
+    temp1 = (__m128) _mm_aeskeygenassist_si128((__m128i) temp2, IMM); \
+    rkeys[idx++] = temp2; \
+    temp4 = _mm_shuffle_ps(temp4, temp0, 0x10); \
+    temp0 = _mm_xor_ps(temp0, temp4); \
+    temp4 = _mm_shuffle_ps(temp4, temp0, 0x8c); \
+    temp0 = _mm_xor_ps(temp0, temp4); \
+    temp1 = _mm_shuffle_ps(temp1, temp1, 0xff); \
+    temp0 = _mm_xor_ps(temp0, temp1)
+
+#define BLOCK2(IMM) \
+    temp1 = (__m128) _mm_aeskeygenassist_si128((__m128i) temp0, IMM); \
+    rkeys[idx++] = temp0; \
+    temp4 = _mm_shuffle_ps(temp4, temp2, 0x10); \
+    temp2 = _mm_xor_ps(temp2, temp4); \
+    temp4 = _mm_shuffle_ps(temp4, temp2, 0x8c); \
+    temp2 = _mm_xor_ps(temp2, temp4); \
+    temp1 = _mm_shuffle_ps(temp1, temp1, 0xaa); \
+    temp2 = _mm_xor_ps(temp2, temp1)
+
+    BLOCK1(0x01);
+    BLOCK2(0x01);
+
+    BLOCK1(0x02);
+    BLOCK2(0x02);
+
+    BLOCK1(0x04);
+    BLOCK2(0x04);
+
+    BLOCK1(0x08);
+    BLOCK2(0x08);
+
+    BLOCK1(0x10);
+    BLOCK2(0x10);
+
+    BLOCK1(0x20);
+    BLOCK2(0x20);
+
+    BLOCK1(0x40);
+    rkeys[idx++] = temp0;
+}
+
+/** single, by-the-book AES encryption with AES-NI */
+static inline void
+aesni_encrypt1(unsigned char *out, __m128i nv, const __m128i rkeys[16])
+{
+    __m128i temp = _mm_xor_si128(nv, rkeys[0]);
+    int     i;
+
+#pragma unroll(13)
+    for (i = 1; i < 14; i++) {
+        temp = _mm_aesenc_si128(temp, rkeys[i]);
+    }
+    temp = _mm_aesenclast_si128(temp, rkeys[14]);
+    _mm_store_si128((__m128i *) out, temp);
+}
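+
+/* A first use of aesni_encrypt1(): the GHASH subkey is the encryption of
+   the all-zero block, H = AES-256_k(0^128). A minimal sketch of the call,
+   as done in crypto_aead_aes256gcm_aesni_beforenm() below:
+
+       __m128i zero = _mm_setzero_si128();
+       aesni_encrypt1(H, zero, rkeys);    (H now holds the GHASH subkey)
+ */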
+
+/** multiple-blocks-at-once AES encryption with AES-NI;
+    on Haswell, aesenc has a latency of 7 and a throughput of 1,
+    so the sequence of aesenc should be bubble-free if you
+    have at least 8 blocks. Let's build an arbitrary-sized
+    function */
+/* Step 1 : loading the nonce */
+/* load & increment the n vector (non-vectorized, unused for now) */
+#define NVx(a) \
+    __m128i nv##a = _mm_shuffle_epi8(_mm_load_si128((const __m128i *) n), pt); \
+    n[3]++
+
+/* Step 2 : define value in round one (xor with subkey #0, aka key) */
+#define TEMPx(a) \
+    __m128i temp##a = _mm_xor_si128(nv##a, rkeys[0])
+
+/* Step 3: one round of AES */
+#define AESENCx(a) \
+    temp##a = _mm_aesenc_si128(temp##a, rkeys[i])
+
+/* Step 4: last round of AES */
+#define AESENCLASTx(a) \
+    temp##a = _mm_aesenclast_si128(temp##a, rkeys[14])
+
+/* Step 5: store result */
+#define STOREx(a) \
+    _mm_store_si128((__m128i *) (out + (a * 16)), temp##a)
+
+/* all the MAKE* macros are for automatic explicit unrolling */
+#define MAKE4(X) \
+    X(0); \
+    X(1); \
+    X(2); \
+    X(3)
+
+#define MAKE8(X) \
+    X(0); \
+    X(1); \
+    X(2); \
+    X(3); \
+    X(4); \
+    X(5); \
+    X(6); \
+    X(7)
+
+#define COUNTER_INC2(N) (*(uint32_t *) &(N)[12]) = (2U + (((*(uint32_t *) &(N)[12]))))
+
+/* create a function of unrolling N; the MAKEN is the unrolling
+   macro, defined above. The N in MAKEN must match N, obviously.
+*/
+#define FUNC(N, MAKEN) \
+    static inline void aesni_encrypt##N(unsigned char *out, uint32_t *n, const __m128i rkeys[16]) \
+    { \
+        const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+        int i; \
+        \
+        MAKEN(NVx); \
+        MAKEN(TEMPx); \
+        for (i = 1; i < 14; i++) { \
+            MAKEN(AESENCx); \
+        } \
+        MAKEN(AESENCLASTx); \
+        MAKEN(STOREx); \
+    }
+
+FUNC(8, MAKE8)
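+
+/* For reference, FUNC(8, MAKE8) expands to aesni_encrypt8(), which turns
+   the next eight counter blocks into 128 bytes of keystream. A sketch of
+   how the remainder loop further down consumes it:
+
+       CRYPTO_ALIGN(16) unsigned char outni[8 * 16];
+       aesni_encrypt8(outni, (uint32_t *) n2, rkeys);  (8 keystream blocks)
+       c[i + j] = m[i + j] ^ outni[j];                 (CTR xor, per byte j)
+ */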
+
+/* all GF(2^128) functions are by the book, meaning this one:
+   the Intel carry-less multiplication white paper on computing
+   the GCM mode.
+*/
+
+static inline void
+addmul(unsigned char *c, const unsigned char *a, unsigned int xlen, const unsigned char *b)
+{
+    const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+    const __m128i ff = _mm_set1_epi32(-1);
+    __m128i A = _mm_loadu_si128((const __m128i *) a);
+
+    A = _mm_shuffle_epi8(A, rev);
+    if (xlen < 16) { /* less than 16 useful bytes - insert zeroes where needed */
+        uint64_t mask = -1ull ^ (1ull << (((16 - xlen) % 8) * 8)) - 1ull;
+        __m128i vm;
+
+        if (xlen > 8) {
+            vm = _mm_insert_epi64(ff, mask, 0);
+        } else {
+            vm = _mm_insert_epi64(_mm_setzero_si128(), mask, 1);
+        }
+        A = _mm_and_si128(vm, A);
+    }
-        state1 = _mm_xor_si128(state1, in1);
-        _mm_storeu_si128((void *) dst, state1);
-        state2 = _mm_xor_si128(state2, in2);
-        _mm_storeu_si128((void *) (dst + 16), state2);
-        state3 = _mm_xor_si128(state3, in3);
-        _mm_storeu_si128((void *) (dst + 32), state3);
-        state4 = _mm_xor_si128(state4, in4);
-        _mm_storeu_si128((void *) (dst + 48), state4);
-        len -= 64;
-        src += 64;
-        dst += 64;
-    }
-
-#define ENC(offset) \
-    key = _mm_loadu_si128((void *) (key_ptr + 
(offset))); \ - state1 = _mm_aesenclast_si128(state1, key); - -#define ENC_ONE \ - key_ptr = &ctx->ekey[0]; \ - key = _mm_loadu_si128((void *) &ctx->ekey[0]); \ - state1 = _mm_xor_si128(state1, key); \ - key_ptr += 3; \ - switch (crypto_aead_aes256gcm_KEYBYTES) { \ - case 32: \ - key_ptr += 4; \ - ENC(-6); \ - ENC(-5); \ - ENC(-4); \ - ENC(-3); \ - case 16: \ - ENC(-2); \ - ENC(-1); \ - ENC(0); \ - ENC(1); \ - ENC(2); \ - ENC(3); \ - ENC(4); \ - ENC(5); \ - ENC(6); \ - ENC_LAST(7); \ - break; \ - default: \ - abort(); \ - } - while (len >= 16) { - AESNI_INC; - state1 = iv; - in1 = _mm_loadu_si128((const void *) src); - ENC_ONE; - state1 = _mm_xor_si128(state1, in1); - _mm_storeu_si128((void *) dst, state1); - len -= 16; - src += 16; - dst += 16; - } - if (len > 0) { - unsigned char padded[16]; - memset(padded, 0, sizeof padded); - memcpy(padded, src, len); - src = padded; - AESNI_INC; - state1 = iv; - in1 = _mm_loadu_si128((const void *) src); - ENC_ONE; - state1 = _mm_xor_si128(state1, in1); - _mm_storeu_si128((void *) padded, state1); - memcpy(dst, padded, len); - } - _mm_storel_epi64((void *) ivc_block, iv); + __m128i B = _mm_loadu_si128((const __m128i *) b); + __m128i C = _mm_loadu_si128((const __m128i *) c); + A = _mm_xor_si128(A, C); + __m128i tmp3 = _mm_clmulepi64_si128(A, B, 0x00); + __m128i tmp4 = _mm_clmulepi64_si128(A, B, 0x10); + __m128i tmp5 = _mm_clmulepi64_si128(A, B, 0x01); + __m128i tmp6 = _mm_clmulepi64_si128(A, B, 0x11); + __m128i tmp10 = _mm_xor_si128(tmp4, tmp5); + __m128i tmp13 = _mm_slli_si128(tmp10, 8); + __m128i tmp11 = _mm_srli_si128(tmp10, 8); + __m128i tmp15 = _mm_xor_si128(tmp3, tmp13); + __m128i tmp17 = _mm_xor_si128(tmp6, tmp11); + __m128i tmp7 = _mm_srli_epi32(tmp15, 31); + __m128i tmp8 = _mm_srli_epi32(tmp17, 31); + __m128i tmp16 = _mm_slli_epi32(tmp15, 1); + __m128i tmp18 = _mm_slli_epi32(tmp17, 1); + __m128i tmp9 = _mm_srli_si128(tmp7, 12); + __m128i tmp22 = _mm_slli_si128(tmp8, 4); + __m128i tmp25 = _mm_slli_si128(tmp7, 4); + __m128i tmp29 = _mm_or_si128(tmp16, tmp25); + __m128i tmp19 = _mm_or_si128(tmp18, tmp22); + __m128i tmp20 = _mm_or_si128(tmp19, tmp9); + __m128i tmp26 = _mm_slli_epi32(tmp29, 31); + __m128i tmp23 = _mm_slli_epi32(tmp29, 30); + __m128i tmp32 = _mm_slli_epi32(tmp29, 25); + __m128i tmp27 = _mm_xor_si128(tmp26, tmp23); + __m128i tmp28 = _mm_xor_si128(tmp27, tmp32); + __m128i tmp24 = _mm_srli_si128(tmp28, 4); + __m128i tmp33 = _mm_slli_si128(tmp28, 12); + __m128i tmp30 = _mm_xor_si128(tmp29, tmp33); + __m128i tmp2 = _mm_srli_epi32(tmp30, 1); + __m128i tmp12 = _mm_srli_epi32(tmp30, 2); + __m128i tmp14 = _mm_srli_epi32(tmp30, 7); + __m128i tmp34 = _mm_xor_si128(tmp2, tmp12); + __m128i tmp35 = _mm_xor_si128(tmp34, tmp14); + __m128i tmp36 = _mm_xor_si128(tmp35, tmp24); + __m128i tmp31 = _mm_xor_si128(tmp30, tmp36); + __m128i tmp21 = _mm_xor_si128(tmp20, tmp31); + _mm_storeu_si128((__m128i *) c, tmp21); } -static void -_aes_enc_one(context *ctx, unsigned char *dst, unsigned char *src) +/* pure multiplication, for pre-computing powers of H */ +static inline __m128i +mulv(__m128i A, __m128i B) { - __m128i *key_ptr; - __m128i key; - __m128i state1 = _mm_loadu_si128((const void *) src); + __m128i tmp3 = _mm_clmulepi64_si128(A, B, 0x00); + __m128i tmp4 = _mm_clmulepi64_si128(A, B, 0x10); + __m128i tmp5 = _mm_clmulepi64_si128(A, B, 0x01); + __m128i tmp6 = _mm_clmulepi64_si128(A, B, 0x11); + __m128i tmp10 = _mm_xor_si128(tmp4, tmp5); + __m128i tmp13 = _mm_slli_si128(tmp10, 8); + __m128i tmp11 = _mm_srli_si128(tmp10, 8); + __m128i tmp15 = 
_mm_xor_si128(tmp3, tmp13);
+    __m128i tmp17 = _mm_xor_si128(tmp6, tmp11);
+    __m128i tmp7 = _mm_srli_epi32(tmp15, 31);
+    __m128i tmp8 = _mm_srli_epi32(tmp17, 31);
+    __m128i tmp16 = _mm_slli_epi32(tmp15, 1);
+    __m128i tmp18 = _mm_slli_epi32(tmp17, 1);
+    __m128i tmp9 = _mm_srli_si128(tmp7, 12);
+    __m128i tmp22 = _mm_slli_si128(tmp8, 4);
+    __m128i tmp25 = _mm_slli_si128(tmp7, 4);
+    __m128i tmp29 = _mm_or_si128(tmp16, tmp25);
+    __m128i tmp19 = _mm_or_si128(tmp18, tmp22);
+    __m128i tmp20 = _mm_or_si128(tmp19, tmp9);
+    __m128i tmp26 = _mm_slli_epi32(tmp29, 31);
+    __m128i tmp23 = _mm_slli_epi32(tmp29, 30);
+    __m128i tmp32 = _mm_slli_epi32(tmp29, 25);
+    __m128i tmp27 = _mm_xor_si128(tmp26, tmp23);
+    __m128i tmp28 = _mm_xor_si128(tmp27, tmp32);
+    __m128i tmp24 = _mm_srli_si128(tmp28, 4);
+    __m128i tmp33 = _mm_slli_si128(tmp28, 12);
+    __m128i tmp30 = _mm_xor_si128(tmp29, tmp33);
+    __m128i tmp2 = _mm_srli_epi32(tmp30, 1);
+    __m128i tmp12 = _mm_srli_epi32(tmp30, 2);
+    __m128i tmp14 = _mm_srli_epi32(tmp30, 7);
+    __m128i tmp34 = _mm_xor_si128(tmp2, tmp12);
+    __m128i tmp35 = _mm_xor_si128(tmp34, tmp14);
+    __m128i tmp36 = _mm_xor_si128(tmp35, tmp24);
+    __m128i tmp31 = _mm_xor_si128(tmp30, tmp36);
+    __m128i C = _mm_xor_si128(tmp20, tmp31);
-    ENC_ONE;
-    _mm_storeu_si128((void *) dst, state1);
+
+    return C;
+}
+
+/* 4 multiply-accumulate at once; again, see the Intel carry-less
+   multiplication white paper for the Aggregated Reduction Method &
+   sample code.
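+
+   Instead of reducing after every multiplication, the four products are
+   accumulated first and folded into the running tag in one reduction:
+
+       acc <- ((acc ^ X4)*H^4 ^ X3*H^3 ^ X2*H^2 ^ X1*H) mod g
+
+   over GF(2^128) with g = x^128 + x^7 + x^2 + x + 1, which is why
+   reduce4() takes four precomputed powers of H.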
+*/ +static inline __m128i +reduce4(__m128i H0, __m128i H1, __m128i H2, __m128i H3, __m128i X0, __m128i X1, + __m128i X2, __m128i X3, __m128i acc) { - __m128i mask = _mm_loadu_si128((const void *) swap_mask); - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9; +/*algorithm by Krzysztof Jankowski, Pierre Laurent - Intel*/ +#define RED_DECL(a) __m128i H##a##_X##a##_lo, H##a##_X##a##_hi, tmp##a, tmp##a##B + MAKE4(RED_DECL); + __m128i lo, hi; + __m128i tmp8, tmp9; + const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - tmp1 = _mm_shuffle_epi8(_mm_loadu_si128((void *) ctx->subkey), mask); - tmp6 = _mm_shuffle_epi8(_mm_loadu_si128((void *) ctx->initial_state), mask); - while (len >= 16) { - GMAC_UPDATE; - len -= 16; - src += 16; - } - if (len > 0) { - unsigned char padded[16]; - memset(padded, 0, sizeof padded); - memcpy(padded, src, len); - src = padded; - GMAC_UPDATE; - } - tmp6 = _mm_shuffle_epi8(tmp6, mask); - _mm_storeu_si128((void *) ctx->state, tmp6); - _mm_storeu_si128((void *) ctx->initial_state, tmp6); +/* byte-revert the inputs & xor the first one into the accumulator */ +#define RED_SHUFFLE(a) X##a = _mm_shuffle_epi8(X##a, rev) + MAKE4(RED_SHUFFLE); + X3 = _mm_xor_si128(X3, acc); + +/* 4 low H*X (x0*h0) */ +#define RED_MUL_LOW(a) H##a##_X##a##_lo = _mm_clmulepi64_si128(H##a, X##a, 0x00) + MAKE4(RED_MUL_LOW); + lo = _mm_xor_si128(H0_X0_lo, H1_X1_lo); + lo = _mm_xor_si128(lo, H2_X2_lo); + lo = _mm_xor_si128(lo, H3_X3_lo); + +/* 4 high H*X (x1*h1) */ +#define RED_MUL_HIGH(a) H##a##_X##a##_hi = _mm_clmulepi64_si128(H##a, X##a, 0x11) + MAKE4(RED_MUL_HIGH); + hi = _mm_xor_si128(H0_X0_hi, H1_X1_hi); + hi = _mm_xor_si128(hi, H2_X2_hi); + hi = _mm_xor_si128(hi, H3_X3_hi); + +/* 4 middle H*X, using Karatsuba, i.e. 
+   x1*h0+x0*h1 = (x1+x0)*(h1+h0) - x1*h1 - x0*h0
+   we already have all x1y1 & x0y0 (accumulated in hi & lo)
+   (0 is low half and 1 is high half)
+ */
+/* permute the high and low 64 bits in H1 & X1,
+   so create (h0,h1) from (h1,h0) and (x0,x1) from (x1,x0),
+   then compute (h0+h1,h1+h0) and (x0+x1,x1+x0),
+   and finally multiply
+ */
+#define RED_MUL_MID(a) \
+    tmp##a = _mm_shuffle_epi32(H##a, 0x4e); \
+    tmp##a##B = _mm_shuffle_epi32(X##a, 0x4e); \
+    tmp##a = _mm_xor_si128(tmp##a, H##a); \
+    tmp##a##B = _mm_xor_si128(tmp##a##B, X##a); \
+    tmp##a = _mm_clmulepi64_si128(tmp##a, tmp##a##B, 0x00)
+    MAKE4(RED_MUL_MID);
+
+/* subtracts x1*h1 and x0*h0 */
+    tmp0 = _mm_xor_si128(tmp0, lo);
+    tmp0 = _mm_xor_si128(tmp0, hi);
+    tmp0 = _mm_xor_si128(tmp1, tmp0);
+    tmp0 = _mm_xor_si128(tmp2, tmp0);
+    tmp0 = _mm_xor_si128(tmp3, tmp0);
+
+    /* reduction */
+    tmp0B = _mm_slli_si128(tmp0, 8);
+    tmp0 = _mm_srli_si128(tmp0, 8);
+    lo = _mm_xor_si128(tmp0B, lo);
+    hi = _mm_xor_si128(tmp0, hi);
+    tmp3 = lo;
+    tmp2B = hi;
+    tmp3B = _mm_srli_epi32(tmp3, 31);
+    tmp8 = _mm_srli_epi32(tmp2B, 31);
+    tmp3 = _mm_slli_epi32(tmp3, 1);
+    tmp2B = _mm_slli_epi32(tmp2B, 1);
+    tmp9 = _mm_srli_si128(tmp3B, 12);
+    tmp8 = _mm_slli_si128(tmp8, 4);
+    tmp3B = _mm_slli_si128(tmp3B, 4);
+    tmp3 = _mm_or_si128(tmp3, tmp3B);
+    tmp2B = _mm_or_si128(tmp2B, tmp8);
+    tmp2B = _mm_or_si128(tmp2B, tmp9);
+    tmp3B = _mm_slli_epi32(tmp3, 31);
+    tmp8 = _mm_slli_epi32(tmp3, 30);
+    tmp9 = _mm_slli_epi32(tmp3, 25);
+    tmp3B = _mm_xor_si128(tmp3B, tmp8);
+    tmp3B = _mm_xor_si128(tmp3B, tmp9);
+    tmp8 = _mm_srli_si128(tmp3B, 4);
+    tmp3B = _mm_slli_si128(tmp3B, 12);
+    tmp3 = _mm_xor_si128(tmp3, tmp3B);
+    tmp2 = _mm_srli_epi32(tmp3, 1);
+    tmp0B = _mm_srli_epi32(tmp3, 2);
+    tmp1B = _mm_srli_epi32(tmp3, 7);
+    tmp2 = _mm_xor_si128(tmp2, tmp0B);
+    tmp2 = _mm_xor_si128(tmp2, tmp1B);
+    tmp2 = _mm_xor_si128(tmp2, tmp8);
+    tmp3 = _mm_xor_si128(tmp3, tmp2);
+    tmp2B = _mm_xor_si128(tmp2B, tmp3);
+
+    return tmp2B;
+}
+
+#define XORx(a) \
+    __m128i in##a = _mm_load_si128((const __m128i *) (in + a * 16)); \
+    temp##a = _mm_xor_si128(temp##a, in##a)
+
+#define LOADx(a) \
+    __m128i in##a = _mm_load_si128((const __m128i *) (in + a * 16));
+
+/* full encrypt & checksum 8 blocks at once */
+static inline void
+aesni_encrypt8full(unsigned char *out, uint32_t *n, const __m128i rkeys[16],
+                   const unsigned char *in, unsigned char *accum,
+                   const __m128i hv, const __m128i h2v, const __m128i h3v,
+                   const __m128i h4v)
+{
+    const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+    __m128i accv = _mm_loadu_si128((const __m128i *) accum);
+    int i;
+
+    MAKE8(NVx);
+    MAKE8(TEMPx);
+#pragma unroll(13)
+    for (i = 1; i < 14; i++) {
+        MAKE8(AESENCx);
+    }
+    MAKE8(AESENCLASTx);
+    MAKE8(XORx);
+    MAKE8(STOREx);
+    accv = reduce4(hv, h2v, h3v, h4v, temp3, temp2, temp1, temp0, accv);
+    accv = reduce4(hv, h2v, h3v, h4v, temp7, temp6, temp5, temp4, accv);
+    _mm_storeu_si128((__m128i *) accum, accv);
+}
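+
+/* One aesni_encrypt8full() call is a fused CTR+GHASH step: it derives
+   eight counter blocks, runs the fourteen AES-256 rounds on all of them
+   in parallel, XORs the plaintext in, stores the ciphertext, and folds
+   the eight ciphertext blocks into the accumulator with two reduce4()
+   calls. */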
+    MAKE8(LOADx);
+    accv = reduce4(hv, h2v, h3v, h4v, in3, in2, in1, in0, accv);
+    accv = reduce4(hv, h2v, h3v, h4v, in7, in6, in5, in4, accv);
+    _mm_storeu_si128((__m128i *) accum, accv);
+}
+
+/* decrypt 8 blocks at once */
+static inline void
+aesni_decrypt8full(unsigned char *out, uint32_t *n, const __m128i rkeys[16],
+                   const unsigned char *in)
+{
+    const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+    int i;
+
+    MAKE8(NVx);
+    MAKE8(TEMPx);
+#pragma unroll(13)
+    for (i = 1; i < 14; i++) {
+        MAKE8(AESENCx);
+    }
+    MAKE8(AESENCLASTx);
+    MAKE8(XORx);
+    MAKE8(STOREx);
 }
 
 int
 crypto_aead_aes256gcm_aesni_beforenm(crypto_aead_aes256gcm_aesni_state *ctx_,
                                      const unsigned char *k)
 {
-    context *ctx = (context *) ctx_;
+    context       *ctx = (context *) ctx_;
+    __m128i       *rkeys = ctx->rkeys;
+    __m128i        zero = _mm_setzero_si128();
+    unsigned char *H = ctx->H;
 
     (void) sizeof(int[(sizeof *ctx_) >= (sizeof *ctx) ? 1 : -1]);
-    memset(ctx, 0, sizeof *ctx);
-    _key_setup(ctx, k);
-    _aes_enc_one(ctx, ctx->subkey, ctx->subkey);
+    aesni_key256_expand(k, (__m128*) rkeys);
+    aesni_encrypt1(H, zero, rkeys);
 
     return 0;
 }
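+
+/* A typical calling sequence for this precomputation interface (an
+   illustrative sketch; key, m/mlen, ad/adlen, npub and the output
+   buffer c are the caller's):
+
+       crypto_aead_aes256gcm_aesni_state st;
+       unsigned long long                clen;
+
+       crypto_aead_aes256gcm_aesni_beforenm(&st, key);
+       crypto_aead_aes256gcm_aesni_encrypt_afternm(c, &clen, m, mlen,
+                                                   ad, adlen, NULL, npub, &st);
+
+   The key schedule and the GHASH subkey are derived once in _beforenm(),
+   then any number of messages can be sealed with _encrypt_afternm();
+   nsec is unused, and c must provide mlen + crypto_aead_aes256gcm_ABYTES
+   bytes. */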
 int
-crypto_aead_aes256gcm_aesni_encrypt_afternm(unsigned char *c,
-                                            unsigned long long *clen_p,
-                                            const unsigned char *m,
-                                            unsigned long long mlen,
-                                            const unsigned char *ad,
-                                            unsigned long long adlen,
+crypto_aead_aes256gcm_aesni_encrypt_afternm(unsigned char *c, unsigned long long *clen,
+                                            const unsigned char *m, unsigned long long mlen,
+                                            const unsigned char *ad, unsigned long long adlen,
                                             const unsigned char *nsec,
                                             const unsigned char *npub,
                                             const crypto_aead_aes256gcm_aesni_state *ctx_)
 {
-    context        ctx;
-    unsigned char *mac;
-    unsigned char  ivc_block[AES_BLOCKSIZE];
+    unsigned char H[16];
+    const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+    const context *ctx = (const context *) ctx_;
+    const __m128i *rkeys = ctx->rkeys;
+    __m128i Hv, H2v, H3v, H4v, accv;
+    unsigned long long i, j;
+    unsigned long long adlen_rnd64 = adlen & ~63ULL;
+    unsigned long long mlen_rnd128 = mlen & ~127ULL;
+    CRYPTO_ALIGN(16) unsigned char n2[16];
+    CRYPTO_ALIGN(16) unsigned char T[16];
+    CRYPTO_ALIGN(16) unsigned char accum[16];
+    CRYPTO_ALIGN(16) unsigned char fb[16];
 
     (void) nsec;
-    (void) sizeof(int[(sizeof *ctx_) >= (sizeof ctx) ? 1 : -1]);
-    memcpy(&ctx, ctx_, sizeof ctx);
-    memset(ivc_block, 0, sizeof ivc_block);
-    memcpy(ivc_block, npub, crypto_aead_aes256gcm_NPUBBYTES);
-    ivc_block[AES_BLOCKSIZE - 1U] = 1U;
-    _gmac_update(&ctx, ad, adlen);
-    _aes_ctr(&ctx, c, m, mlen, ivc_block);
-    _gmac_update(&ctx, c, mlen);
-    mac = c + mlen;
-    _u64_be_from_ull(mac, adlen * 8ULL);
-    _u64_be_from_ull(mac + 8U, mlen * 8ULL);
-    _gmac_update(&ctx, mac, GMAC_BLOCKSIZE);
-    _gmac_final(&ctx, mac, ivc_block, ctx.state);
-    sodium_memzero(&ctx, sizeof ctx);
-    if (clen_p != NULL) {
-        *clen_p = mlen + crypto_aead_aes256gcm_ABYTES;
+    memcpy(H, ctx->H, sizeof H);
+    if (mlen > 16ULL * (1ULL << 32)) {
+        abort();
+    }
+    memcpy(&n2[0], npub, 12);
+    *(uint32_t *) &n2[12] = 0x01000000;
+    aesni_encrypt1(T, _mm_load_si128((const __m128i *) n2), rkeys);
+
+    (*(uint64_t *) &fb[0]) = _bswap64((uint64_t) (8 * adlen));
+    (*(uint64_t *) &fb[8]) = _bswap64((uint64_t) (8 * mlen));
+
+    /* we store H (and its powers) byte-reversed once and for all */
+    Hv = _mm_shuffle_epi8(_mm_load_si128((const __m128i *) H), rev);
+    _mm_store_si128((__m128i *) H, Hv);
+    H2v = mulv(Hv, Hv);
+    H3v = mulv(H2v, Hv);
+    H4v = mulv(H3v, Hv);
+
+    accv = _mm_setzero_si128();
+    /* unrolled by 4 GCM (by 8 doesn't improve using reduce4) */
+    for (i = 0; i < adlen_rnd64; i += 64) {
+        __m128i X4 = _mm_loadu_si128((const __m128i *) (ad + i + 0));
+        __m128i X3 = _mm_loadu_si128((const __m128i *) (ad + i + 16));
+        __m128i X2 = _mm_loadu_si128((const __m128i *) (ad + i + 32));
+        __m128i X1 = _mm_loadu_si128((const __m128i *) (ad + i + 48));
+        accv = reduce4(Hv, H2v, H3v, H4v, X1, X2, X3, X4, accv);
+    }
+    _mm_storeu_si128((__m128i *) accum, accv);
+
+    /* GCM remainder loop */
+    for (i = adlen_rnd64; i < adlen; i += 16) {
+        unsigned int blocklen = 16;
+
+        if (i + (unsigned long long) blocklen > adlen) {
+            blocklen = (unsigned int) (adlen - i);
+        }
+        addmul(accum, ad + i, blocklen, H);
+    }
+
+/* this only does 8 full blocks, so no fancy bounds checking is necessary */
+#define LOOPRND128 \
+    { \
+        const int iter = 8; \
+        const int lb = iter * 16; \
+ \
+        for (i = 0; i < mlen_rnd128; i += lb) { \
+            aesni_encrypt8full(c + i, (uint32_t *) n2, rkeys, m + i, accum, Hv, H2v, H3v, H4v); \
+        } \
+    }
+
+/* remainder loop, with the slower GCM update to accommodate partial blocks */
+#define LOOPRMD128 \
+    { \
+        const int iter = 8; \
+        const int lb = iter * 16; \
+ \
+        for (i = mlen_rnd128; i < mlen; i += lb) { \
+            CRYPTO_ALIGN(16) unsigned char outni[8 * 16]; \
+            unsigned long long mj = lb; \
+ \
+            aesni_encrypt8(outni, (uint32_t *) n2, rkeys); \
+            if ((i + mj) >= mlen) { \
+                mj = mlen - i; \
+            } \
+            for (j = 0; j < mj; j++) { \
+                c[i + j] = m[i + j] ^ outni[j]; \
+            } \
+            for (j = 0; j < mj; j += 16) { \
+                unsigned int bl = 16; \
+ \
+                if (j + (unsigned long long) bl >= mj) { \
+                    bl = (unsigned int) (mj - j); \
+                } \
+                addmul(accum, c + i + j, bl, H); \
+            } \
+        } \
+    }
+
+    n2[15] = 0;
+    COUNTER_INC2(n2);
+    LOOPRND128;
+    LOOPRMD128;
+
+    addmul(accum, fb, 16, H);
+
+    for (i = 0; i < 16; ++i) {
+        c[i + mlen] = T[i] ^ accum[15 - i];
+    }
+    if (clen != NULL) {
+        *clen = mlen + 16;
     }
     return 0;
 }
 
 int
-crypto_aead_aes256gcm_aesni_decrypt_afternm(unsigned char *m,
-                                            unsigned long long *mlen_p,
+crypto_aead_aes256gcm_aesni_decrypt_afternm(unsigned char *m, unsigned long long *mlen_p,
                                             unsigned char *nsec,
-                                            const unsigned char *c,
-                                            unsigned long long clen,
-                                            const unsigned char *ad,
-                                            unsigned long long adlen,
+                                            const unsigned char *c, unsigned long long clen,
+                                            const unsigned char *ad, unsigned long long adlen,
const unsigned char *npub, const crypto_aead_aes256gcm_aesni_state *ctx_) { - context ctx; - unsigned char mac[GMAC_BLOCKSIZE]; - unsigned char ivc_block[AES_BLOCKSIZE]; - size_t mlen; + unsigned char H[16]; + const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + const context *ctx = (const context *) ctx_; + const __m128i *rkeys = ctx->rkeys; + __m128i Hv, H2v, H3v, H4v, accv; + unsigned long long i, j; + unsigned long long adlen_rnd64 = adlen & ~63ULL; + unsigned long long mlen; + unsigned long long mlen_rnd128; + CRYPTO_ALIGN(16) unsigned char n2[16]; + CRYPTO_ALIGN(16) unsigned char T[16]; + CRYPTO_ALIGN(16) unsigned char accum[16]; + CRYPTO_ALIGN(16) unsigned char fb[16]; (void) nsec; + memcpy(H, ctx->H, sizeof H); + if (clen > 16ULL * (1ULL << 32) - 16ULL) { + abort(); + } + mlen = clen - 16; if (mlen_p != NULL) { - *mlen_p = 0; + *mlen_p = 0U; } - if (clen < crypto_aead_aes256gcm_ABYTES) { - return -1; - } - mlen = clen - crypto_aead_aes256gcm_ABYTES; - memset(ivc_block, 0, sizeof ivc_block); - memcpy(ivc_block, npub, crypto_aead_aes256gcm_NPUBBYTES); - ivc_block[AES_BLOCKSIZE - 1U] = 1U; + memcpy(&n2[0], npub, 12); + *(uint32_t *) &n2[12] = 0x01000000; + aesni_encrypt1(T, _mm_load_si128((const __m128i *) n2), rkeys); - (void) sizeof(int[(sizeof *ctx_) >= (sizeof ctx) ? 1 : -1]); - memcpy(&ctx, ctx_, sizeof ctx); - _gmac_update(&ctx, ad, adlen); - _gmac_update(&ctx, c, mlen); - _u64_be_from_ull(mac, adlen * 8ULL); - _u64_be_from_ull(mac + 8U, mlen * 8ULL); - _gmac_update(&ctx, mac, GMAC_BLOCKSIZE); - _gmac_final(&ctx, mac, ivc_block, ctx.state); - if (sodium_memcmp(c + mlen, mac, crypto_aead_aes256gcm_ABYTES) != 0) { - sodium_memzero(&ctx, sizeof ctx); - return -1; + (*(uint64_t *) &fb[0]) = _bswap64((uint64_t)(8 * adlen)); + (*(uint64_t *) &fb[8]) = _bswap64((uint64_t)(8 * mlen)); + + Hv = _mm_shuffle_epi8(_mm_load_si128((const __m128i *) H), rev); + _mm_store_si128((__m128i *) H, Hv); + H2v = mulv(Hv, Hv); + H3v = mulv(H2v, Hv); + H4v = mulv(H3v, Hv); + + accv = _mm_setzero_si128(); + for (i = 0; i < adlen_rnd64; i += 64) { + __m128i X4 = _mm_loadu_si128((const __m128i *) (ad + i + 0)); + __m128i X3 = _mm_loadu_si128((const __m128i *) (ad + i + 16)); + __m128i X2 = _mm_loadu_si128((const __m128i *) (ad + i + 32)); + __m128i X1 = _mm_loadu_si128((const __m128i *) (ad + i + 48)); + accv = reduce4(Hv, H2v, H3v, H4v, X1, X2, X3, X4, accv); } - _aes_ctr(&ctx, m, c, mlen, ivc_block); - sodium_memzero(&ctx, sizeof ctx); + _mm_storeu_si128((__m128i *) accum, accv); + + for (i = adlen_rnd64; i < adlen; i += 16) { + unsigned int blocklen = 16; + if (i + (unsigned long long) blocklen > adlen) { + blocklen = (unsigned int) (adlen - i); + } + addmul(accum, ad + i, blocklen, H); + } + + mlen_rnd128 = mlen & ~127ULL; + +#define LOOPACCUMDRND128 \ + { \ + const int iter = 8; \ + const int lb = iter * 16; \ + for (i = 0; i < mlen_rnd128; i += lb) { \ + aesni_addmul8full(c + i, accum, Hv, H2v, H3v, H4v); \ + } \ + } + +#define LOOPDRND128 \ + { \ + const int iter = 8; \ + const int lb = iter * 16; \ + for (i = 0; i < mlen_rnd128; i += lb) { \ + aesni_decrypt8full(m + i, (uint32_t *) n2, rkeys, c + i); \ + } \ + } + +#define LOOPACCUMDRMD128 \ + { \ + const int iter = 8; \ + const int lb = iter * 16; \ + \ + for (i = mlen_rnd128; i < mlen; i += lb) { \ + unsigned long long mj = lb; \ + \ + if ((i + mj) >= mlen) { \ + mj = mlen - i; \ + } \ + for (j = 0; j < mj; j += 16) { \ + unsigned int bl = 16; \ + \ + if (j + (unsigned long long) bl >= mj) { \ + bl = (unsigned 
int) (mj - j); \ + } \ + addmul(accum, c + i + j, bl, H); \ + } \ + } \ + } + +#define LOOPDRMD128 \ + { \ + const int iter = 8; \ + const int lb = iter * 16; \ + \ + for (i = mlen_rnd128; i < mlen; i += lb) { \ + CRYPTO_ALIGN(16) unsigned char outni[8 * 16]; \ + unsigned long long mj = lb; \ + \ + if ((i + mj) >= mlen) { \ + mj = mlen - i; \ + } \ + aesni_encrypt8(outni, (uint32_t *) n2, rkeys); \ + for (j = 0; j < mj; j++) { \ + m[i + j] = c[i + j] ^ outni[j]; \ + } \ + } \ + } + n2[15] = 0; + + COUNTER_INC2(n2); + LOOPACCUMDRND128; + LOOPACCUMDRMD128; + addmul(accum, fb, 16, H); + { + unsigned char d = 0; + + for (i = 0; i < 16; i++) { + d |= (c[i + mlen] ^ (T[i] ^ accum[15 - i])); + } + if (d != 0) { + return -1; + } + } + *(uint32_t *) &n2[12] = 0; + COUNTER_INC2(n2); + LOOPDRND128; + LOOPDRMD128; + if (mlen_p != NULL) { *mlen_p = mlen; } @@ -463,8 +743,7 @@ crypto_aead_aes256gcm_aesni_encrypt(unsigned char *c, crypto_aead_aes256gcm_aesni_beforenm(&ctx, k); return crypto_aead_aes256gcm_aesni_encrypt_afternm - (c, clen_p, m, mlen, ad, adlen, nsec, npub, - (const crypto_aead_aes256gcm_aesni_state *) &ctx); + (c, clen_p, m, mlen, ad, adlen, nsec, npub, &ctx); } int @@ -480,9 +759,38 @@ crypto_aead_aes256gcm_aesni_decrypt(unsigned char *m, { crypto_aead_aes256gcm_aesni_state ctx; - crypto_aead_aes256gcm_aesni_beforenm(&ctx, k); + crypto_aead_aes256gcm_aesni_beforenm((crypto_aead_aes256gcm_aesni_state *) + &ctx, k); return crypto_aead_aes256gcm_aesni_decrypt_afternm - (m, mlen_p, nsec, c, clen, ad, adlen, npub, - (const crypto_aead_aes256gcm_aesni_state *) &ctx); + (m, mlen_p, nsec, c, clen, ad, adlen, npub, &ctx); +} + +size_t +crypto_aead_aes256gcm_aesni_keybytes(void) +{ + return crypto_aead_aes256gcm_KEYBYTES; +} + +size_t +crypto_aead_aes256gcm_aesni_nsecbytes(void) +{ + return crypto_aead_aes256gcm_NSECBYTES; +} + +size_t crypto_aead_aes256gcm_aesni_npubbytes(void) +{ + return crypto_aead_aes256gcm_NPUBBYTES; +} + +size_t crypto_aead_aes256gcm_aesni_abytes(void) +{ + return crypto_aead_aes256gcm_ABYTES; +} + +size_t crypto_aead_aes256gcm_aesni_statebytes(void) +{ + (void) sizeof(int[(sizeof(crypto_aead_aes256gcm_aesni_state) >= + sizeof(context)) ? 
1 : -1]); + return sizeof(crypto_aead_aes256gcm_aesni_state); } diff --git a/src/libsodium/include/sodium/crypto_aead_aes256gcm_aesni.h b/src/libsodium/include/sodium/crypto_aead_aes256gcm_aesni.h index d6ba98ec..53c02abd 100644 --- a/src/libsodium/include/sodium/crypto_aead_aes256gcm_aesni.h +++ b/src/libsodium/include/sodium/crypto_aead_aes256gcm_aesni.h @@ -12,11 +12,24 @@ extern "C" { #endif #define crypto_aead_aes256gcm_KEYBYTES 32U -#define crypto_aead_aes256gcm_NSECBYTES 0U -#define crypto_aead_aes256gcm_NPUBBYTES 12U -#define crypto_aead_aes256gcm_ABYTES 16U +SODIUM_EXPORT +size_t crypto_aead_aes256gcm_aesni_keybytes(void); -typedef CRYPTO_ALIGN(128) unsigned char crypto_aead_aes256gcm_aesni_state[384]; +#define crypto_aead_aes256gcm_NSECBYTES 0U +SODIUM_EXPORT +size_t crypto_aead_aes256gcm_aesni_nsecbytes(void); + +#define crypto_aead_aes256gcm_NPUBBYTES 12U +SODIUM_EXPORT +size_t crypto_aead_aes256gcm_aesni_npubbytes(void); + +#define crypto_aead_aes256gcm_ABYTES 16U +SODIUM_EXPORT +size_t crypto_aead_aes256gcm_aesni_abytes(void); + +typedef CRYPTO_ALIGN(16) unsigned char crypto_aead_aes256gcm_aesni_state[272]; +SODIUM_EXPORT +size_t crypto_aead_aes256gcm_aesni_statebytes(void); SODIUM_EXPORT int crypto_aead_aes256gcm_aesni_encrypt(unsigned char *c, diff --git a/src/libsodium/include/sodium/export.h b/src/libsodium/include/sodium/export.h index aeb6d0e2..89f10fe9 100644 --- a/src/libsodium/include/sodium/export.h +++ b/src/libsodium/include/sodium/export.h @@ -30,7 +30,7 @@ #endif #ifndef CRYPTO_ALIGN -# if defined(_MSC_VER) +# if defined(__INTEL_COMPILER) || defined(_MSC_VER) # define CRYPTO_ALIGN(x) __declspec(align(x)) # else # define CRYPTO_ALIGN(x) __attribute__((aligned(x))) From e83e9b2d8ed5bd115e5129d3d9e75f6587c2115a Mon Sep 17 00:00:00 2001 From: Frank Denis Date: Sat, 10 Oct 2015 17:57:47 +0200 Subject: [PATCH 08/25] Check for AESNI & PCLMUL presence/usability --- configure.ac | 20 +++++++++++--- .../aes256gcm/aesni/aead_aes256gcm_aesni.c | 17 ++++++++++++ src/libsodium/include/sodium/runtime.h | 6 +++++ src/libsodium/sodium/runtime.c | 26 +++++++++++++++++-- 4 files changed, 64 insertions(+), 5 deletions(-) diff --git a/configure.ac b/configure.ac index c2427d5b..d94f7bdb 100644 --- a/configure.ac +++ b/configure.ac @@ -211,9 +211,6 @@ AX_CHECK_COMPILE_FLAG([-Wwrite-strings], [CFLAGS="$CFLAGS -Wwrite-strings"]) AX_CHECK_COMPILE_FLAG([-Wdiv-by-zero], [CFLAGS="$CFLAGS -Wdiv-by-zero"]) AX_CHECK_COMPILE_FLAG([-Wsometimes-uninitialized], [CFLAGS="$CFLAGS -Wsometimes-uninitialized"]) -AC_MSG_CHECKING([Checking if we can compile for westmere]) -AX_CHECK_COMPILE_FLAG([-march=westmere $CFLAGS], [CFLAGS="-march=westmere $CFLAGS"]) - AC_ARG_VAR([CWFLAGS], [define to compilation flags for generating extra warnings]) AX_CHECK_COMPILE_FLAG([$CWFLAGS -Wall], [CWFLAGS="$CWFLAGS -Wall"]) @@ -301,6 +298,23 @@ AS_IF([test "x$EMSCRIPTEN" = "x"],[ [AC_MSG_RESULT(yes) AC_DEFINE([HAVE_TMMINTRIN_H], [1], [ssse3 is available])], [AC_MSG_RESULT(no)]) + + AC_MSG_CHECKING(for AESNI instructions set and PCLMULQDQ) + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ +#pragma GCC target("aes") +#pragma GCC target("pclmul") +#ifndef __AES__ +# define __AES__ +#endif +#ifndef __PCLMUL__ +# define __PCLMUL__ +#endif +#include +]], [[ __m128i x = _mm_aesimc_si128(_mm_setzero_si128()); + __m128i y = _mm_clmulepi64_si128(_mm_setzero_si128(), _mm_setzero_si128(), 0);]])], + [AC_MSG_RESULT(yes) + AC_DEFINE([HAVE_WMMINTRIN_H], [1], [aesni is available])], + [AC_MSG_RESULT(no)]) ]) AC_CHECK_HEADERS([sys/mman.h]) diff 
--git a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c index 4643b0ba..e17c78c0 100644 --- a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c +++ b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c @@ -3,6 +3,23 @@ * AES256-GCM, based on original code by Romain Dolbeau */ +#pragma GCC target("sse") +#pragma GCC target("sse2") +#pragma GCC target("ssse3") +#pragma GCC target("sse4.1") +#pragma GCC target("aes") +#pragma GCC target("pclmul") + +#ifndef __SSE4_1__ +# define __SSE4_1__ +#endif +#ifndef __AES__ +# define __AES__ +#endif +#ifndef __PCLMUL__ +# define __PCLMUL__ +#endif + #include #include #include diff --git a/src/libsodium/include/sodium/runtime.h b/src/libsodium/include/sodium/runtime.h index 50226ae1..3bdc4dcd 100644 --- a/src/libsodium/include/sodium/runtime.h +++ b/src/libsodium/include/sodium/runtime.h @@ -20,6 +20,12 @@ int sodium_runtime_has_sse2(void); SODIUM_EXPORT int sodium_runtime_has_sse3(void); +SODIUM_EXPORT +int sodium_runtime_has_pclmul(void); + +SODIUM_EXPORT +int sodium_runtime_has_aesni(void); + #ifdef __cplusplus } #endif diff --git a/src/libsodium/sodium/runtime.c b/src/libsodium/sodium/runtime.c index 93b07932..2cf915c8 100644 --- a/src/libsodium/sodium/runtime.c +++ b/src/libsodium/sodium/runtime.c @@ -10,12 +10,16 @@ typedef struct CPUFeatures_ { int has_neon; int has_sse2; int has_sse3; + int has_pclmul; + int has_aesni; } CPUFeatures; static CPUFeatures _cpu_features; -#define CPUID_SSE2 0x04000000 -#define CPUIDECX_SSE3 0x00000001 +#define CPUID_SSE2 0x04000000 +#define CPUIDECX_SSE3 0x00000001 +#define CPUIDECX_PCLMUL 0x00000002 +#define CPUIDECX_AESNI 0x02000000 static int _sodium_runtime_arm_cpu_features(CPUFeatures * const cpu_features) @@ -104,6 +108,14 @@ _sodium_runtime_intel_cpu_features(CPUFeatures * const cpu_features) cpu_features->has_sse3 = ((cpu_info[2] & CPUIDECX_SSE3) != 0x0); #endif +#ifndef HAVE_WMMINTRIN_H + cpu_features->has_pclmul = 0; + cpu_features->has_aesni = 0; +#else + cpu_features->has_pclmul = ((cpu_info[2] & CPUIDECX_PCLMUL) != 0x0); + cpu_features->has_aesni = ((cpu_info[2] & CPUIDECX_AESNI) != 0x0); +#endif + return 0; } @@ -133,3 +145,13 @@ int sodium_runtime_has_sse3(void) { return _cpu_features.has_sse3; } + +int +sodium_runtime_has_pclmul(void) { + return _cpu_features.has_pclmul; +} + +int +sodium_runtime_has_aesni(void) { + return _cpu_features.has_aesni; +} From 6ca06314fcee3ff25d42500581e6f010974473f2 Mon Sep 17 00:00:00 2001 From: Frank Denis Date: Sat, 10 Oct 2015 18:21:33 +0200 Subject: [PATCH 09/25] Do not try to compile aesni code if this is not going to compile --- configure.ac | 5 ++++- .../aes256gcm/aesni/aead_aes256gcm_aesni.c | 22 ++++++++++--------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/configure.ac b/configure.ac index d94f7bdb..548b1a08 100644 --- a/configure.ac +++ b/configure.ac @@ -313,7 +313,10 @@ AS_IF([test "x$EMSCRIPTEN" = "x"],[ ]], [[ __m128i x = _mm_aesimc_si128(_mm_setzero_si128()); __m128i y = _mm_clmulepi64_si128(_mm_setzero_si128(), _mm_setzero_si128(), 0);]])], [AC_MSG_RESULT(yes) - AC_DEFINE([HAVE_WMMINTRIN_H], [1], [aesni is available])], + AC_DEFINE([HAVE_WMMINTRIN_H], [1], [aesni is available]) + AX_CHECK_COMPILE_FLAG([-maes], [CFLAGS="$CFLAGS -maes"]) + AX_CHECK_COMPILE_FLAG([-mpclmul], [CFLAGS="$CFLAGS -mpclmul"]) + ], [AC_MSG_RESULT(no)]) ]) diff --git a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c 
b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c index e17c78c0..bc1780f1 100644 --- a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c +++ b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c @@ -3,6 +3,16 @@ * AES256-GCM, based on original code by Romain Dolbeau */ +#include +#include +#include + +#include "crypto_aead_aes256gcm_aesni.h" +#include "export.h" +#include "utils.h" + +#ifdef HAVE_WMMINTRIN_H + #pragma GCC target("sse") #pragma GCC target("sse2") #pragma GCC target("ssse3") @@ -19,15 +29,7 @@ #ifndef __PCLMUL__ # define __PCLMUL__ #endif - #include -#include -#include -#include - -#include "crypto_aead_aes256gcm_aesni.h" -#include "export.h" -#include "utils.h" #if defined(__INTEL_COMPILER) || defined(_bswap64) #elif defined(_MSC_VER) @@ -807,7 +809,7 @@ size_t crypto_aead_aes256gcm_aesni_abytes(void) size_t crypto_aead_aes256gcm_aesni_statebytes(void) { - (void) sizeof(int[(sizeof(crypto_aead_aes256gcm_aesni_state) >= - sizeof(context)) ? 1 : -1]); return sizeof(crypto_aead_aes256gcm_aesni_state); } + +#endif From d4ff80e7a09d7c6ac272ed217aea96b56cece58d Mon Sep 17 00:00:00 2001 From: Frank Denis Date: Sat, 10 Oct 2015 18:32:10 +0200 Subject: [PATCH 10/25] Define __SSSE3__ if required --- configure.ac | 12 ++++++++++++ .../aes256gcm/aesni/aead_aes256gcm_aesni.c | 3 +++ 2 files changed, 15 insertions(+) diff --git a/configure.ac b/configure.ac index 548b1a08..a307c135 100644 --- a/configure.ac +++ b/configure.ac @@ -265,6 +265,9 @@ AS_IF([test "x$EMSCRIPTEN" = "x"],[ AC_MSG_CHECKING(for MMX instructions set) AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ #pragma GCC target("mmx") +#ifndef __MMX__ +# define __MMX__ +#endif #include ]], [[ __m64 x = _mm_setzero_si64(); ]])], [AC_MSG_RESULT(yes) @@ -274,6 +277,9 @@ AS_IF([test "x$EMSCRIPTEN" = "x"],[ AC_MSG_CHECKING(for SSE2 instructions set) AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ #pragma GCC target("sse2") +#ifndef __SSE2__ +# define __SSE2__ +#endif #include ]], [[ __m128d x = _mm_setzero_pd(); ]])], [AC_MSG_RESULT(yes) @@ -283,6 +289,9 @@ AS_IF([test "x$EMSCRIPTEN" = "x"],[ AC_MSG_CHECKING(for SSE3 instructions set) AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ #pragma GCC target("sse3") +#ifndef __SSE3__ +# define __SSE3__ +#endif #include ]], [[ __m128 x = _mm_addsub_ps(_mm_cvtpd_ps(_mm_setzero_pd()), _mm_cvtpd_ps(_mm_setzero_pd())); ]])], @@ -293,6 +302,9 @@ AS_IF([test "x$EMSCRIPTEN" = "x"],[ AC_MSG_CHECKING(for SSSE3 instructions set) AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ #pragma GCC target("ssse3") +#ifndef __SSSE3__ +# define __SSSE3__ +#endif #include ]], [[ __m64 x = _mm_abs_pi32(_m_from_int(0)); ]])], [AC_MSG_RESULT(yes) diff --git a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c index bc1780f1..37beab47 100644 --- a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c +++ b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c @@ -20,6 +20,9 @@ #pragma GCC target("aes") #pragma GCC target("pclmul") +#ifndef __SSSE3__ +# define __SSSE3__ +#endif #ifndef __SSE4_1__ # define __SSE4_1__ #endif From f267352eecba63a75b7958639a737a0ba9f0e43e Mon Sep 17 00:00:00 2001 From: Frank Denis Date: Sat, 10 Oct 2015 19:23:39 +0200 Subject: [PATCH 11/25] Use SIMD-specific compiler flags only for files needing them --- configure.ac | 37 ++++++++++++++++--- src/libsodium/Makefile.am | 19 +++++++++- .../aes256gcm/aesni/aead_aes256gcm_aesni.c | 15 -------- 3 files changed, 48 insertions(+), 23 
deletions(-) diff --git a/configure.ac b/configure.ac index a307c135..3c838b92 100644 --- a/configure.ac +++ b/configure.ac @@ -271,7 +271,8 @@ AS_IF([test "x$EMSCRIPTEN" = "x"],[ #include ]], [[ __m64 x = _mm_setzero_si64(); ]])], [AC_MSG_RESULT(yes) - AC_DEFINE([HAVE_MMINTRIN_H], [1], [mmx is available])], + AC_DEFINE([HAVE_MMINTRIN_H], [1], [mmx is available]) + AX_CHECK_COMPILE_FLAG([-mmmx], [CFLAGS_MMX="-mmmx"])], [AC_MSG_RESULT(no)]) AC_MSG_CHECKING(for SSE2 instructions set) @@ -283,7 +284,8 @@ AS_IF([test "x$EMSCRIPTEN" = "x"],[ #include ]], [[ __m128d x = _mm_setzero_pd(); ]])], [AC_MSG_RESULT(yes) - AC_DEFINE([HAVE_EMMINTRIN_H], [1], [sse2 is available])], + AC_DEFINE([HAVE_EMMINTRIN_H], [1], [sse2 is available]) + AX_CHECK_COMPILE_FLAG([-msse2], [CFLAGS_SSE2="-msse2"])], [AC_MSG_RESULT(no)]) AC_MSG_CHECKING(for SSE3 instructions set) @@ -296,7 +298,8 @@ AS_IF([test "x$EMSCRIPTEN" = "x"],[ ]], [[ __m128 x = _mm_addsub_ps(_mm_cvtpd_ps(_mm_setzero_pd()), _mm_cvtpd_ps(_mm_setzero_pd())); ]])], [AC_MSG_RESULT(yes) - AC_DEFINE([HAVE_PMMINTRIN_H], [1], [sse3 is available])], + AC_DEFINE([HAVE_PMMINTRIN_H], [1], [sse3 is available]) + AX_CHECK_COMPILE_FLAG([-msse3], [CFLAGS_SSE3="-msse3"])], [AC_MSG_RESULT(no)]) AC_MSG_CHECKING(for SSSE3 instructions set) @@ -308,7 +311,21 @@ AS_IF([test "x$EMSCRIPTEN" = "x"],[ #include ]], [[ __m64 x = _mm_abs_pi32(_m_from_int(0)); ]])], [AC_MSG_RESULT(yes) - AC_DEFINE([HAVE_TMMINTRIN_H], [1], [ssse3 is available])], + AC_DEFINE([HAVE_TMMINTRIN_H], [1], [ssse3 is available]) + AX_CHECK_COMPILE_FLAG([-mssse3], [CFLAGS_SSSE3="-mssse3"])], + [AC_MSG_RESULT(no)]) + + AC_MSG_CHECKING(for SSE4.1 instructions set) + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ +#pragma GCC target("sse4.1") +#ifndef __SSE4_1__ +# define __SSE4_1__ +#endif +#include +]], [[ __m128i x = _mm_minpos_epu16(_mm_setzero_si128()); ]])], + [AC_MSG_RESULT(yes) + AC_DEFINE([HAVE_SMMINTRIN_H], [1], [sse4.1 is available]) + AX_CHECK_COMPILE_FLAG([-msse4.1], [CFLAGS_SSE4_1="-msse4.1"])], [AC_MSG_RESULT(no)]) AC_MSG_CHECKING(for AESNI instructions set and PCLMULQDQ) @@ -326,12 +343,20 @@ AS_IF([test "x$EMSCRIPTEN" = "x"],[ __m128i y = _mm_clmulepi64_si128(_mm_setzero_si128(), _mm_setzero_si128(), 0);]])], [AC_MSG_RESULT(yes) AC_DEFINE([HAVE_WMMINTRIN_H], [1], [aesni is available]) - AX_CHECK_COMPILE_FLAG([-maes], [CFLAGS="$CFLAGS -maes"]) - AX_CHECK_COMPILE_FLAG([-mpclmul], [CFLAGS="$CFLAGS -mpclmul"]) + AX_CHECK_COMPILE_FLAG([-maes], [CFLAGS_AESNI="-maes"]) + AX_CHECK_COMPILE_FLAG([-mpclmul], [CFLAGS_PCLMUL="-mpclmul"]) ], [AC_MSG_RESULT(no)]) ]) +AC_SUBST(CFLAGS_MMX) +AC_SUBST(CFLAGS_SSE2) +AC_SUBST(CFLAGS_SSE3) +AC_SUBST(CFLAGS_SSSE3) +AC_SUBST(CFLAGS_SSE4_1) +AC_SUBST(CFLAGS_AESNI) +AC_SUBST(CFLAGS_PCLMUL) + AC_CHECK_HEADERS([sys/mman.h]) dnl Checks for typedefs, structures, and compiler characteristics. 
diff --git a/src/libsodium/Makefile.am b/src/libsodium/Makefile.am index c7748234..201c0bd3 100644 --- a/src/libsodium/Makefile.am +++ b/src/libsodium/Makefile.am @@ -2,7 +2,6 @@ lib_LTLIBRARIES = \ libsodium.la libsodium_la_SOURCES = \ - crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c \ crypto_aead/chacha20poly1305/sodium/aead_chacha20poly1305.c \ crypto_auth/crypto_auth.c \ crypto_auth/hmacsha256/auth_hmacsha256_api.c \ @@ -63,7 +62,6 @@ libsodium_la_SOURCES = \ crypto_pwhash/scryptsalsa208sha256/pwhash_scryptsalsa208sha256.c \ crypto_pwhash/scryptsalsa208sha256/sysendian.h \ crypto_pwhash/scryptsalsa208sha256/nosse/pwhash_scryptsalsa208sha256_nosse.c \ - crypto_pwhash/scryptsalsa208sha256/sse/pwhash_scryptsalsa208sha256_sse.c \ crypto_scalarmult/crypto_scalarmult.c \ crypto_scalarmult/curve25519/scalarmult_curve25519_api.c \ crypto_secretbox/crypto_secretbox.c \ @@ -269,3 +267,20 @@ endif SUBDIRS = \ include + +libsodium_la_LIBADD = libaesni.la libsse2.la +noinst_LTLIBRARIES = libaesni.la libsse2.la + +libaesni_la_LDFLAGS = $(libsodium_la_LDFLAGS) +libaesni_la_CPPFLAGS = $(libsodium_la_CPPFLAGS) \ + @CFLAGS_SSE4_1@ @CFLAGS_AESNI@ @CFLAGS_PCLMUL@ + +libaesni_la_SOURCES = \ + crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c + +libsse2_la_LDFLAGS = $(libsodium_la_LDFLAGS) +libsse2_la_CPPFLAGS = $(libsodium_la_CPPFLAGS) \ + @CFLAGS_SSE2@ + +libsse2_la_SOURCES = \ + crypto_pwhash/scryptsalsa208sha256/sse/pwhash_scryptsalsa208sha256_sse.c diff --git a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c index 37beab47..5cb17fbc 100644 --- a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c +++ b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c @@ -13,25 +13,10 @@ #ifdef HAVE_WMMINTRIN_H -#pragma GCC target("sse") -#pragma GCC target("sse2") -#pragma GCC target("ssse3") #pragma GCC target("sse4.1") #pragma GCC target("aes") #pragma GCC target("pclmul") -#ifndef __SSSE3__ -# define __SSSE3__ -#endif -#ifndef __SSE4_1__ -# define __SSE4_1__ -#endif -#ifndef __AES__ -# define __AES__ -#endif -#ifndef __PCLMUL__ -# define __PCLMUL__ -#endif #include #if defined(__INTEL_COMPILER) || defined(_bswap64) From c3195da04dded97ce6447a7c9ffc66c07ecc95e9 Mon Sep 17 00:00:00 2001 From: Frank Denis Date: Sat, 10 Oct 2015 19:40:29 +0200 Subject: [PATCH 12/25] ssse3 target is required in addition to sse4.1 --- src/libsodium/Makefile.am | 2 +- .../crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/libsodium/Makefile.am b/src/libsodium/Makefile.am index 201c0bd3..be775006 100644 --- a/src/libsodium/Makefile.am +++ b/src/libsodium/Makefile.am @@ -273,7 +273,7 @@ noinst_LTLIBRARIES = libaesni.la libsse2.la libaesni_la_LDFLAGS = $(libsodium_la_LDFLAGS) libaesni_la_CPPFLAGS = $(libsodium_la_CPPFLAGS) \ - @CFLAGS_SSE4_1@ @CFLAGS_AESNI@ @CFLAGS_PCLMUL@ + @CFLAGS_SSSE3@ @CFLAGS_SSE4_1@ @CFLAGS_AESNI@ @CFLAGS_PCLMUL@ libaesni_la_SOURCES = \ crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c diff --git a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c index 5cb17fbc..05b49459 100644 --- a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c +++ b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c @@ -13,6 +13,7 @@ #ifdef HAVE_WMMINTRIN_H +#pragma GCC target("ssse3") #pragma GCC target("sse4.1") #pragma GCC target("aes") #pragma GCC 
target("pclmul") From 84d92fc1bf99ec1549c4db5b11093215979b8086 Mon Sep 17 00:00:00 2001 From: Frank Denis Date: Sat, 10 Oct 2015 20:10:26 +0200 Subject: [PATCH 13/25] Try to enable specific cflags before testing each intructions set --- configure.ac | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/configure.ac b/configure.ac index 3c838b92..91708583 100644 --- a/configure.ac +++ b/configure.ac @@ -181,7 +181,7 @@ LIBTOOL_EXTRA_FLAGS="$LIBTOOL_EXTRA_FLAGS -version-info $SODIUM_LIBRARY_VERSION" AC_ARG_ENABLE(soname-versions, [AC_HELP_STRING([--enable-soname-versions], [enable soname versions (must be disabled for Android) (default: enabled)])], [ - AS_IF([test "x$enableval" = "xno"], [ + AS_IF([test "x$enableval" = "xno"], [ LIBTOOL_EXTRA_FLAGS="$LIBTOOL_OLD_FLAGS -avoid-version" ]) ] @@ -263,19 +263,21 @@ dnl Checks for headers AS_IF([test "x$EMSCRIPTEN" = "x"],[ AC_MSG_CHECKING(for MMX instructions set) + oldcflags="$CFLAGS" + AX_CHECK_COMPILE_FLAG([-mmmx], [CFLAGS="$CFLAGS -mmmx"]) AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ #pragma GCC target("mmx") -#ifndef __MMX__ -# define __MMX__ -#endif #include ]], [[ __m64 x = _mm_setzero_si64(); ]])], [AC_MSG_RESULT(yes) AC_DEFINE([HAVE_MMINTRIN_H], [1], [mmx is available]) AX_CHECK_COMPILE_FLAG([-mmmx], [CFLAGS_MMX="-mmmx"])], [AC_MSG_RESULT(no)]) + CFLAGS="$oldcflags" AC_MSG_CHECKING(for SSE2 instructions set) + oldcflags="$CFLAGS" + AX_CHECK_COMPILE_FLAG([-msse2], [CFLAGS="$CFLAGS -msse2"]) AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ #pragma GCC target("sse2") #ifndef __SSE2__ @@ -287,13 +289,13 @@ AS_IF([test "x$EMSCRIPTEN" = "x"],[ AC_DEFINE([HAVE_EMMINTRIN_H], [1], [sse2 is available]) AX_CHECK_COMPILE_FLAG([-msse2], [CFLAGS_SSE2="-msse2"])], [AC_MSG_RESULT(no)]) + CFLAGS="$oldcflags" + oldcflags="$CFLAGS" + AX_CHECK_COMPILE_FLAG([-msse3], [CFLAGS="$CFLAGS -msse3"]) AC_MSG_CHECKING(for SSE3 instructions set) AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ #pragma GCC target("sse3") -#ifndef __SSE3__ -# define __SSE3__ -#endif #include ]], [[ __m128 x = _mm_addsub_ps(_mm_cvtpd_ps(_mm_setzero_pd()), _mm_cvtpd_ps(_mm_setzero_pd())); ]])], @@ -301,43 +303,41 @@ AS_IF([test "x$EMSCRIPTEN" = "x"],[ AC_DEFINE([HAVE_PMMINTRIN_H], [1], [sse3 is available]) AX_CHECK_COMPILE_FLAG([-msse3], [CFLAGS_SSE3="-msse3"])], [AC_MSG_RESULT(no)]) + CFLAGS="$oldcflags" + oldcflags="$CFLAGS" + AX_CHECK_COMPILE_FLAG([-mssse3], [CFLAGS="$CFLAGS -mssse3"]) AC_MSG_CHECKING(for SSSE3 instructions set) AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ #pragma GCC target("ssse3") -#ifndef __SSSE3__ -# define __SSSE3__ -#endif #include ]], [[ __m64 x = _mm_abs_pi32(_m_from_int(0)); ]])], [AC_MSG_RESULT(yes) AC_DEFINE([HAVE_TMMINTRIN_H], [1], [ssse3 is available]) AX_CHECK_COMPILE_FLAG([-mssse3], [CFLAGS_SSSE3="-mssse3"])], [AC_MSG_RESULT(no)]) + CFLAGS="$oldcflags" + oldcflags="$CFLAGS" + AX_CHECK_COMPILE_FLAG([-msse4.1], [CFLAGS="$CFLAGS -msse4.1"]) AC_MSG_CHECKING(for SSE4.1 instructions set) AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ #pragma GCC target("sse4.1") -#ifndef __SSE4_1__ -# define __SSE4_1__ -#endif #include ]], [[ __m128i x = _mm_minpos_epu16(_mm_setzero_si128()); ]])], [AC_MSG_RESULT(yes) AC_DEFINE([HAVE_SMMINTRIN_H], [1], [sse4.1 is available]) AX_CHECK_COMPILE_FLAG([-msse4.1], [CFLAGS_SSE4_1="-msse4.1"])], [AC_MSG_RESULT(no)]) + CFLAGS="$oldcflags" + oldcflags="$CFLAGS" + AX_CHECK_COMPILE_FLAG([-maes], [CFLAGS="$CFLAGS -maes"]) + AX_CHECK_COMPILE_FLAG([-mpclmul], [CFLAGS="$CFLAGS -mpclmul"]) AC_MSG_CHECKING(for AESNI instructions set 
and PCLMULQDQ) AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ #pragma GCC target("aes") #pragma GCC target("pclmul") -#ifndef __AES__ -# define __AES__ -#endif -#ifndef __PCLMUL__ -# define __PCLMUL__ -#endif #include ]], [[ __m128i x = _mm_aesimc_si128(_mm_setzero_si128()); __m128i y = _mm_clmulepi64_si128(_mm_setzero_si128(), _mm_setzero_si128(), 0);]])], @@ -347,6 +347,8 @@ AS_IF([test "x$EMSCRIPTEN" = "x"],[ AX_CHECK_COMPILE_FLAG([-mpclmul], [CFLAGS_PCLMUL="-mpclmul"]) ], [AC_MSG_RESULT(no)]) + CFLAGS="$oldcflags" + ]) AC_SUBST(CFLAGS_MMX) From fad86b2fe957ad39391767237e35aa8df6bc0da1 Mon Sep 17 00:00:00 2001 From: Frank Denis Date: Sat, 10 Oct 2015 20:15:35 +0200 Subject: [PATCH 14/25] Let's hope that requiring ssse3 is not required any more --- src/libsodium/Makefile.am | 2 +- .../crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/libsodium/Makefile.am b/src/libsodium/Makefile.am index be775006..201c0bd3 100644 --- a/src/libsodium/Makefile.am +++ b/src/libsodium/Makefile.am @@ -273,7 +273,7 @@ noinst_LTLIBRARIES = libaesni.la libsse2.la libaesni_la_LDFLAGS = $(libsodium_la_LDFLAGS) libaesni_la_CPPFLAGS = $(libsodium_la_CPPFLAGS) \ - @CFLAGS_SSSE3@ @CFLAGS_SSE4_1@ @CFLAGS_AESNI@ @CFLAGS_PCLMUL@ + @CFLAGS_SSE4_1@ @CFLAGS_AESNI@ @CFLAGS_PCLMUL@ libaesni_la_SOURCES = \ crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c diff --git a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c index 05b49459..5cb17fbc 100644 --- a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c +++ b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c @@ -13,7 +13,6 @@ #ifdef HAVE_WMMINTRIN_H -#pragma GCC target("ssse3") #pragma GCC target("sse4.1") #pragma GCC target("aes") #pragma GCC target("pclmul") From 78002f8ca7b0db1897d5394f1e164ee4c93121e1 Mon Sep 17 00:00:00 2001 From: Frank Denis Date: Sat, 10 Oct 2015 20:57:46 +0200 Subject: [PATCH 15/25] Proper casts for aeskeygenassist() --- .../crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c index 5cb17fbc..441fa972 100644 --- a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c +++ b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c @@ -65,7 +65,7 @@ aesni_key256_expand(const unsigned char *key, __m128 rkeys[16]) This is all strongly inspired by the openssl assembly code. */ #define BLOCK1(IMM) \ - temp1 = (__m128)_mm_aeskeygenassist_si128((__m128i) temp2, IMM);\ + temp1 = _mm_castsi128_ps(_mm_aeskeygenassist_si128(_mm_castps_si128(temp2), IMM));\ rkeys[idx++] = temp2; \ temp4 = _mm_shuffle_ps(temp4, temp0, 0x10); \ temp0 = _mm_xor_ps(temp0, temp4); \ @@ -75,7 +75,7 @@ aesni_key256_expand(const unsigned char *key, __m128 rkeys[16]) temp0 = _mm_xor_ps(temp0, temp1) #define BLOCK2(IMM) \ - temp1 = (__m128)_mm_aeskeygenassist_si128((__m128i) temp0, IMM);\ + temp1 = _mm_castsi128_ps(_mm_aeskeygenassist_si128(_mm_castps_si128(temp0), IMM));\ rkeys[idx++] = temp0; \ temp4 = _mm_shuffle_ps(temp4, temp2, 0x10); \ temp2 = _mm_xor_ps(temp2, temp4); \ @@ -463,7 +463,7 @@ crypto_aead_aes256gcm_aesni_beforenm(crypto_aead_aes256gcm_aesni_state *ctx_, unsigned char *H = ctx->H; (void) sizeof(int[(sizeof *ctx_) >= (sizeof *ctx) ? 
1 : -1]); - aesni_key256_expand(k, (__m128*) rkeys); + aesni_key256_expand(k, (__m128 *) rkeys); aesni_encrypt1(H, zero, rkeys); return 0; From 9055a140f31e9e2f18ed88a0d318356766389061 Mon Sep 17 00:00:00 2001 From: Frank Denis Date: Sat, 10 Oct 2015 21:07:07 +0200 Subject: [PATCH 16/25] Declare __m128 arrays used as parameters as pointers Required for MSVC --- .../crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c index 441fa972..f5e5e5ea 100644 --- a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c +++ b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c @@ -42,7 +42,7 @@ typedef struct context { } context; static inline void -aesni_key256_expand(const unsigned char *key, __m128 rkeys[16]) +aesni_key256_expand(const unsigned char *key, __m128 *rkeys) { __m128 key0 = _mm_loadu_ps((const float *) (key + 0)); __m128 key1 = _mm_loadu_ps((const float *) (key + 16)); @@ -108,7 +108,7 @@ aesni_key256_expand(const unsigned char *key, __m128 rkeys[16]) /** single, by-the-book AES encryption with AES-NI */ static inline void -aesni_encrypt1(unsigned char *out, __m128i nv, const __m128i rkeys[16]) +aesni_encrypt1(unsigned char *out, __m128i nv, const __m128i *rkeys) { __m128i temp = _mm_xor_si128(nv, rkeys[0]); int i; @@ -170,7 +170,7 @@ aesni_encrypt1(unsigned char *out, __m128i nv, const __m128i rkeys[16]) /* create a function of unrolling N ; the MAKEN is the unrolling macro, defined above. The N in MAKEN must match N, obviously. */ #define FUNC(N, MAKEN) \ - static inline void aesni_encrypt##N(unsigned char *out, uint32_t *n, const __m128i rkeys[16]) \ + static inline void aesni_encrypt##N(unsigned char *out, uint32_t *n, const __m128i *rkeys) \ { \ const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ int i; \ @@ -398,7 +398,7 @@ reduce4(__m128i H0, __m128i H1, __m128i H2, __m128i H3, __m128i X0, __m128i X1, /* full encrypt & checksum 8 blocks at once */ static inline void -aesni_encrypt8full(unsigned char *out, uint32_t *n, const __m128i rkeys[16], +aesni_encrypt8full(unsigned char *out, uint32_t *n, const __m128i *rkeys, const unsigned char *in, unsigned char *accum, const __m128i hv, const __m128i h2v, const __m128i h3v, const __m128i h4v) @@ -436,7 +436,7 @@ aesni_addmul8full(const unsigned char *in, unsigned char *accum, /* decrypt 8 blocks at once */ static inline void -aesni_decrypt8full(unsigned char *out, uint32_t *n, const __m128i rkeys[16], +aesni_decrypt8full(unsigned char *out, uint32_t *n, const __m128i *rkeys, const unsigned char *in) { const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); From 30729b0add30b611df58ab0eebc56ee73364311b Mon Sep 17 00:00:00 2001 From: Frank Denis Date: Sat, 10 Oct 2015 21:57:04 +0200 Subject: [PATCH 17/25] Don't declare new variables after a line of code --- .../aes256gcm/aesni/aead_aes256gcm_aesni.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c index f5e5e5ea..c042351e 100644 --- a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c +++ b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c @@ -128,13 +128,19 @@ aesni_encrypt1(unsigned char *out, 
__m128i nv, const __m128i rkeys[16])
+aesni_encrypt1(unsigned char *out, __m128i nv, const __m128i *rkeys)
 {
     __m128i temp = _mm_xor_si128(nv, rkeys[0]);
     int i;
@@ -170,7 +170,7 @@ aesni_encrypt1(unsigned char *out, __m128i nv, const __m128i *rkeys)
 /* create a function of unrolling N ; the MAKEN is the unrolling macro,
    defined above. The N in MAKEN must match N, obviously. */
 #define FUNC(N, MAKEN) \
-    static inline void aesni_encrypt##N(unsigned char *out, uint32_t *n, const __m128i rkeys[16]) \
+    static inline void aesni_encrypt##N(unsigned char *out, uint32_t *n, const __m128i *rkeys) \
     { \
         const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
         int i; \
@@ -398,7 +398,7 @@ reduce4(__m128i H0, __m128i H1, __m128i H2, __m128i H3, __m128i X0, __m128i X1,
 
 /* full encrypt & checksum 8 blocks at once */
 static inline void
-aesni_encrypt8full(unsigned char *out, uint32_t *n, const __m128i rkeys[16],
+aesni_encrypt8full(unsigned char *out, uint32_t *n, const __m128i *rkeys,
                    const unsigned char *in, unsigned char *accum,
                    const __m128i hv, const __m128i h2v, const __m128i h3v,
                    const __m128i h4v)
@@ -436,7 +436,7 @@ aesni_addmul8full(const unsigned char *in, unsigned char *accum,
 
 /* decrypt 8 blocks at once */
 static inline void
-aesni_decrypt8full(unsigned char *out, uint32_t *n, const __m128i rkeys[16],
+aesni_decrypt8full(unsigned char *out, uint32_t *n, const __m128i *rkeys,
                    const unsigned char *in)
 {
     const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);

From 30729b0add30b611df58ab0eebc56ee73364311b Mon Sep 17 00:00:00 2001
From: Frank Denis
Date: Sat, 10 Oct 2015 21:57:04 +0200
Subject: [PATCH 17/25] Don't declare new variables after a line of code

---
 .../aes256gcm/aesni/aead_aes256gcm_aesni.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c
index f5e5e5ea..c042351e 100644
--- a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c
+++ b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c
@@ -128,13 +128,19 @@ aesni_encrypt1(unsigned char *out, __m128i nv, const __m128i *rkeys)
    function */
 /* Step 1 : loading the nonce */
 /* load & increment the n vector (non-vectorized, unused for now) */
-#define NVx(a) \
-    __m128i nv##a = _mm_shuffle_epi8(_mm_load_si128((const __m128i *) n), pt); \
+#define NVDECLx(a) \
+    __m128i nv##a
+
+#define NVx(a) \
+    nv##a = _mm_shuffle_epi8(_mm_load_si128((const __m128i *) n), pt); \
     n[3]++
 
 /* Step 2 : define value in round one (xor with subkey #0, aka key) */
+#define TEMPDECLx(a) \
+    __m128i temp##a
+
 #define TEMPx(a) \
-    __m128i temp##a = _mm_xor_si128(nv##a, rkeys[0])
+    temp##a = _mm_xor_si128(nv##a, rkeys[0])
 
 /* Step 3: one round of AES */
 #define AESENCx(a) \
@@ -174,6 +180,8 @@ aesni_encrypt1(unsigned char *out, __m128i nv, const __m128i *rkeys)
     { \
         const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
         int i; \
+        MAKEN(NVDECLx); \
+        MAKEN(TEMPDECLx); \
         \
         MAKEN(NVx); \
         MAKEN(TEMPx); \
@@ -407,6 +415,8 @@ aesni_encrypt8full(unsigned char *out, uint32_t *n, const __m128i *rkeys,
     __m128i accv = _mm_loadu_si128((const __m128i *) accum);
     int i;
 
+    MAKE8(NVDECLx);
+    MAKE8(TEMPDECLx);
     MAKE8(NVx);
     MAKE8(TEMPx);
 #pragma unroll(13)
@@ -442,6 +452,8 @@ aesni_decrypt8full(unsigned char *out, uint32_t *n, const __m128i *rkeys,
     const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
     int i;
 
+    MAKE8(NVDECLx);
+    MAKE8(TEMPDECLx);
     MAKE8(NVx);
     MAKE8(TEMPx);
 #pragma unroll(13)

From d1d833a240ac3a5ed36129a61d4353ae07bc07ce Mon Sep 17 00:00:00 2001
From: Frank Denis
Date: Sat, 10 Oct 2015 23:04:40 +0200
Subject: [PATCH 18/25] Enable aes256gcm on Visual Studio

---
 .../crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c
index c042351e..09c05da1 100644
--- a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c
+++ b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c
@@ -11,7 +11,8 @@
 #include "export.h"
 #include "utils.h"
 
-#ifdef HAVE_WMMINTRIN_H
+#if defined(HAVE_WMMINTRIN_H) || \
+    (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)))
 
 #pragma GCC target("sse4.1")
 #pragma GCC target("aes")

From 7a67bb9484f1acb1b4489c3add4e3962bb9bbbdf Mon Sep 17 00:00:00 2001
From: Frank Denis
Date: Sat, 10 Oct 2015 23:33:34 +0200
Subject: [PATCH 19/25] Turn reduce4 into a macro

That's too many registers for a function call in 32-bit mode.
And in MSVC, this is even the case if the function is marked inline.
---
 .../aes256gcm/aesni/aead_aes256gcm_aesni.c | 186 ++++++++++--------
 1 file changed, 99 insertions(+), 87 deletions(-)

diff --git a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c
index 09c05da1..6c0084e0 100644
--- a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c
+++ b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c
@@ -305,97 +305,106 @@ mulv(__m128i A, __m128i B)
 
 /* 4 multiply-accumulate at once; again
    for the Aggregated Reduction Method & sample code.
-*/
-static inline __m128i
-reduce4(__m128i H0, __m128i H1, __m128i H2, __m128i H3, __m128i X0, __m128i X1,
-        __m128i X2, __m128i X3, __m128i acc)
-{
-/* algorithm by Krzysztof Jankowski, Pierre Laurent - Intel */
+   Algorithm by Krzysztof Jankowski, Pierre Laurent - Intel */
+
 #define RED_DECL(a) __m128i H##a##_X##a##_lo, H##a##_X##a##_hi, tmp##a, tmp##a##B
-    MAKE4(RED_DECL);
-    __m128i lo, hi;
-    __m128i tmp8, tmp9;
-    const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-
-/* byte-reverse the inputs & xor the first one into the accumulator */
 #define RED_SHUFFLE(a) X##a = _mm_shuffle_epi8(X##a, rev)
-    MAKE4(RED_SHUFFLE);
-    X3 = _mm_xor_si128(X3, acc);
-
-/* 4 low H*X (x0*h0) */
 #define RED_MUL_LOW(a) H##a##_X##a##_lo = _mm_clmulepi64_si128(H##a, X##a, 0x00)
-    MAKE4(RED_MUL_LOW);
-    lo = _mm_xor_si128(H0_X0_lo, H1_X1_lo);
-    lo = _mm_xor_si128(lo, H2_X2_lo);
-    lo = _mm_xor_si128(lo, H3_X3_lo);
-
-/* 4 high H*X (x1*h1) */
 #define RED_MUL_HIGH(a) H##a##_X##a##_hi = _mm_clmulepi64_si128(H##a, X##a, 0x11)
-    MAKE4(RED_MUL_HIGH);
-    hi = _mm_xor_si128(H0_X0_hi, H1_X1_hi);
-    hi = _mm_xor_si128(hi, H2_X2_hi);
-    hi = _mm_xor_si128(hi, H3_X3_hi);
-
-/* 4 middle H*X, using Karatsuba, i.e.
-   x1*h0+x0*h1 = (x1+x0)*(h1+h0)-x1*h1-x0*h0
-   we already have all x1*h1 & x0*h0 (accumulated in hi & lo)
-   (0 is low half and 1 is high half)
-   */
-/* permute the high and low 64 bits in H1 & X1,
-   so create (h0,h1) from (h1,h0) and (x0,x1) from (x1,x0),
-   then compute (h0+h1,h1+h0) and (x0+x1,x1+x0),
-   and finally multiply
-   */
 #define RED_MUL_MID(a) \
     tmp##a = _mm_shuffle_epi32(H##a, 0x4e); \
     tmp##a##B = _mm_shuffle_epi32(X##a, 0x4e); \
     tmp##a = _mm_xor_si128(tmp##a, H##a); \
     tmp##a##B = _mm_xor_si128(tmp##a##B, X##a); \
    tmp##a = _mm_clmulepi64_si128(tmp##a, tmp##a##B, 0x00)
-    MAKE4(RED_MUL_MID);
-
-/* subtracts x1*h1 and x0*h0 */
-    tmp0 = _mm_xor_si128(tmp0, lo);
-    tmp0 = _mm_xor_si128(tmp0, hi);
-    tmp0 = _mm_xor_si128(tmp1, tmp0);
-    tmp0 = _mm_xor_si128(tmp2, tmp0);
-    tmp0 = _mm_xor_si128(tmp3, tmp0);
-
-    /* reduction */
-    tmp0B = _mm_slli_si128(tmp0, 8);
-    tmp0 = _mm_srli_si128(tmp0, 8);
-    lo = _mm_xor_si128(tmp0B, lo);
-    hi = _mm_xor_si128(tmp0, hi);
-    tmp3 = lo;
-    tmp2B = hi;
-    tmp3B = _mm_srli_epi32(tmp3, 31);
-    tmp8 = _mm_srli_epi32(tmp2B, 31);
-    tmp3 = _mm_slli_epi32(tmp3, 1);
-    tmp2B = _mm_slli_epi32(tmp2B, 1);
-    tmp9 = _mm_srli_si128(tmp3B, 12);
-    tmp8 = _mm_slli_si128(tmp8, 4);
-    tmp3B = _mm_slli_si128(tmp3B, 4);
-    tmp3 = _mm_or_si128(tmp3, tmp3B);
-    tmp2B = _mm_or_si128(tmp2B, tmp8);
-    tmp2B = _mm_or_si128(tmp2B, tmp9);
-    tmp3B = _mm_slli_epi32(tmp3, 31);
-    tmp8 = _mm_slli_epi32(tmp3, 30);
-    tmp9 = _mm_slli_epi32(tmp3, 25);
-    tmp3B = _mm_xor_si128(tmp3B, tmp8);
-    tmp3B = _mm_xor_si128(tmp3B, tmp9);
-    tmp8 = _mm_srli_si128(tmp3B, 4);
-    tmp3B = _mm_slli_si128(tmp3B, 12);
-    tmp3 = _mm_xor_si128(tmp3, tmp3B);
-    tmp2 = _mm_srli_epi32(tmp3, 1);
-    tmp0B = _mm_srli_epi32(tmp3, 2);
-    tmp1B = _mm_srli_epi32(tmp3, 7);
-    tmp2 = _mm_xor_si128(tmp2, tmp0B);
-    tmp2 = _mm_xor_si128(tmp2, tmp1B);
-    tmp2 = _mm_xor_si128(tmp2, tmp8);
-    tmp3 = _mm_xor_si128(tmp3, tmp2);
-    tmp2B = _mm_xor_si128(tmp2B, tmp3);
-
-    return tmp2B;
+#define REDUCE4(rev, H0_, H1_, H2_, H3_, X0_, X1_, X2_, X3_, acc) \
+{ \
+    MAKE4(RED_DECL); \
+    __m128i lo, hi; \
+    __m128i tmp8, tmp9; \
+    __m128i H0 = H0_; \
+    __m128i H1 = H1_; \
+    __m128i H2 = H2_; \
+    __m128i H3 = H3_; \
+    __m128i X0 = X0_; \
+    __m128i X1 = X1_; \
+    __m128i X2 = X2_; \
+    __m128i X3 = X3_; \
+\
+/* byte-reverse the inputs & xor the first one into the accumulator */ \
+\
+    MAKE4(RED_SHUFFLE); \
+    X3 = _mm_xor_si128(X3, acc); \
+\
+/* 4 low H*X (x0*h0) */ \
+\
+    MAKE4(RED_MUL_LOW); \
+    lo = _mm_xor_si128(H0_X0_lo, H1_X1_lo); \
+    lo = _mm_xor_si128(lo, H2_X2_lo); \
+    lo = _mm_xor_si128(lo, H3_X3_lo); \
+\
+/* 4 high H*X (x1*h1) */ \
+\
+    MAKE4(RED_MUL_HIGH); \
+    hi = _mm_xor_si128(H0_X0_hi, H1_X1_hi); \
+    hi = _mm_xor_si128(hi, H2_X2_hi); \
+    hi = _mm_xor_si128(hi, H3_X3_hi); \
+\
+/* 4 middle H*X, using Karatsuba, i.e. \
+   x1*h0+x0*h1 = (x1+x0)*(h1+h0)-x1*h1-x0*h0 \
+   we already have all x1*h1 & x0*h0 (accumulated in hi & lo) \
+   (0 is low half and 1 is high half) \
+   */ \
+/* permute the high and low 64 bits in H1 & X1, \
+   so create (h0,h1) from (h1,h0) and (x0,x1) from (x1,x0), \
+   then compute (h0+h1,h1+h0) and (x0+x1,x1+x0), \
+   and finally multiply \
+   */ \
+    MAKE4(RED_MUL_MID); \
+\
+/* subtracts x1*h1 and x0*h0 */ \
+    tmp0 = _mm_xor_si128(tmp0, lo); \
+    tmp0 = _mm_xor_si128(tmp0, hi); \
+    tmp0 = _mm_xor_si128(tmp1, tmp0); \
+    tmp0 = _mm_xor_si128(tmp2, tmp0); \
+    tmp0 = _mm_xor_si128(tmp3, tmp0);\
+\
+    /* reduction */ \
+    tmp0B = _mm_slli_si128(tmp0, 8); \
+    tmp0 = _mm_srli_si128(tmp0, 8); \
+    lo = _mm_xor_si128(tmp0B, lo); \
+    hi = _mm_xor_si128(tmp0, hi); \
+    tmp3 = lo; \
+    tmp2B = hi; \
+    tmp3B = _mm_srli_epi32(tmp3, 31); \
+    tmp8 = _mm_srli_epi32(tmp2B, 31); \
+    tmp3 = _mm_slli_epi32(tmp3, 1); \
+    tmp2B = _mm_slli_epi32(tmp2B, 1); \
+    tmp9 = _mm_srli_si128(tmp3B, 12); \
+    tmp8 = _mm_slli_si128(tmp8, 4); \
+    tmp3B = _mm_slli_si128(tmp3B, 4); \
+    tmp3 = _mm_or_si128(tmp3, tmp3B); \
+    tmp2B = _mm_or_si128(tmp2B, tmp8); \
+    tmp2B = _mm_or_si128(tmp2B, tmp9); \
+    tmp3B = _mm_slli_epi32(tmp3, 31); \
+    tmp8 = _mm_slli_epi32(tmp3, 30); \
+    tmp9 = _mm_slli_epi32(tmp3, 25); \
+    tmp3B = _mm_xor_si128(tmp3B, tmp8); \
+    tmp3B = _mm_xor_si128(tmp3B, tmp9); \
+    tmp8 = _mm_srli_si128(tmp3B, 4); \
+    tmp3B = _mm_slli_si128(tmp3B, 12); \
+    tmp3 = _mm_xor_si128(tmp3, tmp3B); \
+    tmp2 = _mm_srli_epi32(tmp3, 1); \
+    tmp0B = _mm_srli_epi32(tmp3, 2); \
+    tmp1B = _mm_srli_epi32(tmp3, 7); \
+    tmp2 = _mm_xor_si128(tmp2, tmp0B); \
+    tmp2 = _mm_xor_si128(tmp2, tmp1B); \
+    tmp2 = _mm_xor_si128(tmp2, tmp8); \
+    tmp3 = _mm_xor_si128(tmp3, tmp2); \
+    tmp2B = _mm_xor_si128(tmp2B, tmp3); \
+\
+    accv = tmp2B; \
 }
 
 #define XORx(a) \
@@ -413,6 +422,7 @@ aesni_encrypt8full(unsigned char *out, uint32_t *n, const __m128i *rkeys,
                    const __m128i h4v)
 {
     const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+    const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
     __m128i accv = _mm_loadu_si128((const __m128i *) accum);
     int i;
 
@@ -427,8 +437,8 @@ aesni_encrypt8full(unsigned char *out, uint32_t *n, const __m128i *rkeys,
     MAKE8(AESENCLASTx);
     MAKE8(XORx);
     MAKE8(STOREx);
-    accv = reduce4(hv, h2v, h3v, h4v, temp3, temp2, temp1, temp0, accv);
-    accv = reduce4(hv, h2v, h3v, h4v, temp7, temp6, temp5, temp4, accv);
+    REDUCE4(rev, hv, h2v, h3v, h4v, temp3, temp2, temp1, temp0, accv);
+    REDUCE4(rev, hv, h2v, h3v, h4v, temp7, temp6, temp5, temp4, accv);
     _mm_storeu_si128((__m128i *) accum, accv);
 }
 
@@ -438,10 +448,12 @@ aesni_addmul8full(const unsigned char *in, unsigned char *accum,
                   const __m128i hv, const __m128i h2v,
                   const __m128i h3v, const __m128i h4v)
 {
+    const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
     __m128i accv = _mm_loadu_si128((const __m128i *) accum);
+
     MAKE8(LOADx);
-    accv = reduce4(hv, h2v, h3v, h4v, in3, in2, in1, in0, accv);
-    accv = reduce4(hv, h2v, h3v, h4v, in7, in6, in5, in4, accv);
+    REDUCE4(rev, hv, h2v, h3v, h4v, in3, in2, in1, in0, accv);
+    REDUCE4(rev, hv, h2v, h3v, h4v, in7, in6, in5, in4, accv);
     _mm_storeu_si128((__m128i *) accum, accv);
 }
 
@@ -523,13 +535,13 @@ crypto_aead_aes256gcm_aesni_encrypt_afternm(unsigned char *c, unsigned long long
     H4v = mulv(H3v, Hv);
 
     accv = _mm_setzero_si128();
-    /* unrolled by 4 GCM (by 8 doesn't improve using reduce4) */
+    /* unrolled by 4 GCM (by 8 doesn't improve using REDUCE4) */
     for (i = 0; i < adlen_rnd64; i += 64) {
         __m128i X4 = _mm_loadu_si128((const __m128i *) (ad + i + 0));
         __m128i X3 = _mm_loadu_si128((const __m128i *) (ad + i + 16));
         __m128i X2 = _mm_loadu_si128((const __m128i *) (ad + i + 32));
         __m128i X1 = _mm_loadu_si128((const __m128i *) (ad + i + 48));
-        accv = reduce4(Hv, H2v, H3v, H4v, X1, X2, X3, X4, accv);
+        REDUCE4(rev, Hv, H2v, H3v, H4v, X1, X2, X3, X4, accv);
     }
     _mm_storeu_si128((__m128i *) accum, accv);
 
@@ -648,7 +660,7 @@ crypto_aead_aes256gcm_aesni_decrypt_afternm(unsigned char *m, unsigned long long
         __m128i X3 = _mm_loadu_si128((const __m128i *) (ad + i + 16));
         __m128i X2 = _mm_loadu_si128((const __m128i *) (ad + i + 32));
         __m128i X1 = _mm_loadu_si128((const __m128i *) (ad + i + 48));
-        accv = reduce4(Hv, H2v, H3v, H4v, X1, X2, X3, X4, accv);
+        REDUCE4(rev, Hv, H2v, H3v, H4v, X1, X2, X3, X4, accv);
     }
     _mm_storeu_si128((__m128i *) accum, accv);
 

From 69aac7d0af4be46040cc273662c9ac888a35cdb9 Mon Sep 17 00:00:00 2001
From: Frank Denis
Date: Sun, 11 Oct 2015 00:12:16 +0200
Subject: [PATCH 20/25] Add do { ... } while(0) when relevant

---
 .../aes256gcm/aesni/aead_aes256gcm_aesni.c | 28 +++++++++----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c
index 6c0084e0..37e3a229 100644
--- a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c
+++ b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c
@@ -319,7 +319,7 @@ mulv(__m128i A, __m128i B)
     tmp##a = _mm_clmulepi64_si128(tmp##a, tmp##a##B, 0x00)
 
 #define REDUCE4(rev, H0_, H1_, H2_, H3_, X0_, X1_, X2_, X3_, acc) \
-{ \
+do { \
     MAKE4(RED_DECL); \
     __m128i lo, hi; \
@@ -405,7 +405,7 @@ mulv(__m128i A, __m128i B)
     tmp2B = _mm_xor_si128(tmp2B, tmp3); \
 \
     accv = tmp2B; \
-}
+} while(0)
 
 #define XORx(a) \
@@ -557,18 +557,18 @@ crypto_aead_aes256gcm_aesni_encrypt_afternm(unsigned char *c, unsigned long long
 
 /* this only does 8 full blocks, so no fancy bounds checking is necessary */
 #define LOOPRND128 \
-    { \
+    do { \
         const int iter = 8; \
         const int lb = iter * 16; \
 \
         for (i = 0; i < mlen_rnd128; i += lb) { \
             aesni_encrypt8full(c + i, (uint32_t *) n2, rkeys, m + i, accum, Hv, H2v, H3v, H4v); \
         } \
-    }
+    } while(0)
 
 /* remainder loop, with the slower GCM update to accommodate partial blocks */
 #define LOOPRMD128 \
-    { \
+    do { \
         const int iter = 8; \
         const int lb = iter * 16; \
 \
@@ -592,7 +592,7 @@ crypto_aead_aes256gcm_aesni_encrypt_afternm(unsigned char *c, unsigned long long
             addmul(accum, c + i + j, bl, H); \
         } \
     } \
-    }
+    } while(0)
 
     n2[15] = 0;
     COUNTER_INC2(n2);
@@ -675,25 +675,25 @@ crypto_aead_aes256gcm_aesni_decrypt_afternm(unsigned char *m, unsigned long long
 
     mlen_rnd128 = mlen & ~127ULL;
 
 #define LOOPACCUMDRND128 \
-    { \
+    do { \
         const int iter = 8; \
         const int lb = iter * 16; \
         for (i = 0; i < mlen_rnd128; i += lb) { \
             aesni_addmul8full(c + i, accum, Hv, H2v, H3v, H4v); \
         } \
-    }
+    } 
while(0) #define LOOPDRND128 \ - { \ + do { \ const int iter = 8; \ const int lb = iter * 16; \ for (i = 0; i < mlen_rnd128; i += lb) { \ aesni_decrypt8full(m + i, (uint32_t *) n2, rkeys, c + i); \ } \ - } + } while(0) #define LOOPACCUMDRMD128 \ - { \ + do { \ const int iter = 8; \ const int lb = iter * 16; \ \ @@ -712,10 +712,10 @@ crypto_aead_aes256gcm_aesni_decrypt_afternm(unsigned char *m, unsigned long long addmul(accum, c + i + j, bl, H); \ } \ } \ - } + } while(0) #define LOOPDRMD128 \ - { \ + do { \ const int iter = 8; \ const int lb = iter * 16; \ \ @@ -731,7 +731,7 @@ crypto_aead_aes256gcm_aesni_decrypt_afternm(unsigned char *m, unsigned long long m[i + j] = c[i + j] ^ outni[j]; \ } \ } \ - } + } while(0) n2[15] = 0; COUNTER_INC2(n2); From 0b20d292dfa28f7ecff28ddc3f6175440e5cfc58 Mon Sep 17 00:00:00 2001 From: Frank Denis Date: Sun, 11 Oct 2015 00:43:44 +0200 Subject: [PATCH 21/25] Convert more functions to macros --- .../aes256gcm/aesni/aead_aes256gcm_aesni.c | 120 +++++++++--------- 1 file changed, 62 insertions(+), 58 deletions(-) diff --git a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c index 37e3a229..b80f9725 100644 --- a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c +++ b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c @@ -415,68 +415,72 @@ do { \ __m128i in##a = _mm_load_si128((const __m128i *) (in + a * 16)); /* full encrypt & checksum 8 blocks at once */ -static inline void -aesni_encrypt8full(unsigned char *out, uint32_t *n, const __m128i *rkeys, - const unsigned char *in, unsigned char *accum, - const __m128i hv, const __m128i h2v, const __m128i h3v, - const __m128i h4v) -{ - const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - __m128i accv = _mm_loadu_si128((const __m128i *) accum); - int i; - - MAKE8(NVDECLx); - MAKE8(TEMPDECLx); - MAKE8(NVx); - MAKE8(TEMPx); -#pragma unroll(13) - for (i = 1; i < 14; i++) { - MAKE8(AESENCx); - } - MAKE8(AESENCLASTx); - MAKE8(XORx); - MAKE8(STOREx); - REDUCE4(rev, hv, h2v, h3v, h4v, temp3, temp2, temp1, temp0, accv); - REDUCE4(rev, hv, h2v, h3v, h4v, temp7, temp6, temp5, temp4, accv); - _mm_storeu_si128((__m128i *) accum, accv); -} +#define aesni_encrypt8full(out_, n_, rkeys, in_, accum, hv_, h2v_, h3v_, h4v_) \ +do { \ + unsigned char *out = out_; \ + uint32_t *n = n_; \ + const unsigned char *in = in_; \ + const __m128i hv = hv_; \ + const __m128i h2v = h2v_; \ + const __m128i h3v = h3v_; \ + const __m128i h4v = h4v_; \ + const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \ + __m128i accv = _mm_loadu_si128((const __m128i *) accum); \ + int i; \ +\ + MAKE8(NVDECLx); \ + MAKE8(TEMPDECLx); \ + MAKE8(NVx); \ + MAKE8(TEMPx); \ + for (i = 1; i < 14; i++) { \ + MAKE8(AESENCx); \ + } \ + MAKE8(AESENCLASTx); \ + MAKE8(XORx); \ + MAKE8(STOREx); \ + REDUCE4(rev, hv, h2v, h3v, h4v, temp3, temp2, temp1, temp0, accv); \ + REDUCE4(rev, hv, h2v, h3v, h4v, temp7, temp6, temp5, temp4, accv); \ + _mm_storeu_si128((__m128i *) accum, accv); \ +} while(0) /* checksum 8 blocks at once */ -static inline void -aesni_addmul8full(const unsigned char *in, unsigned char *accum, - const __m128i hv, const __m128i h2v, - const __m128i h3v, const __m128i h4v) -{ - const __m128i rev = 
_mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - __m128i accv = _mm_loadu_si128((const __m128i *) accum); - - MAKE8(LOADx); - REDUCE4(rev, hv, h2v, h3v, h4v, in3, in2, in1, in0, accv); - REDUCE4(rev, hv, h2v, h3v, h4v, in7, in6, in5, in4, accv); - _mm_storeu_si128((__m128i *) accum, accv); -} +#define aesni_addmul8full(in_, accum, hv_, h2v_, h3v_, h4v_) \ +do { \ + const unsigned char *in = in_; \ + const __m128i hv = hv_; \ + const __m128i h2v = h2v_ ; \ + const __m128i h3v = h3v_ ; \ + const __m128i h4v = h4v_ ; \ + const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \ + __m128i accv = _mm_loadu_si128((const __m128i *) accum); \ +\ + MAKE8(LOADx); \ + REDUCE4(rev, hv, h2v, h3v, h4v, in3, in2, in1, in0, accv); \ + REDUCE4(rev, hv, h2v, h3v, h4v, in7, in6, in5, in4, accv); \ + _mm_storeu_si128((__m128i *) accum, accv); \ +} while(0) /* decrypt 8 blocks at once */ -static inline void -aesni_decrypt8full(unsigned char *out, uint32_t *n, const __m128i *rkeys, - const unsigned char *in) -{ - const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int i; - - MAKE8(NVDECLx); - MAKE8(TEMPDECLx); - MAKE8(NVx); - MAKE8(TEMPx); -#pragma unroll(13) - for (i = 1; i < 14; i++) { - MAKE8(AESENCx); - } - MAKE8(AESENCLASTx); - MAKE8(XORx); - MAKE8(STOREx); -} +#define aesni_decrypt8full(out_, n_, rkeys, in_) \ +do { \ + unsigned char *out = out_; \ + uint32_t *n = n_; \ + const unsigned char *in = in_; \ + const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + int i; \ +\ + MAKE8(NVDECLx); \ + MAKE8(TEMPDECLx); \ + MAKE8(NVx); \ + MAKE8(TEMPx); \ + for (i = 1; i < 14; i++) { \ + MAKE8(AESENCx); \ + } \ + MAKE8(AESENCLASTx); \ + MAKE8(XORx); \ + MAKE8(STOREx); \ +} while(0) int crypto_aead_aes256gcm_aesni_beforenm(crypto_aead_aes256gcm_aesni_state *ctx_, From 970058bb38c57e05c79fe1b07bc5f0923326ed7d Mon Sep 17 00:00:00 2001 From: Frank Denis Date: Sun, 11 Oct 2015 01:00:33 +0200 Subject: [PATCH 22/25] Don't read past the AD buffer, even through an SIMD register --- .../aes256gcm/aesni/aead_aes256gcm_aesni.c | 22 ++++++++----------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c index b80f9725..eabcd046 100644 --- a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c +++ b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c @@ -203,21 +203,17 @@ static inline void addmul(unsigned char *c, const unsigned char *a, unsigned int xlen, const unsigned char *b) { const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - const __m128i ff = _mm_set1_epi32(-1); - __m128i A = _mm_loadu_si128((const __m128i *) a); + __m128i A; - A = _mm_shuffle_epi8(A, rev); - if (xlen < 16) { /* less than 16 useful bytes - insert zeroes where needed */ - uint64_t mask = -1ull ^ (1ull << (((16 - xlen) % 8) * 8)) - 1ull; - __m128i vm; - - if (xlen > 8) { - vm = _mm_insert_epi64(ff, mask, 0); - } else { - vm = _mm_insert_epi64(_mm_setzero_si128(), mask, 1); - } - A = _mm_and_si128(vm, A); + if (xlen >= 16) { + A = _mm_loadu_si128((const __m128i *) a); + } else { + unsigned char padded[16]; + memset(padded, 0, 16); + memcpy(padded, a, xlen); + A = _mm_loadu_si128((const __m128i *) padded); } + A = _mm_shuffle_epi8(A, rev); __m128i B = _mm_loadu_si128((const __m128i *) b); __m128i C = _mm_loadu_si128((const __m128i 
From 82b2f5a4c408e34514c1ff9730fd8f29c43cc560 Mon Sep 17 00:00:00 2001
From: Frank Denis
Date: Sun, 11 Oct 2015 01:17:00 +0200
Subject: [PATCH 23/25] aes256gcm doesn't use SSE4.1 instructions any more

---
 src/libsodium/Makefile.am                              | 2 +-
 .../crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/libsodium/Makefile.am b/src/libsodium/Makefile.am
index 201c0bd3..5f94c407 100644
--- a/src/libsodium/Makefile.am
+++ b/src/libsodium/Makefile.am
@@ -273,7 +273,7 @@ noinst_LTLIBRARIES = libaesni.la libsse2.la
 
 libaesni_la_LDFLAGS = $(libsodium_la_LDFLAGS)
 libaesni_la_CPPFLAGS = $(libsodium_la_CPPFLAGS) \
-	@CFLAGS_SSE4_1@ @CFLAGS_AESNI@ @CFLAGS_PCLMUL@
+	@CFLAGS_SSSE3@ @CFLAGS_AESNI@ @CFLAGS_PCLMUL@
 libaesni_la_SOURCES = \
 	crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c
 
diff --git a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c
index eabcd046..734b7c82 100644
--- a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c
+++ b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c
@@ -14,7 +14,7 @@
 #if defined(HAVE_WMMINTRIN_H) || \
     (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)))
 
-#pragma GCC target("sse4.1")
+#pragma GCC target("ssse3")
 #pragma GCC target("aes")
 #pragma GCC target("pclmul")
 

From 66d55c1939d91c12453dce4714e181d9ed9c33cc Mon Sep 17 00:00:00 2001
From: Frank Denis
Date: Sun, 11 Oct 2015 02:07:20 +0200
Subject: [PATCH 24/25] aesgcm: don't expect input & output buffers to be aligned

---
 .../aes256gcm/aesni/aead_aes256gcm_aesni.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c
index 734b7c82..a1635ee5 100644
--- a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c
+++ b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c
@@ -119,7 +119,7 @@ aesni_encrypt1(unsigned char *out, __m128i nv, const __m128i *rkeys)
         temp = _mm_aesenc_si128(temp, rkeys[i]);
     }
     temp = _mm_aesenclast_si128(temp, rkeys[14]);
-    _mm_store_si128((__m128i *) out, temp);
+    _mm_storeu_si128((__m128i *) out, temp);
 }
 
 /** multiple-blocks-at-once AES encryption with AES-NI ;
@@ -153,7 +153,7 @@ aesni_encrypt1(unsigned char *out, __m128i nv, const __m128i *rkeys)
 
 /* Step 5: store result */
 #define STOREx(a) \
-    _mm_store_si128((__m128i *) (out + (a * 16)), temp##a)
+    _mm_storeu_si128((__m128i *) (out + (a * 16)), temp##a)
 
 /* all the MAKE* macros are for automatic explicit unrolling */
 #define MAKE4(X)  \
@@ -403,12 +403,12 @@ do { \
     accv = tmp2B; \
 } while(0)
 
-#define XORx(a) \
-    __m128i in##a = _mm_load_si128((const __m128i *) (in + a * 16)); \
+#define XORx(a) \
+    __m128i in##a = _mm_loadu_si128((const __m128i *) (in + a * 16)); \
     temp##a = _mm_xor_si128(temp##a, in##a)
 
-#define LOADx(a) \
-    __m128i in##a = _mm_load_si128((const __m128i *) (in + a * 16));
+#define LOADx(a) \
+    __m128i in##a = _mm_loadu_si128((const __m128i *) (in + a * 16));
 
 /* full encrypt & checksum 8 blocks at once */
 #define aesni_encrypt8full(out_, n_, rkeys, in_, accum, hv_, h2v_, h3v_, h4v_) \
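Note on PATCH 24/25: _mm_load_si128() and _mm_store_si128() require a 16-byte
aligned address and fault otherwise, while the plaintext, ciphertext and AD
pointers come straight from the caller with no alignment guarantee, so the
unaligned variants are the only safe choice for them. On recent x86 CPUs,
movdqu is generally no slower than movdqa when the address happens to be
aligned, so little is lost. A self-contained sketch of the XORx/STOREx pattern
after this change; the function name is illustrative only:

    #include <emmintrin.h>

    /* XOR one 16-byte keystream block into caller-supplied buffers
     * of unknown alignment, as XORx/STOREx now do. */
    static void
    xor_block_any_alignment(unsigned char *out, const unsigned char *in,
                            __m128i keystream)
    {
        __m128i block = _mm_loadu_si128((const __m128i *) in); /* any address is fine */

        block = _mm_xor_si128(block, keystream);
        _mm_storeu_si128((__m128i *) out, block);              /* any address is fine */
    }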
From 82e9c729f1e3bbebe26b63788db1fdd3aad8af7e Mon Sep 17 00:00:00 2001
From: Frank Denis
Date: Sun, 11 Oct 2015 02:19:49 +0200
Subject: [PATCH 25/25] aes256gcm: we can expect the accumulator and the
 padding buffer to be aligned

---
 .../aes256gcm/aesni/aead_aes256gcm_aesni.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c
index a1635ee5..64ec8576 100644
--- a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c
+++ b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c
@@ -208,10 +208,10 @@ addmul(unsigned char *c, const unsigned char *a, unsigned int xlen, const unsign
     if (xlen >= 16) {
         A = _mm_loadu_si128((const __m128i *) a);
     } else {
-        unsigned char padded[16];
+        CRYPTO_ALIGN(16) unsigned char padded[16];
         memset(padded, 0, 16);
         memcpy(padded, a, xlen);
-        A = _mm_loadu_si128((const __m128i *) padded);
+        A = _mm_load_si128((const __m128i *) padded);
     }
     A = _mm_shuffle_epi8(A, rev);
     __m128i B = _mm_loadu_si128((const __m128i *) b);
@@ -422,7 +422,7 @@ do { \
     const __m128i h4v = h4v_; \
     const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
     const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
-    __m128i accv = _mm_loadu_si128((const __m128i *) accum); \
+    __m128i accv = _mm_load_si128((const __m128i *) accum); \
     int i; \
 \
     MAKE8(NVDECLx); \
@@ -437,7 +437,7 @@ do { \
     MAKE8(STOREx); \
     REDUCE4(rev, hv, h2v, h3v, h4v, temp3, temp2, temp1, temp0, accv); \
     REDUCE4(rev, hv, h2v, h3v, h4v, temp7, temp6, temp5, temp4, accv); \
-    _mm_storeu_si128((__m128i *) accum, accv); \
+    _mm_store_si128((__m128i *) accum, accv); \
 } while(0)
 
 /* checksum 8 blocks at once */
@@ -449,12 +449,12 @@ do { \
     const __m128i h3v = h3v_ ; \
     const __m128i h4v = h4v_ ; \
     const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
-    __m128i accv = _mm_loadu_si128((const __m128i *) accum); \
+    __m128i accv = _mm_load_si128((const __m128i *) accum); \
 \
     MAKE8(LOADx); \
     REDUCE4(rev, hv, h2v, h3v, h4v, in3, in2, in1, in0, accv); \
     REDUCE4(rev, hv, h2v, h3v, h4v, in7, in6, in5, in4, accv); \
-    _mm_storeu_si128((__m128i *) accum, accv); \
+    _mm_store_si128((__m128i *) accum, accv); \
 } while(0)
 
 /* decrypt 8 blocks at once */
@@ -543,7 +543,7 @@ crypto_aead_aes256gcm_aesni_encrypt_afternm(unsigned char *c, unsigned long long
         __m128i X1 = _mm_loadu_si128((const __m128i *) (ad + i + 48));
         REDUCE4(rev, Hv, H2v, H3v, H4v, X1, X2, X3, X4, accv);
     }
-    _mm_storeu_si128((__m128i *) accum, accv);
+    _mm_store_si128((__m128i *) accum, accv);
 
     /* GCM remainder loop */
     for (i = adlen_rnd64; i < adlen; i += 16) {
@@ -662,7 +662,7 @@ crypto_aead_aes256gcm_aesni_decrypt_afternm(unsigned char *m, unsigned long long
         __m128i X1 = _mm_loadu_si128((const __m128i *) (ad + i + 48));
         REDUCE4(rev, Hv, H2v, H3v, H4v, X1, X2, X3, X4, accv);
     }
-    _mm_storeu_si128((__m128i *) accum, accv);
+    _mm_store_si128((__m128i *) accum, accv);
 
     for (i = adlen_rnd64; i < adlen; i += 16) {
         unsigned int blocklen = 16;
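Note on PATCH 25/25: the flip side of the previous patch. The GHASH accumulator
and the padding buffer are declared by the library itself, with CRYPTO_ALIGN
enforcing their alignment, so the aligned intrinsics are safe there. Using them
documents the invariant, avoids unaligned-access penalties on older CPUs, and
turns any future violation into an immediate fault instead of a silent slowdown.
A minimal sketch of the resulting convention, aligned ops for our buffers and
unaligned ops for the caller's; the names are illustrative, the XOR stands in
for the real GHASH reduction, and __attribute__((aligned)) assumes a
GCC-compatible compiler, standing in for CRYPTO_ALIGN:

    #include <string.h>
    #include <emmintrin.h>

    static void
    fold_block(unsigned char *accum /* 16-byte aligned, ours */,
               const unsigned char *in /* unknown alignment, caller's */)
    {
        __m128i accv = _mm_load_si128((const __m128i *) accum); /* faults if misaligned */
        __m128i inv  = _mm_loadu_si128((const __m128i *) in);   /* safe at any address */

        accv = _mm_xor_si128(accv, inv); /* placeholder for the GHASH carry-less multiply */
        _mm_store_si128((__m128i *) accum, accv);
    }

    static void
    example(const unsigned char *ad, size_t adlen)
    {
        /* alignment is established at the declaration site,
         * exactly what CRYPTO_ALIGN(16) does in the patch */
        __attribute__((aligned(16))) unsigned char accum[16];
        size_t i;

        memset(accum, 0, sizeof accum);
        for (i = 0; i + 16 <= adlen; i += 16) {
            fold_block(accum, ad + i);
        }
    }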