diff --git a/AUTHORS b/AUTHORS
index 6208f0ec..0622fafa 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -32,6 +32,9 @@ scrypt	Colin Percival
 Implementors
 ============
+crypto_aead/aes256gcm/aesni	Romain Dolbeau
+	Frank Denis
+
 crypto_aead/chacha20poly1305	Frank Denis
 
 crypto_box/curve25519xsalsa20poly1305	Daniel J. Bernstein
diff --git a/configure.ac b/configure.ac
index 4d85a4e2..91708583 100644
--- a/configure.ac
+++ b/configure.ac
@@ -181,7 +181,7 @@ LIBTOOL_EXTRA_FLAGS="$LIBTOOL_EXTRA_FLAGS -version-info $SODIUM_LIBRARY_VERSION"
 AC_ARG_ENABLE(soname-versions,
 [AC_HELP_STRING([--enable-soname-versions], [enable soname versions (must be disabled for Android) (default: enabled)])],
 [
-  AS_IF([test "x$enableval" = "xno"], [ 
+  AS_IF([test "x$enableval" = "xno"], [
     LIBTOOL_EXTRA_FLAGS="$LIBTOOL_OLD_FLAGS -avoid-version"
   ])
 ]
@@ -263,23 +263,36 @@ dnl Checks for headers
 AS_IF([test "x$EMSCRIPTEN" = "x"],[
   AC_MSG_CHECKING(for MMX instructions set)
+  oldcflags="$CFLAGS"
+  AX_CHECK_COMPILE_FLAG([-mmmx], [CFLAGS="$CFLAGS -mmmx"])
   AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
 #pragma GCC target("mmx")
 #include <mmintrin.h>
 ]], [[ __m64 x = _mm_setzero_si64(); ]])],
     [AC_MSG_RESULT(yes)
-     AC_DEFINE([HAVE_MMINTRIN_H], [1], [mmx is available])],
+     AC_DEFINE([HAVE_MMINTRIN_H], [1], [mmx is available])
+     AX_CHECK_COMPILE_FLAG([-mmmx], [CFLAGS_MMX="-mmmx"])],
     [AC_MSG_RESULT(no)])
+  CFLAGS="$oldcflags"
 
   AC_MSG_CHECKING(for SSE2 instructions set)
+  oldcflags="$CFLAGS"
+  AX_CHECK_COMPILE_FLAG([-msse2], [CFLAGS="$CFLAGS -msse2"])
   AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
 #pragma GCC target("sse2")
+#ifndef __SSE2__
+# define __SSE2__
+#endif
 #include <emmintrin.h>
 ]], [[ __m128d x = _mm_setzero_pd(); ]])],
     [AC_MSG_RESULT(yes)
-     AC_DEFINE([HAVE_EMMINTRIN_H], [1], [sse2 is available])],
+     AC_DEFINE([HAVE_EMMINTRIN_H], [1], [sse2 is available])
+     AX_CHECK_COMPILE_FLAG([-msse2], [CFLAGS_SSE2="-msse2"])],
     [AC_MSG_RESULT(no)])
+  CFLAGS="$oldcflags"
 
+  oldcflags="$CFLAGS"
+  AX_CHECK_COMPILE_FLAG([-msse3], [CFLAGS="$CFLAGS -msse3"])
   AC_MSG_CHECKING(for SSE3 instructions set)
   AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
 #pragma GCC target("sse3")
 #include <pmmintrin.h>
@@ -287,19 +300,65 @@ AS_IF([test "x$EMSCRIPTEN" = "x"],[
 ]], [[ __m128 x = _mm_addsub_ps(_mm_cvtpd_ps(_mm_setzero_pd()), _mm_cvtpd_ps(_mm_setzero_pd())); ]])],
     [AC_MSG_RESULT(yes)
-     AC_DEFINE([HAVE_PMMINTRIN_H], [1], [sse3 is available])],
+     AC_DEFINE([HAVE_PMMINTRIN_H], [1], [sse3 is available])
+     AX_CHECK_COMPILE_FLAG([-msse3], [CFLAGS_SSE3="-msse3"])],
     [AC_MSG_RESULT(no)])
+  CFLAGS="$oldcflags"
 
+  oldcflags="$CFLAGS"
+  AX_CHECK_COMPILE_FLAG([-mssse3], [CFLAGS="$CFLAGS -mssse3"])
   AC_MSG_CHECKING(for SSSE3 instructions set)
   AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
 #pragma GCC target("ssse3")
 #include <tmmintrin.h>
 ]], [[ __m64 x = _mm_abs_pi32(_m_from_int(0)); ]])],
     [AC_MSG_RESULT(yes)
-     AC_DEFINE([HAVE_TMMINTRIN_H], [1], [ssse3 is available])],
+     AC_DEFINE([HAVE_TMMINTRIN_H], [1], [ssse3 is available])
+     AX_CHECK_COMPILE_FLAG([-mssse3], [CFLAGS_SSSE3="-mssse3"])],
     [AC_MSG_RESULT(no)])
+  CFLAGS="$oldcflags"
+
+  oldcflags="$CFLAGS"
+  AX_CHECK_COMPILE_FLAG([-msse4.1], [CFLAGS="$CFLAGS -msse4.1"])
+  AC_MSG_CHECKING(for SSE4.1 instructions set)
+  AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+#pragma GCC target("sse4.1")
+#include <smmintrin.h>
+]], [[ __m128i x = _mm_minpos_epu16(_mm_setzero_si128()); ]])],
+    [AC_MSG_RESULT(yes)
+     AC_DEFINE([HAVE_SMMINTRIN_H], [1], [sse4.1 is available])
+     AX_CHECK_COMPILE_FLAG([-msse4.1], [CFLAGS_SSE4_1="-msse4.1"])],
+    [AC_MSG_RESULT(no)])
+  CFLAGS="$oldcflags"
+
+  oldcflags="$CFLAGS"
+  AX_CHECK_COMPILE_FLAG([-maes], [CFLAGS="$CFLAGS -maes"])
-maes"]) + AX_CHECK_COMPILE_FLAG([-mpclmul], [CFLAGS="$CFLAGS -mpclmul"]) + AC_MSG_CHECKING(for AESNI instructions set and PCLMULQDQ) + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ +#pragma GCC target("aes") +#pragma GCC target("pclmul") +#include +]], [[ __m128i x = _mm_aesimc_si128(_mm_setzero_si128()); + __m128i y = _mm_clmulepi64_si128(_mm_setzero_si128(), _mm_setzero_si128(), 0);]])], + [AC_MSG_RESULT(yes) + AC_DEFINE([HAVE_WMMINTRIN_H], [1], [aesni is available]) + AX_CHECK_COMPILE_FLAG([-maes], [CFLAGS_AESNI="-maes"]) + AX_CHECK_COMPILE_FLAG([-mpclmul], [CFLAGS_PCLMUL="-mpclmul"]) + ], + [AC_MSG_RESULT(no)]) + CFLAGS="$oldcflags" + ]) +AC_SUBST(CFLAGS_MMX) +AC_SUBST(CFLAGS_SSE2) +AC_SUBST(CFLAGS_SSE3) +AC_SUBST(CFLAGS_SSSE3) +AC_SUBST(CFLAGS_SSE4_1) +AC_SUBST(CFLAGS_AESNI) +AC_SUBST(CFLAGS_PCLMUL) + AC_CHECK_HEADERS([sys/mman.h]) dnl Checks for typedefs, structures, and compiler characteristics. diff --git a/src/libsodium/Makefile.am b/src/libsodium/Makefile.am index 55c0dffd..5f94c407 100644 --- a/src/libsodium/Makefile.am +++ b/src/libsodium/Makefile.am @@ -62,7 +62,6 @@ libsodium_la_SOURCES = \ crypto_pwhash/scryptsalsa208sha256/pwhash_scryptsalsa208sha256.c \ crypto_pwhash/scryptsalsa208sha256/sysendian.h \ crypto_pwhash/scryptsalsa208sha256/nosse/pwhash_scryptsalsa208sha256_nosse.c \ - crypto_pwhash/scryptsalsa208sha256/sse/pwhash_scryptsalsa208sha256_sse.c \ crypto_scalarmult/crypto_scalarmult.c \ crypto_scalarmult/curve25519/scalarmult_curve25519_api.c \ crypto_secretbox/crypto_secretbox.c \ @@ -268,3 +267,20 @@ endif SUBDIRS = \ include + +libsodium_la_LIBADD = libaesni.la libsse2.la +noinst_LTLIBRARIES = libaesni.la libsse2.la + +libaesni_la_LDFLAGS = $(libsodium_la_LDFLAGS) +libaesni_la_CPPFLAGS = $(libsodium_la_CPPFLAGS) \ + @CFLAGS_SSSE3@ @CFLAGS_AESNI@ @CFLAGS_PCLMUL@ + +libaesni_la_SOURCES = \ + crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c + +libsse2_la_LDFLAGS = $(libsodium_la_LDFLAGS) +libsse2_la_CPPFLAGS = $(libsodium_la_CPPFLAGS) \ + @CFLAGS_SSE2@ + +libsse2_la_SOURCES = \ + crypto_pwhash/scryptsalsa208sha256/sse/pwhash_scryptsalsa208sha256_sse.c diff --git a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c new file mode 100644 index 00000000..64ec8576 --- /dev/null +++ b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c @@ -0,0 +1,828 @@ + +/* + * AES256-GCM, based on original code by Romain Dolbeau + */ + +#include +#include +#include + +#include "crypto_aead_aes256gcm_aesni.h" +#include "export.h" +#include "utils.h" + +#if defined(HAVE_WMMINTRIN_H) || \ + (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86))) + +#pragma GCC target("ssse3") +#pragma GCC target("aes") +#pragma GCC target("pclmul") + +#include + +#if defined(__INTEL_COMPILER) || defined(_bswap64) +#elif defined(_MSC_VER) +# define _bswap64(a) _byteswap_uint64(a) +#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)) +# define _bswap64(a) __builtin_bswap64(a) +#else +static inline uint64_t +_bswap64(const uint64_t x) +{ + return + ((x << 56) & 0xFF00000000000000UL) | ((x << 40) & 0x00FF000000000000UL) | + ((x << 24) & 0x0000FF0000000000UL) | ((x << 8) & 0x000000FF00000000UL) | + ((x >> 8) & 0x00000000FF000000UL) | ((x >> 24) & 0x0000000000FF0000UL) | + ((x >> 40) & 0x000000000000FF00UL) | ((x >> 56) & 0x00000000000000FFUL); +} +#endif + +typedef struct context { + CRYPTO_ALIGN(16) unsigned char H[16]; + __m128i rkeys[16]; +} context; + +static 
+aesni_key256_expand(const unsigned char *key, __m128 *rkeys)
+{
+    __m128 key0 = _mm_loadu_ps((const float *) (key + 0));
+    __m128 key1 = _mm_loadu_ps((const float *) (key + 16));
+    __m128 temp0, temp1, temp2, temp4;
+    int    idx = 0;
+
+    rkeys[idx++] = key0;
+    temp0 = key0;
+    temp2 = key1;
+    temp4 = _mm_setzero_ps();
+
+/* why single-precision floating-point rather than integer instructions?
+   because _mm_shuffle_ps takes two inputs, while _mm_shuffle_epi32 only
+   takes one - it doesn't perform the same computation...
+   _mm_shuffle_ps takes the lower 64 bits of the result from the first
+   operand, and the higher 64 bits of the result from the second operand
+   (in both cases, all four input floats are accessible).
+   I don't like the non-orthogonal naming scheme :-(
+
+   This is all strongly inspired by the openssl assembly code.
+ */
+#define BLOCK1(IMM) \
+    temp1 = _mm_castsi128_ps(_mm_aeskeygenassist_si128(_mm_castps_si128(temp2), IMM)); \
+    rkeys[idx++] = temp2; \
+    temp4 = _mm_shuffle_ps(temp4, temp0, 0x10); \
+    temp0 = _mm_xor_ps(temp0, temp4); \
+    temp4 = _mm_shuffle_ps(temp4, temp0, 0x8c); \
+    temp0 = _mm_xor_ps(temp0, temp4); \
+    temp1 = _mm_shuffle_ps(temp1, temp1, 0xff); \
+    temp0 = _mm_xor_ps(temp0, temp1)
+
+#define BLOCK2(IMM) \
+    temp1 = _mm_castsi128_ps(_mm_aeskeygenassist_si128(_mm_castps_si128(temp0), IMM)); \
+    rkeys[idx++] = temp0; \
+    temp4 = _mm_shuffle_ps(temp4, temp2, 0x10); \
+    temp2 = _mm_xor_ps(temp2, temp4); \
+    temp4 = _mm_shuffle_ps(temp4, temp2, 0x8c); \
+    temp2 = _mm_xor_ps(temp2, temp4); \
+    temp1 = _mm_shuffle_ps(temp1, temp1, 0xaa); \
+    temp2 = _mm_xor_ps(temp2, temp1)
+
+    BLOCK1(0x01);
+    BLOCK2(0x01);
+
+    BLOCK1(0x02);
+    BLOCK2(0x02);
+
+    BLOCK1(0x04);
+    BLOCK2(0x04);
+
+    BLOCK1(0x08);
+    BLOCK2(0x08);
+
+    BLOCK1(0x10);
+    BLOCK2(0x10);
+
+    BLOCK1(0x20);
+    BLOCK2(0x20);
+
+    BLOCK1(0x40);
+    rkeys[idx++] = temp0;
+}
+
+/** single, by-the-book AES encryption with AES-NI */
+static inline void
+aesni_encrypt1(unsigned char *out, __m128i nv, const __m128i *rkeys)
+{
+    __m128i temp = _mm_xor_si128(nv, rkeys[0]);
+    int     i;
+
+#pragma unroll(13)
+    for (i = 1; i < 14; i++) {
+        temp = _mm_aesenc_si128(temp, rkeys[i]);
+    }
+    temp = _mm_aesenclast_si128(temp, rkeys[14]);
+    _mm_storeu_si128((__m128i *) out, temp);
+}
+
+/** multiple-blocks-at-once AES encryption with AES-NI;
+    on Haswell, aesenc has a latency of 7 and a throughput of 1,
+    so the sequence of aesenc should be bubble-free if you
+    have at least 8 blocks. Let's build an arbitrary-sized
Let's build an arbitratry-sized + function */ +/* Step 1 : loading the nonce */ +/* load & increment the n vector (non-vectorized, unused for now) */ +#define NVDECLx(a) \ + __m128i nv##a + +#define NVx(a) \ + nv##a = _mm_shuffle_epi8(_mm_load_si128((const __m128i *) n), pt); \ + n[3]++ + +/* Step 2 : define value in round one (xor with subkey #0, aka key) */ +#define TEMPDECLx(a) \ + __m128i temp##a + +#define TEMPx(a) \ + temp##a = _mm_xor_si128(nv##a, rkeys[0]) + +/* Step 3: one round of AES */ +#define AESENCx(a) \ + temp##a = _mm_aesenc_si128(temp##a, rkeys[i]) + +/* Step 4: last round of AES */ +#define AESENCLASTx(a) \ + temp##a = _mm_aesenclast_si128(temp##a, rkeys[14]) + +/* Step 5: store result */ +#define STOREx(a) \ + _mm_storeu_si128((__m128i *) (out + (a * 16)), temp##a) + +/* all the MAKE* macros are for automatic explicit unrolling */ +#define MAKE4(X) \ + X(0); \ + X(1); \ + X(2); \ + X(3) + +#define MAKE8(X) \ + X(0); \ + X(1); \ + X(2); \ + X(3); \ + X(4); \ + X(5); \ + X(6); \ + X(7) + +#define COUNTER_INC2(N) (*(uint32_t *) &(N)[12]) = (2U + (((*(uint32_t *) &(N)[12])))) + +/* create a function of unrolling N ; the MAKEN is the unrolling + macro, defined above. The N in MAKEN must match N, obviously. */ +#define FUNC(N, MAKEN) \ + static inline void aesni_encrypt##N(unsigned char *out, uint32_t *n, const __m128i *rkeys) \ + { \ + const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + int i; \ + MAKEN(NVDECLx); \ + MAKEN(TEMPDECLx); \ + \ + MAKEN(NVx); \ + MAKEN(TEMPx); \ + for (i = 1; i < 14; i++) { \ + MAKEN(AESENCx); \ + } \ + MAKEN(AESENCLASTx); \ + MAKEN(STOREx); \ + } + +FUNC(8, MAKE8) + +/* all GF(2^128) fnctions are by the book, meaning this one: + +*/ + +static inline void +addmul(unsigned char *c, const unsigned char *a, unsigned int xlen, const unsigned char *b) +{ + const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + __m128i A; + + if (xlen >= 16) { + A = _mm_loadu_si128((const __m128i *) a); + } else { + CRYPTO_ALIGN(16) unsigned char padded[16]; + memset(padded, 0, 16); + memcpy(padded, a, xlen); + A = _mm_load_si128((const __m128i *) padded); + } + A = _mm_shuffle_epi8(A, rev); + __m128i B = _mm_loadu_si128((const __m128i *) b); + __m128i C = _mm_loadu_si128((const __m128i *) c); + A = _mm_xor_si128(A, C); + __m128i tmp3 = _mm_clmulepi64_si128(A, B, 0x00); + __m128i tmp4 = _mm_clmulepi64_si128(A, B, 0x10); + __m128i tmp5 = _mm_clmulepi64_si128(A, B, 0x01); + __m128i tmp6 = _mm_clmulepi64_si128(A, B, 0x11); + __m128i tmp10 = _mm_xor_si128(tmp4, tmp5); + __m128i tmp13 = _mm_slli_si128(tmp10, 8); + __m128i tmp11 = _mm_srli_si128(tmp10, 8); + __m128i tmp15 = _mm_xor_si128(tmp3, tmp13); + __m128i tmp17 = _mm_xor_si128(tmp6, tmp11); + __m128i tmp7 = _mm_srli_epi32(tmp15, 31); + __m128i tmp8 = _mm_srli_epi32(tmp17, 31); + __m128i tmp16 = _mm_slli_epi32(tmp15, 1); + __m128i tmp18 = _mm_slli_epi32(tmp17, 1); + __m128i tmp9 = _mm_srli_si128(tmp7, 12); + __m128i tmp22 = _mm_slli_si128(tmp8, 4); + __m128i tmp25 = _mm_slli_si128(tmp7, 4); + __m128i tmp29 = _mm_or_si128(tmp16, tmp25); + __m128i tmp19 = _mm_or_si128(tmp18, tmp22); + __m128i tmp20 = _mm_or_si128(tmp19, tmp9); + __m128i tmp26 = _mm_slli_epi32(tmp29, 31); + __m128i tmp23 = _mm_slli_epi32(tmp29, 30); + __m128i tmp32 = _mm_slli_epi32(tmp29, 25); + __m128i tmp27 = _mm_xor_si128(tmp26, tmp23); + __m128i tmp28 = _mm_xor_si128(tmp27, tmp32); + __m128i tmp24 = _mm_srli_si128(tmp28, 4); + __m128i tmp33 = _mm_slli_si128(tmp28, 12); + __m128i 
tmp30 = _mm_xor_si128(tmp29, tmp33); + __m128i tmp2 = _mm_srli_epi32(tmp30, 1); + __m128i tmp12 = _mm_srli_epi32(tmp30, 2); + __m128i tmp14 = _mm_srli_epi32(tmp30, 7); + __m128i tmp34 = _mm_xor_si128(tmp2, tmp12); + __m128i tmp35 = _mm_xor_si128(tmp34, tmp14); + __m128i tmp36 = _mm_xor_si128(tmp35, tmp24); + __m128i tmp31 = _mm_xor_si128(tmp30, tmp36); + __m128i tmp21 = _mm_xor_si128(tmp20, tmp31); + _mm_storeu_si128((__m128i *) c, tmp21); +} + +/* pure multiplication, for pre-computing powers of H */ +static inline __m128i +mulv(__m128i A, __m128i B) +{ + __m128i tmp3 = _mm_clmulepi64_si128(A, B, 0x00); + __m128i tmp4 = _mm_clmulepi64_si128(A, B, 0x10); + __m128i tmp5 = _mm_clmulepi64_si128(A, B, 0x01); + __m128i tmp6 = _mm_clmulepi64_si128(A, B, 0x11); + __m128i tmp10 = _mm_xor_si128(tmp4, tmp5); + __m128i tmp13 = _mm_slli_si128(tmp10, 8); + __m128i tmp11 = _mm_srli_si128(tmp10, 8); + __m128i tmp15 = _mm_xor_si128(tmp3, tmp13); + __m128i tmp17 = _mm_xor_si128(tmp6, tmp11); + __m128i tmp7 = _mm_srli_epi32(tmp15, 31); + __m128i tmp8 = _mm_srli_epi32(tmp17, 31); + __m128i tmp16 = _mm_slli_epi32(tmp15, 1); + __m128i tmp18 = _mm_slli_epi32(tmp17, 1); + __m128i tmp9 = _mm_srli_si128(tmp7, 12); + __m128i tmp22 = _mm_slli_si128(tmp8, 4); + __m128i tmp25 = _mm_slli_si128(tmp7, 4); + __m128i tmp29 = _mm_or_si128(tmp16, tmp25); + __m128i tmp19 = _mm_or_si128(tmp18, tmp22); + __m128i tmp20 = _mm_or_si128(tmp19, tmp9); + __m128i tmp26 = _mm_slli_epi32(tmp29, 31); + __m128i tmp23 = _mm_slli_epi32(tmp29, 30); + __m128i tmp32 = _mm_slli_epi32(tmp29, 25); + __m128i tmp27 = _mm_xor_si128(tmp26, tmp23); + __m128i tmp28 = _mm_xor_si128(tmp27, tmp32); + __m128i tmp24 = _mm_srli_si128(tmp28, 4); + __m128i tmp33 = _mm_slli_si128(tmp28, 12); + __m128i tmp30 = _mm_xor_si128(tmp29, tmp33); + __m128i tmp2 = _mm_srli_epi32(tmp30, 1); + __m128i tmp12 = _mm_srli_epi32(tmp30, 2); + __m128i tmp14 = _mm_srli_epi32(tmp30, 7); + __m128i tmp34 = _mm_xor_si128(tmp2, tmp12); + __m128i tmp35 = _mm_xor_si128(tmp34, tmp14); + __m128i tmp36 = _mm_xor_si128(tmp35, tmp24); + __m128i tmp31 = _mm_xor_si128(tmp30, tmp36); + __m128i C = _mm_xor_si128(tmp20, tmp31); + + return C; +} + +/* 4 multiply-accumulate at once; again + + for the Aggregated Reduction Method & sample code. 
+   Algorithm by Krzysztof Jankowski, Pierre Laurent - Intel */
+
+#define RED_DECL(a) __m128i H##a##_X##a##_lo, H##a##_X##a##_hi, tmp##a, tmp##a##B
+#define RED_SHUFFLE(a) X##a = _mm_shuffle_epi8(X##a, rev)
+#define RED_MUL_LOW(a) H##a##_X##a##_lo = _mm_clmulepi64_si128(H##a, X##a, 0x00)
+#define RED_MUL_HIGH(a) H##a##_X##a##_hi = _mm_clmulepi64_si128(H##a, X##a, 0x11)
+#define RED_MUL_MID(a) \
+    tmp##a    = _mm_shuffle_epi32(H##a, 0x4e); \
+    tmp##a##B = _mm_shuffle_epi32(X##a, 0x4e); \
+    tmp##a    = _mm_xor_si128(tmp##a, H##a); \
+    tmp##a##B = _mm_xor_si128(tmp##a##B, X##a); \
+    tmp##a    = _mm_clmulepi64_si128(tmp##a, tmp##a##B, 0x00)
+
+#define REDUCE4(rev, H0_, H1_, H2_, H3_, X0_, X1_, X2_, X3_, acc) \
+do { \
+    MAKE4(RED_DECL); \
+    __m128i lo, hi; \
+    __m128i tmp8, tmp9; \
+    __m128i H0 = H0_; \
+    __m128i H1 = H1_; \
+    __m128i H2 = H2_; \
+    __m128i H3 = H3_; \
+    __m128i X0 = X0_; \
+    __m128i X1 = X1_; \
+    __m128i X2 = X2_; \
+    __m128i X3 = X3_; \
+\
+/* byte-reverse the inputs & xor the first one into the accumulator */ \
+\
+    MAKE4(RED_SHUFFLE); \
+    X3 = _mm_xor_si128(X3, acc); \
+\
+/* 4 low H*X (x0*h0) */ \
+\
+    MAKE4(RED_MUL_LOW); \
+    lo = _mm_xor_si128(H0_X0_lo, H1_X1_lo); \
+    lo = _mm_xor_si128(lo, H2_X2_lo); \
+    lo = _mm_xor_si128(lo, H3_X3_lo); \
+\
+/* 4 high H*X (x1*h1) */ \
+\
+    MAKE4(RED_MUL_HIGH); \
+    hi = _mm_xor_si128(H0_X0_hi, H1_X1_hi); \
+    hi = _mm_xor_si128(hi, H2_X2_hi); \
+    hi = _mm_xor_si128(hi, H3_X3_hi); \
+\
+/* 4 middle H*X, using Karatsuba, i.e. \
+   x1*h0+x0*h1 = (x1+x0)*(h1+h0)-x1*h1-x0*h0 \
+   we already have all x1y1 & x0y0 (accumulated in hi & lo) \
+   (0 is low half and 1 is high half) \
+ */ \
+/* permute the high and low 64 bits in H1 & X1, \
+   so create (h0,h1) from (h1,h0) and (x0,x1) from (x1,x0), \
+   then compute (h0+h1,h1+h0) and (x0+x1,x1+x0), \
+   and finally multiply \
+ */ \
+    MAKE4(RED_MUL_MID); \
+\
+/* subtracts x1*h1 and x0*h0 */ \
+    tmp0 = _mm_xor_si128(tmp0, lo); \
+    tmp0 = _mm_xor_si128(tmp0, hi); \
+    tmp0 = _mm_xor_si128(tmp1, tmp0); \
+    tmp0 = _mm_xor_si128(tmp2, tmp0); \
+    tmp0 = _mm_xor_si128(tmp3, tmp0); \
+\
+    /* reduction */ \
+    tmp0B = _mm_slli_si128(tmp0, 8); \
+    tmp0  = _mm_srli_si128(tmp0, 8); \
+    lo    = _mm_xor_si128(tmp0B, lo); \
+    hi    = _mm_xor_si128(tmp0, hi); \
+    tmp3  = lo; \
+    tmp2B = hi; \
+    tmp3B = _mm_srli_epi32(tmp3, 31); \
+    tmp8  = _mm_srli_epi32(tmp2B, 31); \
+    tmp3  = _mm_slli_epi32(tmp3, 1); \
+    tmp2B = _mm_slli_epi32(tmp2B, 1); \
+    tmp9  = _mm_srli_si128(tmp3B, 12); \
+    tmp8  = _mm_slli_si128(tmp8, 4); \
+    tmp3B = _mm_slli_si128(tmp3B, 4); \
+    tmp3  = _mm_or_si128(tmp3, tmp3B); \
+    tmp2B = _mm_or_si128(tmp2B, tmp8); \
+    tmp2B = _mm_or_si128(tmp2B, tmp9); \
+    tmp3B = _mm_slli_epi32(tmp3, 31); \
+    tmp8  = _mm_slli_epi32(tmp3, 30); \
+    tmp9  = _mm_slli_epi32(tmp3, 25); \
+    tmp3B = _mm_xor_si128(tmp3B, tmp8); \
+    tmp3B = _mm_xor_si128(tmp3B, tmp9); \
+    tmp8  = _mm_srli_si128(tmp3B, 4); \
+    tmp3B = _mm_slli_si128(tmp3B, 12); \
+    tmp3  = _mm_xor_si128(tmp3, tmp3B); \
+    tmp2  = _mm_srli_epi32(tmp3, 1); \
+    tmp0B = _mm_srli_epi32(tmp3, 2); \
+    tmp1B = _mm_srli_epi32(tmp3, 7); \
+    tmp2  = _mm_xor_si128(tmp2, tmp0B); \
+    tmp2  = _mm_xor_si128(tmp2, tmp1B); \
+    tmp2  = _mm_xor_si128(tmp2, tmp8); \
+    tmp3  = _mm_xor_si128(tmp3, tmp2); \
+    tmp2B = _mm_xor_si128(tmp2B, tmp3); \
+\
+    accv = tmp2B; \
+} while(0)
+
+#define XORx(a) \
+    __m128i in##a = _mm_loadu_si128((const __m128i *) (in + a * 16)); \
+    temp##a = _mm_xor_si128(temp##a, in##a)
+
+#define LOADx(a) \
+    __m128i in##a = _mm_loadu_si128((const __m128i *) (in + a * 16));
+
+/* full encrypt &
checksum 8 blocks at once */ +#define aesni_encrypt8full(out_, n_, rkeys, in_, accum, hv_, h2v_, h3v_, h4v_) \ +do { \ + unsigned char *out = out_; \ + uint32_t *n = n_; \ + const unsigned char *in = in_; \ + const __m128i hv = hv_; \ + const __m128i h2v = h2v_; \ + const __m128i h3v = h3v_; \ + const __m128i h4v = h4v_; \ + const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \ + __m128i accv = _mm_load_si128((const __m128i *) accum); \ + int i; \ +\ + MAKE8(NVDECLx); \ + MAKE8(TEMPDECLx); \ + MAKE8(NVx); \ + MAKE8(TEMPx); \ + for (i = 1; i < 14; i++) { \ + MAKE8(AESENCx); \ + } \ + MAKE8(AESENCLASTx); \ + MAKE8(XORx); \ + MAKE8(STOREx); \ + REDUCE4(rev, hv, h2v, h3v, h4v, temp3, temp2, temp1, temp0, accv); \ + REDUCE4(rev, hv, h2v, h3v, h4v, temp7, temp6, temp5, temp4, accv); \ + _mm_store_si128((__m128i *) accum, accv); \ +} while(0) + +/* checksum 8 blocks at once */ +#define aesni_addmul8full(in_, accum, hv_, h2v_, h3v_, h4v_) \ +do { \ + const unsigned char *in = in_; \ + const __m128i hv = hv_; \ + const __m128i h2v = h2v_ ; \ + const __m128i h3v = h3v_ ; \ + const __m128i h4v = h4v_ ; \ + const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \ + __m128i accv = _mm_load_si128((const __m128i *) accum); \ +\ + MAKE8(LOADx); \ + REDUCE4(rev, hv, h2v, h3v, h4v, in3, in2, in1, in0, accv); \ + REDUCE4(rev, hv, h2v, h3v, h4v, in7, in6, in5, in4, accv); \ + _mm_store_si128((__m128i *) accum, accv); \ +} while(0) + +/* decrypt 8 blocks at once */ +#define aesni_decrypt8full(out_, n_, rkeys, in_) \ +do { \ + unsigned char *out = out_; \ + uint32_t *n = n_; \ + const unsigned char *in = in_; \ + const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + int i; \ +\ + MAKE8(NVDECLx); \ + MAKE8(TEMPDECLx); \ + MAKE8(NVx); \ + MAKE8(TEMPx); \ + for (i = 1; i < 14; i++) { \ + MAKE8(AESENCx); \ + } \ + MAKE8(AESENCLASTx); \ + MAKE8(XORx); \ + MAKE8(STOREx); \ +} while(0) + +int +crypto_aead_aes256gcm_aesni_beforenm(crypto_aead_aes256gcm_aesni_state *ctx_, + const unsigned char *k) +{ + context *ctx = (context *) ctx_; + __m128i *rkeys = ctx->rkeys; + __m128i zero = _mm_setzero_si128(); + unsigned char *H = ctx->H; + + (void) sizeof(int[(sizeof *ctx_) >= (sizeof *ctx) ? 
1 : -1]); /* compile-time check that the opaque state type can hold a context */
+    aesni_key256_expand(k, (__m128 *) rkeys);
+    aesni_encrypt1(H, zero, rkeys);
+
+    return 0;
+}
+
+int
+crypto_aead_aes256gcm_aesni_encrypt_afternm(unsigned char *c, unsigned long long *clen,
+                                            const unsigned char *m, unsigned long long mlen,
+                                            const unsigned char *ad, unsigned long long adlen,
+                                            const unsigned char *nsec,
+                                            const unsigned char *npub,
+                                            const crypto_aead_aes256gcm_aesni_state *ctx_)
+{
+    unsigned char       H[16];
+    const __m128i       rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+    const context      *ctx = (const context *) ctx_;
+    const __m128i      *rkeys = ctx->rkeys;
+    __m128i             Hv, H2v, H3v, H4v, accv;
+    unsigned long long  i, j;
+    unsigned long long  adlen_rnd64 = adlen & ~63ULL;
+    unsigned long long  mlen_rnd128 = mlen & ~127ULL;
+    CRYPTO_ALIGN(16) unsigned char n2[16];
+    CRYPTO_ALIGN(16) unsigned char T[16];
+    CRYPTO_ALIGN(16) unsigned char accum[16];
+    CRYPTO_ALIGN(16) unsigned char fb[16];
+
+    (void) nsec;
+    memcpy(H, ctx->H, sizeof H);
+    /* GCM cannot encrypt more than ~2^32 blocks of 16 bytes with one nonce */
+    if (mlen > 16ULL * (1ULL << 32)) {
+        abort();
+    }
+    memcpy(&n2[0], npub, 12);
+    /* counter = 1, stored big-endian (this code assumes a little-endian CPU) */
+    *(uint32_t *) &n2[12] = 0x01000000;
+    aesni_encrypt1(T, _mm_load_si128((const __m128i *) n2), rkeys);
+
+    (*(uint64_t *) &fb[0]) = _bswap64((uint64_t) (8 * adlen));
+    (*(uint64_t *) &fb[8]) = _bswap64((uint64_t) (8 * mlen));
+
+    /* we store H (and its powers) byte-reversed once and for all */
+    Hv = _mm_shuffle_epi8(_mm_load_si128((const __m128i *) H), rev);
+    _mm_store_si128((__m128i *) H, Hv);
+    H2v = mulv(Hv, Hv);
+    H3v = mulv(H2v, Hv);
+    H4v = mulv(H3v, Hv);
+
+    accv = _mm_setzero_si128();
+    /* unrolled by 4 GCM (by 8 doesn't improve using REDUCE4) */
+    for (i = 0; i < adlen_rnd64; i += 64) {
+        __m128i X4 = _mm_loadu_si128((const __m128i *) (ad + i + 0));
+        __m128i X3 = _mm_loadu_si128((const __m128i *) (ad + i + 16));
+        __m128i X2 = _mm_loadu_si128((const __m128i *) (ad + i + 32));
+        __m128i X1 = _mm_loadu_si128((const __m128i *) (ad + i + 48));
+        REDUCE4(rev, Hv, H2v, H3v, H4v, X1, X2, X3, X4, accv);
+    }
+    _mm_store_si128((__m128i *) accum, accv);
+
+    /* GCM remainder loop */
+    for (i = adlen_rnd64; i < adlen; i += 16) {
+        unsigned int blocklen = 16;
+
+        if (i + (unsigned long long) blocklen > adlen) {
+            blocklen = (unsigned int) (adlen - i);
+        }
+        addmul(accum, ad + i, blocklen, H);
+    }
+
+/* this only does 8 full blocks, so no fancy bounds checking is necessary */
+#define LOOPRND128 \
+    do { \
+        const int iter = 8; \
+        const int lb = iter * 16; \
+ \
+        for (i = 0; i < mlen_rnd128; i += lb) { \
+            aesni_encrypt8full(c + i, (uint32_t *) n2, rkeys, m + i, accum, Hv, H2v, H3v, H4v); \
+        } \
+    } while(0)
+
+/* remainder loop, with the slower GCM update to accommodate partial blocks */
+#define LOOPRMD128 \
+    do { \
+        const int iter = 8; \
+        const int lb = iter * 16; \
+ \
+        for (i = mlen_rnd128; i < mlen; i += lb) { \
+            CRYPTO_ALIGN(16) unsigned char outni[8 * 16]; \
+            unsigned long long mj = lb; \
+ \
+            aesni_encrypt8(outni, (uint32_t *) n2, rkeys); \
+            if ((i + mj) >= mlen) { \
+                mj = mlen - i; \
+            } \
+            for (j = 0; j < mj; j++) { \
+                c[i + j] = m[i + j] ^ outni[j]; \
+            } \
+            for (j = 0; j < mj; j += 16) { \
+                unsigned int bl = 16; \
+ \
+                if (j + (unsigned long long) bl >= mj) { \
+                    bl = (unsigned int) (mj - j); \
+                } \
+                addmul(accum, c + i + j, bl, H); \
+            } \
+        } \
+    } while(0)
+
+    n2[15] = 0;
+    COUNTER_INC2(n2);
+    LOOPRND128;
+    LOOPRMD128;
+
+    addmul(accum, fb, 16, H);
+
+    for (i = 0; i < 16; ++i) {
+        c[i + mlen] = T[i] ^ accum[15 - i];
+    }
+    if (clen != NULL) {
+        *clen = mlen + 16;
+    }
+    return 0;
+}
+
+int
+crypto_aead_aes256gcm_aesni_decrypt_afternm(unsigned char *m, unsigned long long *mlen_p,
+                                            unsigned char *nsec,
+                                            const unsigned char *c, unsigned long long clen,
+                                            const unsigned char *ad, unsigned long long adlen,
+                                            const unsigned char *npub,
+                                            const crypto_aead_aes256gcm_aesni_state *ctx_)
+{
+    unsigned char       H[16];
+    const __m128i       rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+    const context      *ctx = (const context *) ctx_;
+    const __m128i      *rkeys = ctx->rkeys;
+    __m128i             Hv, H2v, H3v, H4v, accv;
+    unsigned long long  i, j;
+    unsigned long long  adlen_rnd64 = adlen & ~63ULL;
+    unsigned long long  mlen;
+    unsigned long long  mlen_rnd128;
+    CRYPTO_ALIGN(16) unsigned char n2[16];
+    CRYPTO_ALIGN(16) unsigned char T[16];
+    CRYPTO_ALIGN(16) unsigned char accum[16];
+    CRYPTO_ALIGN(16) unsigned char fb[16];
+
+    (void) nsec;
+    memcpy(H, ctx->H, sizeof H);
+    if (clen > 16ULL * (1ULL << 32) - 16ULL) {
+        abort();
+    }
+    if (mlen_p != NULL) {
+        *mlen_p = 0U;
+    }
+    /* a ciphertext shorter than the tag cannot be valid, and would make
+       mlen underflow below */
+    if (clen < 16ULL) {
+        return -1;
+    }
+    mlen = clen - 16;
+    memcpy(&n2[0], npub, 12);
+    *(uint32_t *) &n2[12] = 0x01000000;
+    aesni_encrypt1(T, _mm_load_si128((const __m128i *) n2), rkeys);
+
+    (*(uint64_t *) &fb[0]) = _bswap64((uint64_t) (8 * adlen));
+    (*(uint64_t *) &fb[8]) = _bswap64((uint64_t) (8 * mlen));
+
+    Hv = _mm_shuffle_epi8(_mm_load_si128((const __m128i *) H), rev);
+    _mm_store_si128((__m128i *) H, Hv);
+    H2v = mulv(Hv, Hv);
+    H3v = mulv(H2v, Hv);
+    H4v = mulv(H3v, Hv);
+
+    accv = _mm_setzero_si128();
+    for (i = 0; i < adlen_rnd64; i += 64) {
+        __m128i X4 = _mm_loadu_si128((const __m128i *) (ad + i + 0));
+        __m128i X3 = _mm_loadu_si128((const __m128i *) (ad + i + 16));
+        __m128i X2 = _mm_loadu_si128((const __m128i *) (ad + i + 32));
+        __m128i X1 = _mm_loadu_si128((const __m128i *) (ad + i + 48));
+        REDUCE4(rev, Hv, H2v, H3v, H4v, X1, X2, X3, X4, accv);
+    }
+    _mm_store_si128((__m128i *) accum, accv);
+
+    for (i = adlen_rnd64; i < adlen; i += 16) {
+        unsigned int blocklen = 16;
+
+        if (i + (unsigned long long) blocklen > adlen) {
+            blocklen = (unsigned int) (adlen - i);
+        }
+        addmul(accum, ad + i, blocklen, H);
+    }
+
+    mlen_rnd128 = mlen & ~127ULL;
+
+#define LOOPACCUMDRND128 \
+    do { \
+        const int iter = 8; \
+        const int lb = iter * 16; \
+        for (i = 0; i < mlen_rnd128; i += lb) { \
+            aesni_addmul8full(c + i, accum, Hv, H2v, H3v, H4v); \
+        } \
+    } while(0)
+
+#define LOOPDRND128 \
+    do { \
+        const int iter = 8; \
+        const int lb = iter * 16; \
+        for (i = 0; i < mlen_rnd128; i += lb) { \
+            aesni_decrypt8full(m + i, (uint32_t *) n2, rkeys, c + i); \
+        } \
+    } while(0)
+
+#define LOOPACCUMDRMD128 \
+    do { \
+        const int iter = 8; \
+        const int lb = iter * 16; \
+ \
+        for (i = mlen_rnd128; i < mlen; i += lb) { \
+            unsigned long long mj = lb; \
+ \
+            if ((i + mj) >= mlen) { \
+                mj = mlen - i; \
+            } \
+            for (j = 0; j < mj; j += 16) { \
+                unsigned int bl = 16; \
+ \
+                if (j + (unsigned long long) bl >= mj) { \
+                    bl = (unsigned int) (mj - j); \
+                } \
+                addmul(accum, c + i + j, bl, H); \
+            } \
+        } \
+    } while(0)
+
+#define LOOPDRMD128 \
+    do { \
+        const int iter = 8; \
+        const int lb = iter * 16; \
+ \
+        for (i = mlen_rnd128; i < mlen; i += lb) { \
+            CRYPTO_ALIGN(16) unsigned char outni[8 * 16]; \
+            unsigned long long mj = lb; \
+ \
+            if ((i + mj) >= mlen) { \
+                mj = mlen - i; \
+            } \
+            aesni_encrypt8(outni, (uint32_t *) n2, rkeys); \
+            for (j = 0; j < mj; j++) { \
+                m[i + j] = c[i + j] ^ outni[j]; \
+            } \
+        } \
+    } while(0)
+
+    n2[15] = 0;
+
+    COUNTER_INC2(n2);
+    LOOPACCUMDRND128;
+    LOOPACCUMDRMD128;
+    addmul(accum, fb, 16, H);
+    {
+        /* constant-time tag comparison */
+        unsigned char d = 0;
+
+        for (i = 0; i < 16; i++) {
+            d |= (c[i + mlen] ^ (T[i] ^ accum[15 - i]));
+        }
+        if (d != 0) {
+            return -1;
+        }
+    }
+    *(uint32_t *) &n2[12] = 0;
+    COUNTER_INC2(n2);
+    LOOPDRND128;
+    LOOPDRMD128;
+
+    if (mlen_p != NULL) {
+        *mlen_p = mlen;
+    }
+    return 0;
+}
+
+int
+crypto_aead_aes256gcm_aesni_encrypt(unsigned char *c,
+                                    unsigned long long *clen_p,
+                                    const unsigned char *m,
+                                    unsigned long long mlen,
+                                    const unsigned char *ad,
+                                    unsigned long long adlen,
+                                    const unsigned char *nsec,
+                                    const unsigned char *npub,
+                                    const unsigned char *k)
+{
+    crypto_aead_aes256gcm_aesni_state ctx;
+
+    crypto_aead_aes256gcm_aesni_beforenm(&ctx, k);
+
+    return crypto_aead_aes256gcm_aesni_encrypt_afternm
+        (c, clen_p, m, mlen, ad, adlen, nsec, npub, &ctx);
+}
+
+int
+crypto_aead_aes256gcm_aesni_decrypt(unsigned char *m,
+                                    unsigned long long *mlen_p,
+                                    unsigned char *nsec,
+                                    const unsigned char *c,
+                                    unsigned long long clen,
+                                    const unsigned char *ad,
+                                    unsigned long long adlen,
+                                    const unsigned char *npub,
+                                    const unsigned char *k)
+{
+    crypto_aead_aes256gcm_aesni_state ctx;
+
+    crypto_aead_aes256gcm_aesni_beforenm(&ctx, k);
+
+    return crypto_aead_aes256gcm_aesni_decrypt_afternm
+        (m, mlen_p, nsec, c, clen, ad, adlen, npub, &ctx);
+}
+
+size_t
+crypto_aead_aes256gcm_aesni_keybytes(void)
+{
+    return crypto_aead_aes256gcm_KEYBYTES;
+}
+
+size_t
+crypto_aead_aes256gcm_aesni_nsecbytes(void)
+{
+    return crypto_aead_aes256gcm_NSECBYTES;
+}
+
+size_t
+crypto_aead_aes256gcm_aesni_npubbytes(void)
+{
+    return crypto_aead_aes256gcm_NPUBBYTES;
+}
+
+size_t
+crypto_aead_aes256gcm_aesni_abytes(void)
+{
+    return crypto_aead_aes256gcm_ABYTES;
+}
+
+size_t
+crypto_aead_aes256gcm_aesni_statebytes(void)
+{
+    return sizeof(crypto_aead_aes256gcm_aesni_state);
+}
+
+#endif
diff --git a/src/libsodium/include/Makefile.am b/src/libsodium/include/Makefile.am
index 77e01eac..854e35fa 100644
--- a/src/libsodium/include/Makefile.am
+++ b/src/libsodium/include/Makefile.am
@@ -2,6 +2,7 @@ SODIUM_EXPORT = \
 	sodium.h \
 	sodium/core.h \
+	sodium/crypto_aead_aes256gcm_aesni.h \
 	sodium/crypto_aead_chacha20poly1305.h \
 	sodium/crypto_auth.h \
 	sodium/crypto_auth_hmacsha256.h \
diff --git a/src/libsodium/include/sodium.h b/src/libsodium/include/sodium.h
index 97280f93..b615cad8 100644
--- a/src/libsodium/include/sodium.h
+++ b/src/libsodium/include/sodium.h
@@ -3,6 +3,7 @@
 #define sodium_H
 
 #include "sodium/core.h"
+#include "sodium/crypto_aead_aes256gcm_aesni.h"
 #include "sodium/crypto_aead_chacha20poly1305.h"
 #include "sodium/crypto_auth.h"
 #include "sodium/crypto_auth_hmacsha256.h"
diff --git a/src/libsodium/include/sodium/crypto_aead_aes256gcm_aesni.h b/src/libsodium/include/sodium/crypto_aead_aes256gcm_aesni.h
new file mode 100644
index 00000000..53c02abd
--- /dev/null
+++ b/src/libsodium/include/sodium/crypto_aead_aes256gcm_aesni.h
@@ -0,0 +1,85 @@
+#ifndef crypto_aead_aes256gcm_aesni_H
+#define crypto_aead_aes256gcm_aesni_H
+
+#include <stddef.h>
+#include "export.h"
+
+#ifdef __cplusplus
+# if __GNUC__
+#  pragma GCC diagnostic ignored "-Wlong-long"
+# endif
+extern "C" {
+#endif
+
+#define crypto_aead_aes256gcm_KEYBYTES 32U
+SODIUM_EXPORT
+size_t crypto_aead_aes256gcm_aesni_keybytes(void);
+
+#define crypto_aead_aes256gcm_NSECBYTES 0U
+SODIUM_EXPORT
+size_t crypto_aead_aes256gcm_aesni_nsecbytes(void);
+
+#define crypto_aead_aes256gcm_NPUBBYTES 12U
+SODIUM_EXPORT
+size_t crypto_aead_aes256gcm_aesni_npubbytes(void);
+
+#define crypto_aead_aes256gcm_ABYTES 16U
+SODIUM_EXPORT
+size_t crypto_aead_aes256gcm_aesni_abytes(void);
+
+typedef CRYPTO_ALIGN(16) unsigned char crypto_aead_aes256gcm_aesni_state[272]; +SODIUM_EXPORT +size_t crypto_aead_aes256gcm_aesni_statebytes(void); + +SODIUM_EXPORT +int crypto_aead_aes256gcm_aesni_encrypt(unsigned char *c, + unsigned long long *clen_p, + const unsigned char *m, + unsigned long long mlen, + const unsigned char *ad, + unsigned long long adlen, + const unsigned char *nsec, + const unsigned char *npub, + const unsigned char *k); + +SODIUM_EXPORT +int crypto_aead_aes256gcm_aesni_decrypt(unsigned char *m, + unsigned long long *mlen_p, + unsigned char *nsec, + const unsigned char *c, + unsigned long long clen, + const unsigned char *ad, + unsigned long long adlen, + const unsigned char *npub, + const unsigned char *k); + +SODIUM_EXPORT +int crypto_aead_aes256gcm_aesni_beforenm(crypto_aead_aes256gcm_aesni_state *ctx_, + const unsigned char *k); + +SODIUM_EXPORT +int crypto_aead_aes256gcm_aesni_encrypt_afternm(unsigned char *c, + unsigned long long *clen_p, + const unsigned char *m, + unsigned long long mlen, + const unsigned char *ad, + unsigned long long adlen, + const unsigned char *nsec, + const unsigned char *npub, + const crypto_aead_aes256gcm_aesni_state *ctx_); + +SODIUM_EXPORT +int crypto_aead_aes256gcm_aesni_decrypt_afternm(unsigned char *m, + unsigned long long *mlen_p, + unsigned char *nsec, + const unsigned char *c, + unsigned long long clen, + const unsigned char *ad, + unsigned long long adlen, + const unsigned char *npub, + const crypto_aead_aes256gcm_aesni_state *ctx_); +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/libsodium/include/sodium/crypto_generichash_blake2b.h b/src/libsodium/include/sodium/crypto_generichash_blake2b.h index cddfdaa6..a88a59ec 100644 --- a/src/libsodium/include/sodium/crypto_generichash_blake2b.h +++ b/src/libsodium/include/sodium/crypto_generichash_blake2b.h @@ -7,12 +7,6 @@ #include "export.h" -#if defined(_MSC_VER) -# define CRYPTO_ALIGN(x) __declspec(align(x)) -#else -# define CRYPTO_ALIGN(x) __attribute__((aligned(x))) -#endif - #ifdef __cplusplus # if __GNUC__ # pragma GCC diagnostic ignored "-Wlong-long" diff --git a/src/libsodium/include/sodium/export.h b/src/libsodium/include/sodium/export.h index 53fcd7b5..89f10fe9 100644 --- a/src/libsodium/include/sodium/export.h +++ b/src/libsodium/include/sodium/export.h @@ -29,4 +29,12 @@ # endif #endif +#ifndef CRYPTO_ALIGN +# if defined(__INTEL_COMPILER) || defined(_MSC_VER) +# define CRYPTO_ALIGN(x) __declspec(align(x)) +# else +# define CRYPTO_ALIGN(x) __attribute__((aligned(x))) +# endif +#endif + #endif diff --git a/src/libsodium/include/sodium/runtime.h b/src/libsodium/include/sodium/runtime.h index 50226ae1..3bdc4dcd 100644 --- a/src/libsodium/include/sodium/runtime.h +++ b/src/libsodium/include/sodium/runtime.h @@ -20,6 +20,12 @@ int sodium_runtime_has_sse2(void); SODIUM_EXPORT int sodium_runtime_has_sse3(void); +SODIUM_EXPORT +int sodium_runtime_has_pclmul(void); + +SODIUM_EXPORT +int sodium_runtime_has_aesni(void); + #ifdef __cplusplus } #endif diff --git a/src/libsodium/sodium/runtime.c b/src/libsodium/sodium/runtime.c index 93b07932..2cf915c8 100644 --- a/src/libsodium/sodium/runtime.c +++ b/src/libsodium/sodium/runtime.c @@ -10,12 +10,16 @@ typedef struct CPUFeatures_ { int has_neon; int has_sse2; int has_sse3; + int has_pclmul; + int has_aesni; } CPUFeatures; static CPUFeatures _cpu_features; -#define CPUID_SSE2 0x04000000 -#define CPUIDECX_SSE3 0x00000001 +#define CPUID_SSE2 0x04000000 +#define CPUIDECX_SSE3 0x00000001 +#define CPUIDECX_PCLMUL 0x00000002 +#define 
CPUIDECX_AESNI 0x02000000 static int _sodium_runtime_arm_cpu_features(CPUFeatures * const cpu_features) @@ -104,6 +108,14 @@ _sodium_runtime_intel_cpu_features(CPUFeatures * const cpu_features) cpu_features->has_sse3 = ((cpu_info[2] & CPUIDECX_SSE3) != 0x0); #endif +#ifndef HAVE_WMMINTRIN_H + cpu_features->has_pclmul = 0; + cpu_features->has_aesni = 0; +#else + cpu_features->has_pclmul = ((cpu_info[2] & CPUIDECX_PCLMUL) != 0x0); + cpu_features->has_aesni = ((cpu_info[2] & CPUIDECX_AESNI) != 0x0); +#endif + return 0; } @@ -133,3 +145,13 @@ int sodium_runtime_has_sse3(void) { return _cpu_features.has_sse3; } + +int +sodium_runtime_has_pclmul(void) { + return _cpu_features.has_pclmul; +} + +int +sodium_runtime_has_aesni(void) { + return _cpu_features.has_aesni; +}
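
For reference, here is a minimal usage sketch of the API this patch introduces. Every identifier below comes from this diff or from existing libsodium (sodium_init(), randombytes_buf()); the message contents and buffer sizes are illustrative only, and error handling is trimmed to the essentials:

#include <stdio.h>
#include <string.h>
#include <sodium.h>

int
main(void)
{
    unsigned char key[crypto_aead_aes256gcm_KEYBYTES];
    unsigned char nonce[crypto_aead_aes256gcm_NPUBBYTES];
    unsigned char m[12] = "test 123456"; /* illustrative message */
    unsigned char c[sizeof m + crypto_aead_aes256gcm_ABYTES];
    unsigned char d[sizeof m];
    unsigned long long clen, dlen;
    crypto_aead_aes256gcm_aesni_state st;

    if (sodium_init() < 0) {
        return 1;
    }
    /* this implementation requires both AES-NI and PCLMULQDQ */
    if (!sodium_runtime_has_aesni() || !sodium_runtime_has_pclmul()) {
        fprintf(stderr, "AES256-GCM (AES-NI) is not supported on this CPU\n");
        return 1;
    }
    randombytes_buf(key, sizeof key);
    randombytes_buf(nonce, sizeof nonce);

    /* expand the key once; the state can then be reused for many messages */
    crypto_aead_aes256gcm_aesni_beforenm(&st, key);

    crypto_aead_aes256gcm_aesni_encrypt_afternm(c, &clen, m, sizeof m,
                                                NULL, 0, NULL, nonce, &st);
    if (crypto_aead_aes256gcm_aesni_decrypt_afternm(d, &dlen, NULL, c, clen,
                                                    NULL, 0, nonce, &st) != 0) {
        fprintf(stderr, "tag verification failed\n");
        return 1;
    }
    return (dlen == sizeof m && memcmp(m, d, (size_t) dlen) == 0) ? 0 : 1;
}

The beforenm/afternm split amortizes the AES-256 key expansion and the computation of the GHASH key H across messages; the one-shot crypto_aead_aes256gcm_aesni_encrypt()/decrypt() wrappers simply perform beforenm on a stack-allocated state first. As with any GCM construction, a (key, nonce) pair must never be reused.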