Merge branch 'aes256gcm'
* aes256gcm: (25 commits)
  aes256gcm: we can expect the accumulator and the padding buffer to be aligned
  aesgcm: don't expect input & output buffers to be aligned
  aes256gcm doesn't use SSE4.1 instructions any more
  Don't read past the AD buffer, even through an SIMD register
  Convert more functions to macros
  Add do { ... } while(0) when relevant
  Turn reduce4 into a macro
  That's too much registers for a function call in 32-bit mode. And in MSVC, this is even the case if the function is marked inline.
  Enable aes256gcm on Visual Studio
  Don't declare new variables after a line of code
  Declare __m128 arrays used as parameters as pointers Required for MSVC
  Proper casts for aeskeygenassist()
  Let's hope that requiring ssse3 is not required any more
  Try to enable specific cflags before testing each intructions set
  ssse3 target is required in addition to sse4.1
  Use SIMD-specific compiler flags only for files needing them
  Define __SSSE3__ if required
  Do not try to compile aesni code if this is not going to compile
  Check for AESNI & PCLMUL presence/usability
  Replace the aes256gcm implementation with Romain Dolbeau's implementation which is slightly faster than mine. Reimplement features from the previous implementation: add batch mode and use two passes in the decryption function in order to check the tag before decrypting.
  Explicit cast
  ...
This commit is contained in: 1dddd63a19
AUTHORS | 3
@@ -32,6 +32,9 @@ scrypt Colin Percival
Implementors
============

crypto_aead/aes256gcm/aesni            Romain Dolbeau
                                       Frank Denis

crypto_aead/chacha20poly1305           Frank Denis

crypto_box/curve25519xsalsa20poly1305  Daniel J. Bernstein
configure.ac | 67
@@ -263,23 +263,36 @@ dnl Checks for headers
AS_IF([test "x$EMSCRIPTEN" = "x"],[

AC_MSG_CHECKING(for MMX instructions set)
oldcflags="$CFLAGS"
AX_CHECK_COMPILE_FLAG([-mmmx], [CFLAGS="$CFLAGS -mmmx"])
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
#pragma GCC target("mmx")
#include <mmintrin.h>
]], [[ __m64 x = _mm_setzero_si64(); ]])],
[AC_MSG_RESULT(yes)
AC_DEFINE([HAVE_MMINTRIN_H], [1], [mmx is available])],
AC_DEFINE([HAVE_MMINTRIN_H], [1], [mmx is available])
AX_CHECK_COMPILE_FLAG([-mmmx], [CFLAGS_MMX="-mmmx"])],
[AC_MSG_RESULT(no)])
CFLAGS="$oldcflags"

AC_MSG_CHECKING(for SSE2 instructions set)
oldcflags="$CFLAGS"
AX_CHECK_COMPILE_FLAG([-msse2], [CFLAGS="$CFLAGS -msse2"])
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
#pragma GCC target("sse2")
#ifndef __SSE2__
# define __SSE2__
#endif
#include <emmintrin.h>
]], [[ __m128d x = _mm_setzero_pd(); ]])],
[AC_MSG_RESULT(yes)
AC_DEFINE([HAVE_EMMINTRIN_H], [1], [sse2 is available])],
AC_DEFINE([HAVE_EMMINTRIN_H], [1], [sse2 is available])
AX_CHECK_COMPILE_FLAG([-msse2], [CFLAGS_SSE2="-msse2"])],
[AC_MSG_RESULT(no)])
CFLAGS="$oldcflags"

oldcflags="$CFLAGS"
AX_CHECK_COMPILE_FLAG([-msse3], [CFLAGS="$CFLAGS -msse3"])
AC_MSG_CHECKING(for SSE3 instructions set)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
#pragma GCC target("sse3")
@@ -287,19 +300,65 @@ AS_IF([test "x$EMSCRIPTEN" = "x"],[
]], [[ __m128 x = _mm_addsub_ps(_mm_cvtpd_ps(_mm_setzero_pd()),
                                _mm_cvtpd_ps(_mm_setzero_pd())); ]])],
[AC_MSG_RESULT(yes)
AC_DEFINE([HAVE_PMMINTRIN_H], [1], [sse3 is available])],
AC_DEFINE([HAVE_PMMINTRIN_H], [1], [sse3 is available])
AX_CHECK_COMPILE_FLAG([-msse3], [CFLAGS_SSE3="-msse3"])],
[AC_MSG_RESULT(no)])
CFLAGS="$oldcflags"

oldcflags="$CFLAGS"
AX_CHECK_COMPILE_FLAG([-mssse3], [CFLAGS="$CFLAGS -mssse3"])
AC_MSG_CHECKING(for SSSE3 instructions set)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
#pragma GCC target("ssse3")
#include <tmmintrin.h>
]], [[ __m64 x = _mm_abs_pi32(_m_from_int(0)); ]])],
[AC_MSG_RESULT(yes)
AC_DEFINE([HAVE_TMMINTRIN_H], [1], [ssse3 is available])],
AC_DEFINE([HAVE_TMMINTRIN_H], [1], [ssse3 is available])
AX_CHECK_COMPILE_FLAG([-mssse3], [CFLAGS_SSSE3="-mssse3"])],
[AC_MSG_RESULT(no)])
CFLAGS="$oldcflags"

oldcflags="$CFLAGS"
AX_CHECK_COMPILE_FLAG([-msse4.1], [CFLAGS="$CFLAGS -msse4.1"])
AC_MSG_CHECKING(for SSE4.1 instructions set)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
#pragma GCC target("sse4.1")
#include <smmintrin.h>
]], [[ __m128i x = _mm_minpos_epu16(_mm_setzero_si128()); ]])],
[AC_MSG_RESULT(yes)
AC_DEFINE([HAVE_SMMINTRIN_H], [1], [sse4.1 is available])
AX_CHECK_COMPILE_FLAG([-msse4.1], [CFLAGS_SSE4_1="-msse4.1"])],
[AC_MSG_RESULT(no)])
CFLAGS="$oldcflags"

oldcflags="$CFLAGS"
AX_CHECK_COMPILE_FLAG([-maes], [CFLAGS="$CFLAGS -maes"])
AX_CHECK_COMPILE_FLAG([-mpclmul], [CFLAGS="$CFLAGS -mpclmul"])
AC_MSG_CHECKING(for AESNI instructions set and PCLMULQDQ)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
#pragma GCC target("aes")
#pragma GCC target("pclmul")
#include <wmmintrin.h>
]], [[ __m128i x = _mm_aesimc_si128(_mm_setzero_si128());
       __m128i y = _mm_clmulepi64_si128(_mm_setzero_si128(), _mm_setzero_si128(), 0);]])],
[AC_MSG_RESULT(yes)
AC_DEFINE([HAVE_WMMINTRIN_H], [1], [aesni is available])
AX_CHECK_COMPILE_FLAG([-maes], [CFLAGS_AESNI="-maes"])
AX_CHECK_COMPILE_FLAG([-mpclmul], [CFLAGS_PCLMUL="-mpclmul"])
],
[AC_MSG_RESULT(no)])
CFLAGS="$oldcflags"

])

AC_SUBST(CFLAGS_MMX)
AC_SUBST(CFLAGS_SSE2)
AC_SUBST(CFLAGS_SSE3)
AC_SUBST(CFLAGS_SSSE3)
AC_SUBST(CFLAGS_SSE4_1)
AC_SUBST(CFLAGS_AESNI)
AC_SUBST(CFLAGS_PCLMUL)

AC_CHECK_HEADERS([sys/mman.h])

dnl Checks for typedefs, structures, and compiler characteristics.
@@ -62,7 +62,6 @@ libsodium_la_SOURCES = \
        crypto_pwhash/scryptsalsa208sha256/pwhash_scryptsalsa208sha256.c \
        crypto_pwhash/scryptsalsa208sha256/sysendian.h \
        crypto_pwhash/scryptsalsa208sha256/nosse/pwhash_scryptsalsa208sha256_nosse.c \
        crypto_pwhash/scryptsalsa208sha256/sse/pwhash_scryptsalsa208sha256_sse.c \
        crypto_scalarmult/crypto_scalarmult.c \
        crypto_scalarmult/curve25519/scalarmult_curve25519_api.c \
        crypto_secretbox/crypto_secretbox.c \
@@ -268,3 +267,20 @@ endif

SUBDIRS = \
        include

libsodium_la_LIBADD = libaesni.la libsse2.la
noinst_LTLIBRARIES = libaesni.la libsse2.la

libaesni_la_LDFLAGS = $(libsodium_la_LDFLAGS)
libaesni_la_CPPFLAGS = $(libsodium_la_CPPFLAGS) \
        @CFLAGS_SSSE3@ @CFLAGS_AESNI@ @CFLAGS_PCLMUL@

libaesni_la_SOURCES = \
        crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c

libsse2_la_LDFLAGS = $(libsodium_la_LDFLAGS)
libsse2_la_CPPFLAGS = $(libsodium_la_CPPFLAGS) \
        @CFLAGS_SSE2@

libsse2_la_SOURCES = \
        crypto_pwhash/scryptsalsa208sha256/sse/pwhash_scryptsalsa208sha256_sse.c
src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c | 828 (new file)
@ -0,0 +1,828 @@
|
||||
|
||||
/*
|
||||
* AES256-GCM, based on original code by Romain Dolbeau
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "crypto_aead_aes256gcm_aesni.h"
|
||||
#include "export.h"
|
||||
#include "utils.h"
|
||||
|
||||
#if defined(HAVE_WMMINTRIN_H) || \
|
||||
(defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)))
|
||||
|
||||
#pragma GCC target("ssse3")
|
||||
#pragma GCC target("aes")
|
||||
#pragma GCC target("pclmul")
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
#if defined(__INTEL_COMPILER) || defined(_bswap64)
|
||||
#elif defined(_MSC_VER)
|
||||
# define _bswap64(a) _byteswap_uint64(a)
|
||||
#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))
|
||||
# define _bswap64(a) __builtin_bswap64(a)
|
||||
#else
|
||||
static inline uint64_t
|
||||
_bswap64(const uint64_t x)
|
||||
{
|
||||
return
|
||||
((x << 56) & 0xFF00000000000000UL) | ((x << 40) & 0x00FF000000000000UL) |
|
||||
((x << 24) & 0x0000FF0000000000UL) | ((x << 8) & 0x000000FF00000000UL) |
|
||||
((x >> 8) & 0x00000000FF000000UL) | ((x >> 24) & 0x0000000000FF0000UL) |
|
||||
((x >> 40) & 0x000000000000FF00UL) | ((x >> 56) & 0x00000000000000FFUL);
|
||||
}
|
||||
#endif
|
||||
|
||||
typedef struct context {
|
||||
CRYPTO_ALIGN(16) unsigned char H[16];
|
||||
__m128i rkeys[16];
|
||||
} context;
|
||||
|
||||
static inline void
|
||||
aesni_key256_expand(const unsigned char *key, __m128 *rkeys)
|
||||
{
|
||||
__m128 key0 = _mm_loadu_ps((const float *) (key + 0));
|
||||
__m128 key1 = _mm_loadu_ps((const float *) (key + 16));
|
||||
__m128 temp0, temp1, temp2, temp4;
|
||||
int idx = 0;
|
||||
|
||||
rkeys[idx++] = key0;
|
||||
temp0 = key0;
|
||||
temp2 = key1;
|
||||
temp4 = _mm_setzero_ps();
|
||||
|
||||
    /* Why single-precision floating-point rather than integer instructions?
       Because _mm_shuffle_ps takes two inputs, while _mm_shuffle_epi32 only
       takes one - it doesn't perform the same computation.
       _mm_shuffle_ps takes the lower 64 bits of the result from the first
       operand, and the higher 64 bits of the result from the second operand
       (in both cases, all four input floats are accessible).
       I don't like the non-orthogonal naming scheme :-(

       This is all strongly inspired by the OpenSSL assembly code.
    */
|
||||
#define BLOCK1(IMM) \
|
||||
temp1 = _mm_castsi128_ps(_mm_aeskeygenassist_si128(_mm_castps_si128(temp2), IMM));\
|
||||
rkeys[idx++] = temp2; \
|
||||
temp4 = _mm_shuffle_ps(temp4, temp0, 0x10); \
|
||||
temp0 = _mm_xor_ps(temp0, temp4); \
|
||||
temp4 = _mm_shuffle_ps(temp4, temp0, 0x8c); \
|
||||
temp0 = _mm_xor_ps(temp0, temp4); \
|
||||
temp1 = _mm_shuffle_ps(temp1, temp1, 0xff); \
|
||||
temp0 = _mm_xor_ps(temp0, temp1)
|
||||
|
||||
#define BLOCK2(IMM) \
|
||||
temp1 = _mm_castsi128_ps(_mm_aeskeygenassist_si128(_mm_castps_si128(temp0), IMM));\
|
||||
rkeys[idx++] = temp0; \
|
||||
temp4 = _mm_shuffle_ps(temp4, temp2, 0x10); \
|
||||
temp2 = _mm_xor_ps(temp2, temp4); \
|
||||
temp4 = _mm_shuffle_ps(temp4, temp2, 0x8c); \
|
||||
temp2 = _mm_xor_ps(temp2, temp4); \
|
||||
temp1 = _mm_shuffle_ps(temp1, temp1, 0xaa); \
|
||||
temp2 = _mm_xor_ps(temp2, temp1)
|
||||
|
||||
BLOCK1(0x01);
|
||||
BLOCK2(0x01);
|
||||
|
||||
BLOCK1(0x02);
|
||||
BLOCK2(0x02);
|
||||
|
||||
BLOCK1(0x04);
|
||||
BLOCK2(0x04);
|
||||
|
||||
BLOCK1(0x08);
|
||||
BLOCK2(0x08);
|
||||
|
||||
BLOCK1(0x10);
|
||||
BLOCK2(0x10);
|
||||
|
||||
BLOCK1(0x20);
|
||||
BLOCK2(0x20);
|
||||
|
||||
BLOCK1(0x40);
|
||||
rkeys[idx++] = temp0;
|
||||
}
|
||||
|
||||
/** single, by-the-book AES encryption with AES-NI */
|
||||
static inline void
|
||||
aesni_encrypt1(unsigned char *out, __m128i nv, const __m128i *rkeys)
|
||||
{
|
||||
__m128i temp = _mm_xor_si128(nv, rkeys[0]);
|
||||
int i;
|
||||
|
||||
#pragma unroll(13)
|
||||
for (i = 1; i < 14; i++) {
|
||||
temp = _mm_aesenc_si128(temp, rkeys[i]);
|
||||
}
|
||||
temp = _mm_aesenclast_si128(temp, rkeys[14]);
|
||||
_mm_storeu_si128((__m128i *) out, temp);
|
||||
}
|
||||
|
||||
/** multiple-blocks-at-once AES encryption with AES-NI;
    on Haswell, aesenc has a latency of 7 and a throughput of 1,
    so the sequence of aesenc instructions should be bubble-free if you
    have at least 8 independent blocks. Let's build an arbitrary-sized
    function */
|
||||
/* Step 1 : loading the nonce */
|
||||
/* load & increment the n vector (non-vectorized, unused for now) */
|
||||
#define NVDECLx(a) \
|
||||
__m128i nv##a
|
||||
|
||||
#define NVx(a) \
|
||||
nv##a = _mm_shuffle_epi8(_mm_load_si128((const __m128i *) n), pt); \
|
||||
n[3]++
|
||||
|
||||
/* Step 2 : define value in round one (xor with subkey #0, aka key) */
|
||||
#define TEMPDECLx(a) \
|
||||
__m128i temp##a
|
||||
|
||||
#define TEMPx(a) \
|
||||
temp##a = _mm_xor_si128(nv##a, rkeys[0])
|
||||
|
||||
/* Step 3: one round of AES */
|
||||
#define AESENCx(a) \
|
||||
temp##a = _mm_aesenc_si128(temp##a, rkeys[i])
|
||||
|
||||
/* Step 4: last round of AES */
|
||||
#define AESENCLASTx(a) \
|
||||
temp##a = _mm_aesenclast_si128(temp##a, rkeys[14])
|
||||
|
||||
/* Step 5: store result */
|
||||
#define STOREx(a) \
|
||||
_mm_storeu_si128((__m128i *) (out + (a * 16)), temp##a)
|
||||
|
||||
/* all the MAKE* macros are for automatic explicit unrolling */
|
||||
#define MAKE4(X) \
|
||||
X(0); \
|
||||
X(1); \
|
||||
X(2); \
|
||||
X(3)
|
||||
|
||||
#define MAKE8(X) \
|
||||
X(0); \
|
||||
X(1); \
|
||||
X(2); \
|
||||
X(3); \
|
||||
X(4); \
|
||||
X(5); \
|
||||
X(6); \
|
||||
X(7)
|
||||
|
||||
#define COUNTER_INC2(N) (*(uint32_t *) &(N)[12]) = (2U + (((*(uint32_t *) &(N)[12]))))
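/* counter handling: the 32-bit block counter lives in bytes 12..15 of the
   nonce buffer and is incremented in native (little-endian) order (n[3]++,
   COUNTER_INC2); the `pt` shuffle in NVx reverses those four bytes when the
   block is loaded, which restores GCM's big-endian counter layout. Starting
   the bulk loops at counter value 2 matches GCM: counter value 1 is reserved
   for the tag mask E_K(J0), which is computed separately into T. */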
|
||||
|
||||
/* create a function of unrolling N ; the MAKEN is the unrolling
|
||||
macro, defined above. The N in MAKEN must match N, obviously. */
|
||||
#define FUNC(N, MAKEN) \
|
||||
static inline void aesni_encrypt##N(unsigned char *out, uint32_t *n, const __m128i *rkeys) \
|
||||
{ \
|
||||
const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
|
||||
int i; \
|
||||
MAKEN(NVDECLx); \
|
||||
MAKEN(TEMPDECLx); \
|
||||
\
|
||||
MAKEN(NVx); \
|
||||
MAKEN(TEMPx); \
|
||||
for (i = 1; i < 14; i++) { \
|
||||
MAKEN(AESENCx); \
|
||||
} \
|
||||
MAKEN(AESENCLASTx); \
|
||||
MAKEN(STOREx); \
|
||||
}
|
||||
|
||||
FUNC(8, MAKE8)
|
||||
|
||||
/* all GF(2^128) functions are by the book, meaning this one:
   <https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf>
*/
|
||||
|
||||
static inline void
|
||||
addmul(unsigned char *c, const unsigned char *a, unsigned int xlen, const unsigned char *b)
|
||||
{
|
||||
const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
__m128i A;
|
||||
|
||||
if (xlen >= 16) {
|
||||
A = _mm_loadu_si128((const __m128i *) a);
|
||||
} else {
|
||||
CRYPTO_ALIGN(16) unsigned char padded[16];
|
||||
memset(padded, 0, 16);
|
||||
memcpy(padded, a, xlen);
|
||||
A = _mm_load_si128((const __m128i *) padded);
|
||||
}
|
||||
A = _mm_shuffle_epi8(A, rev);
|
||||
__m128i B = _mm_loadu_si128((const __m128i *) b);
|
||||
__m128i C = _mm_loadu_si128((const __m128i *) c);
|
||||
A = _mm_xor_si128(A, C);
|
||||
__m128i tmp3 = _mm_clmulepi64_si128(A, B, 0x00);
|
||||
__m128i tmp4 = _mm_clmulepi64_si128(A, B, 0x10);
|
||||
__m128i tmp5 = _mm_clmulepi64_si128(A, B, 0x01);
|
||||
__m128i tmp6 = _mm_clmulepi64_si128(A, B, 0x11);
|
||||
__m128i tmp10 = _mm_xor_si128(tmp4, tmp5);
|
||||
__m128i tmp13 = _mm_slli_si128(tmp10, 8);
|
||||
__m128i tmp11 = _mm_srli_si128(tmp10, 8);
|
||||
__m128i tmp15 = _mm_xor_si128(tmp3, tmp13);
|
||||
__m128i tmp17 = _mm_xor_si128(tmp6, tmp11);
|
||||
__m128i tmp7 = _mm_srli_epi32(tmp15, 31);
|
||||
__m128i tmp8 = _mm_srli_epi32(tmp17, 31);
|
||||
__m128i tmp16 = _mm_slli_epi32(tmp15, 1);
|
||||
__m128i tmp18 = _mm_slli_epi32(tmp17, 1);
|
||||
__m128i tmp9 = _mm_srli_si128(tmp7, 12);
|
||||
__m128i tmp22 = _mm_slli_si128(tmp8, 4);
|
||||
__m128i tmp25 = _mm_slli_si128(tmp7, 4);
|
||||
__m128i tmp29 = _mm_or_si128(tmp16, tmp25);
|
||||
__m128i tmp19 = _mm_or_si128(tmp18, tmp22);
|
||||
__m128i tmp20 = _mm_or_si128(tmp19, tmp9);
|
||||
__m128i tmp26 = _mm_slli_epi32(tmp29, 31);
|
||||
__m128i tmp23 = _mm_slli_epi32(tmp29, 30);
|
||||
__m128i tmp32 = _mm_slli_epi32(tmp29, 25);
|
||||
__m128i tmp27 = _mm_xor_si128(tmp26, tmp23);
|
||||
__m128i tmp28 = _mm_xor_si128(tmp27, tmp32);
|
||||
__m128i tmp24 = _mm_srli_si128(tmp28, 4);
|
||||
__m128i tmp33 = _mm_slli_si128(tmp28, 12);
|
||||
__m128i tmp30 = _mm_xor_si128(tmp29, tmp33);
|
||||
__m128i tmp2 = _mm_srli_epi32(tmp30, 1);
|
||||
__m128i tmp12 = _mm_srli_epi32(tmp30, 2);
|
||||
__m128i tmp14 = _mm_srli_epi32(tmp30, 7);
|
||||
__m128i tmp34 = _mm_xor_si128(tmp2, tmp12);
|
||||
__m128i tmp35 = _mm_xor_si128(tmp34, tmp14);
|
||||
__m128i tmp36 = _mm_xor_si128(tmp35, tmp24);
|
||||
__m128i tmp31 = _mm_xor_si128(tmp30, tmp36);
|
||||
__m128i tmp21 = _mm_xor_si128(tmp20, tmp31);
|
||||
_mm_storeu_si128((__m128i *) c, tmp21);
|
||||
}
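/* addmul() is one GHASH update step: accum <- (accum ^ block) * H in
   GF(2^128), with the field defined by GCM's reduction polynomial
   x^128 + x^7 + x^2 + x + 1 and the bit/byte order fixed by the `rev`
   shuffle above. A partial final block is zero-padded before the multiply,
   as the GCM specification requires. */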
|
||||
|
||||
/* pure multiplication, for pre-computing powers of H */
|
||||
static inline __m128i
|
||||
mulv(__m128i A, __m128i B)
|
||||
{
|
||||
__m128i tmp3 = _mm_clmulepi64_si128(A, B, 0x00);
|
||||
__m128i tmp4 = _mm_clmulepi64_si128(A, B, 0x10);
|
||||
__m128i tmp5 = _mm_clmulepi64_si128(A, B, 0x01);
|
||||
__m128i tmp6 = _mm_clmulepi64_si128(A, B, 0x11);
|
||||
__m128i tmp10 = _mm_xor_si128(tmp4, tmp5);
|
||||
__m128i tmp13 = _mm_slli_si128(tmp10, 8);
|
||||
__m128i tmp11 = _mm_srli_si128(tmp10, 8);
|
||||
__m128i tmp15 = _mm_xor_si128(tmp3, tmp13);
|
||||
__m128i tmp17 = _mm_xor_si128(tmp6, tmp11);
|
||||
__m128i tmp7 = _mm_srli_epi32(tmp15, 31);
|
||||
__m128i tmp8 = _mm_srli_epi32(tmp17, 31);
|
||||
__m128i tmp16 = _mm_slli_epi32(tmp15, 1);
|
||||
__m128i tmp18 = _mm_slli_epi32(tmp17, 1);
|
||||
__m128i tmp9 = _mm_srli_si128(tmp7, 12);
|
||||
__m128i tmp22 = _mm_slli_si128(tmp8, 4);
|
||||
__m128i tmp25 = _mm_slli_si128(tmp7, 4);
|
||||
__m128i tmp29 = _mm_or_si128(tmp16, tmp25);
|
||||
__m128i tmp19 = _mm_or_si128(tmp18, tmp22);
|
||||
__m128i tmp20 = _mm_or_si128(tmp19, tmp9);
|
||||
__m128i tmp26 = _mm_slli_epi32(tmp29, 31);
|
||||
__m128i tmp23 = _mm_slli_epi32(tmp29, 30);
|
||||
__m128i tmp32 = _mm_slli_epi32(tmp29, 25);
|
||||
__m128i tmp27 = _mm_xor_si128(tmp26, tmp23);
|
||||
__m128i tmp28 = _mm_xor_si128(tmp27, tmp32);
|
||||
__m128i tmp24 = _mm_srli_si128(tmp28, 4);
|
||||
__m128i tmp33 = _mm_slli_si128(tmp28, 12);
|
||||
__m128i tmp30 = _mm_xor_si128(tmp29, tmp33);
|
||||
__m128i tmp2 = _mm_srli_epi32(tmp30, 1);
|
||||
__m128i tmp12 = _mm_srli_epi32(tmp30, 2);
|
||||
__m128i tmp14 = _mm_srli_epi32(tmp30, 7);
|
||||
__m128i tmp34 = _mm_xor_si128(tmp2, tmp12);
|
||||
__m128i tmp35 = _mm_xor_si128(tmp34, tmp14);
|
||||
__m128i tmp36 = _mm_xor_si128(tmp35, tmp24);
|
||||
__m128i tmp31 = _mm_xor_si128(tmp30, tmp36);
|
||||
__m128i C = _mm_xor_si128(tmp20, tmp31);
|
||||
|
||||
return C;
|
||||
}
|
||||
|
||||
/* 4 multiply-accumulate at once; again
|
||||
<https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf>
|
||||
for the Aggregated Reduction Method & sample code.
|
||||
Algorithm by Krzysztof Jankowski, Pierre Laurent - Intel */
|
||||
|
||||
#define RED_DECL(a) __m128i H##a##_X##a##_lo, H##a##_X##a##_hi, tmp##a, tmp##a##B
|
||||
#define RED_SHUFFLE(a) X##a = _mm_shuffle_epi8(X##a, rev)
|
||||
#define RED_MUL_LOW(a) H##a##_X##a##_lo = _mm_clmulepi64_si128(H##a, X##a, 0x00)
|
||||
#define RED_MUL_HIGH(a) H##a##_X##a##_hi = _mm_clmulepi64_si128(H##a, X##a, 0x11)
|
||||
#define RED_MUL_MID(a) \
|
||||
tmp##a = _mm_shuffle_epi32(H##a, 0x4e); \
|
||||
tmp##a##B = _mm_shuffle_epi32(X##a, 0x4e); \
|
||||
tmp##a = _mm_xor_si128(tmp##a, H##a); \
|
||||
tmp##a##B = _mm_xor_si128(tmp##a##B, X##a); \
|
||||
tmp##a = _mm_clmulepi64_si128(tmp##a, tmp##a##B, 0x00)
|
||||
|
||||
#define REDUCE4(rev, H0_, H1_, H2_, H3_, X0_, X1_, X2_, X3_, acc) \
|
||||
do { \
|
||||
MAKE4(RED_DECL); \
|
||||
__m128i lo, hi; \
|
||||
__m128i tmp8, tmp9; \
|
||||
__m128i H0 = H0_; \
|
||||
__m128i H1 = H1_; \
|
||||
__m128i H2 = H2_; \
|
||||
__m128i H3 = H3_; \
|
||||
__m128i X0 = X0_; \
|
||||
__m128i X1 = X1_; \
|
||||
__m128i X2 = X2_; \
|
||||
__m128i X3 = X3_; \
|
||||
\
|
||||
    /* byte-reverse the inputs & xor the first one into the accumulator */ \
|
||||
\
|
||||
MAKE4(RED_SHUFFLE); \
|
||||
X3 = _mm_xor_si128(X3, acc); \
|
||||
\
|
||||
/* 4 low H*X (x0*h0) */ \
|
||||
\
|
||||
MAKE4(RED_MUL_LOW); \
|
||||
lo = _mm_xor_si128(H0_X0_lo, H1_X1_lo); \
|
||||
lo = _mm_xor_si128(lo, H2_X2_lo); \
|
||||
lo = _mm_xor_si128(lo, H3_X3_lo); \
|
||||
\
|
||||
/* 4 high H*X (x1*h1) */ \
|
||||
\
|
||||
MAKE4(RED_MUL_HIGH); \
|
||||
hi = _mm_xor_si128(H0_X0_hi, H1_X1_hi); \
|
||||
hi = _mm_xor_si128(hi, H2_X2_hi); \
|
||||
hi = _mm_xor_si128(hi, H3_X3_hi); \
|
||||
\
|
||||
/* 4 middle H*X, using Karatsuba, i.e. \
|
||||
       x1*h0+x0*h1 = (x1+x0)*(h1+h0) - x1*h1 - x0*h0 \
|
||||
we already have all x1y1 & x0y0 (accumulated in hi & lo) \
|
||||
(0 is low half and 1 is high half) \
|
||||
*/ \
|
||||
/* permute the high and low 64 bits in H1 & X1, \
|
||||
so create (h0,h1) from (h1,h0) and (x0,x1) from (x1,x0), \
|
||||
then compute (h0+h1,h1+h0) and (x0+x1,x1+x0), \
|
||||
and finally multiply \
|
||||
*/ \
|
||||
MAKE4(RED_MUL_MID); \
|
||||
\
|
||||
    /* subtracts x1*h1 and x0*h0 */ \
|
||||
tmp0 = _mm_xor_si128(tmp0, lo); \
|
||||
tmp0 = _mm_xor_si128(tmp0, hi); \
|
||||
tmp0 = _mm_xor_si128(tmp1, tmp0); \
|
||||
tmp0 = _mm_xor_si128(tmp2, tmp0); \
|
||||
tmp0 = _mm_xor_si128(tmp3, tmp0);\
|
||||
\
|
||||
/* reduction */ \
|
||||
tmp0B = _mm_slli_si128(tmp0, 8); \
|
||||
tmp0 = _mm_srli_si128(tmp0, 8); \
|
||||
lo = _mm_xor_si128(tmp0B, lo); \
|
||||
hi = _mm_xor_si128(tmp0, hi); \
|
||||
tmp3 = lo; \
|
||||
tmp2B = hi; \
|
||||
tmp3B = _mm_srli_epi32(tmp3, 31); \
|
||||
tmp8 = _mm_srli_epi32(tmp2B, 31); \
|
||||
tmp3 = _mm_slli_epi32(tmp3, 1); \
|
||||
tmp2B = _mm_slli_epi32(tmp2B, 1); \
|
||||
tmp9 = _mm_srli_si128(tmp3B, 12); \
|
||||
tmp8 = _mm_slli_si128(tmp8, 4); \
|
||||
tmp3B = _mm_slli_si128(tmp3B, 4); \
|
||||
tmp3 = _mm_or_si128(tmp3, tmp3B); \
|
||||
tmp2B = _mm_or_si128(tmp2B, tmp8); \
|
||||
tmp2B = _mm_or_si128(tmp2B, tmp9); \
|
||||
tmp3B = _mm_slli_epi32(tmp3, 31); \
|
||||
tmp8 = _mm_slli_epi32(tmp3, 30); \
|
||||
tmp9 = _mm_slli_epi32(tmp3, 25); \
|
||||
tmp3B = _mm_xor_si128(tmp3B, tmp8); \
|
||||
tmp3B = _mm_xor_si128(tmp3B, tmp9); \
|
||||
tmp8 = _mm_srli_si128(tmp3B, 4); \
|
||||
tmp3B = _mm_slli_si128(tmp3B, 12); \
|
||||
tmp3 = _mm_xor_si128(tmp3, tmp3B); \
|
||||
tmp2 = _mm_srli_epi32(tmp3, 1); \
|
||||
tmp0B = _mm_srli_epi32(tmp3, 2); \
|
||||
tmp1B = _mm_srli_epi32(tmp3, 7); \
|
||||
tmp2 = _mm_xor_si128(tmp2, tmp0B); \
|
||||
tmp2 = _mm_xor_si128(tmp2, tmp1B); \
|
||||
tmp2 = _mm_xor_si128(tmp2, tmp8); \
|
||||
tmp3 = _mm_xor_si128(tmp3, tmp2); \
|
||||
tmp2B = _mm_xor_si128(tmp2B, tmp3); \
|
||||
\
|
||||
accv = tmp2B; \
|
||||
} while(0)
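/* REDUCE4 is the aggregated reduction method from the Intel CLMUL paper
   referenced above: given four consecutive blocks B0..B3 (passed as
   X3_..X0_) and the precomputed powers H..H^4 (H0_..H3_), it folds them
   into the accumulator with a single modular reduction,
       acc <- (acc ^ B0)*H^4 ^ B1*H^3 ^ B2*H^2 ^ B3*H
   which is equivalent to applying acc <- (acc ^ Bi)*H four times in a row. */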
|
||||
|
||||
#define XORx(a) \
|
||||
__m128i in##a = _mm_loadu_si128((const __m128i *) (in + a * 16)); \
|
||||
temp##a = _mm_xor_si128(temp##a, in##a)
|
||||
|
||||
#define LOADx(a) \
|
||||
__m128i in##a = _mm_loadu_si128((const __m128i *) (in + a * 16));
|
||||
|
||||
/* full encrypt & checksum 8 blocks at once */
|
||||
#define aesni_encrypt8full(out_, n_, rkeys, in_, accum, hv_, h2v_, h3v_, h4v_) \
|
||||
do { \
|
||||
unsigned char *out = out_; \
|
||||
uint32_t *n = n_; \
|
||||
const unsigned char *in = in_; \
|
||||
const __m128i hv = hv_; \
|
||||
const __m128i h2v = h2v_; \
|
||||
const __m128i h3v = h3v_; \
|
||||
const __m128i h4v = h4v_; \
|
||||
const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
|
||||
const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
|
||||
__m128i accv = _mm_load_si128((const __m128i *) accum); \
|
||||
int i; \
|
||||
\
|
||||
MAKE8(NVDECLx); \
|
||||
MAKE8(TEMPDECLx); \
|
||||
MAKE8(NVx); \
|
||||
MAKE8(TEMPx); \
|
||||
for (i = 1; i < 14; i++) { \
|
||||
MAKE8(AESENCx); \
|
||||
} \
|
||||
MAKE8(AESENCLASTx); \
|
||||
MAKE8(XORx); \
|
||||
MAKE8(STOREx); \
|
||||
REDUCE4(rev, hv, h2v, h3v, h4v, temp3, temp2, temp1, temp0, accv); \
|
||||
REDUCE4(rev, hv, h2v, h3v, h4v, temp7, temp6, temp5, temp4, accv); \
|
||||
_mm_store_si128((__m128i *) accum, accv); \
|
||||
} while(0)
|
||||
|
||||
/* checksum 8 blocks at once */
|
||||
#define aesni_addmul8full(in_, accum, hv_, h2v_, h3v_, h4v_) \
|
||||
do { \
|
||||
const unsigned char *in = in_; \
|
||||
const __m128i hv = hv_; \
|
||||
const __m128i h2v = h2v_ ; \
|
||||
const __m128i h3v = h3v_ ; \
|
||||
const __m128i h4v = h4v_ ; \
|
||||
const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
|
||||
__m128i accv = _mm_load_si128((const __m128i *) accum); \
|
||||
\
|
||||
MAKE8(LOADx); \
|
||||
REDUCE4(rev, hv, h2v, h3v, h4v, in3, in2, in1, in0, accv); \
|
||||
REDUCE4(rev, hv, h2v, h3v, h4v, in7, in6, in5, in4, accv); \
|
||||
_mm_store_si128((__m128i *) accum, accv); \
|
||||
} while(0)
|
||||
|
||||
/* decrypt 8 blocks at once */
|
||||
#define aesni_decrypt8full(out_, n_, rkeys, in_) \
|
||||
do { \
|
||||
unsigned char *out = out_; \
|
||||
uint32_t *n = n_; \
|
||||
const unsigned char *in = in_; \
|
||||
const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
|
||||
int i; \
|
||||
\
|
||||
MAKE8(NVDECLx); \
|
||||
MAKE8(TEMPDECLx); \
|
||||
MAKE8(NVx); \
|
||||
MAKE8(TEMPx); \
|
||||
for (i = 1; i < 14; i++) { \
|
||||
MAKE8(AESENCx); \
|
||||
} \
|
||||
MAKE8(AESENCLASTx); \
|
||||
MAKE8(XORx); \
|
||||
MAKE8(STOREx); \
|
||||
} while(0)
|
||||
|
||||
int
|
||||
crypto_aead_aes256gcm_aesni_beforenm(crypto_aead_aes256gcm_aesni_state *ctx_,
|
||||
const unsigned char *k)
|
||||
{
|
||||
context *ctx = (context *) ctx_;
|
||||
__m128i *rkeys = ctx->rkeys;
|
||||
__m128i zero = _mm_setzero_si128();
|
||||
unsigned char *H = ctx->H;
|
||||
|
||||
(void) sizeof(int[(sizeof *ctx_) >= (sizeof *ctx) ? 1 : -1]);
|
||||
aesni_key256_expand(k, (__m128 *) rkeys);
|
||||
aesni_encrypt1(H, zero, rkeys);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int
|
||||
crypto_aead_aes256gcm_aesni_encrypt_afternm(unsigned char *c, unsigned long long *clen,
|
||||
const unsigned char *m, unsigned long long mlen,
|
||||
const unsigned char *ad, unsigned long long adlen,
|
||||
const unsigned char *nsec,
|
||||
const unsigned char *npub,
|
||||
const crypto_aead_aes256gcm_aesni_state *ctx_)
|
||||
{
|
||||
unsigned char H[16];
|
||||
const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
const context *ctx = (const context *) ctx_;
|
||||
const __m128i *rkeys = ctx->rkeys;
|
||||
__m128i Hv, H2v, H3v, H4v, accv;
|
||||
unsigned long long i, j;
|
||||
unsigned long long adlen_rnd64 = adlen & ~63ULL;
|
||||
unsigned long long mlen_rnd128 = mlen & ~127ULL;
|
||||
CRYPTO_ALIGN(16) unsigned char n2[16];
|
||||
CRYPTO_ALIGN(16) unsigned char T[16];
|
||||
CRYPTO_ALIGN(16) unsigned char accum[16];
|
||||
CRYPTO_ALIGN(16) unsigned char fb[16];
|
||||
|
||||
(void) nsec;
|
||||
memcpy(H, ctx->H, sizeof H);
|
||||
if (mlen > 16ULL * (1ULL << 32)) {
|
||||
abort();
|
||||
}
|
||||
memcpy(&n2[0], npub, 12);
|
||||
*(uint32_t *) &n2[12] = 0x01000000;
|
||||
aesni_encrypt1(T, _mm_load_si128((const __m128i *) n2), rkeys);
|
||||
|
||||
(*(uint64_t *) &fb[0]) = _bswap64((uint64_t) (8 * adlen));
|
||||
(*(uint64_t *) &fb[8]) = _bswap64((uint64_t) (8 * mlen));
|
||||
|
||||
    /* we store H (and its powers) byte-reversed once and for all */
|
||||
Hv = _mm_shuffle_epi8(_mm_load_si128((const __m128i *) H), rev);
|
||||
_mm_store_si128((__m128i *) H, Hv);
|
||||
H2v = mulv(Hv, Hv);
|
||||
H3v = mulv(H2v, Hv);
|
||||
H4v = mulv(H3v, Hv);
|
||||
|
||||
accv = _mm_setzero_si128();
|
||||
/* unrolled by 4 GCM (by 8 doesn't improve using REDUCE4) */
|
||||
for (i = 0; i < adlen_rnd64; i += 64) {
|
||||
__m128i X4 = _mm_loadu_si128((const __m128i *) (ad + i + 0));
|
||||
__m128i X3 = _mm_loadu_si128((const __m128i *) (ad + i + 16));
|
||||
__m128i X2 = _mm_loadu_si128((const __m128i *) (ad + i + 32));
|
||||
__m128i X1 = _mm_loadu_si128((const __m128i *) (ad + i + 48));
|
||||
REDUCE4(rev, Hv, H2v, H3v, H4v, X1, X2, X3, X4, accv);
|
||||
}
|
||||
_mm_store_si128((__m128i *) accum, accv);
|
||||
|
||||
/* GCM remainder loop */
|
||||
for (i = adlen_rnd64; i < adlen; i += 16) {
|
||||
unsigned int blocklen = 16;
|
||||
|
||||
if (i + (unsigned long long) blocklen > adlen) {
|
||||
blocklen = (unsigned int) (adlen - i);
|
||||
}
|
||||
addmul(accum, ad + i, blocklen, H);
|
||||
}
|
||||
|
||||
/* this only does 8 full blocks, so no fancy bounds checking is necessary */
|
||||
#define LOOPRND128 \
|
||||
do { \
|
||||
const int iter = 8; \
|
||||
const int lb = iter * 16; \
|
||||
\
|
||||
for (i = 0; i < mlen_rnd128; i += lb) { \
|
||||
aesni_encrypt8full(c + i, (uint32_t *) n2, rkeys, m + i, accum, Hv, H2v, H3v, H4v); \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
/* remainder loop, with the slower GCM update to accommodate partial blocks */
|
||||
#define LOOPRMD128 \
|
||||
do { \
|
||||
const int iter = 8; \
|
||||
const int lb = iter * 16; \
|
||||
\
|
||||
for (i = mlen_rnd128; i < mlen; i += lb) { \
|
||||
CRYPTO_ALIGN(16) unsigned char outni[8 * 16]; \
|
||||
unsigned long long mj = lb; \
|
||||
\
|
||||
aesni_encrypt8(outni, (uint32_t *) n2, rkeys); \
|
||||
if ((i + mj) >= mlen) { \
|
||||
mj = mlen - i; \
|
||||
} \
|
||||
for (j = 0; j < mj; j++) { \
|
||||
c[i + j] = m[i + j] ^ outni[j]; \
|
||||
} \
|
||||
for (j = 0; j < mj; j += 16) { \
|
||||
unsigned int bl = 16; \
|
||||
\
|
||||
if (j + (unsigned long long) bl >= mj) { \
|
||||
bl = (unsigned int) (mj - j); \
|
||||
} \
|
||||
addmul(accum, c + i + j, bl, H); \
|
||||
} \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
n2[15] = 0;
|
||||
COUNTER_INC2(n2);
|
||||
LOOPRND128;
|
||||
LOOPRMD128;
|
||||
|
||||
addmul(accum, fb, 16, H);
|
||||
|
||||
for (i = 0; i < 16; ++i) {
|
||||
c[i + mlen] = T[i] ^ accum[15 - i];
|
||||
}
|
||||
if (clen != NULL) {
|
||||
*clen = mlen + 16;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int
|
||||
crypto_aead_aes256gcm_aesni_decrypt_afternm(unsigned char *m, unsigned long long *mlen_p,
|
||||
unsigned char *nsec,
|
||||
const unsigned char *c, unsigned long long clen,
|
||||
const unsigned char *ad, unsigned long long adlen,
|
||||
const unsigned char *npub,
|
||||
const crypto_aead_aes256gcm_aesni_state *ctx_)
|
||||
{
|
||||
unsigned char H[16];
|
||||
const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
const context *ctx = (const context *) ctx_;
|
||||
const __m128i *rkeys = ctx->rkeys;
|
||||
__m128i Hv, H2v, H3v, H4v, accv;
|
||||
unsigned long long i, j;
|
||||
unsigned long long adlen_rnd64 = adlen & ~63ULL;
|
||||
unsigned long long mlen;
|
||||
unsigned long long mlen_rnd128;
|
||||
CRYPTO_ALIGN(16) unsigned char n2[16];
|
||||
CRYPTO_ALIGN(16) unsigned char T[16];
|
||||
CRYPTO_ALIGN(16) unsigned char accum[16];
|
||||
CRYPTO_ALIGN(16) unsigned char fb[16];
|
||||
|
||||
(void) nsec;
|
||||
memcpy(H, ctx->H, sizeof H);
|
||||
if (clen > 16ULL * (1ULL << 32) - 16ULL) {
|
||||
abort();
|
||||
}
|
||||
mlen = clen - 16;
|
||||
if (mlen_p != NULL) {
|
||||
*mlen_p = 0U;
|
||||
}
|
||||
memcpy(&n2[0], npub, 12);
|
||||
*(uint32_t *) &n2[12] = 0x01000000;
|
||||
aesni_encrypt1(T, _mm_load_si128((const __m128i *) n2), rkeys);
|
||||
|
||||
(*(uint64_t *) &fb[0]) = _bswap64((uint64_t)(8 * adlen));
|
||||
(*(uint64_t *) &fb[8]) = _bswap64((uint64_t)(8 * mlen));
|
||||
|
||||
Hv = _mm_shuffle_epi8(_mm_load_si128((const __m128i *) H), rev);
|
||||
_mm_store_si128((__m128i *) H, Hv);
|
||||
H2v = mulv(Hv, Hv);
|
||||
H3v = mulv(H2v, Hv);
|
||||
H4v = mulv(H3v, Hv);
|
||||
|
||||
accv = _mm_setzero_si128();
|
||||
for (i = 0; i < adlen_rnd64; i += 64) {
|
||||
__m128i X4 = _mm_loadu_si128((const __m128i *) (ad + i + 0));
|
||||
__m128i X3 = _mm_loadu_si128((const __m128i *) (ad + i + 16));
|
||||
__m128i X2 = _mm_loadu_si128((const __m128i *) (ad + i + 32));
|
||||
__m128i X1 = _mm_loadu_si128((const __m128i *) (ad + i + 48));
|
||||
REDUCE4(rev, Hv, H2v, H3v, H4v, X1, X2, X3, X4, accv);
|
||||
}
|
||||
_mm_store_si128((__m128i *) accum, accv);
|
||||
|
||||
for (i = adlen_rnd64; i < adlen; i += 16) {
|
||||
unsigned int blocklen = 16;
|
||||
if (i + (unsigned long long) blocklen > adlen) {
|
||||
blocklen = (unsigned int) (adlen - i);
|
||||
}
|
||||
addmul(accum, ad + i, blocklen, H);
|
||||
}
|
||||
|
||||
mlen_rnd128 = mlen & ~127ULL;
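    /* Decryption is deliberately done in two passes, so that nothing is
       decrypted before the tag has been verified: LOOPACCUMDRND128 and
       LOOPACCUMDRMD128 below only run GHASH over the ciphertext, the tag is
       then checked (accumulating any difference into a single byte before
       branching), and only if it matches are LOOPDRND128 and LOOPDRMD128
       used to produce the plaintext, with the counter reset to 2 for the
       second pass. */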
|
||||
|
||||
#define LOOPACCUMDRND128 \
|
||||
do { \
|
||||
const int iter = 8; \
|
||||
const int lb = iter * 16; \
|
||||
for (i = 0; i < mlen_rnd128; i += lb) { \
|
||||
aesni_addmul8full(c + i, accum, Hv, H2v, H3v, H4v); \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define LOOPDRND128 \
|
||||
do { \
|
||||
const int iter = 8; \
|
||||
const int lb = iter * 16; \
|
||||
for (i = 0; i < mlen_rnd128; i += lb) { \
|
||||
aesni_decrypt8full(m + i, (uint32_t *) n2, rkeys, c + i); \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define LOOPACCUMDRMD128 \
|
||||
do { \
|
||||
const int iter = 8; \
|
||||
const int lb = iter * 16; \
|
||||
\
|
||||
for (i = mlen_rnd128; i < mlen; i += lb) { \
|
||||
unsigned long long mj = lb; \
|
||||
\
|
||||
if ((i + mj) >= mlen) { \
|
||||
mj = mlen - i; \
|
||||
} \
|
||||
for (j = 0; j < mj; j += 16) { \
|
||||
unsigned int bl = 16; \
|
||||
\
|
||||
if (j + (unsigned long long) bl >= mj) { \
|
||||
bl = (unsigned int) (mj - j); \
|
||||
} \
|
||||
addmul(accum, c + i + j, bl, H); \
|
||||
} \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define LOOPDRMD128 \
|
||||
do { \
|
||||
const int iter = 8; \
|
||||
const int lb = iter * 16; \
|
||||
\
|
||||
for (i = mlen_rnd128; i < mlen; i += lb) { \
|
||||
CRYPTO_ALIGN(16) unsigned char outni[8 * 16]; \
|
||||
unsigned long long mj = lb; \
|
||||
\
|
||||
if ((i + mj) >= mlen) { \
|
||||
mj = mlen - i; \
|
||||
} \
|
||||
aesni_encrypt8(outni, (uint32_t *) n2, rkeys); \
|
||||
for (j = 0; j < mj; j++) { \
|
||||
m[i + j] = c[i + j] ^ outni[j]; \
|
||||
} \
|
||||
} \
|
||||
} while(0)
|
||||
n2[15] = 0;
|
||||
|
||||
COUNTER_INC2(n2);
|
||||
LOOPACCUMDRND128;
|
||||
LOOPACCUMDRMD128;
|
||||
addmul(accum, fb, 16, H);
|
||||
{
|
||||
unsigned char d = 0;
|
||||
|
||||
for (i = 0; i < 16; i++) {
|
||||
d |= (c[i + mlen] ^ (T[i] ^ accum[15 - i]));
|
||||
}
|
||||
if (d != 0) {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
*(uint32_t *) &n2[12] = 0;
|
||||
COUNTER_INC2(n2);
|
||||
LOOPDRND128;
|
||||
LOOPDRMD128;
|
||||
|
||||
if (mlen_p != NULL) {
|
||||
*mlen_p = mlen;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int
|
||||
crypto_aead_aes256gcm_aesni_encrypt(unsigned char *c,
|
||||
unsigned long long *clen_p,
|
||||
const unsigned char *m,
|
||||
unsigned long long mlen,
|
||||
const unsigned char *ad,
|
||||
unsigned long long adlen,
|
||||
const unsigned char *nsec,
|
||||
const unsigned char *npub,
|
||||
const unsigned char *k)
|
||||
{
|
||||
crypto_aead_aes256gcm_aesni_state ctx;
|
||||
|
||||
crypto_aead_aes256gcm_aesni_beforenm(&ctx, k);
|
||||
|
||||
return crypto_aead_aes256gcm_aesni_encrypt_afternm
|
||||
(c, clen_p, m, mlen, ad, adlen, nsec, npub, &ctx);
|
||||
}
|
||||
|
||||
int
|
||||
crypto_aead_aes256gcm_aesni_decrypt(unsigned char *m,
|
||||
unsigned long long *mlen_p,
|
||||
unsigned char *nsec,
|
||||
const unsigned char *c,
|
||||
unsigned long long clen,
|
||||
const unsigned char *ad,
|
||||
unsigned long long adlen,
|
||||
const unsigned char *npub,
|
||||
const unsigned char *k)
|
||||
{
|
||||
crypto_aead_aes256gcm_aesni_state ctx;
|
||||
|
||||
crypto_aead_aes256gcm_aesni_beforenm((crypto_aead_aes256gcm_aesni_state *)
|
||||
&ctx, k);
|
||||
|
||||
return crypto_aead_aes256gcm_aesni_decrypt_afternm
|
||||
(m, mlen_p, nsec, c, clen, ad, adlen, npub, &ctx);
|
||||
}
|
||||
|
||||
size_t
|
||||
crypto_aead_aes256gcm_aesni_keybytes(void)
|
||||
{
|
||||
return crypto_aead_aes256gcm_KEYBYTES;
|
||||
}
|
||||
|
||||
size_t
|
||||
crypto_aead_aes256gcm_aesni_nsecbytes(void)
|
||||
{
|
||||
return crypto_aead_aes256gcm_NSECBYTES;
|
||||
}
|
||||
|
||||
size_t crypto_aead_aes256gcm_aesni_npubbytes(void)
|
||||
{
|
||||
return crypto_aead_aes256gcm_NPUBBYTES;
|
||||
}
|
||||
|
||||
size_t crypto_aead_aes256gcm_aesni_abytes(void)
|
||||
{
|
||||
return crypto_aead_aes256gcm_ABYTES;
|
||||
}
|
||||
|
||||
size_t crypto_aead_aes256gcm_aesni_statebytes(void)
|
||||
{
|
||||
return sizeof(crypto_aead_aes256gcm_aesni_state);
|
||||
}
|
||||
|
||||
#endif
|
@@ -2,6 +2,7 @@
SODIUM_EXPORT = \
        sodium.h \
        sodium/core.h \
        sodium/crypto_aead_aes256gcm_aesni.h \
        sodium/crypto_aead_chacha20poly1305.h \
        sodium/crypto_auth.h \
        sodium/crypto_auth_hmacsha256.h \

@@ -3,6 +3,7 @@
#define sodium_H

#include "sodium/core.h"
#include "sodium/crypto_aead_aes256gcm_aesni.h"
#include "sodium/crypto_aead_chacha20poly1305.h"
#include "sodium/crypto_auth.h"
#include "sodium/crypto_auth_hmacsha256.h"
src/libsodium/include/sodium/crypto_aead_aes256gcm_aesni.h | 85 (new file)
@@ -0,0 +1,85 @@
#ifndef crypto_aead_aes256gcm_aesni_H
#define crypto_aead_aes256gcm_aesni_H

#include <stddef.h>
#include "export.h"

#ifdef __cplusplus
# if __GNUC__
#  pragma GCC diagnostic ignored "-Wlong-long"
# endif
extern "C" {
#endif

#define crypto_aead_aes256gcm_KEYBYTES 32U
SODIUM_EXPORT
size_t crypto_aead_aes256gcm_aesni_keybytes(void);

#define crypto_aead_aes256gcm_NSECBYTES 0U
SODIUM_EXPORT
size_t crypto_aead_aes256gcm_aesni_nsecbytes(void);

#define crypto_aead_aes256gcm_NPUBBYTES 12U
SODIUM_EXPORT
size_t crypto_aead_aes256gcm_aesni_npubbytes(void);

#define crypto_aead_aes256gcm_ABYTES 16U
SODIUM_EXPORT
size_t crypto_aead_aes256gcm_aesni_abytes(void);

typedef CRYPTO_ALIGN(16) unsigned char crypto_aead_aes256gcm_aesni_state[272];
SODIUM_EXPORT
size_t crypto_aead_aes256gcm_aesni_statebytes(void);

SODIUM_EXPORT
int crypto_aead_aes256gcm_aesni_encrypt(unsigned char *c,
                                        unsigned long long *clen_p,
                                        const unsigned char *m,
                                        unsigned long long mlen,
                                        const unsigned char *ad,
                                        unsigned long long adlen,
                                        const unsigned char *nsec,
                                        const unsigned char *npub,
                                        const unsigned char *k);

SODIUM_EXPORT
int crypto_aead_aes256gcm_aesni_decrypt(unsigned char *m,
                                        unsigned long long *mlen_p,
                                        unsigned char *nsec,
                                        const unsigned char *c,
                                        unsigned long long clen,
                                        const unsigned char *ad,
                                        unsigned long long adlen,
                                        const unsigned char *npub,
                                        const unsigned char *k);

SODIUM_EXPORT
int crypto_aead_aes256gcm_aesni_beforenm(crypto_aead_aes256gcm_aesni_state *ctx_,
                                         const unsigned char *k);

SODIUM_EXPORT
int crypto_aead_aes256gcm_aesni_encrypt_afternm(unsigned char *c,
                                                unsigned long long *clen_p,
                                                const unsigned char *m,
                                                unsigned long long mlen,
                                                const unsigned char *ad,
                                                unsigned long long adlen,
                                                const unsigned char *nsec,
                                                const unsigned char *npub,
                                                const crypto_aead_aes256gcm_aesni_state *ctx_);

SODIUM_EXPORT
int crypto_aead_aes256gcm_aesni_decrypt_afternm(unsigned char *m,
                                                unsigned long long *mlen_p,
                                                unsigned char *nsec,
                                                const unsigned char *c,
                                                unsigned long long clen,
                                                const unsigned char *ad,
                                                unsigned long long adlen,
                                                const unsigned char *npub,
                                                const crypto_aead_aes256gcm_aesni_state *ctx_);

#ifdef __cplusplus
}
#endif

#endif
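The beforenm/afternm split declared above is what the merge message calls batch mode: crypto_aead_aes256gcm_aesni_beforenm() expands the AES-256 round keys and the GHASH key H once into the 272-byte state, and the *_afternm() calls then reuse that state for any number of messages. A minimal usage sketch, with hypothetical buffers and a hypothetical helper name, error handling omitted:

#include <sodium.h>

static void
seal_two_messages(const unsigned char key[crypto_aead_aes256gcm_KEYBYTES],
                  const unsigned char nonce1[crypto_aead_aes256gcm_NPUBBYTES],
                  const unsigned char nonce2[crypto_aead_aes256gcm_NPUBBYTES],
                  const unsigned char *m1, unsigned long long m1len, unsigned char *c1,
                  const unsigned char *m2, unsigned long long m2len, unsigned char *c2)
{
    crypto_aead_aes256gcm_aesni_state st;
    unsigned long long                c1len, c2len;

    /* expand the key once: AES round keys + GHASH key H */
    crypto_aead_aes256gcm_aesni_beforenm(&st, key);

    /* reuse the expanded state; each call needs a distinct nonce, and each
       ciphertext is mlen + crypto_aead_aes256gcm_ABYTES bytes long */
    crypto_aead_aes256gcm_aesni_encrypt_afternm(c1, &c1len, m1, m1len,
                                                NULL, 0, NULL, nonce1, &st);
    crypto_aead_aes256gcm_aesni_encrypt_afternm(c2, &c2len, m2, m2len,
                                                NULL, 0, NULL, nonce2, &st);
}

Before taking this code path at all, the caller should also make sure the CPU actually supports it; the runtime checks added further down provide exactly that.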
@@ -7,12 +7,6 @@

#include "export.h"

#if defined(_MSC_VER)
# define CRYPTO_ALIGN(x) __declspec(align(x))
#else
# define CRYPTO_ALIGN(x) __attribute__((aligned(x)))
#endif

#ifdef __cplusplus
# if __GNUC__
#  pragma GCC diagnostic ignored "-Wlong-long"

@@ -29,4 +29,12 @@
# endif
#endif

#ifndef CRYPTO_ALIGN
# if defined(__INTEL_COMPILER) || defined(_MSC_VER)
#  define CRYPTO_ALIGN(x) __declspec(align(x))
# else
#  define CRYPTO_ALIGN(x) __attribute__((aligned(x)))
# endif
#endif

#endif
@@ -20,6 +20,12 @@ int sodium_runtime_has_sse2(void);
SODIUM_EXPORT
int sodium_runtime_has_sse3(void);

SODIUM_EXPORT
int sodium_runtime_has_pclmul(void);

SODIUM_EXPORT
int sodium_runtime_has_aesni(void);

#ifdef __cplusplus
}
#endif
@@ -10,12 +10,16 @@ typedef struct CPUFeatures_ {
    int has_neon;
    int has_sse2;
    int has_sse3;
    int has_pclmul;
    int has_aesni;
} CPUFeatures;

static CPUFeatures _cpu_features;

#define CPUID_SSE2      0x04000000
#define CPUIDECX_SSE3   0x00000001
#define CPUIDECX_PCLMUL 0x00000002
#define CPUIDECX_AESNI  0x02000000

static int
_sodium_runtime_arm_cpu_features(CPUFeatures * const cpu_features)

@@ -104,6 +108,14 @@ _sodium_runtime_intel_cpu_features(CPUFeatures * const cpu_features)
    cpu_features->has_sse3 = ((cpu_info[2] & CPUIDECX_SSE3) != 0x0);
#endif

#ifndef HAVE_WMMINTRIN_H
    cpu_features->has_pclmul = 0;
    cpu_features->has_aesni = 0;
#else
    cpu_features->has_pclmul = ((cpu_info[2] & CPUIDECX_PCLMUL) != 0x0);
    cpu_features->has_aesni = ((cpu_info[2] & CPUIDECX_AESNI) != 0x0);
#endif

    return 0;
}

@@ -133,3 +145,13 @@ int
sodium_runtime_has_sse3(void) {
    return _cpu_features.has_sse3;
}

int
sodium_runtime_has_pclmul(void) {
    return _cpu_features.has_pclmul;
}

int
sodium_runtime_has_aesni(void) {
    return _cpu_features.has_aesni;
}
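The two new predicates complement the compile-time HAVE_WMMINTRIN_H check: the latter only says the code could be built, while sodium_runtime_has_aesni() and sodium_runtime_has_pclmul() report whether the running CPU exposes the instructions. A sketch of how a caller might gate the AES256-GCM path on them (assuming sodium_init() has already been called so that _cpu_features is populated, and assuming the application falls back to another AEAD otherwise):

#include <sodium.h>

/* AES256-GCM with this implementation needs both AES-NI and PCLMULQDQ */
static int
aes256gcm_usable(void)
{
    return sodium_runtime_has_aesni() && sodium_runtime_has_pclmul();
}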