Add crypto_aead_aes256gcm_aesni_*

Requires a CPU with aesni and pclmulqdq
This is a private branch for a reason. It is not going to be merged as-is.
This commit is contained in:
Frank Denis 2015-10-07 23:07:19 +02:00
parent d8e870cb43
commit e8e5d2fc18
6 changed files with 498 additions and 0 deletions

View File

@ -211,6 +211,13 @@ AX_CHECK_COMPILE_FLAG([-Wwrite-strings], [CFLAGS="$CFLAGS -Wwrite-strings"])
AX_CHECK_COMPILE_FLAG([-Wdiv-by-zero], [CFLAGS="$CFLAGS -Wdiv-by-zero"])
AX_CHECK_COMPILE_FLAG([-Wsometimes-uninitialized], [CFLAGS="$CFLAGS -Wsometimes-uninitialized"])
AX_CHECK_COMPILE_FLAG([$CFLAGS -mmmx], [CFLAGS="$CFLAGS -mmmx"])
AX_CHECK_COMPILE_FLAG([$CFLAGS -msse], [CFLAGS="$CFLAGS -msse"])
AX_CHECK_COMPILE_FLAG([$CFLAGS -msse2], [CFLAGS="$CFLAGS -msse2"])
AX_CHECK_COMPILE_FLAG([$CFLAGS -msse3], [CFLAGS="$CFLAGS -msse3"])
AX_CHECK_COMPILE_FLAG([$CFLAGS -maes], [CFLAGS="$CFLAGS -maes"])
AX_CHECK_COMPILE_FLAG([$CFLAGS -mpclmul], [CFLAGS="$CFLAGS -mpclmul"])
AC_ARG_VAR([CWFLAGS], [define to compilation flags for generating extra warnings])
AX_CHECK_COMPILE_FLAG([$CWFLAGS -Wall], [CWFLAGS="$CWFLAGS -Wall"])

View File

@ -2,6 +2,7 @@ lib_LTLIBRARIES = \
libsodium.la
libsodium_la_SOURCES = \
crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c \
crypto_aead/chacha20poly1305/sodium/aead_chacha20poly1305.c \
crypto_auth/crypto_auth.c \
crypto_auth/hmacsha256/auth_hmacsha256_api.c \

View File

@ -0,0 +1,444 @@
#include <immintrin.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "crypto_aead_aes256gcm_aesni.h"
#include "utils.h"
#define AES_BLOCKSIZE 16
#define AES_MAXROUNDS 14
#define GMAC_BLOCKSIZE 16
#if defined(_MSC_VER)
# define CRYPTO_ALIGN(x) __declspec(align(x))
#else
# define CRYPTO_ALIGN(x) __attribute__((aligned(x)))
#endif
typedef CRYPTO_ALIGN(128) struct ghash {
unsigned char initial_state[GMAC_BLOCKSIZE];
unsigned char state[GMAC_BLOCKSIZE];
unsigned char subkey[GMAC_BLOCKSIZE];
} ghash;
typedef CRYPTO_ALIGN(128) struct context {
__m128i ekey[AES_MAXROUNDS + 1];
ghash ghash;
} context;
static inline void
_u64_be_from_ull(unsigned char out[8U], unsigned long long x)
{
out[7] = (unsigned char) (x & 0xff); x >>= 8;
out[6] = (unsigned char) (x & 0xff); x >>= 8;
out[5] = (unsigned char) (x & 0xff); x >>= 8;
out[4] = (unsigned char) (x & 0xff); x >>= 8;
out[3] = (unsigned char) (x & 0xff); x >>= 8;
out[2] = (unsigned char) (x & 0xff); x >>= 8;
out[1] = (unsigned char) (x & 0xff); x >>= 8;
out[0] = (unsigned char) (x & 0xff);
}
#define KEY_EXPANSION_A \
tmp1 = _mm_shuffle_epi32(tmp1, 255); \
tmp3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), \
_mm_castsi128_ps(tmp0), 16)); \
tmp0 = _mm_xor_si128(tmp0, tmp3); \
tmp3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), \
_mm_castsi128_ps(tmp0), 140)); \
tmp0 = _mm_xor_si128(_mm_xor_si128(tmp0, tmp3), tmp1); \
_mm_store_si128((void *) key_ptr, tmp0); \
key_ptr++
#define KEY_EXPANSION_B \
tmp1 = _mm_shuffle_epi32(tmp1, 170); \
tmp3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), \
_mm_castsi128_ps(tmp2), 16)); \
tmp2 = _mm_xor_si128(tmp2, tmp3); \
tmp3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), \
_mm_castsi128_ps(tmp2), 140)); \
tmp2 = _mm_xor_si128(_mm_xor_si128(tmp2, tmp3), tmp1); \
_mm_store_si128((void *) key_ptr, tmp2); \
key_ptr++
#define KEY_EXPANSION_BLOCK_1(rcon) \
tmp1 = _mm_aeskeygenassist_si128(tmp0, (rcon)); KEY_EXPANSION_A
#define KEY_EXPANSION_BLOCK_2(rcon) \
tmp1 = _mm_aeskeygenassist_si128(tmp2, rcon); KEY_EXPANSION_A; \
tmp1 = _mm_aeskeygenassist_si128(tmp0, rcon); KEY_EXPANSION_B
#define KEY_EXPANSION_LAST(rcon) \
tmp1 = _mm_aeskeygenassist_si128(tmp2, 64); KEY_EXPANSION_A;
static void
_key_setup(context * const ctx, const unsigned char *key)
{
__m128i *key_ptr = &ctx->ekey[0];
__m128i tmp0 = _mm_loadu_si128((const void *) key);
__m128i tmp1, tmp2;
__m128i tmp3 = _mm_setzero_si128();
*key_ptr++ = tmp0;
switch (crypto_aead_aes256gcm_KEYBYTES) {
case 16:
KEY_EXPANSION_BLOCK_1(1);
KEY_EXPANSION_BLOCK_1(2);
KEY_EXPANSION_BLOCK_1(4);
KEY_EXPANSION_BLOCK_1(8);
KEY_EXPANSION_BLOCK_1(16);
KEY_EXPANSION_BLOCK_1(32);
KEY_EXPANSION_BLOCK_1(64);
KEY_EXPANSION_BLOCK_1(128);
KEY_EXPANSION_BLOCK_1(27);
KEY_EXPANSION_BLOCK_1(54);
break;
case 32:
tmp2 = _mm_loadu_si128((const void *) (key + 16));
_mm_store_si128((void *) key_ptr, tmp2);
key_ptr++;
KEY_EXPANSION_BLOCK_2(1);
KEY_EXPANSION_BLOCK_2(2);
KEY_EXPANSION_BLOCK_2(4);
KEY_EXPANSION_BLOCK_2(8);
KEY_EXPANSION_BLOCK_2(16);
KEY_EXPANSION_BLOCK_2(32);
KEY_EXPANSION_LAST(64);
break;
default:
abort();
}
}
#define AESNI_INC \
ctr = _mm_add_epi64(ctr, inc); \
ctr_low++; \
if (ctr_low == 0U) { \
inc = _mm_slli_si128(inc, 8); \
ctr = _mm_add_epi64(ctr, inc); \
inc = _mm_srli_si128(inc, 8); \
} \
iv = ctr; \
iv = _mm_shuffle_epi8(iv, mask)
static const unsigned char
swap_mask[] = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
static void
_aes_ctr(context *ctx, unsigned char *dst, const unsigned char *src, size_t len,
unsigned char *ivc_block)
{
__m128i *key_ptr;
__m128i ctr;
__m128i in1, in2, in3, in4;
__m128i iv;
__m128i key;
__m128i mask;
__m128i state1, state2, state3, state4;
iv = ctr = _mm_loadu_si128((void *) ivc_block);
iv = _mm_slli_si128(iv, 8);
mask = _mm_loadu_si128((const void *) swap_mask);
ctr = _mm_shuffle_epi8(ctr, mask);
uint32_t ctr_low = (uint32_t) 1;
__m128i inc = _mm_cvtsi32_si128((int32_t) ctr_low);
ctr_low = (uint32_t) _mm_cvtsi128_si32(ctr);
#define ENC4(offset) \
key = _mm_loadu_si128((void *) (key_ptr + (offset))); \
state1 = _mm_aesenc_si128(state1, key); \
state2 = _mm_aesenc_si128(state2, key); \
state3 = _mm_aesenc_si128(state3, key); \
state4 = _mm_aesenc_si128(state4, key)
#define ENC4_LAST(offset) \
key = _mm_loadu_si128((void *) (key_ptr + (offset))); \
state1 = _mm_aesenclast_si128(state1, key); \
state2 = _mm_aesenclast_si128(state2, key); \
state3 = _mm_aesenclast_si128(state3, key); \
state4 = _mm_aesenclast_si128(state4, key)
while (len >= 64) {
AESNI_INC;
state1 = iv;
in1 = _mm_loadu_si128((const void *) src);
AESNI_INC;
state2 = iv;
in2 = _mm_loadu_si128((const void *) (src + 16));
AESNI_INC;
state3 = iv;
in3 = _mm_loadu_si128((const void *) (src + 32));
AESNI_INC;
state4 = iv;
in4 = _mm_loadu_si128((const void *) (src + 48));
key = _mm_loadu_si128((void *) &ctx->ekey[0]);
key_ptr = &ctx->ekey[0];
state1 = _mm_xor_si128(state1, key);
state2 = _mm_xor_si128(state2, key);
state3 = _mm_xor_si128(state3, key);
state4 = _mm_xor_si128(state4, key);
key_ptr += 3;
switch (crypto_aead_aes256gcm_KEYBYTES) {
case 32:
key_ptr += 4;
ENC4(-6);
ENC4(-5);
case 16:
ENC4(-4);
ENC4(-3);
ENC4(-2);
ENC4(-1);
ENC4(0);
ENC4(1);
ENC4(2);
ENC4(3);
ENC4(4);
ENC4(5);
ENC4(6);
ENC4_LAST(7);
break;
default:
abort();
}
state1 = _mm_xor_si128(state1, in1);
_mm_storeu_si128((void *) dst, state1);
state2 = _mm_xor_si128(state2, in2);
_mm_storeu_si128((void *) (dst + 16), state2);
state3 = _mm_xor_si128(state3, in3);
_mm_storeu_si128((void *) (dst + 32), state3);
state4 = _mm_xor_si128(state4, in4);
_mm_storeu_si128((void *) (dst + 48), state4);
len -= 64;
src += 64;
dst += 64;
}
#define ENC(offset) \
key = _mm_loadu_si128((void *) (key_ptr + (offset))); \
state1 = _mm_aesenc_si128(state1, key)
#define ENC_LAST(offset) \
key = _mm_loadu_si128((void *) (key_ptr + (offset))); \
state1 = _mm_aesenclast_si128(state1, key);
#define ENC_ONE \
key_ptr = &ctx->ekey[0]; \
key = _mm_loadu_si128((void *) &ctx->ekey[0]); \
state1 = _mm_xor_si128(state1, key); \
key_ptr += 3; \
switch (crypto_aead_aes256gcm_KEYBYTES) { \
case 32: \
key_ptr += 4; \
ENC(-6); \
ENC(-5); \
ENC(-4); \
ENC(-3); \
case 16: \
ENC(-2); \
ENC(-1); \
ENC(0); \
ENC(1); \
ENC(2); \
ENC(3); \
ENC(4); \
ENC(5); \
ENC(6); \
ENC_LAST(7); \
break; \
default: \
abort(); \
}
while (len >= 16) {
AESNI_INC;
state1 = iv;
in1 = _mm_loadu_si128((const void *) src);
ENC_ONE;
state1 = _mm_xor_si128(state1, in1);
_mm_storeu_si128((void *) dst, state1);
len -= 16;
src += 16;
dst += 16;
}
if (len > 0) {
unsigned char padded[16];
memset(padded, 0, sizeof padded);
memcpy(padded, src, len);
src = padded;
AESNI_INC;
state1 = iv;
in1 = _mm_loadu_si128((const void *) src);
ENC_ONE;
state1 = _mm_xor_si128(state1, in1);
_mm_storeu_si128((void *) padded, state1);
memcpy(dst, padded, len);
}
_mm_storel_epi64((void *) ivc_block, iv);
}
static void
_aes_enc_one(context *ctx, unsigned char *dst, unsigned char *src)
{
__m128i *key_ptr;
__m128i key;
__m128i state1 = _mm_loadu_si128((const void *) src);
ENC_ONE;
_mm_storeu_si128((void *) dst, state1);
}
#define GMAC_UPDATE \
tmp2 = _mm_loadu_si128((const void *) src); \
tmp2 = _mm_shuffle_epi8(tmp2, mask); \
tmp0 = _mm_xor_si128(tmp6, tmp2); \
\
tmp4 = _mm_xor_si128(_mm_clmulepi64_si128(tmp0, tmp1, 0x10), \
_mm_clmulepi64_si128(tmp0, tmp1, 0x01)); \
tmp5 = _mm_slli_si128(tmp4, 8); \
tmp3 = _mm_xor_si128(_mm_clmulepi64_si128(tmp0, tmp1, 0x00), tmp5); \
tmp6 = _mm_xor_si128(_mm_clmulepi64_si128(tmp0, tmp1, 0x11), \
_mm_srli_si128(tmp4, 8)); \
tmp7 = _mm_srli_epi32(tmp3, 31); \
tmp8 = _mm_srli_epi32(tmp6, 31); \
tmp3 = _mm_slli_epi32(tmp3, 1); \
tmp6 = _mm_slli_epi32(tmp6, 1); \
tmp8 = _mm_slli_si128(tmp8, 4); \
tmp9 = _mm_srli_si128(tmp7, 12); \
tmp7 = _mm_slli_si128(tmp7, 4); \
tmp3 = _mm_or_si128(tmp3, tmp7); \
tmp6 = _mm_or_si128(_mm_or_si128(tmp6, tmp8), tmp9); \
\
tmp8 = _mm_slli_epi32(tmp3, 30); \
tmp9 = _mm_slli_epi32(tmp3, 25); \
tmp7 = _mm_xor_si128(_mm_xor_si128(_mm_slli_epi32(tmp3, 31), tmp8), tmp9); \
tmp3 = _mm_xor_si128(tmp3, _mm_slli_si128(tmp7, 12)); \
\
tmp4 = _mm_srli_epi32(tmp3, 2); \
tmp5 = _mm_srli_epi32(tmp3, 7); \
tmp6 = _mm_xor_si128(tmp6, _mm_xor_si128(tmp3, \
_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_srli_epi32(tmp3, 1), tmp4), tmp5), \
_mm_srli_si128(tmp7, 4))))
static void
_gmac_update(ghash *ghash, const unsigned char *src, size_t len)
{
__m128i mask = _mm_loadu_si128((const void *) swap_mask);
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
tmp1 = _mm_shuffle_epi8(_mm_loadu_si128((void *) ghash->subkey), mask);
tmp6 = _mm_shuffle_epi8(_mm_loadu_si128((void *) ghash->initial_state), mask);
while (len >= 16) {
GMAC_UPDATE;
len -= 16;
src += 16;
}
if (len > 0) {
unsigned char padded[16];
memset(padded, 0, sizeof padded);
memcpy(padded, src, len);
src = padded;
GMAC_UPDATE;
}
tmp6 = _mm_shuffle_epi8(tmp6, mask);
_mm_storeu_si128((void *) ghash->state, tmp6);
_mm_storeu_si128((void *) ghash->initial_state, tmp6);
}
static void
_gmac_final(context *ctx, unsigned char *tag, unsigned char *ivc_block, unsigned char *hashstate)
{
__m128i *key_ptr;
__m128i in;
__m128i key;
__m128i state1 = _mm_loadu_si128((const void *) ivc_block);
ENC_ONE;
in = _mm_loadu_si128((void *) hashstate);
state1 = _mm_xor_si128(state1, in);
_mm_storeu_si128((void *) tag, state1);
}
int
crypto_aead_aes256gcm_aesni_encrypt(unsigned char *c,
unsigned long long *clen_p,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *ad,
unsigned long long adlen,
const unsigned char *nsec,
const unsigned char *npub,
const unsigned char *k)
{
context ctx;
unsigned char *mac;
unsigned char ivc_block[AES_BLOCKSIZE];
(void) nsec;
memset(&ctx, 0, sizeof ctx);
memset(ivc_block, 0, sizeof ivc_block);
memcpy(ivc_block, npub, crypto_aead_aes256gcm_NPUBBYTES);
ivc_block[AES_BLOCKSIZE - 1U] = 1U;
_key_setup(&ctx, k);
_aes_enc_one(&ctx, ctx.ghash.subkey, ctx.ghash.subkey);
_gmac_update(&ctx.ghash, ad, adlen);
_aes_ctr(&ctx, c, m, mlen, ivc_block);
_gmac_update(&ctx.ghash, c, mlen);
mac = c + mlen;
_u64_be_from_ull(mac, adlen * 8ULL);
_u64_be_from_ull(mac + 8U, mlen * 8ULL);
_gmac_update(&ctx.ghash, mac, GMAC_BLOCKSIZE);
_gmac_final(&ctx, mac, ivc_block, ctx.ghash.state);
sodium_memzero(&ctx, sizeof ctx);
if (clen_p != NULL) {
*clen_p = mlen + crypto_aead_aes256gcm_ABYTES;
}
return 0;
}
int crypto_aead_aes256gcm_aesni_decrypt(unsigned char *m,
unsigned long long *mlen_p,
unsigned char *nsec,
const unsigned char *c,
unsigned long long clen,
const unsigned char *ad,
unsigned long long adlen,
const unsigned char *npub,
const unsigned char *k)
{
context ctx;
unsigned char mac[GMAC_BLOCKSIZE];
unsigned char ivc_block[AES_BLOCKSIZE];
size_t mlen;
(void) nsec;
if (mlen_p != NULL) {
*mlen_p = 0;
}
if (clen < crypto_aead_aes256gcm_ABYTES) {
return -1;
}
mlen = clen - crypto_aead_aes256gcm_ABYTES;
memset(&ctx, 0, sizeof ctx);
memset(ivc_block, 0, sizeof ivc_block);
memcpy(ivc_block, npub, crypto_aead_aes256gcm_NPUBBYTES);
ivc_block[AES_BLOCKSIZE - 1U] = 1U;
_key_setup(&ctx, k);
_aes_enc_one(&ctx, ctx.ghash.subkey, ctx.ghash.subkey);
_gmac_update(&ctx.ghash, ad, adlen);
_gmac_update(&ctx.ghash, c, mlen);
_u64_be_from_ull(mac, adlen * 8ULL);
_u64_be_from_ull(mac + 8U, mlen * 8ULL);
_gmac_update(&ctx.ghash, mac, GMAC_BLOCKSIZE);
_gmac_final(&ctx, mac, ivc_block, ctx.ghash.state);
if (sodium_memcmp(c + mlen, mac, crypto_aead_aes256gcm_ABYTES) != 0) {
sodium_memzero(&ctx, sizeof ctx);
return -1;
}
_aes_ctr(&ctx, m, c, mlen, ivc_block);
sodium_memzero(&ctx, sizeof ctx);
if (mlen_p != NULL) {
*mlen_p = mlen;
}
return 0;
}

View File

@ -2,6 +2,7 @@
SODIUM_EXPORT = \
sodium.h \
sodium/core.h \
sodium/crypto_aead_aes256gcm_aesni.h \
sodium/crypto_aead_chacha20poly1305.h \
sodium/crypto_auth.h \
sodium/crypto_auth_hmacsha256.h \

View File

@ -3,6 +3,7 @@
#define sodium_H
#include "sodium/core.h"
#include "sodium/crypto_aead_aes256gcm_aesni.h"
#include "sodium/crypto_aead_chacha20poly1305.h"
#include "sodium/crypto_auth.h"
#include "sodium/crypto_auth_hmacsha256.h"

View File

@ -0,0 +1,44 @@
#ifndef crypto_aead_aes256gcm_aesni_H
#define crypto_aead_aes256gcm_aesni_H
#include <stddef.h>
#include "export.h"
#ifdef __cplusplus
# if __GNUC__
# pragma GCC diagnostic ignored "-Wlong-long"
# endif
extern "C" {
#endif
#define crypto_aead_aes256gcm_KEYBYTES 32U
#define crypto_aead_aes256gcm_NSECBYTES 0U
#define crypto_aead_aes256gcm_NPUBBYTES 12U
#define crypto_aead_aes256gcm_ABYTES 16U
SODIUM_EXPORT
int crypto_aead_aes256gcm_aesni_encrypt(unsigned char *c,
unsigned long long *clen_p,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *ad,
unsigned long long adlen,
const unsigned char *nsec,
const unsigned char *npub,
const unsigned char *k);
SODIUM_EXPORT
int crypto_aead_aes256gcm_aesni_decrypt(unsigned char *m,
unsigned long long *mlen_p,
unsigned char *nsec,
const unsigned char *c,
unsigned long long clen,
const unsigned char *ad,
unsigned long long adlen,
const unsigned char *npub,
const unsigned char *k);
#ifdef __cplusplus
}
#endif
#endif