Link chacha20_vec

This commit is contained in:
Frank Denis 2015-11-25 15:23:22 +01:00
parent 31c4df3f59
commit fb42d081d7
3 changed files with 381 additions and 1 deletions

View File

@ -285,7 +285,9 @@ libssse3_la_LDFLAGS = $(libsodium_la_LDFLAGS)
libssse3_la_CPPFLAGS = $(libsodium_la_CPPFLAGS) \
@CFLAGS_SSE2@ @CFLAGS_SSSE3@
libssse3_la_SOURCES = \
crypto_generichash/blake2/ref/blake2b-compress-ssse3.c
crypto_generichash/blake2/ref/blake2b-compress-ssse3.c \
crypto_stream/chacha20/vec/stream_chacha20_vec.h \
crypto_stream/chacha20/vec/stream_chacha20_vec.c
libsse41_la_LDFLAGS = $(libsodium_la_LDFLAGS)
libsse41_la_CPPFLAGS = $(libsodium_la_CPPFLAGS) \

View File

@ -0,0 +1,351 @@
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "export.h"
#include "utils.h"
#include "crypto_stream_chacha20.h"
#include "stream_chacha20_vec.h"
#include "../stream_chacha20.h"
/*
 * Everything up to the matching #endif is compiled only when both SSE2 and
 * SSSE3 intrinsic headers are available, or for 64-bit MSVC-style targets.
 * NOTE(review): the code below relies on GCC/Clang vector extensions
 * (__attribute__((vector_size)) and arithmetic operators on vector types);
 * confirm the _MSC_VER branch is only reached with a compiler that supports
 * them.
 */
#if (defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H)) || \
(defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64)))
/* Ask the compiler to enable SSE2/SSSE3 code generation for this file. */
#pragma GCC target("sse2")
#pragma GCC target("ssse3")
/* Number of ChaCha rounds; the loops below run CHACHA_RNDS / 2 double rounds. */
#define CHACHA_RNDS 20
/* 16-byte vector of unsigned: four lanes when unsigned is 32 bits wide. */
typedef unsigned vec __attribute__((vector_size(16)));
#include <emmintrin.h>
#include <tmmintrin.h>
/* VBPI: number of 64-byte blocks processed per main-loop iteration. */
# if __clang__
# define VBPI 4
# else
# define VBPI 3
# endif
/* Vector with lane 0 set to 1 and the other lanes set to 0. */
# define ONE (vec) _mm_set_epi32(0, 0, 0, 1)
/* Unaligned 16-byte load from m. */
# define LOAD(m) (vec) _mm_loadu_si128((__m128i *)(m))
/* 16-byte load from m; m must be 16-byte aligned. */
# define LOAD_ALIGNED(m) (vec) _mm_load_si128((__m128i *)(m))
/* Unaligned 16-byte store of r to m. */
# define STORE(m, r) _mm_storeu_si128((__m128i *)(m), (__m128i)(r))
/*
 * ROTVn: rotate the four 32-bit lanes by n positions, i.e. result lane i
 * takes the value of source lane (i + n) mod 4.
 */
# define ROTV1(x) (vec) _mm_shuffle_epi32((__m128i)x, _MM_SHUFFLE(0, 3, 2, 1))
# define ROTV2(x) (vec) _mm_shuffle_epi32((__m128i)x, _MM_SHUFFLE(1, 0, 3, 2))
# define ROTV3(x) (vec) _mm_shuffle_epi32((__m128i)x, _MM_SHUFFLE(2, 1, 0, 3))
/*
 * ROTWn: rotate every 32-bit lane left by n bits, built from a left shift
 * combined with the complementary right shift.
 */
# define ROTW7(x) \
(vec)(_mm_slli_epi32((__m128i)x, 7) ^ _mm_srli_epi32((__m128i)x, 25))
# define ROTW12(x) \
(vec)(_mm_slli_epi32((__m128i)x, 12) ^ _mm_srli_epi32((__m128i)x, 20))
# define ROTW8(x) \
(vec)(_mm_slli_epi32((__m128i)x, 8) ^ _mm_srli_epi32((__m128i)x, 24))
#define ROTW16(x) \
(vec)(_mm_slli_epi32((__m128i)x, 16) ^ _mm_srli_epi32((__m128i)x, 16))
/*
 * Byte-order hooks applied to keystream vectors/words before use. They
 * default to the identity unless defined earlier.
 */
#ifndef REVV_BE
# define REVV_BE(x) (x)
#endif
#ifndef REVW_BE
# define REVW_BE(x) (x)
#endif
#define BPI (VBPI + 0) /* Blocks computed per loop iteration */
/*
 * One ChaCha double round on a state held as four row vectors a, b, c, d.
 *
 * The first group of add/xor/rotate steps (rotations 16, 12, 8, 7) works on
 * all four lanes at once. The rows b, c, d are then lane-rotated by 1, 2, 3
 * so that the same step sequence operates on a different grouping of words,
 * and finally rotated back (3, 2, 1) to restore the original layout.
 *
 * Expands to a bare sequence of statements ending in ';' (no do/while
 * wrapper), so it is usable with or without a trailing semicolon, but not
 * as the sole body of an unbraced if/else. Each argument is evaluated many
 * times and must be a plain vec lvalue.
 */
#define DQROUND_VECTORS(a, b, c, d) \
a += b; \
d ^= a; \
d = ROTW16(d); \
c += d; \
b ^= c; \
b = ROTW12(b); \
a += b; \
d ^= a; \
d = ROTW8(d); \
c += d; \
b ^= c; \
b = ROTW7(b); \
b = ROTV1(b); \
c = ROTV2(c); \
d = ROTV3(d); \
a += b; \
d ^= a; \
d = ROTW16(d); \
c += d; \
b ^= c; \
b = ROTW12(b); \
a += b; \
d ^= a; \
d = ROTW8(d); \
c += d; \
b ^= c; \
b = ROTW7(b); \
b = ROTV3(b); \
c = ROTV2(c); \
d = ROTV1(d);
/*
 * Scalar counterpart of the vector round: the same add/xor/rotate sequence
 * (left rotations by 16, 12, 8 and 7) applied to four individual words.
 * The rotations are written as shift pairs and are only correct when the
 * operands are unsigned 32-bit values. Arguments are evaluated several
 * times and must be plain lvalues.
 * NOTE(review): no use of this macro is visible in this file.
 */
#define QROUND_WORDS(a, b, c, d) \
a = a + b; \
d ^= a; \
d = d << 16 | d >> 16; \
c = c + d; \
b ^= c; \
b = b << 12 | b >> 20; \
a = a + b; \
d ^= a; \
d = d << 8 | d >> 24; \
c = c + d; \
b ^= c; \
b = b << 7 | b >> 25;
/*
 * XOR four keystream vectors v0..v3 with the input and store the result.
 *
 * in/op are pointers to the input and output, and d is an offset in
 * elements of those pointers. In this file they are unsigned int pointers,
 * so the +0/+4/+8/+12 steps advance by 16 bytes each, and a full 64-byte
 * block is covered. Loads and stores are unaligned. Expands to four
 * statements, each terminated by ';'.
 */
#define WRITE_XOR(in, op, d, v0, v1, v2, v3) \
STORE(op + d + 0, LOAD(in + d + 0) ^ REVV_BE(v0)); \
STORE(op + d + 4, LOAD(in + d + 4) ^ REVV_BE(v1)); \
STORE(op + d + 8, LOAD(in + d + 8) ^ REVV_BE(v2)); \
STORE(op + d + 12, LOAD(in + d + 12) ^ REVV_BE(v3));
/*
 * Per-call cipher state: the three input rows that vary between calls.
 * The constant row is rebuilt inside chacha_encrypt_bytes().
 */
typedef struct chacha_ctx {
    vec s1; /* key bytes 0..15, set by chacha_keysetup() */
    vec s2; /* key bytes 16..31, set by chacha_keysetup() */
    vec s3; /* block counter and nonce words, set by the *_ivsetup() helpers */
} chacha_ctx;
/*
 * Initialise the counter/nonce row for the 64-bit-counter layout.
 *
 * ctx: state whose s3 row is overwritten.
 * iv:  8 bytes of nonce; may have any alignment.
 * ic:  initial 64-bit block counter, split into a low and a high word.
 *
 * The nonce words are fetched with memcpy() rather than through a
 * uint32_t pointer cast, which avoids misaligned reads and strict-aliasing
 * violations. The words keep the host byte order, as before.
 */
static void
chacha_ivsetup(chacha_ctx *ctx, const uint8_t *iv, uint64_t ic)
{
    uint32_t nonce_words[2];

    memcpy(nonce_words, iv, sizeof nonce_words);
    ctx->s3 = (vec) {
        (uint32_t) ic,
        (uint32_t) (ic >> 32),
        nonce_words[0],
        nonce_words[1]
    };
}
/*
 * Initialise the counter/nonce row for the IETF layout: one 32-bit counter
 * word followed by three nonce words.
 *
 * ctx: state whose s3 row is overwritten.
 * iv:  12 bytes of nonce; may have any alignment.
 * ic:  initial 32-bit block counter.
 *
 * The nonce words are fetched with memcpy() rather than through a
 * uint32_t pointer cast, which avoids misaligned reads and strict-aliasing
 * violations. The words keep the host byte order, as before.
 */
static void
chacha_ietf_ivsetup(chacha_ctx *ctx, const uint8_t *iv, uint32_t ic)
{
    uint32_t nonce_words[3];

    memcpy(nonce_words, iv, sizeof nonce_words);
    ctx->s3 = (vec) {
        ic,
        nonce_words[0],
        nonce_words[1],
        nonce_words[2]
    };
}
/*
 * Load the 32-byte key into the two key rows of the state.
 *
 * ctx: state whose s1 and s2 rows are overwritten.
 * k:   32 bytes of key material; read with unaligned loads.
 */
static void
chacha_keysetup(chacha_ctx *ctx, const uint8_t *k)
{
    const uint8_t *const second_half = k + sizeof(vec);

    ctx->s1 = LOAD(k);
    ctx->s2 = LOAD(second_half);
}
/*
 * Advance the block counter held in the low 64 bits of a counter/nonce row.
 *
 * A plain `row + ONE` adds lane-wise on 32-bit lanes and therefore drops
 * the carry from counter word 0 into counter word 1. For the 64-bit-counter
 * layout that makes the counter wrap after 2^32 blocks and repeat keystream.
 * Adding on 64-bit lanes propagates the carry; the upper 64 bits of the row
 * are left untouched because the upper half of ONE is zero.
 * NOTE(review): with the IETF layout, lane 1 holds nonce bytes, so a 32-bit
 * counter overflow now carries into that lane instead of wrapping. Confirm
 * this matches the behaviour expected from the reference implementation.
 */
#define CHACHA_VEC_NEXT_BLOCK(row) \
    ((vec) _mm_add_epi64((__m128i) (row), (__m128i) ONE))

/*
 * XOR `inlen` bytes of `in` with the keystream derived from `ctx` and write
 * the result to `out`.
 *
 * ctx:   key rows (s1, s2) and counter/nonce row (s3). It is only read; the
 *        advanced counter is not written back.
 * in:    input bytes; accessed with unaligned loads.
 * out:   output buffer of at least `inlen` bytes; callers in this file also
 *        pass out == in.
 * inlen: number of bytes to process.
 *
 * Processing happens in three stages: groups of BPI blocks, then single
 * 64-byte blocks, then a final partial block whose trailing bytes (fewer
 * than 16) are handled one by one from a temporary keystream buffer.
 */
static void
chacha_encrypt_bytes(chacha_ctx *ctx, const uint8_t *in, uint8_t *out,
                     unsigned long long inlen)
{
    unsigned long long iters;
    unsigned int       i;
    unsigned int      *op = (unsigned *) out;
    unsigned int      *ip = (unsigned *) in;
    vec                s0, s1, s2, s3;
    /* The four constant words of the first state row. */
    CRYPTO_ALIGN(16) unsigned chacha_const[]
        = { 0x61707865, 0x3320646E, 0x79622D32, 0x6B206574 };

    s0 = LOAD_ALIGNED(chacha_const);
    s1 = ctx->s1;
    s2 = ctx->s2;
    s3 = ctx->s3;

    /* Stage 1: BPI interleaved blocks per iteration. */
    for (iters = 0; iters < inlen / (BPI * 64); iters++) {
#if VBPI > 2
        vec v8, v9, v10, v11;
#endif
#if VBPI > 3
        vec v12, v13, v14, v15;
#endif
        vec v0, v1, v2, v3, v4, v5, v6, v7;

        v4 = v0 = s0;
        v5 = v1 = s1;
        v6 = v2 = s2;
        v3 = s3;
        /* Each additional block uses the next counter value. */
        v7 = CHACHA_VEC_NEXT_BLOCK(v3);
#if VBPI > 2
        v8  = v4;
        v9  = v5;
        v10 = v6;
        v11 = CHACHA_VEC_NEXT_BLOCK(v7);
#endif
#if VBPI > 3
        v12 = v8;
        v13 = v9;
        v14 = v10;
        v15 = CHACHA_VEC_NEXT_BLOCK(v11);
#endif
        for (i = CHACHA_RNDS / 2; i; i--) {
            DQROUND_VECTORS(v0, v1, v2, v3)
            DQROUND_VECTORS(v4, v5, v6, v7)
#if VBPI > 2
            DQROUND_VECTORS(v8, v9, v10, v11)
#endif
#if VBPI > 3
            DQROUND_VECTORS(v12, v13, v14, v15)
#endif
        }
        /*
         * Feed-forward: add the input rows back (32-bit lane adds), then
         * step s3 so that it matches the input row of the following block.
         */
        WRITE_XOR(ip, op, 0, v0 + s0, v1 + s1, v2 + s2, v3 + s3)
        s3 = CHACHA_VEC_NEXT_BLOCK(s3);
        WRITE_XOR(ip, op, 16, v4 + s0, v5 + s1, v6 + s2, v7 + s3)
        s3 = CHACHA_VEC_NEXT_BLOCK(s3);
#if VBPI > 2
        WRITE_XOR(ip, op, 32, v8 + s0, v9 + s1, v10 + s2, v11 + s3)
        s3 = CHACHA_VEC_NEXT_BLOCK(s3);
#endif
#if VBPI > 3
        WRITE_XOR(ip, op, 48, v12 + s0, v13 + s1, v14 + s2, v15 + s3)
        s3 = CHACHA_VEC_NEXT_BLOCK(s3);
#endif
        /* ip/op are unsigned int pointers: 16 elements per 64-byte block. */
        ip += VBPI * 16;
        op += VBPI * 16;
    }

    /* Stage 2: remaining whole 64-byte blocks, one at a time. */
    for (iters = inlen % (BPI * 64) / 64; iters != 0; iters--) {
        vec v0 = s0, v1 = s1, v2 = s2, v3 = s3;

        for (i = CHACHA_RNDS / 2; i; i--) {
            DQROUND_VECTORS(v0, v1, v2, v3);
        }
        WRITE_XOR(ip, op, 0, v0 + s0, v1 + s1, v2 + s2, v3 + s3)
        s3 = CHACHA_VEC_NEXT_BLOCK(s3);
        ip += 16;
        op += 16;
    }

    /* Stage 3: final partial block (1..63 bytes). */
    inlen = inlen % 64;
    if (inlen) {
        CRYPTO_ALIGN(16) vec buf[4];
        vec                  v0, v1, v2, v3;

        v0 = s0;
        v1 = s1;
        v2 = s2;
        v3 = s3;
        for (i = CHACHA_RNDS / 2; i; i--) {
            DQROUND_VECTORS(v0, v1, v2, v3);
        }
        /*
         * Whole 16-byte rows are XORed directly. Only the row that covers
         * the trailing bytes is written to buf, at its natural index, so
         * the byte loop below can index buf with the same offset as in/out.
         */
        if (inlen >= 16) {
            STORE(op + 0, LOAD(ip + 0) ^ REVV_BE(v0 + s0));
            if (inlen >= 32) {
                STORE(op + 4, LOAD(ip + 4) ^ REVV_BE(v1 + s1));
                if (inlen >= 48) {
                    STORE(op + 8, LOAD(ip + 8) ^ REVV_BE(v2 + s2));
                    buf[3] = REVV_BE(v3 + s3);
                } else {
                    buf[2] = REVV_BE(v2 + s2);
                }
            } else {
                buf[1] = REVV_BE(v1 + s1);
            }
        } else {
            buf[0] = REVV_BE(v0 + s0);
        }
        /* Trailing bytes past the last full 16-byte row. */
        for (i = inlen & ~15; i < inlen; i++) {
            ((char *) op)[i] = ((char *) ip)[i] ^ ((char *) buf)[i];
        }
    }
}
static int
stream_vec(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k)
{
struct chacha_ctx ctx;
if (!clen) {
return 0;
}
(void) sizeof(int[crypto_stream_chacha20_KEYBYTES == 256 / 8 ? 1 : -1]);
chacha_keysetup(&ctx, k);
chacha_ivsetup(&ctx, n, 0ULL);
memset(c, 0, clen);
chacha_encrypt_bytes(&ctx, c, c, clen);
sodium_memzero(&ctx, sizeof ctx);
return 0;
}
static int
stream_ietf_vec(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k)
{
struct chacha_ctx ctx;
if (!clen) {
return 0;
}
if (clen > 64ULL * (1ULL << 32) - 64ULL) {
abort();
}
(void) sizeof(int[crypto_stream_chacha20_KEYBYTES == 256 / 8 ? 1 : -1]);
chacha_keysetup(&ctx, k);
chacha_ietf_ivsetup(&ctx, n, 0ULL);
memset(c, 0, clen);
chacha_encrypt_bytes(&ctx, c, c, clen);
sodium_memzero(&ctx, sizeof ctx);
return 0;
}
static int
stream_vec_xor_ic(unsigned char *c, const unsigned char *m,
unsigned long long mlen,
const unsigned char *n, uint64_t ic,
const unsigned char *k)
{
struct chacha_ctx ctx;
if (!mlen) {
return 0;
}
chacha_keysetup(&ctx, k);
chacha_ivsetup(&ctx, n, ic);
chacha_encrypt_bytes(&ctx, m, c, mlen);
sodium_memzero(&ctx, sizeof ctx);
return 0;
}
static int
stream_ietf_vec_xor_ic(unsigned char *c, const unsigned char *m,
unsigned long long mlen,
const unsigned char *n, uint32_t ic,
const unsigned char *k)
{
struct chacha_ctx ctx;
if (!mlen) {
return 0;
}
chacha_keysetup(&ctx, k);
chacha_ietf_ivsetup(&ctx, n, ic);
chacha_encrypt_bytes(&ctx, m, c, mlen);
sodium_memzero(&ctx, sizeof ctx);
return 0;
}
/*
 * Function table exposing the vectorized implementation. The four static
 * entry points above are only reachable through this object, which is the
 * sole non-static symbol defined in this file. SODIUM_C99() wraps the
 * designators, so the initializers must stay in the member order of
 * struct crypto_stream_chacha20_implementation (declared outside this file).
 */
struct crypto_stream_chacha20_implementation
crypto_stream_chacha20_vec_implementation = {
SODIUM_C99(.stream =) stream_vec,
SODIUM_C99(.stream_ietf =) stream_ietf_vec,
SODIUM_C99(.stream_xor_ic =) stream_vec_xor_ic,
SODIUM_C99(.stream_ietf_xor_ic =) stream_ietf_vec_xor_ic
};
#endif

View File

@ -0,0 +1,27 @@
#include <stdint.h>
#include "crypto_stream_chacha20.h"
/* Function table of the vectorized ChaCha20 implementation. */
extern struct crypto_stream_chacha20_implementation
crypto_stream_chacha20_vec_implementation;
/*
 * Direct entry points for the vectorized implementation, in keystream-only
 * and XOR-with-initial-counter forms, for both the 64-bit-counter and the
 * IETF (32-bit counter) variants.
 * NOTE(review): the accompanying .c file only defines static stream_*
 * functions exported through crypto_stream_chacha20_vec_implementation; no
 * definitions with the names below are visible there. Confirm they are
 * defined elsewhere, or drop these prototypes.
 */
int
crypto_stream_chacha20_vec(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
int
crypto_stream_chacha20_vec_xor_ic(unsigned char *c, const unsigned char *m,
unsigned long long mlen,
const unsigned char *n, uint64_t ic,
const unsigned char *k);
int
crypto_stream_chacha20_ietf_vec(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
int
crypto_stream_chacha20_ietf_vec_xor_ic(unsigned char *c, const unsigned char *m,
unsigned long long mlen,
const unsigned char *n, uint32_t ic,
const unsigned char *k);