BLAKE2b AVX2 implementation
By the marvellous Samuel Neves - https://github.com/sneves/blake2-avx2
This commit is contained in:
parent
7611ea6018
commit
0131a72082
@ -394,6 +394,8 @@ AC_SUBST(CFLAGS_SSE2)
|
||||
AC_SUBST(CFLAGS_SSE3)
|
||||
AC_SUBST(CFLAGS_SSSE3)
|
||||
AC_SUBST(CFLAGS_SSE41)
|
||||
AC_SUBST(CFLAGS_AVX)
|
||||
AC_SUBST(CFLAGS_AVX2)
|
||||
AC_SUBST(CFLAGS_AESNI)
|
||||
AC_SUBST(CFLAGS_PCLMUL)
|
||||
|
||||
|
@ -36,6 +36,7 @@ libsodium_la_SOURCES = \
|
||||
crypto_generichash/blake2/ref/blake2b-compress-ref.c \
|
||||
crypto_generichash/blake2/ref/blake2b-load-sse2.h \
|
||||
crypto_generichash/blake2/ref/blake2b-load-sse41.h \
|
||||
crypto_generichash/blake2/ref/blake2b-load-avx2.h \
|
||||
crypto_generichash/blake2/ref/blake2b-ref.c \
|
||||
crypto_generichash/blake2/ref/blake2b-round.h \
|
||||
crypto_generichash/blake2/ref/generichash_blake2b.c \
|
||||
@ -216,8 +217,8 @@ endif
|
||||
SUBDIRS = \
|
||||
include
|
||||
|
||||
libsodium_la_LIBADD = libaesni.la libsse2.la libssse3.la libsse41.la
|
||||
noinst_LTLIBRARIES = libaesni.la libsse2.la libssse3.la libsse41.la
|
||||
libsodium_la_LIBADD = libaesni.la libsse2.la libssse3.la libsse41.la libavx2.la
|
||||
noinst_LTLIBRARIES = libaesni.la libsse2.la libssse3.la libsse41.la libavx2.la
|
||||
|
||||
libaesni_la_LDFLAGS = $(libsodium_la_LDFLAGS)
|
||||
libaesni_la_CPPFLAGS = $(libsodium_la_CPPFLAGS) \
|
||||
@ -238,6 +239,7 @@ libssse3_la_CPPFLAGS = $(libsodium_la_CPPFLAGS) \
|
||||
@CFLAGS_SSE2@ @CFLAGS_SSSE3@
|
||||
libssse3_la_SOURCES = \
|
||||
crypto_generichash/blake2/ref/blake2b-compress-ssse3.c \
|
||||
crypto_generichash/blake2/ref/blake2b-compress-ssse3.h \
|
||||
crypto_pwhash/argon2/argon2-fill-block-ssse3.c \
|
||||
crypto_pwhash/argon2/blamka-round-ssse3.h \
|
||||
crypto_stream/chacha20/vec/stream_chacha20_vec.h \
|
||||
@ -247,4 +249,12 @@ libsse41_la_LDFLAGS = $(libsodium_la_LDFLAGS)
|
||||
libsse41_la_CPPFLAGS = $(libsodium_la_CPPFLAGS) \
|
||||
@CFLAGS_SSE2@ @CFLAGS_SSSE3@ @CFLAGS_SSE41@
|
||||
libsse41_la_SOURCES = \
|
||||
crypto_generichash/blake2/ref/blake2b-compress-sse41.c
|
||||
crypto_generichash/blake2/ref/blake2b-compress-sse41.c \
|
||||
crypto_generichash/blake2/ref/blake2b-compress-sse41.h
|
||||
|
||||
libavx2_la_LDFLAGS = $(libsodium_la_LDFLAGS)
|
||||
libavx2_la_CPPFLAGS = $(libsodium_la_CPPFLAGS) \
|
||||
@CFLAGS_SSE2@ @CFLAGS_SSSE3@ @CFLAGS_SSE41@ @CFLAGS_AVX@ @CFLAGS_AVX2@
|
||||
libavx2_la_SOURCES = \
|
||||
crypto_generichash/blake2/ref/blake2b-compress-avx2.c \
|
||||
crypto_generichash/blake2/ref/blake2b-compress-avx2.h
|
||||
|
@ -183,6 +183,7 @@ CRYPTO_ALIGN( 64 ) typedef struct blake2b_state_
|
||||
int blake2b_compress_ref( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] );
|
||||
int blake2b_compress_ssse3( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] );
|
||||
int blake2b_compress_sse41( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] );
|
||||
int blake2b_compress_avx2( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] );
|
||||
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
|
@ -0,0 +1,45 @@
|
||||
|
||||
#define BLAKE2_USE_SSSE3
|
||||
#define BLAKE2_USE_SSE41
|
||||
#define BLAKE2_USE_AVX2
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#if (defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H)) || \
|
||||
(defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)))
|
||||
|
||||
#pragma GCC target("sse2")
|
||||
#pragma GCC target("ssse3")
|
||||
#pragma GCC target("sse4.1")
|
||||
#pragma GCC target("avx2")
|
||||
|
||||
#include <emmintrin.h>
|
||||
#include <tmmintrin.h>
|
||||
#include <smmintrin.h>
|
||||
#include <immintrin.h>
|
||||
|
||||
#include "blake2.h"
|
||||
#include "blake2-impl.h"
|
||||
#include "blake2b-compress-avx2.h"
|
||||
|
||||
CRYPTO_ALIGN(64) static const uint64_t blake2b_IV[8] =
|
||||
{
|
||||
0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
|
||||
0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
|
||||
0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
|
||||
0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
|
||||
};
|
||||
|
||||
int blake2b_compress_avx2( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] )
|
||||
{
|
||||
__m256i a = LOADU(&S->h[0]);
|
||||
__m256i b = LOADU(&S->h[4]);
|
||||
BLAKE2B_COMPRESS_V1(a, b, block, S->t[0], S->t[1], S->f[0], S->f[1]);
|
||||
STOREU(&S->h[0], a);
|
||||
STOREU(&S->h[4], b);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
@ -0,0 +1,123 @@
|
||||
|
||||
#ifndef blake2b_compress_avx2_H
|
||||
#define blake2b_compress_avx2_H
|
||||
|
||||
#define LOAD128(p) _mm_load_si128((__m128i *)(p))
|
||||
#define STORE128(p, r) _mm_store_si128((__m128i *)(p), r)
|
||||
|
||||
#define LOADU128(p) _mm_loadu_si128((__m128i *)(p))
|
||||
#define STOREU128(p, r) _mm_storeu_si128((__m128i *)(p), r)
|
||||
|
||||
#define LOAD(p) _mm256_load_si256((__m256i *)(p))
|
||||
#define STORE(p, r) _mm256_store_si256((__m256i *)(p), r)
|
||||
|
||||
#define LOADU(p) _mm256_loadu_si256((__m256i *)(p))
|
||||
#define STOREU(p, r) _mm256_storeu_si256((__m256i *)(p), r)
|
||||
|
||||
static inline uint64_t LOADU64(const void *p) {
|
||||
uint64_t v;
|
||||
memcpy(&v, p, sizeof v);
|
||||
return v;
|
||||
}
|
||||
|
||||
#define ROTATE16 _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, \
|
||||
2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9)
|
||||
|
||||
#define ROTATE24 _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, \
|
||||
3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10)
|
||||
|
||||
#define ADD(a, b) _mm256_add_epi64(a, b)
|
||||
#define SUB(a, b) _mm256_sub_epi64(a, b)
|
||||
|
||||
#define XOR(a, b) _mm256_xor_si256(a, b)
|
||||
#define AND(a, b) _mm256_and_si256(a, b)
|
||||
#define OR(a, b) _mm256_or_si256(a, b)
|
||||
|
||||
#define ROT32(x) _mm256_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1))
|
||||
#define ROT24(x) _mm256_shuffle_epi8((x), ROTATE24)
|
||||
#define ROT16(x) _mm256_shuffle_epi8((x), ROTATE16)
|
||||
#define ROT63(x) _mm256_or_si256(_mm256_srli_epi64((x), 63), ADD((x), (x)))
|
||||
|
||||
#define BLAKE2B_G1_V1(a, b, c, d, m) do { \
|
||||
a = ADD(a, m); \
|
||||
a = ADD(a, b); d = XOR(d, a); d = ROT32(d); \
|
||||
c = ADD(c, d); b = XOR(b, c); b = ROT24(b); \
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_G2_V1(a, b, c, d, m) do { \
|
||||
a = ADD(a, m); \
|
||||
a = ADD(a, b); d = XOR(d, a); d = ROT16(d); \
|
||||
c = ADD(c, d); b = XOR(b, c); b = ROT63(b); \
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_DIAG_V1(a, b, c, d) do { \
|
||||
d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(2,1,0,3)); \
|
||||
c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1,0,3,2)); \
|
||||
b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(0,3,2,1)); \
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_UNDIAG_V1(a, b, c, d) do { \
|
||||
d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(0,3,2,1)); \
|
||||
c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1,0,3,2)); \
|
||||
b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(2,1,0,3)); \
|
||||
} while(0)
|
||||
|
||||
#include "blake2b-load-avx2.h"
|
||||
|
||||
#define BLAKE2B_ROUND_V1(a, b, c, d, r, m) do { \
|
||||
__m256i b0; \
|
||||
BLAKE2B_LOAD_MSG_ ##r ##_1(b0); \
|
||||
BLAKE2B_G1_V1(a, b, c, d, b0); \
|
||||
BLAKE2B_LOAD_MSG_ ##r ##_2(b0); \
|
||||
BLAKE2B_G2_V1(a, b, c, d, b0); \
|
||||
BLAKE2B_DIAG_V1(a, b, c, d); \
|
||||
BLAKE2B_LOAD_MSG_ ##r ##_3(b0); \
|
||||
BLAKE2B_G1_V1(a, b, c, d, b0); \
|
||||
BLAKE2B_LOAD_MSG_ ##r ##_4(b0); \
|
||||
BLAKE2B_G2_V1(a, b, c, d, b0); \
|
||||
BLAKE2B_UNDIAG_V1(a, b, c, d); \
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_ROUNDS_V1(a, b, c, d, m) do { \
|
||||
BLAKE2B_ROUND_V1(a, b, c, d, 0, (m)); \
|
||||
BLAKE2B_ROUND_V1(a, b, c, d, 1, (m)); \
|
||||
BLAKE2B_ROUND_V1(a, b, c, d, 2, (m)); \
|
||||
BLAKE2B_ROUND_V1(a, b, c, d, 3, (m)); \
|
||||
BLAKE2B_ROUND_V1(a, b, c, d, 4, (m)); \
|
||||
BLAKE2B_ROUND_V1(a, b, c, d, 5, (m)); \
|
||||
BLAKE2B_ROUND_V1(a, b, c, d, 6, (m)); \
|
||||
BLAKE2B_ROUND_V1(a, b, c, d, 7, (m)); \
|
||||
BLAKE2B_ROUND_V1(a, b, c, d, 8, (m)); \
|
||||
BLAKE2B_ROUND_V1(a, b, c, d, 9, (m)); \
|
||||
BLAKE2B_ROUND_V1(a, b, c, d, 10, (m)); \
|
||||
BLAKE2B_ROUND_V1(a, b, c, d, 11, (m)); \
|
||||
} while(0)
|
||||
|
||||
#define DECLARE_MESSAGE_WORDS(m) \
|
||||
const __m256i m0 = _mm256_broadcastsi128_si256(LOADU128((m) + 0)); \
|
||||
const __m256i m1 = _mm256_broadcastsi128_si256(LOADU128((m) + 16)); \
|
||||
const __m256i m2 = _mm256_broadcastsi128_si256(LOADU128((m) + 32)); \
|
||||
const __m256i m3 = _mm256_broadcastsi128_si256(LOADU128((m) + 48)); \
|
||||
const __m256i m4 = _mm256_broadcastsi128_si256(LOADU128((m) + 64)); \
|
||||
const __m256i m5 = _mm256_broadcastsi128_si256(LOADU128((m) + 80)); \
|
||||
const __m256i m6 = _mm256_broadcastsi128_si256(LOADU128((m) + 96)); \
|
||||
const __m256i m7 = _mm256_broadcastsi128_si256(LOADU128((m) + 112)); \
|
||||
__m256i t0, t1;
|
||||
|
||||
#define BLAKE2B_COMPRESS_V1(a, b, m, t0, t1, f0, f1) do { \
|
||||
DECLARE_MESSAGE_WORDS(m) \
|
||||
const __m256i iv0 = a; \
|
||||
const __m256i iv1 = b; \
|
||||
__m256i c = LOAD(&blake2b_IV[0]); \
|
||||
__m256i d = XOR( \
|
||||
LOAD(&blake2b_IV[4]), \
|
||||
_mm256_set_epi64x(f1, f0, t1, t0) \
|
||||
); \
|
||||
BLAKE2B_ROUNDS_V1(a, b, c, d, m); \
|
||||
a = XOR(a, c); \
|
||||
b = XOR(b, d); \
|
||||
a = XOR(a, iv0); \
|
||||
b = XOR(b, iv1); \
|
||||
} while(0)
|
||||
|
||||
#endif
|
@ -18,7 +18,7 @@
|
||||
|
||||
#include "blake2.h"
|
||||
#include "blake2-impl.h"
|
||||
#include "blake2b-round.h"
|
||||
#include "blake2b-compress-sse41.h"
|
||||
|
||||
CRYPTO_ALIGN(64) static const uint64_t blake2b_IV[8] =
|
||||
{
|
||||
|
@ -1,31 +1,10 @@
|
||||
/*
|
||||
BLAKE2 reference source code package - optimized C implementations
|
||||
|
||||
Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
|
||||
|
||||
To the extent possible under law, the author(s) have dedicated all copyright
|
||||
and related and neighboring rights to this software to the public domain
|
||||
worldwide. This software is distributed without any warranty.
|
||||
|
||||
You should have received a copy of the CC0 Public Domain Dedication along with
|
||||
this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
|
||||
#ifndef blake2b_round_H
|
||||
#define blake2b_round_H
|
||||
|
||||
#ifndef BLAKE2_USE_SSSE3
|
||||
# error BLAKE2_USE_SSSE3 must be defined in order to use this file
|
||||
#endif
|
||||
#ifndef blake2b_compress_sse41_H
|
||||
#define blake2b_compress_sse41_H
|
||||
|
||||
#define LOADU(p) _mm_loadu_si128( (const __m128i *)(const void *)(p) )
|
||||
#define STOREU(p,r) _mm_storeu_si128((__m128i *)(void *)(p), r)
|
||||
|
||||
#define TOF(reg) _mm_castsi128_ps((reg))
|
||||
#define TOI(reg) _mm_castps_si128((reg))
|
||||
|
||||
|
||||
/* Microarchitecture-specific macros */
|
||||
#define _mm_roti_epi64(x, c) \
|
||||
(-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \
|
||||
: (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
|
||||
@ -33,7 +12,6 @@
|
||||
: (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \
|
||||
: _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c))))
|
||||
|
||||
|
||||
#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
|
||||
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
|
||||
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
|
||||
@ -102,11 +80,7 @@
|
||||
row4l = t1; \
|
||||
row4h = t0;
|
||||
|
||||
#if defined(BLAKE2_USE_SSE41)
|
||||
#include "blake2b-load-sse41.h"
|
||||
#else
|
||||
#include "blake2b-load-sse2.h"
|
||||
#endif
|
||||
|
||||
#define ROUND(r) \
|
||||
LOAD_MSG_ ##r ##_1(b0, b1); \
|
@ -1,6 +1,4 @@
|
||||
|
||||
#define BLAKE2_USE_SSSE3
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
@ -18,7 +16,7 @@
|
||||
|
||||
#include "blake2.h"
|
||||
#include "blake2-impl.h"
|
||||
#include "blake2b-round.h"
|
||||
#include "blake2b-compress-ssse3.h"
|
||||
|
||||
CRYPTO_ALIGN(64) static const uint64_t blake2b_IV[8] =
|
||||
{
|
||||
|
@ -0,0 +1,97 @@
|
||||
|
||||
#ifndef blake2b_compress_ssse3_H
|
||||
#define blake2b_compress_ssse3_H
|
||||
|
||||
#define LOADU(p) _mm_loadu_si128( (const __m128i *)(const void *)(p) )
|
||||
#define STOREU(p,r) _mm_storeu_si128((__m128i *)(void *)(p), r)
|
||||
|
||||
#define _mm_roti_epi64(x, c) \
|
||||
(-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \
|
||||
: (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
|
||||
: (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \
|
||||
: (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \
|
||||
: _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c))))
|
||||
|
||||
#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
|
||||
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
|
||||
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
|
||||
\
|
||||
row4l = _mm_xor_si128(row4l, row1l); \
|
||||
row4h = _mm_xor_si128(row4h, row1h); \
|
||||
\
|
||||
row4l = _mm_roti_epi64(row4l, -32); \
|
||||
row4h = _mm_roti_epi64(row4h, -32); \
|
||||
\
|
||||
row3l = _mm_add_epi64(row3l, row4l); \
|
||||
row3h = _mm_add_epi64(row3h, row4h); \
|
||||
\
|
||||
row2l = _mm_xor_si128(row2l, row3l); \
|
||||
row2h = _mm_xor_si128(row2h, row3h); \
|
||||
\
|
||||
row2l = _mm_roti_epi64(row2l, -24); \
|
||||
row2h = _mm_roti_epi64(row2h, -24); \
|
||||
|
||||
#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
|
||||
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
|
||||
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
|
||||
\
|
||||
row4l = _mm_xor_si128(row4l, row1l); \
|
||||
row4h = _mm_xor_si128(row4h, row1h); \
|
||||
\
|
||||
row4l = _mm_roti_epi64(row4l, -16); \
|
||||
row4h = _mm_roti_epi64(row4h, -16); \
|
||||
\
|
||||
row3l = _mm_add_epi64(row3l, row4l); \
|
||||
row3h = _mm_add_epi64(row3h, row4h); \
|
||||
\
|
||||
row2l = _mm_xor_si128(row2l, row3l); \
|
||||
row2h = _mm_xor_si128(row2h, row3h); \
|
||||
\
|
||||
row2l = _mm_roti_epi64(row2l, -63); \
|
||||
row2h = _mm_roti_epi64(row2h, -63); \
|
||||
|
||||
#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
|
||||
t0 = _mm_alignr_epi8(row2h, row2l, 8); \
|
||||
t1 = _mm_alignr_epi8(row2l, row2h, 8); \
|
||||
row2l = t0; \
|
||||
row2h = t1; \
|
||||
\
|
||||
t0 = row3l; \
|
||||
row3l = row3h; \
|
||||
row3h = t0; \
|
||||
\
|
||||
t0 = _mm_alignr_epi8(row4h, row4l, 8); \
|
||||
t1 = _mm_alignr_epi8(row4l, row4h, 8); \
|
||||
row4l = t1; \
|
||||
row4h = t0;
|
||||
|
||||
#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
|
||||
t0 = _mm_alignr_epi8(row2l, row2h, 8); \
|
||||
t1 = _mm_alignr_epi8(row2h, row2l, 8); \
|
||||
row2l = t0; \
|
||||
row2h = t1; \
|
||||
\
|
||||
t0 = row3l; \
|
||||
row3l = row3h; \
|
||||
row3h = t0; \
|
||||
\
|
||||
t0 = _mm_alignr_epi8(row4l, row4h, 8); \
|
||||
t1 = _mm_alignr_epi8(row4h, row4l, 8); \
|
||||
row4l = t1; \
|
||||
row4h = t0;
|
||||
|
||||
#include "blake2b-load-sse2.h"
|
||||
|
||||
#define ROUND(r) \
|
||||
LOAD_MSG_ ##r ##_1(b0, b1); \
|
||||
G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
||||
LOAD_MSG_ ##r ##_2(b0, b1); \
|
||||
G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
||||
DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
|
||||
LOAD_MSG_ ##r ##_3(b0, b1); \
|
||||
G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
||||
LOAD_MSG_ ##r ##_4(b0, b1); \
|
||||
G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
||||
UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
|
||||
|
||||
#endif
|
339
src/libsodium/crypto_generichash/blake2/ref/blake2b-load-avx2.h
Normal file
339
src/libsodium/crypto_generichash/blake2/ref/blake2b-load-avx2.h
Normal file
@ -0,0 +1,339 @@
|
||||
#ifndef blake2b_load_avx2_H
|
||||
#define blake2b_load_avx2_H
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_0_1(b0) do { \
|
||||
t0 = _mm256_unpacklo_epi64(m0, m1); \
|
||||
t1 = _mm256_unpacklo_epi64(m2, m3); \
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_0_2(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpackhi_epi64(m0, m1);\
|
||||
t1 = _mm256_unpackhi_epi64(m2, m3);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_0_3(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpacklo_epi64(m4, m5);\
|
||||
t1 = _mm256_unpacklo_epi64(m6, m7);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_0_4(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpackhi_epi64(m4, m5);\
|
||||
t1 = _mm256_unpackhi_epi64(m6, m7);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_1_1(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpacklo_epi64(m7, m2);\
|
||||
t1 = _mm256_unpackhi_epi64(m4, m6);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_1_2(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpacklo_epi64(m5, m4);\
|
||||
t1 = _mm256_alignr_epi8(m3, m7, 8);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_1_3(b0) \
|
||||
do { \
|
||||
t0 = _mm256_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2));\
|
||||
t1 = _mm256_unpackhi_epi64(m5, m2);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_1_4(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpacklo_epi64(m6, m1);\
|
||||
t1 = _mm256_unpackhi_epi64(m3, m1);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_2_1(b0) \
|
||||
do { \
|
||||
t0 = _mm256_alignr_epi8(m6, m5, 8);\
|
||||
t1 = _mm256_unpackhi_epi64(m2, m7);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_2_2(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpacklo_epi64(m4, m0);\
|
||||
t1 = _mm256_blend_epi32(m6, m1, 0x33);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_2_3(b0) \
|
||||
do { \
|
||||
t0 = _mm256_blend_epi32(m1, m5, 0x33);\
|
||||
t1 = _mm256_unpackhi_epi64(m3, m4);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_2_4(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpacklo_epi64(m7, m3);\
|
||||
t1 = _mm256_alignr_epi8(m2, m0, 8);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_3_1(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpackhi_epi64(m3, m1);\
|
||||
t1 = _mm256_unpackhi_epi64(m6, m5);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_3_2(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpackhi_epi64(m4, m0);\
|
||||
t1 = _mm256_unpacklo_epi64(m6, m7);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_3_3(b0) \
|
||||
do { \
|
||||
t0 = _mm256_blend_epi32(m2, m1, 0x33);\
|
||||
t1 = _mm256_blend_epi32(m7, m2, 0x33);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_3_4(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpacklo_epi64(m3, m5);\
|
||||
t1 = _mm256_unpacklo_epi64(m0, m4);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_4_1(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpackhi_epi64(m4, m2);\
|
||||
t1 = _mm256_unpacklo_epi64(m1, m5);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_4_2(b0) \
|
||||
do { \
|
||||
t0 = _mm256_blend_epi32(m3, m0, 0x33);\
|
||||
t1 = _mm256_blend_epi32(m7, m2, 0x33);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_4_3(b0) \
|
||||
do { \
|
||||
t0 = _mm256_blend_epi32(m5, m7, 0x33);\
|
||||
t1 = _mm256_blend_epi32(m1, m3, 0x33);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_4_4(b0) \
|
||||
do { \
|
||||
t0 = _mm256_alignr_epi8(m6, m0, 8);\
|
||||
t1 = _mm256_blend_epi32(m6, m4, 0x33);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_5_1(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpacklo_epi64(m1, m3);\
|
||||
t1 = _mm256_unpacklo_epi64(m0, m4);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_5_2(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpacklo_epi64(m6, m5);\
|
||||
t1 = _mm256_unpackhi_epi64(m5, m1);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_5_3(b0) \
|
||||
do { \
|
||||
t0 = _mm256_blend_epi32(m3, m2, 0x33);\
|
||||
t1 = _mm256_unpackhi_epi64(m7, m0);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_5_4(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpackhi_epi64(m6, m2);\
|
||||
t1 = _mm256_blend_epi32(m4, m7, 0x33);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_6_1(b0) \
|
||||
do { \
|
||||
t0 = _mm256_blend_epi32(m0, m6, 0x33);\
|
||||
t1 = _mm256_unpacklo_epi64(m7, m2);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_6_2(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpackhi_epi64(m2, m7);\
|
||||
t1 = _mm256_alignr_epi8(m5, m6, 8);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_6_3(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpacklo_epi64(m0, m3);\
|
||||
t1 = _mm256_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2));\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_6_4(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpackhi_epi64(m3, m1);\
|
||||
t1 = _mm256_blend_epi32(m5, m1, 0x33);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_7_1(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpackhi_epi64(m6, m3);\
|
||||
t1 = _mm256_blend_epi32(m1, m6, 0x33);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_7_2(b0) \
|
||||
do { \
|
||||
t0 = _mm256_alignr_epi8(m7, m5, 8);\
|
||||
t1 = _mm256_unpackhi_epi64(m0, m4);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_7_3(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpackhi_epi64(m2, m7);\
|
||||
t1 = _mm256_unpacklo_epi64(m4, m1);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_7_4(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpacklo_epi64(m0, m2);\
|
||||
t1 = _mm256_unpacklo_epi64(m3, m5);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_8_1(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpacklo_epi64(m3, m7);\
|
||||
t1 = _mm256_alignr_epi8(m0, m5, 8);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_8_2(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpackhi_epi64(m7, m4);\
|
||||
t1 = _mm256_alignr_epi8(m4, m1, 8);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_8_3(b0) \
|
||||
do { \
|
||||
t0 = m6;\
|
||||
t1 = _mm256_alignr_epi8(m5, m0, 8);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_8_4(b0) \
|
||||
do { \
|
||||
t0 = _mm256_blend_epi32(m3, m1, 0x33);\
|
||||
t1 = m2;\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_9_1(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpacklo_epi64(m5, m4);\
|
||||
t1 = _mm256_unpackhi_epi64(m3, m0);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_9_2(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpacklo_epi64(m1, m2);\
|
||||
t1 = _mm256_blend_epi32(m2, m3, 0x33);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_9_3(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpackhi_epi64(m7, m4);\
|
||||
t1 = _mm256_unpackhi_epi64(m1, m6);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_9_4(b0) \
|
||||
do { \
|
||||
t0 = _mm256_alignr_epi8(m7, m5, 8);\
|
||||
t1 = _mm256_unpacklo_epi64(m6, m0);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_10_1(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpacklo_epi64(m0, m1);\
|
||||
t1 = _mm256_unpacklo_epi64(m2, m3);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_10_2(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpackhi_epi64(m0, m1);\
|
||||
t1 = _mm256_unpackhi_epi64(m2, m3);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_10_3(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpacklo_epi64(m4, m5);\
|
||||
t1 = _mm256_unpacklo_epi64(m6, m7);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_10_4(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpackhi_epi64(m4, m5);\
|
||||
t1 = _mm256_unpackhi_epi64(m6, m7);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_11_1(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpacklo_epi64(m7, m2);\
|
||||
t1 = _mm256_unpackhi_epi64(m4, m6);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_11_2(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpacklo_epi64(m5, m4);\
|
||||
t1 = _mm256_alignr_epi8(m3, m7, 8);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_11_3(b0) \
|
||||
do { \
|
||||
t0 = _mm256_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2));\
|
||||
t1 = _mm256_unpackhi_epi64(m5, m2);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_11_4(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpacklo_epi64(m6, m1);\
|
||||
t1 = _mm256_unpackhi_epi64(m3, m1);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#endif
|
@ -416,6 +416,13 @@ int blake2b_salt_personal( uint8_t *out, const void *in, const void *key, const
|
||||
int
|
||||
blake2b_pick_best_implementation(void)
|
||||
{
|
||||
#if (defined(HAVE_AVX2INTRIN_H) && defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H)) || \
|
||||
(defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)))
|
||||
if (sodium_runtime_has_avx2()) {
|
||||
blake2b_compress = blake2b_compress_avx2;
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
#if (defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H)) || \
|
||||
(defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)))
|
||||
if (sodium_runtime_has_sse41()) {
|
||||
|
Loading…
Reference in New Issue
Block a user