BLAKE2b AVX2 implementation

By the marvellous Samuel Neves - https://github.com/sneves/blake2-avx2
2016-03-17 10:45:11 +01:00 · 2016-03-17 10:45:11 +01:00 · 0131a72082
commit 0131a72082
parent 7611ea6018
11 changed files with 631 additions and 35 deletions
--- a/configure.ac
+++ b/configure.ac
@ -394,6 +394,8 @@ AC_SUBST(CFLAGS_SSE2)
 AC_SUBST(CFLAGS_SSE3)
 AC_SUBST(CFLAGS_SSSE3)
 AC_SUBST(CFLAGS_SSE41)
 AC_SUBST(CFLAGS_AVX)
 AC_SUBST(CFLAGS_AVX2)
 AC_SUBST(CFLAGS_AESNI)
 AC_SUBST(CFLAGS_PCLMUL)
--- a/src/libsodium/Makefile.am
+++ b/src/libsodium/Makefile.am
@ -36,6 +36,7 @@ libsodium_la_SOURCES = \
 	crypto_generichash/blake2/ref/blake2b-compress-ref.c \
 	crypto_generichash/blake2/ref/blake2b-load-sse2.h \
 	crypto_generichash/blake2/ref/blake2b-load-sse41.h \
 	crypto_generichash/blake2/ref/blake2b-load-avx2.h \
 	crypto_generichash/blake2/ref/blake2b-ref.c \
 	crypto_generichash/blake2/ref/blake2b-round.h \
 	crypto_generichash/blake2/ref/generichash_blake2b.c \
@ -216,8 +217,8 @@ endif
 SUBDIRS = \
 	include
-libsodium_la_LIBADD = libaesni.la libsse2.la libssse3.la libsse41.la
+libsodium_la_LIBADD = libaesni.la libsse2.la libssse3.la libsse41.la libavx2.la
-noinst_LTLIBRARIES =  libaesni.la libsse2.la libssse3.la libsse41.la
+noinst_LTLIBRARIES =  libaesni.la libsse2.la libssse3.la libsse41.la libavx2.la
 libaesni_la_LDFLAGS = $(libsodium_la_LDFLAGS)
 libaesni_la_CPPFLAGS = $(libsodium_la_CPPFLAGS) \
@ -238,6 +239,7 @@ libssse3_la_CPPFLAGS = $(libsodium_la_CPPFLAGS) \
 	@CFLAGS_SSE2@ @CFLAGS_SSSE3@
 libssse3_la_SOURCES = \
 	crypto_generichash/blake2/ref/blake2b-compress-ssse3.c \
 	crypto_generichash/blake2/ref/blake2b-compress-ssse3.h \
 	crypto_pwhash/argon2/argon2-fill-block-ssse3.c \
 	crypto_pwhash/argon2/blamka-round-ssse3.h \
 	crypto_stream/chacha20/vec/stream_chacha20_vec.h \
@ -247,4 +249,12 @@ libsse41_la_LDFLAGS = $(libsodium_la_LDFLAGS)
 libsse41_la_CPPFLAGS = $(libsodium_la_CPPFLAGS) \
 	@CFLAGS_SSE2@ @CFLAGS_SSSE3@ @CFLAGS_SSE41@
 libsse41_la_SOURCES = \
-	crypto_generichash/blake2/ref/blake2b-compress-sse41.c
+	crypto_generichash/blake2/ref/blake2b-compress-sse41.c \
 	crypto_generichash/blake2/ref/blake2b-compress-sse41.h
 libavx2_la_LDFLAGS = $(libsodium_la_LDFLAGS)
 libavx2_la_CPPFLAGS = $(libsodium_la_CPPFLAGS) \
 	@CFLAGS_SSE2@ @CFLAGS_SSSE3@ @CFLAGS_SSE41@ @CFLAGS_AVX@ @CFLAGS_AVX2@
 libavx2_la_SOURCES = \
 	crypto_generichash/blake2/ref/blake2b-compress-avx2.c \
 	crypto_generichash/blake2/ref/blake2b-compress-avx2.h
--- a/src/libsodium/crypto_generichash/blake2/ref/blake2.h
+++ b/src/libsodium/crypto_generichash/blake2/ref/blake2.h
@ -183,6 +183,7 @@ CRYPTO_ALIGN( 64 ) typedef struct blake2b_state_
  int blake2b_compress_ref( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] );
  int blake2b_compress_ssse3( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] );
  int blake2b_compress_sse41( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] );
  int blake2b_compress_avx2( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] );
 #if defined(__cplusplus)
 }
--- a/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-avx2.c
+++ b/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-avx2.c
@ -0,0 +1,45 @@
 #define BLAKE2_USE_SSSE3
 #define BLAKE2_USE_SSE41
 #define BLAKE2_USE_AVX2
 #include <stdint.h>
 #include <string.h>
 #if (defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H)) || \
    (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)))
 #pragma GCC target("sse2")
 #pragma GCC target("ssse3")
 #pragma GCC target("sse4.1")
 #pragma GCC target("avx2")
 #include <emmintrin.h>
 #include <tmmintrin.h>
 #include <smmintrin.h>
 #include <immintrin.h>
 #include "blake2.h"
 #include "blake2-impl.h"
 #include "blake2b-compress-avx2.h"
 CRYPTO_ALIGN(64) static const uint64_t blake2b_IV[8] =
 {
    0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
    0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
    0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
    0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
 };
 int blake2b_compress_avx2( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] )
 {
    __m256i a = LOADU(&S->h[0]);
    __m256i b = LOADU(&S->h[4]);
    BLAKE2B_COMPRESS_V1(a, b, block, S->t[0], S->t[1], S->f[0], S->f[1]);
    STOREU(&S->h[0], a);
    STOREU(&S->h[4], b);
    return 0;
 }
 #endif
--- a/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-avx2.h
+++ b/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-avx2.h
@ -0,0 +1,123 @@
 #ifndef blake2b_compress_avx2_H
 #define blake2b_compress_avx2_H
 #define LOAD128(p)     _mm_load_si128((__m128i *)(p))
 #define STORE128(p, r) _mm_store_si128((__m128i *)(p), r)
 #define LOADU128(p)     _mm_loadu_si128((__m128i *)(p))
 #define STOREU128(p, r) _mm_storeu_si128((__m128i *)(p), r)
 #define LOAD(p)     _mm256_load_si256((__m256i *)(p))
 #define STORE(p, r) _mm256_store_si256((__m256i *)(p), r)
 #define LOADU(p)     _mm256_loadu_si256((__m256i *)(p))
 #define STOREU(p, r) _mm256_storeu_si256((__m256i *)(p), r)
 static inline uint64_t LOADU64(const void *p) {
    uint64_t v;
    memcpy(&v, p, sizeof v);
    return v;
 }
 #define ROTATE16 _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, \
                                  2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9)
 #define ROTATE24 _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, \
                                  3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10)
 #define ADD(a, b) _mm256_add_epi64(a, b)
 #define SUB(a, b) _mm256_sub_epi64(a, b)
 #define XOR(a, b) _mm256_xor_si256(a, b)
 #define AND(a, b) _mm256_and_si256(a, b)
 #define  OR(a, b) _mm256_or_si256(a, b)
 #define ROT32(x) _mm256_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1))
 #define ROT24(x) _mm256_shuffle_epi8((x), ROTATE24)
 #define ROT16(x) _mm256_shuffle_epi8((x), ROTATE16)
 #define ROT63(x) _mm256_or_si256(_mm256_srli_epi64((x), 63), ADD((x), (x)))
 #define BLAKE2B_G1_V1(a, b, c, d, m) do {       \
    a = ADD(a, m);                              \
    a = ADD(a, b); d = XOR(d, a); d = ROT32(d); \
    c = ADD(c, d); b = XOR(b, c); b = ROT24(b); \
 } while(0)
 #define BLAKE2B_G2_V1(a, b, c, d, m) do {       \
    a = ADD(a, m);                              \
    a = ADD(a, b); d = XOR(d, a); d = ROT16(d); \
    c = ADD(c, d); b = XOR(b, c); b = ROT63(b); \
 } while(0)
 #define BLAKE2B_DIAG_V1(a, b, c, d) do {                   \
    d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(2,1,0,3)); \
    c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1,0,3,2)); \
    b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(0,3,2,1)); \
 } while(0)
 #define BLAKE2B_UNDIAG_V1(a, b, c, d) do {                 \
    d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(0,3,2,1)); \
    c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1,0,3,2)); \
    b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(2,1,0,3)); \
 } while(0)
 #include "blake2b-load-avx2.h"
 #define BLAKE2B_ROUND_V1(a, b, c, d, r, m) do { \
    __m256i b0;                                 \
    BLAKE2B_LOAD_MSG_ ##r ##_1(b0);             \
    BLAKE2B_G1_V1(a, b, c, d, b0);              \
    BLAKE2B_LOAD_MSG_ ##r ##_2(b0);             \
    BLAKE2B_G2_V1(a, b, c, d, b0);              \
    BLAKE2B_DIAG_V1(a, b, c, d);                \
    BLAKE2B_LOAD_MSG_ ##r ##_3(b0);             \
    BLAKE2B_G1_V1(a, b, c, d, b0);              \
    BLAKE2B_LOAD_MSG_ ##r ##_4(b0);             \
    BLAKE2B_G2_V1(a, b, c, d, b0);              \
    BLAKE2B_UNDIAG_V1(a, b, c, d);              \
 } while(0)
 #define BLAKE2B_ROUNDS_V1(a, b, c, d, m) do { \
    BLAKE2B_ROUND_V1(a, b, c, d,  0, (m));    \
    BLAKE2B_ROUND_V1(a, b, c, d,  1, (m));    \
    BLAKE2B_ROUND_V1(a, b, c, d,  2, (m));    \
    BLAKE2B_ROUND_V1(a, b, c, d,  3, (m));    \
    BLAKE2B_ROUND_V1(a, b, c, d,  4, (m));    \
    BLAKE2B_ROUND_V1(a, b, c, d,  5, (m));    \
    BLAKE2B_ROUND_V1(a, b, c, d,  6, (m));    \
    BLAKE2B_ROUND_V1(a, b, c, d,  7, (m));    \
    BLAKE2B_ROUND_V1(a, b, c, d,  8, (m));    \
    BLAKE2B_ROUND_V1(a, b, c, d,  9, (m));    \
    BLAKE2B_ROUND_V1(a, b, c, d, 10, (m));    \
    BLAKE2B_ROUND_V1(a, b, c, d, 11, (m));    \
 } while(0)
 #define DECLARE_MESSAGE_WORDS(m)                                         \
    const __m256i m0 = _mm256_broadcastsi128_si256(LOADU128((m) +   0)); \
    const __m256i m1 = _mm256_broadcastsi128_si256(LOADU128((m) +  16)); \
    const __m256i m2 = _mm256_broadcastsi128_si256(LOADU128((m) +  32)); \
    const __m256i m3 = _mm256_broadcastsi128_si256(LOADU128((m) +  48)); \
    const __m256i m4 = _mm256_broadcastsi128_si256(LOADU128((m) +  64)); \
    const __m256i m5 = _mm256_broadcastsi128_si256(LOADU128((m) +  80)); \
    const __m256i m6 = _mm256_broadcastsi128_si256(LOADU128((m) +  96)); \
    const __m256i m7 = _mm256_broadcastsi128_si256(LOADU128((m) + 112)); \
    __m256i t0, t1;
 #define BLAKE2B_COMPRESS_V1(a, b, m, t0, t1, f0, f1) do { \
    DECLARE_MESSAGE_WORDS(m)                              \
    const __m256i iv0 = a;                                \
    const __m256i iv1 = b;                                \
    __m256i c = LOAD(&blake2b_IV[0]);                     \
    __m256i d = XOR(                                      \
      LOAD(&blake2b_IV[4]),                               \
      _mm256_set_epi64x(f1, f0, t1, t0)                   \
    );                                                    \
    BLAKE2B_ROUNDS_V1(a, b, c, d, m);                     \
    a = XOR(a, c);                                        \
    b = XOR(b, d);                                        \
    a = XOR(a, iv0);                                      \
    b = XOR(b, iv1);                                      \
 } while(0)
 #endif
--- a/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-sse41.c
+++ b/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-sse41.c
@ -18,7 +18,7 @@
 #include "blake2.h"
 #include "blake2-impl.h"
-#include "blake2b-round.h"
+#include "blake2b-compress-sse41.h"
 CRYPTO_ALIGN(64) static const uint64_t blake2b_IV[8] =
 {
--- a/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-sse41.h
+++ b/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-sse41.h
@ -1,31 +1,10 @@
 /*
   BLAKE2 reference source code package - optimized C implementations
-   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
+#ifndef blake2b_compress_sse41_H
-
+#define blake2b_compress_sse41_H
   To the extent possible under law, the author(s) have dedicated all copyright
   and related and neighboring rights to this software to the public domain
   worldwide. This software is distributed without any warranty.
   You should have received a copy of the CC0 Public Domain Dedication along with
   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
 */
 #ifndef blake2b_round_H
 #define blake2b_round_H
 #ifndef BLAKE2_USE_SSSE3
 # error BLAKE2_USE_SSSE3 must be defined in order to use this file
 #endif
 #define LOADU(p)  _mm_loadu_si128( (const __m128i *)(const void *)(p) )
 #define STOREU(p,r) _mm_storeu_si128((__m128i *)(void *)(p), r)
 #define TOF(reg) _mm_castsi128_ps((reg))
 #define TOI(reg) _mm_castps_si128((reg))
 /* Microarchitecture-specific macros */
 #define _mm_roti_epi64(x, c) \
    (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1))  \
    : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
@ -33,7 +12,6 @@
    : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x)))  \
    : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c))))
 #define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
@ -102,11 +80,7 @@
  row4l = t1; \
  row4h = t0;
 #if defined(BLAKE2_USE_SSE41)
 #include "blake2b-load-sse41.h"
 #else
 #include "blake2b-load-sse2.h"
 #endif
 #define ROUND(r) \
  LOAD_MSG_ ##r ##_1(b0, b1); \
--- a/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-ssse3.c
+++ b/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-ssse3.c
@ -1,6 +1,4 @@
 #define BLAKE2_USE_SSSE3
 #include <stdint.h>
 #include <string.h>
@ -18,7 +16,7 @@
 #include "blake2.h"
 #include "blake2-impl.h"
-#include "blake2b-round.h"
+#include "blake2b-compress-ssse3.h"
 CRYPTO_ALIGN(64) static const uint64_t blake2b_IV[8] =
 {
--- a/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-ssse3.h
+++ b/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-ssse3.h
@ -0,0 +1,97 @@
 #ifndef blake2b_compress_ssse3_H
 #define blake2b_compress_ssse3_H
 #define LOADU(p)  _mm_loadu_si128( (const __m128i *)(const void *)(p) )
 #define STOREU(p,r) _mm_storeu_si128((__m128i *)(void *)(p), r)
 #define _mm_roti_epi64(x, c) \
    (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1))  \
    : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
    : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \
    : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x)))  \
    : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c))))
 #define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
  \
  row4l = _mm_xor_si128(row4l, row1l); \
  row4h = _mm_xor_si128(row4h, row1h); \
  \
  row4l = _mm_roti_epi64(row4l, -32); \
  row4h = _mm_roti_epi64(row4h, -32); \
  \
  row3l = _mm_add_epi64(row3l, row4l); \
  row3h = _mm_add_epi64(row3h, row4h); \
  \
  row2l = _mm_xor_si128(row2l, row3l); \
  row2h = _mm_xor_si128(row2h, row3h); \
  \
  row2l = _mm_roti_epi64(row2l, -24); \
  row2h = _mm_roti_epi64(row2h, -24); \
 #define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
  \
  row4l = _mm_xor_si128(row4l, row1l); \
  row4h = _mm_xor_si128(row4h, row1h); \
  \
  row4l = _mm_roti_epi64(row4l, -16); \
  row4h = _mm_roti_epi64(row4h, -16); \
  \
  row3l = _mm_add_epi64(row3l, row4l); \
  row3h = _mm_add_epi64(row3h, row4h); \
  \
  row2l = _mm_xor_si128(row2l, row3l); \
  row2h = _mm_xor_si128(row2h, row3h); \
  \
  row2l = _mm_roti_epi64(row2l, -63); \
  row2h = _mm_roti_epi64(row2h, -63); \
 #define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
  t0 = _mm_alignr_epi8(row2h, row2l, 8); \
  t1 = _mm_alignr_epi8(row2l, row2h, 8); \
  row2l = t0; \
  row2h = t1; \
  \
  t0 = row3l; \
  row3l = row3h; \
  row3h = t0;    \
  \
  t0 = _mm_alignr_epi8(row4h, row4l, 8); \
  t1 = _mm_alignr_epi8(row4l, row4h, 8); \
  row4l = t1; \
  row4h = t0;
 #define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
  t0 = _mm_alignr_epi8(row2l, row2h, 8); \
  t1 = _mm_alignr_epi8(row2h, row2l, 8); \
  row2l = t0; \
  row2h = t1; \
  \
  t0 = row3l; \
  row3l = row3h; \
  row3h = t0; \
  \
  t0 = _mm_alignr_epi8(row4l, row4h, 8); \
  t1 = _mm_alignr_epi8(row4h, row4l, 8); \
  row4l = t1; \
  row4h = t0;
 #include "blake2b-load-sse2.h"
 #define ROUND(r) \
  LOAD_MSG_ ##r ##_1(b0, b1); \
  G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
  LOAD_MSG_ ##r ##_2(b0, b1); \
  G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
  DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
  LOAD_MSG_ ##r ##_3(b0, b1); \
  G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
  LOAD_MSG_ ##r ##_4(b0, b1); \
  G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
  UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
 #endif
--- a/src/libsodium/crypto_generichash/blake2/ref/blake2b-load-avx2.h
+++ b/src/libsodium/crypto_generichash/blake2/ref/blake2b-load-avx2.h
@ -0,0 +1,339 @@
 #ifndef blake2b_load_avx2_H
 #define blake2b_load_avx2_H
 #define BLAKE2B_LOAD_MSG_0_1(b0) do { \
  t0 = _mm256_unpacklo_epi64(m0, m1); \
  t1 = _mm256_unpacklo_epi64(m2, m3); \
  b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
 } while(0)
 #define BLAKE2B_LOAD_MSG_0_2(b0) \
 do { \
 t0 = _mm256_unpackhi_epi64(m0, m1);\
 t1 = _mm256_unpackhi_epi64(m2, m3);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_0_3(b0) \
 do { \
 t0 = _mm256_unpacklo_epi64(m4, m5);\
 t1 = _mm256_unpacklo_epi64(m6, m7);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_0_4(b0) \
 do { \
 t0 = _mm256_unpackhi_epi64(m4, m5);\
 t1 = _mm256_unpackhi_epi64(m6, m7);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_1_1(b0) \
 do { \
 t0 = _mm256_unpacklo_epi64(m7, m2);\
 t1 = _mm256_unpackhi_epi64(m4, m6);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_1_2(b0) \
 do { \
 t0 = _mm256_unpacklo_epi64(m5, m4);\
 t1 = _mm256_alignr_epi8(m3, m7, 8);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_1_3(b0) \
 do { \
 t0 = _mm256_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2));\
 t1 = _mm256_unpackhi_epi64(m5, m2);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_1_4(b0) \
 do { \
 t0 = _mm256_unpacklo_epi64(m6, m1);\
 t1 = _mm256_unpackhi_epi64(m3, m1);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_2_1(b0) \
 do { \
 t0 = _mm256_alignr_epi8(m6, m5, 8);\
 t1 = _mm256_unpackhi_epi64(m2, m7);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_2_2(b0) \
 do { \
 t0 = _mm256_unpacklo_epi64(m4, m0);\
 t1 = _mm256_blend_epi32(m6, m1, 0x33);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_2_3(b0) \
 do { \
 t0 = _mm256_blend_epi32(m1, m5, 0x33);\
 t1 = _mm256_unpackhi_epi64(m3, m4);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_2_4(b0) \
 do { \
 t0 = _mm256_unpacklo_epi64(m7, m3);\
 t1 = _mm256_alignr_epi8(m2, m0, 8);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_3_1(b0) \
 do { \
 t0 = _mm256_unpackhi_epi64(m3, m1);\
 t1 = _mm256_unpackhi_epi64(m6, m5);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_3_2(b0) \
 do { \
 t0 = _mm256_unpackhi_epi64(m4, m0);\
 t1 = _mm256_unpacklo_epi64(m6, m7);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_3_3(b0) \
 do { \
 t0 = _mm256_blend_epi32(m2, m1, 0x33);\
 t1 = _mm256_blend_epi32(m7, m2, 0x33);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_3_4(b0) \
 do { \
 t0 = _mm256_unpacklo_epi64(m3, m5);\
 t1 = _mm256_unpacklo_epi64(m0, m4);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_4_1(b0) \
 do { \
 t0 = _mm256_unpackhi_epi64(m4, m2);\
 t1 = _mm256_unpacklo_epi64(m1, m5);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_4_2(b0) \
 do { \
 t0 = _mm256_blend_epi32(m3, m0, 0x33);\
 t1 = _mm256_blend_epi32(m7, m2, 0x33);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_4_3(b0) \
 do { \
 t0 = _mm256_blend_epi32(m5, m7, 0x33);\
 t1 = _mm256_blend_epi32(m1, m3, 0x33);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_4_4(b0) \
 do { \
 t0 = _mm256_alignr_epi8(m6, m0, 8);\
 t1 = _mm256_blend_epi32(m6, m4, 0x33);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_5_1(b0) \
 do { \
 t0 = _mm256_unpacklo_epi64(m1, m3);\
 t1 = _mm256_unpacklo_epi64(m0, m4);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_5_2(b0) \
 do { \
 t0 = _mm256_unpacklo_epi64(m6, m5);\
 t1 = _mm256_unpackhi_epi64(m5, m1);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_5_3(b0) \
 do { \
 t0 = _mm256_blend_epi32(m3, m2, 0x33);\
 t1 = _mm256_unpackhi_epi64(m7, m0);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_5_4(b0) \
 do { \
 t0 = _mm256_unpackhi_epi64(m6, m2);\
 t1 = _mm256_blend_epi32(m4, m7, 0x33);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_6_1(b0) \
 do { \
 t0 = _mm256_blend_epi32(m0, m6, 0x33);\
 t1 = _mm256_unpacklo_epi64(m7, m2);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_6_2(b0) \
 do { \
 t0 = _mm256_unpackhi_epi64(m2, m7);\
 t1 = _mm256_alignr_epi8(m5, m6, 8);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_6_3(b0) \
 do { \
 t0 = _mm256_unpacklo_epi64(m0, m3);\
 t1 = _mm256_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2));\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_6_4(b0) \
 do { \
 t0 = _mm256_unpackhi_epi64(m3, m1);\
 t1 = _mm256_blend_epi32(m5, m1, 0x33);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_7_1(b0) \
 do { \
 t0 = _mm256_unpackhi_epi64(m6, m3);\
 t1 = _mm256_blend_epi32(m1, m6, 0x33);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_7_2(b0) \
 do { \
 t0 = _mm256_alignr_epi8(m7, m5, 8);\
 t1 = _mm256_unpackhi_epi64(m0, m4);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_7_3(b0) \
 do { \
 t0 = _mm256_unpackhi_epi64(m2, m7);\
 t1 = _mm256_unpacklo_epi64(m4, m1);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_7_4(b0) \
 do { \
 t0 = _mm256_unpacklo_epi64(m0, m2);\
 t1 = _mm256_unpacklo_epi64(m3, m5);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_8_1(b0) \
 do { \
 t0 = _mm256_unpacklo_epi64(m3, m7);\
 t1 = _mm256_alignr_epi8(m0, m5, 8);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_8_2(b0) \
 do { \
 t0 = _mm256_unpackhi_epi64(m7, m4);\
 t1 = _mm256_alignr_epi8(m4, m1, 8);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_8_3(b0) \
 do { \
 t0 = m6;\
 t1 = _mm256_alignr_epi8(m5, m0, 8);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_8_4(b0) \
 do { \
 t0 = _mm256_blend_epi32(m3, m1, 0x33);\
 t1 = m2;\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_9_1(b0) \
 do { \
 t0 = _mm256_unpacklo_epi64(m5, m4);\
 t1 = _mm256_unpackhi_epi64(m3, m0);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_9_2(b0) \
 do { \
 t0 = _mm256_unpacklo_epi64(m1, m2);\
 t1 = _mm256_blend_epi32(m2, m3, 0x33);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_9_3(b0) \
 do { \
 t0 = _mm256_unpackhi_epi64(m7, m4);\
 t1 = _mm256_unpackhi_epi64(m1, m6);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_9_4(b0) \
 do { \
 t0 = _mm256_alignr_epi8(m7, m5, 8);\
 t1 = _mm256_unpacklo_epi64(m6, m0);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_10_1(b0) \
 do { \
 t0 = _mm256_unpacklo_epi64(m0, m1);\
 t1 = _mm256_unpacklo_epi64(m2, m3);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_10_2(b0) \
 do { \
 t0 = _mm256_unpackhi_epi64(m0, m1);\
 t1 = _mm256_unpackhi_epi64(m2, m3);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_10_3(b0) \
 do { \
 t0 = _mm256_unpacklo_epi64(m4, m5);\
 t1 = _mm256_unpacklo_epi64(m6, m7);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_10_4(b0) \
 do { \
 t0 = _mm256_unpackhi_epi64(m4, m5);\
 t1 = _mm256_unpackhi_epi64(m6, m7);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_11_1(b0) \
 do { \
 t0 = _mm256_unpacklo_epi64(m7, m2);\
 t1 = _mm256_unpackhi_epi64(m4, m6);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_11_2(b0) \
 do { \
 t0 = _mm256_unpacklo_epi64(m5, m4);\
 t1 = _mm256_alignr_epi8(m3, m7, 8);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_11_3(b0) \
 do { \
 t0 = _mm256_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2));\
 t1 = _mm256_unpackhi_epi64(m5, m2);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #define BLAKE2B_LOAD_MSG_11_4(b0) \
 do { \
 t0 = _mm256_unpacklo_epi64(m6, m1);\
 t1 = _mm256_unpackhi_epi64(m3, m1);\
 b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
 } while(0)
 #endif
--- a/src/libsodium/crypto_generichash/blake2/ref/blake2b-ref.c
+++ b/src/libsodium/crypto_generichash/blake2/ref/blake2b-ref.c
@ -416,6 +416,13 @@ int blake2b_salt_personal( uint8_t *out, const void *in, const void *key, const
 int
 blake2b_pick_best_implementation(void)
 {
 #if (defined(HAVE_AVX2INTRIN_H) && defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H)) || \
    (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)))
  if (sodium_runtime_has_avx2()) {
    blake2b_compress = blake2b_compress_avx2;
    return 0;
  }
 #endif
 #if (defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H)) || \
    (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)))
  if (sodium_runtime_has_sse41()) {