From 0131a720826045e476e6dd6a8e7a1991f1d941aa Mon Sep 17 00:00:00 2001 From: Frank Denis Date: Thu, 17 Mar 2016 10:45:11 +0100 Subject: [PATCH] BLAKE2b AVX2 implementation By the marvellous Samuel Neves - https://github.com/sneves/blake2-avx2 --- configure.ac | 2 + src/libsodium/Makefile.am | 16 +- .../crypto_generichash/blake2/ref/blake2.h | 1 + .../blake2/ref/blake2b-compress-avx2.c | 45 +++ .../blake2/ref/blake2b-compress-avx2.h | 123 +++++++ .../blake2/ref/blake2b-compress-sse41.c | 2 +- ...ake2b-round.h => blake2b-compress-sse41.h} | 30 +- .../blake2/ref/blake2b-compress-ssse3.c | 4 +- .../blake2/ref/blake2b-compress-ssse3.h | 97 +++++ .../blake2/ref/blake2b-load-avx2.h | 339 ++++++++++++++++++ .../blake2/ref/blake2b-ref.c | 7 + 11 files changed, 631 insertions(+), 35 deletions(-) create mode 100644 src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-avx2.c create mode 100644 src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-avx2.h rename src/libsodium/crypto_generichash/blake2/ref/{blake2b-round.h => blake2b-compress-sse41.h} (77%) create mode 100644 src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-ssse3.h create mode 100644 src/libsodium/crypto_generichash/blake2/ref/blake2b-load-avx2.h diff --git a/configure.ac b/configure.ac index 38eac7af..2328ca4a 100644 --- a/configure.ac +++ b/configure.ac @@ -394,6 +394,8 @@ AC_SUBST(CFLAGS_SSE2) AC_SUBST(CFLAGS_SSE3) AC_SUBST(CFLAGS_SSSE3) AC_SUBST(CFLAGS_SSE41) +AC_SUBST(CFLAGS_AVX) +AC_SUBST(CFLAGS_AVX2) AC_SUBST(CFLAGS_AESNI) AC_SUBST(CFLAGS_PCLMUL) diff --git a/src/libsodium/Makefile.am b/src/libsodium/Makefile.am index 456d653a..62f5711d 100644 --- a/src/libsodium/Makefile.am +++ b/src/libsodium/Makefile.am @@ -36,6 +36,7 @@ libsodium_la_SOURCES = \ crypto_generichash/blake2/ref/blake2b-compress-ref.c \ crypto_generichash/blake2/ref/blake2b-load-sse2.h \ crypto_generichash/blake2/ref/blake2b-load-sse41.h \ + crypto_generichash/blake2/ref/blake2b-load-avx2.h \ crypto_generichash/blake2/ref/blake2b-ref.c \ crypto_generichash/blake2/ref/blake2b-round.h \ crypto_generichash/blake2/ref/generichash_blake2b.c \ @@ -216,8 +217,8 @@ endif SUBDIRS = \ include -libsodium_la_LIBADD = libaesni.la libsse2.la libssse3.la libsse41.la -noinst_LTLIBRARIES = libaesni.la libsse2.la libssse3.la libsse41.la +libsodium_la_LIBADD = libaesni.la libsse2.la libssse3.la libsse41.la libavx2.la +noinst_LTLIBRARIES = libaesni.la libsse2.la libssse3.la libsse41.la libavx2.la libaesni_la_LDFLAGS = $(libsodium_la_LDFLAGS) libaesni_la_CPPFLAGS = $(libsodium_la_CPPFLAGS) \ @@ -238,6 +239,7 @@ libssse3_la_CPPFLAGS = $(libsodium_la_CPPFLAGS) \ @CFLAGS_SSE2@ @CFLAGS_SSSE3@ libssse3_la_SOURCES = \ crypto_generichash/blake2/ref/blake2b-compress-ssse3.c \ + crypto_generichash/blake2/ref/blake2b-compress-ssse3.h \ crypto_pwhash/argon2/argon2-fill-block-ssse3.c \ crypto_pwhash/argon2/blamka-round-ssse3.h \ crypto_stream/chacha20/vec/stream_chacha20_vec.h \ @@ -247,4 +249,12 @@ libsse41_la_LDFLAGS = $(libsodium_la_LDFLAGS) libsse41_la_CPPFLAGS = $(libsodium_la_CPPFLAGS) \ @CFLAGS_SSE2@ @CFLAGS_SSSE3@ @CFLAGS_SSE41@ libsse41_la_SOURCES = \ - crypto_generichash/blake2/ref/blake2b-compress-sse41.c + crypto_generichash/blake2/ref/blake2b-compress-sse41.c \ + crypto_generichash/blake2/ref/blake2b-compress-sse41.h + +libavx2_la_LDFLAGS = $(libsodium_la_LDFLAGS) +libavx2_la_CPPFLAGS = $(libsodium_la_CPPFLAGS) \ + @CFLAGS_SSE2@ @CFLAGS_SSSE3@ @CFLAGS_SSE41@ @CFLAGS_AVX@ @CFLAGS_AVX2@ +libavx2_la_SOURCES = \ + crypto_generichash/blake2/ref/blake2b-compress-avx2.c \ + crypto_generichash/blake2/ref/blake2b-compress-avx2.h diff --git a/src/libsodium/crypto_generichash/blake2/ref/blake2.h b/src/libsodium/crypto_generichash/blake2/ref/blake2.h index 175a2cac..28620597 100644 --- a/src/libsodium/crypto_generichash/blake2/ref/blake2.h +++ b/src/libsodium/crypto_generichash/blake2/ref/blake2.h @@ -183,6 +183,7 @@ CRYPTO_ALIGN( 64 ) typedef struct blake2b_state_ int blake2b_compress_ref( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] ); int blake2b_compress_ssse3( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] ); int blake2b_compress_sse41( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] ); + int blake2b_compress_avx2( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] ); #if defined(__cplusplus) } diff --git a/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-avx2.c b/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-avx2.c new file mode 100644 index 00000000..e1ca9d27 --- /dev/null +++ b/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-avx2.c @@ -0,0 +1,45 @@ + +#define BLAKE2_USE_SSSE3 +#define BLAKE2_USE_SSE41 +#define BLAKE2_USE_AVX2 + +#include +#include + +#if (defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H)) || \ + (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86))) + +#pragma GCC target("sse2") +#pragma GCC target("ssse3") +#pragma GCC target("sse4.1") +#pragma GCC target("avx2") + +#include +#include +#include +#include + +#include "blake2.h" +#include "blake2-impl.h" +#include "blake2b-compress-avx2.h" + +CRYPTO_ALIGN(64) static const uint64_t blake2b_IV[8] = +{ + 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL +}; + +int blake2b_compress_avx2( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] ) +{ + __m256i a = LOADU(&S->h[0]); + __m256i b = LOADU(&S->h[4]); + BLAKE2B_COMPRESS_V1(a, b, block, S->t[0], S->t[1], S->f[0], S->f[1]); + STOREU(&S->h[0], a); + STOREU(&S->h[4], b); + + return 0; +} + +#endif diff --git a/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-avx2.h b/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-avx2.h new file mode 100644 index 00000000..af24871f --- /dev/null +++ b/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-avx2.h @@ -0,0 +1,123 @@ + +#ifndef blake2b_compress_avx2_H +#define blake2b_compress_avx2_H + +#define LOAD128(p) _mm_load_si128((__m128i *)(p)) +#define STORE128(p, r) _mm_store_si128((__m128i *)(p), r) + +#define LOADU128(p) _mm_loadu_si128((__m128i *)(p)) +#define STOREU128(p, r) _mm_storeu_si128((__m128i *)(p), r) + +#define LOAD(p) _mm256_load_si256((__m256i *)(p)) +#define STORE(p, r) _mm256_store_si256((__m256i *)(p), r) + +#define LOADU(p) _mm256_loadu_si256((__m256i *)(p)) +#define STOREU(p, r) _mm256_storeu_si256((__m256i *)(p), r) + +static inline uint64_t LOADU64(const void *p) { + uint64_t v; + memcpy(&v, p, sizeof v); + return v; +} + +#define ROTATE16 _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, \ + 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9) + +#define ROTATE24 _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, \ + 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10) + +#define ADD(a, b) _mm256_add_epi64(a, b) +#define SUB(a, b) _mm256_sub_epi64(a, b) + +#define XOR(a, b) _mm256_xor_si256(a, b) +#define AND(a, b) _mm256_and_si256(a, b) +#define OR(a, b) _mm256_or_si256(a, b) + +#define ROT32(x) _mm256_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) +#define ROT24(x) _mm256_shuffle_epi8((x), ROTATE24) +#define ROT16(x) _mm256_shuffle_epi8((x), ROTATE16) +#define ROT63(x) _mm256_or_si256(_mm256_srli_epi64((x), 63), ADD((x), (x))) + +#define BLAKE2B_G1_V1(a, b, c, d, m) do { \ + a = ADD(a, m); \ + a = ADD(a, b); d = XOR(d, a); d = ROT32(d); \ + c = ADD(c, d); b = XOR(b, c); b = ROT24(b); \ +} while(0) + +#define BLAKE2B_G2_V1(a, b, c, d, m) do { \ + a = ADD(a, m); \ + a = ADD(a, b); d = XOR(d, a); d = ROT16(d); \ + c = ADD(c, d); b = XOR(b, c); b = ROT63(b); \ +} while(0) + +#define BLAKE2B_DIAG_V1(a, b, c, d) do { \ + d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(2,1,0,3)); \ + c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1,0,3,2)); \ + b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(0,3,2,1)); \ +} while(0) + +#define BLAKE2B_UNDIAG_V1(a, b, c, d) do { \ + d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(0,3,2,1)); \ + c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1,0,3,2)); \ + b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(2,1,0,3)); \ +} while(0) + +#include "blake2b-load-avx2.h" + +#define BLAKE2B_ROUND_V1(a, b, c, d, r, m) do { \ + __m256i b0; \ + BLAKE2B_LOAD_MSG_ ##r ##_1(b0); \ + BLAKE2B_G1_V1(a, b, c, d, b0); \ + BLAKE2B_LOAD_MSG_ ##r ##_2(b0); \ + BLAKE2B_G2_V1(a, b, c, d, b0); \ + BLAKE2B_DIAG_V1(a, b, c, d); \ + BLAKE2B_LOAD_MSG_ ##r ##_3(b0); \ + BLAKE2B_G1_V1(a, b, c, d, b0); \ + BLAKE2B_LOAD_MSG_ ##r ##_4(b0); \ + BLAKE2B_G2_V1(a, b, c, d, b0); \ + BLAKE2B_UNDIAG_V1(a, b, c, d); \ +} while(0) + +#define BLAKE2B_ROUNDS_V1(a, b, c, d, m) do { \ + BLAKE2B_ROUND_V1(a, b, c, d, 0, (m)); \ + BLAKE2B_ROUND_V1(a, b, c, d, 1, (m)); \ + BLAKE2B_ROUND_V1(a, b, c, d, 2, (m)); \ + BLAKE2B_ROUND_V1(a, b, c, d, 3, (m)); \ + BLAKE2B_ROUND_V1(a, b, c, d, 4, (m)); \ + BLAKE2B_ROUND_V1(a, b, c, d, 5, (m)); \ + BLAKE2B_ROUND_V1(a, b, c, d, 6, (m)); \ + BLAKE2B_ROUND_V1(a, b, c, d, 7, (m)); \ + BLAKE2B_ROUND_V1(a, b, c, d, 8, (m)); \ + BLAKE2B_ROUND_V1(a, b, c, d, 9, (m)); \ + BLAKE2B_ROUND_V1(a, b, c, d, 10, (m)); \ + BLAKE2B_ROUND_V1(a, b, c, d, 11, (m)); \ +} while(0) + +#define DECLARE_MESSAGE_WORDS(m) \ + const __m256i m0 = _mm256_broadcastsi128_si256(LOADU128((m) + 0)); \ + const __m256i m1 = _mm256_broadcastsi128_si256(LOADU128((m) + 16)); \ + const __m256i m2 = _mm256_broadcastsi128_si256(LOADU128((m) + 32)); \ + const __m256i m3 = _mm256_broadcastsi128_si256(LOADU128((m) + 48)); \ + const __m256i m4 = _mm256_broadcastsi128_si256(LOADU128((m) + 64)); \ + const __m256i m5 = _mm256_broadcastsi128_si256(LOADU128((m) + 80)); \ + const __m256i m6 = _mm256_broadcastsi128_si256(LOADU128((m) + 96)); \ + const __m256i m7 = _mm256_broadcastsi128_si256(LOADU128((m) + 112)); \ + __m256i t0, t1; + +#define BLAKE2B_COMPRESS_V1(a, b, m, t0, t1, f0, f1) do { \ + DECLARE_MESSAGE_WORDS(m) \ + const __m256i iv0 = a; \ + const __m256i iv1 = b; \ + __m256i c = LOAD(&blake2b_IV[0]); \ + __m256i d = XOR( \ + LOAD(&blake2b_IV[4]), \ + _mm256_set_epi64x(f1, f0, t1, t0) \ + ); \ + BLAKE2B_ROUNDS_V1(a, b, c, d, m); \ + a = XOR(a, c); \ + b = XOR(b, d); \ + a = XOR(a, iv0); \ + b = XOR(b, iv1); \ +} while(0) + +#endif diff --git a/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-sse41.c b/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-sse41.c index b542849c..ea064c26 100644 --- a/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-sse41.c +++ b/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-sse41.c @@ -18,7 +18,7 @@ #include "blake2.h" #include "blake2-impl.h" -#include "blake2b-round.h" +#include "blake2b-compress-sse41.h" CRYPTO_ALIGN(64) static const uint64_t blake2b_IV[8] = { diff --git a/src/libsodium/crypto_generichash/blake2/ref/blake2b-round.h b/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-sse41.h similarity index 77% rename from src/libsodium/crypto_generichash/blake2/ref/blake2b-round.h rename to src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-sse41.h index 0c322b18..8e854661 100644 --- a/src/libsodium/crypto_generichash/blake2/ref/blake2b-round.h +++ b/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-sse41.h @@ -1,31 +1,10 @@ -/* - BLAKE2 reference source code package - optimized C implementations - Written in 2012 by Samuel Neves - - To the extent possible under law, the author(s) have dedicated all copyright - and related and neighboring rights to this software to the public domain - worldwide. This software is distributed without any warranty. - - You should have received a copy of the CC0 Public Domain Dedication along with - this software. If not, see . -*/ - -#ifndef blake2b_round_H -#define blake2b_round_H - -#ifndef BLAKE2_USE_SSSE3 -# error BLAKE2_USE_SSSE3 must be defined in order to use this file -#endif +#ifndef blake2b_compress_sse41_H +#define blake2b_compress_sse41_H #define LOADU(p) _mm_loadu_si128( (const __m128i *)(const void *)(p) ) #define STOREU(p,r) _mm_storeu_si128((__m128i *)(void *)(p), r) -#define TOF(reg) _mm_castsi128_ps((reg)) -#define TOI(reg) _mm_castps_si128((reg)) - - -/* Microarchitecture-specific macros */ #define _mm_roti_epi64(x, c) \ (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \ : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \ @@ -33,7 +12,6 @@ : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \ : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c)))) - #define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ @@ -102,11 +80,7 @@ row4l = t1; \ row4h = t0; -#if defined(BLAKE2_USE_SSE41) #include "blake2b-load-sse41.h" -#else -#include "blake2b-load-sse2.h" -#endif #define ROUND(r) \ LOAD_MSG_ ##r ##_1(b0, b1); \ diff --git a/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-ssse3.c b/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-ssse3.c index 3058e44c..882351e2 100644 --- a/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-ssse3.c +++ b/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-ssse3.c @@ -1,6 +1,4 @@ -#define BLAKE2_USE_SSSE3 - #include #include @@ -18,7 +16,7 @@ #include "blake2.h" #include "blake2-impl.h" -#include "blake2b-round.h" +#include "blake2b-compress-ssse3.h" CRYPTO_ALIGN(64) static const uint64_t blake2b_IV[8] = { diff --git a/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-ssse3.h b/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-ssse3.h new file mode 100644 index 00000000..dcfe84e9 --- /dev/null +++ b/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-ssse3.h @@ -0,0 +1,97 @@ + +#ifndef blake2b_compress_ssse3_H +#define blake2b_compress_ssse3_H + +#define LOADU(p) _mm_loadu_si128( (const __m128i *)(const void *)(p) ) +#define STOREU(p,r) _mm_storeu_si128((__m128i *)(void *)(p), r) + +#define _mm_roti_epi64(x, c) \ + (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \ + : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \ + : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \ + : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \ + : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c)))) + +#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, -32); \ + row4h = _mm_roti_epi64(row4h, -32); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, -24); \ + row2h = _mm_roti_epi64(row2h, -24); \ + +#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, -16); \ + row4h = _mm_roti_epi64(row4h, -16); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, -63); \ + row2h = _mm_roti_epi64(row2h, -63); \ + +#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = _mm_alignr_epi8(row2h, row2l, 8); \ + t1 = _mm_alignr_epi8(row2l, row2h, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4h, row4l, 8); \ + t1 = _mm_alignr_epi8(row4l, row4h, 8); \ + row4l = t1; \ + row4h = t0; + +#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = _mm_alignr_epi8(row2l, row2h, 8); \ + t1 = _mm_alignr_epi8(row2h, row2l, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4l, row4h, 8); \ + t1 = _mm_alignr_epi8(row4h, row4l, 8); \ + row4l = t1; \ + row4h = t0; + +#include "blake2b-load-sse2.h" + +#define ROUND(r) \ + LOAD_MSG_ ##r ##_1(b0, b1); \ + G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + LOAD_MSG_ ##r ##_2(b0, b1); \ + G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + LOAD_MSG_ ##r ##_3(b0, b1); \ + G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + LOAD_MSG_ ##r ##_4(b0, b1); \ + G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); + +#endif diff --git a/src/libsodium/crypto_generichash/blake2/ref/blake2b-load-avx2.h b/src/libsodium/crypto_generichash/blake2/ref/blake2b-load-avx2.h new file mode 100644 index 00000000..61b8f9a9 --- /dev/null +++ b/src/libsodium/crypto_generichash/blake2/ref/blake2b-load-avx2.h @@ -0,0 +1,339 @@ +#ifndef blake2b_load_avx2_H +#define blake2b_load_avx2_H + +#define BLAKE2B_LOAD_MSG_0_1(b0) do { \ + t0 = _mm256_unpacklo_epi64(m0, m1); \ + t1 = _mm256_unpacklo_epi64(m2, m3); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ +} while(0) + +#define BLAKE2B_LOAD_MSG_0_2(b0) \ +do { \ +t0 = _mm256_unpackhi_epi64(m0, m1);\ +t1 = _mm256_unpackhi_epi64(m2, m3);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_0_3(b0) \ +do { \ +t0 = _mm256_unpacklo_epi64(m4, m5);\ +t1 = _mm256_unpacklo_epi64(m6, m7);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_0_4(b0) \ +do { \ +t0 = _mm256_unpackhi_epi64(m4, m5);\ +t1 = _mm256_unpackhi_epi64(m6, m7);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_1_1(b0) \ +do { \ +t0 = _mm256_unpacklo_epi64(m7, m2);\ +t1 = _mm256_unpackhi_epi64(m4, m6);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_1_2(b0) \ +do { \ +t0 = _mm256_unpacklo_epi64(m5, m4);\ +t1 = _mm256_alignr_epi8(m3, m7, 8);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_1_3(b0) \ +do { \ +t0 = _mm256_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2));\ +t1 = _mm256_unpackhi_epi64(m5, m2);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_1_4(b0) \ +do { \ +t0 = _mm256_unpacklo_epi64(m6, m1);\ +t1 = _mm256_unpackhi_epi64(m3, m1);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_2_1(b0) \ +do { \ +t0 = _mm256_alignr_epi8(m6, m5, 8);\ +t1 = _mm256_unpackhi_epi64(m2, m7);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_2_2(b0) \ +do { \ +t0 = _mm256_unpacklo_epi64(m4, m0);\ +t1 = _mm256_blend_epi32(m6, m1, 0x33);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_2_3(b0) \ +do { \ +t0 = _mm256_blend_epi32(m1, m5, 0x33);\ +t1 = _mm256_unpackhi_epi64(m3, m4);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_2_4(b0) \ +do { \ +t0 = _mm256_unpacklo_epi64(m7, m3);\ +t1 = _mm256_alignr_epi8(m2, m0, 8);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_3_1(b0) \ +do { \ +t0 = _mm256_unpackhi_epi64(m3, m1);\ +t1 = _mm256_unpackhi_epi64(m6, m5);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_3_2(b0) \ +do { \ +t0 = _mm256_unpackhi_epi64(m4, m0);\ +t1 = _mm256_unpacklo_epi64(m6, m7);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_3_3(b0) \ +do { \ +t0 = _mm256_blend_epi32(m2, m1, 0x33);\ +t1 = _mm256_blend_epi32(m7, m2, 0x33);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_3_4(b0) \ +do { \ +t0 = _mm256_unpacklo_epi64(m3, m5);\ +t1 = _mm256_unpacklo_epi64(m0, m4);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_4_1(b0) \ +do { \ +t0 = _mm256_unpackhi_epi64(m4, m2);\ +t1 = _mm256_unpacklo_epi64(m1, m5);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_4_2(b0) \ +do { \ +t0 = _mm256_blend_epi32(m3, m0, 0x33);\ +t1 = _mm256_blend_epi32(m7, m2, 0x33);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_4_3(b0) \ +do { \ +t0 = _mm256_blend_epi32(m5, m7, 0x33);\ +t1 = _mm256_blend_epi32(m1, m3, 0x33);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_4_4(b0) \ +do { \ +t0 = _mm256_alignr_epi8(m6, m0, 8);\ +t1 = _mm256_blend_epi32(m6, m4, 0x33);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_5_1(b0) \ +do { \ +t0 = _mm256_unpacklo_epi64(m1, m3);\ +t1 = _mm256_unpacklo_epi64(m0, m4);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_5_2(b0) \ +do { \ +t0 = _mm256_unpacklo_epi64(m6, m5);\ +t1 = _mm256_unpackhi_epi64(m5, m1);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_5_3(b0) \ +do { \ +t0 = _mm256_blend_epi32(m3, m2, 0x33);\ +t1 = _mm256_unpackhi_epi64(m7, m0);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_5_4(b0) \ +do { \ +t0 = _mm256_unpackhi_epi64(m6, m2);\ +t1 = _mm256_blend_epi32(m4, m7, 0x33);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_6_1(b0) \ +do { \ +t0 = _mm256_blend_epi32(m0, m6, 0x33);\ +t1 = _mm256_unpacklo_epi64(m7, m2);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_6_2(b0) \ +do { \ +t0 = _mm256_unpackhi_epi64(m2, m7);\ +t1 = _mm256_alignr_epi8(m5, m6, 8);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_6_3(b0) \ +do { \ +t0 = _mm256_unpacklo_epi64(m0, m3);\ +t1 = _mm256_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2));\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_6_4(b0) \ +do { \ +t0 = _mm256_unpackhi_epi64(m3, m1);\ +t1 = _mm256_blend_epi32(m5, m1, 0x33);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_7_1(b0) \ +do { \ +t0 = _mm256_unpackhi_epi64(m6, m3);\ +t1 = _mm256_blend_epi32(m1, m6, 0x33);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_7_2(b0) \ +do { \ +t0 = _mm256_alignr_epi8(m7, m5, 8);\ +t1 = _mm256_unpackhi_epi64(m0, m4);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_7_3(b0) \ +do { \ +t0 = _mm256_unpackhi_epi64(m2, m7);\ +t1 = _mm256_unpacklo_epi64(m4, m1);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_7_4(b0) \ +do { \ +t0 = _mm256_unpacklo_epi64(m0, m2);\ +t1 = _mm256_unpacklo_epi64(m3, m5);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_8_1(b0) \ +do { \ +t0 = _mm256_unpacklo_epi64(m3, m7);\ +t1 = _mm256_alignr_epi8(m0, m5, 8);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_8_2(b0) \ +do { \ +t0 = _mm256_unpackhi_epi64(m7, m4);\ +t1 = _mm256_alignr_epi8(m4, m1, 8);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_8_3(b0) \ +do { \ +t0 = m6;\ +t1 = _mm256_alignr_epi8(m5, m0, 8);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_8_4(b0) \ +do { \ +t0 = _mm256_blend_epi32(m3, m1, 0x33);\ +t1 = m2;\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_9_1(b0) \ +do { \ +t0 = _mm256_unpacklo_epi64(m5, m4);\ +t1 = _mm256_unpackhi_epi64(m3, m0);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_9_2(b0) \ +do { \ +t0 = _mm256_unpacklo_epi64(m1, m2);\ +t1 = _mm256_blend_epi32(m2, m3, 0x33);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_9_3(b0) \ +do { \ +t0 = _mm256_unpackhi_epi64(m7, m4);\ +t1 = _mm256_unpackhi_epi64(m1, m6);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_9_4(b0) \ +do { \ +t0 = _mm256_alignr_epi8(m7, m5, 8);\ +t1 = _mm256_unpacklo_epi64(m6, m0);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_10_1(b0) \ +do { \ +t0 = _mm256_unpacklo_epi64(m0, m1);\ +t1 = _mm256_unpacklo_epi64(m2, m3);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_10_2(b0) \ +do { \ +t0 = _mm256_unpackhi_epi64(m0, m1);\ +t1 = _mm256_unpackhi_epi64(m2, m3);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_10_3(b0) \ +do { \ +t0 = _mm256_unpacklo_epi64(m4, m5);\ +t1 = _mm256_unpacklo_epi64(m6, m7);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_10_4(b0) \ +do { \ +t0 = _mm256_unpackhi_epi64(m4, m5);\ +t1 = _mm256_unpackhi_epi64(m6, m7);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_11_1(b0) \ +do { \ +t0 = _mm256_unpacklo_epi64(m7, m2);\ +t1 = _mm256_unpackhi_epi64(m4, m6);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_11_2(b0) \ +do { \ +t0 = _mm256_unpacklo_epi64(m5, m4);\ +t1 = _mm256_alignr_epi8(m3, m7, 8);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_11_3(b0) \ +do { \ +t0 = _mm256_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2));\ +t1 = _mm256_unpackhi_epi64(m5, m2);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_11_4(b0) \ +do { \ +t0 = _mm256_unpacklo_epi64(m6, m1);\ +t1 = _mm256_unpackhi_epi64(m3, m1);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#endif diff --git a/src/libsodium/crypto_generichash/blake2/ref/blake2b-ref.c b/src/libsodium/crypto_generichash/blake2/ref/blake2b-ref.c index feaea381..8b6c7edf 100644 --- a/src/libsodium/crypto_generichash/blake2/ref/blake2b-ref.c +++ b/src/libsodium/crypto_generichash/blake2/ref/blake2b-ref.c @@ -416,6 +416,13 @@ int blake2b_salt_personal( uint8_t *out, const void *in, const void *key, const int blake2b_pick_best_implementation(void) { +#if (defined(HAVE_AVX2INTRIN_H) && defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H)) || \ + (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86))) + if (sodium_runtime_has_avx2()) { + blake2b_compress = blake2b_compress_avx2; + return 0; + } +#endif #if (defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H)) || \ (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86))) if (sodium_runtime_has_sse41()) {