From 7f238f4c8bf4fbe6a7b87b2a5296f4892e7d139c Mon Sep 17 00:00:00 2001 From: Frank Denis Date: Sun, 1 Nov 2015 02:09:22 +0100 Subject: [PATCH] Assume that optimized Blake2b versions can use at least SSSE3 --- .../blake2/sse/blake2b-opt.c | 85 +------------------ .../blake2/sse/blake2b-round.h | 38 +-------- 2 files changed, 3 insertions(+), 120 deletions(-) diff --git a/src/libsodium/crypto_generichash/blake2/sse/blake2b-opt.c b/src/libsodium/crypto_generichash/blake2/sse/blake2b-opt.c index b26f7b4c..62c56ab4 100644 --- a/src/libsodium/crypto_generichash/blake2/sse/blake2b-opt.c +++ b/src/libsodium/crypto_generichash/blake2/sse/blake2b-opt.c @@ -18,21 +18,6 @@ #include "blake2.h" #include "blake2-impl.h" -#include "blake2-config.h" - -#ifdef _MSC_VER -#include /* for _mm_set_epi64x */ -#endif -#include -#if defined(HAVE_SSSE3) -#include -#endif -#if defined(HAVE_SSE41) -#include -#endif - -#include "blake2b-round.h" - static const uint64_t blake2b_IV[8] = { 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, @@ -307,75 +292,7 @@ int blake2b_init_key_salt_personal( blake2b_state *S, const uint8_t outlen, cons return 0; } -static inline int blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] ) -{ - __m128i row1l, row1h; - __m128i row2l, row2h; - __m128i row3l, row3h; - __m128i row4l, row4h; - __m128i b0, b1; - __m128i t0, t1; -#if defined(HAVE_SSSE3) - const __m128i r16 = _mm_setr_epi8( 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9 ); - const __m128i r24 = _mm_setr_epi8( 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10 ); -#endif -#if defined(HAVE_SSE41) - const __m128i m0 = LOADU( block + 00 ); - const __m128i m1 = LOADU( block + 16 ); - const __m128i m2 = LOADU( block + 32 ); - const __m128i m3 = LOADU( block + 48 ); - const __m128i m4 = LOADU( block + 64 ); - const __m128i m5 = LOADU( block + 80 ); - const __m128i m6 = LOADU( block + 96 ); - const __m128i m7 = LOADU( block + 112 ); -#else - const uint64_t m0 = ( ( uint64_t * )block )[ 0]; - const uint64_t m1 = ( ( uint64_t * )block )[ 1]; - const uint64_t m2 = ( ( uint64_t * )block )[ 2]; - const uint64_t m3 = ( ( uint64_t * )block )[ 3]; - const uint64_t m4 = ( ( uint64_t * )block )[ 4]; - const uint64_t m5 = ( ( uint64_t * )block )[ 5]; - const uint64_t m6 = ( ( uint64_t * )block )[ 6]; - const uint64_t m7 = ( ( uint64_t * )block )[ 7]; - const uint64_t m8 = ( ( uint64_t * )block )[ 8]; - const uint64_t m9 = ( ( uint64_t * )block )[ 9]; - const uint64_t m10 = ( ( uint64_t * )block )[10]; - const uint64_t m11 = ( ( uint64_t * )block )[11]; - const uint64_t m12 = ( ( uint64_t * )block )[12]; - const uint64_t m13 = ( ( uint64_t * )block )[13]; - const uint64_t m14 = ( ( uint64_t * )block )[14]; - const uint64_t m15 = ( ( uint64_t * )block )[15]; -#endif - row1l = LOADU( &S->h[0] ); - row1h = LOADU( &S->h[2] ); - row2l = LOADU( &S->h[4] ); - row2h = LOADU( &S->h[6] ); - row3l = LOADU( &blake2b_IV[0] ); - row3h = LOADU( &blake2b_IV[2] ); - row4l = _mm_xor_si128( LOADU( &blake2b_IV[4] ), LOADU( &S->t[0] ) ); - row4h = _mm_xor_si128( LOADU( &blake2b_IV[6] ), LOADU( &S->f[0] ) ); - ROUND( 0 ); - ROUND( 1 ); - ROUND( 2 ); - ROUND( 3 ); - ROUND( 4 ); - ROUND( 5 ); - ROUND( 6 ); - ROUND( 7 ); - ROUND( 8 ); - ROUND( 9 ); - ROUND( 10 ); - ROUND( 11 ); - row1l = _mm_xor_si128( row3l, row1l ); - row1h = _mm_xor_si128( row3h, row1h ); - STOREU( &S->h[0], _mm_xor_si128( LOADU( &S->h[0] ), row1l ) ); - STOREU( &S->h[2], _mm_xor_si128( LOADU( &S->h[2] ), row1h ) ); - row2l = _mm_xor_si128( row4l, row2l ); - row2h = _mm_xor_si128( row4h, row2h ); - STOREU( &S->h[4], _mm_xor_si128( LOADU( &S->h[4] ), row2l ) ); - STOREU( &S->h[6], _mm_xor_si128( LOADU( &S->h[6] ), row2h ) ); - return 0; -} +int blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] ); /* inlen now in bytes */ int blake2b_update( blake2b_state *S, const uint8_t *in, uint64_t inlen ) diff --git a/src/libsodium/crypto_generichash/blake2/sse/blake2b-round.h b/src/libsodium/crypto_generichash/blake2/sse/blake2b-round.h index 9e844efa..3b424a54 100644 --- a/src/libsodium/crypto_generichash/blake2/sse/blake2b-round.h +++ b/src/libsodium/crypto_generichash/blake2/sse/blake2b-round.h @@ -10,7 +10,7 @@ You should have received a copy of the CC0 Public Domain Dedication along with this software. If not, see . */ -#pragma once + #ifndef __BLAKE2B_ROUND_H__ #define __BLAKE2B_ROUND_H__ @@ -20,21 +20,14 @@ #define TOF(reg) _mm_castsi128_ps((reg)) #define TOI(reg) _mm_castps_si128((reg)) -#define LIKELY(x) __builtin_expect((x),1) - /* Microarchitecture-specific macros */ -#ifdef HAVE_SSSE3 #define _mm_roti_epi64(x, c) \ (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \ : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \ : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \ : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \ : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c)))) -#else -/* ... */ -#endif - #define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ @@ -75,7 +68,6 @@ row2l = _mm_roti_epi64(row2l, -63); \ row2h = _mm_roti_epi64(row2h, -63); \ -#if defined(HAVE_SSSE3) #define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ t0 = _mm_alignr_epi8(row2h, row2l, 8); \ t1 = _mm_alignr_epi8(row2l, row2h, 8); \ @@ -105,33 +97,8 @@ t1 = _mm_alignr_epi8(row4h, row4l, 8); \ row4l = t1; \ row4h = t0; -#else -#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ - t0 = row4l;\ - t1 = row2l;\ - row4l = row3l;\ - row3l = row3h;\ - row3h = row4l;\ - row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); \ - row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); \ - row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); \ - row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)) - -#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ - t0 = row3l;\ - row3l = row3h;\ - row3h = t0;\ - t0 = row2l;\ - t1 = row4l;\ - row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); \ - row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); \ - row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); \ - row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)) - -#endif - -#if defined(HAVE_SSE41) +#if defined(BLAKE2_USE_SSE41) #include "blake2b-load-sse41.h" #else #include "blake2b-load-sse2.h" @@ -150,4 +117,3 @@ UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); #endif -