Turn reduce4 into a macro

That's too many registers for a function call in 32-bit mode.
And with MSVC, this is the case even if the function is marked inline.
Frank Denis 2015-10-10 23:33:34 +02:00
parent d1d833a240
commit 7a67bb9484

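The trade-off described in the commit message can be illustrated with a stripped-down sketch (hypothetical names, placeholder arithmetic; not the real GHASH reduction): a static inline function still has to move nine __m128i values through the calling convention whenever the compiler declines to inline it, which is what a 32-bit build cannot accommodate, whereas a statement-like macro always expands in place.

#include <emmintrin.h>

/* Function form: nine by-value __m128i parameters per call; if the call is
 * not inlined, the 32-bit calling convention has to spill them to the stack
 * at every call site (the issue the commit message describes).
 * Placeholder body only. */
static inline __m128i
reduce4_as_function(__m128i H0, __m128i H1, __m128i H2, __m128i H3,
                    __m128i X0, __m128i X1, __m128i X2, __m128i X3,
                    __m128i acc)
{
    __m128i t = _mm_xor_si128(_mm_xor_si128(H0, X0), _mm_xor_si128(H1, X1));
    t = _mm_xor_si128(t, _mm_xor_si128(_mm_xor_si128(H2, X2),
                                       _mm_xor_si128(H3, X3)));
    return _mm_xor_si128(t, acc);
}

/* Macro form: no call and no parameter passing; the body is pasted into the
 * caller, so the inputs can stay in XMM registers across the unrolled loop. */
#define REDUCE4_AS_MACRO(H0, H1, H2, H3, X0, X1, X2, X3, acc, out)        \
    do {                                                                  \
        __m128i t_ = _mm_xor_si128(_mm_xor_si128((H0), (X0)),             \
                                   _mm_xor_si128((H1), (X1)));            \
        t_ = _mm_xor_si128(t_, _mm_xor_si128(_mm_xor_si128((H2), (X2)),   \
                                             _mm_xor_si128((H3), (X3)))); \
        (out) = _mm_xor_si128(t_, (acc));                                 \
    } while (0)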

@@ -305,97 +305,106 @@ mulv(__m128i A, __m128i B)
 /* 4 multiply-accumulate at once; again
    <https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf>
    for the Aggregated Reduction Method & sample code.
-   */
-static inline __m128i
-reduce4(__m128i H0, __m128i H1, __m128i H2, __m128i H3, __m128i X0, __m128i X1,
-        __m128i X2, __m128i X3, __m128i acc)
-{
-/*algorithm by Krzysztof Jankowski, Pierre Laurent - Intel*/
+   Algorithm by Krzysztof Jankowski, Pierre Laurent - Intel */
 #define RED_DECL(a) __m128i H##a##_X##a##_lo, H##a##_X##a##_hi, tmp##a, tmp##a##B
-    MAKE4(RED_DECL);
-    __m128i lo, hi;
-    __m128i tmp8, tmp9;
-    const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-/* byte-revert the inputs & xor the first one into the accumulator */
 #define RED_SHUFFLE(a) X##a = _mm_shuffle_epi8(X##a, rev)
-    MAKE4(RED_SHUFFLE);
-    X3 = _mm_xor_si128(X3, acc);
-/* 4 low H*X (x0*h0) */
 #define RED_MUL_LOW(a) H##a##_X##a##_lo = _mm_clmulepi64_si128(H##a, X##a, 0x00)
-    MAKE4(RED_MUL_LOW);
-    lo = _mm_xor_si128(H0_X0_lo, H1_X1_lo);
-    lo = _mm_xor_si128(lo, H2_X2_lo);
-    lo = _mm_xor_si128(lo, H3_X3_lo);
-/* 4 high H*X (x1*h1) */
 #define RED_MUL_HIGH(a) H##a##_X##a##_hi = _mm_clmulepi64_si128(H##a, X##a, 0x11)
-    MAKE4(RED_MUL_HIGH);
-    hi = _mm_xor_si128(H0_X0_hi, H1_X1_hi);
-    hi = _mm_xor_si128(hi, H2_X2_hi);
-    hi = _mm_xor_si128(hi, H3_X3_hi);
-/* 4 middle H*X, using Karatsuba, i.e.
-     x1*h0+x0*h1 =(x1+x0)*(h1+h0)-x1*h1-x0*h0
-     we already have all x1y1 & x0y0 (accumulated in hi & lo)
-     (0 is low half and 1 is high half)
-   */
-/* permute the high and low 64 bits in H1 & X1,
-     so create (h0,h1) from (h1,h0) and (x0,x1) from (x1,x0),
-     then compute (h0+h1,h1+h0) and (x0+x1,x1+x0),
-     and finally multiply
-   */
 #define RED_MUL_MID(a) \
     tmp##a = _mm_shuffle_epi32(H##a, 0x4e); \
     tmp##a##B = _mm_shuffle_epi32(X##a, 0x4e); \
     tmp##a = _mm_xor_si128(tmp##a, H##a); \
     tmp##a##B = _mm_xor_si128(tmp##a##B, X##a); \
     tmp##a = _mm_clmulepi64_si128(tmp##a, tmp##a##B, 0x00)
-    MAKE4(RED_MUL_MID);
-/* substracts x1*h1 and x0*h0 */
-    tmp0 = _mm_xor_si128(tmp0, lo);
-    tmp0 = _mm_xor_si128(tmp0, hi);
-    tmp0 = _mm_xor_si128(tmp1, tmp0);
-    tmp0 = _mm_xor_si128(tmp2, tmp0);
-    tmp0 = _mm_xor_si128(tmp3, tmp0);
-
-    /* reduction */
-    tmp0B = _mm_slli_si128(tmp0, 8);
-    tmp0 = _mm_srli_si128(tmp0, 8);
-    lo = _mm_xor_si128(tmp0B, lo);
-    hi = _mm_xor_si128(tmp0, hi);
-    tmp3 = lo;
-    tmp2B = hi;
-    tmp3B = _mm_srli_epi32(tmp3, 31);
-    tmp8 = _mm_srli_epi32(tmp2B, 31);
-    tmp3 = _mm_slli_epi32(tmp3, 1);
-    tmp2B = _mm_slli_epi32(tmp2B, 1);
-    tmp9 = _mm_srli_si128(tmp3B, 12);
-    tmp8 = _mm_slli_si128(tmp8, 4);
-    tmp3B = _mm_slli_si128(tmp3B, 4);
-    tmp3 = _mm_or_si128(tmp3, tmp3B);
-    tmp2B = _mm_or_si128(tmp2B, tmp8);
-    tmp2B = _mm_or_si128(tmp2B, tmp9);
-    tmp3B = _mm_slli_epi32(tmp3, 31);
-    tmp8 = _mm_slli_epi32(tmp3, 30);
-    tmp9 = _mm_slli_epi32(tmp3, 25);
-    tmp3B = _mm_xor_si128(tmp3B, tmp8);
-    tmp3B = _mm_xor_si128(tmp3B, tmp9);
-    tmp8 = _mm_srli_si128(tmp3B, 4);
-    tmp3B = _mm_slli_si128(tmp3B, 12);
-    tmp3 = _mm_xor_si128(tmp3, tmp3B);
-    tmp2 = _mm_srli_epi32(tmp3, 1);
-    tmp0B = _mm_srli_epi32(tmp3, 2);
-    tmp1B = _mm_srli_epi32(tmp3, 7);
-    tmp2 = _mm_xor_si128(tmp2, tmp0B);
-    tmp2 = _mm_xor_si128(tmp2, tmp1B);
-    tmp2 = _mm_xor_si128(tmp2, tmp8);
-    tmp3 = _mm_xor_si128(tmp3, tmp2);
-    tmp2B = _mm_xor_si128(tmp2B, tmp3);
-
-    return tmp2B;
+#define REDUCE4(rev, H0_, H1_, H2_, H3_, X0_, X1_, X2_, X3_, acc) \
+{ \
+    MAKE4(RED_DECL); \
+    __m128i lo, hi; \
+    __m128i tmp8, tmp9; \
+    __m128i H0 = H0_; \
+    __m128i H1 = H1_; \
+    __m128i H2 = H2_; \
+    __m128i H3 = H3_; \
+    __m128i X0 = X0_; \
+    __m128i X1 = X1_; \
+    __m128i X2 = X2_; \
+    __m128i X3 = X3_; \
+\
+/* byte-revert the inputs & xor the first one into the accumulator */ \
+\
+    MAKE4(RED_SHUFFLE); \
+    X3 = _mm_xor_si128(X3, acc); \
+\
+/* 4 low H*X (x0*h0) */ \
+\
+    MAKE4(RED_MUL_LOW); \
+    lo = _mm_xor_si128(H0_X0_lo, H1_X1_lo); \
+    lo = _mm_xor_si128(lo, H2_X2_lo); \
+    lo = _mm_xor_si128(lo, H3_X3_lo); \
+\
+/* 4 high H*X (x1*h1) */ \
+\
+    MAKE4(RED_MUL_HIGH); \
+    hi = _mm_xor_si128(H0_X0_hi, H1_X1_hi); \
+    hi = _mm_xor_si128(hi, H2_X2_hi); \
+    hi = _mm_xor_si128(hi, H3_X3_hi); \
+\
+/* 4 middle H*X, using Karatsuba, i.e. \
+     x1*h0+x0*h1 =(x1+x0)*(h1+h0)-x1*h1-x0*h0 \
+     we already have all x1y1 & x0y0 (accumulated in hi & lo) \
+     (0 is low half and 1 is high half) \
+   */ \
+/* permute the high and low 64 bits in H1 & X1, \
+     so create (h0,h1) from (h1,h0) and (x0,x1) from (x1,x0), \
+     then compute (h0+h1,h1+h0) and (x0+x1,x1+x0), \
+     and finally multiply \
+   */ \
+    MAKE4(RED_MUL_MID); \
+\
+/* substracts x1*h1 and x0*h0 */ \
+    tmp0 = _mm_xor_si128(tmp0, lo); \
+    tmp0 = _mm_xor_si128(tmp0, hi); \
+    tmp0 = _mm_xor_si128(tmp1, tmp0); \
+    tmp0 = _mm_xor_si128(tmp2, tmp0); \
+    tmp0 = _mm_xor_si128(tmp3, tmp0); \
+\
+    /* reduction */ \
+    tmp0B = _mm_slli_si128(tmp0, 8); \
+    tmp0 = _mm_srli_si128(tmp0, 8); \
+    lo = _mm_xor_si128(tmp0B, lo); \
+    hi = _mm_xor_si128(tmp0, hi); \
+    tmp3 = lo; \
+    tmp2B = hi; \
+    tmp3B = _mm_srli_epi32(tmp3, 31); \
+    tmp8 = _mm_srli_epi32(tmp2B, 31); \
+    tmp3 = _mm_slli_epi32(tmp3, 1); \
+    tmp2B = _mm_slli_epi32(tmp2B, 1); \
+    tmp9 = _mm_srli_si128(tmp3B, 12); \
+    tmp8 = _mm_slli_si128(tmp8, 4); \
+    tmp3B = _mm_slli_si128(tmp3B, 4); \
+    tmp3 = _mm_or_si128(tmp3, tmp3B); \
+    tmp2B = _mm_or_si128(tmp2B, tmp8); \
+    tmp2B = _mm_or_si128(tmp2B, tmp9); \
+    tmp3B = _mm_slli_epi32(tmp3, 31); \
+    tmp8 = _mm_slli_epi32(tmp3, 30); \
+    tmp9 = _mm_slli_epi32(tmp3, 25); \
+    tmp3B = _mm_xor_si128(tmp3B, tmp8); \
+    tmp3B = _mm_xor_si128(tmp3B, tmp9); \
+    tmp8 = _mm_srli_si128(tmp3B, 4); \
+    tmp3B = _mm_slli_si128(tmp3B, 12); \
+    tmp3 = _mm_xor_si128(tmp3, tmp3B); \
+    tmp2 = _mm_srli_epi32(tmp3, 1); \
+    tmp0B = _mm_srli_epi32(tmp3, 2); \
+    tmp1B = _mm_srli_epi32(tmp3, 7); \
+    tmp2 = _mm_xor_si128(tmp2, tmp0B); \
+    tmp2 = _mm_xor_si128(tmp2, tmp1B); \
+    tmp2 = _mm_xor_si128(tmp2, tmp8); \
+    tmp3 = _mm_xor_si128(tmp3, tmp2); \
+    tmp2B = _mm_xor_si128(tmp2B, tmp3); \
+\
+    accv = tmp2B; \
 }
 #define XORx(a) \
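For reference, the RED_MUL_MID step kept above is the Karatsuba trick quoted in the comment: over GF(2)[x] both addition and subtraction are XOR, so the middle term x1*h0+x0*h1 costs only one extra carry-less multiply of the XOR-ed halves. A minimal standalone sketch of one such middle term (hypothetical helper name, assuming a PCLMUL-capable target; not part of the patch):

#include <emmintrin.h>   /* _mm_xor_si128, _mm_shuffle_epi32 */
#include <wmmintrin.h>   /* _mm_clmulepi64_si128 (PCLMUL) */

/* One Karatsuba middle term: x1*h0 + x0*h1 = (x1+x0)*(h1+h0) + x1*h1 + x0*h0,
 * with "+" meaning XOR. `lo` and `hi` are the already computed x0*h0 and
 * x1*h1 products, as accumulated by RED_MUL_LOW/RED_MUL_HIGH in the hunk above. */
static inline __m128i
karatsuba_mid_sketch(__m128i H, __m128i X, __m128i lo, __m128i hi)
{
    __m128i hs  = _mm_xor_si128(_mm_shuffle_epi32(H, 0x4e), H); /* h0^h1 in both halves */
    __m128i xs  = _mm_xor_si128(_mm_shuffle_epi32(X, 0x4e), X); /* x0^x1 in both halves */
    __m128i mid = _mm_clmulepi64_si128(hs, xs, 0x00);           /* (h0^h1)*(x0^x1) */

    /* cancel the x1*h1 and x0*h0 contributions (XOR doubles as subtraction) */
    return _mm_xor_si128(mid, _mm_xor_si128(lo, hi));
}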
@@ -413,6 +422,7 @@ aesni_encrypt8full(unsigned char *out, uint32_t *n, const __m128i *rkeys,
                    const __m128i h4v)
 {
     const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+    const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
     __m128i accv = _mm_loadu_si128((const __m128i *) accum);
     int i;
@@ -427,8 +437,8 @@ aesni_encrypt8full(unsigned char *out, uint32_t *n, const __m128i *rkeys,
     MAKE8(AESENCLASTx);
     MAKE8(XORx);
     MAKE8(STOREx);
-    accv = reduce4(hv, h2v, h3v, h4v, temp3, temp2, temp1, temp0, accv);
-    accv = reduce4(hv, h2v, h3v, h4v, temp7, temp6, temp5, temp4, accv);
+    REDUCE4(rev, hv, h2v, h3v, h4v, temp3, temp2, temp1, temp0, accv);
+    REDUCE4(rev, hv, h2v, h3v, h4v, temp7, temp6, temp5, temp4, accv);
     _mm_storeu_si128((__m128i *) accum, accv);
 }
@@ -438,10 +448,12 @@ aesni_addmul8full(const unsigned char *in, unsigned char *accum,
                   const __m128i hv, const __m128i h2v,
                   const __m128i h3v, const __m128i h4v)
 {
+    const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
     __m128i accv = _mm_loadu_si128((const __m128i *) accum);
     MAKE8(LOADx);
-    accv = reduce4(hv, h2v, h3v, h4v, in3, in2, in1, in0, accv);
-    accv = reduce4(hv, h2v, h3v, h4v, in7, in6, in5, in4, accv);
+    REDUCE4(rev, hv, h2v, h3v, h4v, in3, in2, in1, in0, accv);
+    REDUCE4(rev, hv, h2v, h3v, h4v, in7, in6, in5, in4, accv);
     _mm_storeu_si128((__m128i *) accum, accv);
 }
@@ -523,13 +535,13 @@ crypto_aead_aes256gcm_aesni_encrypt_afternm(unsigned char *c, unsigned long long
     H4v = mulv(H3v, Hv);
     accv = _mm_setzero_si128();
-    /* unrolled by 4 GCM (by 8 doesn't improve using reduce4) */
+    /* unrolled by 4 GCM (by 8 doesn't improve using REDUCE4) */
     for (i = 0; i < adlen_rnd64; i += 64) {
         __m128i X4 = _mm_loadu_si128((const __m128i *) (ad + i + 0));
         __m128i X3 = _mm_loadu_si128((const __m128i *) (ad + i + 16));
         __m128i X2 = _mm_loadu_si128((const __m128i *) (ad + i + 32));
         __m128i X1 = _mm_loadu_si128((const __m128i *) (ad + i + 48));
-        accv = reduce4(Hv, H2v, H3v, H4v, X1, X2, X3, X4, accv);
+        REDUCE4(rev, Hv, H2v, H3v, H4v, X1, X2, X3, X4, accv);
     }
     _mm_storeu_si128((__m128i *) accum, accv);
@@ -648,7 +660,7 @@ crypto_aead_aes256gcm_aesni_decrypt_afternm(unsigned char *m, unsigned long long
         __m128i X3 = _mm_loadu_si128((const __m128i *) (ad + i + 16));
         __m128i X2 = _mm_loadu_si128((const __m128i *) (ad + i + 32));
         __m128i X1 = _mm_loadu_si128((const __m128i *) (ad + i + 48));
-        accv = reduce4(Hv, H2v, H3v, H4v, X1, X2, X3, X4, accv);
+        REDUCE4(rev, Hv, H2v, H3v, H4v, X1, X2, X3, X4, accv);
     }
     _mm_storeu_si128((__m128i *) accum, accv);