Turn reduce4 into a macro
That's too many registers for a function call in 32-bit mode. And with MSVC, this is the case even if the function is marked inline.
This commit is contained in:
parent d1d833a240 · commit 7a67bb9484
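The change is the usual function-to-statement-macro rewrite. Here is a minimal sketch of the pattern, not the libsodium code itself: the names xor4 and XOR4 are invented for illustration, while the real pair is reduce4/REDUCE4 in the diff below. The inline function takes many __m128i arguments that, per the commit message above, cannot all be handed over cheaply in 32-bit mode; the macro expands in place, copies its arguments into block-local variables, and writes the result straight back into the caller's accumulator variable.

#include <emmintrin.h> /* SSE2: __m128i, _mm_xor_si128 */

/* Function form: many __m128i parameters go through the calling convention;
 * the commit message notes this is too many registers in 32-bit mode, and
 * that MSVC keeps the call even when the function is marked inline. */
static inline __m128i
xor4(__m128i a, __m128i b, __m128i c, __m128i d, __m128i acc)
{
    acc = _mm_xor_si128(acc, a);
    acc = _mm_xor_si128(acc, b);
    acc = _mm_xor_si128(acc, c);
    acc = _mm_xor_si128(acc, d);
    return acc;
}

/* Macro form: expands in place, so nothing is passed through the ABI; the
 * arguments are copied into block-local variables (named differently from
 * the parameters, to avoid self-initialization) and the result is stored
 * back into the caller's accumulator variable. */
#define XOR4(a_, b_, c_, d_, accv)      \
    {                                   \
        __m128i va = (a_), vb = (b_);   \
        __m128i vc = (c_), vd = (d_);   \
        accv = _mm_xor_si128(accv, va); \
        accv = _mm_xor_si128(accv, vb); \
        accv = _mm_xor_si128(accv, vc); \
        accv = _mm_xor_si128(accv, vd); \
    }

/* Call sites change from `accv = xor4(a, b, c, d, accv);`
 * to the bare statement `XOR4(a, b, c, d, accv);`. */

The price is macro hygiene: the accumulator argument must be a plain lvalue, and every call site turns from an assignment into a statement, which is exactly what the later hunks of this diff do.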
@@ -305,97 +305,106 @@ mulv(__m128i A, __m128i B)
/* 4 multiply-accumulate at once; again
   <https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf>
   for the Aggregated Reduction Method & sample code.
 */
static inline __m128i
reduce4(__m128i H0, __m128i H1, __m128i H2, __m128i H3, __m128i X0, __m128i X1,
        __m128i X2, __m128i X3, __m128i acc)
{
/*algorithm by Krzysztof Jankowski, Pierre Laurent - Intel*/
   Algorithm by Krzysztof Jankowski, Pierre Laurent - Intel */

#define RED_DECL(a) __m128i H##a##_X##a##_lo, H##a##_X##a##_hi, tmp##a, tmp##a##B
    MAKE4(RED_DECL);
    __m128i lo, hi;
    __m128i tmp8, tmp9;
    const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);

/* byte-revert the inputs & xor the first one into the accumulator */
#define RED_SHUFFLE(a) X##a = _mm_shuffle_epi8(X##a, rev)
    MAKE4(RED_SHUFFLE);
    X3 = _mm_xor_si128(X3, acc);

/* 4 low H*X (x0*h0) */
#define RED_MUL_LOW(a) H##a##_X##a##_lo = _mm_clmulepi64_si128(H##a, X##a, 0x00)
    MAKE4(RED_MUL_LOW);
    lo = _mm_xor_si128(H0_X0_lo, H1_X1_lo);
    lo = _mm_xor_si128(lo, H2_X2_lo);
    lo = _mm_xor_si128(lo, H3_X3_lo);

/* 4 high H*X (x1*h1) */
#define RED_MUL_HIGH(a) H##a##_X##a##_hi = _mm_clmulepi64_si128(H##a, X##a, 0x11)
    MAKE4(RED_MUL_HIGH);
    hi = _mm_xor_si128(H0_X0_hi, H1_X1_hi);
    hi = _mm_xor_si128(hi, H2_X2_hi);
    hi = _mm_xor_si128(hi, H3_X3_hi);

/* 4 middle H*X, using Karatsuba, i.e.
   x1*h0+x0*h1 =(x1+x0)*(h1+h0)-x1*h1-x0*h0
   we already have all x1y1 & x0y0 (accumulated in hi & lo)
   (0 is low half and 1 is high half)
 */
/* permute the high and low 64 bits in H1 & X1,
   so create (h0,h1) from (h1,h0) and (x0,x1) from (x1,x0),
   then compute (h0+h1,h1+h0) and (x0+x1,x1+x0),
   and finally multiply
 */
#define RED_MUL_MID(a) \
    tmp##a = _mm_shuffle_epi32(H##a, 0x4e); \
    tmp##a##B = _mm_shuffle_epi32(X##a, 0x4e); \
    tmp##a = _mm_xor_si128(tmp##a, H##a); \
    tmp##a##B = _mm_xor_si128(tmp##a##B, X##a); \
    tmp##a = _mm_clmulepi64_si128(tmp##a, tmp##a##B, 0x00)
    MAKE4(RED_MUL_MID);

/* substracts x1*h1 and x0*h0 */
    tmp0 = _mm_xor_si128(tmp0, lo);
    tmp0 = _mm_xor_si128(tmp0, hi);
    tmp0 = _mm_xor_si128(tmp1, tmp0);
    tmp0 = _mm_xor_si128(tmp2, tmp0);
    tmp0 = _mm_xor_si128(tmp3, tmp0);

/* reduction */
    tmp0B = _mm_slli_si128(tmp0, 8);
    tmp0 = _mm_srli_si128(tmp0, 8);
    lo = _mm_xor_si128(tmp0B, lo);
    hi = _mm_xor_si128(tmp0, hi);
    tmp3 = lo;
    tmp2B = hi;
    tmp3B = _mm_srli_epi32(tmp3, 31);
    tmp8 = _mm_srli_epi32(tmp2B, 31);
    tmp3 = _mm_slli_epi32(tmp3, 1);
    tmp2B = _mm_slli_epi32(tmp2B, 1);
    tmp9 = _mm_srli_si128(tmp3B, 12);
    tmp8 = _mm_slli_si128(tmp8, 4);
    tmp3B = _mm_slli_si128(tmp3B, 4);
    tmp3 = _mm_or_si128(tmp3, tmp3B);
    tmp2B = _mm_or_si128(tmp2B, tmp8);
    tmp2B = _mm_or_si128(tmp2B, tmp9);
    tmp3B = _mm_slli_epi32(tmp3, 31);
    tmp8 = _mm_slli_epi32(tmp3, 30);
    tmp9 = _mm_slli_epi32(tmp3, 25);
    tmp3B = _mm_xor_si128(tmp3B, tmp8);
    tmp3B = _mm_xor_si128(tmp3B, tmp9);
    tmp8 = _mm_srli_si128(tmp3B, 4);
    tmp3B = _mm_slli_si128(tmp3B, 12);
    tmp3 = _mm_xor_si128(tmp3, tmp3B);
    tmp2 = _mm_srli_epi32(tmp3, 1);
    tmp0B = _mm_srli_epi32(tmp3, 2);
    tmp1B = _mm_srli_epi32(tmp3, 7);
    tmp2 = _mm_xor_si128(tmp2, tmp0B);
    tmp2 = _mm_xor_si128(tmp2, tmp1B);
    tmp2 = _mm_xor_si128(tmp2, tmp8);
    tmp3 = _mm_xor_si128(tmp3, tmp2);
    tmp2B = _mm_xor_si128(tmp2B, tmp3);

    return tmp2B;
#define REDUCE4(rev, H0_, H1_, H2_, H3_, X0_, X1_, X2_, X3_, acc) \
{ \
    MAKE4(RED_DECL); \
    __m128i lo, hi; \
    __m128i tmp8, tmp9; \
    __m128i H0 = H0_; \
    __m128i H1 = H1_; \
    __m128i H2 = H2_; \
    __m128i H3 = H3_; \
    __m128i X0 = X0_; \
    __m128i X1 = X1_; \
    __m128i X2 = X2_; \
    __m128i X3 = X3_; \
\
/* byte-revert the inputs & xor the first one into the accumulator */ \
\
    MAKE4(RED_SHUFFLE); \
    X3 = _mm_xor_si128(X3, acc); \
\
/* 4 low H*X (x0*h0) */ \
\
    MAKE4(RED_MUL_LOW); \
    lo = _mm_xor_si128(H0_X0_lo, H1_X1_lo); \
    lo = _mm_xor_si128(lo, H2_X2_lo); \
    lo = _mm_xor_si128(lo, H3_X3_lo); \
\
/* 4 high H*X (x1*h1) */ \
\
    MAKE4(RED_MUL_HIGH); \
    hi = _mm_xor_si128(H0_X0_hi, H1_X1_hi); \
    hi = _mm_xor_si128(hi, H2_X2_hi); \
    hi = _mm_xor_si128(hi, H3_X3_hi); \
\
/* 4 middle H*X, using Karatsuba, i.e. \
   x1*h0+x0*h1 =(x1+x0)*(h1+h0)-x1*h1-x0*h0 \
   we already have all x1y1 & x0y0 (accumulated in hi & lo) \
   (0 is low half and 1 is high half) \
 */ \
/* permute the high and low 64 bits in H1 & X1, \
   so create (h0,h1) from (h1,h0) and (x0,x1) from (x1,x0), \
   then compute (h0+h1,h1+h0) and (x0+x1,x1+x0), \
   and finally multiply \
 */ \
    MAKE4(RED_MUL_MID); \
\
/* substracts x1*h1 and x0*h0 */ \
    tmp0 = _mm_xor_si128(tmp0, lo); \
    tmp0 = _mm_xor_si128(tmp0, hi); \
    tmp0 = _mm_xor_si128(tmp1, tmp0); \
    tmp0 = _mm_xor_si128(tmp2, tmp0); \
    tmp0 = _mm_xor_si128(tmp3, tmp0); \
\
/* reduction */ \
    tmp0B = _mm_slli_si128(tmp0, 8); \
    tmp0 = _mm_srli_si128(tmp0, 8); \
    lo = _mm_xor_si128(tmp0B, lo); \
    hi = _mm_xor_si128(tmp0, hi); \
    tmp3 = lo; \
    tmp2B = hi; \
    tmp3B = _mm_srli_epi32(tmp3, 31); \
    tmp8 = _mm_srli_epi32(tmp2B, 31); \
    tmp3 = _mm_slli_epi32(tmp3, 1); \
    tmp2B = _mm_slli_epi32(tmp2B, 1); \
    tmp9 = _mm_srli_si128(tmp3B, 12); \
    tmp8 = _mm_slli_si128(tmp8, 4); \
    tmp3B = _mm_slli_si128(tmp3B, 4); \
    tmp3 = _mm_or_si128(tmp3, tmp3B); \
    tmp2B = _mm_or_si128(tmp2B, tmp8); \
    tmp2B = _mm_or_si128(tmp2B, tmp9); \
    tmp3B = _mm_slli_epi32(tmp3, 31); \
    tmp8 = _mm_slli_epi32(tmp3, 30); \
    tmp9 = _mm_slli_epi32(tmp3, 25); \
    tmp3B = _mm_xor_si128(tmp3B, tmp8); \
    tmp3B = _mm_xor_si128(tmp3B, tmp9); \
    tmp8 = _mm_srli_si128(tmp3B, 4); \
    tmp3B = _mm_slli_si128(tmp3B, 12); \
    tmp3 = _mm_xor_si128(tmp3, tmp3B); \
    tmp2 = _mm_srli_epi32(tmp3, 1); \
    tmp0B = _mm_srli_epi32(tmp3, 2); \
    tmp1B = _mm_srli_epi32(tmp3, 7); \
    tmp2 = _mm_xor_si128(tmp2, tmp0B); \
    tmp2 = _mm_xor_si128(tmp2, tmp1B); \
    tmp2 = _mm_xor_si128(tmp2, tmp8); \
    tmp3 = _mm_xor_si128(tmp3, tmp2); \
    tmp2B = _mm_xor_si128(tmp2B, tmp3); \
\
    accv = tmp2B; \
}

#define XORx(a) \
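Two tricks in the hunk above are worth spelling out. The Karatsuba comment relies on arithmetic in GF(2)[x], where addition and subtraction are both XOR, so the middle term x1*h0+x0*h1 falls out of one extra carry-less multiply once the high and low products already sit in hi and lo; and the long shift/XOR tail is the modular reduction of the doubled-width product back into the field. Below is a small, self-contained sketch of both ideas; it is purely illustrative and not libsodium code: the helpers clmul8 and reduce8 are hand-written, the sample values are arbitrary, and the 8-bit AES polynomial stands in for GCM's much larger, bit-reflected GF(2^128) polynomial that REDUCE4 actually handles.

#include <stdint.h>
#include <stdio.h>

/* Naive carry-less (GF(2)[x]) multiply of two 8-bit polynomials,
 * standing in for PCLMULQDQ on 64-bit halves. */
static uint16_t
clmul8(uint8_t a, uint8_t b)
{
    uint16_t r = 0;
    int      i;

    for (i = 0; i < 8; i++) {
        if (b & (1u << i)) {
            r ^= (uint16_t) a << i;
        }
    }
    return r;
}

/* Reduce a 16-bit carry-less product modulo x^8 + x^4 + x^3 + x + 1
 * (the AES polynomial, chosen only because it is small), one bit at a
 * time. REDUCE4 does the analogous job for GCM's x^128 + x^7 + x^2 + x + 1,
 * branch-free and on a bit-reflected representation, hence its long
 * sequence of shifts and XORs. */
static uint8_t
reduce8(uint16_t p)
{
    int i;

    for (i = 15; i >= 8; i--) {
        if (p & (1u << i)) {
            p ^= (uint16_t) (0x11b << (i - 8));
        }
    }
    return (uint8_t) p;
}

int
main(void)
{
    uint8_t x0 = 0x57, x1 = 0xd3, h0 = 0x8e, h1 = 0x29; /* arbitrary */

    /* Karatsuba over GF(2): subtraction is XOR, so
     * x1*h0 + x0*h1 = (x1+x0)*(h1+h0) + x1*h1 + x0*h0. */
    uint16_t lo  = clmul8(x0, h0);
    uint16_t hi  = clmul8(x1, h1);
    uint16_t mid = clmul8((uint8_t) (x0 ^ x1), (uint8_t) (h0 ^ h1));
    uint16_t ref = clmul8(x1, h0) ^ clmul8(x0, h1);

    printf("karatsuba: %s\n", (uint16_t) (mid ^ lo ^ hi) == ref ? "ok" : "FAIL");

    /* Full multiply-then-reduce, the tiny analogue of one GHASH step. */
    printf("x0*h0 mod poly: 0x%02x\n", reduce8(clmul8(x0, h0)));
    return 0;
}

The XORs of lo and hi into tmp0 right after MAKE4(RED_MUL_MID) are exactly these XOR-subtractions, matching the "substracts x1*h1 and x0*h0" comment.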
@@ -413,6 +422,7 @@ aesni_encrypt8full(unsigned char *out, uint32_t *n, const __m128i *rkeys,
                   const __m128i h4v)
{
    const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    __m128i accv = _mm_loadu_si128((const __m128i *) accum);
    int i;

@@ -427,8 +437,8 @@ aesni_encrypt8full(unsigned char *out, uint32_t *n, const __m128i *rkeys,
    MAKE8(AESENCLASTx);
    MAKE8(XORx);
    MAKE8(STOREx);
    accv = reduce4(hv, h2v, h3v, h4v, temp3, temp2, temp1, temp0, accv);
    accv = reduce4(hv, h2v, h3v, h4v, temp7, temp6, temp5, temp4, accv);
    REDUCE4(rev, hv, h2v, h3v, h4v, temp3, temp2, temp1, temp0, accv);
    REDUCE4(rev, hv, h2v, h3v, h4v, temp7, temp6, temp5, temp4, accv);
    _mm_storeu_si128((__m128i *) accum, accv);
}

@@ -438,10 +448,12 @@ aesni_addmul8full(const unsigned char *in, unsigned char *accum,
                  const __m128i hv, const __m128i h2v,
                  const __m128i h3v, const __m128i h4v)
{
    const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    __m128i accv = _mm_loadu_si128((const __m128i *) accum);

    MAKE8(LOADx);
    accv = reduce4(hv, h2v, h3v, h4v, in3, in2, in1, in0, accv);
    accv = reduce4(hv, h2v, h3v, h4v, in7, in6, in5, in4, accv);
    REDUCE4(rev, hv, h2v, h3v, h4v, in3, in2, in1, in0, accv);
    REDUCE4(rev, hv, h2v, h3v, h4v, in7, in6, in5, in4, accv);
    _mm_storeu_si128((__m128i *) accum, accv);
}

@@ -523,13 +535,13 @@ crypto_aead_aes256gcm_aesni_encrypt_afternm(unsigned char *c, unsigned long long
    H4v = mulv(H3v, Hv);

    accv = _mm_setzero_si128();
    /* unrolled by 4 GCM (by 8 doesn't improve using reduce4) */
    /* unrolled by 4 GCM (by 8 doesn't improve using REDUCE4) */
    for (i = 0; i < adlen_rnd64; i += 64) {
        __m128i X4 = _mm_loadu_si128((const __m128i *) (ad + i + 0));
        __m128i X3 = _mm_loadu_si128((const __m128i *) (ad + i + 16));
        __m128i X2 = _mm_loadu_si128((const __m128i *) (ad + i + 32));
        __m128i X1 = _mm_loadu_si128((const __m128i *) (ad + i + 48));
        accv = reduce4(Hv, H2v, H3v, H4v, X1, X2, X3, X4, accv);
        REDUCE4(rev, Hv, H2v, H3v, H4v, X1, X2, X3, X4, accv);
    }
    _mm_storeu_si128((__m128i *) accum, accv);

@@ -648,7 +660,7 @@ crypto_aead_aes256gcm_aesni_decrypt_afternm(unsigned char *m, unsigned long long
        __m128i X3 = _mm_loadu_si128((const __m128i *) (ad + i + 16));
        __m128i X2 = _mm_loadu_si128((const __m128i *) (ad + i + 32));
        __m128i X1 = _mm_loadu_si128((const __m128i *) (ad + i + 48));
        accv = reduce4(Hv, H2v, H3v, H4v, X1, X2, X3, X4, accv);
        REDUCE4(rev, Hv, H2v, H3v, H4v, X1, X2, X3, X4, accv);
    }
    _mm_storeu_si128((__m128i *) accum, accv);