Import vanilla poly1305_sse2
This commit is contained in:
parent 121978e2c3
commit 6b7811471b
552 src/libsodium/crypto_onetimeauth/poly1305/sse2/poly1305_sse2.c Normal file
@@ -0,0 +1,552 @@
#include <stdint.h>
#include <x86intrin.h>

typedef __m128i xmmi;
typedef unsigned int uint128_t __attribute__((mode(TI)));

enum poly1305_state_flags_t {
    poly1305_started       = 1,
    poly1305_final_shift8  = 4,
    poly1305_final_shift16 = 8,
    poly1305_final_r2_r    = 16, /* use [r^2,r] for the final block */
    poly1305_final_r_1     = 32, /* use [r,1] for the final block */
};
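/* the shift8/shift16 flags control whether the implicit 2^128 padding bit
   (HIBIT below) is applied to one, or neither, of the last pair of 16-byte
   lanes; r2_r/r_1 select the multiplier pair used when the two lanes are
   folded together by the final poly1305_blocks() call */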

typedef struct poly1305_state_internal_t {
    union {
        uint64_t h[3];
        uint32_t hh[10];
    };                   /* 40 bytes */
    uint32_t R[5];       /* 20 bytes */
    uint32_t R2[5];      /* 20 bytes */
    uint32_t R4[5];      /* 20 bytes */
    uint64_t pad[2];     /* 16 bytes */
    uint64_t flags;      /*  8 bytes */
} poly1305_state_internal; /* 124 bytes total */
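/* h/hh hold the accumulator: two SIMD lanes of five 26-bit limbs while
   hashing, three 44-bit limbs once folded at finalization; R, R2 and R4
   hold r, r^2 and r^4 as five 26-bit limbs each, filled in by
   poly1305_init_ext() */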

typedef uint8_t poly1305_state[128];

#if defined(__AVX__)
#define FN(name) name##_avx
#else
#define FN(name) name##_sse2
#endif

size_t
FN(poly1305_block_size)(void) {
    return 32;
}

/* copy 0-31 bytes */
inline __attribute__((always_inline)) static void
poly1305_block_copy31(uint8_t *dst, const uint8_t *src, size_t bytes) {
    size_t offset = src - dst;
    if (bytes & 16) { _mm_store_si128((xmmi *)dst, _mm_loadu_si128((xmmi *)(dst + offset))); dst += 16; }
    if (bytes &  8) { *(uint64_t *)dst = *(uint64_t *)(dst + offset); dst += 8; }
    if (bytes &  4) { *(uint32_t *)dst = *(uint32_t *)(dst + offset); dst += 4; }
    if (bytes &  2) { *(uint16_t *)dst = *(uint16_t *)(dst + offset); dst += 2; }
    if (bytes &  1) { *( uint8_t *)dst = *( uint8_t *)(dst + offset); }
}
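/* the copy above tests the bits of `bytes` (at most 31) from 16 down to 1,
   so any length is handled with at most five fixed-size copies; dst must be
   16-byte aligned for the _mm_store_si128, src may be unaligned */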

__attribute__((noinline)) void
FN(poly1305_init_ext)(poly1305_state_internal *st, const unsigned char key[32], size_t bytes) {
    uint32_t *R;
    uint128_t d[3],m0;
    uint64_t r0,r1,r2;
    uint32_t rp0,rp1,rp2,rp3,rp4;
    uint64_t rt0,rt1,rt2,st2,c;
    uint64_t t0,t1;
    size_t i;

    if (!bytes) bytes = ~(size_t)0;

    /* H = 0 */
    _mm_storeu_si128((xmmi *)&st->hh[0], _mm_setzero_si128());
    _mm_storeu_si128((xmmi *)&st->hh[4], _mm_setzero_si128());
    _mm_storeu_si128((xmmi *)&st->hh[8], _mm_setzero_si128());

    /* clamp key */
    t0 = *(uint64_t *)(key + 0);
    t1 = *(uint64_t *)(key + 8);
    r0 = t0 & 0xffc0fffffff; t0 >>= 44; t0 |= t1 << 20;
    r1 = t0 & 0xfffffc0ffff; t1 >>= 24;
    r2 = t1 & 0x00ffffffc0f;

    /* r^1 */
    R = st->R;
    R[0] = (uint32_t)( r0                      ) & 0x3ffffff;
    R[1] = (uint32_t)(( r0 >> 26) | ( r1 << 18)) & 0x3ffffff;
    R[2] = (uint32_t)(( r1 >>  8)              ) & 0x3ffffff;
    R[3] = (uint32_t)(( r1 >> 34) | ( r2 << 10)) & 0x3ffffff;
    R[4] = (uint32_t)(( r2 >> 16)              );

    /* save pad */
    st->pad[0] = *(uint64_t *)(key + 16);
    st->pad[1] = *(uint64_t *)(key + 24);

    rt0 = r0;
    rt1 = r1;
    rt2 = r2;

    /* r^2, r^4 */
    for (i = 0; i < 2; i++) {
        if (i == 0) {
            R = st->R2;
            if (bytes <= 16)
                break;
        } else if (i == 1) {
            R = st->R4;
            if (bytes < 96)
                break;
        }

        st2 = rt2 * (5 << 2);
        d[0] = ((uint128_t)rt0 * rt0) + ((uint128_t)(rt1 * 2) * st2);
        d[1] = ((uint128_t)rt2 * st2) + ((uint128_t)(rt0 * 2) * rt1);
        d[2] = ((uint128_t)rt1 * rt1) + ((uint128_t)(rt2 * 2) * rt0);
        rt0 = (uint64_t)d[0] & 0xfffffffffff; c = (uint64_t)(d[0] >> 44);
        d[1] += c; rt1 = (uint64_t)d[1] & 0xfffffffffff; c = (uint64_t)(d[1] >> 44);
        d[2] += c; rt2 = (uint64_t)d[2] & 0x3ffffffffff; c = (uint64_t)(d[2] >> 42);
        rt0 += c * 5; c = (rt0 >> 44); rt0 = rt0 & 0xfffffffffff;
        rt1 += c;     c = (rt1 >> 44); rt1 = rt1 & 0xfffffffffff;
        rt2 += c;     /* even if rt2 overflows, it will still fit in rp4 safely, and is safe to multiply with */

        R[0] = (uint32_t)( rt0                     ) & 0x3ffffff;
        R[1] = (uint32_t)((rt0 >> 26) | (rt1 << 18)) & 0x3ffffff;
        R[2] = (uint32_t)((rt1 >>  8)              ) & 0x3ffffff;
        R[3] = (uint32_t)((rt1 >> 34) | (rt2 << 10)) & 0x3ffffff;
        R[4] = (uint32_t)((rt2 >> 16)              );
    }

    st->flags = 0;
}
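/* the masks 0xffc0fffffff / 0xfffffc0ffff / 0x00ffffffc0f implement the
   standard poly1305 clamp on three 44-bit limbs; the loop then squares r
   modulo 2^130-5 to get r^2 (needed whenever more than 16 bytes will be
   processed) and r^4 (only needed once at least 96 bytes remain), each
   stored as five 26-bit limbs for the vectorized block loop */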

__attribute__((noinline)) void
FN(poly1305_blocks)(poly1305_state_internal *st, const uint8_t *m, size_t bytes) {
    __attribute__((aligned(64))) xmmi HIBIT = _mm_shuffle_epi32(_mm_cvtsi32_si128(1 << 24), _MM_SHUFFLE(1,0,1,0));
    const xmmi MMASK = _mm_shuffle_epi32(_mm_cvtsi32_si128((1 << 26) - 1), _MM_SHUFFLE(1,0,1,0));
    const xmmi FIVE = _mm_shuffle_epi32(_mm_cvtsi32_si128(5), _MM_SHUFFLE(1,0,1,0));

    xmmi H0,H1,H2,H3,H4;
    xmmi T0,T1,T2,T3,T4,T5,T6,T7,T8;
    xmmi M0,M1,M2,M3,M4;
    xmmi M5,M6,M7,M8,M9;
    xmmi C1,C2;
    xmmi R20,R21,R22,R23,R24,S21,S22,S23,S24;
    xmmi R40,R41,R42,R43,R44,S41,S42,S43,S44;

    if (st->flags & poly1305_final_shift8) HIBIT = _mm_srli_si128(HIBIT, 8);
    if (st->flags & poly1305_final_shift16) HIBIT = _mm_setzero_si128();

    if (!(st->flags & poly1305_started)) {
        /* H = [Mx,My] */
        T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 0)), _mm_loadl_epi64((xmmi *)(m + 16)));
        T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 8)), _mm_loadl_epi64((xmmi *)(m + 24)));
        H0 = _mm_and_si128(MMASK, T5);
        H1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
        T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
        H2 = _mm_and_si128(MMASK, T5);
        H3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
        H4 = _mm_srli_epi64(T6, 40);
        H4 = _mm_or_si128(H4, HIBIT);
        m += 32;
        bytes -= 32;
        st->flags |= poly1305_started;
    } else {
        T0 = _mm_loadu_si128((xmmi *)&st->hh[0]);
        T1 = _mm_loadu_si128((xmmi *)&st->hh[4]);
        T2 = _mm_loadu_si128((xmmi *)&st->hh[8]);
        H0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1,1,0,0));
        H1 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3,3,2,2));
        H2 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(1,1,0,0));
        H3 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3,3,2,2));
        H4 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(1,1,0,0));
    }

    if (st->flags & (poly1305_final_r2_r|poly1305_final_r_1)) {
        if (st->flags & poly1305_final_r2_r) {
            /* use [r^2, r] */
            T2 = _mm_loadu_si128((xmmi *)&st->R[0]);
            T3 = _mm_cvtsi32_si128(st->R[4]);
            T0 = _mm_loadu_si128((xmmi *)&st->R2[0]);
            T1 = _mm_cvtsi32_si128(st->R2[4]);
            T4 = _mm_unpacklo_epi32(T0, T2);
            T5 = _mm_unpackhi_epi32(T0, T2);
            R24 = _mm_unpacklo_epi64(T1, T3);
        } else {
            /* use [r^1, 1] */
            T0 = _mm_loadu_si128((xmmi *)&st->R[0]);
            T1 = _mm_cvtsi32_si128(st->R[4]);
            T2 = _mm_cvtsi32_si128(1);
            T4 = _mm_unpacklo_epi32(T0, T2);
            T5 = _mm_unpackhi_epi32(T0, T2);
            R24 = T1;
        }

        R20 = _mm_shuffle_epi32(T4, _MM_SHUFFLE(1,1,0,0));
        R21 = _mm_shuffle_epi32(T4, _MM_SHUFFLE(3,3,2,2));
        R22 = _mm_shuffle_epi32(T5, _MM_SHUFFLE(1,1,0,0));
        R23 = _mm_shuffle_epi32(T5, _MM_SHUFFLE(3,3,2,2));
    } else {
        /* use [r^2, r^2] */
        T0 = _mm_loadu_si128((xmmi *)&st->R2[0]);
        T1 = _mm_cvtsi32_si128(st->R2[4]);
        R20 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(0,0,0,0));
        R21 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1,1,1,1));
        R22 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(2,2,2,2));
        R23 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3,3,3,3));
        R24 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(0,0,0,0));
    }
    S21 = _mm_mul_epu32(R21, FIVE);
    S22 = _mm_mul_epu32(R22, FIVE);
    S23 = _mm_mul_epu32(R23, FIVE);
    S24 = _mm_mul_epu32(R24, FIVE);

    if (bytes >= 64) {
        T0 = _mm_loadu_si128((xmmi *)&st->R4[0]);
        T1 = _mm_cvtsi32_si128(st->R4[4]);
        R40 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(0,0,0,0));
        R41 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1,1,1,1));
        R42 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(2,2,2,2));
        R43 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3,3,3,3));
        R44 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(0,0,0,0));
        S41 = _mm_mul_epu32(R41, FIVE);
        S42 = _mm_mul_epu32(R42, FIVE);
        S43 = _mm_mul_epu32(R43, FIVE);
        S44 = _mm_mul_epu32(R44, FIVE);

        while (bytes >= 64) {
            xmmi v00,v01,v02,v03,v04;
            xmmi v10,v11,v12,v13,v14;
            xmmi v20,v21,v22,v23,v24;
            xmmi v30,v31,v32,v33,v34;
            xmmi v40,v41,v42,v43,v44;
            xmmi T14,T15;

            /* H *= [r^4,r^4], preload [Mx,My] */
            T15 = S42;
            T0 = H4; T0 = _mm_mul_epu32(T0, S41);
            v01 = H3; v01 = _mm_mul_epu32(v01, T15);
            T14 = S43;
            T1 = H4; T1 = _mm_mul_epu32(T1 , T15);
            v11 = H3; v11 = _mm_mul_epu32(v11, T14);
            T2 = H4; T2 = _mm_mul_epu32(T2 , T14); T0 = _mm_add_epi64(T0, v01);
            T15 = S44;
            v02 = H2; v02 = _mm_mul_epu32(v02, T14);
            T3 = H4; T3 = _mm_mul_epu32(T3 , T15); T1 = _mm_add_epi64(T1, v11);
            v03 = H1; v03 = _mm_mul_epu32(v03, T15);
            v12 = H2; v12 = _mm_mul_epu32(v12, T15); T0 = _mm_add_epi64(T0, v02);
            T14 = R40;
            v21 = H3; v21 = _mm_mul_epu32(v21, T15);
            v31 = H3; v31 = _mm_mul_epu32(v31, T14); T0 = _mm_add_epi64(T0, v03);
            T4 = H4; T4 = _mm_mul_epu32(T4 , T14); T1 = _mm_add_epi64(T1, v12);
            v04 = H0; v04 = _mm_mul_epu32(v04, T14); T2 = _mm_add_epi64(T2, v21);
            v13 = H1; v13 = _mm_mul_epu32(v13, T14); T3 = _mm_add_epi64(T3, v31);
            T15 = R41;
            v22 = H2; v22 = _mm_mul_epu32(v22, T14);
            v32 = H2; v32 = _mm_mul_epu32(v32, T15); T0 = _mm_add_epi64(T0, v04);
            v41 = H3; v41 = _mm_mul_epu32(v41, T15); T1 = _mm_add_epi64(T1, v13);
            v14 = H0; v14 = _mm_mul_epu32(v14, T15); T2 = _mm_add_epi64(T2, v22);
            T14 = R42;
            T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 0)), _mm_loadl_epi64((xmmi *)(m + 16)));
            v23 = H1; v23 = _mm_mul_epu32(v23, T15); T3 = _mm_add_epi64(T3, v32);
            v33 = H1; v33 = _mm_mul_epu32(v33, T14); T4 = _mm_add_epi64(T4, v41);
            v42 = H2; v42 = _mm_mul_epu32(v42, T14); T1 = _mm_add_epi64(T1, v14);
            T15 = R43;
            T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 8)), _mm_loadl_epi64((xmmi *)(m + 24)));
            v24 = H0; v24 = _mm_mul_epu32(v24, T14); T2 = _mm_add_epi64(T2, v23);
            v34 = H0; v34 = _mm_mul_epu32(v34, T15); T3 = _mm_add_epi64(T3, v33);
            M0 = _mm_and_si128(MMASK, T5);
            v43 = H1; v43 = _mm_mul_epu32(v43, T15); T4 = _mm_add_epi64(T4, v42);
            M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
            v44 = H0; v44 = _mm_mul_epu32(v44, R44); T2 = _mm_add_epi64(T2, v24);
            T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
            T3 = _mm_add_epi64(T3, v34);
            M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T6, 14));
            T4 = _mm_add_epi64(T4, v43);
            M2 = _mm_and_si128(MMASK, T5);
            T4 = _mm_add_epi64(T4, v44);
            M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

            /* H += [Mx',My'] */
            T5 = _mm_loadu_si128((xmmi *)(m + 32));
            T6 = _mm_loadu_si128((xmmi *)(m + 48));
            T7 = _mm_unpacklo_epi32(T5, T6);
            T8 = _mm_unpackhi_epi32(T5, T6);
            M5 = _mm_unpacklo_epi32(T7, _mm_setzero_si128());
            M6 = _mm_unpackhi_epi32(T7, _mm_setzero_si128());
            M7 = _mm_unpacklo_epi32(T8, _mm_setzero_si128());
            M8 = _mm_unpackhi_epi32(T8, _mm_setzero_si128());
            M6 = _mm_slli_epi64(M6, 6);
            M7 = _mm_slli_epi64(M7, 12);
            M8 = _mm_slli_epi64(M8, 18);
            T0 = _mm_add_epi64(T0, M5);
            T1 = _mm_add_epi64(T1, M6);
            T2 = _mm_add_epi64(T2, M7);
            T3 = _mm_add_epi64(T3, M8);
            T4 = _mm_add_epi64(T4, HIBIT);

            /* H += [Mx,My]*[r^2,r^2] */
            T15 = S22;
            v00 = M4; v00 = _mm_mul_epu32(v00, S21);
            v01 = M3; v01 = _mm_mul_epu32(v01, T15);
            T14 = S23;
            v10 = M4; v10 = _mm_mul_epu32(v10, T15);
            v11 = M3; v11 = _mm_mul_epu32(v11, T14); T0 = _mm_add_epi64(T0, v00);
            v20 = M4; v20 = _mm_mul_epu32(v20, T14); T0 = _mm_add_epi64(T0, v01);
            T15 = S24;
            v02 = M2; v02 = _mm_mul_epu32(v02, T14); T1 = _mm_add_epi64(T1, v10);
            v30 = M4; v30 = _mm_mul_epu32(v30, T15); T1 = _mm_add_epi64(T1, v11);
            v03 = M1; v03 = _mm_mul_epu32(v03, T15); T2 = _mm_add_epi64(T2, v20);
            v12 = M2; v12 = _mm_mul_epu32(v12, T15); T0 = _mm_add_epi64(T0, v02);
            T14 = R20;
            v21 = M3; v21 = _mm_mul_epu32(v21, T15); T3 = _mm_add_epi64(T3, v30);
            v31 = M3; v31 = _mm_mul_epu32(v31, T14); T0 = _mm_add_epi64(T0, v03);
            v40 = M4; v40 = _mm_mul_epu32(v40, T14); T1 = _mm_add_epi64(T1, v12);
            v04 = M0; v04 = _mm_mul_epu32(v04, T14); T2 = _mm_add_epi64(T2, v21);
            v13 = M1; v13 = _mm_mul_epu32(v13, T14); T3 = _mm_add_epi64(T3, v31);
            T15 = R21;
            v22 = M2; v22 = _mm_mul_epu32(v22, T14); T4 = _mm_add_epi64(T4, v40);
            v32 = M2; v32 = _mm_mul_epu32(v32, T15); T0 = _mm_add_epi64(T0, v04);
            v41 = M3; v41 = _mm_mul_epu32(v41, T15); T1 = _mm_add_epi64(T1, v13);
            v14 = M0; v14 = _mm_mul_epu32(v14, T15); T2 = _mm_add_epi64(T2, v22);
            T14 = R22;
            v23 = M1; v23 = _mm_mul_epu32(v23, T15); T3 = _mm_add_epi64(T3, v32);
            v33 = M1; v33 = _mm_mul_epu32(v33, T14); T4 = _mm_add_epi64(T4, v41);
            v42 = M2; v42 = _mm_mul_epu32(v42, T14); T1 = _mm_add_epi64(T1, v14);
            T15 = R23;
            v24 = M0; v24 = _mm_mul_epu32(v24, T14); T2 = _mm_add_epi64(T2, v23);
            v34 = M0; v34 = _mm_mul_epu32(v34, T15); T3 = _mm_add_epi64(T3, v33);
            v43 = M1; v43 = _mm_mul_epu32(v43, T15); T4 = _mm_add_epi64(T4, v42);
            v44 = M0; v44 = _mm_mul_epu32(v44, R24); T2 = _mm_add_epi64(T2, v24);
            T3 = _mm_add_epi64(T3, v34);
            T4 = _mm_add_epi64(T4, v43);
            T4 = _mm_add_epi64(T4, v44);

            /* reduce */
            C1 = _mm_srli_epi64(T0, 26); C2 = _mm_srli_epi64(T3, 26); T0 = _mm_and_si128(T0, MMASK); T3 = _mm_and_si128(T3, MMASK); T1 = _mm_add_epi64(T1, C1); T4 = _mm_add_epi64(T4, C2);
            C1 = _mm_srli_epi64(T1, 26); C2 = _mm_srli_epi64(T4, 26); T1 = _mm_and_si128(T1, MMASK); T4 = _mm_and_si128(T4, MMASK); T2 = _mm_add_epi64(T2, C1); T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
            C1 = _mm_srli_epi64(T2, 26); C2 = _mm_srli_epi64(T0, 26); T2 = _mm_and_si128(T2, MMASK); T0 = _mm_and_si128(T0, MMASK); T3 = _mm_add_epi64(T3, C1); T1 = _mm_add_epi64(T1, C2);
            C1 = _mm_srli_epi64(T3, 26); T3 = _mm_and_si128(T3, MMASK); T4 = _mm_add_epi64(T4, C1);

            /* Final: H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx',My']) */
            H0 = T0;
            H1 = T1;
            H2 = T2;
            H3 = T3;
            H4 = T4;

            m += 64;
            bytes -= 64;
        }
    }
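    /* the 64-byte loop above computes H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2]
       + [Mx',My']) with a single carry pass per iteration, which is why both
       r^2 and r^4 are precomputed in poly1305_init_ext */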


    if (bytes >= 32) {
        xmmi v01,v02,v03,v04;
        xmmi v11,v12,v13,v14;
        xmmi v21,v22,v23,v24;
        xmmi v31,v32,v33,v34;
        xmmi v41,v42,v43,v44;
        xmmi T14,T15;

        /* H *= [r^2,r^2] */
        T15 = S22;
        T0 = H4; T0 = _mm_mul_epu32(T0, S21);
        v01 = H3; v01 = _mm_mul_epu32(v01, T15);
        T14 = S23;
        T1 = H4; T1 = _mm_mul_epu32(T1 , T15);
        v11 = H3; v11 = _mm_mul_epu32(v11, T14);
        T2 = H4; T2 = _mm_mul_epu32(T2 , T14); T0 = _mm_add_epi64(T0, v01);
        T15 = S24;
        v02 = H2; v02 = _mm_mul_epu32(v02, T14);
        T3 = H4; T3 = _mm_mul_epu32(T3 , T15); T1 = _mm_add_epi64(T1, v11);
        v03 = H1; v03 = _mm_mul_epu32(v03, T15);
        v12 = H2; v12 = _mm_mul_epu32(v12, T15); T0 = _mm_add_epi64(T0, v02);
        T14 = R20;
        v21 = H3; v21 = _mm_mul_epu32(v21, T15);
        v31 = H3; v31 = _mm_mul_epu32(v31, T14); T0 = _mm_add_epi64(T0, v03);
        T4 = H4; T4 = _mm_mul_epu32(T4 , T14); T1 = _mm_add_epi64(T1, v12);
        v04 = H0; v04 = _mm_mul_epu32(v04, T14); T2 = _mm_add_epi64(T2, v21);
        v13 = H1; v13 = _mm_mul_epu32(v13, T14); T3 = _mm_add_epi64(T3, v31);
        T15 = R21;
        v22 = H2; v22 = _mm_mul_epu32(v22, T14);
        v32 = H2; v32 = _mm_mul_epu32(v32, T15); T0 = _mm_add_epi64(T0, v04);
        v41 = H3; v41 = _mm_mul_epu32(v41, T15); T1 = _mm_add_epi64(T1, v13);
        v14 = H0; v14 = _mm_mul_epu32(v14, T15); T2 = _mm_add_epi64(T2, v22);
        T14 = R22;
        v23 = H1; v23 = _mm_mul_epu32(v23, T15); T3 = _mm_add_epi64(T3, v32);
        v33 = H1; v33 = _mm_mul_epu32(v33, T14); T4 = _mm_add_epi64(T4, v41);
        v42 = H2; v42 = _mm_mul_epu32(v42, T14); T1 = _mm_add_epi64(T1, v14);
        T15 = R23;
        v24 = H0; v24 = _mm_mul_epu32(v24, T14); T2 = _mm_add_epi64(T2, v23);
        v34 = H0; v34 = _mm_mul_epu32(v34, T15); T3 = _mm_add_epi64(T3, v33);
        v43 = H1; v43 = _mm_mul_epu32(v43, T15); T4 = _mm_add_epi64(T4, v42);
        v44 = H0; v44 = _mm_mul_epu32(v44, R24); T2 = _mm_add_epi64(T2, v24);
        T3 = _mm_add_epi64(T3, v34);
        T4 = _mm_add_epi64(T4, v43);
        T4 = _mm_add_epi64(T4, v44);

        /* H += [Mx,My] */
        if (m) {
            T5 = _mm_loadu_si128((xmmi *)(m + 0));
            T6 = _mm_loadu_si128((xmmi *)(m + 16));
            T7 = _mm_unpacklo_epi32(T5, T6);
            T8 = _mm_unpackhi_epi32(T5, T6);
            M0 = _mm_unpacklo_epi32(T7, _mm_setzero_si128());
            M1 = _mm_unpackhi_epi32(T7, _mm_setzero_si128());
            M2 = _mm_unpacklo_epi32(T8, _mm_setzero_si128());
            M3 = _mm_unpackhi_epi32(T8, _mm_setzero_si128());
            M1 = _mm_slli_epi64(M1, 6);
            M2 = _mm_slli_epi64(M2, 12);
            M3 = _mm_slli_epi64(M3, 18);
            T0 = _mm_add_epi64(T0, M0);
            T1 = _mm_add_epi64(T1, M1);
            T2 = _mm_add_epi64(T2, M2);
            T3 = _mm_add_epi64(T3, M3);
            T4 = _mm_add_epi64(T4, HIBIT);
        }

        /* reduce */
        C1 = _mm_srli_epi64(T0, 26); C2 = _mm_srli_epi64(T3, 26); T0 = _mm_and_si128(T0, MMASK); T3 = _mm_and_si128(T3, MMASK); T1 = _mm_add_epi64(T1, C1); T4 = _mm_add_epi64(T4, C2);
        C1 = _mm_srli_epi64(T1, 26); C2 = _mm_srli_epi64(T4, 26); T1 = _mm_and_si128(T1, MMASK); T4 = _mm_and_si128(T4, MMASK); T2 = _mm_add_epi64(T2, C1); T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
        C1 = _mm_srli_epi64(T2, 26); C2 = _mm_srli_epi64(T0, 26); T2 = _mm_and_si128(T2, MMASK); T0 = _mm_and_si128(T0, MMASK); T3 = _mm_add_epi64(T3, C1); T1 = _mm_add_epi64(T1, C2);
        C1 = _mm_srli_epi64(T3, 26); T3 = _mm_and_si128(T3, MMASK); T4 = _mm_add_epi64(T4, C1);

        /* H = (H*[r^2,r^2] + [Mx,My]) */
        H0 = T0;
        H1 = T1;
        H2 = T2;
        H3 = T3;
        H4 = T4;
    }

    if (m) {
        T0 = _mm_shuffle_epi32(H0, _MM_SHUFFLE(0,0,2,0));
        T1 = _mm_shuffle_epi32(H1, _MM_SHUFFLE(0,0,2,0));
        T2 = _mm_shuffle_epi32(H2, _MM_SHUFFLE(0,0,2,0));
        T3 = _mm_shuffle_epi32(H3, _MM_SHUFFLE(0,0,2,0));
        T4 = _mm_shuffle_epi32(H4, _MM_SHUFFLE(0,0,2,0));
        T0 = _mm_unpacklo_epi64(T0, T1);
        T1 = _mm_unpacklo_epi64(T2, T3);
        _mm_storeu_si128((xmmi *)&st->hh[0], T0);
        _mm_storeu_si128((xmmi *)&st->hh[4], T1);
        _mm_storel_epi64((xmmi *)&st->hh[8], T4);
    } else {
        uint32_t t0,t1,t2,t3,t4,b;
        uint64_t h0,h1,h2,g0,g1,g2,c,nc;

        /* H = H[0]+H[1] */
        T0 = H0;
        T1 = H1;
        T2 = H2;
        T3 = H3;
        T4 = H4;

        T0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8));
        T1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8));
        T2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8));
        T3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8));
        T4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8));

        t0 = _mm_cvtsi128_si32(T0)    ; b = (t0 >> 26); t0 &= 0x3ffffff;
        t1 = _mm_cvtsi128_si32(T1) + b; b = (t1 >> 26); t1 &= 0x3ffffff;
        t2 = _mm_cvtsi128_si32(T2) + b; b = (t2 >> 26); t2 &= 0x3ffffff;
        t3 = _mm_cvtsi128_si32(T3) + b; b = (t3 >> 26); t3 &= 0x3ffffff;
        t4 = _mm_cvtsi128_si32(T4) + b;

        /* everything except t4 is in range, so this is all safe */
        h0 = (((uint64_t)t0      ) | ((uint64_t)t1 << 26)                       ) & 0xfffffffffffull;
        h1 = (((uint64_t)t1 >> 18) | ((uint64_t)t2 <<  8) | ((uint64_t)t3 << 34)) & 0xfffffffffffull;
        h2 = (((uint64_t)t3 >> 10) | ((uint64_t)t4 << 16)                       );

        c = (h2 >> 42); h2 &= 0x3ffffffffff;
        h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
        h1 += c;     c = (h1 >> 44); h1 &= 0xfffffffffff;
        h2 += c;     c = (h2 >> 42); h2 &= 0x3ffffffffff;
        h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
        h1 += c;

        g0 = h0 + 5; c = (g0 >> 44); g0 &= 0xfffffffffff;
        g1 = h1 + c; c = (g1 >> 44); g1 &= 0xfffffffffff;
        g2 = h2 + c - ((uint64_t)1 << 42);

        c = (g2 >> 63) - 1;
        nc = ~c;
        h0 = (h0 & nc) | (g0 & c);
        h1 = (h1 & nc) | (g1 & c);
        h2 = (h2 & nc) | (g2 & c);

        st->h[0] = h0;
        st->h[1] = h1;
        st->h[2] = h2;
    }
}
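/* poly1305_blocks() doubles as the finalizer: called with m == NULL from
   poly1305_finish_ext(), it multiplies the two lanes by the pair selected
   through final_r2_r/final_r_1, folds them together, and leaves the result
   in st->h[] as three 44-bit limbs */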

__attribute__((noinline)) void
FN(poly1305_finish_ext)(poly1305_state_internal *st, const uint8_t *m, size_t leftover, unsigned char mac[16]) {
    uint64_t h0,h1,h2;
    uint64_t t0,t1,c;

    if (leftover) {
        __attribute__((aligned(16))) unsigned char final[32] = {0};
        poly1305_block_copy31(final, m, leftover);
        if (leftover != 16) final[leftover] = 1;
        st->flags |= (leftover >= 16) ? poly1305_final_shift8 : poly1305_final_shift16;
        FN(poly1305_blocks)(st, final, 32);
    }

    if (st->flags & poly1305_started) {
        /* finalize, H *= [r^2,r], or H *= [r,1] */
        if (!leftover || (leftover > 16))
            st->flags |= poly1305_final_r2_r;
        else
            st->flags |= poly1305_final_r_1;
        FN(poly1305_blocks)(st, NULL, 32);
    }

    h0 = st->h[0];
    h1 = st->h[1];
    h2 = st->h[2];

    /* pad */
    h0 = ((h0      ) | (h1 << 44));
    h1 = ((h1 >> 20) | (h2 << 24));

    __asm__ __volatile__(
        "addq %2, %0 ;\n"
        "adcq %3, %1 ;\n"
        : "+r"(h0), "+r"(h1)
        : "r"(st->pad[0]), "r"(st->pad[1])
        : "flags", "cc"
    );

    _mm_storeu_si128((xmmi *)st + 0, _mm_setzero_si128());
    _mm_storeu_si128((xmmi *)st + 1, _mm_setzero_si128());
    _mm_storeu_si128((xmmi *)st + 2, _mm_setzero_si128());
    _mm_storeu_si128((xmmi *)st + 3, _mm_setzero_si128());
    _mm_storeu_si128((xmmi *)st + 4, _mm_setzero_si128());
    _mm_storeu_si128((xmmi *)st + 5, _mm_setzero_si128());
    _mm_storeu_si128((xmmi *)st + 6, _mm_setzero_si128());
    _mm_storeu_si128((xmmi *)st + 7, _mm_setzero_si128());

    *(uint64_t *)(mac + 0) = h0;
    *(uint64_t *)(mac + 8) = h1;
}
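/* the inline asm computes the tag's final (h + pad) mod 2^128 as a 128-bit
   add-with-carry (addq/adcq); the eight zeroing stores wipe the whole
   128-byte state before the tag is written out */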


void
FN(poly1305_auth)(unsigned char out[16], const unsigned char *m, size_t inlen, const unsigned char key[32]) {
    __attribute__((aligned(64))) poly1305_state S;
    poly1305_state_internal *st = (poly1305_state_internal *)S;
    size_t blocks;
    FN(poly1305_init_ext)(st, key, inlen);
    blocks = inlen & ~31;
    if (blocks) {
        FN(poly1305_blocks)(st, m, blocks);
        m += blocks;
        inlen -= blocks;
    }
    FN(poly1305_finish_ext)(st, m, inlen, out);
}
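/* usage sketch (illustrative only): assuming __AVX__ is not defined for this
   translation unit, FN() expands to the _sse2 names, so a one-shot tag over a
   hypothetical buffer msg of length msglen would be computed as:

       unsigned char key[32];    32-byte one-time key
       unsigned char mac[16];    16-byte authenticator
       poly1305_auth_sse2(mac, msg, msglen, key);
*/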