More explicit casts: route pointer conversions through an intermediate `(void *)` cast. Unaligned accesses are fine on these architectures.
This commit is contained in:
parent 27048b06aa
commit cf3064b08f
@@ -62,8 +62,8 @@ poly1305_block_copy31(unsigned char *dst, const unsigned char *src, unsigned lon
|
||||
{
|
||||
unsigned long long offset = src - dst;
|
||||
|
||||
if (bytes & 16) { _mm_store_si128((xmmi *)dst, _mm_loadu_si128((xmmi *)(dst + offset))); dst += 16; }
|
||||
if (bytes & 8) { *(uint64_t *)dst = *(uint64_t *)(dst + offset); dst += 8; }
|
||||
if (bytes & 16) { _mm_store_si128((xmmi *)(void *)dst, _mm_loadu_si128((xmmi *)(void *)(dst + offset))); dst += 16; }
|
||||
if (bytes & 8) { *(uint64_t *)(void *)dst = *(uint64_t *)(void *)(dst + offset); dst += 8; }
|
||||
if (bytes & 4) { *(uint32_t *)dst = *(uint32_t *)(dst + offset); dst += 4; }
|
||||
if (bytes & 2) { *(uint16_t *)dst = *(uint16_t *)(dst + offset); dst += 2; }
|
||||
if (bytes & 1) { *( unsigned char *)dst = *( unsigned char *)(dst + offset); }
|
||||
@@ -84,13 +84,13 @@ poly1305_init_ext(poly1305_state_internal_t *st,
|
||||
if (!bytes) bytes = ~(unsigned long long)0;
|
||||
|
||||
/* H = 0 */
|
||||
_mm_storeu_si128((xmmi *)&st->hh[0], _mm_setzero_si128());
|
||||
_mm_storeu_si128((xmmi *)&st->hh[4], _mm_setzero_si128());
|
||||
_mm_storeu_si128((xmmi *)&st->hh[8], _mm_setzero_si128());
|
||||
_mm_storeu_si128((xmmi *)(void *)&st->hh[0], _mm_setzero_si128());
|
||||
_mm_storeu_si128((xmmi *)(void *)&st->hh[4], _mm_setzero_si128());
|
||||
_mm_storeu_si128((xmmi *)(void *)&st->hh[8], _mm_setzero_si128());
|
||||
|
||||
/* clamp key */
|
||||
t0 = *(uint64_t *)(key + 0);
|
||||
t1 = *(uint64_t *)(key + 8);
|
||||
t0 = *(uint64_t *)(void *)(key + 0);
|
||||
t1 = *(uint64_t *)(void *)(key + 8);
|
||||
r0 = t0 & 0xffc0fffffff; t0 >>= 44; t0 |= t1 << 20;
|
||||
r1 = t0 & 0xfffffc0ffff; t1 >>= 24;
|
||||
r2 = t1 & 0x00ffffffc0f;
|
||||
@@ -104,8 +104,8 @@ poly1305_init_ext(poly1305_state_internal_t *st,
|
||||
R[4] = (uint32_t)(( r2 >> 16) );
|
||||
|
||||
/* save pad */
|
||||
st->pad[0] = *(uint64_t *)(key + 16);
|
||||
st->pad[1] = *(uint64_t *)(key + 24);
|
||||
st->pad[0] = *(uint64_t *)(void *)(key + 16);
|
||||
st->pad[1] = *(uint64_t *)(void *)(key + 24);
|
||||
|
||||
rt0 = r0;
|
||||
rt1 = r1;
|
||||
@@ -166,8 +166,8 @@ poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
|
||||
|
||||
if (!(st->flags & poly1305_started)) {
|
||||
/* H = [Mx,My] */
|
||||
T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 0)), _mm_loadl_epi64((xmmi *)(m + 16)));
|
||||
T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 8)), _mm_loadl_epi64((xmmi *)(m + 24)));
|
||||
T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(void *)(m + 0)), _mm_loadl_epi64((xmmi *)(void *)(m + 16)));
|
||||
T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(void *)(m + 8)), _mm_loadl_epi64((xmmi *)(void *)(m + 24)));
|
||||
H0 = _mm_and_si128(MMASK, T5);
|
||||
H1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
|
||||
T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
|
||||
@@ -179,9 +179,9 @@ poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
|
||||
bytes -= 32;
|
||||
st->flags |= poly1305_started;
|
||||
} else {
|
||||
T0 = _mm_loadu_si128((xmmi *)&st->hh[0]);
|
||||
T1 = _mm_loadu_si128((xmmi *)&st->hh[4]);
|
||||
T2 = _mm_loadu_si128((xmmi *)&st->hh[8]);
|
||||
T0 = _mm_loadu_si128((xmmi *)(void *)&st->hh[0]);
|
||||
T1 = _mm_loadu_si128((xmmi *)(void *)&st->hh[4]);
|
||||
T2 = _mm_loadu_si128((xmmi *)(void *)&st->hh[8]);
|
||||
H0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1,1,0,0));
|
||||
H1 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3,3,2,2));
|
||||
H2 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(1,1,0,0));
|
||||
@@ -192,16 +192,16 @@ poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
|
||||
if (st->flags & (poly1305_final_r2_r|poly1305_final_r_1)) {
|
||||
if (st->flags & poly1305_final_r2_r) {
|
||||
/* use [r^2, r] */
|
||||
T2 = _mm_loadu_si128((xmmi *)&st->R[0]);
|
||||
T2 = _mm_loadu_si128((xmmi *)(void *)&st->R[0]);
|
||||
T3 = _mm_cvtsi32_si128(st->R[4]);
|
||||
T0 = _mm_loadu_si128((xmmi *)&st->R2[0]);
|
||||
T0 = _mm_loadu_si128((xmmi *)(void *)&st->R2[0]);
|
||||
T1 = _mm_cvtsi32_si128(st->R2[4]);
|
||||
T4 = _mm_unpacklo_epi32(T0, T2);
|
||||
T5 = _mm_unpackhi_epi32(T0, T2);
|
||||
R24 = _mm_unpacklo_epi64(T1, T3);
|
||||
} else {
|
||||
/* use [r^1, 1] */
|
||||
T0 = _mm_loadu_si128((xmmi *)&st->R[0]);
|
||||
T0 = _mm_loadu_si128((xmmi *)(void *)&st->R[0]);
|
||||
T1 = _mm_cvtsi32_si128(st->R[4]);
|
||||
T2 = _mm_cvtsi32_si128(1);
|
||||
T4 = _mm_unpacklo_epi32(T0, T2);
|
||||
@@ -215,7 +215,7 @@ poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
|
||||
R23 = _mm_shuffle_epi32(T5, _MM_SHUFFLE(3,3,2,2));
|
||||
} else {
|
||||
/* use [r^2, r^2] */
|
||||
T0 = _mm_loadu_si128((xmmi *)&st->R2[0]);
|
||||
T0 = _mm_loadu_si128((xmmi *)(void *)&st->R2[0]);
|
||||
T1 = _mm_cvtsi32_si128(st->R2[4]);
|
||||
R20 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(0,0,0,0));
|
||||
R21 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1,1,1,1));
|
||||
@@ -229,7 +229,7 @@ poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
|
||||
S24 = _mm_mul_epu32(R24, FIVE);
|
||||
|
||||
if (bytes >= 64) {
|
||||
T0 = _mm_loadu_si128((xmmi *)&st->R4[0]);
|
||||
T0 = _mm_loadu_si128((xmmi *)(void *)&st->R4[0]);
|
||||
T1 = _mm_cvtsi32_si128(st->R4[4]);
|
||||
R40 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(0,0,0,0));
|
||||
R41 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1,1,1,1));
|
||||
@@ -274,12 +274,12 @@ poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
|
||||
v41 = H3; v41 = _mm_mul_epu32(v41, T15); T1 = _mm_add_epi64(T1, v13);
|
||||
v14 = H0; v14 = _mm_mul_epu32(v14, T15); T2 = _mm_add_epi64(T2, v22);
|
||||
T14 = R42;
|
||||
T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 0)), _mm_loadl_epi64((xmmi *)(m + 16)));
|
||||
T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(void *)(m + 0)), _mm_loadl_epi64((xmmi *)(void *)(m + 16)));
|
||||
v23 = H1; v23 = _mm_mul_epu32(v23, T15); T3 = _mm_add_epi64(T3, v32);
|
||||
v33 = H1; v33 = _mm_mul_epu32(v33, T14); T4 = _mm_add_epi64(T4, v41);
|
||||
v42 = H2; v42 = _mm_mul_epu32(v42, T14); T1 = _mm_add_epi64(T1, v14);
|
||||
T15 = R43;
|
||||
T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 8)), _mm_loadl_epi64((xmmi *)(m + 24)));
|
||||
T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(void *)(m + 8)), _mm_loadl_epi64((xmmi *)(void *)(m + 24)));
|
||||
v24 = H0; v24 = _mm_mul_epu32(v24, T14); T2 = _mm_add_epi64(T2, v23);
|
||||
v34 = H0; v34 = _mm_mul_epu32(v34, T15); T3 = _mm_add_epi64(T3, v33);
|
||||
M0 = _mm_and_si128(MMASK, T5);
|
||||
@@ -295,8 +295,8 @@ poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
|
||||
M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
|
||||
|
||||
/* H += [Mx',My'] */
|
||||
T5 = _mm_loadu_si128((xmmi *)(m + 32));
|
||||
T6 = _mm_loadu_si128((xmmi *)(m + 48));
|
||||
T5 = _mm_loadu_si128((xmmi *)(void *)(m + 32));
|
||||
T6 = _mm_loadu_si128((xmmi *)(void *)(m + 48));
|
||||
T7 = _mm_unpacklo_epi32(T5, T6);
|
||||
T8 = _mm_unpackhi_epi32(T5, T6);
|
||||
M5 = _mm_unpacklo_epi32(T7, _mm_setzero_si128());
|
||||
@@ -414,8 +414,8 @@ poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
|
||||
|
||||
/* H += [Mx,My] */
|
||||
if (m) {
|
||||
T5 = _mm_loadu_si128((xmmi *)(m + 0));
|
||||
T6 = _mm_loadu_si128((xmmi *)(m + 16));
|
||||
T5 = _mm_loadu_si128((xmmi *)(void *)(m + 0));
|
||||
T6 = _mm_loadu_si128((xmmi *)(void *)(m + 16));
|
||||
T7 = _mm_unpacklo_epi32(T5, T6);
|
||||
T8 = _mm_unpackhi_epi32(T5, T6);
|
||||
M0 = _mm_unpacklo_epi32(T7, _mm_setzero_si128());
|
||||
@@ -454,9 +454,9 @@ poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
|
||||
T4 = _mm_shuffle_epi32(H4, _MM_SHUFFLE(0,0,2,0));
|
||||
T0 = _mm_unpacklo_epi64(T0, T1);
|
||||
T1 = _mm_unpacklo_epi64(T2, T3);
|
||||
_mm_storeu_si128((xmmi *)&st->hh[0], T0);
|
||||
_mm_storeu_si128((xmmi *)&st->hh[4], T1);
|
||||
_mm_storel_epi64((xmmi *)&st->hh[8], T4);
|
||||
_mm_storeu_si128((xmmi *)(void *)&st->hh[0], T0);
|
||||
_mm_storeu_si128((xmmi *)(void *)&st->hh[4], T1);
|
||||
_mm_storel_epi64((xmmi *)(void *)&st->hh[8], T4);
|
||||
} else {
|
||||
uint32_t t0,t1,t2,t3,t4,b;
|
||||
uint64_t h0,h1,h2,g0,g1,g2,c,nc;
|
||||
@@ -595,17 +595,17 @@ poly1305_finish_ext(poly1305_state_internal_t *st, const unsigned char *m,
|
||||
h1 = h[1];
|
||||
}
|
||||
#endif
|
||||
_mm_storeu_si128((xmmi *)st + 0, _mm_setzero_si128());
|
||||
_mm_storeu_si128((xmmi *)st + 1, _mm_setzero_si128());
|
||||
_mm_storeu_si128((xmmi *)st + 2, _mm_setzero_si128());
|
||||
_mm_storeu_si128((xmmi *)st + 3, _mm_setzero_si128());
|
||||
_mm_storeu_si128((xmmi *)st + 4, _mm_setzero_si128());
|
||||
_mm_storeu_si128((xmmi *)st + 5, _mm_setzero_si128());
|
||||
_mm_storeu_si128((xmmi *)st + 6, _mm_setzero_si128());
|
||||
_mm_storeu_si128((xmmi *)st + 7, _mm_setzero_si128());
|
||||
_mm_storeu_si128((xmmi *)(void *)st + 0, _mm_setzero_si128());
|
||||
_mm_storeu_si128((xmmi *)(void *)st + 1, _mm_setzero_si128());
|
||||
_mm_storeu_si128((xmmi *)(void *)st + 2, _mm_setzero_si128());
|
||||
_mm_storeu_si128((xmmi *)(void *)st + 3, _mm_setzero_si128());
|
||||
_mm_storeu_si128((xmmi *)(void *)st + 4, _mm_setzero_si128());
|
||||
_mm_storeu_si128((xmmi *)(void *)st + 5, _mm_setzero_si128());
|
||||
_mm_storeu_si128((xmmi *)(void *)st + 6, _mm_setzero_si128());
|
||||
_mm_storeu_si128((xmmi *)(void *)st + 7, _mm_setzero_si128());
|
||||
|
||||
*(uint64_t *)(mac + 0) = h0;
|
||||
*(uint64_t *)(mac + 8) = h1;
|
||||
*(uint64_t *)(void *)(mac + 0) = h0;
|
||||
*(uint64_t *)(void *)(mac + 8) = h1;
|
||||
|
||||
sodium_memzero((void *)st, sizeof *st);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user