More explicit casts. Unaligned accesses are fine on these architectures.

Frank Denis 2015-11-14 16:01:34 +01:00
parent 27048b06aa
commit cf3064b08f

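The pattern throughout this diff is the same: every cast from a byte pointer (or from the state struct) to a wider type now goes through an intermediate (void *). On x86 and x86_64, the targets this SSE2 code compiles for, the unaligned loads and stores themselves are safe; the extra cast is there to silence cast-alignment diagnostics such as -Wcast-align, which warn on direct unsigned char * to uint64_t * or xmmi * casts. A minimal sketch of the idiom (helper names are illustrative, not from libsodium):

#include <stdint.h>
#include <string.h>

/* Direct dereference, as in this diff: fine on x86/x86_64, where
 * unaligned accesses are supported; the (void *) hop silences
 * cast-alignment warnings without changing the generated code. */
static uint64_t
load64_cast(const unsigned char *p)
{
    return *(const uint64_t *)(const void *)p;
}

/* The portable alternative; compilers emit the same single
 * unaligned load on these architectures. */
static uint64_t
load64_memcpy(const unsigned char *p)
{
    uint64_t v;
    memcpy(&v, p, sizeof v);
    return v;
}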

@@ -62,8 +62,8 @@ poly1305_block_copy31(unsigned char *dst, const unsigned char *src, unsigned lon
{
unsigned long long offset = src - dst;
- if (bytes & 16) { _mm_store_si128((xmmi *)dst, _mm_loadu_si128((xmmi *)(dst + offset))); dst += 16; }
- if (bytes & 8) { *(uint64_t *)dst = *(uint64_t *)(dst + offset); dst += 8; }
+ if (bytes & 16) { _mm_store_si128((xmmi *)(void *)dst, _mm_loadu_si128((xmmi *)(void *)(dst + offset))); dst += 16; }
+ if (bytes & 8) { *(uint64_t *)(void *)dst = *(uint64_t *)(void *)(dst + offset); dst += 8; }
if (bytes & 4) { *(uint32_t *)dst = *(uint32_t *)(dst + offset); dst += 4; }
if (bytes & 2) { *(uint16_t *)dst = *(uint16_t *)(dst + offset); dst += 2; }
if (bytes & 1) { *( unsigned char *)dst = *( unsigned char *)(dst + offset); }
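poly1305_block_copy31 copies at most 31 bytes without a loop: any length below 32 is the sum of its set bits, so one test per power of two covers every case. A scalar sketch of the same decomposition (a hypothetical helper; the real routine works in place via a single src - dst offset so it only keeps one live pointer):

#include <string.h>

static void
copy31(unsigned char *dst, const unsigned char *src, unsigned long long bytes)
{
    /* bytes < 32: handle each set bit of the length exactly once */
    if (bytes & 16) { memcpy(dst, src, 16); dst += 16; src += 16; }
    if (bytes & 8)  { memcpy(dst, src, 8);  dst += 8;  src += 8;  }
    if (bytes & 4)  { memcpy(dst, src, 4);  dst += 4;  src += 4;  }
    if (bytes & 2)  { memcpy(dst, src, 2);  dst += 2;  src += 2;  }
    if (bytes & 1)  { *dst = *src; }
}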
@@ -84,13 +84,13 @@ poly1305_init_ext(poly1305_state_internal_t *st,
if (!bytes) bytes = ~(unsigned long long)0;
/* H = 0 */
- _mm_storeu_si128((xmmi *)&st->hh[0], _mm_setzero_si128());
- _mm_storeu_si128((xmmi *)&st->hh[4], _mm_setzero_si128());
- _mm_storeu_si128((xmmi *)&st->hh[8], _mm_setzero_si128());
+ _mm_storeu_si128((xmmi *)(void *)&st->hh[0], _mm_setzero_si128());
+ _mm_storeu_si128((xmmi *)(void *)&st->hh[4], _mm_setzero_si128());
+ _mm_storeu_si128((xmmi *)(void *)&st->hh[8], _mm_setzero_si128());
/* clamp key */
- t0 = *(uint64_t *)(key + 0);
- t1 = *(uint64_t *)(key + 8);
+ t0 = *(uint64_t *)(void *)(key + 0);
+ t1 = *(uint64_t *)(void *)(key + 8);
r0 = t0 & 0xffc0fffffff; t0 >>= 44; t0 |= t1 << 20;
r1 = t0 & 0xfffffc0ffff; t1 >>= 24;
r2 = t1 & 0x00ffffffc0f;
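The clamp above folds two things into three masks: poly1305's required clamping of r (clearing the top four bits of key bytes 3, 7, 11 and 15 and the bottom two bits of bytes 4, 8 and 12) and the split of the 128-bit r into radix-2^44 limbs. A scalar sketch (hypothetical helper, little-endian host assumed):

#include <stdint.h>
#include <string.h>

static void
clamp_r44(const unsigned char key[16], uint64_t r[3])
{
    uint64_t t0, t1;

    memcpy(&t0, key + 0, 8);
    memcpy(&t1, key + 8, 8);
    r[0] = t0 & 0xffc0fffffffULL;  /* r bits  0..43, clamp folded into the mask */
    t0 >>= 44; t0 |= t1 << 20;
    r[1] = t0 & 0xfffffc0ffffULL;  /* r bits 44..87 */
    t1 >>= 24;
    r[2] = t1 & 0x00ffffffc0fULL;  /* r bits 88..123 */
}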
@@ -104,8 +104,8 @@ poly1305_init_ext(poly1305_state_internal_t *st,
R[4] = (uint32_t)(( r2 >> 16) );
/* save pad */
- st->pad[0] = *(uint64_t *)(key + 16);
- st->pad[1] = *(uint64_t *)(key + 24);
+ st->pad[0] = *(uint64_t *)(void *)(key + 16);
+ st->pad[1] = *(uint64_t *)(void *)(key + 24);
rt0 = r0;
rt1 = r1;
@@ -166,8 +166,8 @@ poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
if (!(st->flags & poly1305_started)) {
/* H = [Mx,My] */
- T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 0)), _mm_loadl_epi64((xmmi *)(m + 16)));
- T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 8)), _mm_loadl_epi64((xmmi *)(m + 24)));
+ T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(void *)(m + 0)), _mm_loadl_epi64((xmmi *)(void *)(m + 16)));
+ T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(void *)(m + 8)), _mm_loadl_epi64((xmmi *)(void *)(m + 24)));
H0 = _mm_and_si128(MMASK, T5);
H1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
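The two interleaved loads above gather the low and high 8 bytes of two 16-byte blocks into the two 64-bit lanes of T5 and T6; the masks and shifts that follow cut each 128-bit block into five 26-bit limbs, with the 2^128 padding bit ORed into the top limb (HIBIT in the vector code). What each lane computes, as a scalar sketch (hypothetical helper, little-endian host assumed):

#include <stdint.h>
#include <string.h>

static void
split26(const unsigned char m[16], uint32_t h[5])
{
    uint64_t lo, hi;

    memcpy(&lo, m + 0, 8);
    memcpy(&hi, m + 8, 8);
    h[0] = (uint32_t)(lo & 0x3ffffff);                        /* bits   0..25  */
    h[1] = (uint32_t)((lo >> 26) & 0x3ffffff);                /* bits  26..51  */
    h[2] = (uint32_t)(((lo >> 52) | (hi << 12)) & 0x3ffffff); /* bits  52..77  */
    h[3] = (uint32_t)((hi >> 14) & 0x3ffffff);                /* bits  78..103 */
    h[4] = (uint32_t)((hi >> 40) | (1U << 24));               /* bits 104..127, plus the 2^128 pad bit */
}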
@@ -179,9 +179,9 @@ poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
bytes -= 32;
st->flags |= poly1305_started;
} else {
- T0 = _mm_loadu_si128((xmmi *)&st->hh[0]);
- T1 = _mm_loadu_si128((xmmi *)&st->hh[4]);
- T2 = _mm_loadu_si128((xmmi *)&st->hh[8]);
+ T0 = _mm_loadu_si128((xmmi *)(void *)&st->hh[0]);
+ T1 = _mm_loadu_si128((xmmi *)(void *)&st->hh[4]);
+ T2 = _mm_loadu_si128((xmmi *)(void *)&st->hh[8]);
H0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1,1,0,0));
H1 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3,3,2,2));
H2 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(1,1,0,0));
@@ -192,16 +192,16 @@ poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
if (st->flags & (poly1305_final_r2_r|poly1305_final_r_1)) {
if (st->flags & poly1305_final_r2_r) {
/* use [r^2, r] */
- T2 = _mm_loadu_si128((xmmi *)&st->R[0]);
+ T2 = _mm_loadu_si128((xmmi *)(void *)&st->R[0]);
T3 = _mm_cvtsi32_si128(st->R[4]);
- T0 = _mm_loadu_si128((xmmi *)&st->R2[0]);
+ T0 = _mm_loadu_si128((xmmi *)(void *)&st->R2[0]);
T1 = _mm_cvtsi32_si128(st->R2[4]);
T4 = _mm_unpacklo_epi32(T0, T2);
T5 = _mm_unpackhi_epi32(T0, T2);
R24 = _mm_unpacklo_epi64(T1, T3);
} else {
/* use [r^1, 1] */
- T0 = _mm_loadu_si128((xmmi *)&st->R[0]);
+ T0 = _mm_loadu_si128((xmmi *)(void *)&st->R[0]);
T1 = _mm_cvtsi32_si128(st->R[4]);
T2 = _mm_cvtsi32_si128(1);
T4 = _mm_unpacklo_epi32(T0, T2);
@@ -215,7 +215,7 @@ poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
R23 = _mm_shuffle_epi32(T5, _MM_SHUFFLE(3,3,2,2));
} else {
/* use [r^2, r^2] */
- T0 = _mm_loadu_si128((xmmi *)&st->R2[0]);
+ T0 = _mm_loadu_si128((xmmi *)(void *)&st->R2[0]);
T1 = _mm_cvtsi32_si128(st->R2[4]);
R20 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(0,0,0,0));
R21 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1,1,1,1));
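The multiplier pairs selected here are the two-lane Horner scheme: each lane holds every other block, so in steady state both lanes advance by r^2 per iteration ([r^2, r^2]), and the final iteration multiplies by [r^2, r] so the lanes land on consecutive powers of r before being summed. For four blocks m1..m4 (each including its 2^128 pad bit) the algebra is

    H = m1*r^4 + m2*r^3 + m3*r^2 + m4*r
      = (m1*r^2 + m3)*r^2 + (m2*r^2 + m4)*r

while the [r, 1] pair covers the other parity of trailing blocks, where one lane's block needs no further multiplication. R4 just below holds r^4 for the four-blocks-per-iteration path taken while at least 64 bytes remain.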
@@ -229,7 +229,7 @@ poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
S24 = _mm_mul_epu32(R24, FIVE);
if (bytes >= 64) {
- T0 = _mm_loadu_si128((xmmi *)&st->R4[0]);
+ T0 = _mm_loadu_si128((xmmi *)(void *)&st->R4[0]);
T1 = _mm_cvtsi32_si128(st->R4[4]);
R40 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(0,0,0,0));
R41 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1,1,1,1));
@@ -274,12 +274,12 @@ poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
v41 = H3; v41 = _mm_mul_epu32(v41, T15); T1 = _mm_add_epi64(T1, v13);
v14 = H0; v14 = _mm_mul_epu32(v14, T15); T2 = _mm_add_epi64(T2, v22);
T14 = R42;
- T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 0)), _mm_loadl_epi64((xmmi *)(m + 16)));
+ T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(void *)(m + 0)), _mm_loadl_epi64((xmmi *)(void *)(m + 16)));
v23 = H1; v23 = _mm_mul_epu32(v23, T15); T3 = _mm_add_epi64(T3, v32);
v33 = H1; v33 = _mm_mul_epu32(v33, T14); T4 = _mm_add_epi64(T4, v41);
v42 = H2; v42 = _mm_mul_epu32(v42, T14); T1 = _mm_add_epi64(T1, v14);
T15 = R43;
- T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 8)), _mm_loadl_epi64((xmmi *)(m + 24)));
+ T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(void *)(m + 8)), _mm_loadl_epi64((xmmi *)(void *)(m + 24)));
v24 = H0; v24 = _mm_mul_epu32(v24, T14); T2 = _mm_add_epi64(T2, v23);
v34 = H0; v34 = _mm_mul_epu32(v34, T15); T3 = _mm_add_epi64(T3, v33);
M0 = _mm_and_si128(MMASK, T5);
@@ -295,8 +295,8 @@ poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
/* H += [Mx',My'] */
- T5 = _mm_loadu_si128((xmmi *)(m + 32));
- T6 = _mm_loadu_si128((xmmi *)(m + 48));
+ T5 = _mm_loadu_si128((xmmi *)(void *)(m + 32));
+ T6 = _mm_loadu_si128((xmmi *)(void *)(m + 48));
T7 = _mm_unpacklo_epi32(T5, T6);
T8 = _mm_unpackhi_epi32(T5, T6);
M5 = _mm_unpacklo_epi32(T7, _mm_setzero_si128());
@@ -414,8 +414,8 @@ poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
/* H += [Mx,My] */
if (m) {
- T5 = _mm_loadu_si128((xmmi *)(m + 0));
- T6 = _mm_loadu_si128((xmmi *)(m + 16));
+ T5 = _mm_loadu_si128((xmmi *)(void *)(m + 0));
+ T6 = _mm_loadu_si128((xmmi *)(void *)(m + 16));
T7 = _mm_unpacklo_epi32(T5, T6);
T8 = _mm_unpackhi_epi32(T5, T6);
M0 = _mm_unpacklo_epi32(T7, _mm_setzero_si128());
@@ -454,9 +454,9 @@ poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
T4 = _mm_shuffle_epi32(H4, _MM_SHUFFLE(0,0,2,0));
T0 = _mm_unpacklo_epi64(T0, T1);
T1 = _mm_unpacklo_epi64(T2, T3);
- _mm_storeu_si128((xmmi *)&st->hh[0], T0);
- _mm_storeu_si128((xmmi *)&st->hh[4], T1);
- _mm_storel_epi64((xmmi *)&st->hh[8], T4);
+ _mm_storeu_si128((xmmi *)(void *)&st->hh[0], T0);
+ _mm_storeu_si128((xmmi *)(void *)&st->hh[4], T1);
+ _mm_storel_epi64((xmmi *)(void *)&st->hh[8], T4);
} else {
uint32_t t0,t1,t2,t3,t4,b;
uint64_t h0,h1,h2,g0,g1,g2,c,nc;
@@ -595,17 +595,17 @@ poly1305_finish_ext(poly1305_state_internal_t *st, const unsigned char *m,
h1 = h[1];
}
#endif
- _mm_storeu_si128((xmmi *)st + 0, _mm_setzero_si128());
- _mm_storeu_si128((xmmi *)st + 1, _mm_setzero_si128());
- _mm_storeu_si128((xmmi *)st + 2, _mm_setzero_si128());
- _mm_storeu_si128((xmmi *)st + 3, _mm_setzero_si128());
- _mm_storeu_si128((xmmi *)st + 4, _mm_setzero_si128());
- _mm_storeu_si128((xmmi *)st + 5, _mm_setzero_si128());
- _mm_storeu_si128((xmmi *)st + 6, _mm_setzero_si128());
- _mm_storeu_si128((xmmi *)st + 7, _mm_setzero_si128());
+ _mm_storeu_si128((xmmi *)(void *)st + 0, _mm_setzero_si128());
+ _mm_storeu_si128((xmmi *)(void *)st + 1, _mm_setzero_si128());
+ _mm_storeu_si128((xmmi *)(void *)st + 2, _mm_setzero_si128());
+ _mm_storeu_si128((xmmi *)(void *)st + 3, _mm_setzero_si128());
+ _mm_storeu_si128((xmmi *)(void *)st + 4, _mm_setzero_si128());
+ _mm_storeu_si128((xmmi *)(void *)st + 5, _mm_setzero_si128());
+ _mm_storeu_si128((xmmi *)(void *)st + 6, _mm_setzero_si128());
+ _mm_storeu_si128((xmmi *)(void *)st + 7, _mm_setzero_si128());
- *(uint64_t *)(mac + 0) = h0;
- *(uint64_t *)(mac + 8) = h1;
+ *(uint64_t *)(void *)(mac + 0) = h0;
+ *(uint64_t *)(void *)(mac + 8) = h1;
sodium_memzero((void *)st, sizeof *st);
}
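The tag is written with the same direct 64-bit stores, again routed through (void *), and the state wipe is belt-and-braces: eight explicit zero stores followed by sodium_memzero() over the whole struct. Unlike a plain memset(), sodium_memzero() is guaranteed not to be removed by dead-store elimination, since the state is never read again. A usage sketch:

#include <sodium.h>

static void
use_and_wipe(void)
{
    unsigned char secret[32] = {0};
    /* ... derive and use the key ... */
    sodium_memzero(secret, sizeof secret); /* guaranteed wipe, unlike memset() */
}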