From cf3064b08f6bf568a7ec53c38161c5225e82f19a Mon Sep 17 00:00:00 2001
From: Frank Denis
Date: Sat, 14 Nov 2015 16:01:34 +0100
Subject: [PATCH] More explicit casts. Unaligned accesses are fine on these architectures.

---
 .../poly1305/sse2/poly1305_sse2.c | 76 +++++++++----------
 1 file changed, 38 insertions(+), 38 deletions(-)

diff --git a/src/libsodium/crypto_onetimeauth/poly1305/sse2/poly1305_sse2.c b/src/libsodium/crypto_onetimeauth/poly1305/sse2/poly1305_sse2.c
index 3a9fe51b..f8c8b794 100644
--- a/src/libsodium/crypto_onetimeauth/poly1305/sse2/poly1305_sse2.c
+++ b/src/libsodium/crypto_onetimeauth/poly1305/sse2/poly1305_sse2.c
@@ -62,8 +62,8 @@ poly1305_block_copy31(unsigned char *dst, const unsigned char *src, unsigned lon
 {
     unsigned long long offset = src - dst;
 
-    if (bytes & 16) { _mm_store_si128((xmmi *)dst, _mm_loadu_si128((xmmi *)(dst + offset))); dst += 16; }
-    if (bytes & 8) { *(uint64_t *)dst = *(uint64_t *)(dst + offset); dst += 8; }
+    if (bytes & 16) { _mm_store_si128((xmmi *)(void *)dst, _mm_loadu_si128((xmmi *)(void *)(dst + offset))); dst += 16; }
+    if (bytes & 8) { *(uint64_t *)(void *)dst = *(uint64_t *)(void *)(dst + offset); dst += 8; }
     if (bytes & 4) { *(uint32_t *)dst = *(uint32_t *)(dst + offset); dst += 4; }
     if (bytes & 2) { *(uint16_t *)dst = *(uint16_t *)(dst + offset); dst += 2; }
     if (bytes & 1) { *( unsigned char *)dst = *( unsigned char *)(dst + offset); }
@@ -84,13 +84,13 @@ poly1305_init_ext(poly1305_state_internal_t *st,
     if (!bytes) bytes = ~(unsigned long long)0;
 
     /* H = 0 */
-    _mm_storeu_si128((xmmi *)&st->hh[0], _mm_setzero_si128());
-    _mm_storeu_si128((xmmi *)&st->hh[4], _mm_setzero_si128());
-    _mm_storeu_si128((xmmi *)&st->hh[8], _mm_setzero_si128());
+    _mm_storeu_si128((xmmi *)(void *)&st->hh[0], _mm_setzero_si128());
+    _mm_storeu_si128((xmmi *)(void *)&st->hh[4], _mm_setzero_si128());
+    _mm_storeu_si128((xmmi *)(void *)&st->hh[8], _mm_setzero_si128());
 
     /* clamp key */
-    t0 = *(uint64_t *)(key + 0);
-    t1 = *(uint64_t *)(key + 8);
+    t0 = *(uint64_t *)(void *)(key + 0);
+    t1 = *(uint64_t *)(void *)(key + 8);
     r0 = t0 & 0xffc0fffffff; t0 >>= 44; t0 |= t1 << 20;
     r1 = t0 & 0xfffffc0ffff; t1 >>= 24;
     r2 = t1 & 0x00ffffffc0f;
@@ -104,8 +104,8 @@
     R[4] = (uint32_t)(( r2 >> 16) );
 
     /* save pad */
-    st->pad[0] = *(uint64_t *)(key + 16);
-    st->pad[1] = *(uint64_t *)(key + 24);
+    st->pad[0] = *(uint64_t *)(void *)(key + 16);
+    st->pad[1] = *(uint64_t *)(void *)(key + 24);
 
     rt0 = r0;
     rt1 = r1;
@@ -166,8 +166,8 @@ poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
 
     if (!(st->flags & poly1305_started)) {
         /* H = [Mx,My] */
-        T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 0)), _mm_loadl_epi64((xmmi *)(m + 16)));
-        T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 8)), _mm_loadl_epi64((xmmi *)(m + 24)));
+        T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(void *)(m + 0)), _mm_loadl_epi64((xmmi *)(void *)(m + 16)));
+        T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(void *)(m + 8)), _mm_loadl_epi64((xmmi *)(void *)(m + 24)));
         H0 = _mm_and_si128(MMASK, T5);
         H1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
         T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
@@ -179,9 +179,9 @@ poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
         bytes -= 32;
         st->flags |= poly1305_started;
     } else {
-        T0 = _mm_loadu_si128((xmmi *)&st->hh[0]);
-        T1 = _mm_loadu_si128((xmmi *)&st->hh[4]);
-        T2 = _mm_loadu_si128((xmmi *)&st->hh[8]);
+        T0 = _mm_loadu_si128((xmmi *)(void *)&st->hh[0]);
+        T1 = _mm_loadu_si128((xmmi *)(void *)&st->hh[4]);
+        T2 = _mm_loadu_si128((xmmi *)(void *)&st->hh[8]);
         H0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1,1,0,0));
         H1 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3,3,2,2));
         H2 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(1,1,0,0));
@@ -192,16 +192,16 @@ poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
     if (st->flags & (poly1305_final_r2_r|poly1305_final_r_1)) {
         if (st->flags & poly1305_final_r2_r) {
             /* use [r^2, r] */
-            T2 = _mm_loadu_si128((xmmi *)&st->R[0]);
+            T2 = _mm_loadu_si128((xmmi *)(void *)&st->R[0]);
             T3 = _mm_cvtsi32_si128(st->R[4]);
-            T0 = _mm_loadu_si128((xmmi *)&st->R2[0]);
+            T0 = _mm_loadu_si128((xmmi *)(void *)&st->R2[0]);
             T1 = _mm_cvtsi32_si128(st->R2[4]);
             T4 = _mm_unpacklo_epi32(T0, T2);
             T5 = _mm_unpackhi_epi32(T0, T2);
             R24 = _mm_unpacklo_epi64(T1, T3);
         } else {
             /* use [r^1, 1] */
-            T0 = _mm_loadu_si128((xmmi *)&st->R[0]);
+            T0 = _mm_loadu_si128((xmmi *)(void *)&st->R[0]);
             T1 = _mm_cvtsi32_si128(st->R[4]);
             T2 = _mm_cvtsi32_si128(1);
             T4 = _mm_unpacklo_epi32(T0, T2);
@@ -215,7 +215,7 @@
         R23 = _mm_shuffle_epi32(T5, _MM_SHUFFLE(3,3,2,2));
     } else {
         /* use [r^2, r^2] */
-        T0 = _mm_loadu_si128((xmmi *)&st->R2[0]);
+        T0 = _mm_loadu_si128((xmmi *)(void *)&st->R2[0]);
         T1 = _mm_cvtsi32_si128(st->R2[4]);
         R20 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(0,0,0,0));
         R21 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1,1,1,1));
@@ -229,7 +229,7 @@
     S24 = _mm_mul_epu32(R24, FIVE);
 
     if (bytes >= 64) {
-        T0 = _mm_loadu_si128((xmmi *)&st->R4[0]);
+        T0 = _mm_loadu_si128((xmmi *)(void *)&st->R4[0]);
         T1 = _mm_cvtsi32_si128(st->R4[4]);
         R40 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(0,0,0,0));
        R41 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1,1,1,1));
@@ -274,12 +274,12 @@ poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
         v41 = H3; v41 = _mm_mul_epu32(v41, T15); T1 = _mm_add_epi64(T1, v13);
         v14 = H0; v14 = _mm_mul_epu32(v14, T15); T2 = _mm_add_epi64(T2, v22);
         T14 = R42;
-        T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 0)), _mm_loadl_epi64((xmmi *)(m + 16)));
+        T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(void *)(m + 0)), _mm_loadl_epi64((xmmi *)(void *)(m + 16)));
         v23 = H1; v23 = _mm_mul_epu32(v23, T15); T3 = _mm_add_epi64(T3, v32);
         v33 = H1; v33 = _mm_mul_epu32(v33, T14); T4 = _mm_add_epi64(T4, v41);
         v42 = H2; v42 = _mm_mul_epu32(v42, T14); T1 = _mm_add_epi64(T1, v14);
         T15 = R43;
-        T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 8)), _mm_loadl_epi64((xmmi *)(m + 24)));
+        T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(void *)(m + 8)), _mm_loadl_epi64((xmmi *)(void *)(m + 24)));
         v24 = H0; v24 = _mm_mul_epu32(v24, T14); T2 = _mm_add_epi64(T2, v23);
         v34 = H0; v34 = _mm_mul_epu32(v34, T15); T3 = _mm_add_epi64(T3, v33);
         M0 = _mm_and_si128(MMASK, T5);
@@ -295,8 +295,8 @@ poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
         M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
 
         /* H += [Mx',My'] */
-        T5 = _mm_loadu_si128((xmmi *)(m + 32));
-        T6 = _mm_loadu_si128((xmmi *)(m + 48));
+        T5 = _mm_loadu_si128((xmmi *)(void *)(m + 32));
+        T6 = _mm_loadu_si128((xmmi *)(void *)(m + 48));
         T7 = _mm_unpacklo_epi32(T5, T6);
         T8 = _mm_unpackhi_epi32(T5, T6);
         M5 = _mm_unpacklo_epi32(T7, _mm_setzero_si128());
@@ -414,8 +414,8 @@ poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
 
     /* H += [Mx,My] */
     if (m) {
-        T5 = _mm_loadu_si128((xmmi *)(m + 0));
-        T6 = _mm_loadu_si128((xmmi *)(m + 16));
+        T5 = _mm_loadu_si128((xmmi *)(void *)(m + 0));
+        T6 = _mm_loadu_si128((xmmi *)(void *)(m + 16));
         T7 = _mm_unpacklo_epi32(T5, T6);
         T8 = _mm_unpackhi_epi32(T5, T6);
         M0 = _mm_unpacklo_epi32(T7, _mm_setzero_si128());
@@ -454,9 +454,9 @@ poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
         T4 = _mm_shuffle_epi32(H4, _MM_SHUFFLE(0,0,2,0));
         T0 = _mm_unpacklo_epi64(T0, T1);
         T1 = _mm_unpacklo_epi64(T2, T3);
-        _mm_storeu_si128((xmmi *)&st->hh[0], T0);
-        _mm_storeu_si128((xmmi *)&st->hh[4], T1);
-        _mm_storel_epi64((xmmi *)&st->hh[8], T4);
+        _mm_storeu_si128((xmmi *)(void *)&st->hh[0], T0);
+        _mm_storeu_si128((xmmi *)(void *)&st->hh[4], T1);
+        _mm_storel_epi64((xmmi *)(void *)&st->hh[8], T4);
     } else {
         uint32_t t0,t1,t2,t3,t4,b;
         uint64_t h0,h1,h2,g0,g1,g2,c,nc;
@@ -595,17 +595,17 @@ poly1305_finish_ext(poly1305_state_internal_t *st, const unsigned char *m,
         h1 = h[1];
     }
 #endif
-    _mm_storeu_si128((xmmi *)st + 0, _mm_setzero_si128());
-    _mm_storeu_si128((xmmi *)st + 1, _mm_setzero_si128());
-    _mm_storeu_si128((xmmi *)st + 2, _mm_setzero_si128());
-    _mm_storeu_si128((xmmi *)st + 3, _mm_setzero_si128());
-    _mm_storeu_si128((xmmi *)st + 4, _mm_setzero_si128());
-    _mm_storeu_si128((xmmi *)st + 5, _mm_setzero_si128());
-    _mm_storeu_si128((xmmi *)st + 6, _mm_setzero_si128());
-    _mm_storeu_si128((xmmi *)st + 7, _mm_setzero_si128());
+    _mm_storeu_si128((xmmi *)(void *)st + 0, _mm_setzero_si128());
+    _mm_storeu_si128((xmmi *)(void *)st + 1, _mm_setzero_si128());
+    _mm_storeu_si128((xmmi *)(void *)st + 2, _mm_setzero_si128());
+    _mm_storeu_si128((xmmi *)(void *)st + 3, _mm_setzero_si128());
+    _mm_storeu_si128((xmmi *)(void *)st + 4, _mm_setzero_si128());
+    _mm_storeu_si128((xmmi *)(void *)st + 5, _mm_setzero_si128());
+    _mm_storeu_si128((xmmi *)(void *)st + 6, _mm_setzero_si128());
+    _mm_storeu_si128((xmmi *)(void *)st + 7, _mm_setzero_si128());
 
-    *(uint64_t *)(mac + 0) = h0;
-    *(uint64_t *)(mac + 8) = h1;
+    *(uint64_t *)(void *)(mac + 0) = h0;
+    *(uint64_t *)(void *)(mac + 8) = h1;
 
     sodium_memzero((void *)st, sizeof *st);
 }
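
For reference (this note and sketch are not part of the patch): casting an
unsigned char * directly to a more strictly aligned type such as uint64_t *
or xmmi * can trigger cast-alignment diagnostics (e.g. GCC/Clang's
-Wcast-align), even though the resulting accesses are fine on the x86/x86_64
targets this SSE2 path is compiled for; routing the cast through (void *) is
the usual way to make the conversion explicit and quiet the warning without
changing code generation. Below is a minimal standalone sketch of the idiom,
assuming SSE2; the helper names (load64_direct, load128_any, load64_portable)
are illustrative, not libsodium API:

    #include <emmintrin.h> /* SSE2 intrinsics */
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    typedef __m128i xmmi;

    /* Direct scalar load through (void *), as in the patch. Relies on the
     * target (x86/x86_64) supporting unaligned loads. */
    static uint64_t
    load64_direct(const unsigned char *p)
    {
        return *(const uint64_t *)(const void *)p;
    }

    /* _mm_loadu_si128 is defined for unaligned addresses, so only the
     * pointer cast needs the (void *) hop, not the load itself. */
    static xmmi
    load128_any(const unsigned char *p)
    {
        return _mm_loadu_si128((const xmmi *)(const void *)p);
    }

    /* Strictly conforming alternative: memcpy, which compilers lower to
     * the same single load on these targets. */
    static uint64_t
    load64_portable(const unsigned char *p)
    {
        uint64_t v;
        memcpy(&v, p, sizeof v);
        return v;
    }

    int
    main(void)
    {
        unsigned char buf[24];
        int           i;

        for (i = 0; i < 24; i++) {
            buf[i] = (unsigned char) i;
        }
        /* Deliberately misaligned reads from offset 1; both print the
         * same value. */
        printf("%016llx\n", (unsigned long long) load64_direct(buf + 1));
        printf("%016llx\n", (unsigned long long) load64_portable(buf + 1));
        (void) load128_any(buf + 1);

        return 0;
    }

The patch keeps the direct casts rather than switching to memcpy because, as
the subject line says, unaligned accesses are fine on the architectures this
file is built for.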