diff --git a/intel/filter_sse2_intrinsics.c b/intel/filter_sse2_intrinsics.c
index ef1fc7d6b..a5f5ecf58 100644
--- a/intel/filter_sse2_intrinsics.c
+++ b/intel/filter_sse2_intrinsics.c
@@ -29,39 +29,25 @@
  */
 
 static __m128i load4(const void* p) {
-   return _mm_cvtsi32_si128(*(const int*)p);
+   int tmp;
+   memcpy(&tmp, p, sizeof(tmp));
+   return _mm_cvtsi32_si128(tmp);
 }
 
 static void store4(void* p, __m128i v) {
-   *(int*)p = _mm_cvtsi128_si32(v);
+   int tmp = _mm_cvtsi128_si32(v);
+   memcpy(p, &tmp, sizeof(int));
 }
 
 static __m128i load3(const void* p) {
-   /* We'll load 2 bytes, then 1 byte,
-    * then mask them together, and finally load into SSE.
-    */
-   const png_uint_16* p01 = (png_const_uint_16p)p;
-   const png_byte* p2 = (const png_byte*)(p01+1);
-
-   png_uint_32 v012 = (png_uint_32)(*p01)
-                    | (png_uint_32)(*p2) << 16;
-   return load4(&v012);
+   png_uint_32 tmp = 0;
+   memcpy(&tmp, p, 3);
+   return _mm_cvtsi32_si128(tmp);
 }
 
 static void store3(void* p, __m128i v) {
-   /* We'll pull from SSE as a 32-bit int, then write
-    * its bottom two bytes, then its third byte.
-    */
-   png_uint_32 v012;
-   png_uint_16* p01;
-   png_byte* p2;
-
-   store4(&v012, v);
-
-   p01 = (png_uint_16p)p;
-   p2 = (png_byte*)(p01+1);
-   *p01 = (png_uint_16)v012;
-   *p2 = (png_byte)(v012 >> 16);
+   int tmp = _mm_cvtsi128_si32(v);
+   memcpy(p, &tmp, 3);
 }
 
 void png_read_filter_row_sub3_sse2(png_row_infop row_info, png_bytep row,