This commit is contained in:
Frank Denis 2017-02-26 17:13:55 +01:00
parent 9294e2e699
commit c58cbcbd90

View File

@ -359,126 +359,140 @@ if (bytes >= 256) {
}
/* store data ; this macro replicates the original amd64-xmm6 code */
#define ONEQUAD_SHUFFLE(A, B, C, D) \
z##A = _mm_add_epi32(z##A, orig##A); \
z##B = _mm_add_epi32(z##B, orig##B); \
z##C = _mm_add_epi32(z##C, orig##C); \
z##D = _mm_add_epi32(z##D, orig##D); \
in##A = _mm_cvtsi128_si32(z##A); \
in##B = _mm_cvtsi128_si32(z##B); \
in##C = _mm_cvtsi128_si32(z##C); \
in##D = _mm_cvtsi128_si32(z##D); \
z##A = _mm_shuffle_epi32(z##A, 0x39); \
z##B = _mm_shuffle_epi32(z##B, 0x39); \
z##C = _mm_shuffle_epi32(z##C, 0x39); \
z##D = _mm_shuffle_epi32(z##D, 0x39); \
in##A ^= *(uint32_t *) (m + 0); \
in##B ^= *(uint32_t *) (m + 4); \
in##C ^= *(uint32_t *) (m + 8); \
in##D ^= *(uint32_t *) (m + 12); \
*(uint32_t *) (c + 0) = in##A; \
*(uint32_t *) (c + 4) = in##B; \
*(uint32_t *) (c + 8) = in##C; \
*(uint32_t *) (c + 12) = in##D; \
in##A = _mm_cvtsi128_si32(z##A); \
in##B = _mm_cvtsi128_si32(z##B); \
in##C = _mm_cvtsi128_si32(z##C); \
in##D = _mm_cvtsi128_si32(z##D); \
z##A = _mm_shuffle_epi32(z##A, 0x39); \
z##B = _mm_shuffle_epi32(z##B, 0x39); \
z##C = _mm_shuffle_epi32(z##C, 0x39); \
z##D = _mm_shuffle_epi32(z##D, 0x39); \
in##A ^= *(uint32_t *) (m + 64); \
in##B ^= *(uint32_t *) (m + 68); \
in##C ^= *(uint32_t *) (m + 72); \
in##D ^= *(uint32_t *) (m + 76); \
*(uint32_t *) (c + 64) = in##A; \
*(uint32_t *) (c + 68) = in##B; \
*(uint32_t *) (c + 72) = in##C; \
*(uint32_t *) (c + 76) = in##D; \
in##A = _mm_cvtsi128_si32(z##A); \
in##B = _mm_cvtsi128_si32(z##B); \
in##C = _mm_cvtsi128_si32(z##C); \
in##D = _mm_cvtsi128_si32(z##D); \
z##A = _mm_shuffle_epi32(z##A, 0x39); \
z##B = _mm_shuffle_epi32(z##B, 0x39); \
z##C = _mm_shuffle_epi32(z##C, 0x39); \
z##D = _mm_shuffle_epi32(z##D, 0x39); \
in##A ^= *(uint32_t *) (m + 128); \
in##B ^= *(uint32_t *) (m + 132); \
in##C ^= *(uint32_t *) (m + 136); \
in##D ^= *(uint32_t *) (m + 140); \
*(uint32_t *) (c + 128) = in##A; \
*(uint32_t *) (c + 132) = in##B; \
*(uint32_t *) (c + 136) = in##C; \
*(uint32_t *) (c + 140) = in##D; \
in##A = _mm_cvtsi128_si32(z##A); \
in##B = _mm_cvtsi128_si32(z##B); \
in##C = _mm_cvtsi128_si32(z##C); \
in##D = _mm_cvtsi128_si32(z##D); \
in##A ^= *(uint32_t *) (m + 192); \
in##B ^= *(uint32_t *) (m + 196); \
in##C ^= *(uint32_t *) (m + 200); \
in##D ^= *(uint32_t *) (m + 204); \
*(uint32_t *) (c + 192) = in##A; \
*(uint32_t *) (c + 196) = in##B; \
*(uint32_t *) (c + 200) = in##C; \
#define ONEQUAD_SHUFFLE(A, B, C, D) \
z##A = _mm_add_epi32(z##A, orig##A); \
z##B = _mm_add_epi32(z##B, orig##B); \
z##C = _mm_add_epi32(z##C, orig##C); \
z##D = _mm_add_epi32(z##D, orig##D); \
in##A = _mm_cvtsi128_si32(z##A); \
in##B = _mm_cvtsi128_si32(z##B); \
in##C = _mm_cvtsi128_si32(z##C); \
in##D = _mm_cvtsi128_si32(z##D); \
z##A = _mm_shuffle_epi32(z##A, 0x39); \
z##B = _mm_shuffle_epi32(z##B, 0x39); \
z##C = _mm_shuffle_epi32(z##C, 0x39); \
z##D = _mm_shuffle_epi32(z##D, 0x39); \
\
in##A ^= *(uint32_t *) (m + 0); \
in##B ^= *(uint32_t *) (m + 4); \
in##C ^= *(uint32_t *) (m + 8); \
in##D ^= *(uint32_t *) (m + 12); \
\
*(uint32_t *) (c + 0) = in##A; \
*(uint32_t *) (c + 4) = in##B; \
*(uint32_t *) (c + 8) = in##C; \
*(uint32_t *) (c + 12) = in##D; \
\
in##A = _mm_cvtsi128_si32(z##A); \
in##B = _mm_cvtsi128_si32(z##B); \
in##C = _mm_cvtsi128_si32(z##C); \
in##D = _mm_cvtsi128_si32(z##D); \
z##A = _mm_shuffle_epi32(z##A, 0x39); \
z##B = _mm_shuffle_epi32(z##B, 0x39); \
z##C = _mm_shuffle_epi32(z##C, 0x39); \
z##D = _mm_shuffle_epi32(z##D, 0x39); \
\
in##A ^= *(uint32_t *) (m + 64); \
in##B ^= *(uint32_t *) (m + 68); \
in##C ^= *(uint32_t *) (m + 72); \
in##D ^= *(uint32_t *) (m + 76); \
*(uint32_t *) (c + 64) = in##A; \
*(uint32_t *) (c + 68) = in##B; \
*(uint32_t *) (c + 72) = in##C; \
*(uint32_t *) (c + 76) = in##D; \
\
in##A = _mm_cvtsi128_si32(z##A); \
in##B = _mm_cvtsi128_si32(z##B); \
in##C = _mm_cvtsi128_si32(z##C); \
in##D = _mm_cvtsi128_si32(z##D); \
z##A = _mm_shuffle_epi32(z##A, 0x39); \
z##B = _mm_shuffle_epi32(z##B, 0x39); \
z##C = _mm_shuffle_epi32(z##C, 0x39); \
z##D = _mm_shuffle_epi32(z##D, 0x39); \
\
in##A ^= *(uint32_t *) (m + 128); \
in##B ^= *(uint32_t *) (m + 132); \
in##C ^= *(uint32_t *) (m + 136); \
in##D ^= *(uint32_t *) (m + 140); \
*(uint32_t *) (c + 128) = in##A; \
*(uint32_t *) (c + 132) = in##B; \
*(uint32_t *) (c + 136) = in##C; \
*(uint32_t *) (c + 140) = in##D; \
\
in##A = _mm_cvtsi128_si32(z##A); \
in##B = _mm_cvtsi128_si32(z##B); \
in##C = _mm_cvtsi128_si32(z##C); \
in##D = _mm_cvtsi128_si32(z##D); \
\
in##A ^= *(uint32_t *) (m + 192); \
in##B ^= *(uint32_t *) (m + 196); \
in##C ^= *(uint32_t *) (m + 200); \
in##D ^= *(uint32_t *) (m + 204); \
*(uint32_t *) (c + 192) = in##A; \
*(uint32_t *) (c + 196) = in##B; \
*(uint32_t *) (c + 200) = in##C; \
*(uint32_t *) (c + 204) = in##D
/* store data ; this macro replaces shuffle+mov by a direct extract; not much
* difference */
#define ONEQUAD_EXTRACT(A, B, C, D) \
z##A = _mm_add_epi32(z##A, orig##A); \
z##B = _mm_add_epi32(z##B, orig##B); \
z##C = _mm_add_epi32(z##C, orig##C); \
z##D = _mm_add_epi32(z##D, orig##D); \
in##A = _mm_cvtsi128_si32(z##A); \
in##B = _mm_cvtsi128_si32(z##B); \
in##C = _mm_cvtsi128_si32(z##C); \
in##D = _mm_cvtsi128_si32(z##D); \
in##A ^= *(uint32_t *) (m + 0); \
in##B ^= *(uint32_t *) (m + 4); \
in##C ^= *(uint32_t *) (m + 8); \
in##D ^= *(uint32_t *) (m + 12); \
*(uint32_t *) (c + 0) = in##A; \
*(uint32_t *) (c + 4) = in##B; \
*(uint32_t *) (c + 8) = in##C; \
*(uint32_t *) (c + 12) = in##D; \
in##A = _mm_extract_epi32(z##A, 1); \
in##B = _mm_extract_epi32(z##B, 1); \
in##C = _mm_extract_epi32(z##C, 1); \
in##D = _mm_extract_epi32(z##D, 1); \
in##A ^= *(uint32_t *) (m + 64); \
in##B ^= *(uint32_t *) (m + 68); \
in##C ^= *(uint32_t *) (m + 72); \
in##D ^= *(uint32_t *) (m + 76); \
*(uint32_t *) (c + 64) = in##A; \
*(uint32_t *) (c + 68) = in##B; \
*(uint32_t *) (c + 72) = in##C; \
*(uint32_t *) (c + 76) = in##D; \
in##A = _mm_extract_epi32(z##A, 2); \
in##B = _mm_extract_epi32(z##B, 2); \
in##C = _mm_extract_epi32(z##C, 2); \
in##D = _mm_extract_epi32(z##D, 2); \
in##A ^= *(uint32_t *) (m + 128); \
in##B ^= *(uint32_t *) (m + 132); \
in##C ^= *(uint32_t *) (m + 136); \
in##D ^= *(uint32_t *) (m + 140); \
*(uint32_t *) (c + 128) = in##A; \
*(uint32_t *) (c + 132) = in##B; \
*(uint32_t *) (c + 136) = in##C; \
*(uint32_t *) (c + 140) = in##D; \
in##A = _mm_extract_epi32(z##A, 3); \
in##B = _mm_extract_epi32(z##B, 3); \
in##C = _mm_extract_epi32(z##C, 3); \
in##D = _mm_extract_epi32(z##D, 3); \
in##A ^= *(uint32_t *) (m + 192); \
in##B ^= *(uint32_t *) (m + 196); \
in##C ^= *(uint32_t *) (m + 200); \
in##D ^= *(uint32_t *) (m + 204); \
*(uint32_t *) (c + 192) = in##A; \
*(uint32_t *) (c + 196) = in##B; \
*(uint32_t *) (c + 200) = in##C; \
#define ONEQUAD_EXTRACT(A, B, C, D) \
z##A = _mm_add_epi32(z##A, orig##A); \
z##B = _mm_add_epi32(z##B, orig##B); \
z##C = _mm_add_epi32(z##C, orig##C); \
z##D = _mm_add_epi32(z##D, orig##D); \
in##A = _mm_cvtsi128_si32(z##A); \
in##B = _mm_cvtsi128_si32(z##B); \
in##C = _mm_cvtsi128_si32(z##C); \
in##D = _mm_cvtsi128_si32(z##D); \
in##A ^= *(uint32_t *) (m + 0); \
in##B ^= *(uint32_t *) (m + 4); \
in##C ^= *(uint32_t *) (m + 8); \
in##D ^= *(uint32_t *) (m + 12); \
*(uint32_t *) (c + 0) = in##A; \
*(uint32_t *) (c + 4) = in##B; \
*(uint32_t *) (c + 8) = in##C; \
*(uint32_t *) (c + 12) = in##D; \
\
in##A = _mm_extract_epi32(z##A, 1); \
in##B = _mm_extract_epi32(z##B, 1); \
in##C = _mm_extract_epi32(z##C, 1); \
in##D = _mm_extract_epi32(z##D, 1); \
\
in##A ^= *(uint32_t *) (m + 64); \
in##B ^= *(uint32_t *) (m + 68); \
in##C ^= *(uint32_t *) (m + 72); \
in##D ^= *(uint32_t *) (m + 76); \
*(uint32_t *) (c + 64) = in##A; \
*(uint32_t *) (c + 68) = in##B; \
*(uint32_t *) (c + 72) = in##C; \
*(uint32_t *) (c + 76) = in##D; \
\
in##A = _mm_extract_epi32(z##A, 2); \
in##B = _mm_extract_epi32(z##B, 2); \
in##C = _mm_extract_epi32(z##C, 2); \
in##D = _mm_extract_epi32(z##D, 2); \
\
in##A ^= *(uint32_t *) (m + 128); \
in##B ^= *(uint32_t *) (m + 132); \
in##C ^= *(uint32_t *) (m + 136); \
in##D ^= *(uint32_t *) (m + 140); \
*(uint32_t *) (c + 128) = in##A; \
*(uint32_t *) (c + 132) = in##B; \
*(uint32_t *) (c + 136) = in##C; \
*(uint32_t *) (c + 140) = in##D; \
\
in##A = _mm_extract_epi32(z##A, 3); \
in##B = _mm_extract_epi32(z##B, 3); \
in##C = _mm_extract_epi32(z##C, 3); \
in##D = _mm_extract_epi32(z##D, 3); \
\
in##A ^= *(uint32_t *) (m + 192); \
in##B ^= *(uint32_t *) (m + 196); \
in##C ^= *(uint32_t *) (m + 200); \
in##D ^= *(uint32_t *) (m + 204); \
*(uint32_t *) (c + 192) = in##A; \
*(uint32_t *) (c + 196) = in##B; \
*(uint32_t *) (c + 200) = in##C; \
*(uint32_t *) (c + 204) = in##D
/* store data ; this macro first transpose data in-registers, and then store