Indent
This commit is contained in:
parent
9294e2e699
commit
c58cbcbd90
@ -359,126 +359,140 @@ if (bytes >= 256) {
|
||||
}
|
||||
|
||||
/* store data ; this macro replicates the original amd64-xmm6 code */
|
||||
#define ONEQUAD_SHUFFLE(A, B, C, D) \
|
||||
z##A = _mm_add_epi32(z##A, orig##A); \
|
||||
z##B = _mm_add_epi32(z##B, orig##B); \
|
||||
z##C = _mm_add_epi32(z##C, orig##C); \
|
||||
z##D = _mm_add_epi32(z##D, orig##D); \
|
||||
in##A = _mm_cvtsi128_si32(z##A); \
|
||||
in##B = _mm_cvtsi128_si32(z##B); \
|
||||
in##C = _mm_cvtsi128_si32(z##C); \
|
||||
in##D = _mm_cvtsi128_si32(z##D); \
|
||||
z##A = _mm_shuffle_epi32(z##A, 0x39); \
|
||||
z##B = _mm_shuffle_epi32(z##B, 0x39); \
|
||||
z##C = _mm_shuffle_epi32(z##C, 0x39); \
|
||||
z##D = _mm_shuffle_epi32(z##D, 0x39); \
|
||||
in##A ^= *(uint32_t *) (m + 0); \
|
||||
in##B ^= *(uint32_t *) (m + 4); \
|
||||
in##C ^= *(uint32_t *) (m + 8); \
|
||||
in##D ^= *(uint32_t *) (m + 12); \
|
||||
*(uint32_t *) (c + 0) = in##A; \
|
||||
*(uint32_t *) (c + 4) = in##B; \
|
||||
*(uint32_t *) (c + 8) = in##C; \
|
||||
*(uint32_t *) (c + 12) = in##D; \
|
||||
in##A = _mm_cvtsi128_si32(z##A); \
|
||||
in##B = _mm_cvtsi128_si32(z##B); \
|
||||
in##C = _mm_cvtsi128_si32(z##C); \
|
||||
in##D = _mm_cvtsi128_si32(z##D); \
|
||||
z##A = _mm_shuffle_epi32(z##A, 0x39); \
|
||||
z##B = _mm_shuffle_epi32(z##B, 0x39); \
|
||||
z##C = _mm_shuffle_epi32(z##C, 0x39); \
|
||||
z##D = _mm_shuffle_epi32(z##D, 0x39); \
|
||||
in##A ^= *(uint32_t *) (m + 64); \
|
||||
in##B ^= *(uint32_t *) (m + 68); \
|
||||
in##C ^= *(uint32_t *) (m + 72); \
|
||||
in##D ^= *(uint32_t *) (m + 76); \
|
||||
*(uint32_t *) (c + 64) = in##A; \
|
||||
*(uint32_t *) (c + 68) = in##B; \
|
||||
*(uint32_t *) (c + 72) = in##C; \
|
||||
*(uint32_t *) (c + 76) = in##D; \
|
||||
in##A = _mm_cvtsi128_si32(z##A); \
|
||||
in##B = _mm_cvtsi128_si32(z##B); \
|
||||
in##C = _mm_cvtsi128_si32(z##C); \
|
||||
in##D = _mm_cvtsi128_si32(z##D); \
|
||||
z##A = _mm_shuffle_epi32(z##A, 0x39); \
|
||||
z##B = _mm_shuffle_epi32(z##B, 0x39); \
|
||||
z##C = _mm_shuffle_epi32(z##C, 0x39); \
|
||||
z##D = _mm_shuffle_epi32(z##D, 0x39); \
|
||||
in##A ^= *(uint32_t *) (m + 128); \
|
||||
in##B ^= *(uint32_t *) (m + 132); \
|
||||
in##C ^= *(uint32_t *) (m + 136); \
|
||||
in##D ^= *(uint32_t *) (m + 140); \
|
||||
*(uint32_t *) (c + 128) = in##A; \
|
||||
*(uint32_t *) (c + 132) = in##B; \
|
||||
*(uint32_t *) (c + 136) = in##C; \
|
||||
*(uint32_t *) (c + 140) = in##D; \
|
||||
in##A = _mm_cvtsi128_si32(z##A); \
|
||||
in##B = _mm_cvtsi128_si32(z##B); \
|
||||
in##C = _mm_cvtsi128_si32(z##C); \
|
||||
in##D = _mm_cvtsi128_si32(z##D); \
|
||||
in##A ^= *(uint32_t *) (m + 192); \
|
||||
in##B ^= *(uint32_t *) (m + 196); \
|
||||
in##C ^= *(uint32_t *) (m + 200); \
|
||||
in##D ^= *(uint32_t *) (m + 204); \
|
||||
*(uint32_t *) (c + 192) = in##A; \
|
||||
*(uint32_t *) (c + 196) = in##B; \
|
||||
*(uint32_t *) (c + 200) = in##C; \
|
||||
/*
 * ONEQUAD_SHUFFLE(A, B, C, D)
 *
 * Statement-sequence macro. For each of the four vectors z##A, z##B, z##C and
 * z##D it first adds the matching orig## vector (32-bit lane-wise add), then
 * handles the four 32-bit lanes one after another:
 *
 *   - the current low lane of each vector is copied to in##A..in##D with
 *     _mm_cvtsi128_si32;
 *   - _mm_shuffle_epi32(z, 0x39) rotates the lanes down by one position
 *     (lane 1 -> lane 0, 2 -> 1, 3 -> 2, 0 -> 3), so the next lane becomes
 *     the low one for the following step (no rotation after the last lane);
 *   - in##A..in##D are XORed with the 32-bit words read from m at byte
 *     offsets base + 0, + 4, + 8, + 12 and written to c at the same offsets.
 *
 * base is 0, 64, 128 and 192 for lanes 0, 1, 2 and 3 respectively.
 *
 * Side effects: z##A..z##D are left holding the sums rotated three times;
 * in##A..in##D are overwritten as temporaries.
 *
 * The expansion relies on z##X, orig##X, in##X, m and c being in scope at the
 * point of use, and it ends without a trailing semicolon.
 *
 * NOTE(review): m and c are dereferenced through uint32_t * casts; whether
 * that is safe depends on their declared types and alignment, which are not
 * visible here -- confirm against the enclosing function.
 */
#define ONEQUAD_SHUFFLE(A, B, C, D) \
|
||||
z##A = _mm_add_epi32(z##A, orig##A); \
|
||||
z##B = _mm_add_epi32(z##B, orig##B); \
|
||||
z##C = _mm_add_epi32(z##C, orig##C); \
|
||||
z##D = _mm_add_epi32(z##D, orig##D); \
|
||||
in##A = _mm_cvtsi128_si32(z##A); \
|
||||
in##B = _mm_cvtsi128_si32(z##B); \
|
||||
in##C = _mm_cvtsi128_si32(z##C); \
|
||||
in##D = _mm_cvtsi128_si32(z##D); \
|
||||
z##A = _mm_shuffle_epi32(z##A, 0x39); \
|
||||
z##B = _mm_shuffle_epi32(z##B, 0x39); \
|
||||
z##C = _mm_shuffle_epi32(z##C, 0x39); \
|
||||
z##D = _mm_shuffle_epi32(z##D, 0x39); \
|
||||
\
|
||||
in##A ^= *(uint32_t *) (m + 0); \
|
||||
in##B ^= *(uint32_t *) (m + 4); \
|
||||
in##C ^= *(uint32_t *) (m + 8); \
|
||||
in##D ^= *(uint32_t *) (m + 12); \
|
||||
\
|
||||
*(uint32_t *) (c + 0) = in##A; \
|
||||
*(uint32_t *) (c + 4) = in##B; \
|
||||
*(uint32_t *) (c + 8) = in##C; \
|
||||
*(uint32_t *) (c + 12) = in##D; \
|
||||
\
|
||||
in##A = _mm_cvtsi128_si32(z##A); \
|
||||
in##B = _mm_cvtsi128_si32(z##B); \
|
||||
in##C = _mm_cvtsi128_si32(z##C); \
|
||||
in##D = _mm_cvtsi128_si32(z##D); \
|
||||
z##A = _mm_shuffle_epi32(z##A, 0x39); \
|
||||
z##B = _mm_shuffle_epi32(z##B, 0x39); \
|
||||
z##C = _mm_shuffle_epi32(z##C, 0x39); \
|
||||
z##D = _mm_shuffle_epi32(z##D, 0x39); \
|
||||
\
|
||||
in##A ^= *(uint32_t *) (m + 64); \
|
||||
in##B ^= *(uint32_t *) (m + 68); \
|
||||
in##C ^= *(uint32_t *) (m + 72); \
|
||||
in##D ^= *(uint32_t *) (m + 76); \
|
||||
*(uint32_t *) (c + 64) = in##A; \
|
||||
*(uint32_t *) (c + 68) = in##B; \
|
||||
*(uint32_t *) (c + 72) = in##C; \
|
||||
*(uint32_t *) (c + 76) = in##D; \
|
||||
\
|
||||
in##A = _mm_cvtsi128_si32(z##A); \
|
||||
in##B = _mm_cvtsi128_si32(z##B); \
|
||||
in##C = _mm_cvtsi128_si32(z##C); \
|
||||
in##D = _mm_cvtsi128_si32(z##D); \
|
||||
z##A = _mm_shuffle_epi32(z##A, 0x39); \
|
||||
z##B = _mm_shuffle_epi32(z##B, 0x39); \
|
||||
z##C = _mm_shuffle_epi32(z##C, 0x39); \
|
||||
z##D = _mm_shuffle_epi32(z##D, 0x39); \
|
||||
\
|
||||
in##A ^= *(uint32_t *) (m + 128); \
|
||||
in##B ^= *(uint32_t *) (m + 132); \
|
||||
in##C ^= *(uint32_t *) (m + 136); \
|
||||
in##D ^= *(uint32_t *) (m + 140); \
|
||||
*(uint32_t *) (c + 128) = in##A; \
|
||||
*(uint32_t *) (c + 132) = in##B; \
|
||||
*(uint32_t *) (c + 136) = in##C; \
|
||||
*(uint32_t *) (c + 140) = in##D; \
|
||||
\
|
||||
in##A = _mm_cvtsi128_si32(z##A); \
|
||||
in##B = _mm_cvtsi128_si32(z##B); \
|
||||
in##C = _mm_cvtsi128_si32(z##C); \
|
||||
in##D = _mm_cvtsi128_si32(z##D); \
|
||||
\
|
||||
in##A ^= *(uint32_t *) (m + 192); \
|
||||
in##B ^= *(uint32_t *) (m + 196); \
|
||||
in##C ^= *(uint32_t *) (m + 200); \
|
||||
in##D ^= *(uint32_t *) (m + 204); \
|
||||
*(uint32_t *) (c + 192) = in##A; \
|
||||
*(uint32_t *) (c + 196) = in##B; \
|
||||
*(uint32_t *) (c + 200) = in##C; \
|
||||
*(uint32_t *) (c + 204) = in##D
|
||||
|
||||
/* store data ; this macro replaces shuffle+mov by a direct extract; not much
|
||||
* difference */
|
||||
#define ONEQUAD_EXTRACT(A, B, C, D) \
|
||||
z##A = _mm_add_epi32(z##A, orig##A); \
|
||||
z##B = _mm_add_epi32(z##B, orig##B); \
|
||||
z##C = _mm_add_epi32(z##C, orig##C); \
|
||||
z##D = _mm_add_epi32(z##D, orig##D); \
|
||||
in##A = _mm_cvtsi128_si32(z##A); \
|
||||
in##B = _mm_cvtsi128_si32(z##B); \
|
||||
in##C = _mm_cvtsi128_si32(z##C); \
|
||||
in##D = _mm_cvtsi128_si32(z##D); \
|
||||
in##A ^= *(uint32_t *) (m + 0); \
|
||||
in##B ^= *(uint32_t *) (m + 4); \
|
||||
in##C ^= *(uint32_t *) (m + 8); \
|
||||
in##D ^= *(uint32_t *) (m + 12); \
|
||||
*(uint32_t *) (c + 0) = in##A; \
|
||||
*(uint32_t *) (c + 4) = in##B; \
|
||||
*(uint32_t *) (c + 8) = in##C; \
|
||||
*(uint32_t *) (c + 12) = in##D; \
|
||||
in##A = _mm_extract_epi32(z##A, 1); \
|
||||
in##B = _mm_extract_epi32(z##B, 1); \
|
||||
in##C = _mm_extract_epi32(z##C, 1); \
|
||||
in##D = _mm_extract_epi32(z##D, 1); \
|
||||
in##A ^= *(uint32_t *) (m + 64); \
|
||||
in##B ^= *(uint32_t *) (m + 68); \
|
||||
in##C ^= *(uint32_t *) (m + 72); \
|
||||
in##D ^= *(uint32_t *) (m + 76); \
|
||||
*(uint32_t *) (c + 64) = in##A; \
|
||||
*(uint32_t *) (c + 68) = in##B; \
|
||||
*(uint32_t *) (c + 72) = in##C; \
|
||||
*(uint32_t *) (c + 76) = in##D; \
|
||||
in##A = _mm_extract_epi32(z##A, 2); \
|
||||
in##B = _mm_extract_epi32(z##B, 2); \
|
||||
in##C = _mm_extract_epi32(z##C, 2); \
|
||||
in##D = _mm_extract_epi32(z##D, 2); \
|
||||
in##A ^= *(uint32_t *) (m + 128); \
|
||||
in##B ^= *(uint32_t *) (m + 132); \
|
||||
in##C ^= *(uint32_t *) (m + 136); \
|
||||
in##D ^= *(uint32_t *) (m + 140); \
|
||||
*(uint32_t *) (c + 128) = in##A; \
|
||||
*(uint32_t *) (c + 132) = in##B; \
|
||||
*(uint32_t *) (c + 136) = in##C; \
|
||||
*(uint32_t *) (c + 140) = in##D; \
|
||||
in##A = _mm_extract_epi32(z##A, 3); \
|
||||
in##B = _mm_extract_epi32(z##B, 3); \
|
||||
in##C = _mm_extract_epi32(z##C, 3); \
|
||||
in##D = _mm_extract_epi32(z##D, 3); \
|
||||
in##A ^= *(uint32_t *) (m + 192); \
|
||||
in##B ^= *(uint32_t *) (m + 196); \
|
||||
in##C ^= *(uint32_t *) (m + 200); \
|
||||
in##D ^= *(uint32_t *) (m + 204); \
|
||||
*(uint32_t *) (c + 192) = in##A; \
|
||||
*(uint32_t *) (c + 196) = in##B; \
|
||||
*(uint32_t *) (c + 200) = in##C; \
|
||||
/*
 * ONEQUAD_EXTRACT(A, B, C, D)
 *
 * Statement-sequence macro. For each of the four vectors z##A, z##B, z##C and
 * z##D it first adds the matching orig## vector (32-bit lane-wise add), then
 * handles the four 32-bit lanes one after another:
 *
 *   - lane 0 of each vector is copied to in##A..in##D with
 *     _mm_cvtsi128_si32; lanes 1, 2 and 3 are read directly with
 *     _mm_extract_epi32(z, 1 / 2 / 3) (an SSE4.1 intrinsic);
 *   - in##A..in##D are XORed with the 32-bit words read from m at byte
 *     offsets base + 0, + 4, + 8, + 12 and written to c at the same offsets.
 *
 * base is 0, 64, 128 and 192 for lanes 0, 1, 2 and 3 respectively.
 *
 * Side effects: z##A..z##D are left holding the sums (their lanes are not
 * reordered); in##A..in##D are overwritten as temporaries.
 *
 * The expansion relies on z##X, orig##X, in##X, m and c being in scope at the
 * point of use, and it ends without a trailing semicolon.
 *
 * NOTE(review): m and c are dereferenced through uint32_t * casts; whether
 * that is safe depends on their declared types and alignment, which are not
 * visible here -- confirm against the enclosing function.
 */
#define ONEQUAD_EXTRACT(A, B, C, D) \
|
||||
z##A = _mm_add_epi32(z##A, orig##A); \
|
||||
z##B = _mm_add_epi32(z##B, orig##B); \
|
||||
z##C = _mm_add_epi32(z##C, orig##C); \
|
||||
z##D = _mm_add_epi32(z##D, orig##D); \
|
||||
in##A = _mm_cvtsi128_si32(z##A); \
|
||||
in##B = _mm_cvtsi128_si32(z##B); \
|
||||
in##C = _mm_cvtsi128_si32(z##C); \
|
||||
in##D = _mm_cvtsi128_si32(z##D); \
|
||||
in##A ^= *(uint32_t *) (m + 0); \
|
||||
in##B ^= *(uint32_t *) (m + 4); \
|
||||
in##C ^= *(uint32_t *) (m + 8); \
|
||||
in##D ^= *(uint32_t *) (m + 12); \
|
||||
*(uint32_t *) (c + 0) = in##A; \
|
||||
*(uint32_t *) (c + 4) = in##B; \
|
||||
*(uint32_t *) (c + 8) = in##C; \
|
||||
*(uint32_t *) (c + 12) = in##D; \
|
||||
\
|
||||
in##A = _mm_extract_epi32(z##A, 1); \
|
||||
in##B = _mm_extract_epi32(z##B, 1); \
|
||||
in##C = _mm_extract_epi32(z##C, 1); \
|
||||
in##D = _mm_extract_epi32(z##D, 1); \
|
||||
\
|
||||
in##A ^= *(uint32_t *) (m + 64); \
|
||||
in##B ^= *(uint32_t *) (m + 68); \
|
||||
in##C ^= *(uint32_t *) (m + 72); \
|
||||
in##D ^= *(uint32_t *) (m + 76); \
|
||||
*(uint32_t *) (c + 64) = in##A; \
|
||||
*(uint32_t *) (c + 68) = in##B; \
|
||||
*(uint32_t *) (c + 72) = in##C; \
|
||||
*(uint32_t *) (c + 76) = in##D; \
|
||||
\
|
||||
in##A = _mm_extract_epi32(z##A, 2); \
|
||||
in##B = _mm_extract_epi32(z##B, 2); \
|
||||
in##C = _mm_extract_epi32(z##C, 2); \
|
||||
in##D = _mm_extract_epi32(z##D, 2); \
|
||||
\
|
||||
in##A ^= *(uint32_t *) (m + 128); \
|
||||
in##B ^= *(uint32_t *) (m + 132); \
|
||||
in##C ^= *(uint32_t *) (m + 136); \
|
||||
in##D ^= *(uint32_t *) (m + 140); \
|
||||
*(uint32_t *) (c + 128) = in##A; \
|
||||
*(uint32_t *) (c + 132) = in##B; \
|
||||
*(uint32_t *) (c + 136) = in##C; \
|
||||
*(uint32_t *) (c + 140) = in##D; \
|
||||
\
|
||||
in##A = _mm_extract_epi32(z##A, 3); \
|
||||
in##B = _mm_extract_epi32(z##B, 3); \
|
||||
in##C = _mm_extract_epi32(z##C, 3); \
|
||||
in##D = _mm_extract_epi32(z##D, 3); \
|
||||
\
|
||||
in##A ^= *(uint32_t *) (m + 192); \
|
||||
in##B ^= *(uint32_t *) (m + 196); \
|
||||
in##C ^= *(uint32_t *) (m + 200); \
|
||||
in##D ^= *(uint32_t *) (m + 204); \
|
||||
*(uint32_t *) (c + 192) = in##A; \
|
||||
*(uint32_t *) (c + 196) = in##B; \
|
||||
*(uint32_t *) (c + 200) = in##C; \
|
||||
*(uint32_t *) (c + 204) = in##D
|
||||
|
||||
/* store data ; this macro first transposes data in-registers, and then stores
|
||||
|
Loading…
Reference in New Issue
Block a user