From d0c73fcefc08c771f312baec8f70b50ea8f97e9e Mon Sep 17 00:00:00 2001
From: Frank Denis <github@pureftpd.org>
Date: Thu, 17 Nov 2022 22:59:12 +0100
Subject: [PATCH] Backport the new AES-GCM implementation to -stable

---
 AUTHORS                                       |    3 +-
 .../aes256gcm/aesni/aead_aes256gcm_aesni.c    | 1754 +++++++++--------
 2 files changed, 913 insertions(+), 844 deletions(-)

diff --git a/AUTHORS b/AUTHORS
index f4a4b9d8..597b2c6d 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -45,8 +45,7 @@ siphash                                Jean-Philippe Aumasson
 Implementors
 ============
 
-crypto_aead/aes256gcm/aesni            Romain Dolbeau
-                                       Frank Denis
+crypto_aead/aes256gcm/aesni            Frank Denis
 
 crypto_aead/chacha20poly1305           Frank Denis
 
diff --git a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c
index fbc66230..8fe2a79f 100644
--- a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c
+++ b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c
@@ -1,19 +1,15 @@
-
-/*
- * AES256-GCM, based on the "Intel Carry-Less Multiplication Instruction and its Usage for Computing
- * the GCM Mode" paper and reference code, using the aggregated reduction method.
- * Originally adapted by Romain Dolbeau.
- */
-
 #include <errno.h>
+#include <limits.h>
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
 
 #include "core.h"
 #include "crypto_aead_aes256gcm.h"
+#include "crypto_verify_16.h"
 #include "export.h"
 #include "private/common.h"
+#include "private/sse2_64_32.h"
 #include "randombytes.h"
 #include "runtime.h"
 #include "utils.h"
@@ -21,801 +17,934 @@
 #if defined(HAVE_TMMINTRIN_H) && defined(HAVE_WMMINTRIN_H)
 
 #ifdef __GNUC__
-# pragma GCC target("ssse3")
-# pragma GCC target("aes")
-# pragma GCC target("pclmul")
+#pragma GCC target("ssse3")
+#pragma GCC target("aes")
+#pragma GCC target("pclmul")
+#endif
+
+#if !defined(_MSC_VER) || _MSC_VER < 1800
+#define __vectorcall
 #endif
 
 #include <tmmintrin.h>
 #include <wmmintrin.h>
-#include "private/sse2_64_32.h"
 
-#ifndef ENOSYS
-# define ENOSYS ENXIO
-#endif
+#define ABYTES    crypto_aead_aes256gcm_ABYTES
+#define NPUBBYTES crypto_aead_aes256gcm_NPUBBYTES
+#define KEYBYTES  crypto_aead_aes256gcm_KEYBYTES
 
-#if defined(__INTEL_COMPILER) || defined(_bswap64)
-#elif defined(_MSC_VER)
-# define _bswap64(a) _byteswap_uint64(a)
-#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))
-# define _bswap64(a) __builtin_bswap64(a)
-#else
-static inline uint64_t
-_bswap64(const uint64_t x)
+#define PARALLEL_BLOCKS 7
+#undef USE_KARATSUBA_MULTIPLICATION
+
+typedef __m128i BlockVec;
+
+#define LOAD128(a)                       _mm_loadu_si128((const BlockVec *) (a))
+#define STORE128(a, b)                   _mm_storeu_si128((BlockVec *) (a), (b))
+#define AES_ENCRYPT(block_vec, rkey)     _mm_aesenc_si128((block_vec), (rkey))
+#define AES_ENCRYPTLAST(block_vec, rkey) _mm_aesenclast_si128((block_vec), (rkey))
+#define AES_KEYGEN(block_vec, rc)        _mm_aeskeygenassist_si128((block_vec), (rc))
+#define XOR128(a, b)                     _mm_xor_si128((a), (b))
+#define AND128(a, b)                     _mm_and_si128((a), (b))
+#define OR128(a, b)                      _mm_or_si128((a), (b))
+#define SET64x2(a, b)                    _mm_set_epi64x((uint64_t) (a), (uint64_t) (b))
+#define ZERO128                          _mm_setzero_si128()
+#define ONE128                           SET64x2(0, 1)
+#define ADD64x2(a, b)                    _mm_add_epi64((a), (b))
+#define SUB64x2(a, b)                    _mm_sub_epi64((a), (b))
+#define SHL64x2(a, b)                    _mm_slli_epi64((a), (b))
+#define SHR64x2(a, b)                    _mm_srli_epi64((a), (b))
+#define REV128(x) \
+    _mm_shuffle_epi8((x), _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15))
+#define SHUFFLE32x4(x, a, b, c, d) _mm_shuffle_epi32((x), _MM_SHUFFLE((d), (c), (b), (a)))
+#define BYTESHL128(a, b)           _mm_slli_si128(a, b)
+#define BYTESHR128(a, b)           _mm_srli_si128(a, b)
+#define SHL128(a, b)               OR128(SHL64x2((a), (b)), SHR64x2(BYTESHL128((a), 8), 64 - (b)))
+#define CLMULLO128(a, b)           _mm_clmulepi64_si128((a), (b), 0x00)
+#define CLMULHI128(a, b)           _mm_clmulepi64_si128((a), (b), 0x11)
+#define CLMULLOHI128(a, b)         _mm_clmulepi64_si128((a), (b), 0x10)
+#define CLMULHILO128(a, b)         _mm_clmulepi64_si128((a), (b), 0x01)
+#define PREFETCH_READ(x)           _mm_prefetch((x), _MM_HINT_T1)
+#define PREFETCH_WRITE(x)          _mm_prefetch((x), _MM_HINT_T1)
+
+#define ROUNDS 14
+
+#define PC_COUNT (2 * PARALLEL_BLOCKS)
+
+typedef struct I256 {
+    BlockVec hi;
+    BlockVec lo;
+    BlockVec mid;
+} I256;
+
+typedef BlockVec Precomp;
+
+typedef struct GHash {
+    BlockVec acc;
+} GHash;
+
+typedef struct State {
+    BlockVec rkeys[ROUNDS + 1];
+    Precomp  hx[PC_COUNT];
+} State;
+
+static void __vectorcall expand256(const unsigned char key[KEYBYTES], BlockVec rkeys[1 + ROUNDS])
 {
-    return
-        ((x << 56) & 0xFF00000000000000UL) | ((x << 40) & 0x00FF000000000000UL) |
-        ((x << 24) & 0x0000FF0000000000UL) | ((x <<  8) & 0x000000FF00000000UL) |
-        ((x >>  8) & 0x00000000FF000000UL) | ((x >> 24) & 0x0000000000FF0000UL) |
-        ((x >> 40) & 0x000000000000FF00UL) | ((x >> 56) & 0x00000000000000FFUL);
-}
-#endif
+    BlockVec t1, t2, s;
+    size_t   i = 0;
 
-typedef struct aes256gcm_state {
-    __m128i       rkeys[16];
-    unsigned char H[16];
-} aes256gcm_state;
+#define EXPAND_KEY_1(RC)                        \
+    rkeys[i++] = t2;                            \
+    s          = AES_KEYGEN(t2, RC);            \
+    t1         = XOR128(t1, BYTESHL128(t1, 4)); \
+    t1         = XOR128(t1, BYTESHL128(t1, 8)); \
+    t1         = XOR128(t1, SHUFFLE32x4(s, 3, 3, 3, 3));
 
-static inline void
-aesni_key256_expand(const unsigned char *key, __m128i * const rkeys)
-{
-    __m128i  X0, X1, X2, X3;
-    int      i = 0;
+#define EXPAND_KEY_2(RC)                        \
+    rkeys[i++] = t1;                            \
+    s          = AES_KEYGEN(t1, RC);            \
+    t2         = XOR128(t2, BYTESHL128(t2, 4)); \
+    t2         = XOR128(t2, BYTESHL128(t2, 8)); \
+    t2         = XOR128(t2, SHUFFLE32x4(s, 2, 2, 2, 2));
 
-    X0 = _mm_loadu_si128((const __m128i *) &key[0]);
-    rkeys[i++] = X0;
+    t1 = LOAD128(&key[0]);
+    t2 = LOAD128(&key[16]);
 
-    X2 = _mm_loadu_si128((const __m128i *) &key[16]);
-    rkeys[i++] = X2;
-
-#define EXPAND_KEY_1(S) do { \
-    X1 = _mm_shuffle_epi32(_mm_aeskeygenassist_si128(X2, (S)), 0xff); \
-    X3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(X3), _mm_castsi128_ps(X0), 0x10)); \
-    X0 = _mm_xor_si128(X0, X3); \
-    X3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(X3), _mm_castsi128_ps(X0), 0x8c)); \
-    X0 = _mm_xor_si128(_mm_xor_si128(X0, X3), X1); \
-    rkeys[i++] = X0; \
-} while (0)
-
-#define EXPAND_KEY_2(S) do { \
-    X1 = _mm_shuffle_epi32(_mm_aeskeygenassist_si128(X0, (S)), 0xaa); \
-    X3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(X3), _mm_castsi128_ps(X2), 0x10)); \
-    X2 = _mm_xor_si128(X2, X3); \
-    X3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(X3), _mm_castsi128_ps(X2), 0x8c)); \
-    X2 = _mm_xor_si128(_mm_xor_si128(X2, X3), X1); \
-    rkeys[i++] = X2; \
-} while (0)
-
-    X3 = _mm_setzero_si128();
-    EXPAND_KEY_1(0x01); EXPAND_KEY_2(0x01);
-    EXPAND_KEY_1(0x02); EXPAND_KEY_2(0x02);
-    EXPAND_KEY_1(0x04); EXPAND_KEY_2(0x04);
-    EXPAND_KEY_1(0x08); EXPAND_KEY_2(0x08);
-    EXPAND_KEY_1(0x10); EXPAND_KEY_2(0x10);
-    EXPAND_KEY_1(0x20); EXPAND_KEY_2(0x20);
+    rkeys[i++] = t1;
+    EXPAND_KEY_1(0x01);
+    EXPAND_KEY_2(0x01);
+    EXPAND_KEY_1(0x02);
+    EXPAND_KEY_2(0x02);
+    EXPAND_KEY_1(0x04);
+    EXPAND_KEY_2(0x04);
+    EXPAND_KEY_1(0x08);
+    EXPAND_KEY_2(0x08);
+    EXPAND_KEY_1(0x10);
+    EXPAND_KEY_2(0x10);
+    EXPAND_KEY_1(0x20);
+    EXPAND_KEY_2(0x20);
     EXPAND_KEY_1(0x40);
+    rkeys[i++] = t1;
 }
 
-/** single, by-the-book AES encryption with AES-NI */
+/* Encrypt a single AES block */
+
 static inline void
-aesni_encrypt1(unsigned char *out, __m128i nv, const __m128i *rkeys)
+encrypt(const State *st, unsigned char dst[16], const unsigned char src[16])
 {
-    __m128i temp = _mm_xor_si128(nv, rkeys[0]);
+    BlockVec t;
 
-    temp = _mm_aesenc_si128(temp, rkeys[1]);
-    temp = _mm_aesenc_si128(temp, rkeys[2]);
-    temp = _mm_aesenc_si128(temp, rkeys[3]);
-    temp = _mm_aesenc_si128(temp, rkeys[4]);
-    temp = _mm_aesenc_si128(temp, rkeys[5]);
-    temp = _mm_aesenc_si128(temp, rkeys[6]);
-    temp = _mm_aesenc_si128(temp, rkeys[7]);
-    temp = _mm_aesenc_si128(temp, rkeys[8]);
-    temp = _mm_aesenc_si128(temp, rkeys[9]);
-    temp = _mm_aesenc_si128(temp, rkeys[10]);
-    temp = _mm_aesenc_si128(temp, rkeys[11]);
-    temp = _mm_aesenc_si128(temp, rkeys[12]);
-    temp = _mm_aesenc_si128(temp, rkeys[13]);
+    size_t i;
 
-    temp = _mm_aesenclast_si128(temp, rkeys[14]);
-    _mm_storeu_si128((__m128i *) out, temp);
-}
-
-/** multiple-blocks-at-once AES encryption with AES-NI ;
-    on Haswell, aesenc has a latency of 7 and a throughput of 1
-    so the sequence of aesenc should be bubble-free if you
-    have at least 8 blocks. Let's build an arbitratry-sized
-    function */
-/* Step 1 : loading the nonce */
-/* load & increment the n vector (non-vectorized, unused for now) */
-#define NVDECLx(a)                                                             \
-    __m128i nv##a
-
-#define NVx(a)                                                                 \
-    nv##a = _mm_shuffle_epi8(_mm_load_si128((const __m128i *) n), pt);         \
-    n[3]++
-
-/* Step 2 : define value in round one (xor with subkey #0, aka key) */
-#define TEMPDECLx(a) \
-    __m128i temp##a
-
-#define TEMPx(a) \
-    temp##a = _mm_xor_si128(nv##a, rkeys[0])
-
-/* Step 3: one round of AES */
-#define AESENCx(a) \
-    temp##a = _mm_aesenc_si128(temp##a, rkeys[roundctr])
-
-/* Step 4: last round of AES */
-#define AESENCLASTx(a) \
-    temp##a = _mm_aesenclast_si128(temp##a, rkeys[14])
-
-/* Step 5: store result */
-#define STOREx(a) \
-    _mm_storeu_si128((__m128i *) (out + (a * 16)), temp##a)
-
-/* all the MAKE* macros are for automatic explicit unrolling */
-#define MAKE4(X) \
-    X(0);        \
-    X(1);        \
-    X(2);        \
-    X(3)
-
-#define MAKE8(X) \
-    X(0);        \
-    X(1);        \
-    X(2);        \
-    X(3);        \
-    X(4);        \
-    X(5);        \
-    X(6);        \
-    X(7)
-
-#define COUNTER_INC2(N) (N)[3] += 2
-
-/* create a function of unrolling N ; the MAKEN is the unrolling
-   macro, defined above. The N in MAKEN must match N, obviously. */
-#define FUNC(N, MAKEN)                                                                                \
-    static inline void aesni_encrypt##N(unsigned char *out, uint32_t *n, const __m128i *rkeys)        \
-    {                                                                                                 \
-        const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);        \
-        int           roundctr;                                                                       \
-        MAKEN(NVDECLx);                                                                               \
-        MAKEN(TEMPDECLx);                                                                             \
-                                                                                                      \
-        MAKEN(NVx);                                                                                   \
-        MAKEN(TEMPx);                                                                                 \
-        for (roundctr = 1; roundctr < 14; roundctr++) {                                               \
-            MAKEN(AESENCx);                                                                           \
-        }                                                                                             \
-        MAKEN(AESENCLASTx);                                                                           \
-        MAKEN(STOREx);                                                                                \
+    t = XOR128(LOAD128(src), st->rkeys[0]);
+    for (i = 1; i < ROUNDS; i++) {
+        t = AES_ENCRYPT(t, st->rkeys[i]);
     }
+    t = AES_ENCRYPTLAST(t, st->rkeys[ROUNDS]);
+    STORE128(dst, t);
+}
 
-FUNC(8, MAKE8)
+/* Encrypt and add a single AES block */
 
-/* all GF(2^128) fnctions are by the book, meaning this one:
-   <https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf>
-*/
-
-static inline void
-addmul(unsigned char *c, const unsigned char *a, unsigned int xlen, const unsigned char *b)
+static inline void __vectorcall encrypt_xor_block(const State *st, unsigned char dst[16],
+                                                  const unsigned char src[16],
+                                                  const BlockVec      counter)
 {
-    const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-    __m128i       A, B, C;
-    __m128i       tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
-    __m128i       tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, tmp18;
-    __m128i       tmp19, tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
-    __m128i       tmp28, tmp29, tmp30, tmp31, tmp32, tmp33, tmp34, tmp35, tmp36;
+    BlockVec ts;
+    size_t   i;
 
-    if (xlen >= 16) {
-        A = _mm_loadu_si128((const __m128i *) a);
-    } else {
-        CRYPTO_ALIGN(16) unsigned char padded[16];
-        unsigned int i;
+    ts = XOR128(counter, st->rkeys[0]);
+    for (i = 1; i < ROUNDS; i++) {
+        ts = AES_ENCRYPT(ts, st->rkeys[i]);
+    }
+    ts = AES_ENCRYPTLAST(ts, st->rkeys[i]);
+    ts = XOR128(ts, LOAD128(src));
+    STORE128(dst, ts);
+}
 
-        memset(padded, 0, 16);
-        for (i = 0; i < xlen; i++) {
-            padded[i] = a[i];
+/* Encrypt and add PARALLEL_BLOCKS AES blocks */
+
+static inline void __vectorcall encrypt_xor_wide(const State        *st,
+                                                 unsigned char       dst[16 * PARALLEL_BLOCKS],
+                                                 const unsigned char src[16 * PARALLEL_BLOCKS],
+                                                 const BlockVec      counters[PARALLEL_BLOCKS])
+{
+    BlockVec ts[PARALLEL_BLOCKS];
+    size_t   i, j;
+
+    for (j = 0; j < PARALLEL_BLOCKS; j++) {
+        ts[j] = XOR128(counters[j], st->rkeys[0]);
+    }
+    for (i = 1; i < ROUNDS; i++) {
+        for (j = 0; j < PARALLEL_BLOCKS; j++) {
+            ts[j] = AES_ENCRYPT(ts[j], st->rkeys[i]);
         }
-        A = _mm_load_si128((const __m128i *) padded);
     }
-    A = _mm_shuffle_epi8(A, rev);
-    B = _mm_loadu_si128((const __m128i *) b);
-    C = _mm_loadu_si128((const __m128i *) c);
-    A = _mm_xor_si128(A, C);
-    tmp3 = _mm_clmulepi64_si128(A, B, 0x00);
-    tmp4 = _mm_clmulepi64_si128(A, B, 0x10);
-    tmp5 = _mm_clmulepi64_si128(A, B, 0x01);
-    tmp6 = _mm_clmulepi64_si128(A, B, 0x11);
-    tmp10 = _mm_xor_si128(tmp4, tmp5);
-    tmp13 = _mm_slli_si128(tmp10, 8);
-    tmp11 = _mm_srli_si128(tmp10, 8);
-    tmp15 = _mm_xor_si128(tmp3, tmp13);
-    tmp17 = _mm_xor_si128(tmp6, tmp11);
-    tmp7 = _mm_srli_epi32(tmp15, 31);
-    tmp8 = _mm_srli_epi32(tmp17, 31);
-    tmp16 = _mm_slli_epi32(tmp15, 1);
-    tmp18 = _mm_slli_epi32(tmp17, 1);
-    tmp9 = _mm_srli_si128(tmp7, 12);
-    tmp22 = _mm_slli_si128(tmp8, 4);
-    tmp25 = _mm_slli_si128(tmp7, 4);
-    tmp29 = _mm_or_si128(tmp16, tmp25);
-    tmp19 = _mm_or_si128(tmp18, tmp22);
-    tmp20 = _mm_or_si128(tmp19, tmp9);
-    tmp26 = _mm_slli_epi32(tmp29, 31);
-    tmp23 = _mm_slli_epi32(tmp29, 30);
-    tmp32 = _mm_slli_epi32(tmp29, 25);
-    tmp27 = _mm_xor_si128(tmp26, tmp23);
-    tmp28 = _mm_xor_si128(tmp27, tmp32);
-    tmp24 = _mm_srli_si128(tmp28, 4);
-    tmp33 = _mm_slli_si128(tmp28, 12);
-    tmp30 = _mm_xor_si128(tmp29, tmp33);
-    tmp2 = _mm_srli_epi32(tmp30, 1);
-    tmp12 = _mm_srli_epi32(tmp30, 2);
-    tmp14 = _mm_srli_epi32(tmp30, 7);
-    tmp34 = _mm_xor_si128(tmp2, tmp12);
-    tmp35 = _mm_xor_si128(tmp34, tmp14);
-    tmp36 = _mm_xor_si128(tmp35, tmp24);
-    tmp31 = _mm_xor_si128(tmp30, tmp36);
-    tmp21 = _mm_xor_si128(tmp20, tmp31);
-    _mm_storeu_si128((__m128i *) c, tmp21);
+    for (j = 0; j < PARALLEL_BLOCKS; j++) {
+        ts[j] = AES_ENCRYPTLAST(ts[j], st->rkeys[i]);
+        ts[j] = XOR128(ts[j], LOAD128(&src[16 * j]));
+    }
+    for (j = 0; j < PARALLEL_BLOCKS; j++) {
+        STORE128(&dst[16 * j], ts[j]);
+    }
 }
 
-/* pure multiplication, for pre-computing powers of H */
-static inline __m128i
-mulv(__m128i A, __m128i B)
+/* Square a field element */
+
+static inline I256 __vectorcall clsq128(const BlockVec x)
 {
-    __m128i tmp3 = _mm_clmulepi64_si128(A, B, 0x00);
-    __m128i tmp4 = _mm_clmulepi64_si128(A, B, 0x10);
-    __m128i tmp5 = _mm_clmulepi64_si128(A, B, 0x01);
-    __m128i tmp6 = _mm_clmulepi64_si128(A, B, 0x11);
-    __m128i tmp10 = _mm_xor_si128(tmp4, tmp5);
-    __m128i tmp13 = _mm_slli_si128(tmp10, 8);
-    __m128i tmp11 = _mm_srli_si128(tmp10, 8);
-    __m128i tmp15 = _mm_xor_si128(tmp3, tmp13);
-    __m128i tmp17 = _mm_xor_si128(tmp6, tmp11);
-    __m128i tmp7 = _mm_srli_epi32(tmp15, 31);
-    __m128i tmp8 = _mm_srli_epi32(tmp17, 31);
-    __m128i tmp16 = _mm_slli_epi32(tmp15, 1);
-    __m128i tmp18 = _mm_slli_epi32(tmp17, 1);
-    __m128i tmp9 = _mm_srli_si128(tmp7, 12);
-    __m128i tmp22 = _mm_slli_si128(tmp8, 4);
-    __m128i tmp25 = _mm_slli_si128(tmp7, 4);
-    __m128i tmp29 = _mm_or_si128(tmp16, tmp25);
-    __m128i tmp19 = _mm_or_si128(tmp18, tmp22);
-    __m128i tmp20 = _mm_or_si128(tmp19, tmp9);
-    __m128i tmp26 = _mm_slli_epi32(tmp29, 31);
-    __m128i tmp23 = _mm_slli_epi32(tmp29, 30);
-    __m128i tmp32 = _mm_slli_epi32(tmp29, 25);
-    __m128i tmp27 = _mm_xor_si128(tmp26, tmp23);
-    __m128i tmp28 = _mm_xor_si128(tmp27, tmp32);
-    __m128i tmp24 = _mm_srli_si128(tmp28, 4);
-    __m128i tmp33 = _mm_slli_si128(tmp28, 12);
-    __m128i tmp30 = _mm_xor_si128(tmp29, tmp33);
-    __m128i tmp2 = _mm_srli_epi32(tmp30, 1);
-    __m128i tmp12 = _mm_srli_epi32(tmp30, 2);
-    __m128i tmp14 = _mm_srli_epi32(tmp30, 7);
-    __m128i tmp34 = _mm_xor_si128(tmp2, tmp12);
-    __m128i tmp35 = _mm_xor_si128(tmp34, tmp14);
-    __m128i tmp36 = _mm_xor_si128(tmp35, tmp24);
-    __m128i tmp31 = _mm_xor_si128(tmp30, tmp36);
-    __m128i C = _mm_xor_si128(tmp20, tmp31);
+    const BlockVec r_lo = CLMULLO128(x, x);
+    const BlockVec r_hi = CLMULHI128(x, x);
 
-    return C;
+    return (I256) {
+        SODIUM_C99(.hi =) r_hi,
+        SODIUM_C99(.lo =) r_lo,
+        SODIUM_C99(.mid =) ZERO128,
+    };
 }
 
-/* 4 multiply-accumulate at once; again
-   <https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf>
-   for the Aggregated Reduction Method & sample code.
-   Algorithm by Krzysztof Jankowski, Pierre Laurent - Intel */
+/* Multiply two field elements -- Textbook multiplication is faster than Karatsuba on some recent
+ * CPUs */
 
-#define RED_DECL(a) __m128i H##a##_X##a##_lo, H##a##_X##a##_hi, tmp##a, tmp##a##B
-#define RED_SHUFFLE(a) X##a = _mm_shuffle_epi8(X##a, rev)
-#define RED_MUL_LOW(a) H##a##_X##a##_lo = _mm_clmulepi64_si128(H##a, X##a, 0x00)
-#define RED_MUL_HIGH(a) H##a##_X##a##_hi = _mm_clmulepi64_si128(H##a, X##a, 0x11)
-#define RED_MUL_MID(a)                          \
-    tmp##a = _mm_shuffle_epi32(H##a, 0x4e);     \
-    tmp##a##B = _mm_shuffle_epi32(X##a, 0x4e);  \
-    tmp##a = _mm_xor_si128(tmp##a, H##a);       \
-    tmp##a##B = _mm_xor_si128(tmp##a##B, X##a); \
-    tmp##a = _mm_clmulepi64_si128(tmp##a, tmp##a##B, 0x00)
+static inline I256 __vectorcall clmul128(const BlockVec x, const BlockVec y)
+{
+#ifdef USE_KARATSUBA_MULTIPLICATION
+    const BlockVec x_hi  = BYTESHR128(x, 8);
+    const BlockVec y_hi  = BYTESHR128(y, 8);
+    const BlockVec r_lo  = CLMULLO128(x, y);
+    const BlockVec r_hi  = CLMULHI128(x, y);
+    const BlockVec r_mid = XOR128(CLMULLO128(XOR128(x, x_hi), XOR128(y, y_hi)), XOR128(r_lo, r_hi));
 
-#define MULREDUCE4(rev, H0_, H1_, H2_, H3_, X0_, X1_, X2_, X3_, accv) \
-do { \
-    MAKE4(RED_DECL); \
-    __m128i lo, hi; \
-    __m128i tmp8, tmp9; \
-    __m128i H0 = H0_; \
-    __m128i H1 = H1_; \
-    __m128i H2 = H2_; \
-    __m128i H3 = H3_; \
-    __m128i X0 = X0_; \
-    __m128i X1 = X1_; \
-    __m128i X2 = X2_; \
-    __m128i X3 = X3_; \
-\
-/* byte-revert the inputs & xor the first one into the accumulator */ \
-\
-    MAKE4(RED_SHUFFLE); \
-    X3 = _mm_xor_si128(X3, accv); \
-\
-/* 4 low H*X (x0*h0) */ \
-\
-    MAKE4(RED_MUL_LOW); \
-    lo = _mm_xor_si128(H0_X0_lo, H1_X1_lo); \
-    lo = _mm_xor_si128(lo, H2_X2_lo); \
-    lo = _mm_xor_si128(lo, H3_X3_lo); \
-\
-/* 4 high H*X (x1*h1) */ \
-\
-    MAKE4(RED_MUL_HIGH); \
-    hi = _mm_xor_si128(H0_X0_hi, H1_X1_hi); \
-    hi = _mm_xor_si128(hi, H2_X2_hi); \
-    hi = _mm_xor_si128(hi, H3_X3_hi); \
-\
-/* 4 middle H*X, using Karatsuba, i.e. \
-     x1*h0+x0*h1 =(x1+x0)*(h1+h0)-x1*h1-x0*h0 \
-     we already have all x1y1 & x0y0 (accumulated in hi & lo) \
-     (0 is low half and 1 is high half) \
-  */ \
-/* permute the high and low 64 bits in H1 & X1, \
-     so create (h0,h1) from (h1,h0) and (x0,x1) from (x1,x0), \
-     then compute (h0+h1,h1+h0) and (x0+x1,x1+x0), \
-     and finally multiply \
-  */ \
-    MAKE4(RED_MUL_MID); \
-\
-    /* subtracts x1*h1 and x0*h0 */ \
-    tmp0 = _mm_xor_si128(tmp0, lo); \
-    tmp0 = _mm_xor_si128(tmp0, hi); \
-    tmp0 = _mm_xor_si128(tmp1, tmp0); \
-    tmp0 = _mm_xor_si128(tmp2, tmp0); \
-    tmp0 = _mm_xor_si128(tmp3, tmp0);\
-\
-    /* reduction */ \
-    tmp0B = _mm_slli_si128(tmp0, 8); \
-    tmp0 = _mm_srli_si128(tmp0, 8); \
-    lo = _mm_xor_si128(tmp0B, lo); \
-    hi = _mm_xor_si128(tmp0, hi); \
-    tmp3 = lo; \
-    tmp2B = hi; \
-    tmp3B = _mm_srli_epi32(tmp3, 31); \
-    tmp8 = _mm_srli_epi32(tmp2B, 31); \
-    tmp3 = _mm_slli_epi32(tmp3, 1); \
-    tmp2B = _mm_slli_epi32(tmp2B, 1); \
-    tmp9 = _mm_srli_si128(tmp3B, 12); \
-    tmp8 = _mm_slli_si128(tmp8, 4); \
-    tmp3B = _mm_slli_si128(tmp3B, 4); \
-    tmp3 = _mm_or_si128(tmp3, tmp3B); \
-    tmp2B = _mm_or_si128(tmp2B, tmp8); \
-    tmp2B = _mm_or_si128(tmp2B, tmp9); \
-    tmp3B = _mm_slli_epi32(tmp3, 31); \
-    tmp8 = _mm_slli_epi32(tmp3, 30); \
-    tmp9 = _mm_slli_epi32(tmp3, 25); \
-    tmp3B = _mm_xor_si128(tmp3B, tmp8); \
-    tmp3B = _mm_xor_si128(tmp3B, tmp9); \
-    tmp8 = _mm_srli_si128(tmp3B, 4); \
-    tmp3B = _mm_slli_si128(tmp3B, 12); \
-    tmp3 = _mm_xor_si128(tmp3, tmp3B); \
-    tmp2 = _mm_srli_epi32(tmp3, 1); \
-    tmp0B = _mm_srli_epi32(tmp3, 2); \
-    tmp1B = _mm_srli_epi32(tmp3, 7); \
-    tmp2 = _mm_xor_si128(tmp2, tmp0B); \
-    tmp2 = _mm_xor_si128(tmp2, tmp1B); \
-    tmp2 = _mm_xor_si128(tmp2, tmp8); \
-    tmp3 = _mm_xor_si128(tmp3, tmp2); \
-    tmp2B = _mm_xor_si128(tmp2B, tmp3); \
-\
-    accv = tmp2B; \
-} while(0)
+    return (I256) {
+        SODIUM_C99(.hi =) r_hi,
+        SODIUM_C99(.lo =) r_lo,
+        SODIUM_C99(.mid =) r_mid,
+    };
+#else
+    const BlockVec r_hi  = CLMULHI128(x, y);
+    const BlockVec r_lo  = CLMULLO128(x, y);
+    const BlockVec r_mid = XOR128(CLMULHILO128(x, y), CLMULLOHI128(x, y));
 
-#define XORx(a)                                                       \
-        temp##a = _mm_xor_si128(temp##a,                              \
-                                _mm_loadu_si128((const __m128i *) (in + a * 16)))
+    return (I256) {
+        SODIUM_C99(.hi =) r_hi,
+        SODIUM_C99(.lo =) r_lo,
+        SODIUM_C99(.mid =) r_mid,
+    };
+#endif
+}
 
-#define LOADx(a)                                                      \
-    __m128i in##a = _mm_loadu_si128((const __m128i *) (in + a * 16))
+/* Merge the middle word and reduce a field element */
 
-/* full encrypt & checksum 8 blocks at once */
-#define aesni_encrypt8full(out_, n_, rkeys, in_, accum, hv_, h2v_, h3v_, h4v_, rev) \
-do { \
-    unsigned char *out = out_; \
-    uint32_t *n = n_; \
-    const unsigned char *in = in_; \
-    const __m128i hv = hv_; \
-    const __m128i h2v = h2v_; \
-    const __m128i h3v = h3v_; \
-    const __m128i h4v = h4v_; \
-    const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-    __m128i       accv_; \
-    int           roundctr; \
-    \
-    MAKE8(NVDECLx); \
-    MAKE8(TEMPDECLx); \
-    MAKE8(NVx); \
-    MAKE8(TEMPx); \
-    for (roundctr = 1; roundctr < 14; roundctr++) { \
-        MAKE8(AESENCx); \
-    } \
-    MAKE8(AESENCLASTx); \
-    MAKE8(XORx); \
-    MAKE8(STOREx); \
-    accv_ = _mm_load_si128((const __m128i *) accum); \
-    MULREDUCE4(rev, hv, h2v, h3v, h4v, temp3, temp2, temp1, temp0, accv_); \
-    MULREDUCE4(rev, hv, h2v, h3v, h4v, temp7, temp6, temp5, temp4, accv_); \
-    _mm_store_si128((__m128i *) accum, accv_); \
-} while(0)
+static inline BlockVec __vectorcall gcm_reduce(const I256 x)
+{
+    const BlockVec hi = XOR128(x.hi, BYTESHR128(x.mid, 8));
+    const BlockVec lo = XOR128(x.lo, BYTESHL128(x.mid, 8));
 
-/* checksum 8 blocks at once */
-#define aesni_addmul8full(in_, accum, hv_, h2v_, h3v_, h4v_, rev) \
-do { \
-    const unsigned char *in = in_; \
-    const __m128i hv = hv_; \
-    const __m128i h2v = h2v_; \
-    const __m128i h3v = h3v_; \
-    const __m128i h4v = h4v_; \
-    __m128i accv_; \
-    \
-    MAKE8(LOADx); \
-    accv_ = _mm_load_si128((const __m128i *) accum); \
-    MULREDUCE4(rev, hv, h2v, h3v, h4v, in3, in2, in1, in0, accv_); \
-    MULREDUCE4(rev, hv, h2v, h3v, h4v, in7, in6, in5, in4, accv_); \
-    _mm_store_si128((__m128i *) accum, accv_); \
-} while(0)
+    const BlockVec p64 = SET64x2(0, 0xc200000000000000);
+    const BlockVec a   = CLMULLO128(lo, p64);
+    const BlockVec b   = XOR128(SHUFFLE32x4(lo, 2, 3, 0, 1), a);
+    const BlockVec c   = CLMULLO128(b, p64);
+    const BlockVec d   = XOR128(SHUFFLE32x4(b, 2, 3, 0, 1), c);
 
-/* decrypt 8 blocks at once */
-#define aesni_decrypt8full(out_, n_, rkeys, in_) \
-do { \
-    unsigned char       *out = out_; \
-    uint32_t            *n = n_; \
-    const unsigned char *in = in_; \
-    const __m128i        pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-    int                  roundctr; \
-\
-    MAKE8(NVDECLx); \
-    MAKE8(TEMPDECLx); \
-    MAKE8(NVx); \
-    MAKE8(TEMPx); \
-    for (roundctr = 1; roundctr < 14; roundctr++) { \
-        MAKE8(AESENCx); \
-    } \
-    MAKE8(AESENCLASTx); \
-    MAKE8(XORx); \
-    MAKE8(STOREx); \
-} while(0)
+    return XOR128(d, hi);
+}
+
+/* Precompute powers of H from `from` to `to` */
+
+static inline void __vectorcall precomp(Precomp hx[PC_COUNT], const size_t from, const size_t to)
+{
+    const Precomp h = hx[0];
+    size_t        i;
+
+    for (i = from & ~1U; i < to; i += 2) {
+        hx[i]     = gcm_reduce(clmul128(hx[i - 1], h));
+        hx[i + 1] = gcm_reduce(clsq128(hx[i / 2]));
+    }
+}
+
+/* Precompute powers of H given a key and a block count */
+
+static void __vectorcall precomp_for_block_count(Precomp             hx[PC_COUNT],
+                                                 const unsigned char gh_key[16],
+                                                 const size_t        block_count)
+{
+    const BlockVec h0    = REV128(LOAD128(gh_key));
+    BlockVec       carry = SET64x2(0xc200000000000000, 1);
+    BlockVec       mask  = SUB64x2(ZERO128, SHR64x2(h0, 63));
+    BlockVec       h0_shifted;
+    BlockVec       h;
+
+    mask       = SHUFFLE32x4(mask, 3, 3, 3, 3);
+    carry      = AND128(carry, mask);
+    h0_shifted = SHL128(h0, 1);
+    h          = XOR128(h0_shifted, carry);
+
+    hx[0] = h;
+    hx[1] = gcm_reduce(clsq128(hx[0]));
+
+    if (block_count >= PC_COUNT) {
+        precomp(hx, 2, PC_COUNT);
+    } else {
+        precomp(hx, 2, block_count);
+    }
+}
+
+/* Initialize a GHash */
+
+static inline void
+gh_init(GHash *sth)
+{
+    sth->acc = ZERO128;
+}
+
+static inline I256 __vectorcall gh_update0(const GHash *const sth, const unsigned char *const p,
+                                           const Precomp hn)
+{
+    const BlockVec m = REV128(LOAD128(p));
+    return clmul128(XOR128(sth->acc, m), hn);
+}
+
+static inline void __vectorcall gh_update(I256 *const u, const unsigned char *p, const Precomp hn)
+{
+    const BlockVec m = REV128(LOAD128(p));
+    const I256     t = clmul128(m, hn);
+    *u = (I256) { SODIUM_C99(.hi =) XOR128(u->hi, t.hi), SODIUM_C99(.lo =) XOR128(u->lo, t.lo),
+                  SODIUM_C99(.mid =) XOR128(u->mid, t.mid) };
+}
+
+/* Absorb ad_len bytes of associated data. There has to be no partial block. */
+
+static inline void
+gh_ad_blocks(const State *st, GHash *sth, const unsigned char *ad, size_t ad_len)
+{
+    size_t i;
+
+    i = (size_t) 0U;
+    for (; i + PC_COUNT * 16 <= ad_len; i += PC_COUNT * 16) {
+        I256   u = gh_update0(sth, ad + i, st->hx[PC_COUNT - 1 - 0]);
+        size_t j;
+
+        for (j = 1; j < PC_COUNT; j += 1) {
+            gh_update(&u, ad + i + j * 16, st->hx[PC_COUNT - 1 - j]);
+        }
+        sth->acc = gcm_reduce(u);
+    }
+    for (; i + PC_COUNT * 16 / 2 <= ad_len; i += PC_COUNT * 16 / 2) {
+        I256   u = gh_update0(sth, ad + i, st->hx[PC_COUNT / 2 - 1 - 0]);
+        size_t j;
+
+        for (j = 1; j < PC_COUNT / 2; j += 1) {
+            gh_update(&u, ad + i + j * 16, st->hx[PC_COUNT / 2 - 1 - j]);
+        }
+        sth->acc = gcm_reduce(u);
+    }
+    for (; i + 4 * 16 <= ad_len; i += 4 * 16) {
+        size_t j;
+        I256   u = gh_update0(sth, ad + i, st->hx[4 - 1 - 0]);
+
+        for (j = 1; j < 4; j += 1) {
+            gh_update(&u, ad + i + j * 16, st->hx[4 - 1 - j]);
+        }
+        sth->acc = gcm_reduce(u);
+    }
+    for (; i + 2 * 16 <= ad_len; i += 2 * 16) {
+        size_t j;
+        I256   u = gh_update0(sth, ad + i, st->hx[2 - 1 - 0]);
+
+        for (j = 1; j < 2; j += 1) {
+            gh_update(&u, ad + i + j * 16, st->hx[2 - 1 - j]);
+        }
+        sth->acc = gcm_reduce(u);
+    }
+    if (i < ad_len) {
+        I256 u   = gh_update0(sth, ad + i, st->hx[0]);
+        sth->acc = gcm_reduce(u);
+    }
+}
+
+/* Increment counters */
+
+static inline BlockVec __vectorcall incr_counters(BlockVec rev_counters[], BlockVec counter,
+                                                  const size_t n)
+{
+    size_t i;
+
+    const BlockVec one = ONE128;
+    for (i = 0; i < n; i++) {
+        rev_counters[i] = REV128(counter);
+        counter         = ADD64x2(counter, one);
+    }
+    return counter;
+}
+
+/* Compute the number of required blocks to encrypt and authenticate `ad_len` of associated data,
+ * and `m_len` of encrypted bytes. Return `0` if limits would be exceeded.*/
+
+static inline size_t
+required_blocks(const size_t ad_len, const size_t m_len)
+{
+    const size_t ad_blocks = (ad_len + 15) / 16;
+    const size_t m_blocks  = (m_len + 15) / 16;
+
+    if (ad_len > SIZE_MAX - 2 * PARALLEL_BLOCKS * 16 ||
+        m_len > SIZE_MAX - 2 * PARALLEL_BLOCKS * 16 || ad_len < ad_blocks || m_len < m_blocks ||
+        m_blocks >= (1ULL << 32) - 2) {
+        return 0;
+    }
+    return ad_blocks + m_blocks + 1;
+}
+
+/* Generic AES-GCM encryption. "Generic" as it can handle arbitrary input sizes,
+unlike a length-limited version that would precompute all the required powers of H */
+
+static void
+aes_gcm_encrypt_generic(const State *st, GHash *sth, unsigned char mac[ABYTES], unsigned char *dst,
+                        const unsigned char *src, size_t src_len, const unsigned char *ad,
+                        size_t ad_len, unsigned char counter_[16])
+{
+    CRYPTO_ALIGN(32) I256          u;
+    CRYPTO_ALIGN(16) unsigned char last_blocks[2 * 16];
+    const BlockVec                 one = ONE128;
+    BlockVec                       final_block;
+    BlockVec                       rev_counters[PARALLEL_BLOCKS];
+    BlockVec                       counter;
+    size_t                         i;
+    size_t                         j;
+    size_t                         left;
+    size_t                         pi;
+
+    COMPILER_ASSERT(PC_COUNT % PARALLEL_BLOCKS == 0);
+
+    /* Associated data */
+
+    if (ad != NULL && ad_len != 0) {
+        gh_ad_blocks(st, sth, ad, ad_len & ~15);
+        left = ad_len & 15;
+        if (left != 0) {
+            unsigned char pad[16];
+
+            memset(pad, 0, sizeof pad);
+            memcpy(pad, ad + ad_len - left, left);
+            gh_ad_blocks(st, sth, pad, sizeof pad);
+        }
+    }
+
+    /* Encrypted data */
+
+    counter = REV128(LOAD128(counter_));
+    i       = 0;
+
+    /* 2*PARALLEL_BLOCKS aggregation */
+
+    if (src_len - i >= 2 * PARALLEL_BLOCKS * 16) {
+        counter = incr_counters(rev_counters, counter, PARALLEL_BLOCKS);
+        encrypt_xor_wide(st, dst + i, src + i, rev_counters);
+        i += PARALLEL_BLOCKS * 16;
+
+        for (; i + 2 * PARALLEL_BLOCKS * 16 <= src_len; i += 2 * PARALLEL_BLOCKS * 16) {
+            counter = incr_counters(rev_counters, counter, PARALLEL_BLOCKS);
+            encrypt_xor_wide(st, dst + i, src + i, rev_counters);
+
+            pi = i - PARALLEL_BLOCKS * 16;
+            u  = gh_update0(sth, dst + pi, st->hx[2 * PARALLEL_BLOCKS - 1 - 0]);
+            for (j = 1; j < PARALLEL_BLOCKS; j += 1) {
+                gh_update(&u, dst + pi + j * 16, st->hx[2 * PARALLEL_BLOCKS - 1 - j]);
+            }
+
+            counter = incr_counters(rev_counters, counter, PARALLEL_BLOCKS);
+            encrypt_xor_wide(st, dst + i + PARALLEL_BLOCKS * 16, src + i + PARALLEL_BLOCKS * 16,
+                             rev_counters);
+
+            pi = i;
+            for (j = 0; j < PARALLEL_BLOCKS; j += 1) {
+                gh_update(&u, dst + pi + j * 16, st->hx[PARALLEL_BLOCKS - 1 - j]);
+            }
+            sth->acc = gcm_reduce(u);
+        }
+
+        pi = i - PARALLEL_BLOCKS * 16;
+        u  = gh_update0(sth, dst + pi, st->hx[PARALLEL_BLOCKS - 1 - 0]);
+        for (j = 1; j < PARALLEL_BLOCKS; j += 1) {
+            gh_update(&u, dst + pi + j * 16, st->hx[PARALLEL_BLOCKS - 1 - j]);
+        }
+        sth->acc = gcm_reduce(u);
+    }
+
+    /* PARALLEL_BLOCKS aggregation */
+
+    if (src_len - i >= PARALLEL_BLOCKS * 16) {
+        counter = incr_counters(rev_counters, counter, PARALLEL_BLOCKS);
+        encrypt_xor_wide(st, dst + i, src + i, rev_counters);
+        i += PARALLEL_BLOCKS * 16;
+
+        for (; i + PARALLEL_BLOCKS * 16 <= src_len; i += PARALLEL_BLOCKS * 16) {
+            counter = incr_counters(rev_counters, counter, PARALLEL_BLOCKS);
+            encrypt_xor_wide(st, dst + i, src + i, rev_counters);
+
+            pi = i - PARALLEL_BLOCKS * 16;
+            u  = gh_update0(sth, dst + pi, st->hx[PARALLEL_BLOCKS - 1 - 0]);
+            for (j = 1; j < PARALLEL_BLOCKS; j += 1) {
+                gh_update(&u, dst + pi + j * 16, st->hx[PARALLEL_BLOCKS - 1 - j]);
+            }
+            sth->acc = gcm_reduce(u);
+        }
+
+        pi = i - PARALLEL_BLOCKS * 16;
+        u  = gh_update0(sth, dst + pi, st->hx[PARALLEL_BLOCKS - 1 - 0]);
+        for (j = 1; j < PARALLEL_BLOCKS; j += 1) {
+            gh_update(&u, dst + pi + j * 16, st->hx[PARALLEL_BLOCKS - 1 - j]);
+        }
+        sth->acc = gcm_reduce(u);
+    }
+
+    /* 4-blocks aggregation */
+
+    for (; i + 4 * 16 <= src_len; i += 4 * 16) {
+        counter = incr_counters(rev_counters, counter, 4);
+        for (j = 0; j < 4; j++) {
+            encrypt_xor_block(st, dst + i + j * 16, src + i + j * 16, rev_counters[j]);
+        }
+
+        u = gh_update0(sth, dst + i, st->hx[4 - 1 - 0]);
+        for (j = 1; j < 4; j += 1) {
+            gh_update(&u, dst + i + j * 16, st->hx[4 - 1 - j]);
+        }
+        sth->acc = gcm_reduce(u);
+    }
+
+    /* 2-blocks aggregation */
+
+    for (; i + 2 * 16 <= src_len; i += 2 * 16) {
+        counter = incr_counters(rev_counters, counter, 2);
+        for (j = 0; j < 2; j++) {
+            encrypt_xor_block(st, dst + i + j * 16, src + i + j * 16, rev_counters[j]);
+        }
+
+        u = gh_update0(sth, dst + i, st->hx[2 - 1 - 0]);
+        for (j = 1; j < 2; j += 1) {
+            gh_update(&u, dst + i + j * 16, st->hx[2 - 1 - j]);
+        }
+        sth->acc = gcm_reduce(u);
+    }
+
+    /* Remaining *partial* blocks; if we have 16 bytes left, we want to keep the
+    full block authenticated along with the final block, hence < and not <= */
+
+    for (; i + 16 < src_len; i += 16) {
+        encrypt_xor_block(st, dst + i, src + i, REV128(counter));
+        u        = gh_update0(sth, dst + i, st->hx[1 - 1 - 0]);
+        sth->acc = gcm_reduce(u);
+        counter  = ADD64x2(counter, one);
+    }
+
+    /* Authenticate both the last block of the message and the final block */
+
+    final_block = REV128(SET64x2(ad_len * 8, src_len * 8));
+    STORE32_BE(counter_ + NPUBBYTES, 1);
+    encrypt(st, mac, counter_);
+    left = src_len - i;
+    if (left != 0) {
+        for (j = 0; j < left; j++) {
+            last_blocks[j] = src[i + j];
+        }
+        STORE128(last_blocks + 16, final_block);
+        encrypt_xor_block(st, last_blocks, last_blocks, REV128(counter));
+        for (; j < 16; j++) {
+            last_blocks[j] = 0;
+        }
+        for (j = 0; j < left; j++) {
+            dst[i + j] = last_blocks[j];
+        }
+        gh_ad_blocks(st, sth, last_blocks, 32);
+    } else {
+        STORE128(last_blocks, final_block);
+        gh_ad_blocks(st, sth, last_blocks, 16);
+    }
+    STORE128(mac, XOR128(LOAD128(mac), REV128(sth->acc)));
+}
+
+/* Generic AES-GCM decryption. "Generic" as it can handle arbitrary input sizes,
+unlike a length-limited version that would precompute all the required powers of H */
+
+static void
+aes_gcm_decrypt_generic(const State *st, GHash *sth, unsigned char mac[ABYTES], unsigned char *dst,
+                        const unsigned char *src, size_t src_len, const unsigned char *ad,
+                        size_t ad_len, unsigned char counter_[16])
+{
+    CRYPTO_ALIGN(32) I256          u;
+    CRYPTO_ALIGN(16) unsigned char last_blocks[2 * 16];
+    const BlockVec                 one = ONE128;
+    BlockVec                       final_block;
+    BlockVec                       rev_counters[PARALLEL_BLOCKS];
+    BlockVec                       counter;
+    size_t                         i;
+    size_t                         j;
+    size_t                         left;
+
+    COMPILER_ASSERT(PC_COUNT % PARALLEL_BLOCKS == 0);
+
+    /* Associated data */
+
+    if (ad != NULL && ad_len != 0) {
+        gh_ad_blocks(st, sth, ad, ad_len & ~15);
+        left = ad_len & 15;
+        if (left != 0) {
+            unsigned char pad[16];
+
+            memset(pad, 0, sizeof pad);
+            memcpy(pad, ad + ad_len - left, left);
+            gh_ad_blocks(st, sth, pad, sizeof pad);
+        }
+    }
+
+    /* Encrypted data */
+
+    counter = REV128(LOAD128(counter_));
+    i       = 0;
+
+    /* 2*PARALLEL_BLOCKS aggregation */
+
+    for (; i + 2 * PARALLEL_BLOCKS * 16 <= src_len; i += 2 * PARALLEL_BLOCKS * 16) {
+        counter = incr_counters(rev_counters, counter, PARALLEL_BLOCKS);
+
+        u = gh_update0(sth, src + i, st->hx[2 * PARALLEL_BLOCKS - 1 - 0]);
+        for (j = 1; j < PARALLEL_BLOCKS; j += 1) {
+            gh_update(&u, src + i + j * 16, st->hx[2 * PARALLEL_BLOCKS - 1 - j]);
+        }
+
+        encrypt_xor_wide(st, dst + i, src + i, rev_counters);
+
+        counter = incr_counters(rev_counters, counter, PARALLEL_BLOCKS);
+
+        for (j = 0; j < PARALLEL_BLOCKS; j += 1) {
+            gh_update(&u, src + i + j * 16, st->hx[PARALLEL_BLOCKS - 1 - j]);
+        }
+        sth->acc = gcm_reduce(u);
+
+        encrypt_xor_wide(st, dst + i + PARALLEL_BLOCKS * 16, src + i + PARALLEL_BLOCKS * 16,
+                         rev_counters);
+    }
+
+    /* PARALLEL_BLOCKS aggregation */
+
+    for (; i + PARALLEL_BLOCKS * 16 <= src_len; i += PARALLEL_BLOCKS * 16) {
+        counter = incr_counters(rev_counters, counter, PARALLEL_BLOCKS);
+
+        u = gh_update0(sth, src + i, st->hx[PARALLEL_BLOCKS - 1 - 0]);
+        for (j = 1; j < PARALLEL_BLOCKS; j += 1) {
+            gh_update(&u, src + i + j * 16, st->hx[PARALLEL_BLOCKS - 1 - j]);
+        }
+        sth->acc = gcm_reduce(u);
+
+        encrypt_xor_wide(st, dst + i, src + i, rev_counters);
+    }
+
+    /* 4-blocks aggregation */
+
+    for (; i + 4 * 16 <= src_len; i += 4 * 16) {
+        counter = incr_counters(rev_counters, counter, 4);
+
+        u = gh_update0(sth, src + i, st->hx[4 - 1 - 0]);
+        for (j = 1; j < 4; j += 1) {
+            gh_update(&u, src + i + j * 16, st->hx[4 - 1 - j]);
+        }
+        sth->acc = gcm_reduce(u);
+
+        for (j = 0; j < 4; j++) {
+            encrypt_xor_block(st, dst + i + j * 16, src + i + j * 16, rev_counters[j]);
+        }
+    }
+
+    /* 2-blocks aggregation */
+
+    for (; i + 2 * 16 <= src_len; i += 2 * 16) {
+        counter = incr_counters(rev_counters, counter, 2);
+
+        u = gh_update0(sth, src + i, st->hx[2 - 1 - 0]);
+        for (j = 1; j < 2; j += 1) {
+            gh_update(&u, src + i + j * 16, st->hx[2 - 1 - j]);
+        }
+        sth->acc = gcm_reduce(u);
+
+        for (j = 0; j < 2; j++) {
+            encrypt_xor_block(st, dst + i + j * 16, src + i + j * 16, rev_counters[j]);
+        }
+    }
+
+    /* Remaining *partial* blocks; if we have 16 bytes left, we want to keep the
+    full block authenticated along with the final block, hence < and not <= */
+
+    for (; i + 16 < src_len; i += 16) {
+        u        = gh_update0(sth, src + i, st->hx[1 - 1 - 0]);
+        sth->acc = gcm_reduce(u);
+        encrypt_xor_block(st, dst + i, src + i, REV128(counter));
+        counter = ADD64x2(counter, one);
+    }
+
+    /* Authenticate both the last block of the message and the final block */
+
+    final_block = REV128(SET64x2(ad_len * 8, src_len * 8));
+    STORE32_BE(counter_ + NPUBBYTES, 1);
+    encrypt(st, mac, counter_);
+    left = src_len - i;
+    if (left != 0) {
+        for (j = 0; j < left; j++) {
+            last_blocks[j] = src[i + j];
+        }
+        for (; j < 16; j++) {
+            last_blocks[j] = 0;
+        }
+        STORE128(last_blocks + 16, final_block);
+        gh_ad_blocks(st, sth, last_blocks, 32);
+        encrypt_xor_block(st, last_blocks, last_blocks, REV128(counter));
+        for (j = 0; j < left; j++) {
+            dst[i + j] = last_blocks[j];
+        }
+    } else {
+        STORE128(last_blocks, final_block);
+        gh_ad_blocks(st, sth, last_blocks, 16);
+    }
+    STORE128(mac, XOR128(LOAD128(mac), REV128(sth->acc)));
+}
 
 int
-crypto_aead_aes256gcm_beforenm(crypto_aead_aes256gcm_state *ctx_,
-                               const unsigned char *k)
+crypto_aead_aes256gcm_beforenm(crypto_aead_aes256gcm_state *st_, const unsigned char *k)
 {
-    aes256gcm_state *ctx = (aes256gcm_state *) (void *) ctx_;
-    unsigned char   *H = ctx->H;
-    __m128i         *rkeys = ctx->rkeys;
-    __m128i          zero = _mm_setzero_si128();
+    State                         *st = (State *) (void *) st_;
+    CRYPTO_ALIGN(16) unsigned char h[16];
 
-    COMPILER_ASSERT((sizeof *ctx_) >= (sizeof *ctx));
-    aesni_key256_expand(k, rkeys);
-    aesni_encrypt1(H, zero, rkeys);
+    COMPILER_ASSERT(sizeof *st_ >= sizeof *st);
+
+    expand256(k, st->rkeys);
+    memset(h, 0, sizeof h);
+    encrypt(st, h, h);
+
+    precomp_for_block_count(st->hx, h, PC_COUNT);
 
     return 0;
 }
 
 int
-crypto_aead_aes256gcm_encrypt_detached_afternm(unsigned char *c,
-                                               unsigned char *mac, unsigned long long *maclen_p,
-                                               const unsigned char *m, unsigned long long mlen,
-                                               const unsigned char *ad, unsigned long long adlen,
-                                               const unsigned char *nsec,
-                                               const unsigned char *npub,
-                                               const crypto_aead_aes256gcm_state *ctx_)
+crypto_aead_aes256gcm_encrypt_detached_afternm(unsigned char *c, unsigned char *mac,
+                                               unsigned long long *maclen_p, const unsigned char *m,
+                                               unsigned long long m_len_, const unsigned char *ad,
+                                               unsigned long long   ad_len_,
+                                               const unsigned char *nsec, const unsigned char *npub,
+                                               const crypto_aead_aes256gcm_state *st_)
 {
-    const __m128i          rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-    const aes256gcm_state *ctx = (const aes256gcm_state *) (const void *) ctx_;
-    const __m128i         *rkeys = ctx->rkeys;
-    __m128i                Hv, H2v, H3v, H4v, accv;
-    unsigned long long     i, j;
-    unsigned long long     adlen_rnd64 = adlen & ~63ULL;
-    unsigned long long     mlen_rnd128 = mlen & ~127ULL;
-    CRYPTO_ALIGN(16) uint32_t      n2[4];
-    CRYPTO_ALIGN(16) unsigned char H[16];
-    CRYPTO_ALIGN(16) unsigned char T[16];
-    CRYPTO_ALIGN(16) unsigned char accum[16];
-    CRYPTO_ALIGN(16) unsigned char fb[16];
+    const State                   *st = (const State *) (const void *) st_;
+    GHash                          sth;
+    CRYPTO_ALIGN(16) unsigned char j[16];
+    size_t                         gh_required_blocks;
+    const size_t                   ad_len = (size_t) ad_len_;
+    const size_t                   m_len  = (size_t) m_len_;
 
     (void) nsec;
-    memcpy(H, ctx->H, sizeof H);
-    if (mlen > crypto_aead_aes256gcm_MESSAGEBYTES_MAX) {
-        sodium_misuse(); /* LCOV_EXCL_LINE */
-    }
-    memcpy(&n2[0], npub, 3 * 4);
-    n2[3] = 0x01000000;
-    aesni_encrypt1(T, _mm_load_si128((const __m128i *) n2), rkeys);
-    {
-        uint64_t x;
-        x = _bswap64((uint64_t) (8 * adlen));
-        memcpy(&fb[0], &x, sizeof x);
-        x = _bswap64((uint64_t) (8 * mlen));
-        memcpy(&fb[8], &x, sizeof x);
-    }
-    /* we store H (and it's power) byte-reverted once and for all */
-    Hv = _mm_shuffle_epi8(_mm_load_si128((const __m128i *) H), rev);
-    _mm_store_si128((__m128i *) H, Hv);
-    H2v = mulv(Hv, Hv);
-    H3v = mulv(H2v, Hv);
-    H4v = mulv(H3v, Hv);
-
-    accv = _mm_setzero_si128();
-    /* unrolled by 4 GCM (by 8 doesn't improve using MULREDUCE4) */
-    for (i = 0; i < adlen_rnd64; i += 64) {
-        __m128i X4_ = _mm_loadu_si128((const __m128i *) (ad + i + 0));
-        __m128i X3_ = _mm_loadu_si128((const __m128i *) (ad + i + 16));
-        __m128i X2_ = _mm_loadu_si128((const __m128i *) (ad + i + 32));
-        __m128i X1_ = _mm_loadu_si128((const __m128i *) (ad + i + 48));
-        MULREDUCE4(rev, Hv, H2v, H3v, H4v, X1_, X2_, X3_, X4_, accv);
-    }
-    _mm_store_si128((__m128i *) accum, accv);
-
-    /* GCM remainder loop */
-    for (i = adlen_rnd64; i < adlen; i += 16) {
-        unsigned int blocklen = 16;
-
-        if (i + (unsigned long long) blocklen > adlen) {
-            blocklen = (unsigned int) (adlen - i);
-        }
-        addmul(accum, ad + i, blocklen, H);
-    }
-
-/* this only does 8 full blocks, so no fancy bounds checking is necessary*/
-#define LOOPRND128                                                                                   \
-    do {                                                                                             \
-        const int iter = 8;                                                                          \
-        const int lb = iter * 16;                                                                    \
-                                                                                                     \
-        for (i = 0; i < mlen_rnd128; i += lb) {                                                      \
-            aesni_encrypt8full(c + i, n2, rkeys, m + i, accum, Hv, H2v, H3v, H4v, rev);              \
-        }                                                                                            \
-    } while(0)
-
-/* remainder loop, with the slower GCM update to accommodate partial blocks */
-#define LOOPRMD128                                           \
-    do {                                                     \
-        const int iter = 8;                                  \
-        const int lb = iter * 16;                            \
-                                                             \
-        for (i = mlen_rnd128; i < mlen; i += lb) {           \
-            CRYPTO_ALIGN(16) unsigned char outni[8 * 16];    \
-            unsigned long long mj = lb;                      \
-                                                             \
-            aesni_encrypt8(outni, n2, rkeys);                \
-            if ((i + mj) >= mlen) {                          \
-                mj = mlen - i;                               \
-            }                                                \
-            for (j = 0; j < mj; j++) {                       \
-                c[i + j] = m[i + j] ^ outni[j];              \
-            }                                                \
-            for (j = 0; j < mj; j += 16) {                   \
-                unsigned int bl = 16;                        \
-                                                             \
-                if (j + (unsigned long long) bl >= mj) {     \
-                    bl = (unsigned int) (mj - j);            \
-                }                                            \
-                addmul(accum, c + i + j, bl, H);             \
-            }                                                \
-        }                                                    \
-    } while(0)
-
-    n2[3] &= 0x00ffffff;
-    COUNTER_INC2(n2);
-    LOOPRND128;
-    LOOPRMD128;
-
-    addmul(accum, fb, 16, H);
-
-    for (i = 0; i < 16; ++i) {
-        mac[i] = T[i] ^ accum[15 - i];
-    }
     if (maclen_p != NULL) {
-        *maclen_p = 16;
+        *maclen_p = 0;
+    }
+    if (ad_len_ > SODIUM_SIZE_MAX || m_len_ > SODIUM_SIZE_MAX) {
+        sodium_misuse();
+    }
+    gh_required_blocks = required_blocks(ad_len, m_len);
+    if (gh_required_blocks == 0) {
+        memset(mac, 0xd0, ABYTES);
+        memset(c, 0, m_len);
+        return -1;
+    }
+
+    gh_init(&sth);
+
+    memcpy(j, npub, NPUBBYTES);
+    STORE32_BE(j + NPUBBYTES, 2);
+
+    aes_gcm_encrypt_generic(st, &sth, mac, c, m, m_len, ad, ad_len, j);
+
+    if (maclen_p != NULL) {
+        *maclen_p = ABYTES;
     }
     return 0;
 }
 
+int
+crypto_aead_aes256gcm_encrypt(unsigned char *c, unsigned long long *clen_p, const unsigned char *m,
+                              unsigned long long m_len, const unsigned char *ad,
+                              unsigned long long ad_len, const unsigned char *nsec,
+                              const unsigned char *npub, const unsigned char *k)
+{
+    const int ret = crypto_aead_aes256gcm_encrypt_detached(c, c + m_len, NULL, m, m_len, ad, ad_len,
+                                                           nsec, npub, k);
+    if (clen_p != NULL) {
+        if (ret == 0) {
+            *clen_p = m_len + crypto_aead_aes256gcm_ABYTES;
+        } else {
+            *clen_p = 0;
+        }
+    }
+    return ret;
+}
+
+int
+crypto_aead_aes256gcm_encrypt_detached(unsigned char *c, unsigned char *mac,
+                                       unsigned long long *maclen_p, const unsigned char *m,
+                                       unsigned long long m_len, const unsigned char *ad,
+                                       unsigned long long ad_len, const unsigned char *nsec,
+                                       const unsigned char *npub, const unsigned char *k)
+{
+    CRYPTO_ALIGN(16) crypto_aead_aes256gcm_state st;
+    int                                          ret;
+
+    PREFETCH_WRITE(c);
+    PREFETCH_READ(m);
+    PREFETCH_READ(ad);
+
+    crypto_aead_aes256gcm_beforenm(&st, k);
+    ret = crypto_aead_aes256gcm_encrypt_detached_afternm(c, mac, maclen_p, m, m_len, ad, ad_len,
+                                                         nsec, npub, &st);
+    sodium_memzero(&st, sizeof st);
+
+    return ret;
+}
+
 int
 crypto_aead_aes256gcm_encrypt_afternm(unsigned char *c, unsigned long long *clen_p,
                                       const unsigned char *m, unsigned long long mlen,
                                       const unsigned char *ad, unsigned long long adlen,
-                                      const unsigned char *nsec,
-                                      const unsigned char *npub,
-                                      const crypto_aead_aes256gcm_state *ctx_)
+                                      const unsigned char *nsec, const unsigned char *npub,
+                                      const crypto_aead_aes256gcm_state *st_)
 {
-    int ret = crypto_aead_aes256gcm_encrypt_detached_afternm(c,
-                                                             c + mlen, NULL,
-                                                             m, mlen,
-                                                             ad, adlen,
-                                                             nsec, npub, ctx_);
+    int ret = crypto_aead_aes256gcm_encrypt_detached_afternm(c, c + mlen, NULL, m, mlen, ad, adlen,
+                                                             nsec, npub, st_);
     if (clen_p != NULL) {
         *clen_p = mlen + crypto_aead_aes256gcm_ABYTES;
     }
     return ret;
 }
 
-int
-crypto_aead_aes256gcm_decrypt_detached_afternm(unsigned char *m, unsigned char *nsec,
-                                               const unsigned char *c, unsigned long long clen,
-                                               const unsigned char *mac,
-                                               const unsigned char *ad, unsigned long long adlen,
-                                               const unsigned char *npub,
-                                               const crypto_aead_aes256gcm_state *ctx_)
+static int
+crypto_aead_aes256gcm_verify_mac(unsigned char *nsec, const unsigned char *c,
+                                 unsigned long long c_len_, const unsigned char *mac,
+                                 const unsigned char *ad, unsigned long long ad_len_,
+                                 const unsigned char *npub, const crypto_aead_aes256gcm_state *st_)
 {
-    const __m128i          rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-    const aes256gcm_state *ctx = (const aes256gcm_state *) (const void *) ctx_;
-    const __m128i         *rkeys = ctx->rkeys;
-    __m128i                Hv, H2v, H3v, H4v, accv;
-    unsigned long long     i, j;
-    unsigned long long     adlen_rnd64 = adlen & ~63ULL;
-    unsigned long long     mlen;
-    unsigned long long     mlen_rnd128;
-    CRYPTO_ALIGN(16) uint32_t      n2[4];
-    CRYPTO_ALIGN(16) unsigned char H[16];
-    CRYPTO_ALIGN(16) unsigned char T[16];
-    CRYPTO_ALIGN(16) unsigned char accum[16];
-    CRYPTO_ALIGN(16) unsigned char fb[16];
+    const State                   *st = (const State *) (const void *) st_;
+    GHash                          sth;
+    BlockVec                       final_block;
+    CRYPTO_ALIGN(16) unsigned char j[16];
+    CRYPTO_ALIGN(16) unsigned char computed_mac[16];
+    CRYPTO_ALIGN(16) unsigned char last_block[16];
+    size_t                         gh_required_blocks;
+    size_t                         left;
+    const size_t                   ad_len = (size_t) ad_len_;
+    const size_t                   c_len  = (size_t) c_len_;
+    int                            ret;
 
     (void) nsec;
-    if (clen > crypto_aead_aes256gcm_MESSAGEBYTES_MAX) {
-        sodium_misuse(); /* LCOV_EXCL_LINE */
+    if (ad_len_ > SODIUM_SIZE_MAX || c_len_ > SODIUM_SIZE_MAX) {
+        sodium_misuse();
     }
-    mlen = clen;
-
-    memcpy(&n2[0], npub, 3 * 4);
-    n2[3] = 0x01000000;
-    aesni_encrypt1(T, _mm_load_si128((const __m128i *) n2), rkeys);
-
-    {
-        uint64_t x;
-        x = _bswap64((uint64_t)(8 * adlen));
-        memcpy(&fb[0], &x, sizeof x);
-        x = _bswap64((uint64_t)(8 * mlen));
-        memcpy(&fb[8], &x, sizeof x);
+    gh_required_blocks = required_blocks(ad_len, c_len);
+    if (gh_required_blocks == 0) {
+        return -1;
     }
 
-    memcpy(H, ctx->H, sizeof H);
-    Hv = _mm_shuffle_epi8(_mm_load_si128((const __m128i *) H), rev);
-    _mm_store_si128((__m128i *) H, Hv);
-    H2v = mulv(Hv, Hv);
-    H3v = mulv(H2v, Hv);
-    H4v = mulv(H3v, Hv);
+    gh_init(&sth);
 
-    accv = _mm_setzero_si128();
-    for (i = 0; i < adlen_rnd64; i += 64) {
-        __m128i X4_ = _mm_loadu_si128((const __m128i *) (ad + i + 0));
-        __m128i X3_ = _mm_loadu_si128((const __m128i *) (ad + i + 16));
-        __m128i X2_ = _mm_loadu_si128((const __m128i *) (ad + i + 32));
-        __m128i X1_ = _mm_loadu_si128((const __m128i *) (ad + i + 48));
-        MULREDUCE4(rev, Hv, H2v, H3v, H4v, X1_, X2_, X3_, X4_, accv);
-    }
-    _mm_store_si128((__m128i *) accum, accv);
+    memcpy(j, npub, NPUBBYTES);
+    STORE32_BE(j + NPUBBYTES, 2);
 
-    for (i = adlen_rnd64; i < adlen; i += 16) {
-        unsigned int blocklen = 16;
-        if (i + (unsigned long long) blocklen > adlen) {
-            blocklen = (unsigned int) (adlen - i);
-        }
-        addmul(accum, ad + i, blocklen, H);
+    gh_ad_blocks(st, &sth, ad, ad_len & ~15);
+    left = ad_len & 15;
+    if (left != 0) {
+        unsigned char pad[16];
+
+        memset(pad, 0, sizeof pad);
+        memcpy(pad, ad + ad_len - left, left);
+        gh_ad_blocks(st, &sth, pad, sizeof pad);
     }
 
-    mlen_rnd128 = mlen & ~127ULL;
+    gh_ad_blocks(st, &sth, c, c_len & ~15);
+    left = c_len & 15;
+    if (left != 0) {
+        unsigned char pad[16];
 
-#define LOOPACCUMDRND128                                                                          \
-    do {                                                                                          \
-        const int iter = 8;                                                                       \
-        const int lb = iter * 16;                                                                 \
-        for (i = 0; i < mlen_rnd128; i += lb) {                                                   \
-            aesni_addmul8full(c + i, accum, Hv, H2v, H3v, H4v, rev);                              \
-        }                                                                                         \
-    } while(0)
-
-#define LOOPDRND128                                                                               \
-    do {                                                                                          \
-        const int iter = 8;                                                                       \
-        const int lb = iter * 16;                                                                 \
-                                                                                                  \
-        for (i = 0; i < mlen_rnd128; i += lb) {                                                   \
-            aesni_decrypt8full(m + i, n2, rkeys, c + i);                                          \
-        }                                                                                         \
-    } while(0)
-
-#define LOOPACCUMDRMD128                                     \
-    do {                                                     \
-        const int iter = 8;                                  \
-        const int lb = iter * 16;                            \
-                                                             \
-        for (i = mlen_rnd128; i < mlen; i += lb) {           \
-            unsigned long long mj = lb;                      \
-                                                             \
-            if ((i + mj) >= mlen) {                          \
-                mj = mlen - i;                               \
-            }                                                \
-            for (j = 0; j < mj; j += 16) {                   \
-                unsigned int bl = 16;                        \
-                                                             \
-                if (j + (unsigned long long) bl >= mj) {     \
-                    bl = (unsigned int) (mj - j);            \
-                }                                            \
-                addmul(accum, c + i + j, bl, H);             \
-            }                                                \
-        }                                                    \
-    } while(0)
-
-#define LOOPDRMD128                                          \
-    do {                                                     \
-        const int iter = 8;                                  \
-        const int lb = iter * 16;                            \
-                                                             \
-        for (i = mlen_rnd128; i < mlen; i += lb) {           \
-            CRYPTO_ALIGN(16) unsigned char outni[8 * 16];    \
-            unsigned long long mj = lb;                      \
-                                                             \
-            if ((i + mj) >= mlen) {                          \
-                mj = mlen - i;                               \
-            }                                                \
-            aesni_encrypt8(outni, n2, rkeys);                \
-            for (j = 0; j < mj; j++) {                       \
-                m[i + j] = c[i + j] ^ outni[j];              \
-            }                                                \
-        }                                                    \
-    } while(0)
-
-    n2[3] &= 0x00ffffff;
-
-    COUNTER_INC2(n2);
-    LOOPACCUMDRND128;
-    LOOPACCUMDRMD128;
-    addmul(accum, fb, 16, H);
-    {
-        unsigned char d = 0;
-
-        for (i = 0; i < 16; i++) {
-            d |= (mac[i] ^ (T[i] ^ accum[15 - i]));
-        }
-        if (d != 0) {
-            if (m != NULL) {
-                memset(m, 0, mlen);
-            }
-            return -1;
-        }
-        if (m == NULL) {
-            return 0;
-        }
+        memset(pad, 0, sizeof pad);
+        memcpy(pad, c + c_len - left, left);
+        gh_ad_blocks(st, &sth, pad, sizeof pad);
     }
-    n2[3] = 0U;
-    COUNTER_INC2(n2);
-    LOOPDRND128;
-    LOOPDRMD128;
+    final_block = REV128(SET64x2(ad_len * 8, c_len * 8));
+    STORE32_BE(j + NPUBBYTES, 1);
+    encrypt(st, computed_mac, j);
+    STORE128(last_block, final_block);
+    gh_ad_blocks(st, &sth, last_block, 16);
+    STORE128(computed_mac, XOR128(LOAD128(computed_mac), REV128(sth.acc)));
 
+    ret = crypto_verify_16(mac, computed_mac);
+    sodium_memzero(computed_mac, sizeof computed_mac);
+
+    return ret;
+}
+
+int
+crypto_aead_aes256gcm_decrypt_detached_afternm(unsigned char *m, unsigned char *nsec,
+                                               const unsigned char *c, unsigned long long c_len_,
+                                               const unsigned char *mac, const unsigned char *ad,
+                                               unsigned long long                 ad_len_,
+                                               const unsigned char               *npub,
+                                               const crypto_aead_aes256gcm_state *st_)
+{
+    const State                   *st = (const State *) (const void *) st_;
+    GHash                          sth;
+    CRYPTO_ALIGN(16) unsigned char j[16];
+    unsigned char                  computed_mac[16];
+    size_t                         gh_required_blocks;
+    const size_t                   ad_len = (size_t) ad_len_;
+    const size_t                   c_len  = (size_t) c_len_;
+    const size_t                   m_len  = c_len;
+
+    (void) nsec;
+    if (ad_len_ > SODIUM_SIZE_MAX || c_len_ > SODIUM_SIZE_MAX) {
+        sodium_misuse();
+    }
+    if (m == NULL) {
+        return crypto_aead_aes256gcm_verify_mac(nsec, c, c_len, mac, ad, ad_len, npub, st_);
+    }
+    gh_required_blocks = required_blocks(ad_len, m_len);
+    if (gh_required_blocks == 0) {
+        return -1;
+    }
+
+    gh_init(&sth);
+
+    memcpy(j, npub, NPUBBYTES);
+    STORE32_BE(j + NPUBBYTES, 2);
+
+    aes_gcm_decrypt_generic(st, &sth, computed_mac, m, c, m_len, ad, ad_len, j);
+
+    if (crypto_verify_16(mac, computed_mac) != 0) {
+        sodium_memzero(computed_mac, sizeof computed_mac);
+        memset(m, 0xd0, m_len);
+        return -1;
+    }
     return 0;
 }
 
 int
 crypto_aead_aes256gcm_decrypt_afternm(unsigned char *m, unsigned long long *mlen_p,
-                                      unsigned char *nsec,
-                                      const unsigned char *c, unsigned long long clen,
-                                      const unsigned char *ad, unsigned long long adlen,
-                                      const unsigned char *npub,
-                                      const crypto_aead_aes256gcm_state *ctx_)
+                                      unsigned char *nsec, const unsigned char *c,
+                                      unsigned long long clen, const unsigned char *ad,
+                                      unsigned long long adlen, const unsigned char *npub,
+                                      const crypto_aead_aes256gcm_state *st_)
 {
     unsigned long long mlen = 0ULL;
-    int                ret = -1;
+    int                ret  = -1;
 
-    if (clen >= crypto_aead_aes256gcm_ABYTES) {
-        ret = crypto_aead_aes256gcm_decrypt_detached_afternm
-            (m, nsec, c, clen - crypto_aead_aes256gcm_ABYTES,
-             c + clen - crypto_aead_aes256gcm_ABYTES,
-             ad, adlen, npub, ctx_);
+    if (clen >= ABYTES) {
+        ret = crypto_aead_aes256gcm_decrypt_detached_afternm(
+            m, nsec, c, clen - ABYTES, c + clen - ABYTES, ad, adlen, npub, st_);
     }
     if (mlen_p != NULL) {
         if (ret == 0) {
-            mlen = clen - crypto_aead_aes256gcm_ABYTES;
+            mlen = clen - ABYTES;
         }
         *mlen_p = mlen;
     }
@@ -823,90 +952,42 @@ crypto_aead_aes256gcm_decrypt_afternm(unsigned char *m, unsigned long long *mlen
 }
 
 int
-crypto_aead_aes256gcm_encrypt_detached(unsigned char *c,
-                                       unsigned char *mac,
-                                       unsigned long long *maclen_p,
-                                       const unsigned char *m,
-                                       unsigned long long mlen,
-                                       const unsigned char *ad,
-                                       unsigned long long adlen,
-                                       const unsigned char *nsec,
-                                       const unsigned char *npub,
+crypto_aead_aes256gcm_decrypt_detached(unsigned char *m, unsigned char *nsec,
+                                       const unsigned char *c, unsigned long long clen,
+                                       const unsigned char *mac, const unsigned char *ad,
+                                       unsigned long long adlen, const unsigned char *npub,
                                        const unsigned char *k)
 {
-    CRYPTO_ALIGN(16) crypto_aead_aes256gcm_state ctx;
+    CRYPTO_ALIGN(16) crypto_aead_aes256gcm_state st;
 
-    crypto_aead_aes256gcm_beforenm(&ctx, k);
+    PREFETCH_WRITE(m);
+    PREFETCH_READ(c);
+    PREFETCH_READ(ad);
 
-    return crypto_aead_aes256gcm_encrypt_detached_afternm
-        (c, mac, maclen_p, m, mlen, ad, adlen, nsec, npub,
-            (const crypto_aead_aes256gcm_state *) &ctx);
+    crypto_aead_aes256gcm_beforenm(&st, k);
+
+    return crypto_aead_aes256gcm_decrypt_detached_afternm(
+        m, nsec, c, clen, mac, ad, adlen, npub, (const crypto_aead_aes256gcm_state *) &st);
 }
 
 int
-crypto_aead_aes256gcm_encrypt(unsigned char *c,
-                              unsigned long long *clen_p,
-                              const unsigned char *m,
-                              unsigned long long mlen,
-                              const unsigned char *ad,
-                              unsigned long long adlen,
-                              const unsigned char *nsec,
-                              const unsigned char *npub,
-                              const unsigned char *k)
+crypto_aead_aes256gcm_decrypt(unsigned char *m, unsigned long long *mlen_p, unsigned char *nsec,
+                              const unsigned char *c, unsigned long long clen,
+                              const unsigned char *ad, unsigned long long adlen,
+                              const unsigned char *npub, const unsigned char *k)
 {
-    CRYPTO_ALIGN(16) crypto_aead_aes256gcm_state ctx;
-    int ret;
+    CRYPTO_ALIGN(16) crypto_aead_aes256gcm_state st;
+    int                                          ret;
 
-    crypto_aead_aes256gcm_beforenm(&ctx, k);
+    PREFETCH_WRITE(m);
+    PREFETCH_READ(c);
+    PREFETCH_READ(ad);
 
-    ret = crypto_aead_aes256gcm_encrypt_afternm
-        (c, clen_p, m, mlen, ad, adlen, nsec, npub,
-            (const crypto_aead_aes256gcm_state *) &ctx);
-    sodium_memzero(&ctx, sizeof ctx);
+    crypto_aead_aes256gcm_beforenm(&st, k);
 
-    return ret;
-}
-
-int
-crypto_aead_aes256gcm_decrypt_detached(unsigned char *m,
-                                       unsigned char *nsec,
-                                       const unsigned char *c,
-                                       unsigned long long clen,
-                                       const unsigned char *mac,
-                                       const unsigned char *ad,
-                                       unsigned long long adlen,
-                                       const unsigned char *npub,
-                                       const unsigned char *k)
-{
-    CRYPTO_ALIGN(16) crypto_aead_aes256gcm_state ctx;
-
-    crypto_aead_aes256gcm_beforenm(&ctx, k);
-
-    return crypto_aead_aes256gcm_decrypt_detached_afternm
-        (m, nsec, c, clen, mac, ad, adlen, npub,
-            (const crypto_aead_aes256gcm_state *) &ctx);
-}
-
-int
-crypto_aead_aes256gcm_decrypt(unsigned char *m,
-                              unsigned long long *mlen_p,
-                              unsigned char *nsec,
-                              const unsigned char *c,
-                              unsigned long long clen,
-                              const unsigned char *ad,
-                              unsigned long long adlen,
-                              const unsigned char *npub,
-                              const unsigned char *k)
-{
-    CRYPTO_ALIGN(16) crypto_aead_aes256gcm_state ctx;
-    int ret;
-
-    crypto_aead_aes256gcm_beforenm(&ctx, k);
-
-    ret = crypto_aead_aes256gcm_decrypt_afternm
-        (m, mlen_p, nsec, c, clen, ad, adlen, npub,
-         (const crypto_aead_aes256gcm_state *) &ctx);
-    sodium_memzero(&ctx, sizeof ctx);
+    ret = crypto_aead_aes256gcm_decrypt_afternm(m, mlen_p, nsec, c, clen, ad, adlen, npub,
+                                                (const crypto_aead_aes256gcm_state *) &st);
+    sodium_memzero(&st, sizeof st);
 
     return ret;
 }
@@ -919,16 +1000,36 @@ crypto_aead_aes256gcm_is_available(void)
 
 #else
 
+#ifndef ENOSYS
+#define ENOSYS ENXIO
+#endif
+
 int
-crypto_aead_aes256gcm_encrypt_detached(unsigned char *c,
-                                       unsigned char *mac,
-                                       unsigned long long *maclen_p,
-                                       const unsigned char *m,
-                                       unsigned long long mlen,
-                                       const unsigned char *ad,
-                                       unsigned long long adlen,
-                                       const unsigned char *nsec,
-                                       const unsigned char *npub,
+crypto_aead_aes256gcm_encrypt_detached(unsigned char *c, unsigned char *mac,
+                                       unsigned long long *maclen_p, const unsigned char *m,
+                                       unsigned long long mlen, const unsigned char *ad,
+                                       unsigned long long adlen, const unsigned char *nsec,
+                                       const unsigned char *npub, const unsigned char *k)
+{
+    errno = ENOSYS;
+    return -1;
+}
+
+int
+crypto_aead_aes256gcm_encrypt(unsigned char *c, unsigned long long *clen_p, const unsigned char *m,
+                              unsigned long long mlen, const unsigned char *ad,
+                              unsigned long long adlen, const unsigned char *nsec,
+                              const unsigned char *npub, const unsigned char *k)
+{
+    errno = ENOSYS;
+    return -1;
+}
+
+int
+crypto_aead_aes256gcm_decrypt_detached(unsigned char *m, unsigned char *nsec,
+                                       const unsigned char *c, unsigned long long clen,
+                                       const unsigned char *mac, const unsigned char *ad,
+                                       unsigned long long adlen, const unsigned char *npub,
                                        const unsigned char *k)
 {
     errno = ENOSYS;
@@ -936,58 +1037,29 @@ crypto_aead_aes256gcm_encrypt_detached(unsigned char *c,
 }
 
 int
-crypto_aead_aes256gcm_encrypt(unsigned char *c, unsigned long long *clen_p,
-                              const unsigned char *m, unsigned long long mlen,
+crypto_aead_aes256gcm_decrypt(unsigned char *m, unsigned long long *mlen_p, unsigned char *nsec,
+                              const unsigned char *c, unsigned long long clen,
                               const unsigned char *ad, unsigned long long adlen,
-                              const unsigned char *nsec, const unsigned char *npub,
-                              const unsigned char *k)
+                              const unsigned char *npub, const unsigned char *k)
 {
     errno = ENOSYS;
     return -1;
 }
 
 int
-crypto_aead_aes256gcm_decrypt_detached(unsigned char *m,
-                                       unsigned char *nsec,
-                                       const unsigned char *c,
-                                       unsigned long long clen,
-                                       const unsigned char *mac,
-                                       const unsigned char *ad,
-                                       unsigned long long adlen,
-                                       const unsigned char *npub,
-                                       const unsigned char *k)
+crypto_aead_aes256gcm_beforenm(crypto_aead_aes256gcm_state *st_, const unsigned char *k)
 {
     errno = ENOSYS;
     return -1;
 }
 
 int
-crypto_aead_aes256gcm_decrypt(unsigned char *m, unsigned long long *mlen_p,
-                              unsigned char *nsec, const unsigned char *c,
-                              unsigned long long clen, const unsigned char *ad,
-                              unsigned long long adlen, const unsigned char *npub,
-                              const unsigned char *k)
-{
-    errno = ENOSYS;
-    return -1;
-}
-
-int
-crypto_aead_aes256gcm_beforenm(crypto_aead_aes256gcm_state *ctx_,
-                               const unsigned char *k)
-{
-    errno = ENOSYS;
-    return -1;
-}
-
-int
-crypto_aead_aes256gcm_encrypt_detached_afternm(unsigned char *c,
-                                               unsigned char *mac, unsigned long long *maclen_p,
-                                               const unsigned char *m, unsigned long long mlen,
-                                               const unsigned char *ad, unsigned long long adlen,
-                                               const unsigned char *nsec,
-                                               const unsigned char *npub,
-                                               const crypto_aead_aes256gcm_state *ctx_)
+crypto_aead_aes256gcm_encrypt_detached_afternm(unsigned char *c, unsigned char *mac,
+                                               unsigned long long *maclen_p, const unsigned char *m,
+                                               unsigned long long mlen, const unsigned char *ad,
+                                               unsigned long long adlen, const unsigned char *nsec,
+                                               const unsigned char               *npub,
+                                               const crypto_aead_aes256gcm_state *st_)
 {
     errno = ENOSYS;
     return -1;
@@ -998,7 +1070,7 @@ crypto_aead_aes256gcm_encrypt_afternm(unsigned char *c, unsigned long long *clen
                                       const unsigned char *m, unsigned long long mlen,
                                       const unsigned char *ad, unsigned long long adlen,
                                       const unsigned char *nsec, const unsigned char *npub,
-                                      const crypto_aead_aes256gcm_state *ctx_)
+                                      const crypto_aead_aes256gcm_state *st_)
 {
     errno = ENOSYS;
     return -1;
@@ -1007,10 +1079,9 @@ crypto_aead_aes256gcm_encrypt_afternm(unsigned char *c, unsigned long long *clen
 int
 crypto_aead_aes256gcm_decrypt_detached_afternm(unsigned char *m, unsigned char *nsec,
                                                const unsigned char *c, unsigned long long clen,
-                                               const unsigned char *mac,
-                                               const unsigned char *ad, unsigned long long adlen,
-                                               const unsigned char *npub,
-                                               const crypto_aead_aes256gcm_state *ctx_)
+                                               const unsigned char *mac, const unsigned char *ad,
+                                               unsigned long long adlen, const unsigned char *npub,
+                                               const crypto_aead_aes256gcm_state *st_)
 {
     errno = ENOSYS;
     return -1;
@@ -1018,11 +1089,10 @@ crypto_aead_aes256gcm_decrypt_detached_afternm(unsigned char *m, unsigned char *
 
 int
 crypto_aead_aes256gcm_decrypt_afternm(unsigned char *m, unsigned long long *mlen_p,
-                                      unsigned char *nsec,
-                                      const unsigned char *c, unsigned long long clen,
-                                      const unsigned char *ad, unsigned long long adlen,
-                                      const unsigned char *npub,
-                                      const crypto_aead_aes256gcm_state *ctx_)
+                                      unsigned char *nsec, const unsigned char *c,
+                                      unsigned long long clen, const unsigned char *ad,
+                                      unsigned long long adlen, const unsigned char *npub,
+                                      const crypto_aead_aes256gcm_state *st_)
 {
     errno = ENOSYS;
     return -1;