From 3eb6dd45a64928186d399d5e15765b3ae2fef889 Mon Sep 17 00:00:00 2001
From: Michael Scire
Date: Tue, 2 Apr 2019 23:29:23 -0700
Subject: [PATCH] Implement accelerated AES-CTR

---
 nx/include/switch/crypto.h         |   3 +-
 nx/include/switch/crypto/aes_ctr.h |  43 ++
 nx/source/crypto/aes_ctr.c         | 639 +++++++++++++++++++++++++++++
 3 files changed, 684 insertions(+), 1 deletion(-)
 create mode 100644 nx/include/switch/crypto/aes_ctr.h
 create mode 100644 nx/source/crypto/aes_ctr.c

diff --git a/nx/include/switch/crypto.h b/nx/include/switch/crypto.h
index 5de0edaf..c95f9be9 100644
--- a/nx/include/switch/crypto.h
+++ b/nx/include/switch/crypto.h
@@ -7,4 +7,5 @@
 #include "types.h"
 
 #include "crypto/aes.h"
-#include "crypto/aes_cbc.h"
\ No newline at end of file
+#include "crypto/aes_cbc.h"
+#include "crypto/aes_ctr.h"
\ No newline at end of file
diff --git a/nx/include/switch/crypto/aes_ctr.h b/nx/include/switch/crypto/aes_ctr.h
new file mode 100644
index 00000000..9c462bbb
--- /dev/null
+++ b/nx/include/switch/crypto/aes_ctr.h
@@ -0,0 +1,43 @@
+/**
+ * @file aes_ctr.h
+ * @brief Hardware accelerated AES-CTR implementation.
+ * @copyright libnx Authors
+ */
+#pragma once
+#include "aes.h"
+
+/// Context for AES-128 CTR.
+typedef struct {
+    Aes128Context aes_ctx;
+    u8 ctr[AES_BLOCK_SIZE];
+    u8 enc_ctr_buffer[AES_BLOCK_SIZE];
+    size_t buffer_offset;
+} Aes128CtrContext;
+
+/// Context for AES-192 CTR.
+typedef struct {
+    Aes192Context aes_ctx;
+    u8 ctr[AES_BLOCK_SIZE];
+    u8 enc_ctr_buffer[AES_BLOCK_SIZE];
+    size_t buffer_offset;
+} Aes192CtrContext;
+
+/// Context for AES-256 CTR.
+typedef struct {
+    Aes256Context aes_ctx;
+    u8 ctr[AES_BLOCK_SIZE];
+    u8 enc_ctr_buffer[AES_BLOCK_SIZE];
+    size_t buffer_offset;
+} Aes256CtrContext;
+
+/// 128-bit CTR API.
+void aes128CtrContextCreate(Aes128CtrContext *out, const void *key, const void *ctr);
+void aes128CtrCrypt(Aes128CtrContext *ctx, void *dst, const void *src, size_t size);
+
+/// 192-bit CTR API.
+void aes192CtrContextCreate(Aes192CtrContext *out, const void *key, const void *ctr);
+void aes192CtrCrypt(Aes192CtrContext *ctx, void *dst, const void *src, size_t size);
+
+/// 256-bit CTR API.
+void aes256CtrContextCreate(Aes256CtrContext *out, const void *key, const void *ctr);
+void aes256CtrCrypt(Aes256CtrContext *ctx, void *dst, const void *src, size_t size);
diff --git a/nx/source/crypto/aes_ctr.c b/nx/source/crypto/aes_ctr.c
new file mode 100644
index 00000000..82569218
--- /dev/null
+++ b/nx/source/crypto/aes_ctr.c
@@ -0,0 +1,639 @@
+#include <string.h>
+#include <stdlib.h>
+#include <arm_neon.h>
+
+#include "result.h"
+#include "crypto/aes_ctr.h"
+
+/* Variable management macros. */
+#define DECLARE_ROUND_KEY_VAR(n) \
+const uint8x16_t round_key_##n = vld1q_u8(ctx->aes_ctx.round_keys[n])
+
+#define AES_ENC_DEC_OUTPUT_THREE_BLOCKS() \
+[tmp0]"+w"(tmp0), [tmp1]"+w"(tmp1), [tmp2]"+w"(tmp2)
+
+#define AES_ENC_DEC_OUTPUT_THREE_CTRS() \
+[ctr0]"+w"(ctr0), [ctr1]"+w"(ctr1), [ctr2]"+w"(ctr2)
+
+#define AES_ENC_DEC_OUTPUT_ONE_BLOCK() \
+[tmp0]"+w"(tmp0)
+
+#define AES_ENC_DEC_OUTPUT_ONE_CTR() \
+[ctr0]"+w"(ctr0)
+
+#define CTR_INCREMENT_OUTPUT_HIGH_LOW() \
+[high]"=&r"(high), [low]"=&r"(low)
+
+#define CTR_INCREMENT_OUTPUT_HIGH_LOW_TMP() \
+[high_tmp]"=&r"(high_tmp), [low_tmp]"=&r"(low_tmp)
+
+#define CTR_INCREMENT_OUTPUT_HL_SINGLE_TMP() \
+[hl_tmp]"=&r"(hl_tmp)
+
+#define AES_ENC_DEC_INPUT_ROUND_KEY(n) \
+[round_key_##n]"w"(round_key_##n)
+
+/* AES Encryption macros.
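+   AES_ENC_ROUND(n, i) XORs round key n into block i and applies
+   SubBytes/ShiftRows (`aese`), then MixColumns (`aesmc`). Per the AES
+   spec the final round omits MixColumns, so the second-to-last round is
+   a bare `aese` and the last round is a plain XOR with the final round key.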
*/ +#define AES_ENC_ROUND(n, i) \ +"aese %[tmp" #i "].16b, %[round_key_" #n "].16b\n" \ +"aesmc %[tmp" #i "].16b, %[tmp" #i "].16b\n" + +#define AES_ENC_SECOND_LAST_ROUND(n, i) \ +"aese %[tmp" #i "].16b, %[round_key_" #n "].16b\n" + +#define AES_ENC_LAST_ROUND(n, i) \ +"eor %[tmp" #i "].16b, %[tmp" #i "].16b, %[round_key_" #n "].16b\n" + +/* Macro for main body of crypt wrapper. */ +#define CRYPT_FUNC_BODY(block_handler) \ +do { \ + const u8 *cur_src = src; \ + u8 *cur_dst = dst; \ +\ + /* Handle pre-buffered data. */ \ + if (ctx->buffer_offset > 0) { \ + const size_t needed = AES_BLOCK_SIZE - ctx->buffer_offset; \ + const size_t copyable = (size > needed ? needed : size); \ + for (size_t i = 0; i < copyable; i++) { \ + cur_dst[i] = cur_src[i] ^ ctx->enc_ctr_buffer[ctx->buffer_offset + i]; \ + } \ + cur_dst += copyable; \ + cur_src += copyable; \ + ctx->buffer_offset += copyable; \ + size -= copyable; \ +\ + if (ctx->buffer_offset == AES_BLOCK_SIZE) { \ + ctx->buffer_offset = 0; \ + } \ + } \ +\ + /* Handle complete blocks. */ \ + if (size >= AES_BLOCK_SIZE) { \ + const size_t num_blocks = size / AES_BLOCK_SIZE; \ + block_handler(ctx, cur_dst, cur_src, num_blocks); \ + size -= num_blocks * AES_BLOCK_SIZE; \ + cur_src += num_blocks * AES_BLOCK_SIZE; \ + cur_dst += num_blocks * AES_BLOCK_SIZE; \ + } \ +\ + /* Buffer remaining data. */ \ + if (size > 0) { \ + memcpy(ctx->enc_ctr_buffer, cur_src, size); \ + memset(ctx->enc_ctr_buffer + size, 0, AES_BLOCK_SIZE - size); \ + block_handler(ctx, ctx->enc_ctr_buffer, ctx->enc_ctr_buffer, 1); \ + memcpy(cur_dst, ctx->enc_ctr_buffer, size); \ + ctx->buffer_offset = size; \ + } \ +} while (0) + +static inline uint8x16_t _incrementCtr(const uint8x16_t ctr) { + uint8x16_t inc; + uint64_t high, low; + /* Use ASM. TODO: Better than using intrinsics? */ + __asm__ __volatile__ ( + "mov %[high], %[ctr].d[0]\n" + "mov %[low], %[ctr].d[1]\n" + "rev %[high], %[high]\n" + "rev %[low], %[low]\n" + "adds %[low], %[low], 1\n" + "cinc %[high], %[high], cs\n" + "rev %[high], %[high]\n" + "rev %[low], %[low]\n" + "mov %[inc].d[0], %[high]\n" + "mov %[inc].d[1], %[low]\n" + : [inc]"=w"(inc), + CTR_INCREMENT_OUTPUT_HIGH_LOW() + : [ctr]"w"(ctr) + : "cc" + ); + return inc; +} + +void aes128CtrContextCreate(Aes128CtrContext *out, const void *key, const void *ctr) { + /* Initialize inner context. */ + aes128ContextCreate(&out->aes_ctx, key, true); + + /* Set IV, nothing is buffered. */ + memcpy(out->ctr, ctr, sizeof(out->ctr)); + memset(out->enc_ctr_buffer, 0, sizeof(out->enc_ctr_buffer)); + out->buffer_offset = 0; +} + +static inline void _aes128CtrCryptBlocks(Aes128CtrContext *ctx, u8 *dst_u8, const u8 *src_u8, size_t num_blocks) { + /* Preload all round keys + iv into neon registers. */ + DECLARE_ROUND_KEY_VAR(0); + DECLARE_ROUND_KEY_VAR(1); + DECLARE_ROUND_KEY_VAR(2); + DECLARE_ROUND_KEY_VAR(3); + DECLARE_ROUND_KEY_VAR(4); + DECLARE_ROUND_KEY_VAR(5); + DECLARE_ROUND_KEY_VAR(6); + DECLARE_ROUND_KEY_VAR(7); + DECLARE_ROUND_KEY_VAR(8); + DECLARE_ROUND_KEY_VAR(9); + DECLARE_ROUND_KEY_VAR(10); + uint8x16_t ctr0 = vld1q_u8(ctx->ctr); + uint64_t high, low; + + /* Process three blocks at a time, when possible. */ + if (num_blocks >= 3) { + /* Increment CTR twice. */ + uint8x16_t ctr1 = _incrementCtr(ctr0); + uint8x16_t ctr2 = _incrementCtr(ctr1); + uint64_t high_tmp, low_tmp; + + while (num_blocks >= 3) { + /* Read blocks in. Keep them in registers for XOR later. 
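+               (CTR mode keystream: out = in ^ AES-Enc(key, counter_i). The
+                loop below encrypts counters rather than data, which is why
+                this same routine both encrypts and decrypts.)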
*/ + const uint8x16_t block0 = vld1q_u8(src_u8); + src_u8 += AES_BLOCK_SIZE; + const uint8x16_t block1 = vld1q_u8(src_u8); + src_u8 += AES_BLOCK_SIZE; + const uint8x16_t block2 = vld1q_u8(src_u8); + src_u8 += AES_BLOCK_SIZE; + + /* We'll be encrypting the three CTRs. */ + uint8x16_t tmp0 = ctr0, tmp1 = ctr1, tmp2 = ctr2; + + /* Actually do encryption, use optimized asm. */ + /* Interleave CTR calculations with AES ones, to mask latencies. */ + __asm__ __volatile__ ( + AES_ENC_ROUND(0, 0) "mov %[high], %[ctr2].d[0]\n" + AES_ENC_ROUND(0, 1) "mov %[low], %[ctr2].d[1]\n" + AES_ENC_ROUND(0, 2) "rev %[high], %[high]\n" + AES_ENC_ROUND(1, 0) "rev %[low], %[low]\n" + AES_ENC_ROUND(1, 1) "adds %[low], %[low], 1\n" + AES_ENC_ROUND(1, 2) "cinc %[high], %[high], cs\n" + AES_ENC_ROUND(2, 0) "rev %[high_tmp], %[high]\n" + AES_ENC_ROUND(2, 1) "rev %[low_tmp], %[low]\n" + AES_ENC_ROUND(2, 2) "mov %[ctr0].d[0], %[high_tmp]\n" + AES_ENC_ROUND(3, 0) "mov %[ctr0].d[1], %[low_tmp]\n" + AES_ENC_ROUND(3, 1) "adds %[low], %[low], 1\n" + AES_ENC_ROUND(3, 2) "cinc %[high], %[high], cs\n" + AES_ENC_ROUND(4, 0) "rev %[high_tmp], %[high]\n" + AES_ENC_ROUND(4, 1) "rev %[low_tmp], %[low]\n" + AES_ENC_ROUND(4, 2) "mov %[ctr1].d[0], %[high_tmp]\n" + AES_ENC_ROUND(5, 0) "mov %[ctr1].d[1], %[low_tmp]\n" + AES_ENC_ROUND(5, 1) "adds %[low], %[low], 1\n" + AES_ENC_ROUND(5, 2) "cinc %[high], %[high], cs\n" + AES_ENC_ROUND(6, 0) "rev %[high_tmp], %[high]\n" + AES_ENC_ROUND(6, 1) "rev %[low_tmp], %[low]\n" + AES_ENC_ROUND(6, 2) "mov %[ctr2].d[0], %[high_tmp]\n" + AES_ENC_ROUND(7, 0) "mov %[ctr2].d[1], %[low_tmp]\n" + AES_ENC_ROUND(7, 1) + AES_ENC_ROUND(7, 2) + AES_ENC_ROUND(8, 0) AES_ENC_ROUND(8, 1) AES_ENC_ROUND(8, 2) + AES_ENC_SECOND_LAST_ROUND(9, 0) AES_ENC_SECOND_LAST_ROUND(9, 1) AES_ENC_SECOND_LAST_ROUND(9, 2) + AES_ENC_LAST_ROUND(10, 0) AES_ENC_LAST_ROUND(10, 1) AES_ENC_LAST_ROUND(10, 2) + : AES_ENC_DEC_OUTPUT_THREE_BLOCKS(), + AES_ENC_DEC_OUTPUT_THREE_CTRS(), + CTR_INCREMENT_OUTPUT_HIGH_LOW(), + CTR_INCREMENT_OUTPUT_HIGH_LOW_TMP() + : AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(block0, tmp0); + tmp1 = veorq_u8(block1, tmp1); + tmp2 = veorq_u8(block2, tmp2); + + /* Store to output. */ + vst1q_u8(dst_u8, tmp0); + dst_u8 += AES_BLOCK_SIZE; + vst1q_u8(dst_u8, tmp1); + dst_u8 += AES_BLOCK_SIZE; + vst1q_u8(dst_u8, tmp2); + dst_u8 += AES_BLOCK_SIZE; + + num_blocks -= 3; + } + } + + while (num_blocks >= 1) { + /* Read block in, keep in register for XOR. */ + const uint8x16_t block0 = vld1q_u8(src_u8); + src_u8 += AES_BLOCK_SIZE; + + /* We'll be encrypting the CTR. */ + uint8x16_t tmp0 = ctr0; + + /* Actually do encryption, use optimized asm. */ + /* Interleave CTR calculations with AES ones, to mask latencies. 
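+           (The scalar mov/rev/adds/cinc sequence that steps the counter for
+            the next block issues on the integer pipeline while the
+            aese/aesmc chain occupies the crypto unit, hiding the
+            counter-update latency behind the AES rounds.)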
*/ + __asm__ __volatile__ ( + AES_ENC_ROUND(0, 0) "mov %[high], %[ctr0].d[0]\n" + AES_ENC_ROUND(1, 0) "mov %[low], %[ctr0].d[1]\n" + AES_ENC_ROUND(2, 0) "rev %[high], %[high]\n" + AES_ENC_ROUND(3, 0) "rev %[low], %[low]\n" + AES_ENC_ROUND(4, 0) "adds %[low], %[low], 1\n" + AES_ENC_ROUND(5, 0) "cinc %[high], %[high], cs\n" + AES_ENC_ROUND(6, 0) "rev %[high], %[high]\n" + AES_ENC_ROUND(7, 0) "rev %[low], %[low]\n" + AES_ENC_ROUND(8, 0) "mov %[ctr0].d[0], %[high]\n" + AES_ENC_SECOND_LAST_ROUND(9, 0) "mov %[ctr0].d[1], %[low]\n" + AES_ENC_LAST_ROUND(10, 0) + : AES_ENC_DEC_OUTPUT_ONE_BLOCK(), + AES_ENC_DEC_OUTPUT_ONE_CTR(), + CTR_INCREMENT_OUTPUT_HIGH_LOW() + : AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(block0, tmp0); + + /* Store to output. */ + vst1q_u8(dst_u8, tmp0); + dst_u8 += AES_BLOCK_SIZE; + + num_blocks--; + } + + vst1q_u8(ctx->ctr, ctr0); +} + +void aes128CtrCrypt(Aes128CtrContext *ctx, void *dst, const void *src, size_t size) { + CRYPT_FUNC_BODY(_aes128CtrCryptBlocks); +} + +void aes192CtrContextCreate(Aes192CtrContext *out, const void *key, const void *ctr) { + /* Initialize inner context. */ + aes192ContextCreate(&out->aes_ctx, key, true); + + /* Set IV, nothing is buffered. */ + memcpy(out->ctr, ctr, sizeof(out->ctr)); + memset(out->enc_ctr_buffer, 0, sizeof(out->enc_ctr_buffer)); + out->buffer_offset = 0; +} + +static inline void _aes192CtrCryptBlocks(Aes192CtrContext *ctx, u8 *dst_u8, const u8 *src_u8, size_t num_blocks) { + /* Preload all round keys + iv into neon registers. */ + DECLARE_ROUND_KEY_VAR(0); + DECLARE_ROUND_KEY_VAR(1); + DECLARE_ROUND_KEY_VAR(2); + DECLARE_ROUND_KEY_VAR(3); + DECLARE_ROUND_KEY_VAR(4); + DECLARE_ROUND_KEY_VAR(5); + DECLARE_ROUND_KEY_VAR(6); + DECLARE_ROUND_KEY_VAR(7); + DECLARE_ROUND_KEY_VAR(8); + DECLARE_ROUND_KEY_VAR(9); + DECLARE_ROUND_KEY_VAR(10); + DECLARE_ROUND_KEY_VAR(11); + DECLARE_ROUND_KEY_VAR(12); + uint8x16_t ctr0 = vld1q_u8(ctx->ctr); + uint64_t high, low; + + /* Process three blocks at a time, when possible. */ + if (num_blocks >= 3) { + /* Increment CTR twice. */ + uint8x16_t ctr1 = _incrementCtr(ctr0); + uint8x16_t ctr2 = _incrementCtr(ctr1); + uint64_t high_tmp, low_tmp; + + while (num_blocks >= 3) { + /* Read blocks in. Keep them in registers for XOR later. */ + const uint8x16_t block0 = vld1q_u8(src_u8); + src_u8 += AES_BLOCK_SIZE; + const uint8x16_t block1 = vld1q_u8(src_u8); + src_u8 += AES_BLOCK_SIZE; + const uint8x16_t block2 = vld1q_u8(src_u8); + src_u8 += AES_BLOCK_SIZE; + + /* We'll be encrypting the three CTRs. */ + uint8x16_t tmp0 = ctr0, tmp1 = ctr1, tmp2 = ctr2; + + /* Actually do encryption, use optimized asm. */ + /* Interleave CTR calculations with AES ones, to mask latencies. 
*/ + __asm__ __volatile__ ( + AES_ENC_ROUND(0, 0) "mov %[high], %[ctr2].d[0]\n" + AES_ENC_ROUND(0, 1) "mov %[low], %[ctr2].d[1]\n" + AES_ENC_ROUND(0, 2) "rev %[high], %[high]\n" + AES_ENC_ROUND(1, 0) "rev %[low], %[low]\n" + AES_ENC_ROUND(1, 1) "adds %[low], %[low], 1\n" + AES_ENC_ROUND(1, 2) "cinc %[high], %[high], cs\n" + AES_ENC_ROUND(2, 0) "rev %[high_tmp], %[high]\n" + AES_ENC_ROUND(2, 1) "rev %[low_tmp], %[low]\n" + AES_ENC_ROUND(2, 2) "mov %[ctr0].d[0], %[high_tmp]\n" + AES_ENC_ROUND(3, 0) "mov %[ctr0].d[1], %[low_tmp]\n" + AES_ENC_ROUND(3, 1) "adds %[low], %[low], 1\n" + AES_ENC_ROUND(3, 2) "cinc %[high], %[high], cs\n" + AES_ENC_ROUND(4, 0) "rev %[high_tmp], %[high]\n" + AES_ENC_ROUND(4, 1) "rev %[low_tmp], %[low]\n" + AES_ENC_ROUND(4, 2) "mov %[ctr1].d[0], %[high_tmp]\n" + AES_ENC_ROUND(5, 0) "mov %[ctr1].d[1], %[low_tmp]\n" + AES_ENC_ROUND(5, 1) "adds %[low], %[low], 1\n" + AES_ENC_ROUND(5, 2) "cinc %[high], %[high], cs\n" + AES_ENC_ROUND(6, 0) "rev %[high_tmp], %[high]\n" + AES_ENC_ROUND(6, 1) "rev %[low_tmp], %[low]\n" + AES_ENC_ROUND(6, 2) "mov %[ctr2].d[0], %[high_tmp]\n" + AES_ENC_ROUND(7, 0) "mov %[ctr2].d[1], %[low_tmp]\n" + AES_ENC_ROUND(7, 1) + AES_ENC_ROUND(7, 2) + AES_ENC_ROUND(8, 0) AES_ENC_ROUND(8, 1) AES_ENC_ROUND(8, 2) + AES_ENC_ROUND(9, 0) AES_ENC_ROUND(9, 1) AES_ENC_ROUND(9, 2) + AES_ENC_ROUND(10, 0) AES_ENC_ROUND(10, 1) AES_ENC_ROUND(10, 2) + AES_ENC_SECOND_LAST_ROUND(11, 0) AES_ENC_SECOND_LAST_ROUND(11, 1) AES_ENC_SECOND_LAST_ROUND(11, 2) + AES_ENC_LAST_ROUND(12, 0) AES_ENC_LAST_ROUND(12, 1) AES_ENC_LAST_ROUND(12, 2) + : AES_ENC_DEC_OUTPUT_THREE_BLOCKS(), + AES_ENC_DEC_OUTPUT_THREE_CTRS(), + CTR_INCREMENT_OUTPUT_HIGH_LOW(), + CTR_INCREMENT_OUTPUT_HIGH_LOW_TMP() + : AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10), + AES_ENC_DEC_INPUT_ROUND_KEY(11), + AES_ENC_DEC_INPUT_ROUND_KEY(12) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(block0, tmp0); + tmp1 = veorq_u8(block1, tmp1); + tmp2 = veorq_u8(block2, tmp2); + + /* Store to output. */ + vst1q_u8(dst_u8, tmp0); + dst_u8 += AES_BLOCK_SIZE; + vst1q_u8(dst_u8, tmp1); + dst_u8 += AES_BLOCK_SIZE; + vst1q_u8(dst_u8, tmp2); + dst_u8 += AES_BLOCK_SIZE; + + num_blocks -= 3; + } + } + + while (num_blocks >= 1) { + /* Read block in, keep in register for XOR. */ + const uint8x16_t block0 = vld1q_u8(src_u8); + src_u8 += AES_BLOCK_SIZE; + + /* We'll be encrypting the CTR. */ + uint8x16_t tmp0 = ctr0; + + /* Actually do encryption, use optimized asm. */ + /* Interleave CTR calculations with AES ones, to mask latencies. 
*/ + __asm__ __volatile__ ( + AES_ENC_ROUND(0, 0) "mov %[high], %[ctr0].d[0]\n" + AES_ENC_ROUND(1, 0) "mov %[low], %[ctr0].d[1]\n" + AES_ENC_ROUND(2, 0) "rev %[high], %[high]\n" + AES_ENC_ROUND(3, 0) "rev %[low], %[low]\n" + AES_ENC_ROUND(4, 0) "adds %[low], %[low], 1\n" + AES_ENC_ROUND(5, 0) "cinc %[high], %[high], cs\n" + AES_ENC_ROUND(6, 0) "rev %[high], %[high]\n" + AES_ENC_ROUND(7, 0) "rev %[low], %[low]\n" + AES_ENC_ROUND(8, 0) "mov %[ctr0].d[0], %[high]\n" + AES_ENC_ROUND(9, 0) "mov %[ctr0].d[1], %[low]\n" + AES_ENC_ROUND(10, 0) + AES_ENC_SECOND_LAST_ROUND(11, 0) + AES_ENC_LAST_ROUND(12, 0) + : AES_ENC_DEC_OUTPUT_ONE_BLOCK(), + AES_ENC_DEC_OUTPUT_ONE_CTR(), + CTR_INCREMENT_OUTPUT_HIGH_LOW() + : AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10), + AES_ENC_DEC_INPUT_ROUND_KEY(11), + AES_ENC_DEC_INPUT_ROUND_KEY(12) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(block0, tmp0); + + /* Store to output. */ + vst1q_u8(dst_u8, tmp0); + dst_u8 += AES_BLOCK_SIZE; + + num_blocks--; + } + + vst1q_u8(ctx->ctr, ctr0); +} + +void aes192CtrCrypt(Aes192CtrContext *ctx, void *dst, const void *src, size_t size) { + CRYPT_FUNC_BODY(_aes192CtrCryptBlocks); +} + +static inline void _aes256CtrCryptBlocks(Aes256CtrContext *ctx, u8 *dst_u8, const u8 *src_u8, size_t num_blocks) { + /* Preload all round keys + iv into neon registers. */ + DECLARE_ROUND_KEY_VAR(0); + DECLARE_ROUND_KEY_VAR(1); + DECLARE_ROUND_KEY_VAR(2); + DECLARE_ROUND_KEY_VAR(3); + DECLARE_ROUND_KEY_VAR(4); + DECLARE_ROUND_KEY_VAR(5); + DECLARE_ROUND_KEY_VAR(6); + DECLARE_ROUND_KEY_VAR(7); + DECLARE_ROUND_KEY_VAR(8); + DECLARE_ROUND_KEY_VAR(9); + DECLARE_ROUND_KEY_VAR(10); + DECLARE_ROUND_KEY_VAR(11); + DECLARE_ROUND_KEY_VAR(12); + DECLARE_ROUND_KEY_VAR(13); + DECLARE_ROUND_KEY_VAR(14); + uint8x16_t ctr0 = vld1q_u8(ctx->ctr); + uint64_t high, low; + + /* Process three blocks at a time, when possible. */ + if (num_blocks >= 3) { + /* Increment CTR twice. */ + uint8x16_t ctr1 = _incrementCtr(ctr0); + uint8x16_t ctr2 = _incrementCtr(ctr1); + uint64_t hl_tmp; + + while (num_blocks >= 3) { + /* Read blocks in. Keep them in registers for XOR later. */ + const uint8x16_t block0 = vld1q_u8(src_u8); + src_u8 += AES_BLOCK_SIZE; + const uint8x16_t block1 = vld1q_u8(src_u8); + src_u8 += AES_BLOCK_SIZE; + const uint8x16_t block2 = vld1q_u8(src_u8); + src_u8 += AES_BLOCK_SIZE; + + /* We'll be encrypting the three CTRs. */ + uint8x16_t tmp0 = ctr0, tmp1 = ctr1, tmp2 = ctr2; + + /* Actually do encryption, use optimized asm. */ + /* Interleave CTR calculations with AES ones, to mask latencies. */ + /* Note: ASM here only uses one temporary u64 instead of two, due to 30 operand limit. 
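+               (GCC permits at most 30 operands in a single asm statement;
+                the 15 round keys plus the six block/counter vectors already
+                consume most of that budget, hence the single shared scalar
+                temporary.)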
*/ + __asm__ __volatile__ ( + AES_ENC_ROUND(0, 0) "mov %[high], %[ctr2].d[0]\n" + AES_ENC_ROUND(0, 1) "mov %[low], %[ctr2].d[1]\n" + AES_ENC_ROUND(0, 2) "rev %[high], %[high]\n" + AES_ENC_ROUND(1, 0) "rev %[low], %[low]\n" + AES_ENC_ROUND(1, 1) "adds %[low], %[low], 1\n" + AES_ENC_ROUND(1, 2) "cinc %[high], %[high], cs\n" + AES_ENC_ROUND(2, 0) "rev %[hl_tmp], %[high]\n" + AES_ENC_ROUND(2, 1) "mov %[ctr0].d[0], %[hl_tmp]\n" + AES_ENC_ROUND(2, 2) "rev %[hl_tmp], %[low]\n" + AES_ENC_ROUND(3, 0) "mov %[ctr0].d[1], %[hl_tmp]\n" + AES_ENC_ROUND(3, 1) "adds %[low], %[low], 1\n" + AES_ENC_ROUND(3, 2) "cinc %[high], %[high], cs\n" + AES_ENC_ROUND(4, 0) "rev %[hl_tmp], %[high]\n" + AES_ENC_ROUND(4, 1) "mov %[ctr1].d[0], %[hl_tmp]\n" + AES_ENC_ROUND(4, 2) "rev %[hl_tmp], %[low]\n" + AES_ENC_ROUND(5, 0) "mov %[ctr1].d[1], %[hl_tmp]\n" + AES_ENC_ROUND(5, 1) "adds %[low], %[low], 1\n" + AES_ENC_ROUND(5, 2) "cinc %[high], %[high], cs\n" + AES_ENC_ROUND(6, 0) "rev %[hl_tmp], %[high]\n" + AES_ENC_ROUND(6, 1) "mov %[ctr2].d[0], %[hl_tmp]\n" + AES_ENC_ROUND(6, 2) "rev %[hl_tmp], %[low]\n" + AES_ENC_ROUND(7, 0) "mov %[ctr2].d[1], %[hl_tmp]\n" + AES_ENC_ROUND(7, 1) + AES_ENC_ROUND(7, 2) + AES_ENC_ROUND(8, 0) AES_ENC_ROUND(8, 1) AES_ENC_ROUND(8, 2) + AES_ENC_ROUND(9, 0) AES_ENC_ROUND(9, 1) AES_ENC_ROUND(9, 2) + AES_ENC_ROUND(10, 0) AES_ENC_ROUND(10, 1) AES_ENC_ROUND(10, 2) + AES_ENC_ROUND(11, 0) AES_ENC_ROUND(11, 1) AES_ENC_ROUND(11, 2) + AES_ENC_ROUND(12, 0) AES_ENC_ROUND(12, 1) AES_ENC_ROUND(12, 2) + AES_ENC_SECOND_LAST_ROUND(13, 0) AES_ENC_SECOND_LAST_ROUND(13, 1) AES_ENC_SECOND_LAST_ROUND(13, 2) + AES_ENC_LAST_ROUND(14, 0) AES_ENC_LAST_ROUND(14, 1) AES_ENC_LAST_ROUND(14, 2) + : AES_ENC_DEC_OUTPUT_THREE_BLOCKS(), + AES_ENC_DEC_OUTPUT_THREE_CTRS(), + CTR_INCREMENT_OUTPUT_HIGH_LOW(), + CTR_INCREMENT_OUTPUT_HL_SINGLE_TMP() + : AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10), + AES_ENC_DEC_INPUT_ROUND_KEY(11), + AES_ENC_DEC_INPUT_ROUND_KEY(12), + AES_ENC_DEC_INPUT_ROUND_KEY(13), + AES_ENC_DEC_INPUT_ROUND_KEY(14) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(block0, tmp0); + tmp1 = veorq_u8(block1, tmp1); + tmp2 = veorq_u8(block2, tmp2); + + /* Store to output. */ + vst1q_u8(dst_u8, tmp0); + dst_u8 += AES_BLOCK_SIZE; + vst1q_u8(dst_u8, tmp1); + dst_u8 += AES_BLOCK_SIZE; + vst1q_u8(dst_u8, tmp2); + dst_u8 += AES_BLOCK_SIZE; + + num_blocks -= 3; + } + } + + while (num_blocks >= 1) { + /* Read block in, keep in register for XOR. */ + const uint8x16_t block0 = vld1q_u8(src_u8); + src_u8 += AES_BLOCK_SIZE; + + /* We'll be encrypting the CTR. */ + uint8x16_t tmp0 = ctr0; + + /* Actually do encryption, use optimized asm. */ + /* Interleave CTR calculations with AES ones, to mask latencies. 
*/ + __asm__ __volatile__ ( + AES_ENC_ROUND(0, 0) "mov %[high], %[ctr0].d[0]\n" + AES_ENC_ROUND(1, 0) "mov %[low], %[ctr0].d[1]\n" + AES_ENC_ROUND(2, 0) "rev %[high], %[high]\n" + AES_ENC_ROUND(3, 0) "rev %[low], %[low]\n" + AES_ENC_ROUND(4, 0) "adds %[low], %[low], 1\n" + AES_ENC_ROUND(5, 0) "cinc %[high], %[high], cs\n" + AES_ENC_ROUND(6, 0) "rev %[high], %[high]\n" + AES_ENC_ROUND(7, 0) "rev %[low], %[low]\n" + AES_ENC_ROUND(8, 0) "mov %[ctr0].d[0], %[high]\n" + AES_ENC_ROUND(9, 0) "mov %[ctr0].d[1], %[low]\n" + AES_ENC_ROUND(10, 0) + AES_ENC_ROUND(11, 0) + AES_ENC_ROUND(12, 0) + AES_ENC_SECOND_LAST_ROUND(13, 0) + AES_ENC_LAST_ROUND(14, 0) + : AES_ENC_DEC_OUTPUT_ONE_BLOCK(), + AES_ENC_DEC_OUTPUT_ONE_CTR(), + CTR_INCREMENT_OUTPUT_HIGH_LOW() + : AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10), + AES_ENC_DEC_INPUT_ROUND_KEY(11), + AES_ENC_DEC_INPUT_ROUND_KEY(12), + AES_ENC_DEC_INPUT_ROUND_KEY(13), + AES_ENC_DEC_INPUT_ROUND_KEY(14) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(block0, tmp0); + + /* Store to output. */ + vst1q_u8(dst_u8, tmp0); + dst_u8 += AES_BLOCK_SIZE; + + num_blocks--; + } + + vst1q_u8(ctx->ctr, ctr0); +} + +void aes256CtrContextCreate(Aes256CtrContext *out, const void *key, const void *ctr) { + /* Initialize inner context. */ + aes256ContextCreate(&out->aes_ctx, key, true); + + /* Set IV, nothing is buffered. */ + memcpy(out->ctr, ctr, sizeof(out->ctr)); + memset(out->enc_ctr_buffer, 0, sizeof(out->enc_ctr_buffer)); + out->buffer_offset = 0; +} + +void aes256CtrCrypt(Aes256CtrContext *ctx, void *dst, const void *src, size_t size) { + CRYPT_FUNC_BODY(_aes256CtrCryptBlocks); +}
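
--
A minimal usage sketch for the new API (illustrative, not part of the patch;
the all-zero key/counter and the 0xAA plaintext are placeholder values, and
the 100-byte length deliberately exercises the partial-block buffering):

#include <string.h>
#include <switch/crypto.h>

static void ctrRoundTripExample(void) {
    const u8 key[16] = {0};             /* placeholder AES-128 key */
    const u8 ctr[AES_BLOCK_SIZE] = {0}; /* placeholder initial counter */
    u8 buf[100];
    memset(buf, 0xAA, sizeof(buf));     /* arbitrary plaintext */

    Aes128CtrContext ctx;
    aes128CtrContextCreate(&ctx, key, ctr);
    aes128CtrCrypt(&ctx, buf, buf, sizeof(buf)); /* encrypt in place */

    /* CTR is its own inverse: reset the context to the same key/counter
     * and crypt again to decrypt. */
    aes128CtrContextCreate(&ctx, key, ctr);
    aes128CtrCrypt(&ctx, buf, buf, sizeof(buf)); /* buf is 0xAA-filled again */
}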