diff --git a/nx/include/switch/crypto.h b/nx/include/switch/crypto.h
index 8b93898a..5de0edaf 100644
--- a/nx/include/switch/crypto.h
+++ b/nx/include/switch/crypto.h
@@ -6,4 +6,5 @@
 #pragma once
 
 #include "types.h"
-#include "crypto/aes.h"
\ No newline at end of file
+#include "crypto/aes.h"
+#include "crypto/aes_cbc.h"
\ No newline at end of file
diff --git a/nx/include/switch/crypto/aes.h b/nx/include/switch/crypto/aes.h
index d4fbd54e..2079e699 100644
--- a/nx/include/switch/crypto/aes.h
+++ b/nx/include/switch/crypto/aes.h
@@ -1,6 +1,6 @@
 /**
  * @file aes.h
- * @brief Switch accelerated AES implementation.
+ * @brief Switch accelerated AES-ECB implementation.
  * @copyright libnx Authors
  */
 #pragma once
@@ -27,7 +27,7 @@ typedef struct {
     u8 round_keys[AES_192_NUM_ROUNDS+1][AES_BLOCK_SIZE];
 } Aes192Context;
 
-/// Context for AES-192 operations.
+/// Context for AES-256 operations.
 typedef struct {
     u8 round_keys[AES_256_NUM_ROUNDS+1][AES_BLOCK_SIZE];
 } Aes256Context;
diff --git a/nx/include/switch/crypto/aes_cbc.h b/nx/include/switch/crypto/aes_cbc.h
new file mode 100644
index 00000000..0a5d9df1
--- /dev/null
+++ b/nx/include/switch/crypto/aes_cbc.h
@@ -0,0 +1,46 @@
+/**
+ * @file aes_cbc.h
+ * @brief Switch accelerated AES-CBC implementation.
+ * @copyright libnx Authors
+ */
+#pragma once
+#include "aes.h"
+
+/// Context for AES-128 CBC.
+typedef struct {
+    Aes128Context aes_ctx;
+    u8 iv[AES_BLOCK_SIZE];
+    u8 buffer[AES_BLOCK_SIZE];
+    size_t num_buffered;
+} Aes128CbcContext;
+
+/// Context for AES-192 CBC.
+typedef struct {
+    Aes192Context aes_ctx;
+    u8 iv[AES_BLOCK_SIZE];
+    u8 buffer[AES_BLOCK_SIZE];
+    size_t num_buffered;
+} Aes192CbcContext;
+
+/// Context for AES-256 CBC.
+typedef struct {
+    Aes256Context aes_ctx;
+    u8 iv[AES_BLOCK_SIZE];
+    u8 buffer[AES_BLOCK_SIZE];
+    size_t num_buffered;
+} Aes256CbcContext;
+
+/// 128-bit CBC API.
+void aes128CbcContextCreate(Aes128CbcContext *out, const void *key, const void *iv, bool is_encryptor);
+size_t aes128CbcEncrypt(Aes128CbcContext *ctx, void *dst, const void *src, size_t size);
+size_t aes128CbcDecrypt(Aes128CbcContext *ctx, void *dst, const void *src, size_t size);
+
+/// 192-bit CBC API.
+void aes192CbcContextCreate(Aes192CbcContext *out, const void *key, const void *iv, bool is_encryptor);
+size_t aes192CbcEncrypt(Aes192CbcContext *ctx, void *dst, const void *src, size_t size);
+size_t aes192CbcDecrypt(Aes192CbcContext *ctx, void *dst, const void *src, size_t size);
+
+/// 256-bit CBC API.
+void aes256CbcContextCreate(Aes256CbcContext *out, const void *key, const void *iv, bool is_encryptor);
+size_t aes256CbcEncrypt(Aes256CbcContext *ctx, void *dst, const void *src, size_t size);
+size_t aes256CbcDecrypt(Aes256CbcContext *ctx, void *dst, const void *src, size_t size);
diff --git a/nx/source/crypto/aes_cbc.c b/nx/source/crypto/aes_cbc.c
new file mode 100644
index 00000000..71dc25b0
--- /dev/null
+++ b/nx/source/crypto/aes_cbc.c
@@ -0,0 +1,742 @@
+#include <string.h>
+#include <stdint.h>
+#include <arm_neon.h>
+
+#include "result.h"
+#include "crypto/aes_cbc.h"
+
+/* Variable management macros. */
+#define DECLARE_ROUND_KEY_VAR(n) \
+const uint8x16_t round_key_##n = vld1q_u8(ctx->aes_ctx.round_keys[n])
+
+#define AES_ENC_DEC_OUTPUT_THREE_BLOCKS() \
+[tmp0]"+w"(tmp0), [tmp1]"+w"(tmp1), [tmp2]"+w"(tmp2)
+
+#define AES_ENC_DEC_OUTPUT_ONE_BLOCK() \
+[tmp0]"+w"(tmp0)
+
+#define AES_ENC_DEC_INPUT_ROUND_KEY(n) \
+[round_key_##n]"w"(round_key_##n)
+
+/* AES Encryption macros. */
+#define AES_ENC_ROUND(n, i) \
+"aese %[tmp" #i "].16b, %[round_key_" #n "].16b\n" \
+"aesmc %[tmp" #i "].16b, %[tmp" #i "].16b\n"
+
+#define AES_ENC_SECOND_LAST_ROUND(n, i) \
+"aese %[tmp" #i "].16b, %[round_key_" #n "].16b\n"
+
+#define AES_ENC_LAST_ROUND(n, i) \
+"eor %[tmp" #i "].16b, %[tmp" #i "].16b, %[round_key_" #n "].16b\n"
+
+/* AES Decryption macros. */
+#define AES_DEC_ROUND(n, i) \
+"aesd %[tmp" #i "].16b, %[round_key_" #n "].16b\n" \
+"aesimc %[tmp" #i "].16b, %[tmp" #i "].16b\n"
+
+#define AES_DEC_SECOND_LAST_ROUND(n, i) \
+"aesd %[tmp" #i "].16b, %[round_key_" #n "].16b\n"
+
+#define AES_DEC_LAST_ROUND(n, i) \
+"eor %[tmp" #i "].16b, %[tmp" #i "].16b, %[round_key_" #n "].16b\n"
+
+
+/* Macro for main body of crypt wrapper. */
+#define CRYPT_FUNC_BODY(block_handler) \
+({ \
+    const u8 *cur_src = (const u8 *)src; \
+    u8 *cur_dst = (u8 *)dst; \
+\
+    /* Handle pre-buffered data. */ \
+    if (ctx->num_buffered > 0) { \
+        const size_t needed = AES_BLOCK_SIZE - ctx->num_buffered; \
+        const size_t copyable = (size > needed ? needed : size); \
+        memcpy(&ctx->buffer[ctx->num_buffered], cur_src, copyable); \
+        cur_src += copyable; \
+        ctx->num_buffered += copyable; \
+        size -= copyable; \
+\
+        if (ctx->num_buffered == AES_BLOCK_SIZE) { \
+            block_handler(ctx, cur_dst, ctx->buffer, 1); \
+            cur_dst += AES_BLOCK_SIZE; \
+            ctx->num_buffered = 0; \
+        } \
+    } \
+\
+    /* Handle complete blocks. */ \
+    if (size >= AES_BLOCK_SIZE) { \
+        const size_t num_blocks = size / AES_BLOCK_SIZE; \
+        block_handler(ctx, cur_dst, cur_src, num_blocks); \
+        size -= num_blocks * AES_BLOCK_SIZE; \
+        cur_src += num_blocks * AES_BLOCK_SIZE; \
+        cur_dst += num_blocks * AES_BLOCK_SIZE; \
+    } \
+\
+    /* Buffer remaining data. */ \
+    if (size > 0) { \
+        memcpy(ctx->buffer, cur_src, size); \
+        ctx->num_buffered = size; \
+    } \
+    return (size_t)((uintptr_t)cur_dst - (uintptr_t)dst); \
+})
+
+
+void aes128CbcContextCreate(Aes128CbcContext *out, const void *key, const void *iv, bool is_encryptor) {
+    /* Initialize inner context. */
+    aes128ContextCreate(&out->aes_ctx, key, is_encryptor);
+
+    /* Set IV, nothing is buffered. */
+    memcpy(out->iv, iv, sizeof(out->iv));
+    memset(out->buffer, 0, sizeof(out->buffer));
+    out->num_buffered = 0;
+}
+
+static void _aes128CbcEncryptBlocks(Aes128CbcContext *ctx, void *dst, const void *src, size_t num_blocks) {
+    const u8 *src_u8 = (const u8 *)src;
+    u8 *dst_u8 = (u8 *)dst;
+
+    /* Preload all round keys + iv into neon registers. */
+    DECLARE_ROUND_KEY_VAR(0);
+    DECLARE_ROUND_KEY_VAR(1);
+    DECLARE_ROUND_KEY_VAR(2);
+    DECLARE_ROUND_KEY_VAR(3);
+    DECLARE_ROUND_KEY_VAR(4);
+    DECLARE_ROUND_KEY_VAR(5);
+    DECLARE_ROUND_KEY_VAR(6);
+    DECLARE_ROUND_KEY_VAR(7);
+    DECLARE_ROUND_KEY_VAR(8);
+    DECLARE_ROUND_KEY_VAR(9);
+    DECLARE_ROUND_KEY_VAR(10);
+    uint8x16_t cur_iv = vld1q_u8(ctx->iv);
+
+    /* Process blocks one at a time; CBC encryption is inherently serial. */
+    while (num_blocks >= 1) {
+        /* Read block in, xor with IV. */
+        uint8x16_t tmp0 = veorq_u8(cur_iv, vld1q_u8(src_u8));
+        src_u8 += AES_BLOCK_SIZE;
+
+        /* Actually do encryption, use optimized asm. */
+        __asm__ __volatile__ (
+            AES_ENC_ROUND(0, 0)
+            AES_ENC_ROUND(1, 0)
+            AES_ENC_ROUND(2, 0)
+            AES_ENC_ROUND(3, 0)
+            AES_ENC_ROUND(4, 0)
+            AES_ENC_ROUND(5, 0)
+            AES_ENC_ROUND(6, 0)
+            AES_ENC_ROUND(7, 0)
+            AES_ENC_ROUND(8, 0)
+            AES_ENC_SECOND_LAST_ROUND(9, 0)
+            AES_ENC_LAST_ROUND(10, 0)
+            : AES_ENC_DEC_OUTPUT_ONE_BLOCK()
+            : AES_ENC_DEC_INPUT_ROUND_KEY(0),
+              AES_ENC_DEC_INPUT_ROUND_KEY(1),
+              AES_ENC_DEC_INPUT_ROUND_KEY(2),
+              AES_ENC_DEC_INPUT_ROUND_KEY(3),
+              AES_ENC_DEC_INPUT_ROUND_KEY(4),
+              AES_ENC_DEC_INPUT_ROUND_KEY(5),
+              AES_ENC_DEC_INPUT_ROUND_KEY(6),
+              AES_ENC_DEC_INPUT_ROUND_KEY(7),
+              AES_ENC_DEC_INPUT_ROUND_KEY(8),
+              AES_ENC_DEC_INPUT_ROUND_KEY(9),
+              AES_ENC_DEC_INPUT_ROUND_KEY(10)
+        );
+
+        /* Update IV. */
+        cur_iv = tmp0;
+
+        /* Store to output. */
+        vst1q_u8(dst_u8, tmp0);
+        dst_u8 += AES_BLOCK_SIZE;
+
+        num_blocks--;
+    }
+
+    vst1q_u8(ctx->iv, cur_iv);
+}
+
+static void _aes128CbcDecryptBlocks(Aes128CbcContext *ctx, void *dst, const void *src, size_t num_blocks) {
+    const u8 *src_u8 = (const u8 *)src;
+    u8 *dst_u8 = (u8 *)dst;
+
+    /* Preload all round keys + iv into neon registers. */
+    DECLARE_ROUND_KEY_VAR(0);
+    DECLARE_ROUND_KEY_VAR(1);
+    DECLARE_ROUND_KEY_VAR(2);
+    DECLARE_ROUND_KEY_VAR(3);
+    DECLARE_ROUND_KEY_VAR(4);
+    DECLARE_ROUND_KEY_VAR(5);
+    DECLARE_ROUND_KEY_VAR(6);
+    DECLARE_ROUND_KEY_VAR(7);
+    DECLARE_ROUND_KEY_VAR(8);
+    DECLARE_ROUND_KEY_VAR(9);
+    DECLARE_ROUND_KEY_VAR(10);
+    uint8x16_t cur_iv = vld1q_u8(ctx->iv);
+
+    /* Process three blocks at a time, when possible. */
+    while (num_blocks >= 3) {
+        /* Read blocks in. Keep them in registers for XOR later. */
+        const uint8x16_t block0 = vld1q_u8(src_u8);
+        src_u8 += AES_BLOCK_SIZE;
+        const uint8x16_t block1 = vld1q_u8(src_u8);
+        src_u8 += AES_BLOCK_SIZE;
+        const uint8x16_t block2 = vld1q_u8(src_u8);
+        src_u8 += AES_BLOCK_SIZE;
+
+        uint8x16_t tmp0 = block0, tmp1 = block1, tmp2 = block2;
+
+        /* Actually do decryption, use optimized asm. */
+        __asm__ __volatile__ (
+            AES_DEC_ROUND(10, 0) AES_DEC_ROUND(10, 1) AES_DEC_ROUND(10, 2)
+            AES_DEC_ROUND(9, 0) AES_DEC_ROUND(9, 1) AES_DEC_ROUND(9, 2)
+            AES_DEC_ROUND(8, 0) AES_DEC_ROUND(8, 1) AES_DEC_ROUND(8, 2)
+            AES_DEC_ROUND(7, 0) AES_DEC_ROUND(7, 1) AES_DEC_ROUND(7, 2)
+            AES_DEC_ROUND(6, 0) AES_DEC_ROUND(6, 1) AES_DEC_ROUND(6, 2)
+            AES_DEC_ROUND(5, 0) AES_DEC_ROUND(5, 1) AES_DEC_ROUND(5, 2)
+            AES_DEC_ROUND(4, 0) AES_DEC_ROUND(4, 1) AES_DEC_ROUND(4, 2)
+            AES_DEC_ROUND(3, 0) AES_DEC_ROUND(3, 1) AES_DEC_ROUND(3, 2)
+            AES_DEC_ROUND(2, 0) AES_DEC_ROUND(2, 1) AES_DEC_ROUND(2, 2)
+            AES_DEC_SECOND_LAST_ROUND(1, 0) AES_DEC_SECOND_LAST_ROUND(1, 1) AES_DEC_SECOND_LAST_ROUND(1, 2)
+            AES_DEC_LAST_ROUND(0, 0) AES_DEC_LAST_ROUND(0, 1) AES_DEC_LAST_ROUND(0, 2)
+            : AES_ENC_DEC_OUTPUT_THREE_BLOCKS()
+            : AES_ENC_DEC_INPUT_ROUND_KEY(0),
+              AES_ENC_DEC_INPUT_ROUND_KEY(1),
+              AES_ENC_DEC_INPUT_ROUND_KEY(2),
+              AES_ENC_DEC_INPUT_ROUND_KEY(3),
+              AES_ENC_DEC_INPUT_ROUND_KEY(4),
+              AES_ENC_DEC_INPUT_ROUND_KEY(5),
+              AES_ENC_DEC_INPUT_ROUND_KEY(6),
+              AES_ENC_DEC_INPUT_ROUND_KEY(7),
+              AES_ENC_DEC_INPUT_ROUND_KEY(8),
+              AES_ENC_DEC_INPUT_ROUND_KEY(9),
+              AES_ENC_DEC_INPUT_ROUND_KEY(10)
+        );
+
+        /* Do XOR for CBC. */
+        tmp0 = veorq_u8(tmp0, cur_iv);
+        tmp1 = veorq_u8(tmp1, block0);
+        tmp2 = veorq_u8(tmp2, block1);
+        cur_iv = block2;
+
+        /* Store to output. */
+        vst1q_u8(dst_u8, tmp0);
+        dst_u8 += AES_BLOCK_SIZE;
+        vst1q_u8(dst_u8, tmp1);
+        dst_u8 += AES_BLOCK_SIZE;
+        vst1q_u8(dst_u8, tmp2);
+        dst_u8 += AES_BLOCK_SIZE;
+
+        num_blocks -= 3;
+    }
+
+    /* Process last block or two individually. */
+    while (num_blocks >= 1) {
+        /* Read block in, keep in register for IV later. */
+        const uint8x16_t block0 = vld1q_u8(src_u8);
+        src_u8 += AES_BLOCK_SIZE;
+
+        uint8x16_t tmp0 = block0;
+
+        /* Actually do decryption, use optimized asm. */
+        __asm__ __volatile__ (
+            AES_DEC_ROUND(10, 0)
+            AES_DEC_ROUND(9, 0)
+            AES_DEC_ROUND(8, 0)
+            AES_DEC_ROUND(7, 0)
+            AES_DEC_ROUND(6, 0)
+            AES_DEC_ROUND(5, 0)
+            AES_DEC_ROUND(4, 0)
+            AES_DEC_ROUND(3, 0)
+            AES_DEC_ROUND(2, 0)
+            AES_DEC_SECOND_LAST_ROUND(1, 0)
+            AES_DEC_LAST_ROUND(0, 0)
+            : AES_ENC_DEC_OUTPUT_ONE_BLOCK()
+            : AES_ENC_DEC_INPUT_ROUND_KEY(0),
+              AES_ENC_DEC_INPUT_ROUND_KEY(1),
+              AES_ENC_DEC_INPUT_ROUND_KEY(2),
+              AES_ENC_DEC_INPUT_ROUND_KEY(3),
+              AES_ENC_DEC_INPUT_ROUND_KEY(4),
+              AES_ENC_DEC_INPUT_ROUND_KEY(5),
+              AES_ENC_DEC_INPUT_ROUND_KEY(6),
+              AES_ENC_DEC_INPUT_ROUND_KEY(7),
+              AES_ENC_DEC_INPUT_ROUND_KEY(8),
+              AES_ENC_DEC_INPUT_ROUND_KEY(9),
+              AES_ENC_DEC_INPUT_ROUND_KEY(10)
+        );
+
+        /* Update IV. */
+        cur_iv = tmp0;
+
+        /* Store to output. */
+        vst1q_u8(dst_u8, tmp0);
+        dst_u8 += AES_BLOCK_SIZE;
+
+        num_blocks--;
+    }
+
+    vst1q_u8(ctx->iv, cur_iv);
+}
+
+size_t aes128CbcEncrypt(Aes128CbcContext *ctx, void *dst, const void *src, size_t size) {
+    CRYPT_FUNC_BODY(_aes128CbcEncryptBlocks);
+}
+
+size_t aes128CbcDecrypt(Aes128CbcContext *ctx, void *dst, const void *src, size_t size) {
+    CRYPT_FUNC_BODY(_aes128CbcDecryptBlocks);
+}
+
+void aes192CbcContextCreate(Aes192CbcContext *out, const void *key, const void *iv, bool is_encryptor) {
+    /* Initialize inner context. */
+    aes192ContextCreate(&out->aes_ctx, key, is_encryptor);
+
+    /* Set IV, nothing is buffered. */
+    memcpy(out->iv, iv, sizeof(out->iv));
+    memset(out->buffer, 0, sizeof(out->buffer));
+    out->num_buffered = 0;
+}
+
+static void _aes192CbcEncryptBlocks(Aes192CbcContext *ctx, void *dst, const void *src, size_t num_blocks) {
+    const u8 *src_u8 = (const u8 *)src;
+    u8 *dst_u8 = (u8 *)dst;
+
+    /* Preload all round keys + iv into neon registers. */
+    DECLARE_ROUND_KEY_VAR(0);
+    DECLARE_ROUND_KEY_VAR(1);
+    DECLARE_ROUND_KEY_VAR(2);
+    DECLARE_ROUND_KEY_VAR(3);
+    DECLARE_ROUND_KEY_VAR(4);
+    DECLARE_ROUND_KEY_VAR(5);
+    DECLARE_ROUND_KEY_VAR(6);
+    DECLARE_ROUND_KEY_VAR(7);
+    DECLARE_ROUND_KEY_VAR(8);
+    DECLARE_ROUND_KEY_VAR(9);
+    DECLARE_ROUND_KEY_VAR(10);
+    DECLARE_ROUND_KEY_VAR(11);
+    DECLARE_ROUND_KEY_VAR(12);
+    uint8x16_t cur_iv = vld1q_u8(ctx->iv);
+
+    /* Process blocks one at a time; CBC encryption is inherently serial. */
+    while (num_blocks >= 1) {
+        /* Read block in, xor with IV. */
+        uint8x16_t tmp0 = veorq_u8(cur_iv, vld1q_u8(src_u8));
+        src_u8 += AES_BLOCK_SIZE;
+
+        /* Actually do encryption, use optimized asm. */
+        __asm__ __volatile__ (
+            AES_ENC_ROUND(0, 0)
+            AES_ENC_ROUND(1, 0)
+            AES_ENC_ROUND(2, 0)
+            AES_ENC_ROUND(3, 0)
+            AES_ENC_ROUND(4, 0)
+            AES_ENC_ROUND(5, 0)
+            AES_ENC_ROUND(6, 0)
+            AES_ENC_ROUND(7, 0)
+            AES_ENC_ROUND(8, 0)
+            AES_ENC_ROUND(9, 0)
+            AES_ENC_ROUND(10, 0)
+            AES_ENC_SECOND_LAST_ROUND(11, 0)
+            AES_ENC_LAST_ROUND(12, 0)
+            : AES_ENC_DEC_OUTPUT_ONE_BLOCK()
+            : AES_ENC_DEC_INPUT_ROUND_KEY(0),
+              AES_ENC_DEC_INPUT_ROUND_KEY(1),
+              AES_ENC_DEC_INPUT_ROUND_KEY(2),
+              AES_ENC_DEC_INPUT_ROUND_KEY(3),
+              AES_ENC_DEC_INPUT_ROUND_KEY(4),
+              AES_ENC_DEC_INPUT_ROUND_KEY(5),
+              AES_ENC_DEC_INPUT_ROUND_KEY(6),
+              AES_ENC_DEC_INPUT_ROUND_KEY(7),
+              AES_ENC_DEC_INPUT_ROUND_KEY(8),
+              AES_ENC_DEC_INPUT_ROUND_KEY(9),
+              AES_ENC_DEC_INPUT_ROUND_KEY(10),
+              AES_ENC_DEC_INPUT_ROUND_KEY(11),
+              AES_ENC_DEC_INPUT_ROUND_KEY(12)
+        );
+
+        /* Update IV. */
+        cur_iv = tmp0;
+
+        /* Store to output. */
+        vst1q_u8(dst_u8, tmp0);
+        dst_u8 += AES_BLOCK_SIZE;
+
+        num_blocks--;
+    }
+
+    vst1q_u8(ctx->iv, cur_iv);
+}
+
+static void _aes192CbcDecryptBlocks(Aes192CbcContext *ctx, void *dst, const void *src, size_t num_blocks) {
+    const u8 *src_u8 = (const u8 *)src;
+    u8 *dst_u8 = (u8 *)dst;
+
+    /* Preload all round keys + iv into neon registers. */
+    DECLARE_ROUND_KEY_VAR(0);
+    DECLARE_ROUND_KEY_VAR(1);
+    DECLARE_ROUND_KEY_VAR(2);
+    DECLARE_ROUND_KEY_VAR(3);
+    DECLARE_ROUND_KEY_VAR(4);
+    DECLARE_ROUND_KEY_VAR(5);
+    DECLARE_ROUND_KEY_VAR(6);
+    DECLARE_ROUND_KEY_VAR(7);
+    DECLARE_ROUND_KEY_VAR(8);
+    DECLARE_ROUND_KEY_VAR(9);
+    DECLARE_ROUND_KEY_VAR(10);
+    DECLARE_ROUND_KEY_VAR(11);
+    DECLARE_ROUND_KEY_VAR(12);
+    uint8x16_t cur_iv = vld1q_u8(ctx->iv);
+
+    /* Process three blocks at a time, when possible. */
+    while (num_blocks >= 3) {
+        /* Read blocks in. Keep them in registers for XOR later. */
+        const uint8x16_t block0 = vld1q_u8(src_u8);
+        src_u8 += AES_BLOCK_SIZE;
+        const uint8x16_t block1 = vld1q_u8(src_u8);
+        src_u8 += AES_BLOCK_SIZE;
+        const uint8x16_t block2 = vld1q_u8(src_u8);
+        src_u8 += AES_BLOCK_SIZE;
+
+        uint8x16_t tmp0 = block0, tmp1 = block1, tmp2 = block2;
+
+        /* Actually do decryption, use optimized asm. */
+        __asm__ __volatile__ (
+            AES_DEC_ROUND(12, 0) AES_DEC_ROUND(12, 1) AES_DEC_ROUND(12, 2)
+            AES_DEC_ROUND(11, 0) AES_DEC_ROUND(11, 1) AES_DEC_ROUND(11, 2)
+            AES_DEC_ROUND(10, 0) AES_DEC_ROUND(10, 1) AES_DEC_ROUND(10, 2)
+            AES_DEC_ROUND(9, 0) AES_DEC_ROUND(9, 1) AES_DEC_ROUND(9, 2)
+            AES_DEC_ROUND(8, 0) AES_DEC_ROUND(8, 1) AES_DEC_ROUND(8, 2)
+            AES_DEC_ROUND(7, 0) AES_DEC_ROUND(7, 1) AES_DEC_ROUND(7, 2)
+            AES_DEC_ROUND(6, 0) AES_DEC_ROUND(6, 1) AES_DEC_ROUND(6, 2)
+            AES_DEC_ROUND(5, 0) AES_DEC_ROUND(5, 1) AES_DEC_ROUND(5, 2)
+            AES_DEC_ROUND(4, 0) AES_DEC_ROUND(4, 1) AES_DEC_ROUND(4, 2)
+            AES_DEC_ROUND(3, 0) AES_DEC_ROUND(3, 1) AES_DEC_ROUND(3, 2)
+            AES_DEC_ROUND(2, 0) AES_DEC_ROUND(2, 1) AES_DEC_ROUND(2, 2)
+            AES_DEC_SECOND_LAST_ROUND(1, 0) AES_DEC_SECOND_LAST_ROUND(1, 1) AES_DEC_SECOND_LAST_ROUND(1, 2)
+            AES_DEC_LAST_ROUND(0, 0) AES_DEC_LAST_ROUND(0, 1) AES_DEC_LAST_ROUND(0, 2)
+            : AES_ENC_DEC_OUTPUT_THREE_BLOCKS()
+            : AES_ENC_DEC_INPUT_ROUND_KEY(0),
+              AES_ENC_DEC_INPUT_ROUND_KEY(1),
+              AES_ENC_DEC_INPUT_ROUND_KEY(2),
+              AES_ENC_DEC_INPUT_ROUND_KEY(3),
+              AES_ENC_DEC_INPUT_ROUND_KEY(4),
+              AES_ENC_DEC_INPUT_ROUND_KEY(5),
+              AES_ENC_DEC_INPUT_ROUND_KEY(6),
+              AES_ENC_DEC_INPUT_ROUND_KEY(7),
+              AES_ENC_DEC_INPUT_ROUND_KEY(8),
+              AES_ENC_DEC_INPUT_ROUND_KEY(9),
+              AES_ENC_DEC_INPUT_ROUND_KEY(10),
+              AES_ENC_DEC_INPUT_ROUND_KEY(11),
+              AES_ENC_DEC_INPUT_ROUND_KEY(12)
+        );
+
+        /* Do XOR for CBC. */
+        tmp0 = veorq_u8(tmp0, cur_iv);
+        tmp1 = veorq_u8(tmp1, block0);
+        tmp2 = veorq_u8(tmp2, block1);
+        cur_iv = block2;
+
+        /* Store to output. */
+        vst1q_u8(dst_u8, tmp0);
+        dst_u8 += AES_BLOCK_SIZE;
+        vst1q_u8(dst_u8, tmp1);
+        dst_u8 += AES_BLOCK_SIZE;
+        vst1q_u8(dst_u8, tmp2);
+        dst_u8 += AES_BLOCK_SIZE;
+
+        num_blocks -= 3;
+    }
+
+    /* Process last block or two individually. */
+    while (num_blocks >= 1) {
+        /* Read block in, keep in register for IV later. */
+        const uint8x16_t block0 = vld1q_u8(src_u8);
+        src_u8 += AES_BLOCK_SIZE;
+
+        uint8x16_t tmp0 = block0;
+
+        /* Actually do decryption, use optimized asm. */
+        __asm__ __volatile__ (
+            AES_DEC_ROUND(12, 0)
+            AES_DEC_ROUND(11, 0)
+            AES_DEC_ROUND(10, 0)
+            AES_DEC_ROUND(9, 0)
+            AES_DEC_ROUND(8, 0)
+            AES_DEC_ROUND(7, 0)
+            AES_DEC_ROUND(6, 0)
+            AES_DEC_ROUND(5, 0)
+            AES_DEC_ROUND(4, 0)
+            AES_DEC_ROUND(3, 0)
+            AES_DEC_ROUND(2, 0)
+            AES_DEC_SECOND_LAST_ROUND(1, 0)
+            AES_DEC_LAST_ROUND(0, 0)
+            : AES_ENC_DEC_OUTPUT_ONE_BLOCK()
+            : AES_ENC_DEC_INPUT_ROUND_KEY(0),
+              AES_ENC_DEC_INPUT_ROUND_KEY(1),
+              AES_ENC_DEC_INPUT_ROUND_KEY(2),
+              AES_ENC_DEC_INPUT_ROUND_KEY(3),
+              AES_ENC_DEC_INPUT_ROUND_KEY(4),
+              AES_ENC_DEC_INPUT_ROUND_KEY(5),
+              AES_ENC_DEC_INPUT_ROUND_KEY(6),
+              AES_ENC_DEC_INPUT_ROUND_KEY(7),
+              AES_ENC_DEC_INPUT_ROUND_KEY(8),
+              AES_ENC_DEC_INPUT_ROUND_KEY(9),
+              AES_ENC_DEC_INPUT_ROUND_KEY(10),
+              AES_ENC_DEC_INPUT_ROUND_KEY(11),
+              AES_ENC_DEC_INPUT_ROUND_KEY(12)
+        );
+
+        /* Update IV. */
+        cur_iv = tmp0;
+
+        /* Store to output. */
+        vst1q_u8(dst_u8, tmp0);
+        dst_u8 += AES_BLOCK_SIZE;
+
+        num_blocks--;
+    }
+
+    vst1q_u8(ctx->iv, cur_iv);
+}
+
+size_t aes192CbcEncrypt(Aes192CbcContext *ctx, void *dst, const void *src, size_t size) {
+    CRYPT_FUNC_BODY(_aes192CbcEncryptBlocks);
+}
+
+size_t aes192CbcDecrypt(Aes192CbcContext *ctx, void *dst, const void *src, size_t size) {
+    CRYPT_FUNC_BODY(_aes192CbcDecryptBlocks);
+}
+
+void aes256CbcContextCreate(Aes256CbcContext *out, const void *key, const void *iv, bool is_encryptor) {
+    /* Initialize inner context. */
+    aes256ContextCreate(&out->aes_ctx, key, is_encryptor);
+
+    /* Set IV, nothing is buffered. */
+    memcpy(out->iv, iv, sizeof(out->iv));
+    memset(out->buffer, 0, sizeof(out->buffer));
+    out->num_buffered = 0;
+}
+
+static void _aes256CbcEncryptBlocks(Aes256CbcContext *ctx, void *dst, const void *src, size_t num_blocks) {
+    const u8 *src_u8 = (const u8 *)src;
+    u8 *dst_u8 = (u8 *)dst;
+
+    /* Preload all round keys + iv into neon registers. */
+    DECLARE_ROUND_KEY_VAR(0);
+    DECLARE_ROUND_KEY_VAR(1);
+    DECLARE_ROUND_KEY_VAR(2);
+    DECLARE_ROUND_KEY_VAR(3);
+    DECLARE_ROUND_KEY_VAR(4);
+    DECLARE_ROUND_KEY_VAR(5);
+    DECLARE_ROUND_KEY_VAR(6);
+    DECLARE_ROUND_KEY_VAR(7);
+    DECLARE_ROUND_KEY_VAR(8);
+    DECLARE_ROUND_KEY_VAR(9);
+    DECLARE_ROUND_KEY_VAR(10);
+    DECLARE_ROUND_KEY_VAR(11);
+    DECLARE_ROUND_KEY_VAR(12);
+    DECLARE_ROUND_KEY_VAR(13);
+    DECLARE_ROUND_KEY_VAR(14);
+    uint8x16_t cur_iv = vld1q_u8(ctx->iv);
+
+    /* Process blocks one at a time; CBC encryption is inherently serial. */
+    while (num_blocks >= 1) {
+        /* Read block in, xor with IV. */
+        uint8x16_t tmp0 = veorq_u8(cur_iv, vld1q_u8(src_u8));
+        src_u8 += AES_BLOCK_SIZE;
+
+        /* Actually do encryption, use optimized asm. */
+        __asm__ __volatile__ (
+            AES_ENC_ROUND(0, 0)
+            AES_ENC_ROUND(1, 0)
+            AES_ENC_ROUND(2, 0)
+            AES_ENC_ROUND(3, 0)
+            AES_ENC_ROUND(4, 0)
+            AES_ENC_ROUND(5, 0)
+            AES_ENC_ROUND(6, 0)
+            AES_ENC_ROUND(7, 0)
+            AES_ENC_ROUND(8, 0)
+            AES_ENC_ROUND(9, 0)
+            AES_ENC_ROUND(10, 0)
+            AES_ENC_ROUND(11, 0)
+            AES_ENC_ROUND(12, 0)
+            AES_ENC_SECOND_LAST_ROUND(13, 0)
+            AES_ENC_LAST_ROUND(14, 0)
+            : AES_ENC_DEC_OUTPUT_ONE_BLOCK()
+            : AES_ENC_DEC_INPUT_ROUND_KEY(0),
+              AES_ENC_DEC_INPUT_ROUND_KEY(1),
+              AES_ENC_DEC_INPUT_ROUND_KEY(2),
+              AES_ENC_DEC_INPUT_ROUND_KEY(3),
+              AES_ENC_DEC_INPUT_ROUND_KEY(4),
+              AES_ENC_DEC_INPUT_ROUND_KEY(5),
+              AES_ENC_DEC_INPUT_ROUND_KEY(6),
+              AES_ENC_DEC_INPUT_ROUND_KEY(7),
+              AES_ENC_DEC_INPUT_ROUND_KEY(8),
+              AES_ENC_DEC_INPUT_ROUND_KEY(9),
+              AES_ENC_DEC_INPUT_ROUND_KEY(10),
+              AES_ENC_DEC_INPUT_ROUND_KEY(11),
+              AES_ENC_DEC_INPUT_ROUND_KEY(12),
+              AES_ENC_DEC_INPUT_ROUND_KEY(13),
+              AES_ENC_DEC_INPUT_ROUND_KEY(14)
+        );
+
+        /* Update IV. */
+        cur_iv = tmp0;
+
+        /* Store to output. */
+        vst1q_u8(dst_u8, tmp0);
+        dst_u8 += AES_BLOCK_SIZE;
+
+        num_blocks--;
+    }
+
+    vst1q_u8(ctx->iv, cur_iv);
+}
+
+static void _aes256CbcDecryptBlocks(Aes256CbcContext *ctx, void *dst, const void *src, size_t num_blocks) {
+    const u8 *src_u8 = (const u8 *)src;
+    u8 *dst_u8 = (u8 *)dst;
+
+    /* Preload all round keys + iv into neon registers. */
+    DECLARE_ROUND_KEY_VAR(0);
+    DECLARE_ROUND_KEY_VAR(1);
+    DECLARE_ROUND_KEY_VAR(2);
+    DECLARE_ROUND_KEY_VAR(3);
+    DECLARE_ROUND_KEY_VAR(4);
+    DECLARE_ROUND_KEY_VAR(5);
+    DECLARE_ROUND_KEY_VAR(6);
+    DECLARE_ROUND_KEY_VAR(7);
+    DECLARE_ROUND_KEY_VAR(8);
+    DECLARE_ROUND_KEY_VAR(9);
+    DECLARE_ROUND_KEY_VAR(10);
+    DECLARE_ROUND_KEY_VAR(11);
+    DECLARE_ROUND_KEY_VAR(12);
+    DECLARE_ROUND_KEY_VAR(13);
+    DECLARE_ROUND_KEY_VAR(14);
+    uint8x16_t cur_iv = vld1q_u8(ctx->iv);
+
+    /* Process three blocks at a time, when possible. */
+    while (num_blocks >= 3) {
+        /* Read blocks in. Keep them in registers for XOR later. */
+        const uint8x16_t block0 = vld1q_u8(src_u8);
+        src_u8 += AES_BLOCK_SIZE;
+        const uint8x16_t block1 = vld1q_u8(src_u8);
+        src_u8 += AES_BLOCK_SIZE;
+        const uint8x16_t block2 = vld1q_u8(src_u8);
+        src_u8 += AES_BLOCK_SIZE;
+
+        uint8x16_t tmp0 = block0, tmp1 = block1, tmp2 = block2;
+
+        /* Actually do decryption, use optimized asm. */
+        __asm__ __volatile__ (
+            AES_DEC_ROUND(14, 0) AES_DEC_ROUND(14, 1) AES_DEC_ROUND(14, 2)
+            AES_DEC_ROUND(13, 0) AES_DEC_ROUND(13, 1) AES_DEC_ROUND(13, 2)
+            AES_DEC_ROUND(12, 0) AES_DEC_ROUND(12, 1) AES_DEC_ROUND(12, 2)
+            AES_DEC_ROUND(11, 0) AES_DEC_ROUND(11, 1) AES_DEC_ROUND(11, 2)
+            AES_DEC_ROUND(10, 0) AES_DEC_ROUND(10, 1) AES_DEC_ROUND(10, 2)
+            AES_DEC_ROUND(9, 0) AES_DEC_ROUND(9, 1) AES_DEC_ROUND(9, 2)
+            AES_DEC_ROUND(8, 0) AES_DEC_ROUND(8, 1) AES_DEC_ROUND(8, 2)
+            AES_DEC_ROUND(7, 0) AES_DEC_ROUND(7, 1) AES_DEC_ROUND(7, 2)
+            AES_DEC_ROUND(6, 0) AES_DEC_ROUND(6, 1) AES_DEC_ROUND(6, 2)
+            AES_DEC_ROUND(5, 0) AES_DEC_ROUND(5, 1) AES_DEC_ROUND(5, 2)
+            AES_DEC_ROUND(4, 0) AES_DEC_ROUND(4, 1) AES_DEC_ROUND(4, 2)
+            AES_DEC_ROUND(3, 0) AES_DEC_ROUND(3, 1) AES_DEC_ROUND(3, 2)
+            AES_DEC_ROUND(2, 0) AES_DEC_ROUND(2, 1) AES_DEC_ROUND(2, 2)
+            AES_DEC_SECOND_LAST_ROUND(1, 0) AES_DEC_SECOND_LAST_ROUND(1, 1) AES_DEC_SECOND_LAST_ROUND(1, 2)
+            AES_DEC_LAST_ROUND(0, 0) AES_DEC_LAST_ROUND(0, 1) AES_DEC_LAST_ROUND(0, 2)
+            : AES_ENC_DEC_OUTPUT_THREE_BLOCKS()
+            : AES_ENC_DEC_INPUT_ROUND_KEY(0),
+              AES_ENC_DEC_INPUT_ROUND_KEY(1),
+              AES_ENC_DEC_INPUT_ROUND_KEY(2),
+              AES_ENC_DEC_INPUT_ROUND_KEY(3),
+              AES_ENC_DEC_INPUT_ROUND_KEY(4),
+              AES_ENC_DEC_INPUT_ROUND_KEY(5),
+              AES_ENC_DEC_INPUT_ROUND_KEY(6),
+              AES_ENC_DEC_INPUT_ROUND_KEY(7),
+              AES_ENC_DEC_INPUT_ROUND_KEY(8),
+              AES_ENC_DEC_INPUT_ROUND_KEY(9),
+              AES_ENC_DEC_INPUT_ROUND_KEY(10),
+              AES_ENC_DEC_INPUT_ROUND_KEY(11),
+              AES_ENC_DEC_INPUT_ROUND_KEY(12),
+              AES_ENC_DEC_INPUT_ROUND_KEY(13),
+              AES_ENC_DEC_INPUT_ROUND_KEY(14)
+        );
+
+        /* Do XOR for CBC. */
+        tmp0 = veorq_u8(tmp0, cur_iv);
+        tmp1 = veorq_u8(tmp1, block0);
+        tmp2 = veorq_u8(tmp2, block1);
+        cur_iv = block2;
+
+        /* Store to output. */
+        vst1q_u8(dst_u8, tmp0);
+        dst_u8 += AES_BLOCK_SIZE;
+        vst1q_u8(dst_u8, tmp1);
+        dst_u8 += AES_BLOCK_SIZE;
+        vst1q_u8(dst_u8, tmp2);
+        dst_u8 += AES_BLOCK_SIZE;
+
+        num_blocks -= 3;
+    }
+
+    /* Process last block or two individually. */
+    while (num_blocks >= 1) {
+        /* Read block in, keep in register for IV later. */
+        const uint8x16_t block0 = vld1q_u8(src_u8);
+        src_u8 += AES_BLOCK_SIZE;
+
+        uint8x16_t tmp0 = block0;
+
+        /* Actually do decryption, use optimized asm. */
+        __asm__ __volatile__ (
+            AES_DEC_ROUND(14, 0)
+            AES_DEC_ROUND(13, 0)
+            AES_DEC_ROUND(12, 0)
+            AES_DEC_ROUND(11, 0)
+            AES_DEC_ROUND(10, 0)
+            AES_DEC_ROUND(9, 0)
+            AES_DEC_ROUND(8, 0)
+            AES_DEC_ROUND(7, 0)
+            AES_DEC_ROUND(6, 0)
+            AES_DEC_ROUND(5, 0)
+            AES_DEC_ROUND(4, 0)
+            AES_DEC_ROUND(3, 0)
+            AES_DEC_ROUND(2, 0)
+            AES_DEC_SECOND_LAST_ROUND(1, 0)
+            AES_DEC_LAST_ROUND(0, 0)
+            : AES_ENC_DEC_OUTPUT_ONE_BLOCK()
+            : AES_ENC_DEC_INPUT_ROUND_KEY(0),
+              AES_ENC_DEC_INPUT_ROUND_KEY(1),
+              AES_ENC_DEC_INPUT_ROUND_KEY(2),
+              AES_ENC_DEC_INPUT_ROUND_KEY(3),
+              AES_ENC_DEC_INPUT_ROUND_KEY(4),
+              AES_ENC_DEC_INPUT_ROUND_KEY(5),
+              AES_ENC_DEC_INPUT_ROUND_KEY(6),
+              AES_ENC_DEC_INPUT_ROUND_KEY(7),
+              AES_ENC_DEC_INPUT_ROUND_KEY(8),
+              AES_ENC_DEC_INPUT_ROUND_KEY(9),
+              AES_ENC_DEC_INPUT_ROUND_KEY(10),
+              AES_ENC_DEC_INPUT_ROUND_KEY(11),
+              AES_ENC_DEC_INPUT_ROUND_KEY(12),
+              AES_ENC_DEC_INPUT_ROUND_KEY(13),
+              AES_ENC_DEC_INPUT_ROUND_KEY(14)
+        );
+
+        /* Update IV. */
+        cur_iv = tmp0;
+
+        /* Store to output. */
+        vst1q_u8(dst_u8, tmp0);
+        dst_u8 += AES_BLOCK_SIZE;
+
+        num_blocks--;
+    }
+
+    vst1q_u8(ctx->iv, cur_iv);
+}
+
+size_t aes256CbcEncrypt(Aes256CbcContext *ctx, void *dst, const void *src, size_t size) {
+    CRYPT_FUNC_BODY(_aes256CbcEncryptBlocks);
+}
+
+size_t aes256CbcDecrypt(Aes256CbcContext *ctx, void *dst, const void *src, size_t size) {
+    CRYPT_FUNC_BODY(_aes256CbcDecryptBlocks);
+}
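--
Usage sketch (reviewer note, not part of the patch): a minimal example of how the new
streaming CBC API is meant to be driven, assuming the patched headers are installed and
reachable through <switch.h>. The key/IV values and the function cbcRoundtripExample are
illustrative placeholders only. Note that the encrypt/decrypt functions return the number
of bytes written, which can lag behind the number of bytes fed in: partial blocks are held
in the context's internal buffer until a full 16-byte block accumulates, and no padding is
ever applied, so total input must be a multiple of AES_BLOCK_SIZE for all output to appear.

#include <string.h>
#include <switch.h>

static void cbcRoundtripExample(void) {
    const u8 key[16] = {0};              /* 16-byte AES-128 key (placeholder). */
    const u8 iv[AES_BLOCK_SIZE] = {0};   /* Initialization vector (placeholder). */
    u8 plain[2 * AES_BLOCK_SIZE] = "libnx aes-cbc example payload!";
    u8 enc[sizeof(plain)];
    u8 dec[sizeof(plain)];

    Aes128CbcContext enc_ctx, dec_ctx;
    aes128CbcContextCreate(&enc_ctx, key, iv, true);   /* is_encryptor = true  */
    aes128CbcContextCreate(&dec_ctx, key, iv, false);  /* is_encryptor = false */

    /* Feed the input in two uneven chunks to exercise the internal buffering:
     * the first call buffers 10 bytes and writes nothing; the second call
     * completes the buffered block and the remaining one, writing all 32 bytes. */
    size_t written = 0;
    written += aes128CbcEncrypt(&enc_ctx, enc + written, plain, 10);
    written += aes128CbcEncrypt(&enc_ctx, enc + written, plain + 10, sizeof(plain) - 10);
    /* written == sizeof(plain) at this point. */

    /* Decrypt in one shot; dec now matches plain. */
    size_t recovered = aes128CbcDecrypt(&dec_ctx, dec, enc, written);
    (void)recovered; /* recovered == sizeof(plain); memcmp(dec, plain, sizeof(plain)) == 0. */
}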