diff --git a/nx/include/switch/crypto.h b/nx/include/switch/crypto.h
index 8b93898a..5de0edaf 100644
--- a/nx/include/switch/crypto.h
+++ b/nx/include/switch/crypto.h
@@ -6,4 +6,5 @@
 #pragma once
 
 #include "types.h"
-#include "crypto/aes.h"
\ No newline at end of file
+#include "crypto/aes.h"
+#include "crypto/aes_cbc.h"
\ No newline at end of file
diff --git a/nx/include/switch/crypto/aes.h b/nx/include/switch/crypto/aes.h
index d4fbd54e..2079e699 100644
--- a/nx/include/switch/crypto/aes.h
+++ b/nx/include/switch/crypto/aes.h
@@ -1,6 +1,6 @@
 /**
  * @file aes.h
- * @brief Switch accelerated AES implementation.
+ * @brief Switch accelerated AES-ECB implementation.
  * @copyright libnx Authors
  */
 #pragma once
@@ -27,7 +27,7 @@ typedef struct {
     u8 round_keys[AES_192_NUM_ROUNDS+1][AES_BLOCK_SIZE];
 } Aes192Context;
 
-/// Context for AES-192 operations.
+/// Context for AES-256 operations.
 typedef struct {
     u8 round_keys[AES_256_NUM_ROUNDS+1][AES_BLOCK_SIZE];
 } Aes256Context;
diff --git a/nx/include/switch/crypto/aes_cbc.h b/nx/include/switch/crypto/aes_cbc.h
new file mode 100644
index 00000000..0a5d9df1
--- /dev/null
+++ b/nx/include/switch/crypto/aes_cbc.h
@@ -0,0 +1,46 @@
+/**
+ * @file aes_cbc.h
+ * @brief Switch accelerated AES-CBC implementation.
+ * @copyright libnx Authors
+ */
+#pragma once
+#include "aes.h"
+
+/// Context for AES-128 CBC.
+typedef struct {
+    Aes128Context aes_ctx;
+    u8 iv[AES_BLOCK_SIZE];
+    u8 buffer[AES_BLOCK_SIZE];
+    size_t num_buffered;
+} Aes128CbcContext;
+
+/// Context for AES-192 CBC.
+typedef struct {
+    Aes192Context aes_ctx;
+    u8 iv[AES_BLOCK_SIZE];
+    u8 buffer[AES_BLOCK_SIZE];
+    size_t num_buffered;
+} Aes192CbcContext;
+
+/// Context for AES-256 CBC.
+typedef struct {
+    Aes256Context aes_ctx;
+    u8 iv[AES_BLOCK_SIZE];
+    u8 buffer[AES_BLOCK_SIZE];
+    size_t num_buffered;
+} Aes256CbcContext;
+
+/// 128-bit CBC API.
+void aes128CbcContextCreate(Aes128CbcContext *out, const void *key, const void *iv, bool is_encryptor);
+size_t aes128CbcEncrypt(Aes128CbcContext *ctx, void *dst, const void *src, size_t size);
+size_t aes128CbcDecrypt(Aes128CbcContext *ctx, void *dst, const void *src, size_t size);
+
+/// 192-bit CBC API.
+void aes192CbcContextCreate(Aes192CbcContext *out, const void *key, const void *iv, bool is_encryptor);
+size_t aes192CbcEncrypt(Aes192CbcContext *ctx, void *dst, const void *src, size_t size);
+size_t aes192CbcDecrypt(Aes192CbcContext *ctx, void *dst, const void *src, size_t size);
+
+/// 256-bit CBC API.
+void aes256CbcContextCreate(Aes256CbcContext *out, const void *key, const void *iv, bool is_encryptor);
+size_t aes256CbcEncrypt(Aes256CbcContext *ctx, void *dst, const void *src, size_t size);
+size_t aes256CbcDecrypt(Aes256CbcContext *ctx, void *dst, const void *src, size_t size);
diff --git a/nx/source/crypto/aes_cbc.c b/nx/source/crypto/aes_cbc.c
new file mode 100644
index 00000000..71dc25b0
--- /dev/null
+++ b/nx/source/crypto/aes_cbc.c
@@ -0,0 +1,742 @@
+#include <string.h>
+#include <stdint.h>
+#include <arm_neon.h>
+
+#include "result.h"
+#include "crypto/aes_cbc.h"
+
+/* Variable management macros. */
+#define DECLARE_ROUND_KEY_VAR(n) \
+const uint8x16_t round_key_##n = vld1q_u8(ctx->aes_ctx.round_keys[n])
+
+#define AES_ENC_DEC_OUTPUT_THREE_BLOCKS() \
+[tmp0]"+w"(tmp0), [tmp1]"+w"(tmp1), [tmp2]"+w"(tmp2)
+
+#define AES_ENC_DEC_OUTPUT_ONE_BLOCK() \
+[tmp0]"+w"(tmp0)
+
+#define AES_ENC_DEC_INPUT_ROUND_KEY(n) \
+[round_key_##n]"w"(round_key_##n)
+
+/* AES Encryption macros. */
+#define AES_ENC_ROUND(n, i) \
+"aese %[tmp" #i "].16b, %[round_key_" #n "].16b\n" \
+"aesmc %[tmp" #i "].16b, %[tmp" #i "].16b\n"
+
+#define AES_ENC_SECOND_LAST_ROUND(n, i) \
+"aese %[tmp" #i "].16b, %[round_key_" #n "].16b\n"
+
+#define AES_ENC_LAST_ROUND(n, i) \
+"eor %[tmp" #i "].16b, %[tmp" #i "].16b, %[round_key_" #n "].16b\n"
+
+/* AES Decryption macros. */
+#define AES_DEC_ROUND(n, i) \
+"aesd %[tmp" #i "].16b, %[round_key_" #n "].16b\n" \
+"aesimc %[tmp" #i "].16b, %[tmp" #i "].16b\n"
+
+#define AES_DEC_SECOND_LAST_ROUND(n, i) \
+"aesd %[tmp" #i "].16b, %[round_key_" #n "].16b\n"
+
+#define AES_DEC_LAST_ROUND(n, i) \
+"eor %[tmp" #i "].16b, %[tmp" #i "].16b, %[round_key_" #n "].16b\n"
+
+
+/* Macro for main body of crypt wrapper. */
+#define CRYPT_FUNC_BODY(block_handler) \
+({ \
+    const u8 *cur_src = (const u8 *)src; \
+    u8 *cur_dst = (u8 *)dst; \
+\
+    /* Handle pre-buffered data. */ \
+    if (ctx->num_buffered > 0) { \
+        const size_t needed = AES_BLOCK_SIZE - ctx->num_buffered; \
+        const size_t copyable = (size > needed ? needed : size); \
+        memcpy(&ctx->buffer[ctx->num_buffered], cur_src, copyable); \
+        cur_src += copyable; \
+        ctx->num_buffered += copyable; \
+        size -= copyable; \
+\
+        if (ctx->num_buffered == AES_BLOCK_SIZE) { \
+            block_handler(ctx, cur_dst, ctx->buffer, 1); \
+            cur_dst += AES_BLOCK_SIZE; \
+            ctx->num_buffered = 0; \
+        } \
+    } \
+\
+    /* Handle complete blocks. */ \
+    if (size >= AES_BLOCK_SIZE) { \
+        const size_t num_blocks = size / AES_BLOCK_SIZE; \
+        block_handler(ctx, cur_dst, cur_src, num_blocks); \
+        size -= num_blocks * AES_BLOCK_SIZE; \
+        cur_src += num_blocks * AES_BLOCK_SIZE; \
+        cur_dst += num_blocks * AES_BLOCK_SIZE; \
+    } \
+\
+    /* Buffer remaining data. */ \
+    if (size > 0) { \
+        memcpy(ctx->buffer, cur_src, size); \
+        ctx->num_buffered = size; \
+    } \
+    return (size_t)((uintptr_t)cur_dst - (uintptr_t)dst); \
+})
+
+
+void aes128CbcContextCreate(Aes128CbcContext *out, const void *key, const void *iv, bool is_encryptor) {
+    /* Initialize inner context. */
+    aes128ContextCreate(&out->aes_ctx, key, is_encryptor);
+
+    /* Set IV, nothing is buffered. */
+    memcpy(out->iv, iv, sizeof(out->iv));
+    memset(out->buffer, 0, sizeof(out->buffer));
+    out->num_buffered = 0;
+}
+
+static void _aes128CbcEncryptBlocks(Aes128CbcContext *ctx, void *dst, const void *src, size_t num_blocks) {
+    const u8 *src_u8 = (const u8 *)src;
+    u8 *dst_u8 = (u8 *)dst;
+
+    /* Preload all round keys + iv into neon registers. */
+    DECLARE_ROUND_KEY_VAR(0);
+    DECLARE_ROUND_KEY_VAR(1);
+    DECLARE_ROUND_KEY_VAR(2);
+    DECLARE_ROUND_KEY_VAR(3);
+    DECLARE_ROUND_KEY_VAR(4);
+    DECLARE_ROUND_KEY_VAR(5);
+    DECLARE_ROUND_KEY_VAR(6);
+    DECLARE_ROUND_KEY_VAR(7);
+    DECLARE_ROUND_KEY_VAR(8);
+    DECLARE_ROUND_KEY_VAR(9);
+    DECLARE_ROUND_KEY_VAR(10);
+    uint8x16_t cur_iv = vld1q_u8(ctx->iv);
+
+    /* Process blocks one at a time; CBC encryption is inherently serial. */
+    while (num_blocks >= 1) {
+        /* Read block in, xor with IV. */
+        uint8x16_t tmp0 = veorq_u8(cur_iv, vld1q_u8(src_u8));
+        src_u8 += AES_BLOCK_SIZE;
+
+        /* Actually do encryption, use optimized asm. */
+        __asm__ __volatile__ (
+            AES_ENC_ROUND(0, 0)
+            AES_ENC_ROUND(1, 0)
+            AES_ENC_ROUND(2, 0)
+            AES_ENC_ROUND(3, 0)
+            AES_ENC_ROUND(4, 0)
+            AES_ENC_ROUND(5, 0)
+            AES_ENC_ROUND(6, 0)
+            AES_ENC_ROUND(7, 0)
+            AES_ENC_ROUND(8, 0)
+            AES_ENC_SECOND_LAST_ROUND(9, 0)
+            AES_ENC_LAST_ROUND(10, 0)
+            : AES_ENC_DEC_OUTPUT_ONE_BLOCK()
+            : AES_ENC_DEC_INPUT_ROUND_KEY(0),
+              AES_ENC_DEC_INPUT_ROUND_KEY(1),
+              AES_ENC_DEC_INPUT_ROUND_KEY(2),
+              AES_ENC_DEC_INPUT_ROUND_KEY(3),
+              AES_ENC_DEC_INPUT_ROUND_KEY(4),
+              AES_ENC_DEC_INPUT_ROUND_KEY(5),
+              AES_ENC_DEC_INPUT_ROUND_KEY(6),
+              AES_ENC_DEC_INPUT_ROUND_KEY(7),
+              AES_ENC_DEC_INPUT_ROUND_KEY(8),
+              AES_ENC_DEC_INPUT_ROUND_KEY(9),
+              AES_ENC_DEC_INPUT_ROUND_KEY(10)
+        );
+
+        /* Update IV. */
+        cur_iv = tmp0;
+
+        /* Store to output. */
+        vst1q_u8(dst_u8, tmp0);
+        dst_u8 += AES_BLOCK_SIZE;
+
+        num_blocks--;
+    }
+
+    vst1q_u8(ctx->iv, cur_iv);
+}
+
+static void _aes128CbcDecryptBlocks(Aes128CbcContext *ctx, void *dst, const void *src, size_t num_blocks) {
+    const u8 *src_u8 = (const u8 *)src;
+    u8 *dst_u8 = (u8 *)dst;
+
+    /* Preload all round keys + iv into neon registers. */
+    DECLARE_ROUND_KEY_VAR(0);
+    DECLARE_ROUND_KEY_VAR(1);
+    DECLARE_ROUND_KEY_VAR(2);
+    DECLARE_ROUND_KEY_VAR(3);
+    DECLARE_ROUND_KEY_VAR(4);
+    DECLARE_ROUND_KEY_VAR(5);
+    DECLARE_ROUND_KEY_VAR(6);
+    DECLARE_ROUND_KEY_VAR(7);
+    DECLARE_ROUND_KEY_VAR(8);
+    DECLARE_ROUND_KEY_VAR(9);
+    DECLARE_ROUND_KEY_VAR(10);
+    uint8x16_t cur_iv = vld1q_u8(ctx->iv);
+
+    /* Process three blocks at a time, when possible. */
+    while (num_blocks >= 3) {
+        /* Read blocks in. Keep them in registers for XOR later. */
+        const uint8x16_t block0 = vld1q_u8(src_u8);
+        src_u8 += AES_BLOCK_SIZE;
+        const uint8x16_t block1 = vld1q_u8(src_u8);
+        src_u8 += AES_BLOCK_SIZE;
+        const uint8x16_t block2 = vld1q_u8(src_u8);
+        src_u8 += AES_BLOCK_SIZE;
+
+        uint8x16_t tmp0 = block0, tmp1 = block1, tmp2 = block2;
+
+        /* Actually do decryption, use optimized asm. */
+        __asm__ __volatile__ (
+            AES_DEC_ROUND(10, 0) AES_DEC_ROUND(10, 1) AES_DEC_ROUND(10, 2)
+            AES_DEC_ROUND(9, 0) AES_DEC_ROUND(9, 1) AES_DEC_ROUND(9, 2)
+            AES_DEC_ROUND(8, 0) AES_DEC_ROUND(8, 1) AES_DEC_ROUND(8, 2)
+            AES_DEC_ROUND(7, 0) AES_DEC_ROUND(7, 1) AES_DEC_ROUND(7, 2)
+            AES_DEC_ROUND(6, 0) AES_DEC_ROUND(6, 1) AES_DEC_ROUND(6, 2)
+            AES_DEC_ROUND(5, 0) AES_DEC_ROUND(5, 1) AES_DEC_ROUND(5, 2)
+            AES_DEC_ROUND(4, 0) AES_DEC_ROUND(4, 1) AES_DEC_ROUND(4, 2)
+            AES_DEC_ROUND(3, 0) AES_DEC_ROUND(3, 1) AES_DEC_ROUND(3, 2)
+            AES_DEC_ROUND(2, 0) AES_DEC_ROUND(2, 1) AES_DEC_ROUND(2, 2)
+            AES_DEC_SECOND_LAST_ROUND(1, 0) AES_DEC_SECOND_LAST_ROUND(1, 1) AES_DEC_SECOND_LAST_ROUND(1, 2)
+            AES_DEC_LAST_ROUND(0, 0) AES_DEC_LAST_ROUND(0, 1) AES_DEC_LAST_ROUND(0, 2)
+            : AES_ENC_DEC_OUTPUT_THREE_BLOCKS()
+            : AES_ENC_DEC_INPUT_ROUND_KEY(0),
+              AES_ENC_DEC_INPUT_ROUND_KEY(1),
+              AES_ENC_DEC_INPUT_ROUND_KEY(2),
+              AES_ENC_DEC_INPUT_ROUND_KEY(3),
+              AES_ENC_DEC_INPUT_ROUND_KEY(4),
+              AES_ENC_DEC_INPUT_ROUND_KEY(5),
+              AES_ENC_DEC_INPUT_ROUND_KEY(6),
+              AES_ENC_DEC_INPUT_ROUND_KEY(7),
+              AES_ENC_DEC_INPUT_ROUND_KEY(8),
+              AES_ENC_DEC_INPUT_ROUND_KEY(9),
+              AES_ENC_DEC_INPUT_ROUND_KEY(10)
+        );
+
+        /* Do XOR for CBC. */
+        tmp0 = veorq_u8(tmp0, cur_iv);
+        tmp1 = veorq_u8(tmp1, block0);
+        tmp2 = veorq_u8(tmp2, block1);
+        cur_iv = block2;
+
+        /* Store to output. */
+        vst1q_u8(dst_u8, tmp0);
+        dst_u8 += AES_BLOCK_SIZE;
+        vst1q_u8(dst_u8, tmp1);
+        dst_u8 += AES_BLOCK_SIZE;
+        vst1q_u8(dst_u8, tmp2);
+        dst_u8 += AES_BLOCK_SIZE;
+
+        num_blocks -= 3;
+    }
+
+    /* Process last block or two individually. */
+    while (num_blocks >= 1) {
+        /* Read block in, keep in register for IV later. */
+        const uint8x16_t block0 = vld1q_u8(src_u8);
+        src_u8 += AES_BLOCK_SIZE;
+
+        uint8x16_t tmp0 = block0;
+
+        /* Actually do decryption, use optimized asm. */
+        __asm__ __volatile__ (
+            AES_DEC_ROUND(10, 0)
+            AES_DEC_ROUND(9, 0)
+            AES_DEC_ROUND(8, 0)
+            AES_DEC_ROUND(7, 0)
+            AES_DEC_ROUND(6, 0)
+            AES_DEC_ROUND(5, 0)
+            AES_DEC_ROUND(4, 0)
+            AES_DEC_ROUND(3, 0)
+            AES_DEC_ROUND(2, 0)
+            AES_DEC_SECOND_LAST_ROUND(1, 0)
+            AES_DEC_LAST_ROUND(0, 0)
+            : AES_ENC_DEC_OUTPUT_ONE_BLOCK()
+            : AES_ENC_DEC_INPUT_ROUND_KEY(0),
+              AES_ENC_DEC_INPUT_ROUND_KEY(1),
+              AES_ENC_DEC_INPUT_ROUND_KEY(2),
+              AES_ENC_DEC_INPUT_ROUND_KEY(3),
+              AES_ENC_DEC_INPUT_ROUND_KEY(4),
+              AES_ENC_DEC_INPUT_ROUND_KEY(5),
+              AES_ENC_DEC_INPUT_ROUND_KEY(6),
+              AES_ENC_DEC_INPUT_ROUND_KEY(7),
+              AES_ENC_DEC_INPUT_ROUND_KEY(8),
+              AES_ENC_DEC_INPUT_ROUND_KEY(9),
+              AES_ENC_DEC_INPUT_ROUND_KEY(10)
+        );
+
+        /* Update IV. */
+        cur_iv = tmp0;
+
+        /* Store to output. */
+        vst1q_u8(dst_u8, tmp0);
+        dst_u8 += AES_BLOCK_SIZE;
+
+        num_blocks--;
+    }
+
+    vst1q_u8(ctx->iv, cur_iv);
+}
+
+size_t aes128CbcEncrypt(Aes128CbcContext *ctx, void *dst, const void *src, size_t size) {
+    CRYPT_FUNC_BODY(_aes128CbcEncryptBlocks);
+}
+
+size_t aes128CbcDecrypt(Aes128CbcContext *ctx, void *dst, const void *src, size_t size) {
+    CRYPT_FUNC_BODY(_aes128CbcDecryptBlocks);
+}
+
+void aes192CbcContextCreate(Aes192CbcContext *out, const void *key, const void *iv, bool is_encryptor) {
+    /* Initialize inner context. */
+    aes192ContextCreate(&out->aes_ctx, key, is_encryptor);
+
+    /* Set IV, nothing is buffered. */
+    memcpy(out->iv, iv, sizeof(out->iv));
+    memset(out->buffer, 0, sizeof(out->buffer));
+    out->num_buffered = 0;
+}
+
+static void _aes192CbcEncryptBlocks(Aes192CbcContext *ctx, void *dst, const void *src, size_t num_blocks) {
+    const u8 *src_u8 = (const u8 *)src;
+    u8 *dst_u8 = (u8 *)dst;
+
+    /* Preload all round keys + iv into neon registers. */
+    DECLARE_ROUND_KEY_VAR(0);
+    DECLARE_ROUND_KEY_VAR(1);
+    DECLARE_ROUND_KEY_VAR(2);
+    DECLARE_ROUND_KEY_VAR(3);
+    DECLARE_ROUND_KEY_VAR(4);
+    DECLARE_ROUND_KEY_VAR(5);
+    DECLARE_ROUND_KEY_VAR(6);
+    DECLARE_ROUND_KEY_VAR(7);
+    DECLARE_ROUND_KEY_VAR(8);
+    DECLARE_ROUND_KEY_VAR(9);
+    DECLARE_ROUND_KEY_VAR(10);
+    DECLARE_ROUND_KEY_VAR(11);
+    DECLARE_ROUND_KEY_VAR(12);
+    uint8x16_t cur_iv = vld1q_u8(ctx->iv);
+
+    /* Process blocks one at a time; CBC encryption is inherently serial. */
+    while (num_blocks >= 1) {
+        /* Read block in, xor with IV. */
+        uint8x16_t tmp0 = veorq_u8(cur_iv, vld1q_u8(src_u8));
+        src_u8 += AES_BLOCK_SIZE;
+
+        /* Actually do encryption, use optimized asm. */
+        __asm__ __volatile__ (
+            AES_ENC_ROUND(0, 0)
+            AES_ENC_ROUND(1, 0)
+            AES_ENC_ROUND(2, 0)
+            AES_ENC_ROUND(3, 0)
+            AES_ENC_ROUND(4, 0)
+            AES_ENC_ROUND(5, 0)
+            AES_ENC_ROUND(6, 0)
+            AES_ENC_ROUND(7, 0)
+            AES_ENC_ROUND(8, 0)
+            AES_ENC_ROUND(9, 0)
+            AES_ENC_ROUND(10, 0)
+            AES_ENC_SECOND_LAST_ROUND(11, 0)
+            AES_ENC_LAST_ROUND(12, 0)
+            : AES_ENC_DEC_OUTPUT_ONE_BLOCK()
+            : AES_ENC_DEC_INPUT_ROUND_KEY(0),
+              AES_ENC_DEC_INPUT_ROUND_KEY(1),
+              AES_ENC_DEC_INPUT_ROUND_KEY(2),
+              AES_ENC_DEC_INPUT_ROUND_KEY(3),
+              AES_ENC_DEC_INPUT_ROUND_KEY(4),
+              AES_ENC_DEC_INPUT_ROUND_KEY(5),
+              AES_ENC_DEC_INPUT_ROUND_KEY(6),
+              AES_ENC_DEC_INPUT_ROUND_KEY(7),
+              AES_ENC_DEC_INPUT_ROUND_KEY(8),
+              AES_ENC_DEC_INPUT_ROUND_KEY(9),
+              AES_ENC_DEC_INPUT_ROUND_KEY(10),
+              AES_ENC_DEC_INPUT_ROUND_KEY(11),
+              AES_ENC_DEC_INPUT_ROUND_KEY(12)
+        );
+
+        /* Update IV. */
+        cur_iv = tmp0;
+
+        /* Store to output. */
+        vst1q_u8(dst_u8, tmp0);
+        dst_u8 += AES_BLOCK_SIZE;
+
+        num_blocks--;
+    }
+
+    vst1q_u8(ctx->iv, cur_iv);
+}
+
+static void _aes192CbcDecryptBlocks(Aes192CbcContext *ctx, void *dst, const void *src, size_t num_blocks) {
+    const u8 *src_u8 = (const u8 *)src;
+    u8 *dst_u8 = (u8 *)dst;
+
+    /* Preload all round keys + iv into neon registers. */
+    DECLARE_ROUND_KEY_VAR(0);
+    DECLARE_ROUND_KEY_VAR(1);
+    DECLARE_ROUND_KEY_VAR(2);
+    DECLARE_ROUND_KEY_VAR(3);
+    DECLARE_ROUND_KEY_VAR(4);
+    DECLARE_ROUND_KEY_VAR(5);
+    DECLARE_ROUND_KEY_VAR(6);
+    DECLARE_ROUND_KEY_VAR(7);
+    DECLARE_ROUND_KEY_VAR(8);
+    DECLARE_ROUND_KEY_VAR(9);
+    DECLARE_ROUND_KEY_VAR(10);
+    DECLARE_ROUND_KEY_VAR(11);
+    DECLARE_ROUND_KEY_VAR(12);
+    uint8x16_t cur_iv = vld1q_u8(ctx->iv);
+
+    /* Process three blocks at a time, when possible. */
+    while (num_blocks >= 3) {
+        /* Read blocks in. Keep them in registers for XOR later. */
+        const uint8x16_t block0 = vld1q_u8(src_u8);
+        src_u8 += AES_BLOCK_SIZE;
+        const uint8x16_t block1 = vld1q_u8(src_u8);
+        src_u8 += AES_BLOCK_SIZE;
+        const uint8x16_t block2 = vld1q_u8(src_u8);
+        src_u8 += AES_BLOCK_SIZE;
+
+        uint8x16_t tmp0 = block0, tmp1 = block1, tmp2 = block2;
+
+        /* Actually do decryption, use optimized asm. */
+        __asm__ __volatile__ (
+            AES_DEC_ROUND(12, 0) AES_DEC_ROUND(12, 1) AES_DEC_ROUND(12, 2)
+            AES_DEC_ROUND(11, 0) AES_DEC_ROUND(11, 1) AES_DEC_ROUND(11, 2)
+            AES_DEC_ROUND(10, 0) AES_DEC_ROUND(10, 1) AES_DEC_ROUND(10, 2)
+            AES_DEC_ROUND(9, 0) AES_DEC_ROUND(9, 1) AES_DEC_ROUND(9, 2)
+            AES_DEC_ROUND(8, 0) AES_DEC_ROUND(8, 1) AES_DEC_ROUND(8, 2)
+            AES_DEC_ROUND(7, 0) AES_DEC_ROUND(7, 1) AES_DEC_ROUND(7, 2)
+            AES_DEC_ROUND(6, 0) AES_DEC_ROUND(6, 1) AES_DEC_ROUND(6, 2)
+            AES_DEC_ROUND(5, 0) AES_DEC_ROUND(5, 1) AES_DEC_ROUND(5, 2)
+            AES_DEC_ROUND(4, 0) AES_DEC_ROUND(4, 1) AES_DEC_ROUND(4, 2)
+            AES_DEC_ROUND(3, 0) AES_DEC_ROUND(3, 1) AES_DEC_ROUND(3, 2)
+            AES_DEC_ROUND(2, 0) AES_DEC_ROUND(2, 1) AES_DEC_ROUND(2, 2)
+            AES_DEC_SECOND_LAST_ROUND(1, 0) AES_DEC_SECOND_LAST_ROUND(1, 1) AES_DEC_SECOND_LAST_ROUND(1, 2)
+            AES_DEC_LAST_ROUND(0, 0) AES_DEC_LAST_ROUND(0, 1) AES_DEC_LAST_ROUND(0, 2)
+            : AES_ENC_DEC_OUTPUT_THREE_BLOCKS()
+            : AES_ENC_DEC_INPUT_ROUND_KEY(0),
+              AES_ENC_DEC_INPUT_ROUND_KEY(1),
+              AES_ENC_DEC_INPUT_ROUND_KEY(2),
+              AES_ENC_DEC_INPUT_ROUND_KEY(3),
+              AES_ENC_DEC_INPUT_ROUND_KEY(4),
+              AES_ENC_DEC_INPUT_ROUND_KEY(5),
+              AES_ENC_DEC_INPUT_ROUND_KEY(6),
+              AES_ENC_DEC_INPUT_ROUND_KEY(7),
+              AES_ENC_DEC_INPUT_ROUND_KEY(8),
+              AES_ENC_DEC_INPUT_ROUND_KEY(9),
+              AES_ENC_DEC_INPUT_ROUND_KEY(10),
+              AES_ENC_DEC_INPUT_ROUND_KEY(11),
+              AES_ENC_DEC_INPUT_ROUND_KEY(12)
+        );
+
+        /* Do XOR for CBC. */
+        tmp0 = veorq_u8(tmp0, cur_iv);
+        tmp1 = veorq_u8(tmp1, block0);
+        tmp2 = veorq_u8(tmp2, block1);
+        cur_iv = block2;
+
+        /* Store to output. */
+        vst1q_u8(dst_u8, tmp0);
+        dst_u8 += AES_BLOCK_SIZE;
+        vst1q_u8(dst_u8, tmp1);
+        dst_u8 += AES_BLOCK_SIZE;
+        vst1q_u8(dst_u8, tmp2);
+        dst_u8 += AES_BLOCK_SIZE;
+
+        num_blocks -= 3;
+    }
+
+    /* Process last block or two individually. */
+    while (num_blocks >= 1) {
+        /* Read block in, keep in register for IV later. */
+        const uint8x16_t block0 = vld1q_u8(src_u8);
+        src_u8 += AES_BLOCK_SIZE;
+
+        uint8x16_t tmp0 = block0;
+
+        /* Actually do decryption, use optimized asm. */
+        __asm__ __volatile__ (
+            AES_DEC_ROUND(12, 0)
+            AES_DEC_ROUND(11, 0)
+            AES_DEC_ROUND(10, 0)
+            AES_DEC_ROUND(9, 0)
+            AES_DEC_ROUND(8, 0)
+            AES_DEC_ROUND(7, 0)
+            AES_DEC_ROUND(6, 0)
+            AES_DEC_ROUND(5, 0)
+            AES_DEC_ROUND(4, 0)
+            AES_DEC_ROUND(3, 0)
+            AES_DEC_ROUND(2, 0)
+            AES_DEC_SECOND_LAST_ROUND(1, 0)
+            AES_DEC_LAST_ROUND(0, 0)
+            : AES_ENC_DEC_OUTPUT_ONE_BLOCK()
+            : AES_ENC_DEC_INPUT_ROUND_KEY(0),
+              AES_ENC_DEC_INPUT_ROUND_KEY(1),
+              AES_ENC_DEC_INPUT_ROUND_KEY(2),
+              AES_ENC_DEC_INPUT_ROUND_KEY(3),
+              AES_ENC_DEC_INPUT_ROUND_KEY(4),
+              AES_ENC_DEC_INPUT_ROUND_KEY(5),
+              AES_ENC_DEC_INPUT_ROUND_KEY(6),
+              AES_ENC_DEC_INPUT_ROUND_KEY(7),
+              AES_ENC_DEC_INPUT_ROUND_KEY(8),
+              AES_ENC_DEC_INPUT_ROUND_KEY(9),
+              AES_ENC_DEC_INPUT_ROUND_KEY(10),
+              AES_ENC_DEC_INPUT_ROUND_KEY(11),
+              AES_ENC_DEC_INPUT_ROUND_KEY(12)
+        );
+
+        /* Update IV. */
+        cur_iv = tmp0;
+
+        /* Store to output. */
+        vst1q_u8(dst_u8, tmp0);
+        dst_u8 += AES_BLOCK_SIZE;
+
+        num_blocks--;
+    }
+
+    vst1q_u8(ctx->iv, cur_iv);
+}
+
+size_t aes192CbcEncrypt(Aes192CbcContext *ctx, void *dst, const void *src, size_t size) {
+    CRYPT_FUNC_BODY(_aes192CbcEncryptBlocks);
+}
+
+size_t aes192CbcDecrypt(Aes192CbcContext *ctx, void *dst, const void *src, size_t size) {
+    CRYPT_FUNC_BODY(_aes192CbcDecryptBlocks);
+}
+
+void aes256CbcContextCreate(Aes256CbcContext *out, const void *key, const void *iv, bool is_encryptor) {
+    /* Initialize inner context. */
+    aes256ContextCreate(&out->aes_ctx, key, is_encryptor);
+
+    /* Set IV, nothing is buffered. */
+    memcpy(out->iv, iv, sizeof(out->iv));
+    memset(out->buffer, 0, sizeof(out->buffer));
+    out->num_buffered = 0;
+}
+
+static void _aes256CbcEncryptBlocks(Aes256CbcContext *ctx, void *dst, const void *src, size_t num_blocks) {
+    const u8 *src_u8 = (const u8 *)src;
+    u8 *dst_u8 = (u8 *)dst;
+
+    /* Preload all round keys + iv into neon registers. */
+    DECLARE_ROUND_KEY_VAR(0);
+    DECLARE_ROUND_KEY_VAR(1);
+    DECLARE_ROUND_KEY_VAR(2);
+    DECLARE_ROUND_KEY_VAR(3);
+    DECLARE_ROUND_KEY_VAR(4);
+    DECLARE_ROUND_KEY_VAR(5);
+    DECLARE_ROUND_KEY_VAR(6);
+    DECLARE_ROUND_KEY_VAR(7);
+    DECLARE_ROUND_KEY_VAR(8);
+    DECLARE_ROUND_KEY_VAR(9);
+    DECLARE_ROUND_KEY_VAR(10);
+    DECLARE_ROUND_KEY_VAR(11);
+    DECLARE_ROUND_KEY_VAR(12);
+    DECLARE_ROUND_KEY_VAR(13);
+    DECLARE_ROUND_KEY_VAR(14);
+    uint8x16_t cur_iv = vld1q_u8(ctx->iv);
+
+    /* Process blocks one at a time; CBC encryption is inherently serial. */
+    while (num_blocks >= 1) {
+        /* Read block in, xor with IV. */
+        uint8x16_t tmp0 = veorq_u8(cur_iv, vld1q_u8(src_u8));
+        src_u8 += AES_BLOCK_SIZE;
+
+        /* Actually do encryption, use optimized asm. */
+        __asm__ __volatile__ (
+            AES_ENC_ROUND(0, 0)
+            AES_ENC_ROUND(1, 0)
+            AES_ENC_ROUND(2, 0)
+            AES_ENC_ROUND(3, 0)
+            AES_ENC_ROUND(4, 0)
+            AES_ENC_ROUND(5, 0)
+            AES_ENC_ROUND(6, 0)
+            AES_ENC_ROUND(7, 0)
+            AES_ENC_ROUND(8, 0)
+            AES_ENC_ROUND(9, 0)
+            AES_ENC_ROUND(10, 0)
+            AES_ENC_ROUND(11, 0)
+            AES_ENC_ROUND(12, 0)
+            AES_ENC_SECOND_LAST_ROUND(13, 0)
+            AES_ENC_LAST_ROUND(14, 0)
+            : AES_ENC_DEC_OUTPUT_ONE_BLOCK()
+            : AES_ENC_DEC_INPUT_ROUND_KEY(0),
+              AES_ENC_DEC_INPUT_ROUND_KEY(1),
+              AES_ENC_DEC_INPUT_ROUND_KEY(2),
+              AES_ENC_DEC_INPUT_ROUND_KEY(3),
+              AES_ENC_DEC_INPUT_ROUND_KEY(4),
+              AES_ENC_DEC_INPUT_ROUND_KEY(5),
+              AES_ENC_DEC_INPUT_ROUND_KEY(6),
+              AES_ENC_DEC_INPUT_ROUND_KEY(7),
+              AES_ENC_DEC_INPUT_ROUND_KEY(8),
+              AES_ENC_DEC_INPUT_ROUND_KEY(9),
+              AES_ENC_DEC_INPUT_ROUND_KEY(10),
+              AES_ENC_DEC_INPUT_ROUND_KEY(11),
+              AES_ENC_DEC_INPUT_ROUND_KEY(12),
+              AES_ENC_DEC_INPUT_ROUND_KEY(13),
+              AES_ENC_DEC_INPUT_ROUND_KEY(14)
+        );
+
+        /* Update IV. */
+        cur_iv = tmp0;
+
+        /* Store to output. */
+        vst1q_u8(dst_u8, tmp0);
+        dst_u8 += AES_BLOCK_SIZE;
+
+        num_blocks--;
+    }
+
+    vst1q_u8(ctx->iv, cur_iv);
+}
+
+static void _aes256CbcDecryptBlocks(Aes256CbcContext *ctx, void *dst, const void *src, size_t num_blocks) {
+    const u8 *src_u8 = (const u8 *)src;
+    u8 *dst_u8 = (u8 *)dst;
+
+    /* Preload all round keys + iv into neon registers. */
+    DECLARE_ROUND_KEY_VAR(0);
+    DECLARE_ROUND_KEY_VAR(1);
+    DECLARE_ROUND_KEY_VAR(2);
+    DECLARE_ROUND_KEY_VAR(3);
+    DECLARE_ROUND_KEY_VAR(4);
+    DECLARE_ROUND_KEY_VAR(5);
+    DECLARE_ROUND_KEY_VAR(6);
+    DECLARE_ROUND_KEY_VAR(7);
+    DECLARE_ROUND_KEY_VAR(8);
+    DECLARE_ROUND_KEY_VAR(9);
+    DECLARE_ROUND_KEY_VAR(10);
+    DECLARE_ROUND_KEY_VAR(11);
+    DECLARE_ROUND_KEY_VAR(12);
+    DECLARE_ROUND_KEY_VAR(13);
+    DECLARE_ROUND_KEY_VAR(14);
+    uint8x16_t cur_iv = vld1q_u8(ctx->iv);
+
+    /* Process three blocks at a time, when possible. */
+    while (num_blocks >= 3) {
+        /* Read blocks in. Keep them in registers for XOR later. */
+        const uint8x16_t block0 = vld1q_u8(src_u8);
+        src_u8 += AES_BLOCK_SIZE;
+        const uint8x16_t block1 = vld1q_u8(src_u8);
+        src_u8 += AES_BLOCK_SIZE;
+        const uint8x16_t block2 = vld1q_u8(src_u8);
+        src_u8 += AES_BLOCK_SIZE;
+
+        uint8x16_t tmp0 = block0, tmp1 = block1, tmp2 = block2;
+
+        /* Actually do decryption, use optimized asm. */
+        __asm__ __volatile__ (
+            AES_DEC_ROUND(14, 0) AES_DEC_ROUND(14, 1) AES_DEC_ROUND(14, 2)
+            AES_DEC_ROUND(13, 0) AES_DEC_ROUND(13, 1) AES_DEC_ROUND(13, 2)
+            AES_DEC_ROUND(12, 0) AES_DEC_ROUND(12, 1) AES_DEC_ROUND(12, 2)
+            AES_DEC_ROUND(11, 0) AES_DEC_ROUND(11, 1) AES_DEC_ROUND(11, 2)
+            AES_DEC_ROUND(10, 0) AES_DEC_ROUND(10, 1) AES_DEC_ROUND(10, 2)
+            AES_DEC_ROUND(9, 0) AES_DEC_ROUND(9, 1) AES_DEC_ROUND(9, 2)
+            AES_DEC_ROUND(8, 0) AES_DEC_ROUND(8, 1) AES_DEC_ROUND(8, 2)
+            AES_DEC_ROUND(7, 0) AES_DEC_ROUND(7, 1) AES_DEC_ROUND(7, 2)
+            AES_DEC_ROUND(6, 0) AES_DEC_ROUND(6, 1) AES_DEC_ROUND(6, 2)
+            AES_DEC_ROUND(5, 0) AES_DEC_ROUND(5, 1) AES_DEC_ROUND(5, 2)
+            AES_DEC_ROUND(4, 0) AES_DEC_ROUND(4, 1) AES_DEC_ROUND(4, 2)
+            AES_DEC_ROUND(3, 0) AES_DEC_ROUND(3, 1) AES_DEC_ROUND(3, 2)
+            AES_DEC_ROUND(2, 0) AES_DEC_ROUND(2, 1) AES_DEC_ROUND(2, 2)
+            AES_DEC_SECOND_LAST_ROUND(1, 0) AES_DEC_SECOND_LAST_ROUND(1, 1) AES_DEC_SECOND_LAST_ROUND(1, 2)
+            AES_DEC_LAST_ROUND(0, 0) AES_DEC_LAST_ROUND(0, 1) AES_DEC_LAST_ROUND(0, 2)
+            : AES_ENC_DEC_OUTPUT_THREE_BLOCKS()
+            : AES_ENC_DEC_INPUT_ROUND_KEY(0),
+              AES_ENC_DEC_INPUT_ROUND_KEY(1),
+              AES_ENC_DEC_INPUT_ROUND_KEY(2),
+              AES_ENC_DEC_INPUT_ROUND_KEY(3),
+              AES_ENC_DEC_INPUT_ROUND_KEY(4),
+              AES_ENC_DEC_INPUT_ROUND_KEY(5),
+              AES_ENC_DEC_INPUT_ROUND_KEY(6),
+              AES_ENC_DEC_INPUT_ROUND_KEY(7),
+              AES_ENC_DEC_INPUT_ROUND_KEY(8),
+              AES_ENC_DEC_INPUT_ROUND_KEY(9),
+              AES_ENC_DEC_INPUT_ROUND_KEY(10),
+              AES_ENC_DEC_INPUT_ROUND_KEY(11),
+              AES_ENC_DEC_INPUT_ROUND_KEY(12),
+              AES_ENC_DEC_INPUT_ROUND_KEY(13),
+              AES_ENC_DEC_INPUT_ROUND_KEY(14)
+        );
+
+        /* Do XOR for CBC. */
+        tmp0 = veorq_u8(tmp0, cur_iv);
+        tmp1 = veorq_u8(tmp1, block0);
+        tmp2 = veorq_u8(tmp2, block1);
+        cur_iv = block2;
+
+        /* Store to output. */
+        vst1q_u8(dst_u8, tmp0);
+        dst_u8 += AES_BLOCK_SIZE;
+        vst1q_u8(dst_u8, tmp1);
+        dst_u8 += AES_BLOCK_SIZE;
+        vst1q_u8(dst_u8, tmp2);
+        dst_u8 += AES_BLOCK_SIZE;
+
+        num_blocks -= 3;
+    }
+
+    /* Process last block or two individually. */
+    while (num_blocks >= 1) {
+        /* Read block in, keep in register for IV later. */
+        const uint8x16_t block0 = vld1q_u8(src_u8);
+        src_u8 += AES_BLOCK_SIZE;
+
+        uint8x16_t tmp0 = block0;
+
+        /* Actually do decryption, use optimized asm. */
+        __asm__ __volatile__ (
+            AES_DEC_ROUND(14, 0)
+            AES_DEC_ROUND(13, 0)
+            AES_DEC_ROUND(12, 0)
+            AES_DEC_ROUND(11, 0)
+            AES_DEC_ROUND(10, 0)
+            AES_DEC_ROUND(9, 0)
+            AES_DEC_ROUND(8, 0)
+            AES_DEC_ROUND(7, 0)
+            AES_DEC_ROUND(6, 0)
+            AES_DEC_ROUND(5, 0)
+            AES_DEC_ROUND(4, 0)
+            AES_DEC_ROUND(3, 0)
+            AES_DEC_ROUND(2, 0)
+            AES_DEC_SECOND_LAST_ROUND(1, 0)
+            AES_DEC_LAST_ROUND(0, 0)
+            : AES_ENC_DEC_OUTPUT_ONE_BLOCK()
+            : AES_ENC_DEC_INPUT_ROUND_KEY(0),
+              AES_ENC_DEC_INPUT_ROUND_KEY(1),
+              AES_ENC_DEC_INPUT_ROUND_KEY(2),
+              AES_ENC_DEC_INPUT_ROUND_KEY(3),
+              AES_ENC_DEC_INPUT_ROUND_KEY(4),
+              AES_ENC_DEC_INPUT_ROUND_KEY(5),
+              AES_ENC_DEC_INPUT_ROUND_KEY(6),
+              AES_ENC_DEC_INPUT_ROUND_KEY(7),
+              AES_ENC_DEC_INPUT_ROUND_KEY(8),
+              AES_ENC_DEC_INPUT_ROUND_KEY(9),
+              AES_ENC_DEC_INPUT_ROUND_KEY(10),
+              AES_ENC_DEC_INPUT_ROUND_KEY(11),
+              AES_ENC_DEC_INPUT_ROUND_KEY(12),
+              AES_ENC_DEC_INPUT_ROUND_KEY(13),
+              AES_ENC_DEC_INPUT_ROUND_KEY(14)
+        );
+
+        /* Update IV. */
+        cur_iv = tmp0;
+
+        /* Store to output. */
+        vst1q_u8(dst_u8, tmp0);
+        dst_u8 += AES_BLOCK_SIZE;
+
+        num_blocks--;
+    }
+
+    vst1q_u8(ctx->iv, cur_iv);
+}
+
+size_t aes256CbcEncrypt(Aes256CbcContext *ctx, void *dst, const void *src, size_t size) {
+    CRYPT_FUNC_BODY(_aes256CbcEncryptBlocks);
+}
+
+size_t aes256CbcDecrypt(Aes256CbcContext *ctx, void *dst, const void *src, size_t size) {
+    CRYPT_FUNC_BODY(_aes256CbcDecryptBlocks);
+}
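--
Usage sketch (reviewer note, not part of the patch): a minimal example of how the new
streaming CBC API is meant to be driven, assuming the patched headers are installed and
reachable through <switch.h>. The key/IV values and the function cbcRoundtripExample are
illustrative placeholders only. Note that the encrypt/decrypt functions return the number
of bytes written, which can lag behind the number of bytes fed in: partial blocks are held
in the context's internal buffer until a full 16-byte block accumulates, and no padding is
ever applied, so total input must be a multiple of AES_BLOCK_SIZE for all output to appear.

#include <string.h>
#include <switch.h>

static void cbcRoundtripExample(void) {
    const u8 key[16] = {0};              /* 16-byte AES-128 key (placeholder). */
    const u8 iv[AES_BLOCK_SIZE] = {0};   /* Initialization vector (placeholder). */
    u8 plain[2 * AES_BLOCK_SIZE] = "libnx aes-cbc example payload!";
    u8 enc[sizeof(plain)];
    u8 dec[sizeof(plain)];

    Aes128CbcContext enc_ctx, dec_ctx;
    aes128CbcContextCreate(&enc_ctx, key, iv, true);   /* is_encryptor = true  */
    aes128CbcContextCreate(&dec_ctx, key, iv, false);  /* is_encryptor = false */

    /* Feed the input in two uneven chunks to exercise the internal buffering:
     * the first call buffers 10 bytes and writes nothing; the second call
     * completes the buffered block and the remaining one, writing all 32 bytes. */
    size_t written = 0;
    written += aes128CbcEncrypt(&enc_ctx, enc + written, plain, 10);
    written += aes128CbcEncrypt(&enc_ctx, enc + written, plain + 10, sizeof(plain) - 10);
    /* written == sizeof(plain) at this point. */

    /* Decrypt in one shot; dec now matches plain. */
    size_t recovered = aes128CbcDecrypt(&dec_ctx, dec, enc, written);
    (void)recovered; /* recovered == sizeof(plain); memcmp(dec, plain, sizeof(plain)) == 0. */
}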