From f2fe4da72253499be71e9f35e8e6210b17305a27 Mon Sep 17 00:00:00 2001
From: Michael Scire
Date: Wed, 3 Apr 2019 12:04:45 -0700
Subject: [PATCH] Implement accelerated AES-XTS

---
 nx/include/switch/crypto.h         |    3 +-
 nx/include/switch/crypto/aes_xts.h |   55 ++
 nx/source/crypto/aes_xts.c         | 1207 ++++++++++++++++++++++++++++
 3 files changed, 1264 insertions(+), 1 deletion(-)
 create mode 100644 nx/include/switch/crypto/aes_xts.h
 create mode 100644 nx/source/crypto/aes_xts.c

diff --git a/nx/include/switch/crypto.h b/nx/include/switch/crypto.h
index c95f9be9..7e03e2bb 100644
--- a/nx/include/switch/crypto.h
+++ b/nx/include/switch/crypto.h
@@ -8,4 +8,5 @@
 #include "crypto/aes.h"
 #include "crypto/aes_cbc.h"
-#include "crypto/aes_ctr.h"
\ No newline at end of file
+#include "crypto/aes_ctr.h"
+#include "crypto/aes_xts.h"
\ No newline at end of file

diff --git a/nx/include/switch/crypto/aes_xts.h b/nx/include/switch/crypto/aes_xts.h
new file mode 100644
index 00000000..624188c6
--- /dev/null
+++ b/nx/include/switch/crypto/aes_xts.h
@@ -0,0 +1,55 @@
+/**
+ * @file aes_xts.h
+ * @brief Switch accelerated AES-XTS implementation.
+ * @copyright libnx Authors
+ */
+#pragma once
+#include "aes.h"
+
+/// Context for AES-128 XTS.
+typedef struct {
+    Aes128Context aes_ctx;
+    Aes128Context tweak_ctx;
+    u8 tweak[AES_BLOCK_SIZE];
+    u8 buffer[AES_BLOCK_SIZE];
+    size_t num_buffered;
+} Aes128XtsContext;
+
+/// Context for AES-192 XTS.
+typedef struct {
+    Aes192Context aes_ctx;
+    Aes192Context tweak_ctx;
+    u8 tweak[AES_BLOCK_SIZE];
+    u8 buffer[AES_BLOCK_SIZE];
+    size_t num_buffered;
+} Aes192XtsContext;
+
+/// Context for AES-256 XTS.
+typedef struct {
+    Aes256Context aes_ctx;
+    Aes256Context tweak_ctx;
+    u8 tweak[AES_BLOCK_SIZE];
+    u8 buffer[AES_BLOCK_SIZE];
+    size_t num_buffered;
+} Aes256XtsContext;
+
+/// 128-bit XTS API.
+void aes128XtsContextCreate(Aes128XtsContext *out, const void *key0, const void *key1, bool is_encryptor);
+void aes128XtsContextResetTweak(Aes128XtsContext *ctx, const void *tweak);
+void aes128XtsContextResetSector(Aes128XtsContext *ctx, uint64_t sector, bool nintendo);
+size_t aes128XtsEncrypt(Aes128XtsContext *ctx, void *dst, const void *src, size_t size);
+size_t aes128XtsDecrypt(Aes128XtsContext *ctx, void *dst, const void *src, size_t size);
+
+/// 192-bit XTS API.
+void aes192XtsContextCreate(Aes192XtsContext *out, const void *key0, const void *key1, bool is_encryptor);
+void aes192XtsContextResetTweak(Aes192XtsContext *ctx, const void *tweak);
+void aes192XtsContextResetSector(Aes192XtsContext *ctx, uint64_t sector, bool nintendo);
+size_t aes192XtsEncrypt(Aes192XtsContext *ctx, void *dst, const void *src, size_t size);
+size_t aes192XtsDecrypt(Aes192XtsContext *ctx, void *dst, const void *src, size_t size);
+
+/// 256-bit XTS API.
+void aes256XtsContextCreate(Aes256XtsContext *out, const void *key0, const void *key1, bool is_encryptor);
+void aes256XtsContextResetTweak(Aes256XtsContext *ctx, const void *tweak);
+void aes256XtsContextResetSector(Aes256XtsContext *ctx, uint64_t sector, bool nintendo);
+size_t aes256XtsEncrypt(Aes256XtsContext *ctx, void *dst, const void *src, size_t size);
+size_t aes256XtsDecrypt(Aes256XtsContext *ctx, void *dst, const void *src, size_t size);
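
Usage sketch (illustrative only, not part of the patch; key0/key1/buf/sector are hypothetical names): the API is sector-oriented, with key0 keying the data cipher and key1 keying the tweak cipher, and the sector size is the caller's choice (0x200 here for illustration).

    Aes128XtsContext ctx;
    u8 buf[0x200]; /* one sector, (de)crypted in place */

    aes128XtsContextCreate(&ctx, key0, key1, false); /* false = decryptor */
    aes128XtsContextResetSector(&ctx, sector, true); /* true = Nintendo's big-endian sector encoding */
    aes128XtsDecrypt(&ctx, buf, buf, sizeof(buf));

In-place operation is safe: each 16-byte block is fully loaded before its output is stored.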
diff --git a/nx/source/crypto/aes_xts.c b/nx/source/crypto/aes_xts.c
new file mode 100644
index 00000000..ceb50823
--- /dev/null
+++ b/nx/source/crypto/aes_xts.c
@@ -0,0 +1,1207 @@
+#include <string.h>
+#include <stdlib.h>
+#include <arm_neon.h>
+
+#include "result.h"
+#include "crypto/aes.h"
+#include "crypto/aes_xts.h"
+
+/* Variable management macros. */
+#define DECLARE_ROUND_KEY_VAR(n) \
+const uint8x16_t round_key_##n = vld1q_u8(ctx->aes_ctx.round_keys[n])
+
+#define AES_ENC_DEC_OUTPUT_THREE_BLOCKS() \
+[tmp0]"+w"(tmp0), [tmp1]"+w"(tmp1), [tmp2]"+w"(tmp2)
+
+#define AES_ENC_DEC_OUTPUT_THREE_TWEAKS() \
+[tweak0]"+w"(tweak0), [tweak1]"+w"(tweak1), [tweak2]"+w"(tweak2)
+
+#define AES_ENC_DEC_OUTPUT_ONE_BLOCK() \
+[tmp0]"+w"(tmp0)
+
+#define AES_ENC_DEC_OUTPUT_ONE_TWEAK() \
+[tweak0]"+w"(tweak0)
+
+#define XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK() \
+[high]"=&r"(high), [low]"=&r"(low), [mask]"=&r"(mask)
+
+#define XTS_INCREMENT_INPUT_XOR() \
+[xor]"r"(xor)
+
+#define AES_ENC_DEC_INPUT_ROUND_KEY(n) \
+[round_key_##n]"w"(round_key_##n)
+
+/* AES Encryption macros. */
+#define AES_ENC_ROUND(n, i) \
+"aese %[tmp" #i "].16b, %[round_key_" #n "].16b\n" \
+"aesmc %[tmp" #i "].16b, %[tmp" #i "].16b\n"
+
+#define AES_ENC_SECOND_LAST_ROUND(n, i) \
+"aese %[tmp" #i "].16b, %[round_key_" #n "].16b\n"
+
+#define AES_ENC_LAST_ROUND(n, i) \
+"eor %[tmp" #i "].16b, %[tmp" #i "].16b, %[round_key_" #n "].16b\n"
+
+/* AES Decryption macros. */
+#define AES_DEC_ROUND(n, i) \
+"aesd %[tmp" #i "].16b, %[round_key_" #n "].16b\n" \
+"aesimc %[tmp" #i "].16b, %[tmp" #i "].16b\n"
+
+#define AES_DEC_SECOND_LAST_ROUND(n, i) \
+"aesd %[tmp" #i "].16b, %[round_key_" #n "].16b\n"
+
+#define AES_DEC_LAST_ROUND(n, i) \
+"eor %[tmp" #i "].16b, %[tmp" #i "].16b, %[round_key_" #n "].16b\n"
+
+/* Macro for main body of crypt wrapper. */
+#define CRYPT_FUNC_BODY(block_handler) \
+do { \
+    const u8 *cur_src = src; \
+    u8 *cur_dst = dst; \
+\
+    /* Handle pre-buffered data. */ \
+    if (ctx->num_buffered > 0) { \
+        const size_t needed = AES_BLOCK_SIZE - ctx->num_buffered; \
+        const size_t copyable = (size > needed ? needed : size); \
+        memcpy(&ctx->buffer[ctx->num_buffered], cur_src, copyable); \
+        cur_src += copyable; \
+        ctx->num_buffered += copyable; \
+        size -= copyable; \
+\
+        if (ctx->num_buffered == AES_BLOCK_SIZE) { \
+            block_handler(ctx, cur_dst, ctx->buffer, 1); \
+            cur_dst += AES_BLOCK_SIZE; \
+            ctx->num_buffered = 0; \
+        } \
+    } \
+\
+    /* Handle complete blocks. */ \
+    if (size >= AES_BLOCK_SIZE) { \
+        const size_t num_blocks = size / AES_BLOCK_SIZE; \
+        block_handler(ctx, cur_dst, cur_src, num_blocks); \
+        size -= num_blocks * AES_BLOCK_SIZE; \
+        cur_src += num_blocks * AES_BLOCK_SIZE; \
+        cur_dst += num_blocks * AES_BLOCK_SIZE; \
+    } \
+\
+    /* Buffer remaining data. */ \
+    if (size > 0) { \
+        memcpy(ctx->buffer, cur_src, size); \
+        ctx->num_buffered = size; \
+    } \
+    return (size_t)((uintptr_t)cur_dst - (uintptr_t)dst); \
+} while (0)
+
+static inline uint8x16_t _multiplyTweak(const uint8x16_t tweak) {
+    uint8x16_t mult;
+    uint64_t high, low, mask;
+    const uint64_t xor = 0x87ul;
+    /* Use inline asm. TODO: is this actually better than intrinsics? */
+    __asm__ __volatile__ (
+        "mov %[high], %[tweak].d[1]\n"
+        "mov %[low], %[tweak].d[0]\n"
+        "and %[mask], %[xor], %[high], asr 63\n"
+        "extr %[high], %[high], %[low], 63\n"
+        "eor %[low], %[mask], %[low], lsl 1\n"
+        "mov %[mult].d[1], %[high]\n"
+        "mov %[mult].d[0], %[low]\n"
+        : [mult]"=w"(mult),
+          XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK()
+        : [tweak]"w"(tweak),
+          XTS_INCREMENT_INPUT_XOR()
+        : "cc"
+    );
+    return mult;
+}
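
Reference note (illustrative, not part of the patch): _multiplyTweak advances the tweak by multiplying it by x in GF(2^128) modulo the XTS polynomial x^128 + x^7 + x^2 + x + 1, treating the 16 tweak bytes as a little-endian 128-bit integer, hence the 0x87 feedback constant. A portable C sketch of the same operation (hypothetical helper, shown only to document the asm):

    static void xtsMultiplyTweakPortable(uint64_t t[2]) {
        /* 0x87 if the top bit is set, else 0 -- "and mask, xor, high, asr 63". */
        const uint64_t carry = (uint64_t)((int64_t)t[1] >> 63) & 0x87ul;
        t[1] = (t[1] << 1) | (t[0] >> 63); /* "extr high, high, low, 63" */
        t[0] = (t[0] << 1) ^ carry;        /* "eor low, mask, low, lsl 1" */
    }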
+
+void aes128XtsContextCreate(Aes128XtsContext *out, const void *key0, const void *key1, bool is_encryptor) {
+    /* Initialize inner contexts. */
+    aes128ContextCreate(&out->aes_ctx, key0, is_encryptor);
+    aes128ContextCreate(&out->tweak_ctx, key1, true);
+    aes128XtsContextResetSector(out, 0, false);
+}
+
+void aes128XtsContextResetTweak(Aes128XtsContext *ctx, const void *tweak) {
+    /* Set and encrypt tweak; nothing is buffered. */
+    memcpy(ctx->tweak, tweak, sizeof(ctx->tweak));
+    aes128EncryptBlock(&ctx->tweak_ctx, ctx->tweak, ctx->tweak);
+    memset(ctx->buffer, 0, sizeof(ctx->buffer));
+    ctx->num_buffered = 0;
+}
+
+void aes128XtsContextResetSector(Aes128XtsContext *ctx, uint64_t sector, bool nintendo) {
+    /* Set and encrypt tweak; nothing is buffered. */
+    uint64_t *tweak_u64 = (uint64_t *)(&ctx->tweak);
+    if (nintendo) {
+        /* Nintendo uses a big endian tweak-from-sector, despite little endian gf multiplication. */
+        /* This is probably a Nintendo bug, but all of their content relies on it, so it cannot change now. */
+        tweak_u64[0] = 0;
+        tweak_u64[1] = __builtin_bswap64(sector);
+    } else {
+        /* Tweaks are normally little endian. */
+        tweak_u64[0] = sector;
+        tweak_u64[1] = 0;
+    }
+    aes128EncryptBlock(&ctx->tweak_ctx, ctx->tweak, ctx->tweak);
+    memset(ctx->buffer, 0, sizeof(ctx->buffer));
+    ctx->num_buffered = 0;
+}
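
Usage note (illustrative, not part of the patch): when a data unit is identified by a full 16-byte value rather than a 64-bit sector number, the raw tweak can be supplied directly; as with the sector path, it is encrypted with the tweak key internally. A sketch with a hypothetical identifier:

    u8 unit_id[AES_BLOCK_SIZE]; /* caller-provided 16-byte data unit identifier */
    aes128XtsContextResetTweak(&ctx, unit_id);
    aes128XtsEncrypt(&ctx, dst, src, 0x200);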
+
+static inline void _aes128XtsEncryptBlocks(Aes128XtsContext *ctx, u8 *dst_u8, const u8 *src_u8, size_t num_blocks) {
+    /* Preload all round keys + tweak into NEON registers. */
+    DECLARE_ROUND_KEY_VAR(0);
+    DECLARE_ROUND_KEY_VAR(1);
+    DECLARE_ROUND_KEY_VAR(2);
+    DECLARE_ROUND_KEY_VAR(3);
+    DECLARE_ROUND_KEY_VAR(4);
+    DECLARE_ROUND_KEY_VAR(5);
+    DECLARE_ROUND_KEY_VAR(6);
+    DECLARE_ROUND_KEY_VAR(7);
+    DECLARE_ROUND_KEY_VAR(8);
+    DECLARE_ROUND_KEY_VAR(9);
+    DECLARE_ROUND_KEY_VAR(10);
+    uint8x16_t tweak0 = vld1q_u8(ctx->tweak);
+    const uint64_t xor = 0x87ul;
+    uint64_t high, low, mask;
+
+    /* Process three blocks at a time, when possible. */
+    if (num_blocks >= 3) {
+        /* Multiply tweak twice. */
+        uint8x16_t tweak1 = _multiplyTweak(tweak0);
+        uint8x16_t tweak2 = _multiplyTweak(tweak1);
+
+        while (num_blocks >= 3) {
+            /* Save tweaks for xor usage. */
+            const uint8x16_t mask0 = tweak0;
+            const uint8x16_t mask1 = tweak1;
+            const uint8x16_t mask2 = tweak2;
+
+            /* Read blocks in, XOR with tweaks. */
+            uint8x16_t tmp0 = veorq_u8(mask0, vld1q_u8(src_u8));
+            src_u8 += AES_BLOCK_SIZE;
+            uint8x16_t tmp1 = veorq_u8(mask1, vld1q_u8(src_u8));
+            src_u8 += AES_BLOCK_SIZE;
+            uint8x16_t tmp2 = veorq_u8(mask2, vld1q_u8(src_u8));
+            src_u8 += AES_BLOCK_SIZE;
+
+            /* Actually do encryption, use optimized asm. */
+            /* Interleave GF mult calculations with AES ones, to mask latencies. */
+            __asm__ __volatile__ (
+                AES_ENC_ROUND(0, 0) "mov %[high], %[tweak2].d[1]\n"
+                AES_ENC_ROUND(0, 1) "mov %[low], %[tweak2].d[0]\n"
+                AES_ENC_ROUND(0, 2) "and %[mask], %[xor], %[high], asr 63\n"
+                AES_ENC_ROUND(1, 0) "extr %[high], %[high], %[low], 63\n"
+                AES_ENC_ROUND(1, 1) "eor %[low], %[mask], %[low], lsl 1\n"
+                AES_ENC_ROUND(1, 2) "mov %[tweak0].d[1], %[high]\n"
+                AES_ENC_ROUND(2, 0) "mov %[tweak0].d[0], %[low]\n"
+                AES_ENC_ROUND(2, 1) "and %[mask], %[xor], %[high], asr 63\n"
+                AES_ENC_ROUND(2, 2) "extr %[high], %[high], %[low], 63\n"
+                AES_ENC_ROUND(3, 0) "eor %[low], %[mask], %[low], lsl 1\n"
+                AES_ENC_ROUND(3, 1) "mov %[tweak1].d[1], %[high]\n"
+                AES_ENC_ROUND(3, 2) "mov %[tweak1].d[0], %[low]\n"
+                AES_ENC_ROUND(4, 0) "and %[mask], %[xor], %[high], asr 63\n"
+                AES_ENC_ROUND(4, 1) "extr %[high], %[high], %[low], 63\n"
+                AES_ENC_ROUND(4, 2) "eor %[low], %[mask], %[low], lsl 1\n"
+                AES_ENC_ROUND(5, 0) "mov %[tweak2].d[1], %[high]\n"
+                AES_ENC_ROUND(5, 1) "mov %[tweak2].d[0], %[low]\n"
+                AES_ENC_ROUND(5, 2)
+                AES_ENC_ROUND(6, 0) AES_ENC_ROUND(6, 1) AES_ENC_ROUND(6, 2)
+                AES_ENC_ROUND(7, 0) AES_ENC_ROUND(7, 1) AES_ENC_ROUND(7, 2)
+                AES_ENC_ROUND(8, 0) AES_ENC_ROUND(8, 1) AES_ENC_ROUND(8, 2)
+                AES_ENC_SECOND_LAST_ROUND(9, 0) AES_ENC_SECOND_LAST_ROUND(9, 1) AES_ENC_SECOND_LAST_ROUND(9, 2)
+                AES_ENC_LAST_ROUND(10, 0) AES_ENC_LAST_ROUND(10, 1) AES_ENC_LAST_ROUND(10, 2)
+                : AES_ENC_DEC_OUTPUT_THREE_BLOCKS(),
+                  AES_ENC_DEC_OUTPUT_THREE_TWEAKS(),
+                  XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK()
+                : XTS_INCREMENT_INPUT_XOR(),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(0),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(1),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(2),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(3),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(4),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(5),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(6),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(7),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(8),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(9),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(10)
+                : "cc"
+            );
+
+            /* XOR blocks. */
+            tmp0 = veorq_u8(mask0, tmp0);
+            tmp1 = veorq_u8(mask1, tmp1);
+            tmp2 = veorq_u8(mask2, tmp2);
+
+            /* Store to output. */
+            vst1q_u8(dst_u8, tmp0);
+            dst_u8 += AES_BLOCK_SIZE;
+            vst1q_u8(dst_u8, tmp1);
+            dst_u8 += AES_BLOCK_SIZE;
+            vst1q_u8(dst_u8, tmp2);
+            dst_u8 += AES_BLOCK_SIZE;
+
+            num_blocks -= 3;
+        }
+    }
+
+    while (num_blocks >= 1) {
+        /* Save tweak for xor usage. */
+        const uint8x16_t mask0 = tweak0;
+
+        /* Read block in, XOR with tweak. */
+        uint8x16_t tmp0 = veorq_u8(mask0, vld1q_u8(src_u8));
+        src_u8 += AES_BLOCK_SIZE;
+
+        /* Actually do encryption, use optimized asm. */
+        /* Interleave GF mult calculations with AES ones, to mask latencies. */
+        __asm__ __volatile__ (
+            AES_ENC_ROUND(0, 0) "mov %[high], %[tweak0].d[1]\n"
+            AES_ENC_ROUND(1, 0) "mov %[low], %[tweak0].d[0]\n"
+            AES_ENC_ROUND(2, 0) "and %[mask], %[xor], %[high], asr 63\n"
+            AES_ENC_ROUND(3, 0) "extr %[high], %[high], %[low], 63\n"
+            AES_ENC_ROUND(4, 0) "eor %[low], %[mask], %[low], lsl 1\n"
+            AES_ENC_ROUND(5, 0) "mov %[tweak0].d[1], %[high]\n"
+            AES_ENC_ROUND(6, 0) "mov %[tweak0].d[0], %[low]\n"
+            AES_ENC_ROUND(7, 0)
+            AES_ENC_ROUND(8, 0)
+            AES_ENC_SECOND_LAST_ROUND(9, 0)
+            AES_ENC_LAST_ROUND(10, 0)
+            : AES_ENC_DEC_OUTPUT_ONE_BLOCK(),
+              AES_ENC_DEC_OUTPUT_ONE_TWEAK(),
+              XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK()
+            : XTS_INCREMENT_INPUT_XOR(),
+              AES_ENC_DEC_INPUT_ROUND_KEY(0),
+              AES_ENC_DEC_INPUT_ROUND_KEY(1),
+              AES_ENC_DEC_INPUT_ROUND_KEY(2),
+              AES_ENC_DEC_INPUT_ROUND_KEY(3),
+              AES_ENC_DEC_INPUT_ROUND_KEY(4),
+              AES_ENC_DEC_INPUT_ROUND_KEY(5),
+              AES_ENC_DEC_INPUT_ROUND_KEY(6),
+              AES_ENC_DEC_INPUT_ROUND_KEY(7),
+              AES_ENC_DEC_INPUT_ROUND_KEY(8),
+              AES_ENC_DEC_INPUT_ROUND_KEY(9),
+              AES_ENC_DEC_INPUT_ROUND_KEY(10)
+            : "cc"
+        );
+
+        /* XOR blocks. */
+        tmp0 = veorq_u8(mask0, tmp0);
+
+        /* Store to output. */
+        vst1q_u8(dst_u8, tmp0);
+        dst_u8 += AES_BLOCK_SIZE;
+
+        num_blocks--;
+    }
+
+    vst1q_u8(ctx->tweak, tweak0);
+}
+
+size_t aes128XtsEncrypt(Aes128XtsContext *ctx, void *dst, const void *src, size_t size) {
+    CRYPT_FUNC_BODY(_aes128XtsEncryptBlocks);
+}
+
+static inline void _aes128XtsDecryptBlocks(Aes128XtsContext *ctx, u8 *dst_u8, const u8 *src_u8, size_t num_blocks) {
+    /* Preload all round keys + tweak into NEON registers. */
+    DECLARE_ROUND_KEY_VAR(0);
+    DECLARE_ROUND_KEY_VAR(1);
+    DECLARE_ROUND_KEY_VAR(2);
+    DECLARE_ROUND_KEY_VAR(3);
+    DECLARE_ROUND_KEY_VAR(4);
+    DECLARE_ROUND_KEY_VAR(5);
+    DECLARE_ROUND_KEY_VAR(6);
+    DECLARE_ROUND_KEY_VAR(7);
+    DECLARE_ROUND_KEY_VAR(8);
+    DECLARE_ROUND_KEY_VAR(9);
+    DECLARE_ROUND_KEY_VAR(10);
+    uint8x16_t tweak0 = vld1q_u8(ctx->tweak);
+    const uint64_t xor = 0x87ul;
+    uint64_t high, low, mask;
+
+    /* Process three blocks at a time, when possible. */
+    if (num_blocks >= 3) {
+        /* Multiply tweak twice. */
+        uint8x16_t tweak1 = _multiplyTweak(tweak0);
+        uint8x16_t tweak2 = _multiplyTweak(tweak1);
+
+        while (num_blocks >= 3) {
+            /* Save tweaks for xor usage. */
+            const uint8x16_t mask0 = tweak0;
+            const uint8x16_t mask1 = tweak1;
+            const uint8x16_t mask2 = tweak2;
+
+            /* Read blocks in, XOR with tweaks. */
+            uint8x16_t tmp0 = veorq_u8(mask0, vld1q_u8(src_u8));
+            src_u8 += AES_BLOCK_SIZE;
+            uint8x16_t tmp1 = veorq_u8(mask1, vld1q_u8(src_u8));
+            src_u8 += AES_BLOCK_SIZE;
+            uint8x16_t tmp2 = veorq_u8(mask2, vld1q_u8(src_u8));
+            src_u8 += AES_BLOCK_SIZE;
+
+            /* Actually do decryption, use optimized asm. */
+            /* Interleave GF mult calculations with AES ones, to mask latencies. */
+            __asm__ __volatile__ (
+                AES_DEC_ROUND(10, 0) "mov %[high], %[tweak2].d[1]\n"
+                AES_DEC_ROUND(10, 1) "mov %[low], %[tweak2].d[0]\n"
+                AES_DEC_ROUND(10, 2) "and %[mask], %[xor], %[high], asr 63\n"
+                AES_DEC_ROUND(9, 0) "extr %[high], %[high], %[low], 63\n"
+                AES_DEC_ROUND(9, 1) "eor %[low], %[mask], %[low], lsl 1\n"
+                AES_DEC_ROUND(9, 2) "mov %[tweak0].d[1], %[high]\n"
+                AES_DEC_ROUND(8, 0) "mov %[tweak0].d[0], %[low]\n"
+                AES_DEC_ROUND(8, 1) "and %[mask], %[xor], %[high], asr 63\n"
+                AES_DEC_ROUND(8, 2) "extr %[high], %[high], %[low], 63\n"
+                AES_DEC_ROUND(7, 0) "eor %[low], %[mask], %[low], lsl 1\n"
+                AES_DEC_ROUND(7, 1) "mov %[tweak1].d[1], %[high]\n"
+                AES_DEC_ROUND(7, 2) "mov %[tweak1].d[0], %[low]\n"
+                AES_DEC_ROUND(6, 0) "and %[mask], %[xor], %[high], asr 63\n"
+                AES_DEC_ROUND(6, 1) "extr %[high], %[high], %[low], 63\n"
+                AES_DEC_ROUND(6, 2) "eor %[low], %[mask], %[low], lsl 1\n"
+                AES_DEC_ROUND(5, 0) "mov %[tweak2].d[1], %[high]\n"
+                AES_DEC_ROUND(5, 1) "mov %[tweak2].d[0], %[low]\n"
+                AES_DEC_ROUND(5, 2)
+                AES_DEC_ROUND(4, 0) AES_DEC_ROUND(4, 1) AES_DEC_ROUND(4, 2)
+                AES_DEC_ROUND(3, 0) AES_DEC_ROUND(3, 1) AES_DEC_ROUND(3, 2)
+                AES_DEC_ROUND(2, 0) AES_DEC_ROUND(2, 1) AES_DEC_ROUND(2, 2)
+                AES_DEC_SECOND_LAST_ROUND(1, 0) AES_DEC_SECOND_LAST_ROUND(1, 1) AES_DEC_SECOND_LAST_ROUND(1, 2)
+                AES_DEC_LAST_ROUND(0, 0) AES_DEC_LAST_ROUND(0, 1) AES_DEC_LAST_ROUND(0, 2)
+                : AES_ENC_DEC_OUTPUT_THREE_BLOCKS(),
+                  AES_ENC_DEC_OUTPUT_THREE_TWEAKS(),
+                  XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK()
+                : XTS_INCREMENT_INPUT_XOR(),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(0),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(1),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(2),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(3),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(4),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(5),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(6),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(7),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(8),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(9),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(10)
+                : "cc"
+            );
+
+            /* XOR blocks. */
+            tmp0 = veorq_u8(mask0, tmp0);
+            tmp1 = veorq_u8(mask1, tmp1);
+            tmp2 = veorq_u8(mask2, tmp2);
+
+            /* Store to output. */
+            vst1q_u8(dst_u8, tmp0);
+            dst_u8 += AES_BLOCK_SIZE;
+            vst1q_u8(dst_u8, tmp1);
+            dst_u8 += AES_BLOCK_SIZE;
+            vst1q_u8(dst_u8, tmp2);
+            dst_u8 += AES_BLOCK_SIZE;
+
+            num_blocks -= 3;
+        }
+    }
+
+    while (num_blocks >= 1) {
+        /* Save tweak for xor usage. */
+        const uint8x16_t mask0 = tweak0;
+
+        /* Read block in, XOR with tweak. */
+        uint8x16_t tmp0 = veorq_u8(mask0, vld1q_u8(src_u8));
+        src_u8 += AES_BLOCK_SIZE;
+
+        /* Actually do decryption, use optimized asm. */
+        /* Interleave GF mult calculations with AES ones, to mask latencies. */
+        __asm__ __volatile__ (
+            AES_DEC_ROUND(10, 0) "mov %[high], %[tweak0].d[1]\n"
+            AES_DEC_ROUND(9, 0) "mov %[low], %[tweak0].d[0]\n"
+            AES_DEC_ROUND(8, 0) "and %[mask], %[xor], %[high], asr 63\n"
+            AES_DEC_ROUND(7, 0) "extr %[high], %[high], %[low], 63\n"
+            AES_DEC_ROUND(6, 0) "eor %[low], %[mask], %[low], lsl 1\n"
+            AES_DEC_ROUND(5, 0) "mov %[tweak0].d[1], %[high]\n"
+            AES_DEC_ROUND(4, 0) "mov %[tweak0].d[0], %[low]\n"
+            AES_DEC_ROUND(3, 0)
+            AES_DEC_ROUND(2, 0)
+            AES_DEC_SECOND_LAST_ROUND(1, 0)
+            AES_DEC_LAST_ROUND(0, 0)
+            : AES_ENC_DEC_OUTPUT_ONE_BLOCK(),
+              AES_ENC_DEC_OUTPUT_ONE_TWEAK(),
+              XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK()
+            : XTS_INCREMENT_INPUT_XOR(),
+              AES_ENC_DEC_INPUT_ROUND_KEY(0),
+              AES_ENC_DEC_INPUT_ROUND_KEY(1),
+              AES_ENC_DEC_INPUT_ROUND_KEY(2),
+              AES_ENC_DEC_INPUT_ROUND_KEY(3),
+              AES_ENC_DEC_INPUT_ROUND_KEY(4),
+              AES_ENC_DEC_INPUT_ROUND_KEY(5),
+              AES_ENC_DEC_INPUT_ROUND_KEY(6),
+              AES_ENC_DEC_INPUT_ROUND_KEY(7),
+              AES_ENC_DEC_INPUT_ROUND_KEY(8),
+              AES_ENC_DEC_INPUT_ROUND_KEY(9),
+              AES_ENC_DEC_INPUT_ROUND_KEY(10)
+            : "cc"
+        );
+
+        /* XOR blocks. */
+        tmp0 = veorq_u8(mask0, tmp0);
+
+        /* Store to output. */
+        vst1q_u8(dst_u8, tmp0);
+        dst_u8 += AES_BLOCK_SIZE;
+
+        num_blocks--;
+    }
+
+    vst1q_u8(ctx->tweak, tweak0);
+}
+
+size_t aes128XtsDecrypt(Aes128XtsContext *ctx, void *dst, const void *src, size_t size) {
+    CRYPT_FUNC_BODY(_aes128XtsDecryptBlocks);
+}
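
Wrapper semantics note (illustrative, not part of the patch): CRYPT_FUNC_BODY buffers any trailing partial block and only ever emits whole 16-byte blocks, so the return value (bytes written to dst) can be less than size. Buffered bytes are flushed once later calls complete the block; there is no ciphertext stealing, so the total input per tweak reset should be a multiple of AES_BLOCK_SIZE. For example (hypothetical buffers, assuming the context is reset to the same sector before each sequence):

    /* One call... */
    aes128XtsEncrypt(&ctx, out, in, 32);

    /* ...produces the same 32 bytes as two split calls. */
    size_t written = aes128XtsEncrypt(&ctx, out, in, 20);          /* writes 16, buffers 4 */
    written += aes128XtsEncrypt(&ctx, out + written, in + 20, 12); /* flushes the second block */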
+
+void aes192XtsContextCreate(Aes192XtsContext *out, const void *key0, const void *key1, bool is_encryptor) {
+    /* Initialize inner contexts. */
+    aes192ContextCreate(&out->aes_ctx, key0, is_encryptor);
+    aes192ContextCreate(&out->tweak_ctx, key1, true);
+    aes192XtsContextResetSector(out, 0, false);
+}
+
+void aes192XtsContextResetTweak(Aes192XtsContext *ctx, const void *tweak) {
+    /* Set and encrypt tweak; nothing is buffered. */
+    memcpy(ctx->tweak, tweak, sizeof(ctx->tweak));
+    aes192EncryptBlock(&ctx->tweak_ctx, ctx->tweak, ctx->tweak);
+    memset(ctx->buffer, 0, sizeof(ctx->buffer));
+    ctx->num_buffered = 0;
+}
+
+void aes192XtsContextResetSector(Aes192XtsContext *ctx, uint64_t sector, bool nintendo) {
+    /* Set and encrypt tweak; nothing is buffered. */
+    uint64_t *tweak_u64 = (uint64_t *)(&ctx->tweak);
+    if (nintendo) {
+        /* Nintendo uses a big endian tweak-from-sector, despite little endian gf multiplication. */
+        /* This is probably a Nintendo bug, but all of their content relies on it, so it cannot change now. */
+        tweak_u64[0] = 0;
+        tweak_u64[1] = __builtin_bswap64(sector);
+    } else {
+        /* Tweaks are normally little endian. */
+        tweak_u64[0] = sector;
+        tweak_u64[1] = 0;
+    }
+    aes192EncryptBlock(&ctx->tweak_ctx, ctx->tweak, ctx->tweak);
+    memset(ctx->buffer, 0, sizeof(ctx->buffer));
+    ctx->num_buffered = 0;
+}
+
+static inline void _aes192XtsEncryptBlocks(Aes192XtsContext *ctx, u8 *dst_u8, const u8 *src_u8, size_t num_blocks) {
+    /* Preload all round keys + tweak into NEON registers. */
+    DECLARE_ROUND_KEY_VAR(0);
+    DECLARE_ROUND_KEY_VAR(1);
+    DECLARE_ROUND_KEY_VAR(2);
+    DECLARE_ROUND_KEY_VAR(3);
+    DECLARE_ROUND_KEY_VAR(4);
+    DECLARE_ROUND_KEY_VAR(5);
+    DECLARE_ROUND_KEY_VAR(6);
+    DECLARE_ROUND_KEY_VAR(7);
+    DECLARE_ROUND_KEY_VAR(8);
+    DECLARE_ROUND_KEY_VAR(9);
+    DECLARE_ROUND_KEY_VAR(10);
+    DECLARE_ROUND_KEY_VAR(11);
+    DECLARE_ROUND_KEY_VAR(12);
+    uint8x16_t tweak0 = vld1q_u8(ctx->tweak);
+    const uint64_t xor = 0x87ul;
+    uint64_t high, low, mask;
+
+    /* Process three blocks at a time, when possible. */
+    if (num_blocks >= 3) {
+        /* Multiply tweak twice. */
+        uint8x16_t tweak1 = _multiplyTweak(tweak0);
+        uint8x16_t tweak2 = _multiplyTweak(tweak1);
+
+        while (num_blocks >= 3) {
+            /* Save tweaks for xor usage. */
+            const uint8x16_t mask0 = tweak0;
+            const uint8x16_t mask1 = tweak1;
+            const uint8x16_t mask2 = tweak2;
+
+            /* Read blocks in, XOR with tweaks. */
+            uint8x16_t tmp0 = veorq_u8(mask0, vld1q_u8(src_u8));
+            src_u8 += AES_BLOCK_SIZE;
+            uint8x16_t tmp1 = veorq_u8(mask1, vld1q_u8(src_u8));
+            src_u8 += AES_BLOCK_SIZE;
+            uint8x16_t tmp2 = veorq_u8(mask2, vld1q_u8(src_u8));
+            src_u8 += AES_BLOCK_SIZE;
+
+            /* Actually do encryption, use optimized asm. */
+            /* Interleave GF mult calculations with AES ones, to mask latencies. */
+            __asm__ __volatile__ (
+                AES_ENC_ROUND(0, 0) "mov %[high], %[tweak2].d[1]\n"
+                AES_ENC_ROUND(0, 1) "mov %[low], %[tweak2].d[0]\n"
+                AES_ENC_ROUND(0, 2) "and %[mask], %[xor], %[high], asr 63\n"
+                AES_ENC_ROUND(1, 0) "extr %[high], %[high], %[low], 63\n"
+                AES_ENC_ROUND(1, 1) "eor %[low], %[mask], %[low], lsl 1\n"
+                AES_ENC_ROUND(1, 2) "mov %[tweak0].d[1], %[high]\n"
+                AES_ENC_ROUND(2, 0) "mov %[tweak0].d[0], %[low]\n"
+                AES_ENC_ROUND(2, 1) "and %[mask], %[xor], %[high], asr 63\n"
+                AES_ENC_ROUND(2, 2) "extr %[high], %[high], %[low], 63\n"
+                AES_ENC_ROUND(3, 0) "eor %[low], %[mask], %[low], lsl 1\n"
+                AES_ENC_ROUND(3, 1) "mov %[tweak1].d[1], %[high]\n"
+                AES_ENC_ROUND(3, 2) "mov %[tweak1].d[0], %[low]\n"
+                AES_ENC_ROUND(4, 0) "and %[mask], %[xor], %[high], asr 63\n"
+                AES_ENC_ROUND(4, 1) "extr %[high], %[high], %[low], 63\n"
+                AES_ENC_ROUND(4, 2) "eor %[low], %[mask], %[low], lsl 1\n"
+                AES_ENC_ROUND(5, 0) "mov %[tweak2].d[1], %[high]\n"
+                AES_ENC_ROUND(5, 1) "mov %[tweak2].d[0], %[low]\n"
+                AES_ENC_ROUND(5, 2)
+                AES_ENC_ROUND(6, 0) AES_ENC_ROUND(6, 1) AES_ENC_ROUND(6, 2)
+                AES_ENC_ROUND(7, 0) AES_ENC_ROUND(7, 1) AES_ENC_ROUND(7, 2)
+                AES_ENC_ROUND(8, 0) AES_ENC_ROUND(8, 1) AES_ENC_ROUND(8, 2)
+                AES_ENC_ROUND(9, 0) AES_ENC_ROUND(9, 1) AES_ENC_ROUND(9, 2)
+                AES_ENC_ROUND(10, 0) AES_ENC_ROUND(10, 1) AES_ENC_ROUND(10, 2)
+                AES_ENC_SECOND_LAST_ROUND(11, 0) AES_ENC_SECOND_LAST_ROUND(11, 1) AES_ENC_SECOND_LAST_ROUND(11, 2)
+                AES_ENC_LAST_ROUND(12, 0) AES_ENC_LAST_ROUND(12, 1) AES_ENC_LAST_ROUND(12, 2)
+                : AES_ENC_DEC_OUTPUT_THREE_BLOCKS(),
+                  AES_ENC_DEC_OUTPUT_THREE_TWEAKS(),
+                  XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK()
+                : XTS_INCREMENT_INPUT_XOR(),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(0),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(1),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(2),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(3),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(4),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(5),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(6),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(7),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(8),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(9),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(10),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(11),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(12)
+                : "cc"
+            );
+
+            /* XOR blocks. */
+            tmp0 = veorq_u8(mask0, tmp0);
+            tmp1 = veorq_u8(mask1, tmp1);
+            tmp2 = veorq_u8(mask2, tmp2);
+
+            /* Store to output. */
+            vst1q_u8(dst_u8, tmp0);
+            dst_u8 += AES_BLOCK_SIZE;
+            vst1q_u8(dst_u8, tmp1);
+            dst_u8 += AES_BLOCK_SIZE;
+            vst1q_u8(dst_u8, tmp2);
+            dst_u8 += AES_BLOCK_SIZE;
+
+            num_blocks -= 3;
+        }
+    }
+
+    while (num_blocks >= 1) {
+        /* Save tweak for xor usage. */
+        const uint8x16_t mask0 = tweak0;
+
+        /* Read block in, XOR with tweak. */
+        uint8x16_t tmp0 = veorq_u8(mask0, vld1q_u8(src_u8));
+        src_u8 += AES_BLOCK_SIZE;
+
+        /* Actually do encryption, use optimized asm. */
+        /* Interleave GF mult calculations with AES ones, to mask latencies. */
+        __asm__ __volatile__ (
+            AES_ENC_ROUND(0, 0) "mov %[high], %[tweak0].d[1]\n"
+            AES_ENC_ROUND(1, 0) "mov %[low], %[tweak0].d[0]\n"
+            AES_ENC_ROUND(2, 0) "and %[mask], %[xor], %[high], asr 63\n"
+            AES_ENC_ROUND(3, 0) "extr %[high], %[high], %[low], 63\n"
+            AES_ENC_ROUND(4, 0) "eor %[low], %[mask], %[low], lsl 1\n"
+            AES_ENC_ROUND(5, 0) "mov %[tweak0].d[1], %[high]\n"
+            AES_ENC_ROUND(6, 0) "mov %[tweak0].d[0], %[low]\n"
+            AES_ENC_ROUND(7, 0)
+            AES_ENC_ROUND(8, 0)
+            AES_ENC_ROUND(9, 0)
+            AES_ENC_ROUND(10, 0)
+            AES_ENC_SECOND_LAST_ROUND(11, 0)
+            AES_ENC_LAST_ROUND(12, 0)
+            : AES_ENC_DEC_OUTPUT_ONE_BLOCK(),
+              AES_ENC_DEC_OUTPUT_ONE_TWEAK(),
+              XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK()
+            : XTS_INCREMENT_INPUT_XOR(),
+              AES_ENC_DEC_INPUT_ROUND_KEY(0),
+              AES_ENC_DEC_INPUT_ROUND_KEY(1),
+              AES_ENC_DEC_INPUT_ROUND_KEY(2),
+              AES_ENC_DEC_INPUT_ROUND_KEY(3),
+              AES_ENC_DEC_INPUT_ROUND_KEY(4),
+              AES_ENC_DEC_INPUT_ROUND_KEY(5),
+              AES_ENC_DEC_INPUT_ROUND_KEY(6),
+              AES_ENC_DEC_INPUT_ROUND_KEY(7),
+              AES_ENC_DEC_INPUT_ROUND_KEY(8),
+              AES_ENC_DEC_INPUT_ROUND_KEY(9),
+              AES_ENC_DEC_INPUT_ROUND_KEY(10),
+              AES_ENC_DEC_INPUT_ROUND_KEY(11),
+              AES_ENC_DEC_INPUT_ROUND_KEY(12)
+            : "cc"
+        );
+
+        /* XOR blocks. */
+        tmp0 = veorq_u8(mask0, tmp0);
+
+        /* Store to output. */
+        vst1q_u8(dst_u8, tmp0);
+        dst_u8 += AES_BLOCK_SIZE;
+
+        num_blocks--;
+    }
+
+    vst1q_u8(ctx->tweak, tweak0);
+}
+
+size_t aes192XtsEncrypt(Aes192XtsContext *ctx, void *dst, const void *src, size_t size) {
+    CRYPT_FUNC_BODY(_aes192XtsEncryptBlocks);
+}
+
+static inline void _aes192XtsDecryptBlocks(Aes192XtsContext *ctx, u8 *dst_u8, const u8 *src_u8, size_t num_blocks) {
+    /* Preload all round keys + tweak into NEON registers. */
+    DECLARE_ROUND_KEY_VAR(0);
+    DECLARE_ROUND_KEY_VAR(1);
+    DECLARE_ROUND_KEY_VAR(2);
+    DECLARE_ROUND_KEY_VAR(3);
+    DECLARE_ROUND_KEY_VAR(4);
+    DECLARE_ROUND_KEY_VAR(5);
+    DECLARE_ROUND_KEY_VAR(6);
+    DECLARE_ROUND_KEY_VAR(7);
+    DECLARE_ROUND_KEY_VAR(8);
+    DECLARE_ROUND_KEY_VAR(9);
+    DECLARE_ROUND_KEY_VAR(10);
+    DECLARE_ROUND_KEY_VAR(11);
+    DECLARE_ROUND_KEY_VAR(12);
+    uint8x16_t tweak0 = vld1q_u8(ctx->tweak);
+    const uint64_t xor = 0x87ul;
+    uint64_t high, low, mask;
+
+    /* Process three blocks at a time, when possible. */
+    if (num_blocks >= 3) {
+        /* Multiply tweak twice. */
+        uint8x16_t tweak1 = _multiplyTweak(tweak0);
+        uint8x16_t tweak2 = _multiplyTweak(tweak1);
+
+        while (num_blocks >= 3) {
+            /* Save tweaks for xor usage. */
+            const uint8x16_t mask0 = tweak0;
+            const uint8x16_t mask1 = tweak1;
+            const uint8x16_t mask2 = tweak2;
+
+            /* Read blocks in, XOR with tweaks. */
+            uint8x16_t tmp0 = veorq_u8(mask0, vld1q_u8(src_u8));
+            src_u8 += AES_BLOCK_SIZE;
+            uint8x16_t tmp1 = veorq_u8(mask1, vld1q_u8(src_u8));
+            src_u8 += AES_BLOCK_SIZE;
+            uint8x16_t tmp2 = veorq_u8(mask2, vld1q_u8(src_u8));
+            src_u8 += AES_BLOCK_SIZE;
+
+            /* Actually do decryption, use optimized asm. */
+            /* Interleave GF mult calculations with AES ones, to mask latencies. */
+            __asm__ __volatile__ (
+                AES_DEC_ROUND(12, 0) "mov %[high], %[tweak2].d[1]\n"
+                AES_DEC_ROUND(12, 1) "mov %[low], %[tweak2].d[0]\n"
+                AES_DEC_ROUND(12, 2) "and %[mask], %[xor], %[high], asr 63\n"
+                AES_DEC_ROUND(11, 0) "extr %[high], %[high], %[low], 63\n"
+                AES_DEC_ROUND(11, 1) "eor %[low], %[mask], %[low], lsl 1\n"
+                AES_DEC_ROUND(11, 2) "mov %[tweak0].d[1], %[high]\n"
+                AES_DEC_ROUND(10, 0) "mov %[tweak0].d[0], %[low]\n"
+                AES_DEC_ROUND(10, 1) "and %[mask], %[xor], %[high], asr 63\n"
+                AES_DEC_ROUND(10, 2) "extr %[high], %[high], %[low], 63\n"
+                AES_DEC_ROUND(9, 0) "eor %[low], %[mask], %[low], lsl 1\n"
+                AES_DEC_ROUND(9, 1) "mov %[tweak1].d[1], %[high]\n"
+                AES_DEC_ROUND(9, 2) "mov %[tweak1].d[0], %[low]\n"
+                AES_DEC_ROUND(8, 0) "and %[mask], %[xor], %[high], asr 63\n"
+                AES_DEC_ROUND(8, 1) "extr %[high], %[high], %[low], 63\n"
+                AES_DEC_ROUND(8, 2) "eor %[low], %[mask], %[low], lsl 1\n"
+                AES_DEC_ROUND(7, 0) "mov %[tweak2].d[1], %[high]\n"
+                AES_DEC_ROUND(7, 1) "mov %[tweak2].d[0], %[low]\n"
+                AES_DEC_ROUND(7, 2)
+                AES_DEC_ROUND(6, 0) AES_DEC_ROUND(6, 1) AES_DEC_ROUND(6, 2)
+                AES_DEC_ROUND(5, 0) AES_DEC_ROUND(5, 1) AES_DEC_ROUND(5, 2)
+                AES_DEC_ROUND(4, 0) AES_DEC_ROUND(4, 1) AES_DEC_ROUND(4, 2)
+                AES_DEC_ROUND(3, 0) AES_DEC_ROUND(3, 1) AES_DEC_ROUND(3, 2)
+                AES_DEC_ROUND(2, 0) AES_DEC_ROUND(2, 1) AES_DEC_ROUND(2, 2)
+                AES_DEC_SECOND_LAST_ROUND(1, 0) AES_DEC_SECOND_LAST_ROUND(1, 1) AES_DEC_SECOND_LAST_ROUND(1, 2)
+                AES_DEC_LAST_ROUND(0, 0) AES_DEC_LAST_ROUND(0, 1) AES_DEC_LAST_ROUND(0, 2)
+                : AES_ENC_DEC_OUTPUT_THREE_BLOCKS(),
+                  AES_ENC_DEC_OUTPUT_THREE_TWEAKS(),
+                  XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK()
+                : XTS_INCREMENT_INPUT_XOR(),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(0),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(1),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(2),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(3),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(4),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(5),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(6),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(7),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(8),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(9),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(10),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(11),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(12)
+                : "cc"
+            );
+
+            /* XOR blocks. */
+            tmp0 = veorq_u8(mask0, tmp0);
+            tmp1 = veorq_u8(mask1, tmp1);
+            tmp2 = veorq_u8(mask2, tmp2);
+
+            /* Store to output. */
+            vst1q_u8(dst_u8, tmp0);
+            dst_u8 += AES_BLOCK_SIZE;
+            vst1q_u8(dst_u8, tmp1);
+            dst_u8 += AES_BLOCK_SIZE;
+            vst1q_u8(dst_u8, tmp2);
+            dst_u8 += AES_BLOCK_SIZE;
+
+            num_blocks -= 3;
+        }
+    }
+
+    while (num_blocks >= 1) {
+        /* Save tweak for xor usage. */
+        const uint8x16_t mask0 = tweak0;
+
+        /* Read block in, XOR with tweak. */
+        uint8x16_t tmp0 = veorq_u8(mask0, vld1q_u8(src_u8));
+        src_u8 += AES_BLOCK_SIZE;
+
+        /* Actually do decryption, use optimized asm. */
+        /* Interleave GF mult calculations with AES ones, to mask latencies. */
+        __asm__ __volatile__ (
+            AES_DEC_ROUND(12, 0) "mov %[high], %[tweak0].d[1]\n"
+            AES_DEC_ROUND(11, 0) "mov %[low], %[tweak0].d[0]\n"
+            AES_DEC_ROUND(10, 0) "and %[mask], %[xor], %[high], asr 63\n"
+            AES_DEC_ROUND(9, 0) "extr %[high], %[high], %[low], 63\n"
+            AES_DEC_ROUND(8, 0) "eor %[low], %[mask], %[low], lsl 1\n"
+            AES_DEC_ROUND(7, 0) "mov %[tweak0].d[1], %[high]\n"
+            AES_DEC_ROUND(6, 0) "mov %[tweak0].d[0], %[low]\n"
+            AES_DEC_ROUND(5, 0)
+            AES_DEC_ROUND(4, 0)
+            AES_DEC_ROUND(3, 0)
+            AES_DEC_ROUND(2, 0)
+            AES_DEC_SECOND_LAST_ROUND(1, 0)
+            AES_DEC_LAST_ROUND(0, 0)
+            : AES_ENC_DEC_OUTPUT_ONE_BLOCK(),
+              AES_ENC_DEC_OUTPUT_ONE_TWEAK(),
+              XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK()
+            : XTS_INCREMENT_INPUT_XOR(),
+              AES_ENC_DEC_INPUT_ROUND_KEY(0),
+              AES_ENC_DEC_INPUT_ROUND_KEY(1),
+              AES_ENC_DEC_INPUT_ROUND_KEY(2),
+              AES_ENC_DEC_INPUT_ROUND_KEY(3),
+              AES_ENC_DEC_INPUT_ROUND_KEY(4),
+              AES_ENC_DEC_INPUT_ROUND_KEY(5),
+              AES_ENC_DEC_INPUT_ROUND_KEY(6),
+              AES_ENC_DEC_INPUT_ROUND_KEY(7),
+              AES_ENC_DEC_INPUT_ROUND_KEY(8),
+              AES_ENC_DEC_INPUT_ROUND_KEY(9),
+              AES_ENC_DEC_INPUT_ROUND_KEY(10),
+              AES_ENC_DEC_INPUT_ROUND_KEY(11),
+              AES_ENC_DEC_INPUT_ROUND_KEY(12)
+            : "cc"
+        );
+
+        /* XOR blocks. */
+        tmp0 = veorq_u8(mask0, tmp0);
+
+        /* Store to output. */
+        vst1q_u8(dst_u8, tmp0);
+        dst_u8 += AES_BLOCK_SIZE;
+
+        num_blocks--;
+    }
+
+    vst1q_u8(ctx->tweak, tweak0);
+}
+
+size_t aes192XtsDecrypt(Aes192XtsContext *ctx, void *dst, const void *src, size_t size) {
+    CRYPT_FUNC_BODY(_aes192XtsDecryptBlocks);
+}
+
+void aes256XtsContextCreate(Aes256XtsContext *out, const void *key0, const void *key1, bool is_encryptor) {
+    /* Initialize inner contexts. */
+    aes256ContextCreate(&out->aes_ctx, key0, is_encryptor);
+    aes256ContextCreate(&out->tweak_ctx, key1, true);
+    aes256XtsContextResetSector(out, 0, false);
+}
+
+void aes256XtsContextResetTweak(Aes256XtsContext *ctx, const void *tweak) {
+    /* Set and encrypt tweak; nothing is buffered. */
+    memcpy(ctx->tweak, tweak, sizeof(ctx->tweak));
+    aes256EncryptBlock(&ctx->tweak_ctx, ctx->tweak, ctx->tweak);
+    memset(ctx->buffer, 0, sizeof(ctx->buffer));
+    ctx->num_buffered = 0;
+}
+
+void aes256XtsContextResetSector(Aes256XtsContext *ctx, uint64_t sector, bool nintendo) {
+    /* Set and encrypt tweak; nothing is buffered. */
+    uint64_t *tweak_u64 = (uint64_t *)(&ctx->tweak);
+    if (nintendo) {
+        /* Nintendo uses a big endian tweak-from-sector, despite little endian gf multiplication. */
+        /* This is probably a Nintendo bug, but all of their content relies on it, so it cannot change now. */
+        tweak_u64[0] = 0;
+        tweak_u64[1] = __builtin_bswap64(sector);
+    } else {
+        /* Tweaks are normally little endian. */
+        tweak_u64[0] = sector;
+        tweak_u64[1] = 0;
+    }
+    aes256EncryptBlock(&ctx->tweak_ctx, ctx->tweak, ctx->tweak);
+    memset(ctx->buffer, 0, sizeof(ctx->buffer));
+    ctx->num_buffered = 0;
+}
+
+static inline void _aes256XtsEncryptBlocks(Aes256XtsContext *ctx, u8 *dst_u8, const u8 *src_u8, size_t num_blocks) {
+    /* Preload all round keys + tweak into NEON registers. */
+    DECLARE_ROUND_KEY_VAR(0);
+    DECLARE_ROUND_KEY_VAR(1);
+    DECLARE_ROUND_KEY_VAR(2);
+    DECLARE_ROUND_KEY_VAR(3);
+    DECLARE_ROUND_KEY_VAR(4);
+    DECLARE_ROUND_KEY_VAR(5);
+    DECLARE_ROUND_KEY_VAR(6);
+    DECLARE_ROUND_KEY_VAR(7);
+    DECLARE_ROUND_KEY_VAR(8);
+    DECLARE_ROUND_KEY_VAR(9);
+    DECLARE_ROUND_KEY_VAR(10);
+    DECLARE_ROUND_KEY_VAR(11);
+    DECLARE_ROUND_KEY_VAR(12);
+    DECLARE_ROUND_KEY_VAR(13);
+    DECLARE_ROUND_KEY_VAR(14);
+    uint8x16_t tweak0 = vld1q_u8(ctx->tweak);
+    const uint64_t xor = 0x87ul;
+    uint64_t high, low, mask;
+
+    /* Process three blocks at a time, when possible. */
+    if (num_blocks >= 3) {
+        /* Multiply tweak twice. */
+        uint8x16_t tweak1 = _multiplyTweak(tweak0);
+        uint8x16_t tweak2 = _multiplyTweak(tweak1);
+
+        while (num_blocks >= 3) {
+            /* Save tweaks for xor usage. */
+            const uint8x16_t mask0 = tweak0;
+            const uint8x16_t mask1 = tweak1;
+            const uint8x16_t mask2 = tweak2;
+
+            /* Read blocks in, XOR with tweaks. */
+            uint8x16_t tmp0 = veorq_u8(mask0, vld1q_u8(src_u8));
+            src_u8 += AES_BLOCK_SIZE;
+            uint8x16_t tmp1 = veorq_u8(mask1, vld1q_u8(src_u8));
+            src_u8 += AES_BLOCK_SIZE;
+            uint8x16_t tmp2 = veorq_u8(mask2, vld1q_u8(src_u8));
+            src_u8 += AES_BLOCK_SIZE;
+
+            /* Actually do encryption, use optimized asm. */
+            /* Interleave GF mult calculations with AES ones, to mask latencies. */
+            /* Note: ASM here cannot use a constant xor reg due to operand limitations. */
+            __asm__ __volatile__ (
+                AES_ENC_ROUND(0, 0) "mov %[high], %[tweak2].d[1]\n"
+                AES_ENC_ROUND(0, 1) "mov %[low], %[tweak2].d[0]\n"
+                AES_ENC_ROUND(0, 2) "mov %[mask], #0x87\n"
+                AES_ENC_ROUND(1, 0) "and %[mask], %[mask], %[high], asr 63\n"
+                AES_ENC_ROUND(1, 1) "extr %[high], %[high], %[low], 63\n"
+                AES_ENC_ROUND(1, 2) "eor %[low], %[mask], %[low], lsl 1\n"
+                AES_ENC_ROUND(2, 0) "mov %[tweak0].d[1], %[high]\n"
+                AES_ENC_ROUND(2, 1) "mov %[tweak0].d[0], %[low]\n"
+                AES_ENC_ROUND(2, 2) "mov %[mask], #0x87\n"
+                AES_ENC_ROUND(3, 0) "and %[mask], %[mask], %[high], asr 63\n"
+                AES_ENC_ROUND(3, 1) "extr %[high], %[high], %[low], 63\n"
+                AES_ENC_ROUND(3, 2) "eor %[low], %[mask], %[low], lsl 1\n"
+                AES_ENC_ROUND(4, 0) "mov %[tweak1].d[1], %[high]\n"
+                AES_ENC_ROUND(4, 1) "mov %[tweak1].d[0], %[low]\n"
+                AES_ENC_ROUND(4, 2) "mov %[mask], #0x87\n"
+                AES_ENC_ROUND(5, 0) "and %[mask], %[mask], %[high], asr 63\n"
+                AES_ENC_ROUND(5, 1) "extr %[high], %[high], %[low], 63\n"
+                AES_ENC_ROUND(5, 2) "eor %[low], %[mask], %[low], lsl 1\n"
+                AES_ENC_ROUND(6, 0) "mov %[tweak2].d[1], %[high]\n"
+                AES_ENC_ROUND(6, 1) "mov %[tweak2].d[0], %[low]\n"
+                AES_ENC_ROUND(6, 2)
+                AES_ENC_ROUND(7, 0) AES_ENC_ROUND(7, 1) AES_ENC_ROUND(7, 2)
+                AES_ENC_ROUND(8, 0) AES_ENC_ROUND(8, 1) AES_ENC_ROUND(8, 2)
+                AES_ENC_ROUND(9, 0) AES_ENC_ROUND(9, 1) AES_ENC_ROUND(9, 2)
+                AES_ENC_ROUND(10, 0) AES_ENC_ROUND(10, 1) AES_ENC_ROUND(10, 2)
+                AES_ENC_ROUND(11, 0) AES_ENC_ROUND(11, 1) AES_ENC_ROUND(11, 2)
+                AES_ENC_ROUND(12, 0) AES_ENC_ROUND(12, 1) AES_ENC_ROUND(12, 2)
+                AES_ENC_SECOND_LAST_ROUND(13, 0) AES_ENC_SECOND_LAST_ROUND(13, 1) AES_ENC_SECOND_LAST_ROUND(13, 2)
+                AES_ENC_LAST_ROUND(14, 0) AES_ENC_LAST_ROUND(14, 1) AES_ENC_LAST_ROUND(14, 2)
+                : AES_ENC_DEC_OUTPUT_THREE_BLOCKS(),
+                  AES_ENC_DEC_OUTPUT_THREE_TWEAKS(),
+                  XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK()
+                : AES_ENC_DEC_INPUT_ROUND_KEY(0),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(1),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(2),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(3),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(4),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(5),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(6),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(7),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(8),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(9),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(10),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(11),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(12),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(13),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(14)
+                : "cc"
+            );
+
+            /* XOR blocks. */
+            tmp0 = veorq_u8(mask0, tmp0);
+            tmp1 = veorq_u8(mask1, tmp1);
+            tmp2 = veorq_u8(mask2, tmp2);
+
+            /* Store to output. */
+            vst1q_u8(dst_u8, tmp0);
+            dst_u8 += AES_BLOCK_SIZE;
+            vst1q_u8(dst_u8, tmp1);
+            dst_u8 += AES_BLOCK_SIZE;
+            vst1q_u8(dst_u8, tmp2);
+            dst_u8 += AES_BLOCK_SIZE;
+
+            num_blocks -= 3;
+        }
+    }
+
+    while (num_blocks >= 1) {
+        /* Save tweak for xor usage. */
+        const uint8x16_t mask0 = tweak0;
+
+        /* Read block in, XOR with tweak. */
+        uint8x16_t tmp0 = veorq_u8(mask0, vld1q_u8(src_u8));
+        src_u8 += AES_BLOCK_SIZE;
+
+        /* Actually do encryption, use optimized asm. */
+        /* Interleave GF mult calculations with AES ones, to mask latencies. */
+        __asm__ __volatile__ (
+            AES_ENC_ROUND(0, 0) "mov %[high], %[tweak0].d[1]\n"
+            AES_ENC_ROUND(1, 0) "mov %[low], %[tweak0].d[0]\n"
+            AES_ENC_ROUND(2, 0) "and %[mask], %[xor], %[high], asr 63\n"
+            AES_ENC_ROUND(3, 0) "extr %[high], %[high], %[low], 63\n"
+            AES_ENC_ROUND(4, 0) "eor %[low], %[mask], %[low], lsl 1\n"
+            AES_ENC_ROUND(5, 0) "mov %[tweak0].d[1], %[high]\n"
+            AES_ENC_ROUND(6, 0) "mov %[tweak0].d[0], %[low]\n"
+            AES_ENC_ROUND(7, 0)
+            AES_ENC_ROUND(8, 0)
+            AES_ENC_ROUND(9, 0)
+            AES_ENC_ROUND(10, 0)
+            AES_ENC_ROUND(11, 0)
+            AES_ENC_ROUND(12, 0)
+            AES_ENC_SECOND_LAST_ROUND(13, 0)
+            AES_ENC_LAST_ROUND(14, 0)
+            : AES_ENC_DEC_OUTPUT_ONE_BLOCK(),
+              AES_ENC_DEC_OUTPUT_ONE_TWEAK(),
+              XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK()
+            : XTS_INCREMENT_INPUT_XOR(),
+              AES_ENC_DEC_INPUT_ROUND_KEY(0),
+              AES_ENC_DEC_INPUT_ROUND_KEY(1),
+              AES_ENC_DEC_INPUT_ROUND_KEY(2),
+              AES_ENC_DEC_INPUT_ROUND_KEY(3),
+              AES_ENC_DEC_INPUT_ROUND_KEY(4),
+              AES_ENC_DEC_INPUT_ROUND_KEY(5),
+              AES_ENC_DEC_INPUT_ROUND_KEY(6),
+              AES_ENC_DEC_INPUT_ROUND_KEY(7),
+              AES_ENC_DEC_INPUT_ROUND_KEY(8),
+              AES_ENC_DEC_INPUT_ROUND_KEY(9),
+              AES_ENC_DEC_INPUT_ROUND_KEY(10),
+              AES_ENC_DEC_INPUT_ROUND_KEY(11),
+              AES_ENC_DEC_INPUT_ROUND_KEY(12),
+              AES_ENC_DEC_INPUT_ROUND_KEY(13),
+              AES_ENC_DEC_INPUT_ROUND_KEY(14)
+            : "cc"
+        );
+
+        /* XOR blocks. */
+        tmp0 = veorq_u8(mask0, tmp0);
+
+        /* Store to output. */
+        vst1q_u8(dst_u8, tmp0);
+        dst_u8 += AES_BLOCK_SIZE;
+
+        num_blocks--;
+    }
+
+    vst1q_u8(ctx->tweak, tweak0);
+}
+
+size_t aes256XtsEncrypt(Aes256XtsContext *ctx, void *dst, const void *src, size_t size) {
+    CRYPT_FUNC_BODY(_aes256XtsEncryptBlocks);
+}
+
+static inline void _aes256XtsDecryptBlocks(Aes256XtsContext *ctx, u8 *dst_u8, const u8 *src_u8, size_t num_blocks) {
+    /* Preload all round keys + tweak into NEON registers. */
+    DECLARE_ROUND_KEY_VAR(0);
+    DECLARE_ROUND_KEY_VAR(1);
+    DECLARE_ROUND_KEY_VAR(2);
+    DECLARE_ROUND_KEY_VAR(3);
+    DECLARE_ROUND_KEY_VAR(4);
+    DECLARE_ROUND_KEY_VAR(5);
+    DECLARE_ROUND_KEY_VAR(6);
+    DECLARE_ROUND_KEY_VAR(7);
+    DECLARE_ROUND_KEY_VAR(8);
+    DECLARE_ROUND_KEY_VAR(9);
+    DECLARE_ROUND_KEY_VAR(10);
+    DECLARE_ROUND_KEY_VAR(11);
+    DECLARE_ROUND_KEY_VAR(12);
+    DECLARE_ROUND_KEY_VAR(13);
+    DECLARE_ROUND_KEY_VAR(14);
+    uint8x16_t tweak0 = vld1q_u8(ctx->tweak);
+    const uint64_t xor = 0x87ul;
+    uint64_t high, low, mask;
+
+    /* Process three blocks at a time, when possible. */
+    if (num_blocks >= 3) {
+        /* Multiply tweak twice. */
+        uint8x16_t tweak1 = _multiplyTweak(tweak0);
+        uint8x16_t tweak2 = _multiplyTweak(tweak1);
+
+        while (num_blocks >= 3) {
+            /* Save tweaks for xor usage. */
+            const uint8x16_t mask0 = tweak0;
+            const uint8x16_t mask1 = tweak1;
+            const uint8x16_t mask2 = tweak2;
+
+            /* Read blocks in, XOR with tweaks. */
+            uint8x16_t tmp0 = veorq_u8(mask0, vld1q_u8(src_u8));
+            src_u8 += AES_BLOCK_SIZE;
+            uint8x16_t tmp1 = veorq_u8(mask1, vld1q_u8(src_u8));
+            src_u8 += AES_BLOCK_SIZE;
+            uint8x16_t tmp2 = veorq_u8(mask2, vld1q_u8(src_u8));
+            src_u8 += AES_BLOCK_SIZE;
+
+            /* Actually do decryption, use optimized asm. */
+            /* Interleave GF mult calculations with AES ones, to mask latencies. */
+            /* Note: ASM here cannot use a constant xor reg due to operand limitations. */
+            __asm__ __volatile__ (
+                AES_DEC_ROUND(14, 0) "mov %[high], %[tweak2].d[1]\n"
+                AES_DEC_ROUND(14, 1) "mov %[low], %[tweak2].d[0]\n"
+                AES_DEC_ROUND(14, 2) "mov %[mask], #0x87\n"
+                AES_DEC_ROUND(13, 0) "and %[mask], %[mask], %[high], asr 63\n"
+                AES_DEC_ROUND(13, 1) "extr %[high], %[high], %[low], 63\n"
+                AES_DEC_ROUND(13, 2) "eor %[low], %[mask], %[low], lsl 1\n"
+                AES_DEC_ROUND(12, 0) "mov %[tweak0].d[1], %[high]\n"
+                AES_DEC_ROUND(12, 1) "mov %[tweak0].d[0], %[low]\n"
+                AES_DEC_ROUND(12, 2) "mov %[mask], #0x87\n"
+                AES_DEC_ROUND(11, 0) "and %[mask], %[mask], %[high], asr 63\n"
+                AES_DEC_ROUND(11, 1) "extr %[high], %[high], %[low], 63\n"
+                AES_DEC_ROUND(11, 2) "eor %[low], %[mask], %[low], lsl 1\n"
+                AES_DEC_ROUND(10, 0) "mov %[tweak1].d[1], %[high]\n"
+                AES_DEC_ROUND(10, 1) "mov %[tweak1].d[0], %[low]\n"
+                AES_DEC_ROUND(10, 2) "mov %[mask], #0x87\n"
+                AES_DEC_ROUND(9, 0) "and %[mask], %[mask], %[high], asr 63\n"
+                AES_DEC_ROUND(9, 1) "extr %[high], %[high], %[low], 63\n"
+                AES_DEC_ROUND(9, 2) "eor %[low], %[mask], %[low], lsl 1\n"
+                AES_DEC_ROUND(8, 0) "mov %[tweak2].d[1], %[high]\n"
+                AES_DEC_ROUND(8, 1) "mov %[tweak2].d[0], %[low]\n"
+                AES_DEC_ROUND(8, 2)
+                AES_DEC_ROUND(7, 0) AES_DEC_ROUND(7, 1) AES_DEC_ROUND(7, 2)
+                AES_DEC_ROUND(6, 0) AES_DEC_ROUND(6, 1) AES_DEC_ROUND(6, 2)
+                AES_DEC_ROUND(5, 0) AES_DEC_ROUND(5, 1) AES_DEC_ROUND(5, 2)
+                AES_DEC_ROUND(4, 0) AES_DEC_ROUND(4, 1) AES_DEC_ROUND(4, 2)
+                AES_DEC_ROUND(3, 0) AES_DEC_ROUND(3, 1) AES_DEC_ROUND(3, 2)
+                AES_DEC_ROUND(2, 0) AES_DEC_ROUND(2, 1) AES_DEC_ROUND(2, 2)
+                AES_DEC_SECOND_LAST_ROUND(1, 0) AES_DEC_SECOND_LAST_ROUND(1, 1) AES_DEC_SECOND_LAST_ROUND(1, 2)
+                AES_DEC_LAST_ROUND(0, 0) AES_DEC_LAST_ROUND(0, 1) AES_DEC_LAST_ROUND(0, 2)
+                : AES_ENC_DEC_OUTPUT_THREE_BLOCKS(),
+                  AES_ENC_DEC_OUTPUT_THREE_TWEAKS(),
+                  XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK()
+                : AES_ENC_DEC_INPUT_ROUND_KEY(0),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(1),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(2),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(3),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(4),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(5),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(6),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(7),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(8),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(9),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(10),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(11),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(12),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(13),
+                  AES_ENC_DEC_INPUT_ROUND_KEY(14)
+                : "cc"
+            );
+
+            /* XOR blocks. */
+            tmp0 = veorq_u8(mask0, tmp0);
+            tmp1 = veorq_u8(mask1, tmp1);
+            tmp2 = veorq_u8(mask2, tmp2);
+
+            /* Store to output. */
+            vst1q_u8(dst_u8, tmp0);
+            dst_u8 += AES_BLOCK_SIZE;
+            vst1q_u8(dst_u8, tmp1);
+            dst_u8 += AES_BLOCK_SIZE;
+            vst1q_u8(dst_u8, tmp2);
+            dst_u8 += AES_BLOCK_SIZE;
+
+            num_blocks -= 3;
+        }
+    }
+
+    while (num_blocks >= 1) {
+        /* Save tweak for xor usage. */
+        const uint8x16_t mask0 = tweak0;
+
+        /* Read block in, XOR with tweak. */
+        uint8x16_t tmp0 = veorq_u8(mask0, vld1q_u8(src_u8));
+        src_u8 += AES_BLOCK_SIZE;
+
+        /* Actually do decryption, use optimized asm. */
+        /* Interleave GF mult calculations with AES ones, to mask latencies. */
+        __asm__ __volatile__ (
+            AES_DEC_ROUND(14, 0) "mov %[high], %[tweak0].d[1]\n"
+            AES_DEC_ROUND(13, 0) "mov %[low], %[tweak0].d[0]\n"
+            AES_DEC_ROUND(12, 0) "and %[mask], %[xor], %[high], asr 63\n"
+            AES_DEC_ROUND(11, 0) "extr %[high], %[high], %[low], 63\n"
+            AES_DEC_ROUND(10, 0) "eor %[low], %[mask], %[low], lsl 1\n"
+            AES_DEC_ROUND(9, 0) "mov %[tweak0].d[1], %[high]\n"
+            AES_DEC_ROUND(8, 0) "mov %[tweak0].d[0], %[low]\n"
+            AES_DEC_ROUND(7, 0)
+            AES_DEC_ROUND(6, 0)
+            AES_DEC_ROUND(5, 0)
+            AES_DEC_ROUND(4, 0)
+            AES_DEC_ROUND(3, 0)
+            AES_DEC_ROUND(2, 0)
+            AES_DEC_SECOND_LAST_ROUND(1, 0)
+            AES_DEC_LAST_ROUND(0, 0)
+            : AES_ENC_DEC_OUTPUT_ONE_BLOCK(),
+              AES_ENC_DEC_OUTPUT_ONE_TWEAK(),
+              XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK()
+            : XTS_INCREMENT_INPUT_XOR(),
+              AES_ENC_DEC_INPUT_ROUND_KEY(0),
+              AES_ENC_DEC_INPUT_ROUND_KEY(1),
+              AES_ENC_DEC_INPUT_ROUND_KEY(2),
+              AES_ENC_DEC_INPUT_ROUND_KEY(3),
+              AES_ENC_DEC_INPUT_ROUND_KEY(4),
+              AES_ENC_DEC_INPUT_ROUND_KEY(5),
+              AES_ENC_DEC_INPUT_ROUND_KEY(6),
+              AES_ENC_DEC_INPUT_ROUND_KEY(7),
+              AES_ENC_DEC_INPUT_ROUND_KEY(8),
+              AES_ENC_DEC_INPUT_ROUND_KEY(9),
+              AES_ENC_DEC_INPUT_ROUND_KEY(10),
+              AES_ENC_DEC_INPUT_ROUND_KEY(11),
+              AES_ENC_DEC_INPUT_ROUND_KEY(12),
+              AES_ENC_DEC_INPUT_ROUND_KEY(13),
+              AES_ENC_DEC_INPUT_ROUND_KEY(14)
+            : "cc"
+        );
+
+        /* XOR blocks. */
+        tmp0 = veorq_u8(mask0, tmp0);
+
+        /* Store to output. */
+        vst1q_u8(dst_u8, tmp0);
+        dst_u8 += AES_BLOCK_SIZE;
+
+        num_blocks--;
+    }
+
+    vst1q_u8(ctx->tweak, tweak0);
+}
+
+size_t aes256XtsDecrypt(Aes256XtsContext *ctx, void *dst, const void *src, size_t size) {
+    CRYPT_FUNC_BODY(_aes256XtsDecryptBlocks);
+}
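
Multi-sector usage note (illustrative, not part of the patch): the tweak must be re-derived for every sector, so bulk operations wrap the per-sector calls in a loop. A sketch with hypothetical names:

    void xtsDecryptSectors(Aes128XtsContext *ctx, u8 *data, uint64_t first_sector, size_t num_sectors, size_t sector_size, bool nintendo) {
        for (size_t i = 0; i < num_sectors; i++) {
            /* Re-derive the tweak for each sector, then decrypt it in place. */
            aes128XtsContextResetSector(ctx, first_sector + i, nintendo);
            aes128XtsDecrypt(ctx, &data[i * sector_size], &data[i * sector_size], sector_size);
        }
    }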