diff --git a/nx/include/switch/crypto.h b/nx/include/switch/crypto.h index b8fa7993..70e6a45e 100644 --- a/nx/include/switch/crypto.h +++ b/nx/include/switch/crypto.h @@ -10,6 +10,7 @@ #include "crypto/aes_cbc.h" #include "crypto/aes_ctr.h" #include "crypto/aes_xts.h" +#include "crypto/cmac.h" #include "crypto/sha256.h" #include "crypto/sha1.h" diff --git a/nx/include/switch/crypto/cmac.h b/nx/include/switch/crypto/cmac.h new file mode 100644 index 00000000..d35ae860 --- /dev/null +++ b/nx/include/switch/crypto/cmac.h @@ -0,0 +1,67 @@ +/** + * @file cmac.h + * @brief Hardware accelerated AES-CMAC implementation. + * @copyright libnx Authors + */ +#pragma once +#include "aes.h" + +/// Context for AES-128 CMAC. +typedef struct { + Aes128Context ctx; + u8 subkey[AES_BLOCK_SIZE]; + u8 mac[AES_BLOCK_SIZE]; + u8 buffer[AES_BLOCK_SIZE]; + size_t num_buffered; + bool finalized; +} Aes128CmacContext; + +/// Context for AES-192 CMAC. +typedef struct { + Aes192Context ctx; + u8 subkey[AES_BLOCK_SIZE]; + u8 mac[AES_BLOCK_SIZE]; + u8 buffer[AES_BLOCK_SIZE]; + size_t num_buffered; + bool finalized; +} Aes192CmacContext; + +/// Context for AES-256 CMAC. +typedef struct { + Aes256Context ctx; + u8 subkey[AES_BLOCK_SIZE]; + u8 mac[AES_BLOCK_SIZE]; + u8 buffer[AES_BLOCK_SIZE]; + size_t num_buffered; + bool finalized; +} Aes256CmacContext; + +/// Initialize an AES-128-CMAC context. +void cmacAes128ContextCreate(Aes128CmacContext *out, const void *key); +/// Updates AES-128-CMAC context with data to hash +void cmacAes128ContextUpdate(Aes128CmacContext *ctx, const void *src, size_t size); +/// Gets the context's output mac, finalizes the context. +void cmacAes128ContextGetMac(Aes128CmacContext *ctx, void *dst); + +/// Simple all-in-one AES-128-CMAC calculator. +void cmacAes128CalculateMac(void *dst, const void *key, const void *src, size_t size); + +/// Initialize an AES-192-CMAC context. +void cmacAes192ContextCreate(Aes192CmacContext *out, const void *key); +/// Updates AES-192-CMAC context with data to hash +void cmacAes192ContextUpdate(Aes192CmacContext *ctx, const void *src, size_t size); +/// Gets the context's output mac, finalizes the context. +void cmacAes192ContextGetMac(Aes192CmacContext *ctx, void *dst); + +/// Simple all-in-one AES-192-CMAC calculator. +void cmacAes192CalculateMac(void *dst, const void *key, const void *src, size_t size); + +/// Initialize an AES-256-CMAC context. +void cmacAes256ContextCreate(Aes256CmacContext *out, const void *key); +/// Updates AES-256-CMAC context with data to hash +void cmacAes256ContextUpdate(Aes256CmacContext *ctx, const void *src, size_t size); +/// Gets the context's output mac, finalizes the context. +void cmacAes256ContextGetMac(Aes256CmacContext *ctx, void *dst); + +/// Simple all-in-one AES-256-CMAC calculator. +void cmacAes256CalculateMac(void *dst, const void *key, const void *src, size_t size); diff --git a/nx/source/crypto/cmac.c b/nx/source/crypto/cmac.c new file mode 100644 index 00000000..860806db --- /dev/null +++ b/nx/source/crypto/cmac.c @@ -0,0 +1,381 @@ +#include +#include +#include + +#include "result.h" +#include "crypto/cmac.h" + +/* Variable management macros. */ +#define DECLARE_ROUND_KEY_VAR(n) \ +const uint8x16_t round_key_##n = vld1q_u8(ctx->ctx.round_keys[n]) + +#define AES_ENC_DEC_OUTPUT_ONE_BLOCK() \ +[tmp0]"+w"(tmp0) + +#define AES_ENC_DEC_INPUT_ROUND_KEY(n) \ +[round_key_##n]"w"(round_key_##n) + +/* AES Encryption macros. */ +#define AES_ENC_ROUND(n, i) \ +"aese %[tmp" #i "].16b, %[round_key_" #n "].16b\n" \ +"aesmc %[tmp" #i "].16b, %[tmp" #i "].16b\n" + +#define AES_ENC_SECOND_LAST_ROUND(n, i) \ +"aese %[tmp" #i "].16b, %[round_key_" #n "].16b\n" + +#define AES_ENC_LAST_ROUND(n, i) \ +"eor %[tmp" #i "].16b, %[tmp" #i "].16b, %[round_key_" #n "].16b\n" + +/* Function body macros. */ +#define CMAC_CONTEXT_CREATE(cipher) \ +do { \ + cipher##ContextCreate(&out->ctx, key, true); \ + memset(&out->subkey, 0, sizeof(out->subkey)); \ + cipher##EncryptBlock(&out->ctx, out->subkey, out->subkey); \ + vst1q_u8(out->subkey, _galoisMultiply(vld1q_u8(out->subkey))); \ + memset(out->mac, 0, sizeof(out->mac)); \ + memset(out->buffer, 0, sizeof(out->buffer)); \ + out->num_buffered = 0; \ + out->finalized = false; \ +} while (0) + +#define CMAC_CONTEXT_UPDATE(cipher) \ +do { \ + const u8 *cur_src = src; \ +\ + /* Handle pre-buffered data. */ \ + if (ctx->num_buffered > 0) { \ + const size_t needed = sizeof(ctx->buffer) - ctx->num_buffered; \ + const size_t copyable = (size > needed ? needed : size); \ + memcpy(&ctx->buffer[ctx->num_buffered], cur_src, copyable); \ + cur_src += copyable; \ + ctx->num_buffered += copyable; \ + size -= copyable; \ +\ + if (ctx->num_buffered == sizeof(ctx->buffer) && size > 0) { \ + _cmac##cipher##ProcessBlocks(ctx, ctx->buffer, 1); \ + ctx->num_buffered = 0; \ + } \ + } \ +\ + /* Handle complete blocks. */ \ + if (size >= sizeof(ctx->buffer)) { \ + const size_t num_blocks = size / sizeof(ctx->buffer); \ + if (num_blocks > 1) { \ + _cmac##cipher##ProcessBlocks(ctx, cur_src, num_blocks - 1); \ + size -= (num_blocks - 1) * sizeof(ctx->buffer); \ + cur_src += (num_blocks - 1) * sizeof(ctx->buffer); \ + } \ + memcpy(ctx->buffer, cur_src, sizeof(ctx->buffer)); \ + size -= sizeof(ctx->buffer); \ + cur_src += sizeof(ctx->buffer); \ + ctx->num_buffered = sizeof(ctx->buffer); \ + } \ +\ + /* Buffer remaining data. */ \ + if (size > 0) { \ + if (ctx->num_buffered == AES_BLOCK_SIZE) { \ + _cmac##cipher##ProcessBlocks(ctx, ctx->buffer, 1); \ + ctx->num_buffered = 0; \ + } \ + memcpy(ctx->buffer, cur_src, size); \ + ctx->num_buffered = size; \ + } \ +} while (0) + +#define CMAC_CONTEXT_GET_MAC(cipher) \ +do { \ + if (!ctx->finalized) { \ + /* Handle unaligned buffered data. */ \ + if (ctx->num_buffered != sizeof(ctx->buffer)) { \ + static const u8 padding[sizeof(ctx->buffer)] = {0x80,}; \ + const size_t needed = sizeof(ctx->buffer) - ctx->num_buffered; \ + memcpy(ctx->buffer + ctx->num_buffered, padding, needed); \ + ctx->num_buffered = sizeof(ctx->buffer); \ + vst1q_u8(ctx->subkey, _galoisMultiply(vld1q_u8(ctx->subkey))); \ + } \ + /* Mask in subkey. */ \ + for (size_t i = 0; i < sizeof(ctx->buffer); i++) { \ + ctx->buffer[i] ^= ctx->subkey[i]; \ + } \ + _cmac##cipher##ProcessBlocks(ctx, ctx->buffer, 1); \ + ctx->num_buffered = 0; \ + ctx->finalized = true; \ + } \ +\ + memcpy(dst, ctx->mac, sizeof(ctx->mac)); \ +} while (0) + +#define CMAC_AES_CALCULATE_MAC(cipher) \ +do { \ + /* Make a new context, calculate hash, store to output, clear memory. */ \ + cipher##CmacContext ctx; \ + cmac##cipher##ContextCreate(&ctx, key); \ + cmac##cipher##ContextUpdate(&ctx, src, size); \ + cmac##cipher##ContextGetMac(&ctx, dst); \ + memset(&ctx, 0, sizeof(ctx)); \ +} while (0) + +/* Multiplies block in galois field. */ +static inline uint8x16_t _galoisMultiply(const uint8x16_t val) { + uint8x16_t mult; + uint64_t high, low, mask; + const uint64_t xor = 0x87ul; + /* Use ASM. TODO: Better than using intrinsics? */ + __asm__ __volatile__ ( + "mov %[high], %[val].d[0]\n" + "mov %[low], %[val].d[1]\n" + "rev %[high], %[high]\n" + "rev %[low], %[low]\n" + "and %[mask], %[xor], %[high], asr 63\n" + "extr %[high], %[high], %[low], 63\n" + "eor %[low], %[mask], %[low], lsl 1\n" + "rev %[high], %[high]\n" + "rev %[low], %[low]\n" + "mov %[mult].d[0], %[high]\n" + "mov %[mult].d[1], %[low]\n" + : [mult]"=w"(mult), + [high]"=&r"(high), [low]"=&r"(low), [mask]"=&r"(mask) + : [val]"w"(val), + [xor]"r"(xor) + : "cc" + ); + return mult; +} + +static void _cmacAes128ProcessBlocks(Aes128CmacContext *ctx, const u8 *src_u8, size_t num_blocks) { + /* Preload all round keys + iv into neon registers. */ + DECLARE_ROUND_KEY_VAR(0); + DECLARE_ROUND_KEY_VAR(1); + DECLARE_ROUND_KEY_VAR(2); + DECLARE_ROUND_KEY_VAR(3); + DECLARE_ROUND_KEY_VAR(4); + DECLARE_ROUND_KEY_VAR(5); + DECLARE_ROUND_KEY_VAR(6); + DECLARE_ROUND_KEY_VAR(7); + DECLARE_ROUND_KEY_VAR(8); + DECLARE_ROUND_KEY_VAR(9); + DECLARE_ROUND_KEY_VAR(10); + uint8x16_t cur_mac = vld1q_u8(ctx->mac); + + /* Process blocks one-by-one. */ + while (num_blocks >= 1) { + /* Read block in, xor with MAC. */ + uint8x16_t tmp0 = veorq_u8(cur_mac, vld1q_u8(src_u8)); + src_u8 += AES_BLOCK_SIZE; + + /* Actually do encryption, use optimized asm. */ + __asm__ __volatile__ ( + AES_ENC_ROUND(0, 0) + AES_ENC_ROUND(1, 0) + AES_ENC_ROUND(2, 0) + AES_ENC_ROUND(3, 0) + AES_ENC_ROUND(4, 0) + AES_ENC_ROUND(5, 0) + AES_ENC_ROUND(6, 0) + AES_ENC_ROUND(7, 0) + AES_ENC_ROUND(8, 0) + AES_ENC_SECOND_LAST_ROUND(9, 0) + AES_ENC_LAST_ROUND(10, 0) + : AES_ENC_DEC_OUTPUT_ONE_BLOCK() + : AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10) + ); + + /* Update MAC. */ + cur_mac = tmp0; + + num_blocks--; + } + + vst1q_u8(ctx->mac, cur_mac); +} + +static void _cmacAes192ProcessBlocks(Aes192CmacContext *ctx, const u8 *src_u8, size_t num_blocks) { + /* Preload all round keys + iv into neon registers. */ + DECLARE_ROUND_KEY_VAR(0); + DECLARE_ROUND_KEY_VAR(1); + DECLARE_ROUND_KEY_VAR(2); + DECLARE_ROUND_KEY_VAR(3); + DECLARE_ROUND_KEY_VAR(4); + DECLARE_ROUND_KEY_VAR(5); + DECLARE_ROUND_KEY_VAR(6); + DECLARE_ROUND_KEY_VAR(7); + DECLARE_ROUND_KEY_VAR(8); + DECLARE_ROUND_KEY_VAR(9); + DECLARE_ROUND_KEY_VAR(10); + DECLARE_ROUND_KEY_VAR(11); + DECLARE_ROUND_KEY_VAR(12); + uint8x16_t cur_mac = vld1q_u8(ctx->mac); + + /* Process blocks one-by-one. */ + while (num_blocks >= 1) { + /* Read block in, xor with MAC. */ + uint8x16_t tmp0 = veorq_u8(cur_mac, vld1q_u8(src_u8)); + src_u8 += AES_BLOCK_SIZE; + + /* Actually do encryption, use optimized asm. */ + __asm__ __volatile__ ( + AES_ENC_ROUND(0, 0) + AES_ENC_ROUND(1, 0) + AES_ENC_ROUND(2, 0) + AES_ENC_ROUND(3, 0) + AES_ENC_ROUND(4, 0) + AES_ENC_ROUND(5, 0) + AES_ENC_ROUND(6, 0) + AES_ENC_ROUND(7, 0) + AES_ENC_ROUND(8, 0) + AES_ENC_ROUND(9, 0) + AES_ENC_ROUND(10, 0) + AES_ENC_SECOND_LAST_ROUND(11, 0) + AES_ENC_LAST_ROUND(12, 0) + : AES_ENC_DEC_OUTPUT_ONE_BLOCK() + : AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10), + AES_ENC_DEC_INPUT_ROUND_KEY(11), + AES_ENC_DEC_INPUT_ROUND_KEY(12) + ); + + /* Update MAC. */ + cur_mac = tmp0; + + num_blocks--; + } + + vst1q_u8(ctx->mac, cur_mac); +} + +static void _cmacAes256ProcessBlocks(Aes256CmacContext *ctx, const u8 *src_u8, size_t num_blocks) { + /* Preload all round keys + iv into neon registers. */ + DECLARE_ROUND_KEY_VAR(0); + DECLARE_ROUND_KEY_VAR(1); + DECLARE_ROUND_KEY_VAR(2); + DECLARE_ROUND_KEY_VAR(3); + DECLARE_ROUND_KEY_VAR(4); + DECLARE_ROUND_KEY_VAR(5); + DECLARE_ROUND_KEY_VAR(6); + DECLARE_ROUND_KEY_VAR(7); + DECLARE_ROUND_KEY_VAR(8); + DECLARE_ROUND_KEY_VAR(9); + DECLARE_ROUND_KEY_VAR(10); + DECLARE_ROUND_KEY_VAR(11); + DECLARE_ROUND_KEY_VAR(12); + DECLARE_ROUND_KEY_VAR(13); + DECLARE_ROUND_KEY_VAR(14); + uint8x16_t cur_mac = vld1q_u8(ctx->mac); + + /* Process blocks one-by-one. */ + while (num_blocks >= 1) { + /* Read block in, xor with MAC. */ + uint8x16_t tmp0 = veorq_u8(cur_mac, vld1q_u8(src_u8)); + src_u8 += AES_BLOCK_SIZE; + + /* Actually do encryption, use optimized asm. */ + __asm__ __volatile__ ( + AES_ENC_ROUND(0, 0) + AES_ENC_ROUND(1, 0) + AES_ENC_ROUND(2, 0) + AES_ENC_ROUND(3, 0) + AES_ENC_ROUND(4, 0) + AES_ENC_ROUND(5, 0) + AES_ENC_ROUND(6, 0) + AES_ENC_ROUND(7, 0) + AES_ENC_ROUND(8, 0) + AES_ENC_ROUND(9, 0) + AES_ENC_ROUND(10, 0) + AES_ENC_ROUND(11, 0) + AES_ENC_ROUND(12, 0) + AES_ENC_SECOND_LAST_ROUND(13, 0) + AES_ENC_LAST_ROUND(14, 0) + : AES_ENC_DEC_OUTPUT_ONE_BLOCK() + : AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10), + AES_ENC_DEC_INPUT_ROUND_KEY(11), + AES_ENC_DEC_INPUT_ROUND_KEY(12), + AES_ENC_DEC_INPUT_ROUND_KEY(13), + AES_ENC_DEC_INPUT_ROUND_KEY(14) + ); + + /* Update MAC. */ + cur_mac = tmp0; + + num_blocks--; + } + + vst1q_u8(ctx->mac, cur_mac); +} + +void cmacAes128ContextCreate(Aes128CmacContext *out, const void *key) { + CMAC_CONTEXT_CREATE(aes128); +} + +void cmacAes128ContextUpdate(Aes128CmacContext *ctx, const void *src, size_t size) { + CMAC_CONTEXT_UPDATE(Aes128); +} + +void cmacAes128ContextGetMac(Aes128CmacContext *ctx, void *dst) { + CMAC_CONTEXT_GET_MAC(Aes128); +} + +void cmacAes128CalculateMac(void *dst, const void *key, const void *src, size_t size) { + CMAC_AES_CALCULATE_MAC(Aes128); +} + +void cmacAes192ContextCreate(Aes192CmacContext *out, const void *key) { + CMAC_CONTEXT_CREATE(aes192); +} + +void cmacAes192ContextUpdate(Aes192CmacContext *ctx, const void *src, size_t size) { + CMAC_CONTEXT_UPDATE(Aes192); +} + +void cmacAes192ContextGetMac(Aes192CmacContext *ctx, void *dst) { + CMAC_CONTEXT_GET_MAC(Aes192); +} + +void cmacAes192CalculateMac(void *dst, const void *key, const void *src, size_t size) { + CMAC_AES_CALCULATE_MAC(Aes192); +} + +void cmacAes256ContextCreate(Aes256CmacContext *out, const void *key) { + CMAC_CONTEXT_CREATE(aes256); +} + +void cmacAes256ContextUpdate(Aes256CmacContext *ctx, const void *src, size_t size) { + CMAC_CONTEXT_UPDATE(Aes256); +} + +void cmacAes256ContextGetMac(Aes256CmacContext *ctx, void *dst) { + CMAC_CONTEXT_GET_MAC(Aes256); +} + +void cmacAes256CalculateMac(void *dst, const void *key, const void *src, size_t size) { + CMAC_AES_CALCULATE_MAC(Aes256); +}