Implement AES-(128, 192, 256)-CMAC

This commit is contained in:
Michael Scire 2019-04-04 01:54:33 -07:00 committed by fincs
parent 13a690a5a8
commit e4481e12cb
3 changed files with 449 additions and 0 deletions

View File

@ -10,6 +10,7 @@
#include "crypto/aes_cbc.h" #include "crypto/aes_cbc.h"
#include "crypto/aes_ctr.h" #include "crypto/aes_ctr.h"
#include "crypto/aes_xts.h" #include "crypto/aes_xts.h"
#include "crypto/cmac.h"
#include "crypto/sha256.h" #include "crypto/sha256.h"
#include "crypto/sha1.h" #include "crypto/sha1.h"

View File

@ -0,0 +1,67 @@
/**
* @file cmac.h
* @brief Hardware accelerated AES-CMAC implementation.
* @copyright libnx Authors
*/
#pragma once
#include "aes.h"
/// Context for AES-128 CMAC.
typedef struct {
Aes128Context ctx;
u8 subkey[AES_BLOCK_SIZE];
u8 mac[AES_BLOCK_SIZE];
u8 buffer[AES_BLOCK_SIZE];
size_t num_buffered;
bool finalized;
} Aes128CmacContext;
/// Context for AES-192 CMAC.
typedef struct {
Aes192Context ctx;
u8 subkey[AES_BLOCK_SIZE];
u8 mac[AES_BLOCK_SIZE];
u8 buffer[AES_BLOCK_SIZE];
size_t num_buffered;
bool finalized;
} Aes192CmacContext;
/// Context for AES-256 CMAC.
typedef struct {
Aes256Context ctx;
u8 subkey[AES_BLOCK_SIZE];
u8 mac[AES_BLOCK_SIZE];
u8 buffer[AES_BLOCK_SIZE];
size_t num_buffered;
bool finalized;
} Aes256CmacContext;
/// Initialize an AES-128-CMAC context.
void cmacAes128ContextCreate(Aes128CmacContext *out, const void *key);
/// Updates AES-128-CMAC context with data to hash
void cmacAes128ContextUpdate(Aes128CmacContext *ctx, const void *src, size_t size);
/// Gets the context's output mac, finalizes the context.
void cmacAes128ContextGetMac(Aes128CmacContext *ctx, void *dst);
/// Simple all-in-one AES-128-CMAC calculator.
void cmacAes128CalculateMac(void *dst, const void *key, const void *src, size_t size);
/// Initialize an AES-192-CMAC context.
void cmacAes192ContextCreate(Aes192CmacContext *out, const void *key);
/// Updates AES-192-CMAC context with data to hash
void cmacAes192ContextUpdate(Aes192CmacContext *ctx, const void *src, size_t size);
/// Gets the context's output mac, finalizes the context.
void cmacAes192ContextGetMac(Aes192CmacContext *ctx, void *dst);
/// Simple all-in-one AES-192-CMAC calculator.
void cmacAes192CalculateMac(void *dst, const void *key, const void *src, size_t size);
/// Initialize an AES-256-CMAC context.
void cmacAes256ContextCreate(Aes256CmacContext *out, const void *key);
/// Updates AES-256-CMAC context with data to hash
void cmacAes256ContextUpdate(Aes256CmacContext *ctx, const void *src, size_t size);
/// Gets the context's output mac, finalizes the context.
void cmacAes256ContextGetMac(Aes256CmacContext *ctx, void *dst);
/// Simple all-in-one AES-256-CMAC calculator.
void cmacAes256CalculateMac(void *dst, const void *key, const void *src, size_t size);

381
nx/source/crypto/cmac.c Normal file
View File

@ -0,0 +1,381 @@
#include <string.h>
#include <stdlib.h>
#include <arm_neon.h>
#include "result.h"
#include "crypto/cmac.h"
/* Variable management macros. */
#define DECLARE_ROUND_KEY_VAR(n) \
const uint8x16_t round_key_##n = vld1q_u8(ctx->ctx.round_keys[n])
#define AES_ENC_DEC_OUTPUT_ONE_BLOCK() \
[tmp0]"+w"(tmp0)
#define AES_ENC_DEC_INPUT_ROUND_KEY(n) \
[round_key_##n]"w"(round_key_##n)
/* AES Encryption macros. */
#define AES_ENC_ROUND(n, i) \
"aese %[tmp" #i "].16b, %[round_key_" #n "].16b\n" \
"aesmc %[tmp" #i "].16b, %[tmp" #i "].16b\n"
#define AES_ENC_SECOND_LAST_ROUND(n, i) \
"aese %[tmp" #i "].16b, %[round_key_" #n "].16b\n"
#define AES_ENC_LAST_ROUND(n, i) \
"eor %[tmp" #i "].16b, %[tmp" #i "].16b, %[round_key_" #n "].16b\n"
/* Function body macros. */
#define CMAC_CONTEXT_CREATE(cipher) \
do { \
cipher##ContextCreate(&out->ctx, key, true); \
memset(&out->subkey, 0, sizeof(out->subkey)); \
cipher##EncryptBlock(&out->ctx, out->subkey, out->subkey); \
vst1q_u8(out->subkey, _galoisMultiply(vld1q_u8(out->subkey))); \
memset(out->mac, 0, sizeof(out->mac)); \
memset(out->buffer, 0, sizeof(out->buffer)); \
out->num_buffered = 0; \
out->finalized = false; \
} while (0)
#define CMAC_CONTEXT_UPDATE(cipher) \
do { \
const u8 *cur_src = src; \
\
/* Handle pre-buffered data. */ \
if (ctx->num_buffered > 0) { \
const size_t needed = sizeof(ctx->buffer) - ctx->num_buffered; \
const size_t copyable = (size > needed ? needed : size); \
memcpy(&ctx->buffer[ctx->num_buffered], cur_src, copyable); \
cur_src += copyable; \
ctx->num_buffered += copyable; \
size -= copyable; \
\
if (ctx->num_buffered == sizeof(ctx->buffer) && size > 0) { \
_cmac##cipher##ProcessBlocks(ctx, ctx->buffer, 1); \
ctx->num_buffered = 0; \
} \
} \
\
/* Handle complete blocks. */ \
if (size >= sizeof(ctx->buffer)) { \
const size_t num_blocks = size / sizeof(ctx->buffer); \
if (num_blocks > 1) { \
_cmac##cipher##ProcessBlocks(ctx, cur_src, num_blocks - 1); \
size -= (num_blocks - 1) * sizeof(ctx->buffer); \
cur_src += (num_blocks - 1) * sizeof(ctx->buffer); \
} \
memcpy(ctx->buffer, cur_src, sizeof(ctx->buffer)); \
size -= sizeof(ctx->buffer); \
cur_src += sizeof(ctx->buffer); \
ctx->num_buffered = sizeof(ctx->buffer); \
} \
\
/* Buffer remaining data. */ \
if (size > 0) { \
if (ctx->num_buffered == AES_BLOCK_SIZE) { \
_cmac##cipher##ProcessBlocks(ctx, ctx->buffer, 1); \
ctx->num_buffered = 0; \
} \
memcpy(ctx->buffer, cur_src, size); \
ctx->num_buffered = size; \
} \
} while (0)
#define CMAC_CONTEXT_GET_MAC(cipher) \
do { \
if (!ctx->finalized) { \
/* Handle unaligned buffered data. */ \
if (ctx->num_buffered != sizeof(ctx->buffer)) { \
static const u8 padding[sizeof(ctx->buffer)] = {0x80,}; \
const size_t needed = sizeof(ctx->buffer) - ctx->num_buffered; \
memcpy(ctx->buffer + ctx->num_buffered, padding, needed); \
ctx->num_buffered = sizeof(ctx->buffer); \
vst1q_u8(ctx->subkey, _galoisMultiply(vld1q_u8(ctx->subkey))); \
} \
/* Mask in subkey. */ \
for (size_t i = 0; i < sizeof(ctx->buffer); i++) { \
ctx->buffer[i] ^= ctx->subkey[i]; \
} \
_cmac##cipher##ProcessBlocks(ctx, ctx->buffer, 1); \
ctx->num_buffered = 0; \
ctx->finalized = true; \
} \
\
memcpy(dst, ctx->mac, sizeof(ctx->mac)); \
} while (0)
#define CMAC_AES_CALCULATE_MAC(cipher) \
do { \
/* Make a new context, calculate hash, store to output, clear memory. */ \
cipher##CmacContext ctx; \
cmac##cipher##ContextCreate(&ctx, key); \
cmac##cipher##ContextUpdate(&ctx, src, size); \
cmac##cipher##ContextGetMac(&ctx, dst); \
memset(&ctx, 0, sizeof(ctx)); \
} while (0)
/* Multiplies block in galois field. */
static inline uint8x16_t _galoisMultiply(const uint8x16_t val) {
uint8x16_t mult;
uint64_t high, low, mask;
const uint64_t xor = 0x87ul;
/* Use ASM. TODO: Better than using intrinsics? */
__asm__ __volatile__ (
"mov %[high], %[val].d[0]\n"
"mov %[low], %[val].d[1]\n"
"rev %[high], %[high]\n"
"rev %[low], %[low]\n"
"and %[mask], %[xor], %[high], asr 63\n"
"extr %[high], %[high], %[low], 63\n"
"eor %[low], %[mask], %[low], lsl 1\n"
"rev %[high], %[high]\n"
"rev %[low], %[low]\n"
"mov %[mult].d[0], %[high]\n"
"mov %[mult].d[1], %[low]\n"
: [mult]"=w"(mult),
[high]"=&r"(high), [low]"=&r"(low), [mask]"=&r"(mask)
: [val]"w"(val),
[xor]"r"(xor)
: "cc"
);
return mult;
}
static void _cmacAes128ProcessBlocks(Aes128CmacContext *ctx, const u8 *src_u8, size_t num_blocks) {
/* Preload all round keys + iv into neon registers. */
DECLARE_ROUND_KEY_VAR(0);
DECLARE_ROUND_KEY_VAR(1);
DECLARE_ROUND_KEY_VAR(2);
DECLARE_ROUND_KEY_VAR(3);
DECLARE_ROUND_KEY_VAR(4);
DECLARE_ROUND_KEY_VAR(5);
DECLARE_ROUND_KEY_VAR(6);
DECLARE_ROUND_KEY_VAR(7);
DECLARE_ROUND_KEY_VAR(8);
DECLARE_ROUND_KEY_VAR(9);
DECLARE_ROUND_KEY_VAR(10);
uint8x16_t cur_mac = vld1q_u8(ctx->mac);
/* Process blocks one-by-one. */
while (num_blocks >= 1) {
/* Read block in, xor with MAC. */
uint8x16_t tmp0 = veorq_u8(cur_mac, vld1q_u8(src_u8));
src_u8 += AES_BLOCK_SIZE;
/* Actually do encryption, use optimized asm. */
__asm__ __volatile__ (
AES_ENC_ROUND(0, 0)
AES_ENC_ROUND(1, 0)
AES_ENC_ROUND(2, 0)
AES_ENC_ROUND(3, 0)
AES_ENC_ROUND(4, 0)
AES_ENC_ROUND(5, 0)
AES_ENC_ROUND(6, 0)
AES_ENC_ROUND(7, 0)
AES_ENC_ROUND(8, 0)
AES_ENC_SECOND_LAST_ROUND(9, 0)
AES_ENC_LAST_ROUND(10, 0)
: AES_ENC_DEC_OUTPUT_ONE_BLOCK()
: AES_ENC_DEC_INPUT_ROUND_KEY(0),
AES_ENC_DEC_INPUT_ROUND_KEY(1),
AES_ENC_DEC_INPUT_ROUND_KEY(2),
AES_ENC_DEC_INPUT_ROUND_KEY(3),
AES_ENC_DEC_INPUT_ROUND_KEY(4),
AES_ENC_DEC_INPUT_ROUND_KEY(5),
AES_ENC_DEC_INPUT_ROUND_KEY(6),
AES_ENC_DEC_INPUT_ROUND_KEY(7),
AES_ENC_DEC_INPUT_ROUND_KEY(8),
AES_ENC_DEC_INPUT_ROUND_KEY(9),
AES_ENC_DEC_INPUT_ROUND_KEY(10)
);
/* Update MAC. */
cur_mac = tmp0;
num_blocks--;
}
vst1q_u8(ctx->mac, cur_mac);
}
static void _cmacAes192ProcessBlocks(Aes192CmacContext *ctx, const u8 *src_u8, size_t num_blocks) {
/* Preload all round keys + iv into neon registers. */
DECLARE_ROUND_KEY_VAR(0);
DECLARE_ROUND_KEY_VAR(1);
DECLARE_ROUND_KEY_VAR(2);
DECLARE_ROUND_KEY_VAR(3);
DECLARE_ROUND_KEY_VAR(4);
DECLARE_ROUND_KEY_VAR(5);
DECLARE_ROUND_KEY_VAR(6);
DECLARE_ROUND_KEY_VAR(7);
DECLARE_ROUND_KEY_VAR(8);
DECLARE_ROUND_KEY_VAR(9);
DECLARE_ROUND_KEY_VAR(10);
DECLARE_ROUND_KEY_VAR(11);
DECLARE_ROUND_KEY_VAR(12);
uint8x16_t cur_mac = vld1q_u8(ctx->mac);
/* Process blocks one-by-one. */
while (num_blocks >= 1) {
/* Read block in, xor with MAC. */
uint8x16_t tmp0 = veorq_u8(cur_mac, vld1q_u8(src_u8));
src_u8 += AES_BLOCK_SIZE;
/* Actually do encryption, use optimized asm. */
__asm__ __volatile__ (
AES_ENC_ROUND(0, 0)
AES_ENC_ROUND(1, 0)
AES_ENC_ROUND(2, 0)
AES_ENC_ROUND(3, 0)
AES_ENC_ROUND(4, 0)
AES_ENC_ROUND(5, 0)
AES_ENC_ROUND(6, 0)
AES_ENC_ROUND(7, 0)
AES_ENC_ROUND(8, 0)
AES_ENC_ROUND(9, 0)
AES_ENC_ROUND(10, 0)
AES_ENC_SECOND_LAST_ROUND(11, 0)
AES_ENC_LAST_ROUND(12, 0)
: AES_ENC_DEC_OUTPUT_ONE_BLOCK()
: AES_ENC_DEC_INPUT_ROUND_KEY(0),
AES_ENC_DEC_INPUT_ROUND_KEY(1),
AES_ENC_DEC_INPUT_ROUND_KEY(2),
AES_ENC_DEC_INPUT_ROUND_KEY(3),
AES_ENC_DEC_INPUT_ROUND_KEY(4),
AES_ENC_DEC_INPUT_ROUND_KEY(5),
AES_ENC_DEC_INPUT_ROUND_KEY(6),
AES_ENC_DEC_INPUT_ROUND_KEY(7),
AES_ENC_DEC_INPUT_ROUND_KEY(8),
AES_ENC_DEC_INPUT_ROUND_KEY(9),
AES_ENC_DEC_INPUT_ROUND_KEY(10),
AES_ENC_DEC_INPUT_ROUND_KEY(11),
AES_ENC_DEC_INPUT_ROUND_KEY(12)
);
/* Update MAC. */
cur_mac = tmp0;
num_blocks--;
}
vst1q_u8(ctx->mac, cur_mac);
}
static void _cmacAes256ProcessBlocks(Aes256CmacContext *ctx, const u8 *src_u8, size_t num_blocks) {
/* Preload all round keys + iv into neon registers. */
DECLARE_ROUND_KEY_VAR(0);
DECLARE_ROUND_KEY_VAR(1);
DECLARE_ROUND_KEY_VAR(2);
DECLARE_ROUND_KEY_VAR(3);
DECLARE_ROUND_KEY_VAR(4);
DECLARE_ROUND_KEY_VAR(5);
DECLARE_ROUND_KEY_VAR(6);
DECLARE_ROUND_KEY_VAR(7);
DECLARE_ROUND_KEY_VAR(8);
DECLARE_ROUND_KEY_VAR(9);
DECLARE_ROUND_KEY_VAR(10);
DECLARE_ROUND_KEY_VAR(11);
DECLARE_ROUND_KEY_VAR(12);
DECLARE_ROUND_KEY_VAR(13);
DECLARE_ROUND_KEY_VAR(14);
uint8x16_t cur_mac = vld1q_u8(ctx->mac);
/* Process blocks one-by-one. */
while (num_blocks >= 1) {
/* Read block in, xor with MAC. */
uint8x16_t tmp0 = veorq_u8(cur_mac, vld1q_u8(src_u8));
src_u8 += AES_BLOCK_SIZE;
/* Actually do encryption, use optimized asm. */
__asm__ __volatile__ (
AES_ENC_ROUND(0, 0)
AES_ENC_ROUND(1, 0)
AES_ENC_ROUND(2, 0)
AES_ENC_ROUND(3, 0)
AES_ENC_ROUND(4, 0)
AES_ENC_ROUND(5, 0)
AES_ENC_ROUND(6, 0)
AES_ENC_ROUND(7, 0)
AES_ENC_ROUND(8, 0)
AES_ENC_ROUND(9, 0)
AES_ENC_ROUND(10, 0)
AES_ENC_ROUND(11, 0)
AES_ENC_ROUND(12, 0)
AES_ENC_SECOND_LAST_ROUND(13, 0)
AES_ENC_LAST_ROUND(14, 0)
: AES_ENC_DEC_OUTPUT_ONE_BLOCK()
: AES_ENC_DEC_INPUT_ROUND_KEY(0),
AES_ENC_DEC_INPUT_ROUND_KEY(1),
AES_ENC_DEC_INPUT_ROUND_KEY(2),
AES_ENC_DEC_INPUT_ROUND_KEY(3),
AES_ENC_DEC_INPUT_ROUND_KEY(4),
AES_ENC_DEC_INPUT_ROUND_KEY(5),
AES_ENC_DEC_INPUT_ROUND_KEY(6),
AES_ENC_DEC_INPUT_ROUND_KEY(7),
AES_ENC_DEC_INPUT_ROUND_KEY(8),
AES_ENC_DEC_INPUT_ROUND_KEY(9),
AES_ENC_DEC_INPUT_ROUND_KEY(10),
AES_ENC_DEC_INPUT_ROUND_KEY(11),
AES_ENC_DEC_INPUT_ROUND_KEY(12),
AES_ENC_DEC_INPUT_ROUND_KEY(13),
AES_ENC_DEC_INPUT_ROUND_KEY(14)
);
/* Update MAC. */
cur_mac = tmp0;
num_blocks--;
}
vst1q_u8(ctx->mac, cur_mac);
}
void cmacAes128ContextCreate(Aes128CmacContext *out, const void *key) {
CMAC_CONTEXT_CREATE(aes128);
}
void cmacAes128ContextUpdate(Aes128CmacContext *ctx, const void *src, size_t size) {
CMAC_CONTEXT_UPDATE(Aes128);
}
void cmacAes128ContextGetMac(Aes128CmacContext *ctx, void *dst) {
CMAC_CONTEXT_GET_MAC(Aes128);
}
void cmacAes128CalculateMac(void *dst, const void *key, const void *src, size_t size) {
CMAC_AES_CALCULATE_MAC(Aes128);
}
void cmacAes192ContextCreate(Aes192CmacContext *out, const void *key) {
CMAC_CONTEXT_CREATE(aes192);
}
void cmacAes192ContextUpdate(Aes192CmacContext *ctx, const void *src, size_t size) {
CMAC_CONTEXT_UPDATE(Aes192);
}
void cmacAes192ContextGetMac(Aes192CmacContext *ctx, void *dst) {
CMAC_CONTEXT_GET_MAC(Aes192);
}
void cmacAes192CalculateMac(void *dst, const void *key, const void *src, size_t size) {
CMAC_AES_CALCULATE_MAC(Aes192);
}
void cmacAes256ContextCreate(Aes256CmacContext *out, const void *key) {
CMAC_CONTEXT_CREATE(aes256);
}
void cmacAes256ContextUpdate(Aes256CmacContext *ctx, const void *src, size_t size) {
CMAC_CONTEXT_UPDATE(Aes256);
}
void cmacAes256ContextGetMac(Aes256CmacContext *ctx, void *dst) {
CMAC_CONTEXT_GET_MAC(Aes256);
}
void cmacAes256CalculateMac(void *dst, const void *key, const void *src, size_t size) {
CMAC_AES_CALCULATE_MAC(Aes256);
}