Implement accelerated SHA256, HMAC-SHA256

This commit is contained in:
Michael Scire 2019-04-03 19:14:51 -07:00 committed by fincs
parent e4dda8fa46
commit 0dde100135
5 changed files with 384 additions and 1 deletions

View File

@ -9,4 +9,7 @@
#include "crypto/aes.h" #include "crypto/aes.h"
#include "crypto/aes_cbc.h" #include "crypto/aes_cbc.h"
#include "crypto/aes_ctr.h" #include "crypto/aes_ctr.h"
#include "crypto/aes_xts.h" #include "crypto/aes_xts.h"
#include "crypto/sha256.h"
#include "crypto/hmac.h"

View File

@ -0,0 +1,25 @@
/**
* @file hmac.h
* @brief Hardware accelerated HMAC-SHA(1, 256) implementation.
* @copyright libnx Authors
*/
#pragma once
#include "sha256.h"
/// Context for HMAC-SHA256 operations.
typedef struct {
Sha256Context sha_ctx;
u32 key[SHA256_BLOCK_SIZE / sizeof(u32)];
u32 mac[SHA256_HASH_SIZE / sizeof(u32)];
bool finalized;
} HmacSha256Context;
/// Initialize a HMAC-SHA256 context.
void hmacSha256ContextCreate(HmacSha256Context *out, const void *key, size_t key_size);
/// Updates HMAC-SHA256 context with data to hash
void hmacSha256ContextUpdate(HmacSha256Context *ctx, const void *src, size_t size);
/// Gets the context's output mac, finalizes the context.
void hmacSha256ContextGetMac(HmacSha256Context *ctx, void *dst);
/// Simple all-in-one HMAC-SHA256 calculator.
void hmacSha256CalculateMac(void *dst, const void *key, size_t key_size, const void *src, size_t size);

View File

@ -0,0 +1,29 @@
/**
* @file sha256.h
* @brief Hardware accelerated SHA256 implementation.
* @copyright libnx Authors
*/
#pragma once
#include "../types.h"
#define SHA256_HASH_SIZE 0x20
#define SHA256_BLOCK_SIZE 0x40
/// Context for SHA256 operations.
typedef struct {
u32 intermediate_hash[SHA256_HASH_SIZE / sizeof(u32)];
u8 buffer[SHA256_BLOCK_SIZE];
u64 bits_consumed;
size_t num_buffered;
bool finalized;
} Sha256Context;
/// Initialize a SHA256 context.
void sha256ContextCreate(Sha256Context *out);
/// Updates SHA256 context with data to hash
void sha256ContextUpdate(Sha256Context *ctx, const void *src, size_t size);
/// Gets the context's output hash, finalizes the context.
void sha256ContextGetHash(Sha256Context *ctx, void *dst);
/// Simple all-in-one SHA256 calculator.
void sha256CalculateHash(void *dst, const void *src, size_t size);

68
nx/source/crypto/hmac.c Normal file
View File

@ -0,0 +1,68 @@
#include <string.h>
#include <stdlib.h>
#include "crypto/hmac.h"
#define HMAC_IPAD_VAL 0x36363636
#define HMAC_OPAD_VAL 0x5c5c5c5c
#define HMAC_IPAD_XOR_OPAD_VAL (HMAC_IPAD_VAL ^ HMAC_OPAD_VAL)
void hmacSha256ContextCreate(HmacSha256Context *out, const void *key, size_t key_size) {
/* Clear key. */
memset(out->key, 0, sizeof(out->key));
/* Either hash the key into the context, or copy it directly if possible. */
if (key_size <= sizeof(out->key)) {
memcpy(out->key, key, key_size);
} else {
sha256ContextCreate(&out->sha_ctx);
sha256ContextUpdate(&out->sha_ctx, key, key_size);
sha256ContextGetHash(&out->sha_ctx, out->key);
}
/* XOR key with IPAD. */
for (size_t i = 0; i < sizeof(out->key) / sizeof(u32); i++) {
out->key[i] ^= HMAC_IPAD_VAL;
}
/* Update hash context with key ^ ipad. */
sha256ContextCreate(&out->sha_ctx);
sha256ContextUpdate(&out->sha_ctx, out->key, sizeof(out->key));
}
void hmacSha256ContextUpdate(HmacSha256Context *ctx, const void *src, size_t size) {
/* Just update, since we want H((key ^ ipad) || data). */
sha256ContextUpdate(&ctx->sha_ctx, src, size);
}
void hmacSha256ContextGetMac(HmacSha256Context *ctx, void *dst) {
if (!ctx->finalized) {
/* Save H((key ^ ipad) || data) into mac. */
sha256ContextGetHash(&ctx->sha_ctx, ctx->mac);
/* We want key ^ opad, so we xor (key ^ ipad) with (ipad ^ opad). */
for (size_t i = 0; i < sizeof(ctx->key) / sizeof(u32); i++) {
ctx->key[i] ^= HMAC_IPAD_XOR_OPAD_VAL;
}
/* Calculate H((key ^ opad) || H((key ^ ipad) || data)). */
sha256ContextCreate(&ctx->sha_ctx);
sha256ContextUpdate(&ctx->sha_ctx, ctx->key, sizeof(ctx->key));
sha256ContextUpdate(&ctx->sha_ctx, ctx->mac, sizeof(ctx->mac));
sha256ContextGetHash(&ctx->sha_ctx, ctx->mac);
/* We're done. */
ctx->finalized = true;
}
memcpy(dst, ctx->mac, sizeof(ctx->mac));
}
void hmacSha256CalculateMac(void *dst, const void *key, size_t key_size, const void *src, size_t size) {
/* Make a new context, calculate hash, store to output, clear memory. */
HmacSha256Context ctx;
hmacSha256ContextCreate(&ctx, key, key_size);
hmacSha256ContextUpdate(&ctx, src, size);
hmacSha256ContextGetMac(&ctx, dst);
memset(&ctx, 0, sizeof(ctx));
}

258
nx/source/crypto/sha256.c Normal file
View File

@ -0,0 +1,258 @@
#include <string.h>
#include <stdlib.h>
#include <arm_neon.h>
#include "crypto/sha256.h"
alignas(SHA256_BLOCK_SIZE) static const u32 s_roundConstants[0x40] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
void sha256ContextCreate(Sha256Context *out) {
static const u32 H_0[SHA256_HASH_SIZE / sizeof(u32)] = {
0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
};
memcpy(out->intermediate_hash, H_0, sizeof(out->intermediate_hash));
memset(out->buffer, 0, sizeof(out->buffer));
out->bits_consumed = 0;
out->num_buffered = 0;
out->finalized = false;
}
static void _sha256ProcessBlocks(Sha256Context *ctx, const u8 *src_u8, size_t num_blocks) {
/* Load previous hash with intermediate state, current hash with zeroes. */
uint32x4_t prev_hash0 = vld1q_u32(ctx->intermediate_hash + 0);
uint32x4_t prev_hash1 = vld1q_u32(ctx->intermediate_hash + 4);
uint32x4_t cur_hash0 = vdupq_n_u32(0);
uint32x4_t cur_hash1 = vdupq_n_u32(0);
/* Actually do hash processing blocks. */
while (num_blocks > 0) {
uint32x4_t round_constant0, round_constant1;
uint32x4_t data0, data1, data2, data3;
uint32x4_t tmp0, tmp1, tmp2, tmp3;
uint32x4_t tmp_hash;
/* Use optimized ASM implementation to process the block. */
__asm__ __volatile__ (
"ldp %q[data0], %q[data1], [%[src_u8]], #0x20\n"
"ldp %q[data2], %q[data3], [%[src_u8]], #0x20\n"
"add %[cur_hash0].4s, %[cur_hash0].4s, %[prev_hash0].4s\n"
"ldp %q[round_constant0], %q[round_constant1], [%[round_constants], 0x00]\n"
"add %[cur_hash1].4s, %[cur_hash1].4s, %[prev_hash1].4s\n"
"rev32 %[data0].16b, %[data0].16b\n"
"rev32 %[data1].16b, %[data1].16b\n"
"rev32 %[data2].16b, %[data2].16b\n"
"rev32 %[data3].16b, %[data3].16b\n"
"add %[tmp0].4s, %[data0].4s, %[round_constant0].4s\n"
"add %[tmp1].4s, %[data1].4s, %[round_constant1].4s\n"
"ldp %q[round_constant0], %q[round_constant1], [%[round_constants], 0x20]\n"
"sha256su0 %[data0].4s, %[data1].4s\n"
"mov %[prev_hash0].16b, %[cur_hash0].16b\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp0].4s\n"
"mov %[prev_hash1].16b, %[cur_hash1].16b\n"
"sha256h2 %q[cur_hash1], %q[prev_hash0], %[tmp0].4s\n"
"sha256su0 %[data1].4s, %[data2].4s\n"
"sha256su1 %[data0].4s, %[data2].4s, %[data3].4s\n"
"add %[tmp2].4s, %[data2].4s, %[round_constant0].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp1].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp1].4s\n"
"sha256su0 %[data2].4s, %[data3].4s\n"
"sha256su1 %[data1].4s, %[data3].4s, %[data0].4s\n"
"add %[tmp3].4s, %[data3].4s, %[round_constant1].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"ldp %q[round_constant0], %q[round_constant1], [%[round_constants], 0x40]\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp2].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp2].4s\n"
"sha256su0 %[data3].4s, %[data0].4s\n"
"sha256su1 %[data2].4s, %[data0].4s, %[data1].4s\n"
"add %[tmp0].4s, %[data0].4s, %[round_constant0].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp3].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp3].4s\n"
"sha256su0 %[data0].4s, %[data1].4s\n"
"sha256su1 %[data3].4s, %[data1].4s, %[data2].4s\n"
"add %[tmp1].4s, %[data1].4s, %[round_constant1].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"ldp %q[round_constant0], %q[round_constant1], [%[round_constants], 0x60]\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp0].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp0].4s\n"
"sha256su0 %[data1].4s, %[data2].4s\n"
"sha256su1 %[data0].4s, %[data2].4s, %[data3].4s\n"
"add %[tmp2].4s, %[data2].4s, %[round_constant0].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp1].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp1].4s\n"
"sha256su0 %[data2].4s, %[data3].4s\n"
"sha256su1 %[data1].4s, %[data3].4s, %[data0].4s\n"
"add %[tmp3].4s, %[data3].4s, %[round_constant1].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"ldp %q[round_constant0], %q[round_constant1], [%[round_constants], 0x80]\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp2].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp2].4s\n"
"sha256su0 %[data3].4s, %[data0].4s\n"
"sha256su1 %[data2].4s, %[data0].4s, %[data1].4s\n"
"add %[tmp0].4s, %[data0].4s, %[round_constant0].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp3].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp3].4s\n"
"sha256su0 %[data0].4s, %[data1].4s\n"
"sha256su1 %[data3].4s, %[data1].4s, %[data2].4s\n"
"add %[tmp1].4s, %[data1].4s, %[round_constant1].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"ldp %q[round_constant0], %q[round_constant1], [%[round_constants], 0xA0]\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp0].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp0].4s\n"
"sha256su0 %[data1].4s, %[data2].4s\n"
"sha256su1 %[data0].4s, %[data2].4s, %[data3].4s\n"
"add %[tmp2].4s, %[data2].4s, %[round_constant0].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp1].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp1].4s\n"
"sha256su0 %[data2].4s, %[data3].4s\n"
"sha256su1 %[data1].4s, %[data3].4s, %[data0].4s\n"
"add %[tmp3].4s, %[data3].4s, %[round_constant1].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"ldp %q[round_constant0], %q[round_constant1], [%[round_constants], 0xC0]\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp2].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp2].4s\n"
"sha256su0 %[data3].4s, %[data0].4s\n"
"sha256su1 %[data2].4s, %[data0].4s, %[data1].4s\n"
"add %[tmp0].4s, %[data0].4s, %[round_constant0].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp3].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp3].4s\n"
"sha256su1 %[data3].4s, %[data1].4s, %[data2].4s\n"
"add %[tmp1].4s, %[data1].4s, %[round_constant1].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"ldp %q[round_constant0], %q[round_constant1], [%[round_constants], 0xE0]\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp0].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp0].4s\n"
"add %[tmp2].4s, %[data2].4s, %[round_constant0].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp1].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp1].4s\n"
"add %[tmp3].4s, %[data3].4s, %[round_constant1].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp2].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp2].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp3].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp3].4s\n"
: [data0]"=w"(data0), [data1]"=w"(data1), [data2]"=w"(data2), [data3]"=w"(data3),
[tmp0]"=w"(tmp0), [tmp1]"=w"(tmp1), [tmp2]"=w"(tmp2), [tmp3]"=w"(tmp3),
[round_constant0]"=w"(round_constant0), [round_constant1]"=w"(round_constant1),
[cur_hash0]"+w"(cur_hash0), [cur_hash1]"+w"(cur_hash1),
[prev_hash0]"+w"(prev_hash0), [prev_hash1]"+w"(prev_hash1),
[tmp_hash]"=w"(tmp_hash), [src_u8]"+r"(src_u8)
: [round_constants]"r"(s_roundConstants)
:
);
num_blocks--;
}
/* Add hashes together, and store. */
cur_hash0 = vaddq_u32(prev_hash0, cur_hash0);
cur_hash1 = vaddq_u32(prev_hash1, cur_hash1);
vst1q_u32(ctx->intermediate_hash + 0, cur_hash0);
vst1q_u32(ctx->intermediate_hash + 4, cur_hash1);
}
void sha256ContextUpdate(Sha256Context *ctx, const void *src, size_t size) {
/* Convert src to u8* for utility. */
const u8 *cur_src = (const u8 *)src;
/* Update bits consumed. */
ctx->bits_consumed += (((ctx->num_buffered + size) / SHA256_BLOCK_SIZE) * SHA256_BLOCK_SIZE) * 8;
/* Handle pre-buffered data. */
if (ctx->num_buffered > 0) {
const size_t needed = SHA256_BLOCK_SIZE - ctx->num_buffered;
const size_t copyable = (size > needed ? needed : size);
memcpy(&ctx->buffer[ctx->num_buffered], cur_src, copyable);
cur_src += copyable;
ctx->num_buffered += copyable;
size -= copyable;
if (ctx->num_buffered == SHA256_BLOCK_SIZE) {
_sha256ProcessBlocks(ctx, ctx->buffer, 1);
ctx->num_buffered = 0;
}
}
/* Handle complete blocks. */
if (size >= SHA256_BLOCK_SIZE) {
const size_t num_blocks = size / SHA256_BLOCK_SIZE;
_sha256ProcessBlocks(ctx, cur_src, num_blocks);
size -= SHA256_BLOCK_SIZE * num_blocks;
cur_src += SHA256_BLOCK_SIZE * num_blocks;
}
/* Buffer remaining data. */
if (size > 0) {
memcpy(ctx->buffer, cur_src, size);
ctx->num_buffered = size;
}
}
void sha256ContextGetHash(Sha256Context *ctx, void *dst) {
if (!ctx->finalized) {
/* Process last block, if necessary. */
{
ctx->bits_consumed += 8 * ctx->num_buffered;
ctx->buffer[ctx->num_buffered++] = 0x80;
const size_t last_block_max_size = SHA256_BLOCK_SIZE - sizeof(u64);
/* If we've got space for the bits consumed field, just set to zero. */
if (ctx->num_buffered <= last_block_max_size) {
memset(ctx->buffer + ctx->num_buffered, 0, last_block_max_size - ctx->num_buffered);
} else {
/* Pad with zeroes, and process. */
memset(ctx->buffer + ctx->num_buffered, 0, SHA256_BLOCK_SIZE - ctx->num_buffered);
_sha256ProcessBlocks(ctx, ctx->buffer, 1);
/* Clear the rest of the buffer with zeroes. */
memset(ctx->buffer, 0, last_block_max_size);
}
/* Copy in bits consumed field, then process last block. */
u64 big_endian_bits_consumed = __builtin_bswap64(ctx->bits_consumed);
memcpy(ctx->buffer + last_block_max_size, &big_endian_bits_consumed, sizeof(big_endian_bits_consumed));
_sha256ProcessBlocks(ctx, ctx->buffer, 1);
}
ctx->finalized = true;
}
/* Copy endian-swapped intermediate hash out. */
u32 *dst_u32 = (u32 *)dst;
for (size_t i = 0; i < sizeof(ctx->intermediate_hash) / sizeof(u32); i++) {
dst_u32[i] = __builtin_bswap32(ctx->intermediate_hash[i]);
}
}
void sha256CalculateHash(void *dst, const void *src, size_t size) {
/* Make a new context, calculate hash, store to output. */
Sha256Context ctx;
sha256ContextCreate(&ctx);
sha256ContextUpdate(&ctx, src, size);
sha256ContextGetHash(&ctx, dst);
}