From 50a0dceaa2634cd5a2a645ae0714da22fbbb2358 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel
Date: Fri, 19 Jan 2018 12:04:33 +0000
Subject: crypto: sha3-generic - fixes for alignment and big endian operation

Ensure that the input is byte swabbed before injecting it into the
SHA3 transform. Use the get_unaligned() accessor for this so that we
don't perform unaligned access inadvertently on architectures that do
not support that.

Cc:
Fixes: dc4b27a1bc222c3b ("crypto: sha3 - Add SHA-3 hash algorithm")
Signed-off-by: Ard Biesheuvel
Signed-off-by: Herbert Xu
---
 crypto/sha3_generic.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'crypto/sha3_generic.c')

diff --git a/crypto/sha3_generic.c b/crypto/sha3_generic.c
index 7e8ed962..a68be626 100644
--- a/crypto/sha3_generic.c
+++ b/crypto/sha3_generic.c
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include

 #define KECCAK_ROUNDS 24

@@ -149,7 +150,7 @@ static int sha3_update(struct shash_desc *desc, const u8 *data,
 			unsigned int i;

 			for (i = 0; i < sctx->rsizw; i++)
-				sctx->st[i] ^= ((u64 *) src)[i];
+				sctx->st[i] ^= get_unaligned_le64(src + 8 * i);
 			keccakf(sctx->st);

 			done += sctx->rsiz;
@@ -174,7 +175,7 @@ static int sha3_final(struct shash_desc *desc, u8 *out)
 	sctx->buf[sctx->rsiz - 1] |= 0x80;

 	for (i = 0; i < sctx->rsizw; i++)
-		sctx->st[i] ^= ((u64 *) sctx->buf)[i];
+		sctx->st[i] ^= get_unaligned_le64(sctx->buf + 8 * i);

 	keccakf(sctx->st);
-- cgit v1.2.3
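For reference, the absorb step that this fix touches can be modelled in plain C
as below. This is an illustrative, stand-alone sketch, not kernel code:
load_le64() and absorb_block() are made-up names standing in for
get_unaligned_le64() and the loop in sha3_update(). It shows why a
byte-order-aware load through memcpy() both avoids unaligned dereferences and
yields the same digest on big-endian machines:

  #include <stdint.h>
  #include <string.h>

  /* Portable stand-in for the kernel's get_unaligned_le64() accessor. */
  static uint64_t load_le64(const uint8_t *p)
  {
          uint64_t v;

          memcpy(&v, p, sizeof(v));       /* no unaligned dereference */
  #if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
          v = __builtin_bswap64(v);       /* byte swab on big-endian hosts */
  #endif
          return v;
  }

  /* Absorb one rate-sized block into the state, as the fixed sha3_update() does. */
  static void absorb_block(uint64_t st[25], const uint8_t *src, unsigned int rsizw)
  {
          unsigned int i;

          for (i = 0; i < rsizw; i++)
                  st[i] ^= load_le64(src + 8 * i);
  }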
From 116121f60112aefdcda040aae3f8677e778e7303 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel
Date: Fri, 19 Jan 2018 12:04:34 +0000
Subject: crypto: sha3-generic - rewrite KECCAK transform to help the compiler optimize

The way the KECCAK transform is currently coded involves many references
into the state array using indexes that are calculated at runtime using
simple but non-trivial arithmetic. This forces the compiler to treat the
state matrix as an array in memory rather than keep it in registers,
which results in poor performance.

So instead, let's rephrase the algorithm using fixed array indexes only.
This helps the compiler keep the state matrix in registers, resulting
in the following speedup (SHA3-256 performance in cycles per byte):

                                        before   after   speedup
  Intel Core i7 @ 2.0 GHz (2.9 turbo)    100.6    35.7      2.8x
  Cortex-A57 @ 2.0 GHz (64-bit mode)     101.6    12.7      8.0x
  Cortex-A53 @ 1.0 GHz                   224.4    15.8     14.2x
  Cortex-A57 @ 2.0 GHz (32-bit mode)     201.8    63.0      3.2x

Signed-off-by: Ard Biesheuvel
Signed-off-by: Herbert Xu
---
 crypto/sha3_generic.c | 134 ++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 96 insertions(+), 38 deletions(-)

(limited to 'crypto/sha3_generic.c')

diff --git a/crypto/sha3_generic.c b/crypto/sha3_generic.c
index a68be626..5fecb609 100644
--- a/crypto/sha3_generic.c
+++ b/crypto/sha3_generic.c
@@ -5,6 +5,7 @@
  * http://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf
  *
  * SHA-3 code by Jeff Garzik
+ *               Ard Biesheuvel
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
@@ -22,8 +23,6 @@
 #include

 #define KECCAK_ROUNDS 24

-#define ROTL64(x, y) (((x) << (y)) | ((x) >> (64 - (y))))
-
 static const u64 keccakf_rndc[24] = {
 	0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL,
 	0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL,
@@ -35,53 +34,112 @@ static const u64 keccakf_rndc[24] = {
 	0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL
 };

-static const int keccakf_rotc[24] = {
-	1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
-	27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44
-};
-
-static const int keccakf_piln[24] = {
-	10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
-	15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1
-};
-
 /* update the state with given number of rounds */
-static void keccakf(u64 st[25])
+static void __attribute__((__optimize__("O3"))) keccakf(u64 st[25])
 {
-	int i, j, round;
-	u64 t, bc[5];
+	u64 t[5], tt, bc[5];
+	int round;

 	for (round = 0; round < KECCAK_ROUNDS; round++) {

 		/* Theta */
-		for (i = 0; i < 5; i++)
-			bc[i] = st[i] ^ st[i + 5] ^ st[i + 10] ^ st[i + 15]
-				^ st[i + 20];
-
-		for (i = 0; i < 5; i++) {
-			t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1);
-			for (j = 0; j < 25; j += 5)
-				st[j + i] ^= t;
-		}
+		bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20];
+		bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21];
+		bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22];
+		bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23];
+		bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24];
+
+		t[0] = bc[4] ^ rol64(bc[1], 1);
+		t[1] = bc[0] ^ rol64(bc[2], 1);
+		t[2] = bc[1] ^ rol64(bc[3], 1);
+		t[3] = bc[2] ^ rol64(bc[4], 1);
+		t[4] = bc[3] ^ rol64(bc[0], 1);
+
+		st[0] ^= t[0];

 		/* Rho Pi */
-		t = st[1];
-		for (i = 0; i < 24; i++) {
-			j = keccakf_piln[i];
-			bc[0] = st[j];
-			st[j] = ROTL64(t, keccakf_rotc[i]);
-			t = bc[0];
-		}
+		tt = st[1];
+		st[ 1] = rol64(st[ 6] ^ t[1], 44);
+		st[ 6] = rol64(st[ 9] ^ t[4], 20);
+		st[ 9] = rol64(st[22] ^ t[2], 61);
+		st[22] = rol64(st[14] ^ t[4], 39);
+		st[14] = rol64(st[20] ^ t[0], 18);
+		st[20] = rol64(st[ 2] ^ t[2], 62);
+		st[ 2] = rol64(st[12] ^ t[2], 43);
+		st[12] = rol64(st[13] ^ t[3], 25);
+		st[13] = rol64(st[19] ^ t[4], 8);
+		st[19] = rol64(st[23] ^ t[3], 56);
+		st[23] = rol64(st[15] ^ t[0], 41);
+		st[15] = rol64(st[ 4] ^ t[4], 27);
+		st[ 4] = rol64(st[24] ^ t[4], 14);
+		st[24] = rol64(st[21] ^ t[1], 2);
+		st[21] = rol64(st[ 8] ^ t[3], 55);
+		st[ 8] = rol64(st[16] ^ t[1], 45);
+		st[16] = rol64(st[ 5] ^ t[0], 36);
+		st[ 5] = rol64(st[ 3] ^ t[3], 28);
+		st[ 3] = rol64(st[18] ^ t[3], 21);
+		st[18] = rol64(st[17] ^ t[2], 15);
+		st[17] = rol64(st[11] ^ t[1], 10);
+		st[11] = rol64(st[ 7] ^ t[2], 6);
+		st[ 7] = rol64(st[10] ^ t[0], 3);
+		st[10] = rol64(    tt ^ t[1], 1);

 		/* Chi */
-		for (j = 0; j < 25; j += 5) {
-			for (i = 0; i < 5; i++)
-				bc[i] = st[j + i];
-			for (i = 0; i < 5; i++)
-				st[j + i] ^= (~bc[(i + 1) % 5]) &
-					     bc[(i + 2) % 5];
-		}
+		bc[ 0] = ~st[ 1] & st[ 2];
+		bc[ 1] = ~st[ 2] & st[ 3];
+		bc[ 2] = ~st[ 3] & st[ 4];
+		bc[ 3] = ~st[ 4] & st[ 0];
+		bc[ 4] = ~st[ 0] & st[ 1];
+		st[ 0] ^= bc[ 0];
+		st[ 1] ^= bc[ 1];
+		st[ 2] ^= bc[ 2];
+		st[ 3] ^= bc[ 3];
+		st[ 4] ^= bc[ 4];
+
+		bc[ 0] = ~st[ 6] & st[ 7];
+		bc[ 1] = ~st[ 7] & st[ 8];
+		bc[ 2] = ~st[ 8] & st[ 9];
+		bc[ 3] = ~st[ 9] & st[ 5];
+		bc[ 4] = ~st[ 5] & st[ 6];
+		st[ 5] ^= bc[ 0];
+		st[ 6] ^= bc[ 1];
+		st[ 7] ^= bc[ 2];
+		st[ 8] ^= bc[ 3];
+		st[ 9] ^= bc[ 4];
+
+		bc[ 0] = ~st[11] & st[12];
+		bc[ 1] = ~st[12] & st[13];
+		bc[ 2] = ~st[13] & st[14];
+		bc[ 3] = ~st[14] & st[10];
+		bc[ 4] = ~st[10] & st[11];
+		st[10] ^= bc[ 0];
+		st[11] ^= bc[ 1];
+		st[12] ^= bc[ 2];
+		st[13] ^= bc[ 3];
+		st[14] ^= bc[ 4];
+
+		bc[ 0] = ~st[16] & st[17];
+		bc[ 1] = ~st[17] & st[18];
+		bc[ 2] = ~st[18] & st[19];
+		bc[ 3] = ~st[19] & st[15];
+		bc[ 4] = ~st[15] & st[16];
+		st[15] ^= bc[ 0];
+		st[16] ^= bc[ 1];
+		st[17] ^= bc[ 2];
+		st[18] ^= bc[ 3];
+		st[19] ^= bc[ 4];
+
+		bc[ 0] = ~st[21] & st[22];
+		bc[ 1] = ~st[22] & st[23];
+		bc[ 2] = ~st[23] & st[24];
+		bc[ 3] = ~st[24] & st[20];
+		bc[ 4] = ~st[20] & st[21];
+		st[20] ^= bc[ 0];
+		st[21] ^= bc[ 1];
+		st[22] ^= bc[ 2];
+		st[23] ^= bc[ 3];
+		st[24] ^= bc[ 4];

 		/* Iota */
 		st[0] ^= keccakf_rndc[round];
-- cgit v1.2.3
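A note on the helper used throughout the new code: the removed ROTL64() macro
is superseded by the kernel's generic rol64() rotate-left helper. A
stand-alone equivalent (sketch only; the kernel carries its own definition in
its bitops headers) is:

  #include <stdint.h>

  /* Rotate x left by s bits (0 <= s < 64), e.g. t[0] = bc[4] ^ rol64(bc[1], 1). */
  static inline uint64_t rol64_sketch(uint64_t x, unsigned int s)
  {
          return (x << (s & 63)) | (x >> ((64 - s) & 63));
  }

Because every operand in the rewritten transform is a fixed st[n] slot rather
than a computed st[i + j], the compiler is free to assign each of the 25 lanes
to a register for the whole round, which is where the speedup in the table
above comes from.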
From 9fb5b09b8d34b49c67c05eab0d3d92e806c03b24 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel
Date: Fri, 19 Jan 2018 12:04:35 +0000
Subject: crypto: sha3-generic - simplify code

In preparation of exposing the generic SHA3 implementation to other
versions as a fallback, simplify the code, and remove an inconsistency
in the output handling (endian swabbing rsizw words of state before
writing the output does not make sense)

Signed-off-by: Ard Biesheuvel
Signed-off-by: Herbert Xu
---
 crypto/sha3_generic.c | 184 ++++++++++++++++----------------------------------
 1 file changed, 59 insertions(+), 125 deletions(-)

(limited to 'crypto/sha3_generic.c')

diff --git a/crypto/sha3_generic.c b/crypto/sha3_generic.c
index 5fecb609..c7084a24 100644
--- a/crypto/sha3_generic.c
+++ b/crypto/sha3_generic.c
@@ -18,7 +18,6 @@
 #include
 #include
 #include
-#include
 #include

 #define KECCAK_ROUNDS 24

@@ -146,43 +145,16 @@ static void __attribute__((__optimize__("O3"))) keccakf(u64 st[25])
 	}
 }

-static void sha3_init(struct sha3_state *sctx, unsigned int digest_sz)
-{
-	memset(sctx, 0, sizeof(*sctx));
-	sctx->md_len = digest_sz;
-	sctx->rsiz = 200 - 2 * digest_sz;
-	sctx->rsizw = sctx->rsiz / 8;
-}
-
-static int sha3_224_init(struct shash_desc *desc)
+static int sha3_init(struct shash_desc *desc)
 {
 	struct sha3_state *sctx = shash_desc_ctx(desc);
+	unsigned int digest_size = crypto_shash_digestsize(desc->tfm);

-	sha3_init(sctx, SHA3_224_DIGEST_SIZE);
-	return 0;
-}
-
-static int sha3_256_init(struct shash_desc *desc)
-{
-	struct sha3_state *sctx = shash_desc_ctx(desc);
-
-	sha3_init(sctx, SHA3_256_DIGEST_SIZE);
-	return 0;
-}
-
-static int sha3_384_init(struct shash_desc *desc)
-{
-	struct sha3_state *sctx = shash_desc_ctx(desc);
-
-	sha3_init(sctx, SHA3_384_DIGEST_SIZE);
-	return 0;
-}
-
-static int sha3_512_init(struct shash_desc *desc)
-{
-	struct sha3_state *sctx = shash_desc_ctx(desc);
+	sctx->rsiz = 200 - 2 * digest_size;
+	sctx->rsizw = sctx->rsiz / 8;
+	sctx->partial = 0;

-	sha3_init(sctx, SHA3_512_DIGEST_SIZE);
+	memset(sctx->st, 0, sizeof(sctx->st));
 	return 0;
 }

@@ -227,6 +199,8 @@ static int sha3_final(struct shash_desc *desc, u8 *out)
 {
 	struct sha3_state *sctx = shash_desc_ctx(desc);
 	unsigned int i, inlen = sctx->partial;
+	unsigned int digest_size = crypto_shash_digestsize(desc->tfm);
+	__le64 *digest = (__le64 *)out;

 	sctx->buf[inlen++] = 0x06;
 	memset(sctx->buf + inlen, 0, sctx->rsiz - inlen);
@@ -237,110 +211,70 @@ static int sha3_final(struct shash_desc *desc, u8 *out)

 	keccakf(sctx->st);

-	for (i = 0; i < sctx->rsizw; i++)
-		sctx->st[i] = cpu_to_le64(sctx->st[i]);
+	for (i = 0; i < digest_size / 8; i++)
+		put_unaligned_le64(sctx->st[i], digest++);

-	memcpy(out, sctx->st, sctx->md_len);
+	if (digest_size & 4)
+		put_unaligned_le32(sctx->st[i], (__le32 *)digest);

 	memset(sctx, 0, sizeof(*sctx));
 	return 0;
 }

-static struct shash_alg sha3_224 = {
-	.digestsize = SHA3_224_DIGEST_SIZE,
-	.init = sha3_224_init,
-	.update = sha3_update,
-	.final = sha3_final,
-	.descsize = sizeof(struct sha3_state),
-	.base = {
-		.cra_name = "sha3-224",
-		.cra_driver_name = "sha3-224-generic",
-		.cra_flags = CRYPTO_ALG_TYPE_SHASH,
-		.cra_blocksize = SHA3_224_BLOCK_SIZE,
-		.cra_module = THIS_MODULE,
-	}
-};
-
-static struct shash_alg sha3_256 = {
-	.digestsize = SHA3_256_DIGEST_SIZE,
-	.init = sha3_256_init,
-	.update = sha3_update,
-	.final = sha3_final,
-	.descsize = sizeof(struct sha3_state),
-	.base = {
-		.cra_name = "sha3-256",
-		.cra_driver_name = "sha3-256-generic",
-		.cra_flags = CRYPTO_ALG_TYPE_SHASH,
-		.cra_blocksize = SHA3_256_BLOCK_SIZE,
-		.cra_module = THIS_MODULE,
-	}
-};
-
-static struct shash_alg sha3_384 = {
-	.digestsize = SHA3_384_DIGEST_SIZE,
-	.init = sha3_384_init,
-	.update = sha3_update,
-	.final = sha3_final,
-	.descsize = sizeof(struct sha3_state),
-	.base = {
-		.cra_name = "sha3-384",
-		.cra_driver_name = "sha3-384-generic",
-		.cra_flags = CRYPTO_ALG_TYPE_SHASH,
-		.cra_blocksize = SHA3_384_BLOCK_SIZE,
-		.cra_module = THIS_MODULE,
-	}
-};
-
-static struct shash_alg sha3_512 = {
-	.digestsize = SHA3_512_DIGEST_SIZE,
-	.init = sha3_512_init,
-	.update = sha3_update,
-	.final = sha3_final,
-	.descsize = sizeof(struct sha3_state),
-	.base = {
-		.cra_name = "sha3-512",
-		.cra_driver_name = "sha3-512-generic",
-		.cra_flags = CRYPTO_ALG_TYPE_SHASH,
-		.cra_blocksize = SHA3_512_BLOCK_SIZE,
-		.cra_module = THIS_MODULE,
-	}
-};
+static struct shash_alg algs[] = { {
+	.digestsize = SHA3_224_DIGEST_SIZE,
+	.init = sha3_init,
+	.update = sha3_update,
+	.final = sha3_final,
+	.descsize = sizeof(struct sha3_state),
+	.base.cra_name = "sha3-224",
+	.base.cra_driver_name = "sha3-224-generic",
+	.base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
+	.base.cra_blocksize = SHA3_224_BLOCK_SIZE,
+	.base.cra_module = THIS_MODULE,
+}, {
+	.digestsize = SHA3_256_DIGEST_SIZE,
+	.init = sha3_init,
+	.update = sha3_update,
+	.final = sha3_final,
+	.descsize = sizeof(struct sha3_state),
+	.base.cra_name = "sha3-256",
+	.base.cra_driver_name = "sha3-256-generic",
+	.base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
+	.base.cra_blocksize = SHA3_256_BLOCK_SIZE,
+	.base.cra_module = THIS_MODULE,
+}, {
+	.digestsize = SHA3_384_DIGEST_SIZE,
+	.init = sha3_init,
+	.update = sha3_update,
+	.final = sha3_final,
+	.descsize = sizeof(struct sha3_state),
+	.base.cra_name = "sha3-384",
+	.base.cra_driver_name = "sha3-384-generic",
+	.base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
+	.base.cra_blocksize = SHA3_384_BLOCK_SIZE,
+	.base.cra_module = THIS_MODULE,
+}, {
+	.digestsize = SHA3_512_DIGEST_SIZE,
+	.init = sha3_init,
+	.update = sha3_update,
+	.final = sha3_final,
+	.descsize = sizeof(struct sha3_state),
+	.base.cra_name = "sha3-512",
+	.base.cra_driver_name = "sha3-512-generic",
+	.base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
+	.base.cra_blocksize = SHA3_512_BLOCK_SIZE,
+	.base.cra_module = THIS_MODULE,
+} };

 static int __init sha3_generic_mod_init(void)
 {
-	int ret;
-
-	ret = crypto_register_shash(&sha3_224);
-	if (ret < 0)
-		goto err_out;
-	ret = crypto_register_shash(&sha3_256);
-	if (ret < 0)
-		goto err_out_224;
-	ret = crypto_register_shash(&sha3_384);
-	if (ret < 0)
-		goto err_out_256;
-	ret = crypto_register_shash(&sha3_512);
-	if (ret < 0)
-		goto err_out_384;
-
-	return 0;
-
-err_out_384:
-	crypto_unregister_shash(&sha3_384);
-err_out_256:
-	crypto_unregister_shash(&sha3_256);
-err_out_224:
-	crypto_unregister_shash(&sha3_224);
-err_out:
-	return ret;
+	return crypto_register_shashes(algs, ARRAY_SIZE(algs));
 }

 static void __exit sha3_generic_mod_fini(void)
 {
-	crypto_unregister_shash(&sha3_224);
-	crypto_unregister_shash(&sha3_256);
-	crypto_unregister_shash(&sha3_384);
-	crypto_unregister_shash(&sha3_512);
+	crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
 }

 module_init(sha3_generic_mod_init);
-- cgit v1.2.3
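After this simplification, everything is derived from the digest size: the
rate is rsiz = 200 - 2 * digest_size bytes, absorbed as rsizw = rsiz / 8
64-bit words, and sha3_final() writes exactly digest_size little-endian bytes
(three full words plus one trailing 32-bit word for SHA3-224, whose 28-byte
digest is not a multiple of 8). A small stand-alone check of that arithmetic
(illustration only, not kernel code):

  #include <stdio.h>

  int main(void)
  {
          /* digest sizes in bytes for SHA3-224/256/384/512 */
          static const unsigned int digest_size[] = { 28, 32, 48, 64 };
          unsigned int i;

          for (i = 0; i < 4; i++) {
                  unsigned int rsiz = 200 - 2 * digest_size[i];  /* rate in bytes */

                  printf("SHA3-%u: rsiz=%u bytes, rsizw=%u words\n",
                         digest_size[i] * 8, rsiz, rsiz / 8);
          }
          return 0;  /* prints 144/18, 136/17, 104/13 and 72/9 */
  }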
From 05c0463c8af9154c19d7824aed5fe1b540b7c396 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel
Date: Fri, 19 Jan 2018 12:04:36 +0000
Subject: crypto: sha3-generic - export init/update/final routines

To allow accelerated implementations to fall back to the generic
routines, e.g., in contexts where a SIMD based implementation is not
allowed to run, expose the generic SHA3 init/update/final routines to
other modules.

Signed-off-by: Ard Biesheuvel
Signed-off-by: Herbert Xu
---
 crypto/sha3_generic.c | 33 ++++++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 15 deletions(-)

(limited to 'crypto/sha3_generic.c')

diff --git a/crypto/sha3_generic.c b/crypto/sha3_generic.c
index c7084a24..a965b9d8 100644
--- a/crypto/sha3_generic.c
+++ b/crypto/sha3_generic.c
@@ -145,7 +145,7 @@ static void __attribute__((__optimize__("O3"))) keccakf(u64 st[25])
 	}
 }

-static int sha3_init(struct shash_desc *desc)
+int crypto_sha3_init(struct shash_desc *desc)
 {
 	struct sha3_state *sctx = shash_desc_ctx(desc);
 	unsigned int digest_size = crypto_shash_digestsize(desc->tfm);
@@ -157,8 +157,9 @@ static int sha3_init(struct shash_desc *desc)
 	memset(sctx->st, 0, sizeof(sctx->st));
 	return 0;
 }
+EXPORT_SYMBOL(crypto_sha3_init);

-static int sha3_update(struct shash_desc *desc, const u8 *data,
+int crypto_sha3_update(struct shash_desc *desc, const u8 *data,
 			unsigned int len)
 {
 	struct sha3_state *sctx = shash_desc_ctx(desc);
@@ -194,8 +195,9 @@ static int sha3_update(struct shash_desc *desc, const u8 *data,

 	return 0;
 }
+EXPORT_SYMBOL(crypto_sha3_update);

-static int sha3_final(struct shash_desc *desc, u8 *out)
+int crypto_sha3_final(struct shash_desc *desc, u8 *out)
 {
 	struct sha3_state *sctx = shash_desc_ctx(desc);
 	unsigned int i, inlen = sctx->partial;
@@ -220,12 +222,13 @@ static int sha3_final(struct shash_desc *desc, u8 *out)
 	memset(sctx, 0, sizeof(*sctx));
 	return 0;
 }
+EXPORT_SYMBOL(crypto_sha3_final);

 static struct shash_alg algs[] = { {
 	.digestsize = SHA3_224_DIGEST_SIZE,
-	.init = sha3_init,
-	.update = sha3_update,
-	.final = sha3_final,
+	.init = crypto_sha3_init,
+	.update = crypto_sha3_update,
+	.final = crypto_sha3_final,
 	.descsize = sizeof(struct sha3_state),
 	.base.cra_name = "sha3-224",
 	.base.cra_driver_name = "sha3-224-generic",
@@ -234,9 +237,9 @@ static struct shash_alg algs[] = { {
 	.base.cra_module = THIS_MODULE,
 }, {
 	.digestsize = SHA3_256_DIGEST_SIZE,
-	.init = sha3_init,
-	.update = sha3_update,
-	.final = sha3_final,
+	.init = crypto_sha3_init,
+	.update = crypto_sha3_update,
+	.final = crypto_sha3_final,
 	.descsize = sizeof(struct sha3_state),
 	.base.cra_name = "sha3-256",
 	.base.cra_driver_name = "sha3-256-generic",
@@ -245,9 +248,9 @@ static struct shash_alg algs[] = { {
 	.base.cra_module = THIS_MODULE,
 }, {
 	.digestsize = SHA3_384_DIGEST_SIZE,
-	.init = sha3_init,
-	.update = sha3_update,
-	.final = sha3_final,
+	.init = crypto_sha3_init,
+	.update = crypto_sha3_update,
+	.final = crypto_sha3_final,
 	.descsize = sizeof(struct sha3_state),
 	.base.cra_name = "sha3-384",
 	.base.cra_driver_name = "sha3-384-generic",
@@ -256,9 +259,9 @@ static struct shash_alg algs[] = { {
 	.base.cra_module = THIS_MODULE,
 }, {
 	.digestsize = SHA3_512_DIGEST_SIZE,
-	.init = sha3_init,
-	.update = sha3_update,
-	.final = sha3_final,
+	.init = crypto_sha3_init,
+	.update = crypto_sha3_update,
+	.final = crypto_sha3_final,
 	.descsize = sizeof(struct sha3_state),
 	.base.cra_name = "sha3-512",
 	.base.cra_driver_name = "sha3-512-generic",
-- cgit v1.2.3
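The exported crypto_sha3_init/update/final symbols exist so that an
accelerated driver can punt to the C code whenever its fast path is
unavailable. The sketch below is hypothetical: sha3_arch_update() and
sha3_arch_update_simd() are placeholder names for whatever an arch driver
would define, and the may_use_simd() gate is just one plausible condition;
only the crypto_sha3_update() call itself comes from this patch (assuming its
prototype is visible via the sha3 header):

  #include <crypto/internal/hash.h>
  #include <crypto/sha3.h>
  #include <asm/simd.h>

  /* Provided elsewhere by the hypothetical driver. */
  static int sha3_arch_update_simd(struct shash_desc *desc, const u8 *data,
                                   unsigned int len);

  /* Accelerated update with a fallback to the generic implementation. */
  static int sha3_arch_update(struct shash_desc *desc, const u8 *data,
                              unsigned int len)
  {
          if (!may_use_simd())
                  /* e.g. called from a context where SIMD must not run */
                  return crypto_sha3_update(desc, data, len);

          return sha3_arch_update_simd(desc, data, len);
  }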
From 4c4a644abbdc229f52c5decd436ee842444307c3 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel
Date: Sat, 27 Jan 2018 09:18:32 +0000
Subject: crypto: sha3-generic - deal with oversize stack frames

As reported by kbuild test robot, the optimized SHA3 C implementation
compiles to mn10300 code that uses a disproportionate amount of stack
space, i.e.,

  crypto/sha3_generic.c: In function 'keccakf':
  crypto/sha3_generic.c:147:1: warning: the frame size of 1232 bytes is
  larger than 1024 bytes [-Wframe-larger-than=]

As kindly diagnosed by Arnd, this does not only occur when building for
the mn10300 architecture (which is what the report was about) but also
for h8300, and builds for other 32-bit architectures show an increase in
stack space utilization as well.

Given that SHA3 operates on 64-bit quantities, and keeps a state matrix
of 25 64-bit words, it is not surprising that 32-bit architectures with
few general purpose registers are impacted the most by this, and it is
therefore reasonable to implement a workaround that distinguishes between
32-bit and 64-bit architectures.

Arnd figured out that taking the round calculation out of the loop, and
inlining it explicitly but only on 64-bit architectures preserves most
of the performance gain achieved by the rewrite, and also gets rid of
the excessive use of stack space.

Reported-by: kbuild test robot
Suggested-by: Arnd Bergmann
Signed-off-by: Ard Biesheuvel
Signed-off-by: Herbert Xu
---
 crypto/sha3_generic.c | 218 +++++++++++++++++++++++++++-----------------------
 1 file changed, 118 insertions(+), 100 deletions(-)

(limited to 'crypto/sha3_generic.c')

diff --git a/crypto/sha3_generic.c b/crypto/sha3_generic.c
index a965b9d8..951c4eb7 100644
--- a/crypto/sha3_generic.c
+++ b/crypto/sha3_generic.c
@@ -20,6 +20,20 @@
 #include
 #include

+/*
+ * On some 32-bit architectures (mn10300 and h8300), GCC ends up using
+ * over 1 KB of stack if we inline the round calculation into the loop
+ * in keccakf(). On the other hand, on 64-bit architectures with plenty
+ * of [64-bit wide] general purpose registers, not inlining it severely
+ * hurts performance. So let's use 64-bitness as a heuristic to decide
+ * whether to inline or not.
+ */
+#ifdef CONFIG_64BIT
+#define SHA3_INLINE inline
+#else
+#define SHA3_INLINE noinline
+#endif
+
 #define KECCAK_ROUNDS 24

 static const u64 keccakf_rndc[24] = {
@@ -35,111 +49,115 @@ static const u64 keccakf_rndc[24] = {

 /* update the state with given number of rounds */
-static void __attribute__((__optimize__("O3"))) keccakf(u64 st[25])
+static SHA3_INLINE void keccakf_round(u64 st[25])
 {
 	u64 t[5], tt, bc[5];
-	int round;

-	for (round = 0; round < KECCAK_ROUNDS; round++) {
-
+	/* Theta */
+	bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20];
+	bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21];
+	bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22];
+	bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23];
+	bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24];
+
+	t[0] = bc[4] ^ rol64(bc[1], 1);
+	t[1] = bc[0] ^ rol64(bc[2], 1);
+	t[2] = bc[1] ^ rol64(bc[3], 1);
+	t[3] = bc[2] ^ rol64(bc[4], 1);
+	t[4] = bc[3] ^ rol64(bc[0], 1);
+
+	st[0] ^= t[0];
+
+	/* Rho Pi */
+	tt = st[1];
+	st[ 1] = rol64(st[ 6] ^ t[1], 44);
+	st[ 6] = rol64(st[ 9] ^ t[4], 20);
+	st[ 9] = rol64(st[22] ^ t[2], 61);
+	st[22] = rol64(st[14] ^ t[4], 39);
+	st[14] = rol64(st[20] ^ t[0], 18);
+	st[20] = rol64(st[ 2] ^ t[2], 62);
+	st[ 2] = rol64(st[12] ^ t[2], 43);
+	st[12] = rol64(st[13] ^ t[3], 25);
+	st[13] = rol64(st[19] ^ t[4], 8);
+	st[19] = rol64(st[23] ^ t[3], 56);
+	st[23] = rol64(st[15] ^ t[0], 41);
+	st[15] = rol64(st[ 4] ^ t[4], 27);
+	st[ 4] = rol64(st[24] ^ t[4], 14);
+	st[24] = rol64(st[21] ^ t[1], 2);
+	st[21] = rol64(st[ 8] ^ t[3], 55);
+	st[ 8] = rol64(st[16] ^ t[1], 45);
+	st[16] = rol64(st[ 5] ^ t[0], 36);
+	st[ 5] = rol64(st[ 3] ^ t[3], 28);
+	st[ 3] = rol64(st[18] ^ t[3], 21);
+	st[18] = rol64(st[17] ^ t[2], 15);
+	st[17] = rol64(st[11] ^ t[1], 10);
+	st[11] = rol64(st[ 7] ^ t[2], 6);
+	st[ 7] = rol64(st[10] ^ t[0], 3);
+	st[10] = rol64(    tt ^ t[1], 1);
+
+	/* Chi */
+	bc[ 0] = ~st[ 1] & st[ 2];
+	bc[ 1] = ~st[ 2] & st[ 3];
+	bc[ 2] = ~st[ 3] & st[ 4];
+	bc[ 3] = ~st[ 4] & st[ 0];
+	bc[ 4] = ~st[ 0] & st[ 1];
+	st[ 0] ^= bc[ 0];
+	st[ 1] ^= bc[ 1];
+	st[ 2] ^= bc[ 2];
+	st[ 3] ^= bc[ 3];
+	st[ 4] ^= bc[ 4];
+
+	bc[ 0] = ~st[ 6] & st[ 7];
+	bc[ 1] = ~st[ 7] & st[ 8];
+	bc[ 2] = ~st[ 8] & st[ 9];
+	bc[ 3] = ~st[ 9] & st[ 5];
+	bc[ 4] = ~st[ 5] & st[ 6];
+	st[ 5] ^= bc[ 0];
+	st[ 6] ^= bc[ 1];
+	st[ 7] ^= bc[ 2];
+	st[ 8] ^= bc[ 3];
+	st[ 9] ^= bc[ 4];
+
+	bc[ 0] = ~st[11] & st[12];
+	bc[ 1] = ~st[12] & st[13];
+	bc[ 2] = ~st[13] & st[14];
+	bc[ 3] = ~st[14] & st[10];
+	bc[ 4] = ~st[10] & st[11];
+	st[10] ^= bc[ 0];
+	st[11] ^= bc[ 1];
+	st[12] ^= bc[ 2];
+	st[13] ^= bc[ 3];
+	st[14] ^= bc[ 4];
+
+	bc[ 0] = ~st[16] & st[17];
+	bc[ 1] = ~st[17] & st[18];
+	bc[ 2] = ~st[18] & st[19];
+	bc[ 3] = ~st[19] & st[15];
+	bc[ 4] = ~st[15] & st[16];
+	st[15] ^= bc[ 0];
+	st[16] ^= bc[ 1];
+	st[17] ^= bc[ 2];
+	st[18] ^= bc[ 3];
+	st[19] ^= bc[ 4];
+
+	bc[ 0] = ~st[21] & st[22];
+	bc[ 1] = ~st[22] & st[23];
+	bc[ 2] = ~st[23] & st[24];
+	bc[ 3] = ~st[24] & st[20];
+	bc[ 4] = ~st[20] & st[21];
+	st[20] ^= bc[ 0];
+	st[21] ^= bc[ 1];
+	st[22] ^= bc[ 2];
+	st[23] ^= bc[ 3];
+	st[24] ^= bc[ 4];
+}

-		/* Theta */
-		bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20];
-		bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21];
-		bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22];
-		bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23];
-		bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24];
-
-		t[0] = bc[4] ^ rol64(bc[1], 1);
-		t[1] = bc[0] ^ rol64(bc[2], 1);
-		t[2] = bc[1] ^ rol64(bc[3], 1);
-		t[3] = bc[2] ^ rol64(bc[4], 1);
-		t[4] = bc[3] ^ rol64(bc[0], 1);
-
-		st[0] ^= t[0];
-
-		/* Rho Pi */
-		tt = st[1];
-		st[ 1] = rol64(st[ 6] ^ t[1], 44);
-		st[ 6] = rol64(st[ 9] ^ t[4], 20);
-		st[ 9] = rol64(st[22] ^ t[2], 61);
-		st[22] = rol64(st[14] ^ t[4], 39);
-		st[14] = rol64(st[20] ^ t[0], 18);
-		st[20] = rol64(st[ 2] ^ t[2], 62);
-		st[ 2] = rol64(st[12] ^ t[2], 43);
-		st[12] = rol64(st[13] ^ t[3], 25);
-		st[13] = rol64(st[19] ^ t[4], 8);
-		st[19] = rol64(st[23] ^ t[3], 56);
-		st[23] = rol64(st[15] ^ t[0], 41);
-		st[15] = rol64(st[ 4] ^ t[4], 27);
-		st[ 4] = rol64(st[24] ^ t[4], 14);
-		st[24] = rol64(st[21] ^ t[1], 2);
-		st[21] = rol64(st[ 8] ^ t[3], 55);
-		st[ 8] = rol64(st[16] ^ t[1], 45);
-		st[16] = rol64(st[ 5] ^ t[0], 36);
-		st[ 5] = rol64(st[ 3] ^ t[3], 28);
-		st[ 3] = rol64(st[18] ^ t[3], 21);
-		st[18] = rol64(st[17] ^ t[2], 15);
-		st[17] = rol64(st[11] ^ t[1], 10);
-		st[11] = rol64(st[ 7] ^ t[2], 6);
-		st[ 7] = rol64(st[10] ^ t[0], 3);
-		st[10] = rol64(    tt ^ t[1], 1);
-
-		/* Chi */
-		bc[ 0] = ~st[ 1] & st[ 2];
-		bc[ 1] = ~st[ 2] & st[ 3];
-		bc[ 2] = ~st[ 3] & st[ 4];
-		bc[ 3] = ~st[ 4] & st[ 0];
-		bc[ 4] = ~st[ 0] & st[ 1];
-		st[ 0] ^= bc[ 0];
-		st[ 1] ^= bc[ 1];
-		st[ 2] ^= bc[ 2];
-		st[ 3] ^= bc[ 3];
-		st[ 4] ^= bc[ 4];
-
-		bc[ 0] = ~st[ 6] & st[ 7];
-		bc[ 1] = ~st[ 7] & st[ 8];
-		bc[ 2] = ~st[ 8] & st[ 9];
-		bc[ 3] = ~st[ 9] & st[ 5];
-		bc[ 4] = ~st[ 5] & st[ 6];
-		st[ 5] ^= bc[ 0];
-		st[ 6] ^= bc[ 1];
-		st[ 7] ^= bc[ 2];
-		st[ 8] ^= bc[ 3];
-		st[ 9] ^= bc[ 4];
-
-		bc[ 0] = ~st[11] & st[12];
-		bc[ 1] = ~st[12] & st[13];
-		bc[ 2] = ~st[13] & st[14];
-		bc[ 3] = ~st[14] & st[10];
-		bc[ 4] = ~st[10] & st[11];
-		st[10] ^= bc[ 0];
-		st[11] ^= bc[ 1];
-		st[12] ^= bc[ 2];
-		st[13] ^= bc[ 3];
-		st[14] ^= bc[ 4];
-
-		bc[ 0] = ~st[16] & st[17];
-		bc[ 1] = ~st[17] & st[18];
-		bc[ 2] = ~st[18] & st[19];
-		bc[ 3] = ~st[19] & st[15];
-		bc[ 4] = ~st[15] & st[16];
-		st[15] ^= bc[ 0];
-		st[16] ^= bc[ 1];
-		st[17] ^= bc[ 2];
-		st[18] ^= bc[ 3];
-		st[19] ^= bc[ 4];
-
-		bc[ 0] = ~st[21] & st[22];
-		bc[ 1] = ~st[22] & st[23];
-		bc[ 2] = ~st[23] & st[24];
-		bc[ 3] = ~st[24] & st[20];
-		bc[ 4] = ~st[20] & st[21];
-		st[20] ^= bc[ 0];
-		st[21] ^= bc[ 1];
-		st[22] ^= bc[ 2];
-		st[23] ^= bc[ 3];
-		st[24] ^= bc[ 4];
+static void __attribute__((__optimize__("O3"))) keccakf(u64 st[25])
+{
+	int round;
+
+	for (round = 0; round < KECCAK_ROUNDS; round++) {
+		keccakf_round(st);

 		/* Iota */
 		st[0] ^= keccakf_rndc[round];
 	}
-- cgit v1.2.3
From a8b8d012167230d3ef15a854b8440536bc6b8d94 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven
Date: Thu, 1 Feb 2018 11:22:00 +0100
Subject: crypto: sha3-generic - Use __optimize to support old compilers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With gcc-4.1.2:

  crypto/sha3_generic.c:39: warning: ‘__optimize__’ attribute directive ignored

Use the newly introduced __optimize macro to fix this.

Fixes: 116121f60112aefd ("crypto: sha3-generic - rewrite KECCAK transform to help the compiler optimize")
Signed-off-by: Geert Uytterhoeven
Acked-by: Ard Biesheuvel
Signed-off-by: Herbert Xu
---
 crypto/sha3_generic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'crypto/sha3_generic.c')

diff --git a/crypto/sha3_generic.c b/crypto/sha3_generic.c
index 951c4eb7..ded14878 100644
--- a/crypto/sha3_generic.c
+++ b/crypto/sha3_generic.c
@@ -152,7 +152,7 @@ static SHA3_INLINE void keccakf_round(u64 st[25])
 	st[24] ^= bc[ 4];
 }

-static void __attribute__((__optimize__("O3"))) keccakf(u64 st[25])
+static void __optimize("O3") keccakf(u64 st[25])
 {
 	int round;
-- cgit v1.2.3
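The last two patches rest on a register-pressure argument: the KECCAK state
alone is 25 * 8 = 200 bytes of live 64-bit data, plus the t[5], bc[5] and tt
temporaries, so a 32-bit target needs a register pair per lane and spills most
of it (hence the 1232-byte frame in the kbuild report), while a 64-bit target
can keep much of it in registers, which is why CONFIG_64BIT picks between
inline and noinline. The __optimize() macro that finally replaces the raw
attribute behaves roughly as in the sketch below; this is a paraphrase, not
the kernel's exact definition, the point being that compilers without the
attribute get an empty expansion instead of a warning:

  /* Rough shape of __optimize() (assumed guard; see the kernel's compiler headers). */
  #if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4))
  #define __optimize(level)  __attribute__((__optimize__(level)))
  #else
  #define __optimize(level)  /* attribute unsupported: expand to nothing */
  #endif

  /* Usage, as in the final hunk above:
   *     static void __optimize("O3") keccakf(u64 st[25]) { ... }
   */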