
[v3,1/4] crypto: aria: add keystream array into struct aria_ctx

Message ID 20221106143627.30920-2-ap420073@gmail.com
State Superseded
Series crypto: aria: implement aria-avx2 and aria-avx512

Commit Message

Taehee Yoo Nov. 6, 2022, 2:36 p.m. UTC
The AVX accelerated ARIA module used a local keystream array.
But the keystream array is too big for the stack.
So, put the keystream array into struct aria_ctx.

Signed-off-by: Taehee Yoo <ap420073@gmail.com>
---

v3:
 - No changes.

v2:
 - Patch introduced.

 arch/x86/crypto/aria-avx.h            |  3 ---
 arch/x86/crypto/aria_aesni_avx_glue.c | 24 +++++++++++-------------
 include/crypto/aria.h                 | 11 +++++++++++
 3 files changed, 22 insertions(+), 16 deletions(-)

Comments

Taehee Yoo Nov. 7, 2022, 11:39 a.m. UTC | #1
Hi Herbert,
Thank you so much for your review!

On 11/7/22 17:48, Herbert Xu wrote:
 > On Sun, Nov 06, 2022 at 02:36:24PM +0000, Taehee Yoo wrote:
 >>
 >>   struct aria_ctx {
 >>   	u32 enc_key[ARIA_MAX_RD_KEYS][ARIA_RD_KEY_WORDS];
 >>   	u32 dec_key[ARIA_MAX_RD_KEYS][ARIA_RD_KEY_WORDS];
 >>   	int rounds;
 >>   	int key_length;
 >> +#if defined(CONFIG_CRYPTO_ARIA_AESNI_AVX_X86_64) ||	\
 >> +	defined(CONFIG_CRYPTO_ARIA_AESNI_AVX_X86_64_MODULE)
 >> +	u8 keystream[ARIA_KEYSTREAM_SIZE];
 >> +#endif
 >>   };
 >
 > The tfm ctx is shared between all users of the tfm.  You need
 > something that is private to the request so this needs to be
 > moved into the reqctx.
 >
 > Cheers,

Sorry, I think I hadn't understood the point of your previous review correctly.
I will move the keystream into the reqctx instead of the tfm ctx.
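
A hypothetical sketch of that direction (illustrative only, not the
actual v4 code; the function names are made up):

static int aria_avx_init_tfm(struct crypto_skcipher *tfm)
{
	/* Reserve keystream space in every request's private area. */
	crypto_skcipher_set_reqsize(tfm, ARIA_AESNI_PARALLEL_BLOCK_SIZE);
	return 0;
}

static int aria_avx_ctr_encrypt(struct skcipher_request *req)
{
	/* Private to this request, unlike the shared tfm ctx. */
	u8 *keystream = skcipher_request_ctx(req);

	/* ... generate and xor the keystream as before ... */
	return 0;
}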

Thanks a lot!
Taehee Yoo
Taehee Yoo Nov. 9, 2022, 1:16 p.m. UTC | #2
Hi Herbert,

On 11/7/22 17:48, Herbert Xu wrote:
 > On Sun, Nov 06, 2022 at 02:36:24PM +0000, Taehee Yoo wrote:
 >>
 >>   struct aria_ctx {
 >>   	u32 enc_key[ARIA_MAX_RD_KEYS][ARIA_RD_KEY_WORDS];
 >>   	u32 dec_key[ARIA_MAX_RD_KEYS][ARIA_RD_KEY_WORDS];
 >>   	int rounds;
 >>   	int key_length;
 >> +#if defined(CONFIG_CRYPTO_ARIA_AESNI_AVX_X86_64) ||	\
 >> +	defined(CONFIG_CRYPTO_ARIA_AESNI_AVX_X86_64_MODULE)
 >> +	u8 keystream[ARIA_KEYSTREAM_SIZE];
 >> +#endif
 >>   };
 >
 > The tfm ctx is shared between all users of the tfm.  You need
 > something that is private to the request so this needs to be
 > moved into the reqctx.
 >
 > Cheers,

I have encountered a kernel panic (stack-out-of-bounds) while using the
reqctx instead of the tfm ctx.

cryptd is used when SIMD drivers are used.
cryptd_skcipher_encrypt() doesn't allocate a request ctx for the child
internally; instead, it uses stack memory via SYNC_SKCIPHER_REQUEST_ON_STACK.
That reserves only 384 bytes for the child request ctx, even if the child
sets a larger reqsize with crypto_skcipher_set_reqsize().
aria-avx2 needs 512 bytes and aria-avx512 needs 1024 bytes,
so a stack-out-of-bounds occurs.
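
For reference, the on-stack helper reserves a fixed budget for the child
request ctx. Paraphrasing include/crypto/skcipher.h (around this kernel
version; details may differ):

#define MAX_SYNC_SKCIPHER_REQSIZE      384

#define SYNC_SKCIPHER_REQUEST_ON_STACK(name, tfm) \
	char __##name##_desc[sizeof(struct skcipher_request) + \
			     MAX_SYNC_SKCIPHER_REQSIZE \
			    ] CRYPTO_MINALIGN_ATTR; \
	struct skcipher_request *name = (void *)__##name##_desc

aria-avx itself fits: 16 parallel blocks * 16-byte ARIA blocks = 256
bytes of keystream. aria-avx2 (32 blocks) and aria-avx512 (64 blocks)
exceed the 384-byte budget.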

What do you think about using a per-CPU buffer for the keystream array?
SIMD functions are called under the FPU context, so the per-CPU
keystream will not be used concurrently; it's safe.
It also avoids unnecessary allocation of large temporary memory.
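
A minimal sketch of the idea (hypothetical code, not a posted patch;
aria_avx_ctr_crypt_chunk is a made-up helper name):

/*
 * kernel_fpu_begin() disables preemption, so the per-CPU pointer is
 * taken after it; the buffer cannot be touched by another task on
 * this CPU while we use it.
 */
struct aria_keystream {
	u8 bytes[ARIA_AESNI_PARALLEL_BLOCK_SIZE];
};

static DEFINE_PER_CPU(struct aria_keystream, aria_keystream);

static void aria_avx_ctr_crypt_chunk(struct aria_ctx *ctx, u8 *dst,
				     const u8 *src, u8 *iv)
{
	u8 *keystream;

	kernel_fpu_begin();
	keystream = this_cpu_ptr(&aria_keystream)->bytes;
	aria_ops.aria_ctr_crypt_16way(ctx, dst, src, keystream, iv);
	kernel_fpu_end();
}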

I tested the per-CPU keystream, and it works well.
If you are okay with it, I would like to submit a v4 patch using per-CPU
memory.

Thanks,
Taehee Yoo
Herbert Xu Nov. 10, 2022, 3:19 a.m. UTC | #3
On Wed, Nov 09, 2022 at 10:16:58PM +0900, Taehee Yoo wrote:
> I have encountered a kernel panic (stack-out-of-bounds) while using the
> reqctx instead of the tfm ctx.
> 
> cryptd is used when SIMD drivers are used.
> cryptd_skcipher_encrypt() doesn't allocate a request ctx for the child
> internally; instead, it uses stack memory via SYNC_SKCIPHER_REQUEST_ON_STACK.
> That reserves only 384 bytes for the child request ctx, even if the child
> sets a larger reqsize with crypto_skcipher_set_reqsize().
> aria-avx2 needs 512 bytes and aria-avx512 needs 1024 bytes,
> so a stack-out-of-bounds occurs.

That's not good.  Let me look into this.

Thanks,
Taehee Yoo Nov. 11, 2022, 3:43 p.m. UTC | #4
Hi Herbert,
Thank you so much for this work!

On 11/11/22 18:59, Herbert Xu wrote:
 > On Wed, Nov 09, 2022 at 10:16:58PM +0900, Taehee Yoo wrote:
 >>
 >> I have encountered a kernel panic (stack-out-of-bounds) while using
 >> the reqctx instead of the tfm ctx.
 >>
 >> cryptd is used when SIMD drivers are used.
 >> cryptd_skcipher_encrypt() doesn't allocate a request ctx for the
 >> child internally; instead, it uses stack memory via
 >> SYNC_SKCIPHER_REQUEST_ON_STACK.
 >> That reserves only 384 bytes for the child request ctx, even if the
 >> child sets a larger reqsize with crypto_skcipher_set_reqsize().
 >> aria-avx2 needs 512 bytes and aria-avx512 needs 1024 bytes,
 >> so a stack-out-of-bounds occurs.
 >
 > OK this is not supposed to happen.
 >
 > ---8<---
 > cryptd is buggy as it tries to use sync_skcipher without going
 > through the proper sync_skcipher interface.  In fact it doesn't
 > even need sync_skcipher since it's already a proper skcipher and
 > can easily access the request context instead of using something
 > off the stack.
 >

I have tested this patch with ctr(aria-avx), ctr(aria-avx2), and
ctr(aria-avx512), and it works well.
The stack-out-of-bounds issue has disappeared with this patch.
So, I will test more and then send a v4 patch.

 > Fixes: 36b3875a97b8 ("crypto: cryptd - Remove VLA usage of skcipher")
 > Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
 >
 > diff --git a/crypto/cryptd.c b/crypto/cryptd.c
 > index 668095eca0fa..ca3a40fc7da9 100644
 > --- a/crypto/cryptd.c
 > +++ b/crypto/cryptd.c
 > @@ -68,11 +68,12 @@ struct aead_instance_ctx {
 >
 >   struct cryptd_skcipher_ctx {
 >   	refcount_t refcnt;
 > -	struct crypto_sync_skcipher *child;
 > +	struct crypto_skcipher *child;
 >   };
 >
 >   struct cryptd_skcipher_request_ctx {
 >   	crypto_completion_t complete;
 > +	struct skcipher_request req;
 >   };
 >
 >   struct cryptd_hash_ctx {
 > @@ -227,13 +228,13 @@ static int cryptd_skcipher_setkey(struct crypto_skcipher *parent,
 >   				  const u8 *key, unsigned int keylen)
 >   {
 >   	struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(parent);
 > -	struct crypto_sync_skcipher *child = ctx->child;
 > +	struct crypto_skcipher *child = ctx->child;
 >
 > -	crypto_sync_skcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
 > -	crypto_sync_skcipher_set_flags(child,
 > -				       crypto_skcipher_get_flags(parent) &
 > -					 CRYPTO_TFM_REQ_MASK);
 > -	return crypto_sync_skcipher_setkey(child, key, keylen);
 > +	crypto_skcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
 > +	crypto_skcipher_set_flags(child,
 > +				  crypto_skcipher_get_flags(parent) &
 > +				  CRYPTO_TFM_REQ_MASK);
 > +	return crypto_skcipher_setkey(child, key, keylen);
 >   }
 >
 >   static void cryptd_skcipher_complete(struct skcipher_request *req, int err)
 > @@ -258,13 +259,13 @@ static void cryptd_skcipher_encrypt(struct crypto_async_request *base,
 >   	struct cryptd_skcipher_request_ctx *rctx = skcipher_request_ctx(req);
 >   	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
 >   	struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm);
 > -	struct crypto_sync_skcipher *child = ctx->child;
 > -	SYNC_SKCIPHER_REQUEST_ON_STACK(subreq, child);
 > +	struct skcipher_request *subreq = &rctx->req;
 > +	struct crypto_skcipher *child = ctx->child;
 >
 >   	if (unlikely(err == -EINPROGRESS))
 >   		goto out;
 >
 > -	skcipher_request_set_sync_tfm(subreq, child);
 > +	skcipher_request_set_tfm(subreq, child);
 >   	skcipher_request_set_callback(subreq, CRYPTO_TFM_REQ_MAY_SLEEP,
 >   				      NULL, NULL);
 >   	skcipher_request_set_crypt(subreq, req->src, req->dst, req->cryptlen,
 > @@ -286,13 +287,13 @@ static void cryptd_skcipher_decrypt(struct crypto_async_request *base,
 >   	struct cryptd_skcipher_request_ctx *rctx = skcipher_request_ctx(req);
 >   	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
 >   	struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm);
 > -	struct crypto_sync_skcipher *child = ctx->child;
 > -	SYNC_SKCIPHER_REQUEST_ON_STACK(subreq, child);
 > +	struct skcipher_request *subreq = &rctx->req;
 > +	struct crypto_skcipher *child = ctx->child;
 >
 >   	if (unlikely(err == -EINPROGRESS))
 >   		goto out;
 >
 > -	skcipher_request_set_sync_tfm(subreq, child);
 > +	skcipher_request_set_tfm(subreq, child);
 >   	skcipher_request_set_callback(subreq, CRYPTO_TFM_REQ_MAY_SLEEP,
 >   				      NULL, NULL);
 >   	skcipher_request_set_crypt(subreq, req->src, req->dst, req->cryptlen,
 > @@ -343,9 +344,10 @@ static int cryptd_skcipher_init_tfm(struct crypto_skcipher *tfm)
 >   	if (IS_ERR(cipher))
 >   		return PTR_ERR(cipher);
 >
 > -	ctx->child = (struct crypto_sync_skcipher *)cipher;
 > +	ctx->child = cipher;
 >   	crypto_skcipher_set_reqsize(
 > -		tfm, sizeof(struct cryptd_skcipher_request_ctx));
 > +		tfm, sizeof(struct cryptd_skcipher_request_ctx) +
 > +		     crypto_skcipher_reqsize(cipher));
 >   	return 0;
 >   }
 >
 > @@ -353,7 +355,7 @@ static void cryptd_skcipher_exit_tfm(struct crypto_skcipher *tfm)
 >   {
 >   	struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm);
 >
 > -	crypto_free_sync_skcipher(ctx->child);
 > +	crypto_free_skcipher(ctx->child);
 >   }
 >
 >   static void cryptd_skcipher_free(struct skcipher_instance *inst)
 > @@ -931,7 +933,7 @@ struct crypto_skcipher *cryptd_skcipher_child(struct cryptd_skcipher *tfm)
 >   {
 >   	struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(&tfm->base);
 >
 > -	return &ctx->child->base;
 > +	return ctx->child;
 >   }
 >   EXPORT_SYMBOL_GPL(cryptd_skcipher_child);
 >

Thanks a lot,
Taehee Yoo

Patch

diff --git a/arch/x86/crypto/aria-avx.h b/arch/x86/crypto/aria-avx.h
index 01e9a01dc157..afd37af95e58 100644
--- a/arch/x86/crypto/aria-avx.h
+++ b/arch/x86/crypto/aria-avx.h
@@ -4,9 +4,6 @@ 
 
 #include <linux/types.h>
 
-#define ARIA_AESNI_PARALLEL_BLOCKS 16
-#define ARIA_AESNI_PARALLEL_BLOCK_SIZE  (ARIA_BLOCK_SIZE * 16)
-
 struct aria_avx_ops {
 	void (*aria_encrypt_16way)(const void *ctx, u8 *dst, const u8 *src);
 	void (*aria_decrypt_16way)(const void *ctx, u8 *dst, const u8 *src);
diff --git a/arch/x86/crypto/aria_aesni_avx_glue.c b/arch/x86/crypto/aria_aesni_avx_glue.c
index c561ea4fefa5..b122482d0c9d 100644
--- a/arch/x86/crypto/aria_aesni_avx_glue.c
+++ b/arch/x86/crypto/aria_aesni_avx_glue.c
@@ -86,10 +86,9 @@  static int aria_avx_ctr_encrypt(struct skcipher_request *req)
 		u8 *dst = walk.dst.virt.addr;
 
 		while (nbytes >= ARIA_AESNI_PARALLEL_BLOCK_SIZE) {
-			u8 keystream[ARIA_AESNI_PARALLEL_BLOCK_SIZE];
-
 			kernel_fpu_begin();
-			aria_ops.aria_ctr_crypt_16way(ctx, dst, src, keystream,
+			aria_ops.aria_ctr_crypt_16way(ctx, dst, src,
+						      &ctx->keystream[0],
 						      walk.iv);
 			kernel_fpu_end();
 			dst += ARIA_AESNI_PARALLEL_BLOCK_SIZE;
@@ -98,28 +97,27 @@  static int aria_avx_ctr_encrypt(struct skcipher_request *req)
 		}
 
 		while (nbytes >= ARIA_BLOCK_SIZE) {
-			u8 keystream[ARIA_BLOCK_SIZE];
-
-			memcpy(keystream, walk.iv, ARIA_BLOCK_SIZE);
+			memcpy(&ctx->keystream[0], walk.iv, ARIA_BLOCK_SIZE);
 			crypto_inc(walk.iv, ARIA_BLOCK_SIZE);
 
-			aria_encrypt(ctx, keystream, keystream);
+			aria_encrypt(ctx, &ctx->keystream[0],
+				     &ctx->keystream[0]);
 
-			crypto_xor_cpy(dst, src, keystream, ARIA_BLOCK_SIZE);
+			crypto_xor_cpy(dst, src, &ctx->keystream[0],
+				       ARIA_BLOCK_SIZE);
 			dst += ARIA_BLOCK_SIZE;
 			src += ARIA_BLOCK_SIZE;
 			nbytes -= ARIA_BLOCK_SIZE;
 		}
 
 		if (walk.nbytes == walk.total && nbytes > 0) {
-			u8 keystream[ARIA_BLOCK_SIZE];
-
-			memcpy(keystream, walk.iv, ARIA_BLOCK_SIZE);
+			memcpy(&ctx->keystream[0], walk.iv, ARIA_BLOCK_SIZE);
 			crypto_inc(walk.iv, ARIA_BLOCK_SIZE);
 
-			aria_encrypt(ctx, keystream, keystream);
+			aria_encrypt(ctx, &ctx->keystream[0],
+				     &ctx->keystream[0]);
 
-			crypto_xor_cpy(dst, src, keystream, nbytes);
+			crypto_xor_cpy(dst, src, &ctx->keystream[0], nbytes);
 			dst += nbytes;
 			src += nbytes;
 			nbytes = 0;
diff --git a/include/crypto/aria.h b/include/crypto/aria.h
index 254da46cc385..f5c7a87378cd 100644
--- a/include/crypto/aria.h
+++ b/include/crypto/aria.h
@@ -31,11 +31,22 @@ 
 #define ARIA_MAX_RD_KEYS	17
 #define ARIA_RD_KEY_WORDS	(ARIA_BLOCK_SIZE / sizeof(u32))
 
+#define ARIA_AESNI_PARALLEL_BLOCKS 16
+#define ARIA_AESNI_PARALLEL_BLOCK_SIZE  (ARIA_BLOCK_SIZE * 16)
+#if defined(CONFIG_CRYPTO_ARIA_AESNI_AVX_X86_64) ||	\
+	defined(CONFIG_CRYPTO_ARIA_AESNI_AVX_X86_64_MODULE)
+#define ARIA_KEYSTREAM_SIZE ARIA_AESNI_PARALLEL_BLOCK_SIZE
+#endif
+
 struct aria_ctx {
 	u32 enc_key[ARIA_MAX_RD_KEYS][ARIA_RD_KEY_WORDS];
 	u32 dec_key[ARIA_MAX_RD_KEYS][ARIA_RD_KEY_WORDS];
 	int rounds;
 	int key_length;
+#if defined(CONFIG_CRYPTO_ARIA_AESNI_AVX_X86_64) ||	\
+	defined(CONFIG_CRYPTO_ARIA_AESNI_AVX_X86_64_MODULE)
+	u8 keystream[ARIA_KEYSTREAM_SIZE];
+#endif
 };
 
 static const u32 s1[256] = {