[v3,02/29] crypto: x86/chacha - depend on generic chacha library instead of crypto driver

Message ID 20191007164610.6881-3-ard.biesheuvel@linaro.org
State New
Series crypto: crypto API library interfaces for WireGuard

Commit Message

Ard Biesheuvel Oct. 7, 2019, 4:45 p.m. UTC
In preparation of extending the x86 ChaCha driver to also expose the ChaCha
library interface, drop the dependency on the chacha_generic crypto driver
as a non-SIMD fallback, and depend on the generic ChaCha library directly.
This way, we only pull in the code we actually need, without registering
a set of ChaCha skciphers that we will never use.

Since turning the FPU on and off is cheap these days, simplify the SIMD
routine by dropping the per-page yield, which makes for a cleaner switch
to the library API as well.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

---
 arch/x86/crypto/chacha_glue.c | 77 ++++++++++----------
 crypto/Kconfig                |  2 +-
 2 files changed, 40 insertions(+), 39 deletions(-)

-- 
2.20.1

Comments

Eric Biggers Oct. 11, 2019, 6 a.m. UTC | #1
On Mon, Oct 07, 2019 at 06:45:43PM +0200, Ard Biesheuvel wrote:
> In preparation of extending the x86 ChaCha driver to also expose the ChaCha
> library interface, drop the dependency on the chacha_generic crypto driver
> as a non-SIMD fallback, and depend on the generic ChaCha library directly.
> This way, we only pull in the code we actually need, without registering
> a set of ChaCha skciphers that we will never use.
> 
> Since turning the FPU on and off is cheap these days, simplify the SIMD
> routine by dropping the per-page yield, which makes for a cleaner switch
> to the library API as well.
> 
> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> ---
>  arch/x86/crypto/chacha_glue.c | 77 ++++++++++----------
>  crypto/Kconfig                |  2 +-
>  2 files changed, 40 insertions(+), 39 deletions(-)
> 
> diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c
> index bc62daa8dafd..3a1a11a4326d 100644
> --- a/arch/x86/crypto/chacha_glue.c
> +++ b/arch/x86/crypto/chacha_glue.c
> @@ -127,32 +127,32 @@ static int chacha_simd_stream_xor(struct skcipher_walk *walk,
>  				  const struct chacha_ctx *ctx, const u8 *iv)
>  {
>  	u32 *state, state_buf[16 + 2] __aligned(8);
> -	int next_yield = 4096; /* bytes until next FPU yield */
> +	bool do_simd;
>  	int err = 0;
>  
>  	BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
>  	state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
>  
> -	crypto_chacha_init(state, ctx, iv);
> +	chacha_init_generic(state, ctx->key, iv);
>  
> +	do_simd = (walk->total > CHACHA_BLOCK_SIZE) && crypto_simd_usable();
>  	while (walk->nbytes > 0) {
>  		unsigned int nbytes = walk->nbytes;
>  
> -		if (nbytes < walk->total) {
> +		if (nbytes < walk->total)
>  			nbytes = round_down(nbytes, walk->stride);
> -			next_yield -= nbytes;
> -		}
> -
> -		chacha_dosimd(state, walk->dst.virt.addr, walk->src.virt.addr,
> -			      nbytes, ctx->nrounds);
>  
> -		if (next_yield <= 0) {
> -			/* temporarily allow preemption */
> -			kernel_fpu_end();
> +		if (!do_simd) {
> +			chacha_crypt_generic(state, walk->dst.virt.addr,
> +					     walk->src.virt.addr, nbytes,
> +					     ctx->nrounds);
> +		} else {
>  			kernel_fpu_begin();
> -			next_yield = 4096;
> +			chacha_dosimd(state, walk->dst.virt.addr,
> +				      walk->src.virt.addr, nbytes,
> +				      ctx->nrounds);
> +			kernel_fpu_end();

Since the kernel_fpu_begin() and kernel_fpu_end() were moved here, it's now
possible to simplify the code by moving the call to skcipher_walk_virt() into
chacha_simd_stream_xor() rather than making the caller do it.

I.e., see what the code was like prior to the following commit:

	commit f9c9bdb5131eee60dc3b92e5126d4c0e291703e2
	Author: Eric Biggers <ebiggers@google.com>
	Date:   Sat Dec 15 12:40:17 2018 -0800

	    crypto: x86/chacha - avoid sleeping under kernel_fpu_begin()
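
I.e., roughly something like the following (untested sketch, only meant to
illustrate the restructuring based on the quoted code; chacha_simd() and
xchacha_simd() would then pass the request rather than a pre-initialized walk):

	static int chacha_simd_stream_xor(struct skcipher_request *req,
					  const struct chacha_ctx *ctx, const u8 *iv)
	{
		u32 *state, state_buf[16 + 2] __aligned(8);
		struct skcipher_walk walk;
		bool do_simd;
		int err;

		/* no FPU section is open here, so the walk may sleep */
		err = skcipher_walk_virt(&walk, req, false);

		BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
		state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
		chacha_init_generic(state, ctx->key, iv);

		do_simd = (walk.total > CHACHA_BLOCK_SIZE) && crypto_simd_usable();
		while (walk.nbytes > 0) {
			unsigned int nbytes = walk.nbytes;

			if (nbytes < walk.total)
				nbytes = round_down(nbytes, walk.stride);

			if (!do_simd) {
				chacha_crypt_generic(state, walk.dst.virt.addr,
						     walk.src.virt.addr, nbytes,
						     ctx->nrounds);
			} else {
				/* FPU section covers only this chunk */
				kernel_fpu_begin();
				chacha_dosimd(state, walk.dst.virt.addr,
					      walk.src.virt.addr, nbytes,
					      ctx->nrounds);
				kernel_fpu_end();
			}
			err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
		}

		return err;
	}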

>  		}
> -
>  		err = skcipher_walk_done(walk, walk->nbytes - nbytes);
>  	}
>  
> @@ -164,19 +164,9 @@ static int chacha_simd(struct skcipher_request *req)
>  	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
>  	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
>  	struct skcipher_walk walk;
> -	int err;
>  
> -	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable())
> -		return crypto_chacha_crypt(req);
> -
> -	err = skcipher_walk_virt(&walk, req, true);
> -	if (err)
> -		return err;
> -
> -	kernel_fpu_begin();
> -	err = chacha_simd_stream_xor(&walk, ctx, req->iv);
> -	kernel_fpu_end();
> -	return err;
> +	return skcipher_walk_virt(&walk, req, true) ?:
> +	       chacha_simd_stream_xor(&walk, ctx, req->iv);
>  }
>  
>  static int xchacha_simd(struct skcipher_request *req)
> @@ -189,31 +179,42 @@ static int xchacha_simd(struct skcipher_request *req)
>  	u8 real_iv[16];
>  	int err;
>  
> -	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable())
> -		return crypto_xchacha_crypt(req);
> -
>  	err = skcipher_walk_virt(&walk, req, true);
>  	if (err)
>  		return err;
>  
>  	BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
>  	state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
> -	crypto_chacha_init(state, ctx, req->iv);
> -
> -	kernel_fpu_begin();
> -
> -	hchacha_block_ssse3(state, subctx.key, ctx->nrounds);
> +	chacha_init_generic(state, ctx->key, req->iv);
> +
> +	if (req->cryptlen > CHACHA_BLOCK_SIZE && crypto_simd_usable()) {
> +		kernel_fpu_begin();
> +		hchacha_block_ssse3(state, subctx.key, ctx->nrounds);
> +		kernel_fpu_end();
> +	} else {
> +		hchacha_block_generic(state, subctx.key, ctx->nrounds);
> +	}
>  	subctx.nrounds = ctx->nrounds;
>  
>  	memcpy(&real_iv[0], req->iv + 24, 8);
>  	memcpy(&real_iv[8], req->iv + 16, 8);
>  	err = chacha_simd_stream_xor(&walk, &subctx, real_iv);
>  
> -	kernel_fpu_end();
> -
>  	return err;

Can use 'return chacha_simd_stream_xor(...)' here.
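
I.e. the tail of xchacha_simd() could end with (sketch only, based on the
quoted code):

	memcpy(&real_iv[0], req->iv + 24, 8);
	memcpy(&real_iv[8], req->iv + 16, 8);
	return chacha_simd_stream_xor(&walk, &subctx, real_iv);

which also lets the local 'err' variable go away.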

- Eric
Martin Willi Oct. 15, 2019, 10 a.m. UTC | #2
Hi Ard,

> Since turning the FPU on and off is cheap these days, simplify the
> SIMD routine by dropping the per-page yield, which makes for a
> cleaner switch to the library API as well.

In my measurements that lazy FPU restore works as intended, and I could
not identify any slowdown by this change. 

> +++ b/arch/x86/crypto/chacha_glue.c
> @@ -127,32 +127,32 @@ static int chacha_simd_stream_xor [...]
>  
> +	do_simd = (walk->total > CHACHA_BLOCK_SIZE) && crypto_simd_usable();

Given that most users (including chacha20poly1305) likely involve
multiple operations under the same (real) FPU save/restore cycle, those
length checks both in chacha and in poly1305 hardly make sense anymore.

Obviously under tcrypt we get better results when engaging SIMD for any
length, but also for real users this seems beneficial. But of course we
may defer that to a later optimization patch.
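
Concretely (untested, just to illustrate the idea), the assignment in
chacha_simd_stream_xor() would reduce to

	do_simd = crypto_simd_usable();

and the corresponding req->cryptlen check in xchacha_simd() would go away
as well.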

Thanks,
Martin
Ard Biesheuvel Oct. 15, 2019, 10:12 a.m. UTC | #3
On Tue, 15 Oct 2019 at 12:00, Martin Willi <martin@strongswan.org> wrote:
>
> Hi Ard,
>
> > Since turning the FPU on and off is cheap these days, simplify the
> > SIMD routine by dropping the per-page yield, which makes for a
> > cleaner switch to the library API as well.
>
> In my measurements that lazy FPU restore works as intended, and I could
> not identify any slowdown by this change.
>

Thanks for confirming.

> > +++ b/arch/x86/crypto/chacha_glue.c
> > @@ -127,32 +127,32 @@ static int chacha_simd_stream_xor [...]
> >
> > +     do_simd = (walk->total > CHACHA_BLOCK_SIZE) && crypto_simd_usable();
>
> Given that most users (including chacha20poly1305) likely involve
> multiple operations under the same (real) FPU save/restore cycle, those
> length checks both in chacha and in poly1305 hardly make sense anymore.
>
> Obviously under tcrypt we get better results when engaging SIMD for any
> length, but also for real users this seems beneficial. But of course we
> may defer that to a later optimization patch.
>

Given that the description already reasons about FPU save/restore
being cheap these days, I think it would be appropriate to just get
rid of it right away. Especially in the chacha20poly1305 case, where
the separate chacha invocation for the poly nonce is guaranteed to
fail this check, we basically end up going back and forth between the
scalar and the SIMD code, which seems rather suboptimal to me.
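
For example (illustration only, not the actual chacha20poly1305 code, and
the buffer names are made up): a library user that derives the Poly1305 key
from the first ChaCha block and then encrypts the payload ends up doing
something like

	chacha_crypt(state, poly_key, zeroes, POLY1305_KEY_SIZE, 20);	/* one block -> scalar */
	chacha_crypt(state, dst, src, len, 20);				/* bulk data -> SIMD */

so with the length check in place, every AEAD operation crosses the
scalar/SIMD boundary at least once.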

Patch

diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c
index bc62daa8dafd..3a1a11a4326d 100644
--- a/arch/x86/crypto/chacha_glue.c
+++ b/arch/x86/crypto/chacha_glue.c
@@ -127,32 +127,32 @@  static int chacha_simd_stream_xor(struct skcipher_walk *walk,
 				  const struct chacha_ctx *ctx, const u8 *iv)
 {
 	u32 *state, state_buf[16 + 2] __aligned(8);
-	int next_yield = 4096; /* bytes until next FPU yield */
+	bool do_simd;
 	int err = 0;
 
 	BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
 	state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
 
-	crypto_chacha_init(state, ctx, iv);
+	chacha_init_generic(state, ctx->key, iv);
 
+	do_simd = (walk->total > CHACHA_BLOCK_SIZE) && crypto_simd_usable();
 	while (walk->nbytes > 0) {
 		unsigned int nbytes = walk->nbytes;
 
-		if (nbytes < walk->total) {
+		if (nbytes < walk->total)
 			nbytes = round_down(nbytes, walk->stride);
-			next_yield -= nbytes;
-		}
-
-		chacha_dosimd(state, walk->dst.virt.addr, walk->src.virt.addr,
-			      nbytes, ctx->nrounds);
 
-		if (next_yield <= 0) {
-			/* temporarily allow preemption */
-			kernel_fpu_end();
+		if (!do_simd) {
+			chacha_crypt_generic(state, walk->dst.virt.addr,
+					     walk->src.virt.addr, nbytes,
+					     ctx->nrounds);
+		} else {
 			kernel_fpu_begin();
-			next_yield = 4096;
+			chacha_dosimd(state, walk->dst.virt.addr,
+				      walk->src.virt.addr, nbytes,
+				      ctx->nrounds);
+			kernel_fpu_end();
 		}
-
 		err = skcipher_walk_done(walk, walk->nbytes - nbytes);
 	}
 
@@ -164,19 +164,9 @@  static int chacha_simd(struct skcipher_request *req)
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
 	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
 	struct skcipher_walk walk;
-	int err;
 
-	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable())
-		return crypto_chacha_crypt(req);
-
-	err = skcipher_walk_virt(&walk, req, true);
-	if (err)
-		return err;
-
-	kernel_fpu_begin();
-	err = chacha_simd_stream_xor(&walk, ctx, req->iv);
-	kernel_fpu_end();
-	return err;
+	return skcipher_walk_virt(&walk, req, true) ?:
+	       chacha_simd_stream_xor(&walk, ctx, req->iv);
 }
 
 static int xchacha_simd(struct skcipher_request *req)
@@ -189,31 +179,42 @@  static int xchacha_simd(struct skcipher_request *req)
 	u8 real_iv[16];
 	int err;
 
-	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable())
-		return crypto_xchacha_crypt(req);
-
 	err = skcipher_walk_virt(&walk, req, true);
 	if (err)
 		return err;
 
 	BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
 	state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
-	crypto_chacha_init(state, ctx, req->iv);
-
-	kernel_fpu_begin();
-
-	hchacha_block_ssse3(state, subctx.key, ctx->nrounds);
+	chacha_init_generic(state, ctx->key, req->iv);
+
+	if (req->cryptlen > CHACHA_BLOCK_SIZE && crypto_simd_usable()) {
+		kernel_fpu_begin();
+		hchacha_block_ssse3(state, subctx.key, ctx->nrounds);
+		kernel_fpu_end();
+	} else {
+		hchacha_block_generic(state, subctx.key, ctx->nrounds);
+	}
 	subctx.nrounds = ctx->nrounds;
 
 	memcpy(&real_iv[0], req->iv + 24, 8);
 	memcpy(&real_iv[8], req->iv + 16, 8);
 	err = chacha_simd_stream_xor(&walk, &subctx, real_iv);
 
-	kernel_fpu_end();
-
 	return err;
 }
 
+static int chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
+		    unsigned int keysize)
+{
+	return chacha_setkey(tfm, key, keysize, 20);
+}
+
+static int chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key,
+		    unsigned int keysize)
+{
+	return chacha_setkey(tfm, key, keysize, 12);
+}
+
 static struct skcipher_alg algs[] = {
 	{
 		.base.cra_name		= "chacha20",
@@ -227,7 +228,7 @@  static struct skcipher_alg algs[] = {
 		.max_keysize		= CHACHA_KEY_SIZE,
 		.ivsize			= CHACHA_IV_SIZE,
 		.chunksize		= CHACHA_BLOCK_SIZE,
-		.setkey			= crypto_chacha20_setkey,
+		.setkey			= chacha20_setkey,
 		.encrypt		= chacha_simd,
 		.decrypt		= chacha_simd,
 	}, {
@@ -242,7 +243,7 @@  static struct skcipher_alg algs[] = {
 		.max_keysize		= CHACHA_KEY_SIZE,
 		.ivsize			= XCHACHA_IV_SIZE,
 		.chunksize		= CHACHA_BLOCK_SIZE,
-		.setkey			= crypto_chacha20_setkey,
+		.setkey			= chacha20_setkey,
 		.encrypt		= xchacha_simd,
 		.decrypt		= xchacha_simd,
 	}, {
@@ -257,7 +258,7 @@  static struct skcipher_alg algs[] = {
 		.max_keysize		= CHACHA_KEY_SIZE,
 		.ivsize			= XCHACHA_IV_SIZE,
 		.chunksize		= CHACHA_BLOCK_SIZE,
-		.setkey			= crypto_chacha12_setkey,
+		.setkey			= chacha12_setkey,
 		.encrypt		= xchacha_simd,
 		.decrypt		= xchacha_simd,
 	},
diff --git a/crypto/Kconfig b/crypto/Kconfig
index b39ca79ef65f..86732709b171 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1439,7 +1439,7 @@  config CRYPTO_CHACHA20_X86_64
 	tristate "ChaCha stream cipher algorithms (x86_64/SSSE3/AVX2/AVX-512VL)"
 	depends on X86 && 64BIT
 	select CRYPTO_BLKCIPHER
-	select CRYPTO_CHACHA20
+	select CRYPTO_LIB_CHACHA_GENERIC
 	help
 	  SSSE3, AVX2, and AVX-512VL optimized implementations of the ChaCha20,
 	  XChaCha20, and XChaCha12 stream ciphers.