[v4,04/35] crypto: x86/chacha - expose SIMD ChaCha routine as library function

Message ID: 20191017190932.1947-5-ard.biesheuvel@linaro.org
State: New
Series: crypto: crypto API library interfaces for WireGuard

Commit Message

Ard Biesheuvel Oct. 17, 2019, 7:09 p.m.
Wire the existing x86 SIMD ChaCha code into the new ChaCha library
interface, so that users of the library interface will get the
accelerated version when available.

Given that calls into the library API will always go through the
routines in this module if it is enabled, switch to static keys
to select the optimal implementation available (which may be none
at all, in which case we defer to the generic implementation for
all invocations).

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

---
 arch/x86/crypto/chacha_glue.c | 91 ++++++++++++++------
 crypto/Kconfig                |  1 +
 include/crypto/chacha.h       |  6 ++
 3 files changed, 73 insertions(+), 25 deletions(-)

-- 
2.20.1
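
In outline, the static-key pattern the commit message describes (distilled from the diff below) works like this: the key defaults to false, module init enables it at most once after CPU feature detection, and every call site then compiles down to a patched direct branch rather than a load-and-test:

	static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd);

	/* module init: flip the key once, based on CPU features */
	if (boot_cpu_has(X86_FEATURE_SSSE3))
		static_branch_enable(&chacha_use_simd);

	/* hot path: the branch is patched in place, no memory load */
	if (static_branch_likely(&chacha_use_simd) && crypto_simd_usable())
		chacha_dosimd(state, dst, src, bytes, nrounds);
	else
		chacha_crypt_generic(state, dst, src, bytes, nrounds);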

Comments

Eric Biggers Oct. 23, 2019, 3:10 a.m. | #1
On Thu, Oct 17, 2019 at 09:09:01PM +0200, Ard Biesheuvel wrote:
> +void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
> +{
> +	state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
> +
> +	if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable()) {
> +		hchacha_block_generic(state, stream, nrounds);
> +	} else {
> +		kernel_fpu_begin();
> +		hchacha_block_ssse3(state, stream, nrounds);
> +		kernel_fpu_end();
> +	}
> +}
> +EXPORT_SYMBOL(hchacha_block_arch);

I generally find negative logic like the above harder to read than the other
way:

	if (static_branch_likely(&chacha_use_simd) && crypto_simd_usable()) {
		kernel_fpu_begin();
		hchacha_block_ssse3(state, stream, nrounds);
		kernel_fpu_end();
	} else {
		hchacha_block_generic(state, stream, nrounds);
	}

(This applies to lots of places in this patch series.)  Not a big deal though.

>  static int __init chacha_simd_mod_init(void)
>  {
>  	if (!boot_cpu_has(X86_FEATURE_SSSE3))
> -		return -ENODEV;
> -
> -#ifdef CONFIG_AS_AVX2
> -	chacha_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
> -			  boot_cpu_has(X86_FEATURE_AVX2) &&
> -			  cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
> -#ifdef CONFIG_AS_AVX512
> -	chacha_use_avx512vl = chacha_use_avx2 &&
> -			      boot_cpu_has(X86_FEATURE_AVX512VL) &&
> -			      boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */
> -#endif
> -#endif
> +		return 0;
> +
> +	static_branch_enable(&chacha_use_simd);
> +
> +	if (IS_ENABLED(CONFIG_AS_AVX2) &&
> +	    boot_cpu_has(X86_FEATURE_AVX) &&
> +	    boot_cpu_has(X86_FEATURE_AVX2) &&
> +	    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
> +		static_branch_enable(&chacha_use_avx2);
> +
> +		if (IS_ENABLED(CONFIG_AS_AVX512) &&
> +		    boot_cpu_has(X86_FEATURE_AVX512VL) &&
> +		    boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */
> +			static_branch_enable(&chacha_use_avx512vl);
> +	}
>  	return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
>  }

This patch is missing the corresponding change to chacha_simd_mod_fini(), to
skip unregistering the skcipher algorithms if they weren't registered.
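
One way that fix could look, mirroring the feature check on the init path (a sketch, not part of the posted patch):

	static void __exit chacha_simd_mod_fini(void)
	{
		/* only unregister what chacha_simd_mod_init() registered */
		if (boot_cpu_has(X86_FEATURE_SSSE3))
			crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
	}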

- Eric
Eric Biggers Oct. 23, 2019, 4:40 a.m. | #2
On Thu, Oct 17, 2019 at 09:09:01PM +0200, Ard Biesheuvel wrote:
> +void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
> +		       int nrounds)
> +{
> +	state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
> +
> +	if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable() ||
> +	    bytes <= CHACHA_BLOCK_SIZE)
> +		return chacha_crypt_generic(state, dst, src, bytes, nrounds);
> +
> +	kernel_fpu_begin();
> +	chacha_dosimd(state, dst, src, bytes, nrounds);
> +	kernel_fpu_end();
> +}
> +EXPORT_SYMBOL(chacha_crypt_arch);

This can process an arbitrary amount of data with preemption disabled.
Shouldn't the library functions limit the amount of data processed per
fpu_begin/fpu_end region?  I see that some of them do...
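
For reference, one common way to bound the FPU region is to walk the input in fixed-size chunks; the SZ_4K limit in this sketch is illustrative, not taken from this patch:

	do {
		unsigned int todo = min_t(unsigned int, bytes, SZ_4K);

		/* preemption stays disabled only for one bounded chunk */
		kernel_fpu_begin();
		chacha_dosimd(state, dst, src, todo, nrounds);
		kernel_fpu_end();

		/* chacha_dosimd() advances the counter in state */
		bytes -= todo;
		src += todo;
		dst += todo;
	} while (bytes);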

- Eric

Patch

diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c
index a264dcc64679..b788260a1243 100644
--- a/arch/x86/crypto/chacha_glue.c
+++ b/arch/x86/crypto/chacha_glue.c
@@ -21,24 +21,24 @@ asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
 asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
 					unsigned int len, int nrounds);
 asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds);
-#ifdef CONFIG_AS_AVX2
+
 asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
 				       unsigned int len, int nrounds);
 asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
 				       unsigned int len, int nrounds);
 asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
 				       unsigned int len, int nrounds);
-static bool chacha_use_avx2;
-#ifdef CONFIG_AS_AVX512
+
 asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
 					   unsigned int len, int nrounds);
 asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
 					   unsigned int len, int nrounds);
 asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
 					   unsigned int len, int nrounds);
-static bool chacha_use_avx512vl;
-#endif
-#endif
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl);
 
 static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
 {
@@ -49,9 +49,8 @@ static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
 static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src,
 			  unsigned int bytes, int nrounds)
 {
-#ifdef CONFIG_AS_AVX2
-#ifdef CONFIG_AS_AVX512
-	if (chacha_use_avx512vl) {
+	if (IS_ENABLED(CONFIG_AS_AVX512) &&
+	    static_branch_likely(&chacha_use_avx512vl)) {
 		while (bytes >= CHACHA_BLOCK_SIZE * 8) {
 			chacha_8block_xor_avx512vl(state, dst, src, bytes,
 						   nrounds);
@@ -79,8 +78,9 @@ static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src,
 			return;
 		}
 	}
-#endif
-	if (chacha_use_avx2) {
+
+	if (IS_ENABLED(CONFIG_AS_AVX2) &&
+	    static_branch_likely(&chacha_use_avx2)) {
 		while (bytes >= CHACHA_BLOCK_SIZE * 8) {
 			chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
 			bytes -= CHACHA_BLOCK_SIZE * 8;
@@ -104,7 +104,7 @@ static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src,
 			return;
 		}
 	}
-#endif
+
 	while (bytes >= CHACHA_BLOCK_SIZE * 4) {
 		chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
 		bytes -= CHACHA_BLOCK_SIZE * 4;
@@ -123,6 +123,43 @@ static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src,
 	}
 }
 
+void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
+{
+	state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
+
+	if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable()) {
+		hchacha_block_generic(state, stream, nrounds);
+	} else {
+		kernel_fpu_begin();
+		hchacha_block_ssse3(state, stream, nrounds);
+		kernel_fpu_end();
+	}
+}
+EXPORT_SYMBOL(hchacha_block_arch);
+
+void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
+{
+	state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
+
+	chacha_init_generic(state, key, iv);
+}
+EXPORT_SYMBOL(chacha_init_arch);
+
+void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
+		       int nrounds)
+{
+	state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
+
+	if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable() ||
+	    bytes <= CHACHA_BLOCK_SIZE)
+		return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+
+	kernel_fpu_begin();
+	chacha_dosimd(state, dst, src, bytes, nrounds);
+	kernel_fpu_end();
+}
+EXPORT_SYMBOL(chacha_crypt_arch);
+
 static int chacha_simd_stream_xor(struct skcipher_request *req,
 				  const struct chacha_ctx *ctx, const u8 *iv)
 {
@@ -143,7 +180,8 @@ static int chacha_simd_stream_xor(struct skcipher_request *req,
 		if (nbytes < walk.total)
 			nbytes = round_down(nbytes, walk.stride);
 
-		if (!crypto_simd_usable()) {
+		if (!static_branch_likely(&chacha_use_simd) ||
+		    !crypto_simd_usable()) {
 			chacha_crypt_generic(state, walk.dst.virt.addr,
 					     walk.src.virt.addr, nbytes,
 					     ctx->nrounds);
@@ -258,18 +296,21 @@ static int __init chacha_simd_mod_init(void)
 static int __init chacha_simd_mod_init(void)
 {
 	if (!boot_cpu_has(X86_FEATURE_SSSE3))
-		return -ENODEV;
-
-#ifdef CONFIG_AS_AVX2
-	chacha_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
-			  boot_cpu_has(X86_FEATURE_AVX2) &&
-			  cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
-#ifdef CONFIG_AS_AVX512
-	chacha_use_avx512vl = chacha_use_avx2 &&
-			      boot_cpu_has(X86_FEATURE_AVX512VL) &&
-			      boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */
-#endif
-#endif
+		return 0;
+
+	static_branch_enable(&chacha_use_simd);
+
+	if (IS_ENABLED(CONFIG_AS_AVX2) &&
+	    boot_cpu_has(X86_FEATURE_AVX) &&
+	    boot_cpu_has(X86_FEATURE_AVX2) &&
+	    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
+		static_branch_enable(&chacha_use_avx2);
+
+		if (IS_ENABLED(CONFIG_AS_AVX512) &&
+		    boot_cpu_has(X86_FEATURE_AVX512VL) &&
+		    boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */
+			static_branch_enable(&chacha_use_avx512vl);
+	}
 	return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
 }
 
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 9da4b67ac8e2..4f7212fa0170 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1418,6 +1418,7 @@ config CRYPTO_CHACHA20_X86_64
 	depends on X86 && 64BIT
 	select CRYPTO_BLKCIPHER
 	select CRYPTO_LIB_CHACHA_GENERIC
+	select CRYPTO_ARCH_HAVE_LIB_CHACHA
 	help
 	  SSSE3, AVX2, and AVX-512VL optimized implementations of the ChaCha20,
 	  XChaCha20, and XChaCha12 stream ciphers.
diff --git a/include/crypto/chacha.h b/include/crypto/chacha.h
index 78cb9d549b1b..33034b69b087 100644
--- a/include/crypto/chacha.h
+++ b/include/crypto/chacha.h
@@ -25,6 +25,12 @@ 
 #define CHACHA_BLOCK_SIZE	64
 #define CHACHAPOLY_IV_SIZE	12
 
+#ifdef CONFIG_X86_64
+#define CHACHA_STATE_WORDS	((CHACHA_BLOCK_SIZE + 12) / sizeof(u32))
+#else
+#define CHACHA_STATE_WORDS	(CHACHA_BLOCK_SIZE / sizeof(u32))
+#endif
+
 /* 192-bit nonce, then 64-bit stream position */
 #define XCHACHA_IV_SIZE		32
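
For context, a minimal user of the new library interface might look like the sketch below; it is based on the signatures added above, with an illustrative 16-byte alignment and the standard 20 rounds:

	u32 state[CHACHA_STATE_WORDS] __aligned(16);

	chacha_init_arch(state, key, iv);	/* key: 8 x u32, iv: 16 bytes */
	chacha_crypt_arch(state, dst, src, len, 20);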