diff mbox series

[6/6] x86/crc32: implement crc32_be using new template

Message ID 20241125041129.192999-7-ebiggers@kernel.org
State New
Headers show
Series x86: new optimized CRC functions, with VPCLMULQDQ support | expand

Commit Message

Eric Biggers Nov. 25, 2024, 4:11 a.m. UTC
From: Eric Biggers <ebiggers@google.com>

crc32_be was previously unoptimized on x86.  Optimize it using the new
template.  This improves performance by over 25x in some cases.

Benchmark results on AMD Ryzen 9 9950X (Zen 5) using crc_kunit:

	Length     Before        After
	------     ------        -----
	     1     389 MB/s      325 MB/s
	    16    2845 MB/s     2911 MB/s
	    64    3012 MB/s     6513 MB/s
	   127    2567 MB/s     9057 MB/s
	   128    3048 MB/s    11589 MB/s
	   200    3070 MB/s    14042 MB/s
	   256    3067 MB/s    20454 MB/s
	   511    2938 MB/s    26245 MB/s
	   512    3081 MB/s    36926 MB/s
	  1024    3090 MB/s    61914 MB/s
	  3173    3065 MB/s    76201 MB/s
	  4096    3084 MB/s    82547 MB/s
	 16384    3084 MB/s    89333 MB/s

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/x86/lib/crc-pclmul-consts.h | 49 +++++++++++++++++++++++++++++++-
 arch/x86/lib/crc32-glue.c        |  4 +++
 arch/x86/lib/crc32-pclmul.S      |  1 +
 3 files changed, 53 insertions(+), 1 deletion(-)
diff mbox series

Patch

diff --git a/arch/x86/lib/crc-pclmul-consts.h b/arch/x86/lib/crc-pclmul-consts.h
index c3ca689eae3b8..f8af6e9278c83 100644
--- a/arch/x86/lib/crc-pclmul-consts.h
+++ b/arch/x86/lib/crc-pclmul-consts.h
@@ -1,10 +1,10 @@ 
 /* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  * CRC constants generated by:
  *
- *	./scripts/crc/gen-crc-consts.py x86_pclmul crc16_msb_0x8bb7,crc32_lsb_0xedb88320
+ *	./scripts/crc/gen-crc-consts.py x86_pclmul crc16_msb_0x8bb7,crc32_lsb_0xedb88320,crc32_msb_0x04c11db7
  *
  * Do not edit manually.
  */
 
 /*
@@ -97,5 +97,52 @@  static const struct {
 		0xb4e5b025f7011641,	/* floor(x^95 / G(x)) */
 		0x1db710641,	/* G(x) */
 	},
 	.extract_crc_mask = {0, 0xffffffff},
 };
+
+/*
+ * CRC folding constants generated for most-significant-bit-first CRC-32 using
+ * G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 + x^7 +
+ *        x^5 + x^4 + x^2 + x + 1
+ */
+static const struct {
+	u8 bswap_mask[16];
+	u64 fold_across_2048_bits_consts[2];
+	u64 fold_across_1024_bits_consts[2];
+	u64 fold_across_512_bits_consts[2];
+	u64 fold_across_256_bits_consts[2];
+	u64 fold_across_128_bits_consts[2];
+	u8 shuf_table[48];
+	u64 barrett_reduction_consts[2];
+} crc32_msb_0x04c11db7_consts __cacheline_aligned __maybe_unused = {
+	.bswap_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
+	.fold_across_2048_bits_consts = {
+		0x88fe2237,	/* x^(2048+0) mod G(x) */
+		0xcbcf3bcb,	/* x^(2048+64) mod G(x) */
+	},
+	.fold_across_1024_bits_consts = {
+		0x567fddeb,	/* x^(1024+0) mod G(x) */
+		0x10bd4d7c,	/* x^(1024+64) mod G(x) */
+	},
+	.fold_across_512_bits_consts = {
+		0xe6228b11,	/* x^(512+0) mod G(x) */
+		0x8833794c,	/* x^(512+64) mod G(x) */
+	},
+	.fold_across_256_bits_consts = {
+		0x75be46b7,	/* x^(256+0) mod G(x) */
+		0x569700e5,	/* x^(256+64) mod G(x) */
+	},
+	.fold_across_128_bits_consts = {
+		0xe8a45605,	/* x^(128+0) mod G(x) */
+		0xc5b9cd4c,	/* x^(128+64) mod G(x) */
+	},
+	.shuf_table = {
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+		 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+	},
+	.barrett_reduction_consts = {
+		0x04d101df481b4e5a,	/* floor(x^96 / G(x)) - x^64 */
+		0x104c11db7,	/* G(x) */
+	},
+};
diff --git a/arch/x86/lib/crc32-glue.c b/arch/x86/lib/crc32-glue.c
index afcdeee429664..326261e503b42 100644
--- a/arch/x86/lib/crc32-glue.c
+++ b/arch/x86/lib/crc32-glue.c
@@ -18,10 +18,11 @@ 
 
 static DEFINE_STATIC_KEY_FALSE(have_crc32);
 static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
 
 DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32);
+DECLARE_CRC_PCLMUL_FUNCS(crc32_msb, u32);
 
 u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
 {
 	CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts,
 		   have_pclmulqdq, IS_ENABLED(CONFIG_CRC32_SLICEBY8));
@@ -69,10 +70,12 @@  u32 crc32c_le_arch(u32 crc, const u8 *p, size_t len)
 }
 EXPORT_SYMBOL(crc32c_le_arch);
 
 u32 crc32_be_arch(u32 crc, const u8 *p, size_t len)
 {
+	CRC_PCLMUL(crc, p, len, crc32_msb, crc32_msb_0x04c11db7_consts,
+		   have_pclmulqdq, IS_ENABLED(CONFIG_CRC32_SLICEBY8));
 	return crc32_be_base(crc, p, len);
 }
 EXPORT_SYMBOL(crc32_be_arch);
 
 static int __init crc32_x86_init(void)
@@ -80,10 +83,11 @@  static int __init crc32_x86_init(void)
 	if (boot_cpu_has(X86_FEATURE_XMM4_2))
 		static_branch_enable(&have_crc32);
 	if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
 		static_branch_enable(&have_pclmulqdq);
 		INIT_CRC_PCLMUL(crc32_lsb);
+		INIT_CRC_PCLMUL(crc32_msb);
 	}
 	return 0;
 }
 arch_initcall(crc32_x86_init);
 
diff --git a/arch/x86/lib/crc32-pclmul.S b/arch/x86/lib/crc32-pclmul.S
index cf07d571ae864..d562944211d4d 100644
--- a/arch/x86/lib/crc32-pclmul.S
+++ b/arch/x86/lib/crc32-pclmul.S
@@ -2,5 +2,6 @@ 
 // Copyright 2024 Google LLC
 
 #include "crc-pclmul-template.S"
 
 DEFINE_CRC_PCLMUL_FUNCS(crc32_lsb, /* bits= */ 32, /* lsb= */ 1)
+DEFINE_CRC_PCLMUL_FUNCS(crc32_msb, /* bits= */ 32, /* lsb= */ 0)