[v2,6/6] crypto: arm/crc32 - accelerated support based on x86 SSE implementation

Message ID	1480852447-25082-7-git-send-email-ard.biesheuvel@linaro.org
State	New
Headers	show Delivered-To: patch@linaro.org Received-SPF: pass (google.com: best guess record for domain of linux-crypto-owner@vger.kernel.org designates 209.132.180.67 as permitted sender) client-ip=209.132.180.67; From: Ard Biesheuvel <ard.biesheuvel@linaro.org> To: linux-crypto@vger.kernel.org, herbert@gondor.apana.org.au Cc: linux-arm-kernel@lists.infradead.org, Ard Biesheuvel <ard.biesheuvel@linaro.org> Subject: [PATCH v2 6/6] crypto: arm/crc32 - accelerated support based on x86 SSE implementation Date: Sun, 4 Dec 2016 11:54:07 +0000 Message-Id: <1480852447-25082-7-git-send-email-ard.biesheuvel@linaro.org> In-Reply-To: <1480852447-25082-1-git-send-email-ard.biesheuvel@linaro.org> References: <1480852447-25082-1-git-send-email-ard.biesheuvel@linaro.org> Sender: linux-crypto-owner@vger.kernel.org Precedence: bulk

diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig index fce801fa52a1..de7bb20815bf 100644 --- a/arch/arm/crypto/Kconfig +++ b/arch/arm/crypto/Kconfig @@ -125,4 +125,9 @@ config CRYPTO_CRCT10DIF_ARM_CE depends on KERNEL_MODE_NEON && CRC_T10DIF select CRYPTO_HASH +config CRYPTO_CRC32_ARM_CE + tristate "CRC32(C) digest algorithm using CRC and/or PMULL instructions" + depends on KERNEL_MODE_NEON && CRC32 + select CRYPTO_HASH + endif diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile index fc77265014b7..b578a1820ab1 100644 --- a/arch/arm/crypto/Makefile +++ b/arch/arm/crypto/Makefile @@ -14,6 +14,7 @@ ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o ce-obj-$(CONFIG_CRYPTO_SHA2_ARM_CE) += sha2-arm-ce.o ce-obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o ce-obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM_CE) += crct10dif-arm-ce.o +ce-obj-$(CONFIG_CRYPTO_CRC32_ARM_CE) += crc32-arm-ce.o ifneq ($(ce-obj-y)$(ce-obj-m),) ifeq ($(call as-instr,.fpu crypto-neon-fp-armv8,y,n),y) @@ -38,6 +39,7 @@ sha2-arm-ce-y := sha2-ce-core.o sha2-ce-glue.o aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o crct10dif-arm-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o +crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o quiet_cmd_perl = PERL $@ cmd_perl = $(PERL) $(<) > $(@) diff --git a/arch/arm/crypto/crc32-ce-core.S b/arch/arm/crypto/crc32-ce-core.S new file mode 100644 index 000000000000..70e0c8042880 --- /dev/null +++ b/arch/arm/crypto/crc32-ce-core.S @@ -0,0 +1,306 @@ +/* + * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions + * + * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + * + * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32 + * calculation. + * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE) + * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found + * at: + * http://www.intel.com/products/processor/manuals/ + * Intel(R) 64 and IA-32 Architectures Software Developer's Manual + * Volume 2B: Instruction Set Reference, N-Z + * + * Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com> + * Alexander Boyko <Alexander_Boyko@xyratex.com> + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .text + .align 6 + .arch armv8-a + .arch_extension crc + .fpu crypto-neon-fp-armv8 + +.Lcrc32_constants: + /* + * [x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4 + * #define CONSTANT_R1 0x154442bd4LL + * + * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596 + * #define CONSTANT_R2 0x1c6e41596LL + */ + .quad 0x0000000154442bd4 + .quad 0x00000001c6e41596 + + /* + * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0 + * #define CONSTANT_R3 0x1751997d0LL + * + * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e + * #define CONSTANT_R4 0x0ccaa009eLL + */ + .quad 0x00000001751997d0 + .quad 0x00000000ccaa009e + + /* + * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124 + * #define CONSTANT_R5 0x163cd6124LL + */ + .quad 0x0000000163cd6124 + .quad 0x00000000FFFFFFFF + + /* + * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL + * + * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` + * = 0x1F7011641LL + * #define CONSTANT_RU 0x1F7011641LL + */ + .quad 0x00000001DB710641 + .quad 0x00000001F7011641 + +.Lcrc32c_constants: + .quad 0x00000000740eef02 + .quad 0x000000009e4addf8 + .quad 0x00000000f20c0dfe + .quad 0x000000014cd00bd6 + .quad 0x00000000dd45aab8 + .quad 0x00000000FFFFFFFF + .quad 0x0000000105ec76f0 + .quad 0x00000000dea713f1 + + dCONSTANTl .req d0 + dCONSTANTh .req d1 + qCONSTANT .req q0 + + BUF .req r0 + LEN .req r1 + CRC .req r2 + + qzr .req q9 + + /** + * Calculate crc32 + * BUF - buffer + * LEN - sizeof buffer (multiple of 16 bytes), LEN should be > 63 + * CRC - initial crc32 + * return %eax crc32 + * uint crc32_pmull_le(unsigned char const *buffer, + * size_t len, uint crc32) + */ +ENTRY(crc32_pmull_le) + adr r3, .Lcrc32_constants + b 0f + +ENTRY(crc32c_pmull_le) + adr r3, .Lcrc32c_constants + +0: bic LEN, LEN, #15 + vld1.8 {q1-q2}, [BUF]! + vld1.8 {q3-q4}, [BUF]! + vmov.i8 qzr, #0 + vmov.i8 qCONSTANT, #0 + vmov dCONSTANTl[0], CRC + veor.8 d2, d2, dCONSTANTl + sub LEN, LEN, #0x40 + cmp LEN, #0x40 + blt less_64 + + vld1.64 {qCONSTANT}, [r3] + +loop_64: /* 64 bytes Full cache line folding */ + sub LEN, LEN, #0x40 + + vmull.p64 q5, d3, dCONSTANTh + vmull.p64 q6, d5, dCONSTANTh + vmull.p64 q7, d7, dCONSTANTh + vmull.p64 q8, d9, dCONSTANTh + + vmull.p64 q1, d2, dCONSTANTl + vmull.p64 q2, d4, dCONSTANTl + vmull.p64 q3, d6, dCONSTANTl + vmull.p64 q4, d8, dCONSTANTl + + veor.8 q1, q1, q5 + vld1.8 {q5}, [BUF]! + veor.8 q2, q2, q6 + vld1.8 {q6}, [BUF]! + veor.8 q3, q3, q7 + vld1.8 {q7}, [BUF]! + veor.8 q4, q4, q8 + vld1.8 {q8}, [BUF]! + + veor.8 q1, q1, q5 + veor.8 q2, q2, q6 + veor.8 q3, q3, q7 + veor.8 q4, q4, q8 + + cmp LEN, #0x40 + bge loop_64 + +less_64: /* Folding cache line into 128bit */ + vldr dCONSTANTl, [r3, #16] + vldr dCONSTANTh, [r3, #24] + + vmull.p64 q5, d3, dCONSTANTh + vmull.p64 q1, d2, dCONSTANTl + veor.8 q1, q1, q5 + veor.8 q1, q1, q2 + + vmull.p64 q5, d3, dCONSTANTh + vmull.p64 q1, d2, dCONSTANTl + veor.8 q1, q1, q5 + veor.8 q1, q1, q3 + + vmull.p64 q5, d3, dCONSTANTh + vmull.p64 q1, d2, dCONSTANTl + veor.8 q1, q1, q5 + veor.8 q1, q1, q4 + + teq LEN, #0 + beq fold_64 + +loop_16: /* Folding rest buffer into 128bit */ + subs LEN, LEN, #0x10 + + vld1.8 {q2}, [BUF]! + vmull.p64 q5, d3, dCONSTANTh + vmull.p64 q1, d2, dCONSTANTl + veor.8 q1, q1, q5 + veor.8 q1, q1, q2 + + bne loop_16 + +fold_64: + /* perform the last 64 bit fold, also adds 32 zeroes + * to the input stream */ + vmull.p64 q2, d2, dCONSTANTh + vext.8 q1, q1, qzr, #8 + veor.8 q1, q1, q2 + + /* final 32-bit fold */ + vldr dCONSTANTl, [r3, #32] + vldr d6, [r3, #40] + vmov.i8 d7, #0 + + vext.8 q2, q1, qzr, #4 + vand.8 d2, d2, d6 + vmull.p64 q1, d2, dCONSTANTl + veor.8 q1, q1, q2 + + /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */ + vldr dCONSTANTl, [r3, #48] + vldr dCONSTANTh, [r3, #56] + + vand.8 q2, q1, q3 + vext.8 q2, qzr, q2, #8 + vmull.p64 q2, d5, dCONSTANTh + vand.8 q2, q2, q3 + vmull.p64 q2, d4, dCONSTANTl + veor.8 q1, q1, q2 + vmov r0, s5 + + bx lr +ENDPROC(crc32_pmull_le) +ENDPROC(crc32c_pmull_le) + + .macro __crc32, c + subs ip, r2, #8 + bmi .Ltail + + tst r1, #3 + bne .Lunaligned + + teq ip, #0 +.Laligned8\c: + ldrd r2, r3, [r1], #8 +ARM_BE8(rev r2, r2 ) +ARM_BE8(rev r3, r3 ) + crc32\c\()w r0, r0, r2 + crc32\c\()w r0, r0, r3 + bxeq lr + subs ip, ip, #8 + bpl .Laligned8\c + +.Ltail\c: + tst ip, #4 + beq 2f + ldr r3, [r1], #4 +ARM_BE8(rev r3, r3 ) + crc32\c\()w r0, r0, r3 + +2: tst ip, #2 + beq 1f + ldrh r3, [r1], #2 +ARM_BE8(rev16 r3, r3 ) + crc32\c\()h r0, r0, r3 + +1: tst ip, #1 + bxeq lr + ldrb r3, [r1] + crc32\c\()b r0, r0, r3 + bx lr + +.Lunaligned\c: + tst r1, #1 + beq 2f + ldrb r3, [r1], #1 + subs r2, r2, #1 + crc32\c\()b r0, r0, r3 + + tst r1, #2 + beq 0f +2: ldrh r3, [r1], #2 + subs r2, r2, #2 +ARM_BE8(rev16 r3, r3 ) + crc32\c\()h r0, r0, r3 + +0: subs ip, r2, #8 + bpl .Laligned8\c + b .Ltail\c + .endm + + .align 5 +ENTRY(crc32_armv8_le) + __crc32 +ENDPROC(crc32_armv8_le) + + .align 5 +ENTRY(crc32c_armv8_le) + __crc32 c +ENDPROC(crc32c_armv8_le) diff --git a/arch/arm/crypto/crc32-ce-glue.c b/arch/arm/crypto/crc32-ce-glue.c new file mode 100644 index 000000000000..f78cf1689669 --- /dev/null +++ b/arch/arm/crypto/crc32-ce-glue.c @@ -0,0 +1,195 @@ +/* + * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions + * + * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/crc32.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/string.h> + +#include <crypto/internal/hash.h> + +#include <asm/hwcap.h> +#include <asm/neon.h> +#include <asm/simd.h> +#include <asm/unaligned.h> + +#define PMULL_MIN_LEN 64L /* minimum size of buffer + * for crc32_pmull_le_16 */ +#define SCALE_F 16L /* size of NEON register */ + +asmlinkage u32 crc32_pmull_le(const u8 buf[], u32 len, u32 init_crc); +asmlinkage u32 crc32_armv8_le(u32 init_crc, const u8 buf[], u32 len); + +asmlinkage u32 crc32c_pmull_le(const u8 buf[], u32 len, u32 init_crc); +asmlinkage u32 crc32c_armv8_le(u32 init_crc, const u8 buf[], u32 len); + +static int crc32_pmull_cra_init(struct crypto_tfm *tfm) +{ + u32 *key = crypto_tfm_ctx(tfm); + + *key = 0; + return 0; +} + +static int crc32c_pmull_cra_init(struct crypto_tfm *tfm) +{ + u32 *key = crypto_tfm_ctx(tfm); + + *key = ~0; + return 0; +} + +static int crc32_pmull_setkey(struct crypto_shash *hash, const u8 *key, + unsigned int keylen) +{ + u32 *mctx = crypto_shash_ctx(hash); + + if (keylen != sizeof(u32)) { + crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + *mctx = le32_to_cpup((__le32 *)key); + return 0; +} + +static int crc32_pmull_init(struct shash_desc *desc) +{ + u32 *mctx = crypto_shash_ctx(desc->tfm); + u32 *crc = shash_desc_ctx(desc); + + *crc = *mctx; + return 0; +} + +static int crc32_pmull_update(struct shash_desc *desc, const u8 *data, + unsigned int length) +{ + u32 *crc = shash_desc_ctx(desc); + + if (length >= PMULL_MIN_LEN && may_use_simd() && + (elf_hwcap2 & HWCAP2_PMULL)) { + unsigned int l = round_down(length, SCALE_F); + + kernel_neon_begin(); + *crc = crc32_pmull_le(data, l, *crc); + kernel_neon_end(); + + data += l; + length -= l; + } + + if (length > 0) { + if (elf_hwcap2 & HWCAP2_CRC32) + *crc = crc32_armv8_le(*crc, data, length); + else + *crc = crc32_le(*crc, data, length); + } + + return 0; +} + +static int crc32c_pmull_update(struct shash_desc *desc, const u8 *data, + unsigned int length) +{ + u32 *crc = shash_desc_ctx(desc); + + if (length >= PMULL_MIN_LEN && may_use_simd() && + (elf_hwcap2 & HWCAP2_PMULL)) { + unsigned int l = round_down(length, SCALE_F); + + kernel_neon_begin(); + *crc = crc32c_pmull_le(data, l, *crc); + kernel_neon_end(); + + data += l; + length -= l; + } + + if (length > 0) { + if (elf_hwcap2 & HWCAP2_CRC32) + *crc = crc32c_armv8_le(*crc, data, length); + else + *crc = __crc32c_le(*crc, data, length); + } + + return 0; +} + +static int crc32_pmull_final(struct shash_desc *desc, u8 *out) +{ + u32 *crc = shash_desc_ctx(desc); + + put_unaligned_le32(*crc, out); + return 0; +} + +static int crc32c_pmull_final(struct shash_desc *desc, u8 *out) +{ + u32 *crc = shash_desc_ctx(desc); + + put_unaligned_le32(~*crc, out); + return 0; +} + +static struct shash_alg crc32_pmull_algs[] = { { + .setkey = crc32_pmull_setkey, + .init = crc32_pmull_init, + .update = crc32_pmull_update, + .final = crc32_pmull_final, + .descsize = sizeof(u32), + .digestsize = sizeof(u32), + + .base.cra_ctxsize = sizeof(u32), + .base.cra_init = crc32_pmull_cra_init, + .base.cra_name = "crc32", + .base.cra_driver_name = "crc32-arm-ce", + .base.cra_priority = 200, + .base.cra_blocksize = 1, + .base.cra_module = THIS_MODULE, +}, { + .setkey = crc32_pmull_setkey, + .init = crc32_pmull_init, + .update = crc32c_pmull_update, + .final = crc32c_pmull_final, + .descsize = sizeof(u32), + .digestsize = sizeof(u32), + + .base.cra_ctxsize = sizeof(u32), + .base.cra_init = crc32c_pmull_cra_init, + .base.cra_name = "crc32c", + .base.cra_driver_name = "crc32c-arm-ce", + .base.cra_priority = 200, + .base.cra_blocksize = 1, + .base.cra_module = THIS_MODULE, +} }; + +static int __init crc32_pmull_mod_init(void) +{ + if (!(elf_hwcap2 & (HWCAP2_PMULL|HWCAP2_CRC32))) + return -ENODEV; + + return crypto_register_shashes(crc32_pmull_algs, + ARRAY_SIZE(crc32_pmull_algs)); +} + +static void __exit crc32_pmull_mod_exit(void) +{ + crypto_unregister_shashes(crc32_pmull_algs, + ARRAY_SIZE(crc32_pmull_algs)); +} + +module_init(crc32_pmull_mod_init); +module_exit(crc32_pmull_mod_exit); + +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("crc32"); +MODULE_ALIAS_CRYPTO("crc32c");

[v2,6/6] crypto: arm/crc32 - accelerated support based on x86 SSE implementation

Commit Message

Patch