@@ -1,8 +1,11 @@
//
// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
//
-// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
-// Copyright (C) 2019 Google LLC <ebiggers@google.com>
+// Copyright (C) 2016 Linaro Ltd
+// Copyright (C) 2019-2024 Google LLC
+//
+// Authors: Ard Biesheuvel <ardb@google.com>
+//          Eric Biggers <ebiggers@google.com>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
@@ -122,6 +125,10 @@
sli perm2.2d, perm1.2d, #56
sli perm3.2d, perm1.2d, #48
sli perm4.2d, perm1.2d, #40
+
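+	// Index vector for the TBL in pmull16x64_p8: 0x909010108080000,
+	// zipped with itself, yields the byte indices { 0 x4, 8 x4, 1 x4,
+	// 9 x4 }, i.e. each byte of the two 16-bit fold constants replicated
+	// four times, as needed for the 8-bit polynomial multiplies.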
+ mov_q x5, 0x909010108080000
+ mov bd1.d[0], x5
+ zip1 bd1.16b, bd1.16b, bd1.16b
.endm
.macro __pmull_pre_p8, bd
@@ -196,6 +203,45 @@ SYM_FUNC_START_LOCAL(__pmull_p8_core)
ret
SYM_FUNC_END(__pmull_p8_core)
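+
+// Carryless multiply of the prearranged constant bytes in t5 by the
+// prearranged data bytes in t7 (see pmull16x64_p8 below), using only
+// 8-bit wide PMULL instructions. The partial products are repositioned
+// with EXT and combined with EOR; the caller picks up the results from
+// t4, t5 and t8.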
+SYM_FUNC_START_LOCAL(__pmull_p8_16x64)
+ ext t6.16b, t5.16b, t5.16b, #8
+
+ pmull t3.8h, t7.8b, t5.8b
+ pmull t4.8h, t7.8b, t6.8b
+ pmull2 t5.8h, t7.16b, t5.16b
+ pmull2 t6.8h, t7.16b, t6.16b
+
+ ext t8.16b, t3.16b, t3.16b, #8
+ eor t4.16b, t4.16b, t6.16b
+ ext t7.16b, t5.16b, t5.16b, #8
+ ext t6.16b, t4.16b, t4.16b, #8
+ eor t8.8b, t8.8b, t3.8b
+ eor t5.8b, t5.8b, t7.8b
+ eor t4.8b, t4.8b, t6.8b
+ ext t5.16b, t5.16b, t5.16b, #14
+ ret
+SYM_FUNC_END(__pmull_p8_16x64)
+
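+	// Multiply the 16-bit fold constant in each 64-bit lane of \a16 by
+	// the 64-bit value in the corresponding lane of \b64. The product of
+	// the upper lanes is returned in \c64, that of the lower lanes
+	// replaces \b64.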
+ .macro pmull16x64_p64, a16, b64, c64
+ pmull2 \c64\().1q, \a16\().2d, \b64\().2d
+ pmull \b64\().1q, \a16\().1d, \b64\().1d
+ .endm
+
+	/*
+	 * NOTE: the 16x64 bit polynomial multiply below is not equivalent to
+	 * the one above, but XOR'ing the outputs together will produce the
+	 * expected result, and this is sufficient in the context of this
+	 * algorithm: the callers only ever consume the two outputs XOR'ed
+	 * into the same accumulator, so redistributing the product terms
+	 * between the outputs cannot change the final value.
+	 */
+ .macro pmull16x64_p8, a16, b64, c64
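+	// Deinterleave the data bytes of \b64 into t7 (even-numbered bytes
+	// in the low half, odd-numbered bytes in the high half) and use the
+	// bd1 index vector to replicate the bytes of the 16-bit constants
+	// in \a16 into t5, as expected by __pmull_p8_16x64.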
+ ext t7.16b, \b64\().16b, \b64\().16b, #1
+ tbl t5.16b, {\a16\().16b}, bd1.16b
+ uzp1 t7.16b, \b64\().16b, t7.16b
+ bl __pmull_p8_16x64
+ ext \b64\().16b, t4.16b, t4.16b, #15
+ eor \c64\().16b, t8.16b, t5.16b
+ .endm
+
.macro __pmull_p8, rq, ad, bd, i
.ifnc \bd, fold_consts
.err
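
For reference, the operation that both pmull16x64_\p variants implement, and the
property that the NOTE above relies on, can be modelled in portable C roughly as
below. This is an illustrative sketch only: the helper names and the use of the
GCC/Clang __uint128_t extension are assumptions of the sketch, not part of the
kernel sources.

#include <stdint.h>

/*
 * Bit-at-a-time carryless (polynomial) multiply of a 16-bit constant by a
 * 64-bit value; the product is at most 79 bits wide, so it fits in 128 bits.
 */
static __uint128_t clmul_16x64(uint16_t a, uint64_t b)
{
	__uint128_t prod = 0;

	for (int i = 0; i < 16; i++)
		if (a & (1u << i))
			prod ^= (__uint128_t)b << i;
	return prod;
}

/*
 * One folding step as performed by the callers of pmull16x64_\p: both
 * 128-bit outputs are XOR'ed into the same accumulator, which is why the
 * p8 variant may split the product terms across its two outputs differently
 * from the p64 variant (see the NOTE above).
 */
static void fold_step(__uint128_t *acc, uint16_t k_lo, uint16_t k_hi,
		      uint64_t d_lo, uint64_t d_hi)
{
	*acc ^= clmul_16x64(k_hi, d_hi) ^ clmul_16x64(k_lo, d_lo);
}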
@@ -218,14 +264,12 @@ SYM_FUNC_END(__pmull_p8_core)
.macro fold_32_bytes, p, reg1, reg2
ldp q11, q12, [buf], #0x20
- __pmull_\p v8, \reg1, fold_consts, 2
- __pmull_\p \reg1, \reg1, fold_consts
+ pmull16x64_\p fold_consts, \reg1, v8
CPU_LE( rev64 v11.16b, v11.16b )
CPU_LE( rev64 v12.16b, v12.16b )
- __pmull_\p v9, \reg2, fold_consts, 2
- __pmull_\p \reg2, \reg2, fold_consts
+ pmull16x64_\p fold_consts, \reg2, v9
CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 )
CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
@@ -238,11 +282,9 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
// Fold src_reg into dst_reg, optionally loading the next fold constants
.macro fold_16_bytes, p, src_reg, dst_reg, load_next_consts
- __pmull_\p v8, \src_reg, fold_consts
- __pmull_\p \src_reg, \src_reg, fold_consts, 2
+ pmull16x64_\p fold_consts, \src_reg, v8
.ifnb \load_next_consts
ld1 {fold_consts.2d}, [fold_consts_ptr], #16
- __pmull_pre_\p fold_consts
.endif
eor \dst_reg\().16b, \dst_reg\().16b, v8.16b
eor \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
@@ -296,7 +338,6 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
// Load the constants for folding across 128 bytes.
ld1 {fold_consts.2d}, [fold_consts_ptr]
- __pmull_pre_\p fold_consts
// Subtract 128 for the 128 data bytes just consumed. Subtract another
// 128 to simplify the termination condition of the following loop.
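
The extra subtraction mentioned in the comment above is the usual bias trick for
folding the loop-termination test into the decrement itself. A minimal C sketch
of the idea follows; the names are hypothetical, it is not the kernel's actual
control flow, and it assumes at least 256 bytes are available at this point.

static void fold_128(const unsigned char *p)
{
	(void)p;	/* stand-in for folding the next 128 data bytes */
}

static void fold_body(const unsigned char *buf, long len)
{
	len -= 256;			/* 128 consumed + 128 of bias */
	do {
		fold_128(buf);		/* hypothetical helper */
		buf += 128;
		len -= 128;		/* sign of len doubles as loop test */
	} while (len >= 0);
	/* len is now biased low by 128; an implementation would undo the
	 * bias before dealing with the 0..127 remaining tail bytes. */
}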
@@ -318,7 +359,6 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
// Fold across 64 bytes.
add fold_consts_ptr, fold_consts_ptr, #16
ld1 {fold_consts.2d}, [fold_consts_ptr], #16
- __pmull_pre_\p fold_consts
fold_16_bytes \p, v0, v4
fold_16_bytes \p, v1, v5
fold_16_bytes \p, v2, v6
@@ -339,8 +379,7 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
// into them, storing the result back into v7.
b.lt .Lfold_16_bytes_loop_done_\@
.Lfold_16_bytes_loop_\@:
- __pmull_\p v8, v7, fold_consts
- __pmull_\p v7, v7, fold_consts, 2
+ pmull16x64_\p fold_consts, v7, v8
eor v7.16b, v7.16b, v8.16b
ldr q0, [buf], #16
CPU_LE( rev64 v0.16b, v0.16b )
@@ -387,9 +426,8 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
bsl v2.16b, v1.16b, v0.16b
// Fold the first chunk into the second chunk, storing the result in v7.
- __pmull_\p v0, v3, fold_consts
- __pmull_\p v7, v3, fold_consts, 2
- eor v7.16b, v7.16b, v0.16b
+ pmull16x64_\p fold_consts, v3, v0
+ eor v7.16b, v3.16b, v0.16b
eor v7.16b, v7.16b, v2.16b
.Lreduce_final_16_bytes_\@:
@@ -450,7 +488,6 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
// Load the fold-across-16-bytes constants.
ld1 {fold_consts.2d}, [fold_consts_ptr], #16
- __pmull_pre_\p fold_consts
cmp len, #16
b.eq .Lreduce_final_16_bytes_\@ // len == 16