
crypto: x86/aria-avx - fix using avx2 instructions

Message ID 20230210181541.2895144-1-ap420073@gmail.com
State Accepted
Commit 8b84475318641c2b89320859332544cf187e1cbd
Series crypto: x86/aria-avx - fix using avx2 instructions

Commit Message

Taehee Yoo Feb. 10, 2023, 6:15 p.m. UTC
vpbroadcastb and vpbroadcastd are not AVX instructions, but the aria-avx
assembly code contains them. So, a kernel panic will occur if aria-avx
runs on a CPU that does not support AVX2.

vbroadcastss and vpshufb are now used to avoid vpbroadcastb.
Unfortunately, this change reduces performance by about 5%.
Also, vpbroadcastd is simply replaced by vmovdqa.

Fixes: ba3579e6e45c ("crypto: aria-avx - add AES-NI/AVX/x86_64/GFNI assembler implementation of aria cipher")
Reported-by: Herbert Xu <herbert@gondor.apana.org.au>
Reported-by: Erhard F. <erhard_f@mailbox.org>
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
---

My CPU supports AVX2, so I disabled AVX2 with QEMU.
In the VM, lscpu doesn't show AVX2, but the kernel panic didn't occur.
Therefore, I couldn't reproduce the kernel panic.
I would really appreciate it if someone could test this patch.

 arch/x86/crypto/aria-aesni-avx-asm_64.S | 134 +++++++++++++++++-------
 1 file changed, 94 insertions(+), 40 deletions(-)
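
For readers who want to see the trick in isolation, here is a stand-alone
userspace sketch (C with SSE/AVX intrinsics, not the kernel code itself) of
the substitution the commit message describes: a vpbroadcastb byte broadcast
is modelled with vbroadcastss + vpsrld + vpshufb against an all-zero shuffle
mask. The round-key buffer, offsets, and function names below are invented
for the illustration; build with something like "gcc -O2 -mavx".

/* Hedged illustration only: models the AVX-only byte-broadcast emulation. */
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Result a vpbroadcastb of rk[idx] would give: rk[idx] in all 16 bytes. */
static __m128i broadcast_byte_ref(const uint8_t *rk, int idx)
{
	return _mm_set1_epi8((char)rk[idx]);
}

/*
 * AVX-only emulation, mirroring the patched aria_ark_8way: vbroadcastss
 * loads 4 key bytes into every 32-bit lane, vpsrld moves the wanted byte
 * into the low byte of each lane, and vpshufb with an all-zero mask copies
 * byte 0 of the register into all 16 byte positions.
 */
static __m128i broadcast_byte_avx(const uint8_t *rk, int idx)
{
	__m128i t = _mm_castps_si128(
			_mm_broadcast_ss((const float *)(rk + (idx & ~3))));
	__m128i zero_mask = _mm_setzero_si128();

	switch (idx & 3) {
	case 3: t = _mm_srli_epi32(t, 24); break;
	case 2: t = _mm_srli_epi32(t, 16); break;
	case 1: t = _mm_srli_epi32(t, 8);  break;
	default: break;			/* wanted byte is already byte 0 */
	}
	return _mm_shuffle_epi8(t, zero_mask);
}

int main(void)
{
	uint8_t rk[16];
	int idx;

	for (idx = 0; idx < 16; idx++)
		rk[idx] = (uint8_t)(0x10 + idx);

	for (idx = 0; idx < 16; idx++) {
		__m128i ref = broadcast_byte_ref(rk, idx);
		__m128i avx = broadcast_byte_avx(rk, idx);

		if (memcmp(&ref, &avx, sizeof(ref)) != 0) {
			printf("mismatch at idx %d\n", idx);
			return 1;
		}
	}
	printf("AVX-only sequence matches the byte-broadcast semantics\n");
	return 0;
}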

Comments

Elliott, Robert (Servers) Feb. 10, 2023, 7:08 p.m. UTC | #1
> Unfortunately, this change reduces performance by about 5%.

The driver could continue to use functions with AVX2 instructions
if AVX2 is supported, and fall back to functions using only
AVX instructions if not (assuming AVX is supported).
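
A minimal userspace sketch of that suggestion, with placeholder function
names (an in-kernel version would key off boot_cpu_has(X86_FEATURE_AVX2) in
the glue code rather than the compiler builtins used here):

#include <stdio.h>

/* Hypothetical code paths standing in for the real assembly routines. */
static void aria_crypt_16way_avx(void)  { puts("AVX-only 16-block path"); }
static void aria_crypt_32way_avx2(void) { puts("AVX2 32-block path"); }

static void (*aria_crypt)(void);

int main(void)
{
	__builtin_cpu_init();

	if (__builtin_cpu_supports("avx2"))
		aria_crypt = aria_crypt_32way_avx2;	/* fast path */
	else if (__builtin_cpu_supports("avx"))
		aria_crypt = aria_crypt_16way_avx;	/* AVX-only fallback */
	else
		return 0;	/* no AVX at all: stay on the generic C cipher */

	aria_crypt();
	return 0;
}
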
Samuel Neves Feb. 10, 2023, 9:18 p.m. UTC | #2
On Fri, Feb 10, 2023 at 6:18 PM Taehee Yoo <ap420073@gmail.com> wrote:
>
> Also, vpbroadcastd is simply replaced by vmovdqa in it.
>
>  #ifdef CONFIG_AS_GFNI
>  #define aria_sbox_8way_gfni(x0, x1, x2, x3,            \
>                             x4, x5, x6, x7,             \
>                             t0, t1, t2, t3,             \
>                             t4, t5, t6, t7)             \
> -       vpbroadcastq .Ltf_s2_bitmatrix, t0;             \
> -       vpbroadcastq .Ltf_inv_bitmatrix, t1;            \
> -       vpbroadcastq .Ltf_id_bitmatrix, t2;             \
> -       vpbroadcastq .Ltf_aff_bitmatrix, t3;            \
> -       vpbroadcastq .Ltf_x2_bitmatrix, t4;             \
> +       vmovdqa .Ltf_s2_bitmatrix, t0;                  \
> +       vmovdqa .Ltf_inv_bitmatrix, t1;                 \
> +       vmovdqa .Ltf_id_bitmatrix, t2;                  \
> +       vmovdqa .Ltf_aff_bitmatrix, t3;                 \
> +       vmovdqa .Ltf_x2_bitmatrix, t4;                  \

You can use vmovddup to replicate the behavior of vpbroadcastq for xmm
registers. It's as fast as a movdqa and does not require increasing
the data fields to be 16 bytes.
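
A small stand-alone sketch of that equivalence (C intrinsics, made-up
constant; _mm_loaddup_pd is the intrinsic form of vmovddup with a memory
operand, available with SSE3/AVX, so build with something like
"gcc -O2 -mavx"):

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* Stand-in for one of the 8-byte .Ltf_*_bitmatrix constants. */
	uint64_t bitmatrix = 0x0102040810204080ULL;
	double as_double;

	memcpy(&as_double, &bitmatrix, sizeof(as_double));

	/* vmovddup: one 8-byte load duplicated into both 64-bit lanes. */
	__m128i dup = _mm_castpd_si128(_mm_loaddup_pd(&as_double));

	/* Reference with vpbroadcastq (AVX2) semantics for an xmm register. */
	__m128i ref = _mm_set1_epi64x((long long)bitmatrix);

	printf("vmovddup %s vpbroadcastq for xmm registers\n",
	       memcmp(&dup, &ref, sizeof(dup)) == 0 ? "matches" : "differs from");
	return 0;
}
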
Erhard F. Feb. 10, 2023, 11:48 p.m. UTC | #3
On Fri, 10 Feb 2023 18:15:41 +0000
Taehee Yoo <ap420073@gmail.com> wrote:

> vpbroadcastb and vpbroadcastd are not AVX instructions.
> But the aria-avx assembly code contains these instructions.
> So, kernel panic will occur if the aria-avx works on AVX2 unsupported
> CPU.
> 
> vbroadcastss, and vpshufb are used to avoid using vpbroadcastb in it.
> Unfortunately, this change reduces performance by about 5%.
> Also, vpbroadcastd is simply replaced by vmovdqa in it.

Thanks Taehee, your patch fixes the issue on my AMD FX-8370!

# cryptsetup benchmark -c aria-ctr-plain64
# Tests are approximate using memory only (no storage IO).
# Algorithm |       Key |      Encryption |      Decryption
   aria-ctr        256b       301.3 MiB/s       303.6 MiB/s

The patch did not apply cleanly on v6.2-rc7, however. I needed to make small, trivial modifications to hunks #1 and #21. Perhaps this is useful for other users to test, so I'll attach it.

Regards,
Erhard

diff --git a/arch/x86/crypto/aria-aesni-avx-asm_64.S b/arch/x86/crypto/aria-aesni-avx-asm_64.S
index fe0d84a7ced1..9243f6289d34 100644
--- a/arch/x86/crypto/aria-aesni-avx-asm_64.S
+++ b/arch/x86/crypto/aria-aesni-avx-asm_64.S
@@ -271,34 +271,43 @@
 
 #define aria_ark_8way(x0, x1, x2, x3,			\
 		      x4, x5, x6, x7,			\
-		      t0, rk, idx, round)		\
+		      t0, t1, t2, rk,			\
+		      idx, round)			\
 	/* AddRoundKey */                               \
-	vpbroadcastb ((round * 16) + idx + 3)(rk), t0;	\
-	vpxor t0, x0, x0;				\
-	vpbroadcastb ((round * 16) + idx + 2)(rk), t0;	\
-	vpxor t0, x1, x1;				\
-	vpbroadcastb ((round * 16) + idx + 1)(rk), t0;	\
-	vpxor t0, x2, x2;				\
-	vpbroadcastb ((round * 16) + idx + 0)(rk), t0;	\
-	vpxor t0, x3, x3;				\
-	vpbroadcastb ((round * 16) + idx + 7)(rk), t0;	\
-	vpxor t0, x4, x4;				\
-	vpbroadcastb ((round * 16) + idx + 6)(rk), t0;	\
-	vpxor t0, x5, x5;				\
-	vpbroadcastb ((round * 16) + idx + 5)(rk), t0;	\
-	vpxor t0, x6, x6;				\
-	vpbroadcastb ((round * 16) + idx + 4)(rk), t0;	\
-	vpxor t0, x7, x7;
+	vbroadcastss ((round * 16) + idx + 0)(rk), t0;	\
+	vpsrld $24, t0, t2;				\
+	vpshufb t1, t2, t2;				\
+	vpxor t2, x0, x0;				\
+	vpsrld $16, t0, t2;				\
+	vpshufb t1, t2, t2;				\
+	vpxor t2, x1, x1;				\
+	vpsrld $8, t0, t2;				\
+	vpshufb t1, t2, t2;				\
+	vpxor t2, x2, x2;				\
+	vpshufb t1, t0, t2;				\
+	vpxor t2, x3, x3;				\
+	vbroadcastss ((round * 16) + idx + 4)(rk), t0;	\
+	vpsrld $24, t0, t2;				\
+	vpshufb t1, t2, t2;				\
+	vpxor t2, x4, x4;				\
+	vpsrld $16, t0, t2;				\
+	vpshufb t1, t2, t2;				\
+	vpxor t2, x5, x5;				\
+	vpsrld $8, t0, t2;				\
+	vpshufb t1, t2, t2;				\
+	vpxor t2, x6, x6;				\
+	vpshufb t1, t0, t2;				\
+	vpxor t2, x7, x7;
 
 #define aria_sbox_8way_gfni(x0, x1, x2, x3,		\
 			    x4, x5, x6, x7,		\
 			    t0, t1, t2, t3,		\
 			    t4, t5, t6, t7)		\
-	vpbroadcastq .Ltf_s2_bitmatrix, t0;		\
-	vpbroadcastq .Ltf_inv_bitmatrix, t1;		\
-	vpbroadcastq .Ltf_id_bitmatrix, t2;		\
-	vpbroadcastq .Ltf_aff_bitmatrix, t3;		\
-	vpbroadcastq .Ltf_x2_bitmatrix, t4;		\
+	vmovdqa .Ltf_s2_bitmatrix, t0;			\
+	vmovdqa .Ltf_inv_bitmatrix, t1;			\
+	vmovdqa .Ltf_id_bitmatrix, t2;			\
+	vmovdqa .Ltf_aff_bitmatrix, t3;			\
+	vmovdqa .Ltf_x2_bitmatrix, t4;			\
 	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;	\
 	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;	\
 	vgf2p8affineqb $(tf_inv_const), t1, x2, x2;	\
@@ -315,10 +324,9 @@
 		       x4, x5, x6, x7,			\
 		       t0, t1, t2, t3,			\
 		       t4, t5, t6, t7)			\
-	vpxor t7, t7, t7;				\
 	vmovdqa .Linv_shift_row, t0;			\
 	vmovdqa .Lshift_row, t1;			\
-	vpbroadcastd .L0f0f0f0f, t6;			\
+	vbroadcastss .L0f0f0f0f, t6;			\
 	vmovdqa .Ltf_lo__inv_aff__and__s2, t2;		\
 	vmovdqa .Ltf_hi__inv_aff__and__s2, t3;		\
 	vmovdqa .Ltf_lo__x2__and__fwd_aff, t4;		\
@@ -413,8 +421,9 @@
 		y0, y1, y2, y3,				\
 		y4, y5, y6, y7,				\
 		mem_tmp, rk, round)			\
+	vpxor y7, y7, y7;				\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 8, round);		\
+		      y0, y7, y2, rk, 8, round);	\
 							\
 	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
 		       y0, y1, y2, y3, y4, y5, y6, y7);	\
@@ -429,7 +438,7 @@
 			     x4, x5, x6, x7,		\
 			     mem_tmp, 0);		\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 0, round);		\
+		      y0, y7, y2, rk, 0, round);	\
 							\
 	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
 		       y0, y1, y2, y3, y4, y5, y6, y7);	\
@@ -467,8 +476,9 @@
 		y0, y1, y2, y3,				\
 		y4, y5, y6, y7,				\
 		mem_tmp, rk, round)			\
+	vpxor y7, y7, y7;				\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 8, round);		\
+		      y0, y7, y2, rk, 8, round);	\
 							\
 	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 		       y0, y1, y2, y3, y4, y5, y6, y7);	\
@@ -483,7 +493,7 @@
 			     x4, x5, x6, x7,		\
 			     mem_tmp, 0);		\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 0, round);		\
+		      y0, y7, y2, rk, 0, round);	\
 							\
 	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 		       y0, y1, y2, y3, y4, y5, y6, y7);	\
@@ -521,14 +531,15 @@
 		y0, y1, y2, y3,				\
 		y4, y5, y6, y7,				\
 		mem_tmp, rk, round, last_round)		\
+	vpxor y7, y7, y7;				\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 8, round);		\
+		      y0, y7, y2, rk, 8, round);	\
 							\
 	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
 		       y0, y1, y2, y3, y4, y5, y6, y7);	\
 							\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 8, last_round);		\
+		      y0, y7, y2, rk, 8, last_round);	\
 							\
 	aria_store_state_8way(x0, x1, x2, x3,		\
 			      x4, x5, x6, x7,		\
@@ -538,13 +549,13 @@
 			     x4, x5, x6, x7,		\
 			     mem_tmp, 0);		\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 0, round);		\
+		      y0, y7, y2, rk, 0, round);	\
 							\
 	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
 		       y0, y1, y2, y3, y4, y5, y6, y7);	\
 							\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 0, last_round);		\
+		      y0, y7, y2, rk, 0, last_round);	\
 							\
 	aria_load_state_8way(y0, y1, y2, y3,		\
 			     y4, y5, y6, y7,		\
@@ -556,8 +567,9 @@
 		     y0, y1, y2, y3,			\
 		     y4, y5, y6, y7,			\
 		     mem_tmp, rk, round)		\
+	vpxor y7, y7, y7;				\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 8, round);		\
+		      y0, y7, y2, rk, 8, round);	\
 							\
 	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
 			    x6, x7, x4, x5,		\
@@ -574,7 +586,7 @@
 			     x4, x5, x6, x7,		\
 			     mem_tmp, 0);		\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 0, round);		\
+		      y0, y7, y2, rk, 0, round);	\
 							\
 	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
 			    x6, x7, x4, x5,		\
@@ -614,8 +626,9 @@
 		     y0, y1, y2, y3,			\
 		     y4, y5, y6, y7,			\
 		     mem_tmp, rk, round)		\
+	vpxor y7, y7, y7;				\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 8, round);		\
+		      y0, y7, y2, rk, 8, round);	\
 							\
 	aria_sbox_8way_gfni(x0, x1, x2, x3, 		\
 			    x4, x5, x6, x7,		\
@@ -632,7 +645,7 @@
 			     x4, x5, x6, x7,		\
 			     mem_tmp, 0);		\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 0, round);		\
+		      y0, y7, y2, rk, 0, round);	\
 							\
 	aria_sbox_8way_gfni(x0, x1, x2, x3, 		\
 			    x4, x5, x6, x7,		\
@@ -672,8 +685,9 @@
 		y0, y1, y2, y3,				\
 		y4, y5, y6, y7,				\
 		mem_tmp, rk, round, last_round)		\
+	vpxor y7, y7, y7;				\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 8, round);		\
+		      y0, y7, y2, rk, 8, round);	\
 							\
 	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
 			    x6, x7, x4, x5,		\
@@ -681,7 +695,7 @@
 			    y4, y5, y6, y7);		\
 							\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 8, last_round);		\
+		      y0, y7, y2, rk, 8, last_round);	\
 							\
 	aria_store_state_8way(x0, x1, x2, x3,		\
 			      x4, x5, x6, x7,		\
@@ -691,7 +705,7 @@
 			     x4, x5, x6, x7,		\
 			     mem_tmp, 0);		\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 0, round);		\
+		      y0, y7, y2, rk, 0, round);	\
 							\
 	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
 			    x6, x7, x4, x5,		\
@@ -699,7 +713,7 @@
 			    y4, y5, y6, y7);		\
 							\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 0, last_round);		\
+		      y0, y7, y2, rk, 0, last_round);	\
 							\
 	aria_load_state_8way(y0, y1, y2, y3,		\
 			     y4, y5, y6, y7,		\
@@ -772,6 +786,14 @@
 		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
 		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
 		    BV8(0, 0, 0, 1, 1, 1, 1, 1))
+	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
+		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
+		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
+		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
+		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
+		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
+		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
+		    BV8(0, 0, 0, 1, 1, 1, 1, 1))
 
 /* AES inverse affine: */
 #define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
@@ -784,6 +806,14 @@
 		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
 		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
 		    BV8(0, 1, 0, 0, 1, 0, 1, 0))
+	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
+		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
+		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
+		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
+		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
+		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
+		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
+		    BV8(0, 1, 0, 0, 1, 0, 1, 0))
 
 /* S2: */
 #define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
@@ -796,6 +826,14 @@
 		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
 		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
 		    BV8(1, 1, 1, 1, 0, 1, 1, 0))
+	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
+		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
+		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
+		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
+		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
+		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
+		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
+		    BV8(1, 1, 1, 1, 0, 1, 1, 0))
 
 /* X2: */
 #define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
@@ -808,6 +846,14 @@
 		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
 		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
 		    BV8(1, 0, 0, 1, 0, 0, 1, 1))
+	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
+		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
+		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
+		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
+		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
+		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
+		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
+		    BV8(1, 0, 0, 1, 0, 0, 1, 1))
 
 /* Identity matrix: */
 .Ltf_id_bitmatrix:
@@ -862,6 +862,14 @@
 		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
 		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
 		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
+	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
+		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
+		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
+		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
+		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
+		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
+		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
+		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
 
 /* 4-bit mask */
 .section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
Taehee Yoo Feb. 11, 2023, 6:30 p.m. UTC | #4
On 2/11/23 04:08, Elliott, Robert (Servers) wrote:

Hi Elliott,
Thank you so much for your review!

 >> Unfortunately, this change reduces performance by about 5%.
 >
 > The driver could continue to use functions with AVX2 instructions
 > if AVX2 is supported, and fallback to functions using only
 > AVX instructions if not (assuming AVX is supported).
 >

If the CPU supports AVX2, the ARIA-AVX2 driver is used and it is faster.
But currently the AVX driver requires 16 blocks and the AVX2 driver
requires 32 blocks.
So, if the number of input blocks is less than 32, the AVX driver is used
even if the CPU supports AVX2.
I think the best solution is to make the AVX, AVX2, and AVX512 drivers
support various block counts.
If so, we can get the best performance out of the ARIA algorithm
regardless of the input block count.
As far as I know, the SM4 driver already supports this.
So, I think it would be better for ARIA to follow this strategy instead
of supporting AVX2 instructions in the ARIA-AVX driver.

Thank you so much!
Taehee Yoo
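
A rough userspace sketch of the block-count strategy described above, with
hypothetical helper names rather than the actual aria glue functions:
consume the request in the widest chunks the CPU supports, then fall back
to narrower paths for the remainder, so short inputs still benefit from
whatever SIMD width is available.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define ARIA_BLOCK_SIZE 16

/* Placeholders assumed to mirror 32-block AVX2, 16-block AVX, and 1-block paths. */
static void aria_crypt_32way(const unsigned char *in, unsigned char *out) { (void)in; (void)out; }
static void aria_crypt_16way(const unsigned char *in, unsigned char *out) { (void)in; (void)out; }
static void aria_crypt_1way(const unsigned char *in, unsigned char *out)  { (void)in; (void)out; }

static void aria_crypt(const unsigned char *in, unsigned char *out,
		       size_t nbytes, bool have_avx2)
{
	size_t nblocks = nbytes / ARIA_BLOCK_SIZE;

	while (have_avx2 && nblocks >= 32) {
		aria_crypt_32way(in, out);
		in += 32 * ARIA_BLOCK_SIZE;
		out += 32 * ARIA_BLOCK_SIZE;
		nblocks -= 32;
	}
	while (nblocks >= 16) {
		aria_crypt_16way(in, out);
		in += 16 * ARIA_BLOCK_SIZE;
		out += 16 * ARIA_BLOCK_SIZE;
		nblocks -= 16;
	}
	while (nblocks--) {
		aria_crypt_1way(in, out);
		in += ARIA_BLOCK_SIZE;
		out += ARIA_BLOCK_SIZE;
	}
}

int main(void)
{
	unsigned char buf[48 * ARIA_BLOCK_SIZE] = { 0 };

	/* 48 blocks: one 32-block AVX2 chunk plus one 16-block AVX chunk. */
	aria_crypt(buf, buf, sizeof(buf), true);
	puts("dispatched 48 blocks across the 32-way and 16-way paths");
	return 0;
}
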
Taehee Yoo Feb. 11, 2023, 6:41 p.m. UTC | #5
On 2/11/23 04:19, Dave Hansen wrote:

Hi Dave,
Thank you so much for the review!

 > On 2/10/23 10:15, Taehee Yoo wrote:
 >> vpbroadcastb and vpbroadcastd are not AVX instructions.
 >> But the aria-avx assembly code contains these instructions.
 >> So, kernel panic will occur if the aria-avx works on AVX2 unsupported
 >> CPU.
 > ...
 >> My CPU supports AVX2.
 >> So, I disabled AVX2 with QEMU.
 >> In the VM, lscpu doesn't show AVX2, but kernel panic didn't occur.
 >> Therefore, I couldn't reproduce kernel panic.
 >> I will really appreciate it if someone test this patch.
 >
 > So, someone reported this issue and you _think_ you know what went
 > wrong.  But, you can't reproduce the issue so it sounds like you're not
 > confident if this is the right fix or if you are fixing the right
 > problem in the first place.
 >
 > We can certainly apply obvious fixes, but it would be *really* nice if
 > you could try a bit harder to reproduce this.

Yes, you're right.
I'm sorry for the careless testing.
I think I didn't spend enough time setting up a reproducer environment.
I will try to set up a proper environment for this work.

Thank you so much!
Taehee Yoo
Taehee Yoo Feb. 11, 2023, 7:02 p.m. UTC | #6
On 2/11/23 06:18, Samuel Neves wrote:

Hi Samuel,
Thank you so much for the review!

 > On Fri, Feb 10, 2023 at 6:18 PM Taehee Yoo <ap420073@gmail.com> wrote:
 >>
 >> Also, vpbroadcastd is simply replaced by vmovdqa in it.
 >>
 >>   #ifdef CONFIG_AS_GFNI
 >>   #define aria_sbox_8way_gfni(x0, x1, x2, x3,            \
 >>                              x4, x5, x6, x7,             \
 >>                              t0, t1, t2, t3,             \
 >>                              t4, t5, t6, t7)             \
 >> -       vpbroadcastq .Ltf_s2_bitmatrix, t0;             \
 >> -       vpbroadcastq .Ltf_inv_bitmatrix, t1;            \
 >> -       vpbroadcastq .Ltf_id_bitmatrix, t2;             \
 >> -       vpbroadcastq .Ltf_aff_bitmatrix, t3;            \
 >> -       vpbroadcastq .Ltf_x2_bitmatrix, t4;             \
 >> +       vmovdqa .Ltf_s2_bitmatrix, t0;                  \
 >> +       vmovdqa .Ltf_inv_bitmatrix, t1;                 \
 >> +       vmovdqa .Ltf_id_bitmatrix, t2;                  \
 >> +       vmovdqa .Ltf_aff_bitmatrix, t3;                 \
 >> +       vmovdqa .Ltf_x2_bitmatrix, t4;                  \
 >
 > You can use vmovddup to replicate the behavior of vpbroadcastq for xmm
 > registers. It's as fast as a movdqa and does not require increasing
 > the data fields to be 16 bytes.

Thanks for this suggestion!
I tested this driver using vmovddup instead of vpbroadcastq, and it works
well.
As you mentioned, vmovddup doesn't require 16-byte data.
So, I will use the vmovddup instruction instead of vpbroadcastq.

I will send the v2 patch for it.

Thank you so much,
Taehee Yoo
Taehee Yoo Feb. 11, 2023, 7:09 p.m. UTC | #7
On 2/11/23 08:48, Erhard F. wrote:

Hi Erhard,
Thank you so much for the testing!

 > On Fri, 10 Feb 2023 18:15:41 +0000
 > Taehee Yoo <ap420073@gmail.com> wrote:
 >
 >> vpbroadcastb and vpbroadcastd are not AVX instructions.
 >> But the aria-avx assembly code contains these instructions.
 >> So, kernel panic will occur if the aria-avx works on AVX2 unsupported
 >> CPU.
 >>
 >> vbroadcastss, and vpshufb are used to avoid using vpbroadcastb in it.
 >> Unfortunately, this change reduces performance by about 5%.
 >> Also, vpbroadcastd is simply replaced by vmovdqa in it.
 >
 > Thanks Taehee, your patch fixes the issue on my AMD FX-8370!

Nice :)

 >
 > # cryptsetup benchmark -c aria-ctr-plain64
 > # Tests are approximate using memory only (no storage IO).
 > # Algorithm |       Key |      Encryption |      Decryption
 >     aria-ctr        256b       301.3 MiB/s       303.6 MiB/s

Thanks for this benchmark!

 >
 > The patch did not apply cleanly on v6.2-rc7 however. I needed to do
 > small trivial modifications on hunk #1 and #21. Perhaps this is useful
 > for other users to test so i'll attach it.

Thanks for this :)

 >
 > Regards,
 > Erhard
 >
 > diff --git a/arch/x86/crypto/aria-aesni-avx-asm_64.S b/arch/x86/crypto/aria-aesni-avx-asm_64.S
 > index fe0d84a7ced1..9243f6289d34 100644
 > --- a/arch/x86/crypto/aria-aesni-avx-asm_64.S
 > +++ b/arch/x86/crypto/aria-aesni-avx-asm_64.S
 > @@ -271,34 +271,43 @@
 >
 >   #define aria_ark_8way(x0, x1, x2, x3,			\
 >   		      x4, x5, x6, x7,			\
 > -		      t0, rk, idx, round)		\
 > +		      t0, t1, t2, rk,			\
 > +		      idx, round)			\
 >   	/* AddRoundKey */                               \
 > -	vpbroadcastb ((round * 16) + idx + 3)(rk), t0;	\
 > -	vpxor t0, x0, x0;				\
 > -	vpbroadcastb ((round * 16) + idx + 2)(rk), t0;	\
 > -	vpxor t0, x1, x1;				\
 > -	vpbroadcastb ((round * 16) + idx + 1)(rk), t0;	\
 > -	vpxor t0, x2, x2;				\
 > -	vpbroadcastb ((round * 16) + idx + 0)(rk), t0;	\
 > -	vpxor t0, x3, x3;				\
 > -	vpbroadcastb ((round * 16) + idx + 7)(rk), t0;	\
 > -	vpxor t0, x4, x4;				\
 > -	vpbroadcastb ((round * 16) + idx + 6)(rk), t0;	\
 > -	vpxor t0, x5, x5;				\
 > -	vpbroadcastb ((round * 16) + idx + 5)(rk), t0;	\
 > -	vpxor t0, x6, x6;				\
 > -	vpbroadcastb ((round * 16) + idx + 4)(rk), t0;	\
 > -	vpxor t0, x7, x7;
 > +	vbroadcastss ((round * 16) + idx + 0)(rk), t0;	\
 > +	vpsrld $24, t0, t2;				\
 > +	vpshufb t1, t2, t2;				\
 > +	vpxor t2, x0, x0;				\
 > +	vpsrld $16, t0, t2;				\
 > +	vpshufb t1, t2, t2;				\
 > +	vpxor t2, x1, x1;				\
 > +	vpsrld $8, t0, t2;				\
 > +	vpshufb t1, t2, t2;				\
 > +	vpxor t2, x2, x2;				\
 > +	vpshufb t1, t0, t2;				\
 > +	vpxor t2, x3, x3;				\
 > +	vbroadcastss ((round * 16) + idx + 4)(rk), t0;	\
 > +	vpsrld $24, t0, t2;				\
 > +	vpshufb t1, t2, t2;				\
 > +	vpxor t2, x4, x4;				\
 > +	vpsrld $16, t0, t2;				\
 > +	vpshufb t1, t2, t2;				\
 > +	vpxor t2, x5, x5;				\
 > +	vpsrld $8, t0, t2;				\
 > +	vpshufb t1, t2, t2;				\
 > +	vpxor t2, x6, x6;				\
 > +	vpshufb t1, t0, t2;				\
 > +	vpxor t2, x7, x7;
 >
 >   #define aria_sbox_8way_gfni(x0, x1, x2, x3,		\
 >   			    x4, x5, x6, x7,		\
 >   			    t0, t1, t2, t3,		\
 >   			    t4, t5, t6, t7)		\
 > -	vpbroadcastq .Ltf_s2_bitmatrix, t0;		\
 > -	vpbroadcastq .Ltf_inv_bitmatrix, t1;		\
 > -	vpbroadcastq .Ltf_id_bitmatrix, t2;		\
 > -	vpbroadcastq .Ltf_aff_bitmatrix, t3;		\
 > -	vpbroadcastq .Ltf_x2_bitmatrix, t4;		\
 > +	vmovdqa .Ltf_s2_bitmatrix, t0;			\
 > +	vmovdqa .Ltf_inv_bitmatrix, t1;			\
 > +	vmovdqa .Ltf_id_bitmatrix, t2;			\
 > +	vmovdqa .Ltf_aff_bitmatrix, t3;			\
 > +	vmovdqa .Ltf_x2_bitmatrix, t4;			\
 >   	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;	\
 >   	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;	\
 >   	vgf2p8affineqb $(tf_inv_const), t1, x2, x2;	\
 > @@ -315,10 +324,9 @@
 >   		       x4, x5, x6, x7,			\
 >   		       t0, t1, t2, t3,			\
 >   		       t4, t5, t6, t7)			\
 > -	vpxor t7, t7, t7;				\
 >   	vmovdqa .Linv_shift_row, t0;			\
 >   	vmovdqa .Lshift_row, t1;			\
 > -	vpbroadcastd .L0f0f0f0f, t6;			\
 > +	vbroadcastss .L0f0f0f0f, t6;			\
 >   	vmovdqa .Ltf_lo__inv_aff__and__s2, t2;		\
 >   	vmovdqa .Ltf_hi__inv_aff__and__s2, t3;		\
 >   	vmovdqa .Ltf_lo__x2__and__fwd_aff, t4;		\
 > @@ -413,8 +421,9 @@
 >   		y0, y1, y2, y3,				\
 >   		y4, y5, y6, y7,				\
 >   		mem_tmp, rk, round)			\
 > +	vpxor y7, y7, y7;				\
 >   	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 > -		      y0, rk, 8, round);		\
 > +		      y0, y7, y2, rk, 8, round);	\
 >   							\
 >   	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
 >   		       y0, y1, y2, y3, y4, y5, y6, y7);	\
 > @@ -429,7 +438,7 @@
 >   			     x4, x5, x6, x7,		\
 >   			     mem_tmp, 0);		\
 >   	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 > -		      y0, rk, 0, round);		\
 > +		      y0, y7, y2, rk, 0, round);	\
 >   							\
 >   	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
 >   		       y0, y1, y2, y3, y4, y5, y6, y7);	\
 > @@ -467,8 +476,9 @@
 >   		y0, y1, y2, y3,				\
 >   		y4, y5, y6, y7,				\
 >   		mem_tmp, rk, round)			\
 > +	vpxor y7, y7, y7;				\
 >   	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 > -		      y0, rk, 8, round);		\
 > +		      y0, y7, y2, rk, 8, round);	\
 >   							\
 >   	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 >   		       y0, y1, y2, y3, y4, y5, y6, y7);	\
 > @@ -483,7 +493,7 @@
 >   			     x4, x5, x6, x7,		\
 >   			     mem_tmp, 0);		\
 >   	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 > -		      y0, rk, 0, round);		\
 > +		      y0, y7, y2, rk, 0, round);	\
 >   							\
 >   	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 >   		       y0, y1, y2, y3, y4, y5, y6, y7);	\
 > @@ -521,14 +531,15 @@
 >   		y0, y1, y2, y3,				\
 >   		y4, y5, y6, y7,				\
 >   		mem_tmp, rk, round, last_round)		\
 > +	vpxor y7, y7, y7;				\
 >   	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 > -		      y0, rk, 8, round);		\
 > +		      y0, y7, y2, rk, 8, round);	\
 >   							\
 >   	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
 >   		       y0, y1, y2, y3, y4, y5, y6, y7);	\
 >   							\
 >   	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 > -		      y0, rk, 8, last_round);		\
 > +		      y0, y7, y2, rk, 8, last_round);	\
 >   							\
 >   	aria_store_state_8way(x0, x1, x2, x3,		\
 >   			      x4, x5, x6, x7,		\
 > @@ -538,13 +549,13 @@
 >   			     x4, x5, x6, x7,		\
 >   			     mem_tmp, 0);		\
 >   	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 > -		      y0, rk, 0, round);		\
 > +		      y0, y7, y2, rk, 0, round);	\
 >   							\
 >   	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
 >   		       y0, y1, y2, y3, y4, y5, y6, y7);	\
 >   							\
 >   	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 > -		      y0, rk, 0, last_round);		\
 > +		      y0, y7, y2, rk, 0, last_round);	\
 >   							\
 >   	aria_load_state_8way(y0, y1, y2, y3,		\
 >   			     y4, y5, y6, y7,		\
 > @@ -556,8 +567,9 @@
 >   		     y0, y1, y2, y3,			\
 >   		     y4, y5, y6, y7,			\
 >   		     mem_tmp, rk, round)		\
 > +	vpxor y7, y7, y7;				\
 >   	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 > -		      y0, rk, 8, round);		\
 > +		      y0, y7, y2, rk, 8, round);	\
 >   							\
 >   	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
 >   			    x6, x7, x4, x5,		\
 > @@ -574,7 +586,7 @@
 >   			     x4, x5, x6, x7,		\
 >   			     mem_tmp, 0);		\
 >   	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 > -		      y0, rk, 0, round);		\
 > +		      y0, y7, y2, rk, 0, round);	\
 >   							\
 >   	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
 >   			    x6, x7, x4, x5,		\
 > @@ -614,8 +626,9 @@
 >   		     y0, y1, y2, y3,			\
 >   		     y4, y5, y6, y7,			\
 >   		     mem_tmp, rk, round)		\
 > +	vpxor y7, y7, y7;				\
 >   	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 > -		      y0, rk, 8, round);		\
 > +		      y0, y7, y2, rk, 8, round);	\
 >   							\
 >   	aria_sbox_8way_gfni(x0, x1, x2, x3, 		\
 >   			    x4, x5, x6, x7,		\
 > @@ -632,7 +645,7 @@
 >   			     x4, x5, x6, x7,		\
 >   			     mem_tmp, 0);		\
 >   	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 > -		      y0, rk, 0, round);		\
 > +		      y0, y7, y2, rk, 0, round);	\
 >   							\
 >   	aria_sbox_8way_gfni(x0, x1, x2, x3, 		\
 >   			    x4, x5, x6, x7,		\
 > @@ -672,8 +685,9 @@
 >   		y0, y1, y2, y3,				\
 >   		y4, y5, y6, y7,				\
 >   		mem_tmp, rk, round, last_round)		\
 > +	vpxor y7, y7, y7;				\
 >   	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 > -		      y0, rk, 8, round);		\
 > +		      y0, y7, y2, rk, 8, round);	\
 >   							\
 >   	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
 >   			    x6, x7, x4, x5,		\
 > @@ -681,7 +695,7 @@
 >   			    y4, y5, y6, y7);		\
 >   							\
 >   	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 > -		      y0, rk, 8, last_round);		\
 > +		      y0, y7, y2, rk, 8, last_round);	\
 >   							\
 >   	aria_store_state_8way(x0, x1, x2, x3,		\
 >   			      x4, x5, x6, x7,		\
 > @@ -691,7 +705,7 @@
 >   			     x4, x5, x6, x7,		\
 >   			     mem_tmp, 0);		\
 >   	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 > -		      y0, rk, 0, round);		\
 > +		      y0, y7, y2, rk, 0, round);	\
 >   							\
 >   	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
 >   			    x6, x7, x4, x5,		\
 > @@ -699,7 +713,7 @@
 >   			    y4, y5, y6, y7);		\
 >   							\
 >   	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 > -		      y0, rk, 0, last_round);		\
 > +		      y0, y7, y2, rk, 0, last_round);	\
 >   							\
 >   	aria_load_state_8way(y0, y1, y2, y3,		\
 >   			     y4, y5, y6, y7,		\
 > @@ -772,6 +786,14 @@
 >   		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
 >   		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
 >   		    BV8(0, 0, 0, 1, 1, 1, 1, 1))
 > +	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
 > +		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
 > +		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
 > +		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
 > +		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
 > +		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
 > +		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
 > +		    BV8(0, 0, 0, 1, 1, 1, 1, 1))
 >
 >   /* AES inverse affine: */
 >   #define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
 > @@ -784,6 +806,14 @@
 >   		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
 >   		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
 >   		    BV8(0, 1, 0, 0, 1, 0, 1, 0))
 > +	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
 > +		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
 > +		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
 > +		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
 > +		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
 > +		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
 > +		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
 > +		    BV8(0, 1, 0, 0, 1, 0, 1, 0))
 >
 >   /* S2: */
 >   #define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
 > @@ -796,6 +826,14 @@
 >   		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
 >   		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
 >   		    BV8(1, 1, 1, 1, 0, 1, 1, 0))
 > +	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
 > +		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
 > +		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
 > +		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
 > +		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
 > +		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
 > +		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
 > +		    BV8(1, 1, 1, 1, 0, 1, 1, 0))
 >
 >   /* X2: */
 >   #define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
 > @@ -808,6 +846,14 @@
 >   		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
 >   		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
 >   		    BV8(1, 0, 0, 1, 0, 0, 1, 1))
 > +	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
 > +		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
 > +		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
 > +		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
 > +		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
 > +		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
 > +		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
 > +		    BV8(1, 0, 0, 1, 0, 0, 1, 1))
 >
 >   /* Identity matrix: */
 >   .Ltf_id_bitmatrix:
 > @@ -862,6 +862,14 @@
 >   		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
 >   		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
 >   		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
 > +	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
 > +		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
 > +		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
 > +		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
 > +		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
 > +		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
 > +		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
 > +		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
 >
 >   /* 4-bit mask */
 >   .section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4

Thank you so much,
Taehee Yoo
Herbert Xu Feb. 14, 2023, 8:49 a.m. UTC | #8
On Fri, Feb 10, 2023 at 06:15:41PM +0000, Taehee Yoo wrote:
> vpbroadcastb and vpbroadcastd are not AVX instructions.
> But the aria-avx assembly code contains these instructions.
> So, kernel panic will occur if the aria-avx works on AVX2 unsupported
> CPU.
> 
> vbroadcastss, and vpshufb are used to avoid using vpbroadcastb in it.
> Unfortunately, this change reduces performance by about 5%.
> Also, vpbroadcastd is simply replaced by vmovdqa in it.
> 
> Fixes: ba3579e6e45c ("crypto: aria-avx - add AES-NI/AVX/x86_64/GFNI assembler implementation of aria cipher")
> Reported-by: Herbert Xu <herbert@gondor.apana.org.au>
> Reported-by: Erhard F. <erhard_f@mailbox.org>
> Signed-off-by: Taehee Yoo <ap420073@gmail.com>
> ---
> 
> My CPU supports AVX2.
> So, I disabled AVX2 with QEMU.
> In the VM, lscpu doesn't show AVX2, but kernel panic didn't occur.
> Therefore, I couldn't reproduce kernel panic.
> I will really appreciate it if someone test this patch.
> 
>  arch/x86/crypto/aria-aesni-avx-asm_64.S | 134 +++++++++++++++++-------
>  1 file changed, 94 insertions(+), 40 deletions(-)

Patch applied.  Thanks.

Patch

diff --git a/arch/x86/crypto/aria-aesni-avx-asm_64.S b/arch/x86/crypto/aria-aesni-avx-asm_64.S
index fe0d84a7ced1..9243f6289d34 100644
--- a/arch/x86/crypto/aria-aesni-avx-asm_64.S
+++ b/arch/x86/crypto/aria-aesni-avx-asm_64.S
@@ -267,35 +267,44 @@ 
 
 #define aria_ark_8way(x0, x1, x2, x3,			\
 		      x4, x5, x6, x7,			\
-		      t0, rk, idx, round)		\
+		      t0, t1, t2, rk,			\
+		      idx, round)			\
 	/* AddRoundKey */                               \
-	vpbroadcastb ((round * 16) + idx + 3)(rk), t0;	\
-	vpxor t0, x0, x0;				\
-	vpbroadcastb ((round * 16) + idx + 2)(rk), t0;	\
-	vpxor t0, x1, x1;				\
-	vpbroadcastb ((round * 16) + idx + 1)(rk), t0;	\
-	vpxor t0, x2, x2;				\
-	vpbroadcastb ((round * 16) + idx + 0)(rk), t0;	\
-	vpxor t0, x3, x3;				\
-	vpbroadcastb ((round * 16) + idx + 7)(rk), t0;	\
-	vpxor t0, x4, x4;				\
-	vpbroadcastb ((round * 16) + idx + 6)(rk), t0;	\
-	vpxor t0, x5, x5;				\
-	vpbroadcastb ((round * 16) + idx + 5)(rk), t0;	\
-	vpxor t0, x6, x6;				\
-	vpbroadcastb ((round * 16) + idx + 4)(rk), t0;	\
-	vpxor t0, x7, x7;
+	vbroadcastss ((round * 16) + idx + 0)(rk), t0;	\
+	vpsrld $24, t0, t2;				\
+	vpshufb t1, t2, t2;				\
+	vpxor t2, x0, x0;				\
+	vpsrld $16, t0, t2;				\
+	vpshufb t1, t2, t2;				\
+	vpxor t2, x1, x1;				\
+	vpsrld $8, t0, t2;				\
+	vpshufb t1, t2, t2;				\
+	vpxor t2, x2, x2;				\
+	vpshufb t1, t0, t2;				\
+	vpxor t2, x3, x3;				\
+	vbroadcastss ((round * 16) + idx + 4)(rk), t0;	\
+	vpsrld $24, t0, t2;				\
+	vpshufb t1, t2, t2;				\
+	vpxor t2, x4, x4;				\
+	vpsrld $16, t0, t2;				\
+	vpshufb t1, t2, t2;				\
+	vpxor t2, x5, x5;				\
+	vpsrld $8, t0, t2;				\
+	vpshufb t1, t2, t2;				\
+	vpxor t2, x6, x6;				\
+	vpshufb t1, t0, t2;				\
+	vpxor t2, x7, x7;
 
 #ifdef CONFIG_AS_GFNI
 #define aria_sbox_8way_gfni(x0, x1, x2, x3,		\
 			    x4, x5, x6, x7,		\
 			    t0, t1, t2, t3,		\
 			    t4, t5, t6, t7)		\
-	vpbroadcastq .Ltf_s2_bitmatrix, t0;		\
-	vpbroadcastq .Ltf_inv_bitmatrix, t1;		\
-	vpbroadcastq .Ltf_id_bitmatrix, t2;		\
-	vpbroadcastq .Ltf_aff_bitmatrix, t3;		\
-	vpbroadcastq .Ltf_x2_bitmatrix, t4;		\
+	vmovdqa .Ltf_s2_bitmatrix, t0;			\
+	vmovdqa .Ltf_inv_bitmatrix, t1;			\
+	vmovdqa .Ltf_id_bitmatrix, t2;			\
+	vmovdqa .Ltf_aff_bitmatrix, t3;			\
+	vmovdqa .Ltf_x2_bitmatrix, t4;			\
 	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;	\
 	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;	\
 	vgf2p8affineqb $(tf_inv_const), t1, x2, x2;	\
@@ -315,10 +324,9 @@ 
 		       x4, x5, x6, x7,			\
 		       t0, t1, t2, t3,			\
 		       t4, t5, t6, t7)			\
-	vpxor t7, t7, t7;				\
 	vmovdqa .Linv_shift_row, t0;			\
 	vmovdqa .Lshift_row, t1;			\
-	vpbroadcastd .L0f0f0f0f, t6;			\
+	vbroadcastss .L0f0f0f0f, t6;			\
 	vmovdqa .Ltf_lo__inv_aff__and__s2, t2;		\
 	vmovdqa .Ltf_hi__inv_aff__and__s2, t3;		\
 	vmovdqa .Ltf_lo__x2__and__fwd_aff, t4;		\
@@ -413,8 +421,9 @@ 
 		y0, y1, y2, y3,				\
 		y4, y5, y6, y7,				\
 		mem_tmp, rk, round)			\
+	vpxor y7, y7, y7;				\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 8, round);		\
+		      y0, y7, y2, rk, 8, round);	\
 							\
 	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
 		       y0, y1, y2, y3, y4, y5, y6, y7);	\
@@ -429,7 +438,7 @@ 
 			     x4, x5, x6, x7,		\
 			     mem_tmp, 0);		\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 0, round);		\
+		      y0, y7, y2, rk, 0, round);	\
 							\
 	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
 		       y0, y1, y2, y3, y4, y5, y6, y7);	\
@@ -467,8 +476,9 @@ 
 		y0, y1, y2, y3,				\
 		y4, y5, y6, y7,				\
 		mem_tmp, rk, round)			\
+	vpxor y7, y7, y7;				\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 8, round);		\
+		      y0, y7, y2, rk, 8, round);	\
 							\
 	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 		       y0, y1, y2, y3, y4, y5, y6, y7);	\
@@ -483,7 +493,7 @@ 
 			     x4, x5, x6, x7,		\
 			     mem_tmp, 0);		\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 0, round);		\
+		      y0, y7, y2, rk, 0, round);	\
 							\
 	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 		       y0, y1, y2, y3, y4, y5, y6, y7);	\
@@ -521,14 +531,15 @@ 
 		y0, y1, y2, y3,				\
 		y4, y5, y6, y7,				\
 		mem_tmp, rk, round, last_round)		\
+	vpxor y7, y7, y7;				\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 8, round);		\
+		      y0, y7, y2, rk, 8, round);	\
 							\
 	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
 		       y0, y1, y2, y3, y4, y5, y6, y7);	\
 							\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 8, last_round);		\
+		      y0, y7, y2, rk, 8, last_round);	\
 							\
 	aria_store_state_8way(x0, x1, x2, x3,		\
 			      x4, x5, x6, x7,		\
@@ -538,13 +549,13 @@ 
 			     x4, x5, x6, x7,		\
 			     mem_tmp, 0);		\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 0, round);		\
+		      y0, y7, y2, rk, 0, round);	\
 							\
 	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
 		       y0, y1, y2, y3, y4, y5, y6, y7);	\
 							\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 0, last_round);		\
+		      y0, y7, y2, rk, 0, last_round);	\
 							\
 	aria_load_state_8way(y0, y1, y2, y3,		\
 			     y4, y5, y6, y7,		\
@@ -556,8 +567,9 @@ 
 		     y0, y1, y2, y3,			\
 		     y4, y5, y6, y7,			\
 		     mem_tmp, rk, round)		\
+	vpxor y7, y7, y7;				\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 8, round);		\
+		      y0, y7, y2, rk, 8, round);	\
 							\
 	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
 			    x6, x7, x4, x5,		\
@@ -574,7 +586,7 @@ 
 			     x4, x5, x6, x7,		\
 			     mem_tmp, 0);		\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 0, round);		\
+		      y0, y7, y2, rk, 0, round);	\
 							\
 	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
 			    x6, x7, x4, x5,		\
@@ -614,8 +626,9 @@ 
 		     y0, y1, y2, y3,			\
 		     y4, y5, y6, y7,			\
 		     mem_tmp, rk, round)		\
+	vpxor y7, y7, y7;				\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 8, round);		\
+		      y0, y7, y2, rk, 8, round);	\
 							\
 	aria_sbox_8way_gfni(x0, x1, x2, x3, 		\
 			    x4, x5, x6, x7,		\
@@ -632,7 +645,7 @@ 
 			     x4, x5, x6, x7,		\
 			     mem_tmp, 0);		\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 0, round);		\
+		      y0, y7, y2, rk, 0, round);	\
 							\
 	aria_sbox_8way_gfni(x0, x1, x2, x3, 		\
 			    x4, x5, x6, x7,		\
@@ -672,8 +685,9 @@ 
 		y0, y1, y2, y3,				\
 		y4, y5, y6, y7,				\
 		mem_tmp, rk, round, last_round)		\
+	vpxor y7, y7, y7;				\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 8, round);		\
+		      y0, y7, y2, rk, 8, round);	\
 							\
 	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
 			    x6, x7, x4, x5,		\
@@ -681,7 +695,7 @@ 
 			    y4, y5, y6, y7);		\
 							\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 8, last_round);		\
+		      y0, y7, y2, rk, 8, last_round);	\
 							\
 	aria_store_state_8way(x0, x1, x2, x3,		\
 			      x4, x5, x6, x7,		\
@@ -691,7 +705,7 @@ 
 			     x4, x5, x6, x7,		\
 			     mem_tmp, 0);		\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 0, round);		\
+		      y0, y7, y2, rk, 0, round);	\
 							\
 	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
 			    x6, x7, x4, x5,		\
@@ -699,7 +713,7 @@ 
 			    y4, y5, y6, y7);		\
 							\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 0, last_round);		\
+		      y0, y7, y2, rk, 0, last_round);	\
 							\
 	aria_load_state_8way(y0, y1, y2, y3,		\
 			     y4, y5, y6, y7,		\
@@ -772,6 +786,14 @@ 
 		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
 		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
 		    BV8(0, 0, 0, 1, 1, 1, 1, 1))
+	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
+		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
+		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
+		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
+		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
+		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
+		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
+		    BV8(0, 0, 0, 1, 1, 1, 1, 1))
 
 /* AES inverse affine: */
 #define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
@@ -784,6 +806,14 @@ 
 		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
 		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
 		    BV8(0, 1, 0, 0, 1, 0, 1, 0))
+	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
+		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
+		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
+		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
+		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
+		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
+		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
+		    BV8(0, 1, 0, 0, 1, 0, 1, 0))
 
 /* S2: */
 #define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
@@ -796,6 +826,14 @@ 
 		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
 		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
 		    BV8(1, 1, 1, 1, 0, 1, 1, 0))
+	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
+		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
+		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
+		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
+		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
+		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
+		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
+		    BV8(1, 1, 1, 1, 0, 1, 1, 0))
 
 /* X2: */
 #define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
@@ -808,6 +846,14 @@ 
 		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
 		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
 		    BV8(1, 0, 0, 1, 0, 0, 1, 1))
+	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
+		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
+		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
+		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
+		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
+		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
+		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
+		    BV8(1, 0, 0, 1, 0, 0, 1, 1))
 
 /* Identity matrix: */
 .Ltf_id_bitmatrix:
@@ -819,6 +865,14 @@ 
 		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
 		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
 		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
+	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
+		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
+		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
+		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
+		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
+		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
+		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
+		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
 #endif /* CONFIG_AS_GFNI */
 
 /* 4-bit mask */