From patchwork Sat Jan 28 23:25:38 2017
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Ard Biesheuvel <ard.biesheuvel@linaro.org>
X-Patchwork-Id: 92775
Delivered-To: patch@linaro.org
Received: by 10.140.20.99 with SMTP id 90csp847289qgi;
 Sat, 28 Jan 2017 15:26:31 -0800 (PST)
X-Received: by 10.98.144.22 with SMTP id a22mr16211066pfe.160.1485645991765; 
 Sat, 28 Jan 2017 15:26:31 -0800 (PST)
Return-Path: <linux-crypto-owner@vger.kernel.org>
Received: from vger.kernel.org (vger.kernel.org. [209.132.180.67])
 by mx.google.com with ESMTP id d65si8383902pfl.73.2017.01.28.15.26.31;
 Sat, 28 Jan 2017 15:26:31 -0800 (PST)
Received-SPF: pass (google.com: best guess record for domain of
 linux-crypto-owner@vger.kernel.org designates 209.132.180.67
 as permitted sender) client-ip=209.132.180.67; 
Authentication-Results: mx.google.com;
 dkim=neutral (body hash did not verify) header.i=@linaro.org;
 spf=pass (google.com: best guess record for domain of
 linux-crypto-owner@vger.kernel.org designates 209.132.180.67
 as permitted sender)
 smtp.mailfrom=linux-crypto-owner@vger.kernel.org; 
 dmarc=fail (p=NONE sp=NONE dis=NONE) header.from=linaro.org
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
 id S1752609AbdA1X0Y (ORCPT <rfc822;victor.chong@linaro.org>
 + 1 other); Sat, 28 Jan 2017 18:26:24 -0500
Received: from mail-wm0-f47.google.com ([74.125.82.47]:37298 "EHLO
 mail-wm0-f47.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
 with ESMTP id S1752494AbdA1X0X (ORCPT
 <rfc822;linux-crypto@vger.kernel.org>);
 Sat, 28 Jan 2017 18:26:23 -0500
Received: by mail-wm0-f47.google.com with SMTP id d140so1641133wmd.0
 for <linux-crypto@vger.kernel.org>;
 Sat, 28 Jan 2017 15:26:13 -0800 (PST)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linaro.org; s=google; 
 h=from:to:cc:subject:date:message-id:in-reply-to:references;
 bh=9CAIQkYU/XyC31J6gMAyBSvd8YatUmzR0gUDMVzx5W0=;
 b=dEFT9z3bLKunOQsw0MvWYNdnmuNLTq5/XDH3AoGjU6Eqm33GL8hpG1EN7V3doZwwOJ
 KH2B7noWSE0+m31SGnx81W1lSRrVRB46AjcLt+HkheGyIhobpFVUDQ/55WvJGEOC+R+f
 4/Q/1SWCOBfFZvwlX7ZgC4RMyZhUCmxckO8m8=
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
 d=1e100.net; s=20161025;
 h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to
 :references;
 bh=9CAIQkYU/XyC31J6gMAyBSvd8YatUmzR0gUDMVzx5W0=;
 b=cFzdiADdQvR98zdlYLWd7T9UtjRKbLyMMyxEkwgLlqfWAgHW5LDA0DhYj7s7VXSSh+
 ZUsgo2lQXSsgQOqCsu+5rDMf+nwj3UueuZvcJhbyr3fJe8K9RXw1+aVosccPdeEyzGus
 Ir0X3uaawehHyEtelJ2zMK30qdCk3QwhauvBGrCPhBLxgUjzzQ/P2Y4K9W1fM9WBkNxa
 IZr0FKMV2d/0Y+g9ThxbK+ZoLpBwGY82CeLbTwGW9zr89tdTMz9wjXhyBxxiD1qtwfhr
 nAkQshGBUTL9KlPGYFvXA7dP3zUfMe1/WIii2VuLzn231NMfUag7jba2vZnMbUM+cYZN
 XfbA==
X-Gm-Message-State: AIkVDXLruN2KNiKZavdSoZ16JvKRUfJFRzuY8Oe0hxvupf+eLdgM0m/87MGCQTYd6gLLldOR
X-Received: by 10.28.48.78 with SMTP id w75mr9198347wmw.55.1485645971947;
 Sat, 28 Jan 2017 15:26:11 -0800 (PST)
Received: from localhost.localdomain ([160.163.215.165])
 by smtp.gmail.com with ESMTPSA id
 33sm14992064wrd.34.2017.01.28.15.26.08
 (version=TLS1_2 cipher=ECDHE-RSA-AES128-SHA bits=128/128);
 Sat, 28 Jan 2017 15:26:11 -0800 (PST)
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
To: linux-crypto@vger.kernel.org
Cc: linux-arm-kernel@lists.infradead.org, herbert@gondor.apana.org.au,
 Ard Biesheuvel <ard.biesheuvel@linaro.org>
Subject: [PATCH v3 09/10] crypto: arm64/aes-neon-blk - tweak performance for
 low end cores
Date: Sat, 28 Jan 2017 23:25:38 +0000
Message-Id: <1485645939-17126-10-git-send-email-ard.biesheuvel@linaro.org>
X-Mailer: git-send-email 2.7.4
In-Reply-To: <1485645939-17126-1-git-send-email-ard.biesheuvel@linaro.org>
References: <1485645939-17126-1-git-send-email-ard.biesheuvel@linaro.org>
Sender: linux-crypto-owner@vger.kernel.org
Precedence: bulk
List-ID: <linux-crypto.vger.kernel.org>
X-Mailing-List: linux-crypto@vger.kernel.org

The non-bitsliced AES implementation using NEON is highly sensitive
to micro-architectural details, and, as it turns out, the Cortex-A53 on
the Raspberry Pi 3 is a core that can benefit from this code, given that
its scalar AES performance is abysmal (32.9 cycles per byte).

The new bitsliced AES code manages 19.8 cycles per byte on this core,
but can only operate on 8 blocks at a time, which is not supported by
all chaining modes. With a bit of tweaking, we can get the plain NEON
code to run at 22.0 cycles per byte, making it useful for sequential
modes like CBC encryption. (Like bitsliced NEON, the plain NEON
implementation does not use any lookup tables, which makes it easy on
the D-cache, and invulnerable to cache timing attacks.)

So tweak the plain NEON AES code to use tbl instructions rather than
shl/sri pairs, and to avoid the need to reload permutation vectors or
other constants from memory in every round. Also, improve the decryption
performance by switching to 16x8 pmul instructions for performing the
multiplications in GF(2^8).

To allow the ECB and CBC encrypt routines to be reused by the bitsliced
NEON code in a subsequent patch, export them from the module.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm64/crypto/aes-glue.c |   2 +
 arch/arm64/crypto/aes-neon.S | 235 +++++++++-----------
 2 files changed, 102 insertions(+), 135 deletions(-)

-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c
index 8ee1fb7aaa4f..055bc3f61138 100644
--- a/arch/arm64/crypto/aes-glue.c
+++ b/arch/arm64/crypto/aes-glue.c
@@ -409,5 +409,7 @@ static int __init aes_init(void)
 module_cpu_feature_match(AES, aes_init);
 #else
 module_init(aes_init);
+EXPORT_SYMBOL(neon_aes_ecb_encrypt);
+EXPORT_SYMBOL(neon_aes_cbc_encrypt);
 #endif
 module_exit(aes_exit);
diff --git a/arch/arm64/crypto/aes-neon.S b/arch/arm64/crypto/aes-neon.S
index 85f07ead7c5c..f1e3aa2732f9 100644
--- a/arch/arm64/crypto/aes-neon.S
+++ b/arch/arm64/crypto/aes-neon.S
@@ -1,7 +1,7 @@
 /*
  * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON
  *
- * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2013 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -17,17 +17,25 @@
 	/* multiply by polynomial 'x' in GF(2^8) */
 	.macro		mul_by_x, out, in, temp, const
 	sshr		\temp, \in, #7
-	add		\out, \in, \in
+	shl		\out, \in, #1
 	and		\temp, \temp, \const
 	eor		\out, \out, \temp
 	.endm
 
+	/* multiply by polynomial 'x^2' in GF(2^8) */
+	.macro		mul_by_x2, out, in, temp, const
+	ushr		\temp, \in, #6
+	shl		\out, \in, #2
+	pmul		\temp, \temp, \const
+	eor		\out, \out, \temp
+	.endm
+
 	/* preload the entire Sbox */
 	.macro		prepare, sbox, shiftrows, temp
 	adr		\temp, \sbox
-	movi		v12.16b, #0x40
+	movi		v12.16b, #0x1b
 	ldr		q13, \shiftrows
-	movi		v14.16b, #0x1b
+	ldr		q14, .Lror32by8
 	ld1		{v16.16b-v19.16b}, [\temp], #64
 	ld1		{v20.16b-v23.16b}, [\temp], #64
 	ld1		{v24.16b-v27.16b}, [\temp], #64
@@ -50,37 +58,31 @@
 
 	/* apply SubBytes transformation using the the preloaded Sbox */
 	.macro		sub_bytes, in
-	sub		v9.16b, \in\().16b, v12.16b
+	sub		v9.16b, \in\().16b, v15.16b
 	tbl		\in\().16b, {v16.16b-v19.16b}, \in\().16b
-	sub		v10.16b, v9.16b, v12.16b
+	sub		v10.16b, v9.16b, v15.16b
 	tbx		\in\().16b, {v20.16b-v23.16b}, v9.16b
-	sub		v11.16b, v10.16b, v12.16b
+	sub		v11.16b, v10.16b, v15.16b
 	tbx		\in\().16b, {v24.16b-v27.16b}, v10.16b
 	tbx		\in\().16b, {v28.16b-v31.16b}, v11.16b
 	.endm
 
 	/* apply MixColumns transformation */
-	.macro		mix_columns, in
-	mul_by_x	v10.16b, \in\().16b, v9.16b, v14.16b
-	rev32		v8.8h, \in\().8h
-	eor		\in\().16b, v10.16b, \in\().16b
-	shl		v9.4s, v8.4s, #24
-	shl		v11.4s, \in\().4s, #24
-	sri		v9.4s, v8.4s, #8
-	sri		v11.4s, \in\().4s, #8
-	eor		v9.16b, v9.16b, v8.16b
-	eor		v10.16b, v10.16b, v9.16b
-	eor		\in\().16b, v10.16b, v11.16b
-	.endm
-
+	.macro		mix_columns, in, enc
+	.if		\enc == 0
 	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
-	.macro		inv_mix_columns, in
-	mul_by_x	v11.16b, \in\().16b, v10.16b, v14.16b
-	mul_by_x	v11.16b, v11.16b, v10.16b, v14.16b
-	eor		\in\().16b, \in\().16b, v11.16b
-	rev32		v11.8h, v11.8h
-	eor		\in\().16b, \in\().16b, v11.16b
-	mix_columns	\in
+	mul_by_x2	v8.16b, \in\().16b, v9.16b, v12.16b
+	eor		\in\().16b, \in\().16b, v8.16b
+	rev32		v8.8h, v8.8h
+	eor		\in\().16b, \in\().16b, v8.16b
+	.endif
+
+	mul_by_x	v9.16b, \in\().16b, v8.16b, v12.16b
+	rev32		v8.8h, \in\().8h
+	eor		v8.16b, v8.16b, v9.16b
+	eor		\in\().16b, \in\().16b, v8.16b
+	tbl		\in\().16b, {\in\().16b}, v14.16b
+	eor		\in\().16b, \in\().16b, v8.16b
 	.endm
 
 	.macro		do_block, enc, in, rounds, rk, rkp, i
@@ -88,16 +90,13 @@
 	add		\rkp, \rk, #16
 	mov		\i, \rounds
 1111:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
+	movi		v15.16b, #0x40
 	tbl		\in\().16b, {\in\().16b}, v13.16b	/* ShiftRows */
 	sub_bytes	\in
-	ld1		{v15.4s}, [\rkp], #16
 	subs		\i, \i, #1
+	ld1		{v15.4s}, [\rkp], #16
 	beq		2222f
-	.if		\enc == 1
-	mix_columns	\in
-	.else
-	inv_mix_columns	\in
-	.endif
+	mix_columns	\in, \enc
 	b		1111b
 2222:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
 	.endm
@@ -116,139 +115,114 @@
 	 */
 
 	.macro		sub_bytes_2x, in0, in1
-	sub		v8.16b, \in0\().16b, v12.16b
-	sub		v9.16b, \in1\().16b, v12.16b
+	sub		v8.16b, \in0\().16b, v15.16b
 	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
+	sub		v9.16b, \in1\().16b, v15.16b
 	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
-	sub		v10.16b, v8.16b, v12.16b
-	sub		v11.16b, v9.16b, v12.16b
+	sub		v10.16b, v8.16b, v15.16b
 	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b
+	sub		v11.16b, v9.16b, v15.16b
 	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b
-	sub		v8.16b, v10.16b, v12.16b
-	sub		v9.16b, v11.16b, v12.16b
+	sub		v8.16b, v10.16b, v15.16b
 	tbx		\in0\().16b, {v24.16b-v27.16b}, v10.16b
+	sub		v9.16b, v11.16b, v15.16b
 	tbx		\in1\().16b, {v24.16b-v27.16b}, v11.16b
 	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b
 	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b
 	.endm
 
 	.macro		sub_bytes_4x, in0, in1, in2, in3
-	sub		v8.16b, \in0\().16b, v12.16b
+	sub		v8.16b, \in0\().16b, v15.16b
 	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
-	sub		v9.16b, \in1\().16b, v12.16b
+	sub		v9.16b, \in1\().16b, v15.16b
 	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
-	sub		v10.16b, \in2\().16b, v12.16b
+	sub		v10.16b, \in2\().16b, v15.16b
 	tbl		\in2\().16b, {v16.16b-v19.16b}, \in2\().16b
-	sub		v11.16b, \in3\().16b, v12.16b
+	sub		v11.16b, \in3\().16b, v15.16b
 	tbl		\in3\().16b, {v16.16b-v19.16b}, \in3\().16b
 	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b
 	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b
-	sub		v8.16b, v8.16b, v12.16b
+	sub		v8.16b, v8.16b, v15.16b
 	tbx		\in2\().16b, {v20.16b-v23.16b}, v10.16b
-	sub		v9.16b, v9.16b, v12.16b
+	sub		v9.16b, v9.16b, v15.16b
 	tbx		\in3\().16b, {v20.16b-v23.16b}, v11.16b
-	sub		v10.16b, v10.16b, v12.16b
+	sub		v10.16b, v10.16b, v15.16b
 	tbx		\in0\().16b, {v24.16b-v27.16b}, v8.16b
-	sub		v11.16b, v11.16b, v12.16b
+	sub		v11.16b, v11.16b, v15.16b
 	tbx		\in1\().16b, {v24.16b-v27.16b}, v9.16b
-	sub		v8.16b, v8.16b, v12.16b
+	sub		v8.16b, v8.16b, v15.16b
 	tbx		\in2\().16b, {v24.16b-v27.16b}, v10.16b
-	sub		v9.16b, v9.16b, v12.16b
+	sub		v9.16b, v9.16b, v15.16b
 	tbx		\in3\().16b, {v24.16b-v27.16b}, v11.16b
-	sub		v10.16b, v10.16b, v12.16b
+	sub		v10.16b, v10.16b, v15.16b
 	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b
-	sub		v11.16b, v11.16b, v12.16b
+	sub		v11.16b, v11.16b, v15.16b
 	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b
 	tbx		\in2\().16b, {v28.16b-v31.16b}, v10.16b
 	tbx		\in3\().16b, {v28.16b-v31.16b}, v11.16b
 	.endm
 
 	.macro		mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const
-	sshr		\tmp0\().16b, \in0\().16b,  #7
-	add		\out0\().16b, \in0\().16b,  \in0\().16b
-	sshr		\tmp1\().16b, \in1\().16b,  #7
+	sshr		\tmp0\().16b, \in0\().16b, #7
+	shl		\out0\().16b, \in0\().16b, #1
+	sshr		\tmp1\().16b, \in1\().16b, #7
 	and		\tmp0\().16b, \tmp0\().16b, \const\().16b
-	add		\out1\().16b, \in1\().16b,  \in1\().16b
+	shl		\out1\().16b, \in1\().16b, #1
 	and		\tmp1\().16b, \tmp1\().16b, \const\().16b
 	eor		\out0\().16b, \out0\().16b, \tmp0\().16b
 	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
 	.endm
 
-	.macro		mix_columns_2x, in0, in1
-	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v14
-	rev32		v10.8h, \in0\().8h
-	rev32		v11.8h, \in1\().8h
-	eor		\in0\().16b, v8.16b, \in0\().16b
-	eor		\in1\().16b, v9.16b, \in1\().16b
-	shl		v12.4s, v10.4s, #24
-	shl		v13.4s, v11.4s, #24
-	eor		v8.16b, v8.16b, v10.16b
-	sri		v12.4s, v10.4s, #8
-	shl		v10.4s, \in0\().4s, #24
-	eor		v9.16b, v9.16b, v11.16b
-	sri		v13.4s, v11.4s, #8
-	shl		v11.4s, \in1\().4s, #24
-	sri		v10.4s, \in0\().4s, #8
-	eor		\in0\().16b, v8.16b, v12.16b
-	sri		v11.4s, \in1\().4s, #8
-	eor		\in1\().16b, v9.16b, v13.16b
-	eor		\in0\().16b, v10.16b, \in0\().16b
-	eor		\in1\().16b, v11.16b, \in1\().16b
+	.macro		mul_by_x2_2x, out0, out1, in0, in1, tmp0, tmp1, const
+	ushr		\tmp0\().16b, \in0\().16b, #6
+	shl		\out0\().16b, \in0\().16b, #2
+	ushr		\tmp1\().16b, \in1\().16b, #6
+	pmul		\tmp0\().16b, \tmp0\().16b, \const\().16b
+	shl		\out1\().16b, \in1\().16b, #2
+	pmul		\tmp1\().16b, \tmp1\().16b, \const\().16b
+	eor		\out0\().16b, \out0\().16b, \tmp0\().16b
+	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
 	.endm
 
-	.macro		inv_mix_cols_2x, in0, in1
-	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v14
-	mul_by_x_2x	v8, v9, v8, v9, v10, v11, v14
+	.macro		mix_columns_2x, in0, in1, enc
+	.if		\enc == 0
+	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
+	mul_by_x2_2x	v8, v9, \in0, \in1, v10, v11, v12
 	eor		\in0\().16b, \in0\().16b, v8.16b
-	eor		\in1\().16b, \in1\().16b, v9.16b
 	rev32		v8.8h, v8.8h
-	rev32		v9.8h, v9.8h
-	eor		\in0\().16b, \in0\().16b, v8.16b
-	eor		\in1\().16b, \in1\().16b, v9.16b
-	mix_columns_2x	\in0, \in1
-	.endm
-
-	.macro		inv_mix_cols_4x, in0, in1, in2, in3
-	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v14
-	mul_by_x_2x	v10, v11, \in2, \in3, v12, v13, v14
-	mul_by_x_2x	v8, v9, v8, v9, v12, v13, v14
-	mul_by_x_2x	v10, v11, v10, v11, v12, v13, v14
-	eor		\in0\().16b, \in0\().16b, v8.16b
 	eor		\in1\().16b, \in1\().16b, v9.16b
-	eor		\in2\().16b, \in2\().16b, v10.16b
-	eor		\in3\().16b, \in3\().16b, v11.16b
-	rev32		v8.8h, v8.8h
 	rev32		v9.8h, v9.8h
-	rev32		v10.8h, v10.8h
-	rev32		v11.8h, v11.8h
 	eor		\in0\().16b, \in0\().16b, v8.16b
 	eor		\in1\().16b, \in1\().16b, v9.16b
-	eor		\in2\().16b, \in2\().16b, v10.16b
-	eor		\in3\().16b, \in3\().16b, v11.16b
-	mix_columns_2x	\in0, \in1
-	mix_columns_2x	\in2, \in3
+	.endif
+
+	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v12
+	rev32		v10.8h, \in0\().8h
+	rev32		v11.8h, \in1\().8h
+	eor		v10.16b, v10.16b, v8.16b
+	eor		v11.16b, v11.16b, v9.16b
+	eor		\in0\().16b, \in0\().16b, v10.16b
+	eor		\in1\().16b, \in1\().16b, v11.16b
+	tbl		\in0\().16b, {\in0\().16b}, v14.16b
+	tbl		\in1\().16b, {\in1\().16b}, v14.16b
+	eor		\in0\().16b, \in0\().16b, v10.16b
+	eor		\in1\().16b, \in1\().16b, v11.16b
 	.endm
 
-	.macro		do_block_2x, enc, in0, in1 rounds, rk, rkp, i
+	.macro		do_block_2x, enc, in0, in1, rounds, rk, rkp, i
 	ld1		{v15.4s}, [\rk]
 	add		\rkp, \rk, #16
 	mov		\i, \rounds
 1111:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
 	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
-	sub_bytes_2x	\in0, \in1
+	movi		v15.16b, #0x40
 	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
 	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
-	ld1		{v15.4s}, [\rkp], #16
+	sub_bytes_2x	\in0, \in1
 	subs		\i, \i, #1
+	ld1		{v15.4s}, [\rkp], #16
 	beq		2222f
-	.if		\enc == 1
-	mix_columns_2x	\in0, \in1
-	ldr		q13, .LForward_ShiftRows
-	.else
-	inv_mix_cols_2x	\in0, \in1
-	ldr		q13, .LReverse_ShiftRows
-	.endif
-	movi		v12.16b, #0x40
+	mix_columns_2x	\in0, \in1, \enc
 	b		1111b
 2222:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
 	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
@@ -262,23 +236,17 @@
 	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
 	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
 	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
-	sub_bytes_4x	\in0, \in1, \in2, \in3
+	movi		v15.16b, #0x40
 	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
 	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
 	tbl		\in2\().16b, {\in2\().16b}, v13.16b	/* ShiftRows */
 	tbl		\in3\().16b, {\in3\().16b}, v13.16b	/* ShiftRows */
-	ld1		{v15.4s}, [\rkp], #16
+	sub_bytes_4x	\in0, \in1, \in2, \in3
 	subs		\i, \i, #1
+	ld1		{v15.4s}, [\rkp], #16
 	beq		2222f
-	.if		\enc == 1
-	mix_columns_2x	\in0, \in1
-	mix_columns_2x	\in2, \in3
-	ldr		q13, .LForward_ShiftRows
-	.else
-	inv_mix_cols_4x	\in0, \in1, \in2, \in3
-	ldr		q13, .LReverse_ShiftRows
-	.endif
-	movi		v12.16b, #0x40
+	mix_columns_2x	\in0, \in1, \enc
+	mix_columns_2x	\in2, \in3, \enc
 	b		1111b
 2222:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
 	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
@@ -305,19 +273,7 @@
 #include "aes-modes.S"
 
 	.text
-	.align		4
-.LForward_ShiftRows:
-CPU_LE(	.byte		0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3	)
-CPU_LE(	.byte		0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb	)
-CPU_BE(	.byte		0xb, 0x6, 0x1, 0xc, 0x7, 0x2, 0xd, 0x8	)
-CPU_BE(	.byte		0x3, 0xe, 0x9, 0x4, 0xf, 0xa, 0x5, 0x0	)
-
-.LReverse_ShiftRows:
-CPU_LE(	.byte		0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb	)
-CPU_LE(	.byte		0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3	)
-CPU_BE(	.byte		0x3, 0x6, 0x9, 0xc, 0xf, 0x2, 0x5, 0x8	)
-CPU_BE(	.byte		0xb, 0xe, 0x1, 0x4, 0x7, 0xa, 0xd, 0x0	)
-
+	.align		6
 .LForward_Sbox:
 	.byte		0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
 	.byte		0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
@@ -385,3 +341,12 @@ CPU_BE(	.byte		0xb, 0xe, 0x1, 0x4, 0x7, 0xa, 0xd, 0x0	)
 	.byte		0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
 	.byte		0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
 	.byte		0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+
+.LForward_ShiftRows:
+	.octa		0x0b06010c07020d08030e09040f0a0500
+
+.LReverse_ShiftRows:
+	.octa		0x0306090c0f0205080b0e0104070a0d00
+
+.Lror32by8:
+	.octa		0x0c0f0e0d080b0a090407060500030201