From patchwork Mon Jun 24 17:38:30 2019
X-Patchwork-Submitter: Ard Biesheuvel
X-Patchwork-Id: 167632
From: Ard Biesheuvel
To: linux-crypto@vger.kernel.org
Cc: herbert@gondor.apana.org.au, ebiggers@kernel.org,
    linux-arm-kernel@lists.infradead.org, steve.capper@arm.com,
    Ard Biesheuvel
Subject: [PATCH 1/2] crypto: arm64/aes-ce - add 5 way interleave routines
Date: Mon, 24 Jun 2019 19:38:30 +0200
Message-Id: <20190624173831.8375-2-ard.biesheuvel@linaro.org>
In-Reply-To: <20190624173831.8375-1-ard.biesheuvel@linaro.org>
References: <20190624173831.8375-1-ard.biesheuvel@linaro.org>

In preparation for tweaking the accelerated AES chaining mode routines
to use a 5-way stride, implement the core routines to support
processing 5 blocks of input at a time. While at it, drop the 2-way
versions, which have been unused for a while now.

Signed-off-by: Ard Biesheuvel
---
 arch/arm64/crypto/aes-ce.S    | 58 ++++++++++++--------
 arch/arm64/crypto/aes-modes.S | 16 ++++++
 arch/arm64/crypto/aes-neon.S  | 46 +---------------
 3 files changed, 52 insertions(+), 68 deletions(-)

-- 
2.20.1
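As a reading aid, and not part of the patch itself: with the new i4
argument wired through below, an invocation such as
round_Nx e, v17, v0, v1, v2, v3, v4 expands to one aese/aesmc pair per
AES state, all keyed by the same round key:

	aese	v0.16b, v17.16b		/* AddRoundKey+SubBytes+ShiftRows */
	aesmc	v0.16b, v0.16b		/* MixColumns */
	aese	v1.16b, v17.16b
	aesmc	v1.16b, v1.16b
	aese	v2.16b, v17.16b
	aesmc	v2.16b, v2.16b
	aese	v3.16b, v17.16b
	aesmc	v3.16b, v3.16b
	aese	v4.16b, v17.16b
	aesmc	v4.16b, v4.16b

The five dependency chains are independent of each other, so a core
with a deep pipeline can issue them back to back instead of stalling on
the previous round of the same block; adjacent aese/aesmc pairs on the
same register are also fusion candidates on many arm64 implementations.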
diff --git a/arch/arm64/crypto/aes-ce.S b/arch/arm64/crypto/aes-ce.S
index 143070510809..0fca5f463406 100644
--- a/arch/arm64/crypto/aes-ce.S
+++ b/arch/arm64/crypto/aes-ce.S
@@ -52,7 +52,7 @@
 	load_round_keys	\rounds, \temp
 	.endm
 
-	.macro	do_enc_Nx, de, mc, k, i0, i1, i2, i3
+	.macro	do_enc_Nx, de, mc, k, i0, i1, i2, i3, i4
 	aes\de	\i0\().16b, \k\().16b
 	aes\mc	\i0\().16b, \i0\().16b
 	.ifnb	\i1
@@ -63,27 +63,34 @@
 	aes\mc	\i2\().16b, \i2\().16b
 	aes\de	\i3\().16b, \k\().16b
 	aes\mc	\i3\().16b, \i3\().16b
+	.ifnb	\i4
+	aes\de	\i4\().16b, \k\().16b
+	aes\mc	\i4\().16b, \i4\().16b
+	.endif
 	.endif
 	.endif
 	.endm
 
-	/* up to 4 interleaved encryption rounds with the same round key */
-	.macro	round_Nx, enc, k, i0, i1, i2, i3
+	/* up to 5 interleaved encryption rounds with the same round key */
+	.macro	round_Nx, enc, k, i0, i1, i2, i3, i4
 	.ifc	\enc, e
-	do_enc_Nx	e, mc, \k, \i0, \i1, \i2, \i3
+	do_enc_Nx	e, mc, \k, \i0, \i1, \i2, \i3, \i4
 	.else
-	do_enc_Nx	d, imc, \k, \i0, \i1, \i2, \i3
+	do_enc_Nx	d, imc, \k, \i0, \i1, \i2, \i3, \i4
 	.endif
 	.endm
 
-	/* up to 4 interleaved final rounds */
-	.macro	fin_round_Nx, de, k, k2, i0, i1, i2, i3
+	/* up to 5 interleaved final rounds */
+	.macro	fin_round_Nx, de, k, k2, i0, i1, i2, i3, i4
 	aes\de	\i0\().16b, \k\().16b
 	.ifnb	\i1
 	aes\de	\i1\().16b, \k\().16b
 	.ifnb	\i3
 	aes\de	\i2\().16b, \k\().16b
 	aes\de	\i3\().16b, \k\().16b
+	.ifnb	\i4
+	aes\de	\i4\().16b, \k\().16b
+	.endif
 	.endif
 	.endif
 	eor	\i0\().16b, \i0\().16b, \k2\().16b
@@ -92,47 +99,52 @@
 	.ifnb	\i3
 	eor	\i2\().16b, \i2\().16b, \k2\().16b
 	eor	\i3\().16b, \i3\().16b, \k2\().16b
+	.ifnb	\i4
+	eor	\i4\().16b, \i4\().16b, \k2\().16b
+	.endif
 	.endif
 	.endif
 	.endm
 
-	/* up to 4 interleaved blocks */
-	.macro	do_block_Nx, enc, rounds, i0, i1, i2, i3
+	/* up to 5 interleaved blocks */
+	.macro	do_block_Nx, enc, rounds, i0, i1, i2, i3, i4
 	cmp	\rounds, #12
 	blo	2222f		/* 128 bits */
 	beq	1111f		/* 192 bits */
-	round_Nx	\enc, v17, \i0, \i1, \i2, \i3
-	round_Nx	\enc, v18, \i0, \i1, \i2, \i3
-1111:	round_Nx	\enc, v19, \i0, \i1, \i2, \i3
-	round_Nx	\enc, v20, \i0, \i1, \i2, \i3
+	round_Nx	\enc, v17, \i0, \i1, \i2, \i3, \i4
+	round_Nx	\enc, v18, \i0, \i1, \i2, \i3, \i4
+1111:	round_Nx	\enc, v19, \i0, \i1, \i2, \i3, \i4
+	round_Nx	\enc, v20, \i0, \i1, \i2, \i3, \i4
 2222:	.irp	key, v21, v22, v23, v24, v25, v26, v27, v28, v29
-	round_Nx	\enc, \key, \i0, \i1, \i2, \i3
+	round_Nx	\enc, \key, \i0, \i1, \i2, \i3, \i4
 	.endr
-	fin_round_Nx	\enc, v30, v31, \i0, \i1, \i2, \i3
+	fin_round_Nx	\enc, v30, v31, \i0, \i1, \i2, \i3, \i4
 	.endm
 
 	.macro	encrypt_block, in, rounds, t0, t1, t2
 	do_block_Nx	e, \rounds, \in
 	.endm
 
-	.macro	encrypt_block2x, i0, i1, rounds, t0, t1, t2
-	do_block_Nx	e, \rounds, \i0, \i1
-	.endm
-
 	.macro	encrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2
 	do_block_Nx	e, \rounds, \i0, \i1, \i2, \i3
 	.endm
 
-	.macro	decrypt_block, in, rounds, t0, t1, t2
-	do_block_Nx	d, \rounds, \in
+	.macro	encrypt_block5x, i0, i1, i2, i3, i4, rounds, t0, t1, t2
+	do_block_Nx	e, \rounds, \i0, \i1, \i2, \i3, \i4
 	.endm
 
-	.macro	decrypt_block2x, i0, i1, rounds, t0, t1, t2
-	do_block_Nx	d, \rounds, \i0, \i1
+	.macro	decrypt_block, in, rounds, t0, t1, t2
+	do_block_Nx	d, \rounds, \in
 	.endm
 
 	.macro	decrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2
 	do_block_Nx	d, \rounds, \i0, \i1, \i2, \i3
 	.endm
 
+	.macro	decrypt_block5x, i0, i1, i2, i3, i4, rounds, t0, t1, t2
+	do_block_Nx	d, \rounds, \i0, \i1, \i2, \i3, \i4
+	.endm
+
+#define MAX_STRIDE	5
+
 #include "aes-modes.S"
diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S
index 4ebc61375aa6..0dbeaf2ce9b1 100644
--- a/arch/arm64/crypto/aes-modes.S
+++ b/arch/arm64/crypto/aes-modes.S
@@ -13,6 +13,10 @@
 	.text
 	.align		4
 
+#ifndef MAX_STRIDE
+#define MAX_STRIDE	4
+#endif
+
 aes_encrypt_block4x:
 	encrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
 	ret
@@ -23,6 +27,18 @@ aes_decrypt_block4x:
 	ret
 ENDPROC(aes_decrypt_block4x)
 
+#if MAX_STRIDE == 5
+aes_encrypt_block5x:
+	encrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
+	ret
+ENDPROC(aes_encrypt_block5x)
+
+aes_decrypt_block5x:
+	decrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
+	ret
+ENDPROC(aes_decrypt_block5x)
+#endif
+
 	/*
 	 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 	 *		   int blocks)
diff --git a/arch/arm64/crypto/aes-neon.S b/arch/arm64/crypto/aes-neon.S
index 29100f692e8a..33bb6af309a3 100644
--- a/arch/arm64/crypto/aes-neon.S
+++ b/arch/arm64/crypto/aes-neon.S
@@ -117,26 +117,9 @@
 
 	/*
 	 * Interleaved versions: functionally equivalent to the
-	 * ones above, but applied to 2 or 4 AES states in parallel.
+	 * ones above, but applied to AES states in parallel.
 	 */
 
-	.macro	sub_bytes_2x, in0, in1
-	sub	v8.16b, \in0\().16b, v15.16b
-	tbl	\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
-	sub	v9.16b, \in1\().16b, v15.16b
-	tbl	\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
-	sub	v10.16b, v8.16b, v15.16b
-	tbx	\in0\().16b, {v20.16b-v23.16b}, v8.16b
-	sub	v11.16b, v9.16b, v15.16b
-	tbx	\in1\().16b, {v20.16b-v23.16b}, v9.16b
-	sub	v8.16b, v10.16b, v15.16b
-	tbx	\in0\().16b, {v24.16b-v27.16b}, v10.16b
-	sub	v9.16b, v11.16b, v15.16b
-	tbx	\in1\().16b, {v24.16b-v27.16b}, v11.16b
-	tbx	\in0\().16b, {v28.16b-v31.16b}, v8.16b
-	tbx	\in1\().16b, {v28.16b-v31.16b}, v9.16b
-	.endm
-
 	.macro	sub_bytes_4x, in0, in1, in2, in3
 	sub	v8.16b, \in0\().16b, v15.16b
 	tbl	\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
@@ -215,25 +198,6 @@
 	eor	\in1\().16b, \in1\().16b, v11.16b
 	.endm
 
-	.macro	do_block_2x, enc, in0, in1, rounds, rk, rkp, i
-	ld1	{v15.4s}, [\rk]
-	add	\rkp, \rk, #16
-	mov	\i, \rounds
-1111:	eor	\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
-	eor	\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
-	movi	v15.16b, #0x40
-	tbl	\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
-	tbl	\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
-	sub_bytes_2x	\in0, \in1
-	subs	\i, \i, #1
-	ld1	{v15.4s}, [\rkp], #16
-	beq	2222f
-	mix_columns_2x	\in0, \in1, \enc
-	b	1111b
-2222:	eor	\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
-	eor	\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
-	.endm
-
 	.macro	do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i
 	ld1	{v15.4s}, [\rk]
 	add	\rkp, \rk, #16
@@ -260,14 +224,6 @@
 	eor	\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
 	.endm
 
-	.macro	encrypt_block2x, in0, in1, rounds, rk, rkp, i
-	do_block_2x	1, \in0, \in1, \rounds, \rk, \rkp, \i
-	.endm
-
-	.macro	decrypt_block2x, in0, in1, rounds, rk, rkp, i
-	do_block_2x	0, \in0, \in1, \rounds, \rk, \rkp, \i
-	.endm
-
 	.macro	encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
 	do_block_4x	1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
 	.endm
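Also illustrative rather than part of the patch: the last AES round has
no MixColumns step, so with all five states supplied,
fin_round_Nx e, v30, v31, v0, v1, v2, v3, v4 expands to a bare aese per
state with the next-to-last round key, followed by an eor with the
final one:

	aese	v0.16b, v30.16b
	aese	v1.16b, v30.16b
	aese	v2.16b, v30.16b
	aese	v3.16b, v30.16b
	aese	v4.16b, v30.16b
	eor	v0.16b, v0.16b, v31.16b
	eor	v1.16b, v1.16b, v31.16b
	eor	v2.16b, v2.16b, v31.16b
	eor	v3.16b, v3.16b, v31.16b
	eor	v4.16b, v4.16b, v31.16b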
From patchwork Mon Jun 24 17:38:31 2019
X-Patchwork-Submitter: Ard Biesheuvel
X-Patchwork-Id: 167633
From: Ard Biesheuvel
To: linux-crypto@vger.kernel.org
Cc: herbert@gondor.apana.org.au, ebiggers@kernel.org,
    linux-arm-kernel@lists.infradead.org, steve.capper@arm.com,
    Ard Biesheuvel
Subject: [PATCH 2/2] crypto: arm64/aes-ce - implement 5 way interleave for ECB, CBC and CTR
Date: Mon, 24 Jun 2019 19:38:31 +0200
Message-Id: <20190624173831.8375-3-ard.biesheuvel@linaro.org>
In-Reply-To: <20190624173831.8375-1-ard.biesheuvel@linaro.org>
References: <20190624173831.8375-1-ard.biesheuvel@linaro.org>

This implements 5-way interleaving for ECB, CBC decryption and CTR,
resulting in a speedup of ~11% on Marvell ThunderX2, which has a very
deep pipeline and therefore a high issue latency for NEON instructions
operating on the same registers.

Note that XTS is left alone: implementing 5-way interleave there would
either involve spilling the calculated tweaks to the stack, or
recalculating them after the encryption operation, and either of those
would most likely penalize low end cores.

For ECB, this is not a concern at all, given that we have plenty of
spare registers. For CTR and CBC decryption, we take advantage of the
fact that v16 is not used by the CE version of the code (which is the
only one targeted by the optimization), so we can reshuffle the code a
bit and avoid having to spill to memory (with the exception of one
extra reload in the CBC routine).

Signed-off-by: Ard Biesheuvel
---
 arch/arm64/crypto/aes-ce.S    |   2 +
 arch/arm64/crypto/aes-modes.S | 102 ++++++++++++++------
 arch/arm64/crypto/aes-neon.S  |   2 +
 3 files changed, 75 insertions(+), 31 deletions(-)

-- 
2.20.1
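To see how the ST4/ST5 markers introduced below behave, here is, for
illustration and not from the patch itself, what the ECB encryption
inner loop assembles to once aes-ce.S defines MAX_STRIDE as 5, i.e.
with every ST4() line dropped and every ST5() line kept:

	.LecbencloopNx:
		subs	w4, w4, #5			/* MAX_STRIDE */
		bmi	.Lecbenc1x
		ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
		ld1	{v4.16b}, [x1], #16		/* ST5: get a 5th block */
		bl	aes_encrypt_block5x		/* ST5: 5-way call */
		st1	{v0.16b-v3.16b}, [x0], #64
		st1	{v4.16b}, [x0], #16		/* ST5: store the 5th block */
		b	.LecbencloopNx

With MAX_STRIDE left at 4 (the NEON fallback), the same source
assembles to the original 4-way loop, so a single copy of the mode
code serves both strides.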
diff --git a/arch/arm64/crypto/aes-ce.S b/arch/arm64/crypto/aes-ce.S
index 0fca5f463406..dbfc04ea20c7 100644
--- a/arch/arm64/crypto/aes-ce.S
+++ b/arch/arm64/crypto/aes-ce.S
@@ -18,6 +18,8 @@
 	.arch		armv8-a+crypto
 
 	xtsmask		.req	v16
+	cbciv		.req	v16
+	vctr		.req	v16
 
 	.macro	xts_reload_mask, tmp
 	.endm
diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S
index 0dbeaf2ce9b1..42edc34e8f01 100644
--- a/arch/arm64/crypto/aes-modes.S
+++ b/arch/arm64/crypto/aes-modes.S
@@ -17,6 +17,14 @@
 #define MAX_STRIDE	4
 #endif
 
+#if MAX_STRIDE == 4
+#define ST4(x...) x
+#define ST5(x...)
+#else
+#define ST4(x...)
+#define ST5(x...) x
+#endif
+
 aes_encrypt_block4x:
 	encrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
 	ret
@@ -53,14 +61,17 @@ AES_ENTRY(aes_ecb_encrypt)
 	enc_prepare	w3, x2, x5
 
 .LecbencloopNx:
-	subs	w4, w4, #4
+	subs	w4, w4, #MAX_STRIDE
 	bmi	.Lecbenc1x
 	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
-	bl	aes_encrypt_block4x
+ST4(	bl	aes_encrypt_block4x		)
+ST5(	ld1	{v4.16b}, [x1], #16		)
+ST5(	bl	aes_encrypt_block5x		)
 	st1	{v0.16b-v3.16b}, [x0], #64
+ST5(	st1	{v4.16b}, [x0], #16		)
 	b	.LecbencloopNx
 .Lecbenc1x:
-	adds	w4, w4, #4
+	adds	w4, w4, #MAX_STRIDE
 	beq	.Lecbencout
 .Lecbencloop:
 	ld1	{v0.16b}, [x1], #16		/* get next pt block */
@@ -81,14 +92,17 @@ AES_ENTRY(aes_ecb_decrypt)
 	dec_prepare	w3, x2, x5
 
 .LecbdecloopNx:
-	subs	w4, w4, #4
+	subs	w4, w4, #MAX_STRIDE
 	bmi	.Lecbdec1x
 	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
-	bl	aes_decrypt_block4x
+ST4(	bl	aes_decrypt_block4x		)
+ST5(	ld1	{v4.16b}, [x1], #16		)
+ST5(	bl	aes_decrypt_block5x		)
 	st1	{v0.16b-v3.16b}, [x0], #64
+ST5(	st1	{v4.16b}, [x0], #16		)
 	b	.LecbdecloopNx
 .Lecbdec1x:
-	adds	w4, w4, #4
+	adds	w4, w4, #MAX_STRIDE
 	beq	.Lecbdecout
 .Lecbdecloop:
 	ld1	{v0.16b}, [x1], #16		/* get next ct block */
@@ -148,39 +162,56 @@ AES_ENTRY(aes_cbc_decrypt)
 	stp	x29, x30, [sp, #-16]!
 	mov	x29, sp
 
-	ld1	{v7.16b}, [x5]			/* get iv */
+	ld1	{cbciv.16b}, [x5]		/* get iv */
 	dec_prepare	w3, x2, x6
 
 .LcbcdecloopNx:
-	subs	w4, w4, #4
+	subs	w4, w4, #MAX_STRIDE
 	bmi	.Lcbcdec1x
 	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
+#if MAX_STRIDE == 5
+	ld1	{v4.16b}, [x1], #16		/* get 1 ct block */
+	mov	v5.16b, v0.16b
+	mov	v6.16b, v1.16b
+	mov	v7.16b, v2.16b
+	bl	aes_decrypt_block5x
+	sub	x1, x1, #32
+	eor	v0.16b, v0.16b, cbciv.16b
+	eor	v1.16b, v1.16b, v5.16b
+	ld1	{v5.16b}, [x1], #16		/* reload 1 ct block */
+	ld1	{cbciv.16b}, [x1], #16		/* reload 1 ct block */
+	eor	v2.16b, v2.16b, v6.16b
+	eor	v3.16b, v3.16b, v7.16b
+	eor	v4.16b, v4.16b, v5.16b
+#else
 	mov	v4.16b, v0.16b
 	mov	v5.16b, v1.16b
 	mov	v6.16b, v2.16b
 	bl	aes_decrypt_block4x
 	sub	x1, x1, #16
-	eor	v0.16b, v0.16b, v7.16b
+	eor	v0.16b, v0.16b, cbciv.16b
 	eor	v1.16b, v1.16b, v4.16b
-	ld1	{v7.16b}, [x1], #16		/* reload 1 ct block */
+	ld1	{cbciv.16b}, [x1], #16		/* reload 1 ct block */
 	eor	v2.16b, v2.16b, v5.16b
 	eor	v3.16b, v3.16b, v6.16b
+#endif
 	st1	{v0.16b-v3.16b}, [x0], #64
+ST5(	st1	{v4.16b}, [x0], #16		)
 	b	.LcbcdecloopNx
 .Lcbcdec1x:
-	adds	w4, w4, #4
+	adds	w4, w4, #MAX_STRIDE
 	beq	.Lcbcdecout
 .Lcbcdecloop:
 	ld1	{v1.16b}, [x1], #16		/* get next ct block */
 	mov	v0.16b, v1.16b			/* ...and copy to v0 */
 	decrypt_block	v0, w3, x2, x6, w7
-	eor	v0.16b, v0.16b, v7.16b		/* xor with iv => pt */
-	mov	v7.16b, v1.16b			/* ct is next iv */
+	eor	v0.16b, v0.16b, cbciv.16b	/* xor with iv => pt */
+	mov	cbciv.16b, v1.16b		/* ct is next iv */
 	st1	{v0.16b}, [x0], #16
 	subs	w4, w4, #1
 	bne	.Lcbcdecloop
 .Lcbcdecout:
-	st1	{v7.16b}, [x5]			/* return iv */
+	st1	{cbciv.16b}, [x5]		/* return iv */
 	ldp	x29, x30, [sp], #16
 	ret
 AES_ENDPROC(aes_cbc_decrypt)
@@ -373,51 +404,60 @@ AES_ENTRY(aes_ctr_encrypt)
 	mov	x29, sp
 
 	enc_prepare	w3, x2, x6
-	ld1	{v4.16b}, [x5]
+	ld1	{vctr.16b}, [x5]
 
-	umov	x6, v4.d[1]		/* keep swabbed ctr in reg */
+	umov	x6, vctr.d[1]		/* keep swabbed ctr in reg */
 	rev	x6, x6
 	cmn	w6, w4			/* 32 bit overflow? */
 	bcs	.Lctrloop
 .LctrloopNx:
-	subs	w4, w4, #4
+	subs	w4, w4, #MAX_STRIDE
 	bmi	.Lctr1x
 	add	w7, w6, #1
-	mov	v0.16b, v4.16b
+	mov	v0.16b, vctr.16b
 	add	w8, w6, #2
-	mov	v1.16b, v4.16b
+	mov	v1.16b, vctr.16b
+	add	w9, w6, #3
+	mov	v2.16b, vctr.16b
 	add	w9, w6, #3
-	mov	v2.16b, v4.16b
 	rev	w7, w7
-	mov	v3.16b, v4.16b
+	mov	v3.16b, vctr.16b
 	rev	w8, w8
+ST5(	mov	v4.16b, vctr.16b	)
 	mov	v1.s[3], w7
 	rev	w9, w9
+ST5(	add	w10, w6, #4		)
 	mov	v2.s[3], w8
+ST5(	rev	w10, w10		)
 	mov	v3.s[3], w9
+ST5(	mov	v4.s[3], w10		)
 	ld1	{v5.16b-v7.16b}, [x1], #48	/* get 3 input blocks */
-	bl	aes_encrypt_block4x
+ST4(	bl	aes_encrypt_block4x	)
+ST5(	bl	aes_encrypt_block5x	)
 	eor	v0.16b, v5.16b, v0.16b
-	ld1	{v5.16b}, [x1], #16		/* get 1 input block */
+ST4(	ld1	{v5.16b}, [x1], #16	)
 	eor	v1.16b, v6.16b, v1.16b
+ST5(	ld1	{v5.16b-v6.16b}, [x1], #32	)
 	eor	v2.16b, v7.16b, v2.16b
 	eor	v3.16b, v5.16b, v3.16b
+ST5(	eor	v4.16b, v6.16b, v4.16b	)
 	st1	{v0.16b-v3.16b}, [x0], #64
-	add	x6, x6, #4
+ST5(	st1	{v4.16b}, [x0], #16	)
+	add	x6, x6, #MAX_STRIDE
 	rev	x7, x6
-	ins	v4.d[1], x7
+	ins	vctr.d[1], x7
 	cbz	w4, .Lctrout
 	b	.LctrloopNx
 .Lctr1x:
-	adds	w4, w4, #4
+	adds	w4, w4, #MAX_STRIDE
 	beq	.Lctrout
 .Lctrloop:
-	mov	v0.16b, v4.16b
+	mov	v0.16b, vctr.16b
 	encrypt_block	v0, w3, x2, x8, w7
 
 	adds	x6, x6, #1		/* increment BE ctr */
 	rev	x7, x6
-	ins	v4.d[1], x7
+	ins	vctr.d[1], x7
 	bcs	.Lctrcarry		/* overflow? */
 
 .Lctrcarrydone:
@@ -429,7 +469,7 @@ AES_ENTRY(aes_ctr_encrypt)
 	bne	.Lctrloop
 
 .Lctrout:
-	st1	{v4.16b}, [x5]		/* return next CTR value */
+	st1	{vctr.16b}, [x5]	/* return next CTR value */
 	ldp	x29, x30, [sp], #16
 	ret
 
@@ -438,11 +478,11 @@ AES_ENTRY(aes_ctr_encrypt)
 	b	.Lctrout
 
 .Lctrcarry:
-	umov	x7, v4.d[0]		/* load upper word of ctr */
+	umov	x7, vctr.d[0]		/* load upper word of ctr */
 	rev	x7, x7			/* ... to handle the carry */
 	add	x7, x7, #1
 	rev	x7, x7
-	ins	v4.d[0], x7
+	ins	vctr.d[0], x7
 	b	.Lctrcarrydone
 AES_ENDPROC(aes_ctr_encrypt)

diff --git a/arch/arm64/crypto/aes-neon.S b/arch/arm64/crypto/aes-neon.S
index 33bb6af309a3..8bd66a6c4749 100644
--- a/arch/arm64/crypto/aes-neon.S
+++ b/arch/arm64/crypto/aes-neon.S
@@ -15,6 +15,8 @@
 #define AES_ENDPROC(func)	ENDPROC(neon_ ## func)
 
 	xtsmask		.req	v7
+	cbciv		.req	v7
+	vctr		.req	v4
 
 	.macro		xts_reload_mask, tmp
 	xts_load_mask	\tmp
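To make the CBC register reshuffling described in the commit message
easier to follow, here is the 5-way decryption step again with
explanatory comments added (the comments are editorial, the
instructions are from the hunk above). v0-v4 carry the five ciphertext
blocks; v5-v7 save copies of blocks 0-2, which become the chaining
values for blocks 1-3; no spare register is left for a copy of block 3,
which is why it is reloaded from memory afterwards (the one extra
reload), while block 4 is reloaded straight into cbciv (v16) as the
next iteration's IV:

	ld1	{v0.16b-v3.16b}, [x1], #64	/* ct0..ct3 */
	ld1	{v4.16b}, [x1], #16		/* ct4 */
	mov	v5.16b, v0.16b			/* save ct0: iv for block 1 */
	mov	v6.16b, v1.16b			/* save ct1: iv for block 2 */
	mov	v7.16b, v2.16b			/* save ct2: iv for block 3 */
	bl	aes_decrypt_block5x		/* v0..v4 := decrypted blocks */
	sub	x1, x1, #32			/* rewind to ct3 */
	eor	v0.16b, v0.16b, cbciv.16b	/* xor with chained iv */
	eor	v1.16b, v1.16b, v5.16b
	ld1	{v5.16b}, [x1], #16		/* the extra reload: ct3 */
	ld1	{cbciv.16b}, [x1], #16		/* ct4 becomes the next iv */
	eor	v2.16b, v2.16b, v6.16b
	eor	v3.16b, v3.16b, v7.16b
	eor	v4.16b, v4.16b, v5.16b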