From patchwork Tue Jul 18 12:06:43 2017
X-Patchwork-Submitter: Ard Biesheuvel
X-Patchwork-Id: 108127
From: Ard Biesheuvel
To: linux-crypto@vger.kernel.org, herbert@gondor.apana.org.au,
	nico@linaro.org, ebiggers@google.com
Cc: Ard Biesheuvel
Subject: [PATCH v4 6/8] crypto: arm64/aes - avoid expanded lookup tables in the final round
Date: Tue, 18 Jul 2017 13:06:43 +0100
Message-Id: <20170718120645.15880-7-ard.biesheuvel@linaro.org>
X-Mailer: git-send-email 2.9.3
In-Reply-To: <20170718120645.15880-1-ard.biesheuvel@linaro.org>
References: <20170718120645.15880-1-ard.biesheuvel@linaro.org>
X-Mailing-List: linux-crypto@vger.kernel.org

For the final round, avoid the expanded and padded lookup tables
exported by the generic AES driver. Instead, for encryption, we can
perform byte loads from the same table we used for the inner rounds,
which will still be hot in the caches. For decryption, use the inverse
AES Sbox exported by the generic AES driver, which is 4x smaller than
its expanded inverse lookup table.
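As a rough illustration of the idea (a minimal C model, not code from this
patch; the single-table ft_tab/inv_sbox names, types and layout below are
simplifying assumptions, and the tables are mostly elided): the final round
only needs the plain S-box (or inverse S-box) byte, and for encryption that
byte already sits in one lane of each 32-bit entry of the table used for the
inner rounds, so no separate padded final-round table is required.

#include <stdint.h>
#include <stdio.h>

/*
 * Simplified model: a single 256-entry forward T-table whose 32-bit
 * words have the little-endian byte layout { 2*S[x], S[x], S[x], 3*S[x] },
 * plus the plain 256-byte inverse S-box. Names are illustrative, not
 * the exact kernel symbols; all but the first entries are elided.
 */
static const uint32_t ft_tab[256] = { 0xa56363c6 /* , ... */ };
static const uint8_t inv_sbox[256] = { 0x52 /* , ... */ };

/* Inner rounds: full 32-bit T-table lookup (this table stays hot). */
static uint32_t tt_word(uint8_t x)
{
	return ft_tab[x];
}

/*
 * Final encryption round: only S[x] is needed. Instead of a second,
 * padded final-round table, pull the S-box byte out of the hot T-table
 * entry; on little-endian it is the byte at offset 4*x + 1, which is
 * what using 'crypto_ft_tab + 1' as the last-round table achieves.
 */
static uint8_t fsbox(uint8_t x)
{
	return (uint8_t)(ft_tab[x] >> 8);
}

/* Final decryption round: use the 256-byte inverse S-box directly. */
static uint8_t isbox(uint8_t x)
{
	return inv_sbox[x];
}

/*
 * One output column of the final encryption round: SubBytes, ShiftRows
 * and AddRoundKey assembled from four byte lookups (cf. the lsl/ror
 * combining in the byte-sized __hround path).
 */
static uint32_t enc_final_column(uint32_t rk, uint32_t c0, uint32_t c1,
				 uint32_t c2, uint32_t c3)
{
	return rk ^
	       (uint32_t)fsbox(c0 & 0xff) ^
	       ((uint32_t)fsbox((c1 >> 8) & 0xff) << 8) ^
	       ((uint32_t)fsbox((c2 >> 16) & 0xff) << 16) ^
	       ((uint32_t)fsbox((c3 >> 24) & 0xff) << 24);
}

int main(void)
{
	printf("T[0] = %#x, S[0] = %#x, inv_S[0] = %#x\n",
	       tt_word(0), fsbox(0), isbox(0));
	printf("final column for an all-zero state/key = %#x\n",
	       enc_final_column(0, 0, 0, 0, 0));
	return 0;
}

This is the effect of selecting 'crypto_ft_tab + 1' and
'crypto_aes_inv_sbox' as the last-round tables in the hunk below.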
This significantly reduces the Dcache footprint of our code, and does
not introduce any additional module dependencies, given that we already
rely on the core AES module for the shared key expansion routines. It
also frees up register x18, which is not available as a scratch
register on all platforms, so avoiding it improves the shareability of
this code.

Signed-off-by: Ard Biesheuvel
---
 arch/arm64/crypto/aes-cipher-core.S | 155 ++++++++++++++------
 1 file changed, 108 insertions(+), 47 deletions(-)

-- 
2.9.3

diff --git a/arch/arm64/crypto/aes-cipher-core.S b/arch/arm64/crypto/aes-cipher-core.S
index bbe5dd96135c..fe807f164d83 100644
--- a/arch/arm64/crypto/aes-cipher-core.S
+++ b/arch/arm64/crypto/aes-cipher-core.S
@@ -18,99 +18,160 @@
 	out		.req	x1
 	in		.req	x2
 	rounds		.req	x3
-	tt		.req	x4
-	lt		.req	x2
+	tt		.req	x2
 
-	.macro		__pair, enc, reg0, reg1, in0, in1e, in1d, shift
+	.macro		__ubf1, reg0, reg1, in0, in1e, in1d, sz, shift
 	ubfx		\reg0, \in0, #\shift, #8
-	.if		\enc
 	ubfx		\reg1, \in1e, #\shift, #8
-	.else
+	.endm
+
+	.macro		__ubf0, reg0, reg1, in0, in1e, in1d, sz, shift
+	ubfx		\reg0, \in0, #\shift, #8
 	ubfx		\reg1, \in1d, #\shift, #8
+	.endm
+
+	.macro		__ubf1b, reg0, reg1, in0, in1e, in1d, sz, shift
+	.if		\shift == 0 && \sz > 0
+	ubfiz		\reg0, \in0, #\sz, #8
+	ubfiz		\reg1, \in1e, #\sz, #8
+	.else
+	__ubf1		\reg0, \reg1, \in0, \in1e, \in1d, \sz, \shift
+	.endif
+	.endm
+
+	.macro		__ubf0b, reg0, reg1, in0, in1e, in1d, sz, shift
+	.if		\shift == 0 && \sz > 0
+	ubfiz		\reg0, \in0, #\sz, #8
+	ubfiz		\reg1, \in1d, #\sz, #8
+	.else
+	__ubf0		\reg0, \reg1, \in0, \in1e, \in1d, \sz, \shift
 	.endif
+	.endm
+
+	/*
+	 * AArch64 cannot do byte size indexed loads from a table containing
+	 * 32-bit quantities, i.e., 'ldrb w12, [tt, w12, uxtw #2]' is not a
+	 * valid instruction.
+	 *
+	 * For shift == 0, we can simply fold the size shift of the index
+	 * into the ubfx instruction, by switching to ubfiz and using \sz as
+	 * the destination offset.
+	 * For shift > 0, we perform a 32-bit wide load instead, which does
+	 * allow an index shift of 2, and discard the high bytes later using
+	 * uxtb or lsl #24.
+	 */
+	.macro		__pair, enc, sz, op, reg0, reg1, in0, in1e, in1d, shift
+	__ubf\enc\op	\reg0, \reg1, \in0, \in1e, \in1d, \sz, \shift
+	.ifnc		\op\sz, b2
+	ldr\op		\reg0, [tt, \reg0, uxtw #\sz]
+	ldr\op		\reg1, [tt, \reg1, uxtw #\sz]
+	.elseif		\shift == 0
+	ldrb		\reg0, [tt, \reg0, uxtw]
+	ldrb		\reg1, [tt, \reg1, uxtw]
+	.else
 	ldr		\reg0, [tt, \reg0, uxtw #2]
 	ldr		\reg1, [tt, \reg1, uxtw #2]
+	.endif
 	.endm
 
-	.macro		__hround, out0, out1, in0, in1, in2, in3, t0, t1, enc
+	.macro		__hround, out0, out1, in0, in1, in2, in3, t0, t1, enc, sz, op
 	ldp		\out0, \out1, [rk], #8
-	__pair		\enc, w13, w14, \in0, \in1, \in3, 0
-	__pair		\enc, w15, w16, \in1, \in2, \in0, 8
-	__pair		\enc, w17, w18, \in2, \in3, \in1, 16
-	__pair		\enc, \t0, \t1, \in3, \in0, \in2, 24
-
-	eor		\out0, \out0, w13
-	eor		\out1, \out1, w14
-	eor		\out0, \out0, w15, ror #24
-	eor		\out1, \out1, w16, ror #24
-	eor		\out0, \out0, w17, ror #16
-	eor		\out1, \out1, w18, ror #16
-	eor		\out0, \out0, \t0, ror #8
-	eor		\out1, \out1, \t1, ror #8
+	__pair		\enc, \sz, \op, w12, w13, \in0, \in1, \in3, 0
+	__pair		\enc, \sz, \op, w14, w15, \in3, \in0, \in2, 24
+	__pair		\enc, \sz, \op, w16, w17, \in2, \in3, \in1, 16
+	__pair		\enc, \sz, \op, \t0, \t1, \in1, \in2, \in0, 8
+
+	eor		\out0, \out0, w12
+	eor		\out1, \out1, w13
+
+	.ifnc		\op\sz, b2
+	eor		\out0, \out0, w14, ror #8
+	eor		\out1, \out1, w15, ror #8
+	.else
+CPU_BE(	lsr		w14, w14, #24	)
+CPU_BE(	lsr		w15, w15, #24	)
+
+	eor		\out0, \out0, w14, lsl #24
+	eor		\out1, \out1, w15, lsl #24
+
+CPU_LE(	uxtb		w16, w16	)
+CPU_LE(	uxtb		w17, w17	)
+CPU_LE(	uxtb		\t0, \t0	)
+CPU_LE(	uxtb		\t1, \t1	)
+
+CPU_BE(	lsr		w16, w16, #24	)
+CPU_BE(	lsr		w17, w17, #24	)
+CPU_BE(	lsr		\t0, \t0, #24	)
+CPU_BE(	lsr		\t1, \t1, #24	)
+	.endif
+
+	eor		\out0, \out0, w16, ror #16
+	eor		\out1, \out1, w17, ror #16
+	eor		\out0, \out0, \t0, ror #24
+	eor		\out1, \out1, \t1, ror #24
 	.endm
 
-	.macro		fround, out0, out1, out2, out3, in0, in1, in2, in3
-	__hround	\out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1
-	__hround	\out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1
+	.macro		fround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op
+	__hround	\out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1, \sz, \op
+	__hround	\out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1, \sz, \op
 	.endm
 
-	.macro		iround, out0, out1, out2, out3, in0, in1, in2, in3
-	__hround	\out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0
-	__hround	\out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0
+	.macro		iround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op
+	__hround	\out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0, \sz, \op
+	__hround	\out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op
 	.endm
 
-	.macro		do_crypt, round, ttab, ltab
-	ldp		w5, w6, [in]
-	ldp		w7, w8, [in, #8]
-	ldp		w9, w10, [rk], #16
-	ldp		w11, w12, [rk, #-8]
+	.macro		do_crypt, round, ttab, ltab, bsz
+	ldp		w4, w5, [in]
+	ldp		w6, w7, [in, #8]
+	ldp		w8, w9, [rk], #16
+	ldp		w10, w11, [rk, #-8]
 
+CPU_BE(	rev		w4, w4		)
 CPU_BE(	rev		w5, w5		)
 CPU_BE(	rev		w6, w6		)
 CPU_BE(	rev		w7, w7		)
-CPU_BE(	rev		w8, w8		)
 
+	eor		w4, w4, w8
 	eor		w5, w5, w9
 	eor		w6, w6, w10
 	eor		w7, w7, w11
-	eor		w8, w8, w12
 
 	adr_l		tt, \ttab
-	adr_l		lt, \ltab
 
 	tbnz		rounds, #1, 1f
 
-0:	\round		w9, w10, w11, w12, w5, w6, w7, w8
-	\round		w5, w6, w7, w8, w9, w10, w11, w12
+0:	\round		w8, w9, w10, w11, w4, w5, w6, w7
+	\round		w4, w5, w6, w7, w8, w9, w10, w11
 
 1:	subs		rounds, rounds, #4
-	\round		w9, w10, w11, w12, w5, w6, w7, w8
-	csel		tt, tt, lt, hi
-	\round		w5, w6, w7, w8, w9, w10, w11, w12
-	b.hi		0b
-
+	\round		w8, w9, w10, w11, w4, w5, w6, w7
+	b.ls		3f
+2:	\round		w4, w5, w6, w7, w8, w9, w10, w11
+	b		0b
+3:	adr_l		tt, \ltab
+	\round		w4, w5, w6, w7, w8, w9, w10, w11, \bsz, b
+
+CPU_BE(	rev		w4, w4		)
 CPU_BE(	rev		w5, w5		)
 CPU_BE(	rev		w6, w6		)
 CPU_BE(	rev		w7, w7		)
-CPU_BE(	rev		w8, w8		)
 
-	stp		w5, w6, [out]
-	stp		w7, w8, [out, #8]
+	stp		w4, w5, [out]
+	stp		w6, w7, [out, #8]
 	ret
 	.endm
 
 	.align		7
 aes_table_reduced	crypto_ft_tab
-aes_table_reduced	crypto_fl_tab
 aes_table_reduced	crypto_it_tab
-aes_table_reduced	crypto_il_tab
 
 ENTRY(__aes_arm64_encrypt)
-	do_crypt	fround, crypto_ft_tab, crypto_fl_tab
+	do_crypt	fround, crypto_ft_tab, crypto_ft_tab + 1, 2
 ENDPROC(__aes_arm64_encrypt)
 
 	.align		5
 ENTRY(__aes_arm64_decrypt)
-	do_crypt	iround, crypto_it_tab, crypto_il_tab
+	do_crypt	iround, crypto_it_tab, crypto_aes_inv_sbox, 0
 ENDPROC(__aes_arm64_decrypt)
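Purely to illustrate the addressing trick described in the comment in the
hunk above (fold the entry-size shift into ubfiz when the byte sits at bit 0,
otherwise do a 32-bit load at the scaled offset and discard the high bytes),
here is a little-endian-only C sketch; the helper name and parameters are
assumptions made for the example, not code from the patch:

#include <stdint.h>
#include <string.h>

/*
 * 'tab' is the byte view of the table base passed in tt for the final
 * round (e.g. (const uint8_t *)crypto_ft_tab + 1 for encryption), 'col'
 * is one 32-bit column of the state, and 'shift' selects which byte of
 * that column is used as the table index.
 */
uint8_t pair_byte(const uint8_t *tab, uint32_t col, unsigned int shift)
{
	if (shift == 0) {
		/*
		 * ubfiz case: extract the byte and scale it by the 4-byte
		 * entry size in one step, then use a plain byte load with
		 * an unscaled register offset (ldrb ..., uxtw).
		 */
		uint32_t idx = (col & 0xff) << 2;

		return tab[idx];
	} else {
		/*
		 * ubfx + 'ldr ..., uxtw #2' case: a byte load cannot use a
		 * scaled index, so load the full (possibly unaligned)
		 * 32-bit quantity at the scaled offset and keep only the
		 * low byte (uxtb here; lsr #24 on big-endian).
		 */
		uint32_t idx = (col >> shift) & 0xff;
		uint32_t word;

		memcpy(&word, tab + 4 * idx, sizeof(word));
		return (uint8_t)word;
	}
}

Both paths return the same byte, tab[4 * index]; with tab pointing one byte
into the forward T-table, that is the plain S-box value, matching the
ldrb/ldr selection made by __pair for the '\bsz, b' final-round invocation.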