From patchwork Sun Jul 29 14:52:30 2018
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Ard Biesheuvel
X-Patchwork-Id: 143113
From: Ard Biesheuvel
To: linux-crypto@vger.kernel.org
Cc: herbert@gondor.apana.org.au, will.deacon@arm.com, dave.martin@arm.com,
    vakul.garg@nxp.com, bigeasy@linutronix.de, Ard Biesheuvel
Subject: [PATCH] crypto: arm64 - revert NEON yield for fast AEAD implementations
Date: Sun, 29 Jul 2018 16:52:30 +0200
Message-Id: <20180729145230.6738-1-ard.biesheuvel@linaro.org>
X-Mailer: git-send-email 2.18.0
Sender: linux-crypto-owner@vger.kernel.org
Precedence: bulk
List-ID:
X-Mailing-List: linux-crypto@vger.kernel.org

As it turns out, checking the TIF_NEED_RESCHED flag after each
iteration results in a significant performance regression (~10%) when
running fast algorithms (i.e., ones that use special instructions and
operate in the < 4 cycles per byte range) on in-order cores with
comparatively slow memory accesses such as the Cortex-A53.
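For reference, the back-of-the-envelope arithmetic behind the worst case
estimate in the next paragraph (assuming the 64k page size, 4 cycles per
byte and 1 GHz clock quoted there):

    65536 bytes x 4 cycles/byte = 262144 cycles
    262144 cycles / 1,000,000,000 cycles/s ~= 262 us, i.e. roughly 250 us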
Given the speed of these ciphers, and the fact that the page based
nature of the AEAD scatterwalk API guarantees that the core NEON
transform is never invoked with more than a single page's worth of
input, we can estimate the worst case duration of any resulting
scheduling blackout: on a 1 GHz Cortex-A53 running with 64k pages,
processing a page's worth of input at 4 cycles per byte results in
a delay of ~250 us, which is a reasonable upper bound.

So let's remove the yield checks from the fused AES-CCM and AES-GCM
routines entirely.

This reverts commit 7b67ae4d5ce8e2f912377f5fbccb95811a92097f and
partially reverts commit 7c50136a8aba8784f07fb66a950cc61a7f3d2ee3.

Fixes: 7c50136a8aba ("crypto: arm64/aes-ghash - yield NEON after every ...")
Fixes: 7b67ae4d5ce8 ("crypto: arm64/aes-ccm - yield NEON after every ...")
Signed-off-by: Ard Biesheuvel
---
This supersedes my series 'crypto/arm64: reduce impact of NEON yield
checks' sent out on the 24th of July.

Given the significant performance regression, we may want to treat
this as a fix (the patches in question went into v4.18).

This patch applies onto my patch 'crypto/arm64: aes-ce-gcm - add
missing kernel_neon_begin/end pair', which I sent out on the 27th of
July and which fixes a data corruption bug; it should also be applied
as a fix.

 arch/arm64/crypto/aes-ce-ccm-core.S | 150 +++++++-------------
 arch/arm64/crypto/ghash-ce-core.S   |  76 ++++------
 2 files changed, 80 insertions(+), 146 deletions(-)

--
2.18.0

Acked-by: Herbert Xu

diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S
index 88f5aef7934c..e3a375c4cb83 100644
--- a/arch/arm64/crypto/aes-ce-ccm-core.S
+++ b/arch/arm64/crypto/aes-ce-ccm-core.S
@@ -19,33 +19,24 @@
  *	u32 *macp, u8 const rk[], u32 rounds);
  */
 ENTRY(ce_aes_ccm_auth_data)
-	frame_push	7
-
-	mov	x19, x0
-	mov	x20, x1
-	mov	x21, x2
-	mov	x22, x3
-	mov	x23, x4
-	mov	x24, x5
-
-	ldr	w25, [x22]			/* leftover from prev round? */
+	ldr	w8, [x3]			/* leftover from prev round? */
 	ld1	{v0.16b}, [x0]			/* load mac */
-	cbz	w25, 1f
-	sub	w25, w25, #16
+	cbz	w8, 1f
+	sub	w8, w8, #16
 	eor	v1.16b, v1.16b, v1.16b
-0:	ldrb	w7, [x20], #1			/* get 1 byte of input */
-	subs	w21, w21, #1
-	add	w25, w25, #1
+0:	ldrb	w7, [x1], #1			/* get 1 byte of input */
+	subs	w2, w2, #1
+	add	w8, w8, #1
 	ins	v1.b[0], w7
 	ext	v1.16b, v1.16b, v1.16b, #1	/* rotate in the input bytes */
 	beq	8f				/* out of input? */
-	cbnz	w25, 0b
+	cbnz	w8, 0b
 	eor	v0.16b, v0.16b, v1.16b
-1:	ld1	{v3.4s}, [x23]			/* load first round key */
-	prfm	pldl1strm, [x20]
-	cmp	w24, #12			/* which key size? */
-	add	x6, x23, #16
-	sub	w7, w24, #2			/* modified # of rounds */
+1:	ld1	{v3.4s}, [x4]			/* load first round key */
+	prfm	pldl1strm, [x1]
+	cmp	w5, #12				/* which key size? */
+	add	x6, x4, #16
+	sub	w7, w5, #2			/* modified # of rounds */
 	bmi	2f
 	bne	5f
 	mov	v5.16b, v3.16b
@@ -64,43 +55,33 @@ ENTRY(ce_aes_ccm_auth_data)
 	ld1	{v5.4s}, [x6], #16		/* load next round key */
 	bpl	3b
 	aese	v0.16b, v4.16b
-	subs	w21, w21, #16			/* last data? */
+	subs	w2, w2, #16			/* last data? */
 	eor	v0.16b, v0.16b, v5.16b		/* final round */
 	bmi	6f
-	ld1	{v1.16b}, [x20], #16		/* load next input block */
+	ld1	{v1.16b}, [x1], #16		/* load next input block */
 	eor	v0.16b, v0.16b, v1.16b		/* xor with mac */
-	beq	6f
-
-	if_will_cond_yield_neon
-	st1	{v0.16b}, [x19]			/* store mac */
-	do_cond_yield_neon
-	ld1	{v0.16b}, [x19]			/* reload mac */
-	endif_yield_neon
-
-	b	1b
-6:	st1	{v0.16b}, [x19]			/* store mac */
+	bne	1b
+6:	st1	{v0.16b}, [x0]			/* store mac */
 	beq	10f
-	adds	w21, w21, #16
+	adds	w2, w2, #16
 	beq	10f
-	mov	w25, w21
-7:	ldrb	w7, [x20], #1
+	mov	w8, w2
+7:	ldrb	w7, [x1], #1
 	umov	w6, v0.b[0]
 	eor	w6, w6, w7
-	strb	w6, [x19], #1
-	subs	w21, w21, #1
+	strb	w6, [x0], #1
+	subs	w2, w2, #1
 	beq	10f
 	ext	v0.16b, v0.16b, v0.16b, #1	/* rotate out the mac bytes */
 	b	7b
-8:	mov	w7, w25
-	add	w25, w25, #16
+8:	mov	w7, w8
+	add	w8, w8, #16
 9:	ext	v1.16b, v1.16b, v1.16b, #1
 	adds	w7, w7, #1
 	bne	9b
 	eor	v0.16b, v0.16b, v1.16b
-	st1	{v0.16b}, [x19]
-10:	str	w25, [x22]
-
-	frame_pop
+	st1	{v0.16b}, [x0]
+10:	str	w8, [x3]
 	ret
 ENDPROC(ce_aes_ccm_auth_data)
 
@@ -145,29 +126,19 @@ ENTRY(ce_aes_ccm_final)
 ENDPROC(ce_aes_ccm_final)
 
 	.macro	aes_ccm_do_crypt,enc
-	frame_push	8
-
-	mov	x19, x0
-	mov	x20, x1
-	mov	x21, x2
-	mov	x22, x3
-	mov	x23, x4
-	mov	x24, x5
-	mov	x25, x6
-
-	ldr	x26, [x25, #8]			/* load lower ctr */
-	ld1	{v0.16b}, [x24]			/* load mac */
-CPU_LE(	rev	x26, x26	)		/* keep swabbed ctr in reg */
+	ldr	x8, [x6, #8]			/* load lower ctr */
+	ld1	{v0.16b}, [x5]			/* load mac */
+CPU_LE(	rev	x8, x8	)			/* keep swabbed ctr in reg */
 0:	/* outer loop */
-	ld1	{v1.8b}, [x25]			/* load upper ctr */
-	prfm	pldl1strm, [x20]
-	add	x26, x26, #1
-	rev	x9, x26
-	cmp	w23, #12			/* which key size? */
-	sub	w7, w23, #2			/* get modified # of rounds */
+	ld1	{v1.8b}, [x6]			/* load upper ctr */
+	prfm	pldl1strm, [x1]
+	add	x8, x8, #1
+	rev	x9, x8
+	cmp	w4, #12				/* which key size? */
+	sub	w7, w4, #2			/* get modified # of rounds */
 	ins	v1.d[1], x9			/* no carry in lower ctr */
-	ld1	{v3.4s}, [x22]			/* load first round key */
-	add	x10, x22, #16
+	ld1	{v3.4s}, [x3]			/* load first round key */
+	add	x10, x3, #16
 	bmi	1f
 	bne	4f
 	mov	v5.16b, v3.16b
@@ -194,9 +165,9 @@ CPU_LE(	rev	x26, x26	)		/* keep swabbed ctr in reg */
 	bpl	2b
 	aese	v0.16b, v4.16b
 	aese	v1.16b, v4.16b
-	subs	w21, w21, #16
-	bmi	7f				/* partial block? */
-	ld1	{v2.16b}, [x20], #16		/* load next input block */
+	subs	w2, w2, #16
+	bmi	6f				/* partial block? */
+	ld1	{v2.16b}, [x1], #16		/* load next input block */
 	.if	\enc == 1
 	eor	v2.16b, v2.16b, v5.16b		/* final round enc+mac */
 	eor	v1.16b, v1.16b, v2.16b		/* xor with crypted ctr */
@@ -205,29 +176,18 @@ CPU_LE(	rev	x26, x26	)		/* keep swabbed ctr in reg */
 	eor	v1.16b, v2.16b, v5.16b		/* final round enc */
 	.endif
 	eor	v0.16b, v0.16b, v2.16b		/* xor mac with pt ^ rk[last] */
-	st1	{v1.16b}, [x19], #16		/* write output block */
-	beq	5f
-
-	if_will_cond_yield_neon
-	st1	{v0.16b}, [x24]			/* store mac */
-	do_cond_yield_neon
-	ld1	{v0.16b}, [x24]			/* reload mac */
-	endif_yield_neon
-
-	b	0b
-5:
-CPU_LE(	rev	x26, x26	)
-	st1	{v0.16b}, [x24]			/* store mac */
-	str	x26, [x25, #8]			/* store lsb end of ctr (BE) */
-
-6:	frame_pop
-	ret
-
-7:	eor	v0.16b, v0.16b, v5.16b		/* final round mac */
+	st1	{v1.16b}, [x0], #16		/* write output block */
+	bne	0b
+CPU_LE(	rev	x8, x8	)
+	st1	{v0.16b}, [x5]			/* store mac */
+	str	x8, [x6, #8]			/* store lsb end of ctr (BE) */
+5:	ret
+
+6:	eor	v0.16b, v0.16b, v5.16b		/* final round mac */
 	eor	v1.16b, v1.16b, v5.16b		/* final round enc */
-	st1	{v0.16b}, [x24]			/* store mac */
-	add	w21, w21, #16			/* process partial tail block */
-8:	ldrb	w9, [x20], #1			/* get 1 byte of input */
+	st1	{v0.16b}, [x5]			/* store mac */
+	add	w2, w2, #16			/* process partial tail block */
+7:	ldrb	w9, [x1], #1			/* get 1 byte of input */
 	umov	w6, v1.b[0]			/* get top crypted ctr byte */
 	umov	w7, v0.b[0]			/* get top mac byte */
 	.if	\enc == 1
@@ -237,13 +197,13 @@ CPU_LE(	rev	x26, x26	)
 	eor	w9, w9, w6
 	eor	w7, w7, w9
 	.endif
-	strb	w9, [x19], #1			/* store out byte */
-	strb	w7, [x24], #1			/* store mac byte */
-	subs	w21, w21, #1
-	beq	6b
+	strb	w9, [x0], #1			/* store out byte */
+	strb	w7, [x5], #1			/* store mac byte */
+	subs	w2, w2, #1
+	beq	5b
 	ext	v0.16b, v0.16b, v0.16b, #1	/* shift out mac byte */
 	ext	v1.16b, v1.16b, v1.16b, #1	/* shift out ctr byte */
-	b	8b
+	b	7b
 	.endm
 
 /*
diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S
index dcffb9e77589..c723647b37db 100644
--- a/arch/arm64/crypto/ghash-ce-core.S
+++ b/arch/arm64/crypto/ghash-ce-core.S
@@ -322,55 +322,41 @@ ENDPROC(pmull_ghash_update_p8)
 	.endm
 
 	.macro	pmull_gcm_do_crypt, enc
-	frame_push	10
+	ld1	{SHASH.2d}, [x4]
+	ld1	{XL.2d}, [x1]
+	ldr	x8, [x5, #8]			// load lower counter
 
-	mov	x19, x0
-	mov	x20, x1
-	mov	x21, x2
-	mov	x22, x3
-	mov	x23, x4
-	mov	x24, x5
-	mov	x25, x6
-	mov	x26, x7
-	.if	\enc == 1
-	ldr	x27, [sp, #96]			// first stacked arg
-	.endif
-
-	ldr	x28, [x24, #8]			// load lower counter
-CPU_LE(	rev	x28, x28	)
-
-0:	mov	x0, x25
-	load_round_keys	w26, x0
-	ld1	{SHASH.2d}, [x23]
-	ld1	{XL.2d}, [x20]
+	load_round_keys	w7, x6
 	movi	MASK.16b, #0xe1
 	ext	SHASH2.16b, SHASH.16b, SHASH.16b, #8
+CPU_LE(	rev	x8, x8	)
 	shl	MASK.2d, MASK.2d, #57
 	eor	SHASH2.16b, SHASH2.16b, SHASH.16b
 
 	.if	\enc == 1
-	ld1	{KS.16b}, [x27]
+	ldr	x10, [sp]
+	ld1	{KS.16b}, [x10]
 	.endif
 
-1:	ld1	{CTR.8b}, [x24]			// load upper counter
-	ld1	{INP.16b}, [x22], #16
-	rev	x9, x28
-	add	x28, x28, #1
-	sub	w19, w19, #1
+0:	ld1	{CTR.8b}, [x5]			// load upper counter
+	ld1	{INP.16b}, [x3], #16
+	rev	x9, x8
+	add	x8, x8, #1
+	sub	w0, w0, #1
 	ins	CTR.d[1], x9			// set lower counter
 
 	.if	\enc == 1
 	eor	INP.16b, INP.16b, KS.16b	// encrypt input
-	st1	{INP.16b}, [x21], #16
+	st1	{INP.16b}, [x2], #16
 	.endif
 
 	rev64	T1.16b, INP.16b
 
-	cmp	w26, #12
-	b.ge	4f				// AES-192/256?
+	cmp	w7, #12
+	b.ge	2f				// AES-192/256?
 
-2:	enc_round	CTR, v21
+1:	enc_round	CTR, v21
 	ext	T2.16b, XL.16b, XL.16b, #8
 	ext	IN1.16b, T1.16b, T1.16b, #8
@@ -425,39 +411,27 @@ CPU_LE(	rev	x28, x28	)
 
 	.if	\enc == 0
 	eor	INP.16b, INP.16b, KS.16b
-	st1	{INP.16b}, [x21], #16
+	st1	{INP.16b}, [x2], #16
 	.endif
 
-	cbz	w19, 3f
+	cbnz	w0, 0b
 
-	if_will_cond_yield_neon
-	st1	{XL.2d}, [x20]
-	.if	\enc == 1
-	st1	{KS.16b}, [x27]
-	.endif
-	do_cond_yield_neon
-	b	0b
-	endif_yield_neon
+CPU_LE(	rev	x8, x8	)
+	st1	{XL.2d}, [x1]
+	str	x8, [x5, #8]			// store lower counter
 
-	b	1b
-
-3:	st1	{XL.2d}, [x20]
 	.if	\enc == 1
-	st1	{KS.16b}, [x27]
+	st1	{KS.16b}, [x10]
 	.endif
 
-CPU_LE(	rev	x28, x28	)
-	str	x28, [x24, #8]			// store lower counter
-
-	frame_pop
 	ret
 
-4:	b.eq	5f				// AES-192?
+2:	b.eq	3f				// AES-192?
 	enc_round	CTR, v17
 	enc_round	CTR, v18
-5:	enc_round	CTR, v19
+3:	enc_round	CTR, v19
 	enc_round	CTR, v20
-	b	2b
+	b	1b
 	.endm
 
 /*