From patchwork Tue Dec 10 23:58:28 2024
X-Patchwork-Submitter: Eric Biggers
X-Patchwork-Id: 848939
From: Eric Biggers
To: linux-crypto@vger.kernel.org
Cc: x86@kernel.org
Subject: [PATCH 1/7] crypto: x86/aes-gcm - code size optimization
Date: Tue, 10 Dec 2024 15:58:28 -0800
Message-ID: <20241210235834.40862-2-ebiggers@kernel.org>
In-Reply-To: <20241210235834.40862-1-ebiggers@kernel.org>
References: <20241210235834.40862-1-ebiggers@kernel.org>

Prefer immediates of -128 to 128, since the former fits in a signed byte,
saving 3 bytes per instruction.  Also replace a vpand and vpxor with a
vpternlogd.

Signed-off-by: Eric Biggers
---
 arch/x86/crypto/aes-gcm-avx10-x86_64.S | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/arch/x86/crypto/aes-gcm-avx10-x86_64.S b/arch/x86/crypto/aes-gcm-avx10-x86_64.S index 97e0ee515fc5f..8989bf9b8384d 100644 --- a/arch/x86/crypto/aes-gcm-avx10-x86_64.S +++ b/arch/x86/crypto/aes-gcm-avx10-x86_64.S @@ -382,12 +382,12 @@ // wide shift instruction, so instead double each of the two 64-bit // halves and incorporate the internal carry bit into the value XOR'd.
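// [Illustrative note, not part of the patch: the 0x78 immediate in the
// vpternlogd added below can be read off from the instruction's truth-table
// encoding.  For input bits A (destination), B (first source), and C (second
// source), bit ((A << 2) | (B << 1) | C) of the immediate supplies the result
// bit; evaluating A ^ (B & C) over the eight input combinations gives
// 0b01111000 = 0x78.  That is what allows the vpand+vpxor pair removed below
// to collapse into a single instruction.]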
vpshufd $0xd3, H_CUR_XMM, %xmm0 vpsrad $31, %xmm0, %xmm0 vpaddq H_CUR_XMM, H_CUR_XMM, H_CUR_XMM - vpand .Lgfpoly_and_internal_carrybit(%rip), %xmm0, %xmm0 - vpxor %xmm0, H_CUR_XMM, H_CUR_XMM + // H_CUR_XMM ^= xmm0 & gfpoly_and_internal_carrybit + vpternlogd $0x78, .Lgfpoly_and_internal_carrybit(%rip), %xmm0, H_CUR_XMM // Load the gfpoly constant. vbroadcasti32x4 .Lgfpoly(%rip), GFPOLY // Square H^1 to get H^2. @@ -711,11 +711,11 @@ // that processes 4*VL bytes of data at a time. Otherwise skip it. // // Pre-subtracting 4*VL from DATALEN saves an instruction from the main // loop and also ensures that at least one write always occurs to // DATALEN, zero-extending it and allowing DATALEN64 to be used later. - sub $4*VL, DATALEN + add $-4*VL, DATALEN // shorter than 'sub 4*VL' when VL=32 jl .Lcrypt_loop_4x_done\@ // Load powers of the hash key. vmovdqu8 OFFSETOFEND_H_POWERS-4*VL(KEY), H_POW4 vmovdqu8 OFFSETOFEND_H_POWERS-3*VL(KEY), H_POW3 @@ -758,13 +758,13 @@ vaesenclast RNDKEYLAST3, V3, GHASHDATA3 vmovdqu8 GHASHDATA0, 0*VL(DST) vmovdqu8 GHASHDATA1, 1*VL(DST) vmovdqu8 GHASHDATA2, 2*VL(DST) vmovdqu8 GHASHDATA3, 3*VL(DST) - add $4*VL, SRC - add $4*VL, DST - sub $4*VL, DATALEN + sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32 + sub $-4*VL, DST + add $-4*VL, DATALEN jl .Lghash_last_ciphertext_4x\@ .endif // Cache as many additional AES round keys as possible. .irp i, 9,8,7,6,5 @@ -838,13 +838,13 @@ vmovdqu8 GHASHDATA0, 0*VL(DST) vmovdqu8 GHASHDATA1, 1*VL(DST) vmovdqu8 GHASHDATA2, 2*VL(DST) vmovdqu8 GHASHDATA3, 3*VL(DST) - add $4*VL, SRC - add $4*VL, DST - sub $4*VL, DATALEN + sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32 + sub $-4*VL, DST + add $-4*VL, DATALEN jge .Lcrypt_loop_4x\@ .if \enc .Lghash_last_ciphertext_4x\@: // Update GHASH with the last set of ciphertext blocks. @@ -854,11 +854,11 @@ .endif .Lcrypt_loop_4x_done\@: // Undo the extra subtraction by 4*VL and check whether data remains. - add $4*VL, DATALEN + sub $-4*VL, DATALEN // shorter than 'add 4*VL' when VL=32 jz .Ldone\@ // The data length isn't a multiple of 4*VL. Process the remaining data // of length 1 <= DATALEN < 4*VL, up to one vector (VL bytes) at a time. 
// Going one vector at a time may seem inefficient compared to having

From patchwork Tue Dec 10 23:58:29 2024
X-Patchwork-Submitter: Eric Biggers
X-Patchwork-Id: 849420
From: Eric Biggers
To: linux-crypto@vger.kernel.org
Cc: x86@kernel.org
Subject: [PATCH 2/7] crypto: x86/aes-gcm - tune better for AMD CPUs
Date: Tue, 10 Dec 2024 15:58:29 -0800
Message-ID: <20241210235834.40862-3-ebiggers@kernel.org>
In-Reply-To: <20241210235834.40862-1-ebiggers@kernel.org>
References: <20241210235834.40862-1-ebiggers@kernel.org>

Reorganize the main loop to free up the RNDKEYLAST[0-3] registers and use
them for more cached round keys.  This improves performance by about 2% on
AMD Zen 4 and Zen 5.  Intel performance remains about the same.
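[Illustrative note, not part of the original message: the reorganization leans
on the property quoted in the new _aesenclast_and_xor_4x macro below.  It
follows from the fact that the key enters the last AES round only through a
final XOR, i.e. vaesenclast(key, a) = f(a) ^ key for a key-independent
SubBytes/ShiftRows step f, so:

	vaesenclast(key, a) ^ b = f(a) ^ (key ^ b) = vaesenclast(key ^ b, a)

This lets the XOR with the source data be precomputed into the last round key
while the AES rounds are still in flight.]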
Signed-off-by: Eric Biggers --- arch/x86/crypto/aes-gcm-avx10-x86_64.S | 99 ++++++++++---------------- 1 file changed, 38 insertions(+), 61 deletions(-) diff --git a/arch/x86/crypto/aes-gcm-avx10-x86_64.S b/arch/x86/crypto/aes-gcm-avx10-x86_64.S index 8989bf9b8384d..02ee11083d4f8 100644 --- a/arch/x86/crypto/aes-gcm-avx10-x86_64.S +++ b/arch/x86/crypto/aes-gcm-avx10-x86_64.S @@ -86,11 +86,11 @@ .section .rodata .p2align 6 // A shuffle mask that reflects the bytes of 16-byte blocks .Lbswap_mask: - .octa 0x000102030405060708090a0b0c0d0e0f + .octa 0x000102030405060708090a0b0c0d0e0f // This is the GHASH reducing polynomial without its constant term, i.e. // x^128 + x^7 + x^2 + x, represented using the backwards mapping // between bits and polynomial coefficients. // @@ -560,10 +560,36 @@ vpxord RNDKEY0, V1, V1 vpxord RNDKEY0, V2, V2 vpxord RNDKEY0, V3, V3 .endm +// Do the last AES round for four vectors of counter blocks V0-V3, XOR source +// data with the resulting keystream, and write the result to DST and +// GHASHDATA[0-3]. (Implementation differs slightly, but has the same effect.) +.macro _aesenclast_and_xor_4x + // XOR the source data with the last round key, saving the result in + // GHASHDATA[0-3]. This reduces latency by taking advantage of the + // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a). + vpxord 0*VL(SRC), RNDKEYLAST, GHASHDATA0 + vpxord 1*VL(SRC), RNDKEYLAST, GHASHDATA1 + vpxord 2*VL(SRC), RNDKEYLAST, GHASHDATA2 + vpxord 3*VL(SRC), RNDKEYLAST, GHASHDATA3 + + // Do the last AES round. This handles the XOR with the source data + // too, as per the optimization described above. + vaesenclast GHASHDATA0, V0, GHASHDATA0 + vaesenclast GHASHDATA1, V1, GHASHDATA1 + vaesenclast GHASHDATA2, V2, GHASHDATA2 + vaesenclast GHASHDATA3, V3, GHASHDATA3 + + // Store the en/decrypted data to DST. + vmovdqu8 GHASHDATA0, 0*VL(DST) + vmovdqu8 GHASHDATA1, 1*VL(DST) + vmovdqu8 GHASHDATA2, 2*VL(DST) + vmovdqu8 GHASHDATA3, 3*VL(DST) +.endm + // void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_avx10 *key, // const u32 le_ctr[4], u8 ghash_acc[16], // const u8 *src, u8 *dst, int datalen); // // This macro generates a GCM encryption or decryption update function with the @@ -638,29 +664,24 @@ .set LE_CTR_INC, V11 // LE_CTR contains the next set of little-endian counter blocks. .set LE_CTR, V12 - // RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-5] contain cached AES round keys, + // RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-1] contain cached AES round keys, // copied to all 128-bit lanes. RNDKEY0 is the zero-th round key, // RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last. .set RNDKEY0, V13 .set RNDKEYLAST, V14 .set RNDKEY_M9, V15 .set RNDKEY_M8, V16 .set RNDKEY_M7, V17 .set RNDKEY_M6, V18 .set RNDKEY_M5, V19 - - // RNDKEYLAST[0-3] temporarily store the last AES round key XOR'd with - // the corresponding block of source data. This is useful because - // vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a), and key ^ b can - // be computed in parallel with the AES rounds. - .set RNDKEYLAST0, V20 - .set RNDKEYLAST1, V21 - .set RNDKEYLAST2, V22 - .set RNDKEYLAST3, V23 + .set RNDKEY_M4, V20 + .set RNDKEY_M3, V21 + .set RNDKEY_M2, V22 + .set RNDKEY_M1, V23 // GHASHTMP[0-2] are temporary variables used by _ghash_step_4x. These // cannot coincide with anything used for AES encryption, since for // performance reasons GHASH and AES encryption are interleaved. 
.set GHASHTMP0, V24 @@ -746,30 +767,19 @@ vbroadcasti32x4 (%rax), RNDKEY _vaesenc_4x RNDKEY add $16, %rax cmp %rax, RNDKEYLAST_PTR jne 1b - vpxord 0*VL(SRC), RNDKEYLAST, RNDKEYLAST0 - vpxord 1*VL(SRC), RNDKEYLAST, RNDKEYLAST1 - vpxord 2*VL(SRC), RNDKEYLAST, RNDKEYLAST2 - vpxord 3*VL(SRC), RNDKEYLAST, RNDKEYLAST3 - vaesenclast RNDKEYLAST0, V0, GHASHDATA0 - vaesenclast RNDKEYLAST1, V1, GHASHDATA1 - vaesenclast RNDKEYLAST2, V2, GHASHDATA2 - vaesenclast RNDKEYLAST3, V3, GHASHDATA3 - vmovdqu8 GHASHDATA0, 0*VL(DST) - vmovdqu8 GHASHDATA1, 1*VL(DST) - vmovdqu8 GHASHDATA2, 2*VL(DST) - vmovdqu8 GHASHDATA3, 3*VL(DST) + _aesenclast_and_xor_4x sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32 sub $-4*VL, DST add $-4*VL, DATALEN jl .Lghash_last_ciphertext_4x\@ .endif // Cache as many additional AES round keys as possible. -.irp i, 9,8,7,6,5 +.irp i, 9,8,7,6,5,4,3,2,1 vbroadcasti32x4 -\i*16(RNDKEYLAST_PTR), RNDKEY_M\i .endr .Lcrypt_loop_4x\@: @@ -797,51 +807,18 @@ _vaesenc_4x RNDKEY vbroadcasti32x4 -10*16(RNDKEYLAST_PTR), RNDKEY _vaesenc_4x RNDKEY 128: - // XOR the source data with the last round key, saving the result in - // RNDKEYLAST[0-3]. This reduces latency by taking advantage of the - // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a). -.if \enc - vpxord 0*VL(SRC), RNDKEYLAST, RNDKEYLAST0 - vpxord 1*VL(SRC), RNDKEYLAST, RNDKEYLAST1 - vpxord 2*VL(SRC), RNDKEYLAST, RNDKEYLAST2 - vpxord 3*VL(SRC), RNDKEYLAST, RNDKEYLAST3 -.else - vpxord GHASHDATA0, RNDKEYLAST, RNDKEYLAST0 - vpxord GHASHDATA1, RNDKEYLAST, RNDKEYLAST1 - vpxord GHASHDATA2, RNDKEYLAST, RNDKEYLAST2 - vpxord GHASHDATA3, RNDKEYLAST, RNDKEYLAST3 -.endif - // Finish the AES encryption of the counter blocks in V0-V3, interleaved // with the GHASH update of the ciphertext blocks in GHASHDATA[0-3]. -.irp i, 9,8,7,6,5 +.irp i, 9,8,7,6,5,4,3,2,1 + _ghash_step_4x (9 - \i) _vaesenc_4x RNDKEY_M\i - _ghash_step_4x (9 - \i) -.endr -.irp i, 4,3,2,1 - vbroadcasti32x4 -\i*16(RNDKEYLAST_PTR), RNDKEY - _vaesenc_4x RNDKEY - _ghash_step_4x (9 - \i) .endr _ghash_step_4x 9 - - // Do the last AES round. This handles the XOR with the source data - // too, as per the optimization described above. - vaesenclast RNDKEYLAST0, V0, GHASHDATA0 - vaesenclast RNDKEYLAST1, V1, GHASHDATA1 - vaesenclast RNDKEYLAST2, V2, GHASHDATA2 - vaesenclast RNDKEYLAST3, V3, GHASHDATA3 - - // Store the en/decrypted data to DST. - vmovdqu8 GHASHDATA0, 0*VL(DST) - vmovdqu8 GHASHDATA1, 1*VL(DST) - vmovdqu8 GHASHDATA2, 2*VL(DST) - vmovdqu8 GHASHDATA3, 3*VL(DST) - + _aesenclast_and_xor_4x sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32 sub $-4*VL, DST add $-4*VL, DATALEN jge .Lcrypt_loop_4x\@ @@ -938,11 +915,11 @@ // be whole block(s) that get processed by the GHASH multiplication and // reduction instructions but should not actually be included in the // GHASH. However, any such blocks are all-zeroes, and the values that // they're multiplied with are also all-zeroes. Therefore they just add // 0 * 0 = 0 to the final GHASH result, which makes no difference. 
- vmovdqu8 (POWERS_PTR), H_POW1 + vmovdqu8 (POWERS_PTR), H_POW1 .if \enc vmovdqu8 V0, V1{%k1}{z} .endif vpshufb BSWAP_MASK, V1, V0 vpxord GHASH_ACC, V0, V0

From patchwork Tue Dec 10 23:58:30 2024
X-Patchwork-Submitter: Eric Biggers
X-Patchwork-Id: 848938
From: Eric Biggers
To: linux-crypto@vger.kernel.org
Cc: x86@kernel.org
Subject: [PATCH 3/7] crypto: x86/aes-xts - use .irp when useful
Date: Tue, 10 Dec 2024 15:58:30 -0800
Message-ID: <20241210235834.40862-4-ebiggers@kernel.org>
In-Reply-To: <20241210235834.40862-1-ebiggers@kernel.org>
References: <20241210235834.40862-1-ebiggers@kernel.org>

Use .irp instead of repeating code.  No change in the generated code.
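[Illustrative note, not part of the original message: as a minimal sketch of
the directive, .irp assembles its body once per listed value, substituting \i
each time, so

	.irp i, 0,1,2
		_define_Vi \i
	.endr

assembles exactly as if _define_Vi 0, _define_Vi 1, and _define_Vi 2 had been
written out by hand, which is why the generated code does not change.]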
Signed-off-by: Eric Biggers --- arch/x86/crypto/aes-xts-avx-x86_64.S | 50 +++++----------------------- 1 file changed, 9 insertions(+), 41 deletions(-) diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S index 48f97b79f7a9c..63e5d3b3e77f5 100644 --- a/arch/x86/crypto/aes-xts-avx-x86_64.S +++ b/arch/x86/crypto/aes-xts-avx-x86_64.S @@ -110,43 +110,17 @@ .macro _define_aliases // Define register aliases V0-V15, or V0-V31 if all 32 SIMD registers // are available, that map to the xmm, ymm, or zmm registers according // to the selected Vector Length (VL). - _define_Vi 0 - _define_Vi 1 - _define_Vi 2 - _define_Vi 3 - _define_Vi 4 - _define_Vi 5 - _define_Vi 6 - _define_Vi 7 - _define_Vi 8 - _define_Vi 9 - _define_Vi 10 - _define_Vi 11 - _define_Vi 12 - _define_Vi 13 - _define_Vi 14 - _define_Vi 15 +.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 + _define_Vi \i +.endr .if USE_AVX10 - _define_Vi 16 - _define_Vi 17 - _define_Vi 18 - _define_Vi 19 - _define_Vi 20 - _define_Vi 21 - _define_Vi 22 - _define_Vi 23 - _define_Vi 24 - _define_Vi 25 - _define_Vi 26 - _define_Vi 27 - _define_Vi 28 - _define_Vi 29 - _define_Vi 30 - _define_Vi 31 +.irp i, 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 + _define_Vi \i +.endr .endif // V0-V3 hold the data blocks during the main loop, or temporary values // otherwise. V4-V5 hold temporary values. @@ -616,19 +590,13 @@ _vaes_4x \enc, 0, 2 .Laes192\@: _vaes_4x \enc, 0, 3 _vaes_4x \enc, 0, 4 .Laes128\@: - _vaes_4x \enc, 0, 5 - _vaes_4x \enc, 0, 6 - _vaes_4x \enc, 0, 7 - _vaes_4x \enc, 0, 8 - _vaes_4x \enc, 0, 9 - _vaes_4x \enc, 0, 10 - _vaes_4x \enc, 0, 11 - _vaes_4x \enc, 0, 12 - _vaes_4x \enc, 0, 13 +.irp i, 5,6,7,8,9,10,11,12,13 + _vaes_4x \enc, 0, \i +.endr _vaes_4x \enc, 1, 14 // XOR in the tweaks again. 
_vpxor TWEAK0, V0, V0 _vpxor TWEAK1, V1, V1

From patchwork Tue Dec 10 23:58:31 2024
X-Patchwork-Submitter: Eric Biggers
X-Patchwork-Id: 849419
From: Eric Biggers
To: linux-crypto@vger.kernel.org
Cc: x86@kernel.org
Subject: [PATCH 4/7] crypto: x86/aes-xts - make the register aliases per-function
Date: Tue, 10 Dec 2024 15:58:31 -0800
Message-ID: <20241210235834.40862-5-ebiggers@kernel.org>
In-Reply-To: <20241210235834.40862-1-ebiggers@kernel.org>
References: <20241210235834.40862-1-ebiggers@kernel.org>

Since aes-xts-avx-x86_64.S contains multiple functions, move the register
aliases for the parameters and local variables of the XTS update function
into the macro that generates that function.  Then add register aliases to
aes_xts_encrypt_iv() to improve readability there.  This makes
aes-xts-avx-x86_64.S consistent with the GCM assembly files.  No change in
the generated code.
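[Illustrative note, not part of the original message: the aliases are plain
assemble-time renames made with .set, so moving them into the macro cannot
affect the generated code.  A minimal sketch of the mechanism:

	.set	IV, %rsi		// IV is now just another name for %rsi
	vmovdqu	(%rsi), %xmm0		// these two lines assemble to
	vmovdqu	(IV), %xmm0		// byte-for-byte identical instructions

The only effect is on the readability of the source.]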
Signed-off-by: Eric Biggers --- arch/x86/crypto/aes-xts-avx-x86_64.S | 77 +++++++++++++++------------- 1 file changed, 41 insertions(+), 36 deletions(-) diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S index 63e5d3b3e77f5..77b3c265be30b 100644 --- a/arch/x86/crypto/aes-xts-avx-x86_64.S +++ b/arch/x86/crypto/aes-xts-avx-x86_64.S @@ -78,26 +78,10 @@ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 .text -// Function parameters -.set KEY, %rdi // Initially points to crypto_aes_ctx, then is - // advanced to point to 7th-from-last round key -.set SRC, %rsi // Pointer to next source data -.set DST, %rdx // Pointer to next destination data -.set LEN, %ecx // Remaining length in bytes -.set LEN8, %cl -.set LEN64, %rcx -.set TWEAK, %r8 // Pointer to next tweak - -// %rax holds the AES key length in bytes. -.set KEYLEN, %eax -.set KEYLEN64, %rax - -// %r9-r11 are available as temporaries. - .macro _define_Vi i .if VL == 16 .set V\i, %xmm\i .elseif VL == 32 .set V\i, %ymm\i @@ -119,10 +103,26 @@ .irp i, 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 _define_Vi \i .endr .endif + // Function parameters + .set KEY, %rdi // Initially points to crypto_aes_ctx, then is + // advanced to point to 7th-from-last round key + .set SRC, %rsi // Pointer to next source data + .set DST, %rdx // Pointer to next destination data + .set LEN, %ecx // Remaining length in bytes + .set LEN8, %cl + .set LEN64, %rcx + .set TWEAK, %r8 // Pointer to next tweak + + // %rax holds the AES key length in bytes. + .set KEYLEN, %eax + .set KEYLEN64, %rax + + // %r9-r11 are available as temporaries. + // V0-V3 hold the data blocks during the main loop, or temporary values // otherwise. V4-V5 hold temporary values. // V6-V9 hold XTS tweaks. Each 128-bit lane holds one tweak. 
.set TWEAK0_XMM, %xmm6 @@ -732,34 +732,39 @@ .endm // void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key, // u8 iv[AES_BLOCK_SIZE]); SYM_TYPED_FUNC_START(aes_xts_encrypt_iv) - vmovdqu (%rsi), %xmm0 - vpxor (%rdi), %xmm0, %xmm0 - movl 480(%rdi), %eax // AES key length - lea -16(%rdi, %rax, 4), %rdi - cmp $24, %eax + .set TWEAK_KEY, %rdi + .set IV, %rsi + .set KEYLEN, %eax + .set KEYLEN64, %rax + + vmovdqu (IV), %xmm0 + vpxor (TWEAK_KEY), %xmm0, %xmm0 + movl 480(TWEAK_KEY), KEYLEN + lea -16(TWEAK_KEY, KEYLEN64, 4), TWEAK_KEY + cmp $24, KEYLEN jl .Lencrypt_iv_aes128 je .Lencrypt_iv_aes192 - vaesenc -6*16(%rdi), %xmm0, %xmm0 - vaesenc -5*16(%rdi), %xmm0, %xmm0 + vaesenc -6*16(TWEAK_KEY), %xmm0, %xmm0 + vaesenc -5*16(TWEAK_KEY), %xmm0, %xmm0 .Lencrypt_iv_aes192: - vaesenc -4*16(%rdi), %xmm0, %xmm0 - vaesenc -3*16(%rdi), %xmm0, %xmm0 + vaesenc -4*16(TWEAK_KEY), %xmm0, %xmm0 + vaesenc -3*16(TWEAK_KEY), %xmm0, %xmm0 .Lencrypt_iv_aes128: - vaesenc -2*16(%rdi), %xmm0, %xmm0 - vaesenc -1*16(%rdi), %xmm0, %xmm0 - vaesenc 0*16(%rdi), %xmm0, %xmm0 - vaesenc 1*16(%rdi), %xmm0, %xmm0 - vaesenc 2*16(%rdi), %xmm0, %xmm0 - vaesenc 3*16(%rdi), %xmm0, %xmm0 - vaesenc 4*16(%rdi), %xmm0, %xmm0 - vaesenc 5*16(%rdi), %xmm0, %xmm0 - vaesenc 6*16(%rdi), %xmm0, %xmm0 - vaesenclast 7*16(%rdi), %xmm0, %xmm0 - vmovdqu %xmm0, (%rsi) + vaesenc -2*16(TWEAK_KEY), %xmm0, %xmm0 + vaesenc -1*16(TWEAK_KEY), %xmm0, %xmm0 + vaesenc 0*16(TWEAK_KEY), %xmm0, %xmm0 + vaesenc 1*16(TWEAK_KEY), %xmm0, %xmm0 + vaesenc 2*16(TWEAK_KEY), %xmm0, %xmm0 + vaesenc 3*16(TWEAK_KEY), %xmm0, %xmm0 + vaesenc 4*16(TWEAK_KEY), %xmm0, %xmm0 + vaesenc 5*16(TWEAK_KEY), %xmm0, %xmm0 + vaesenc 6*16(TWEAK_KEY), %xmm0, %xmm0 + vaesenclast 7*16(TWEAK_KEY), %xmm0, %xmm0 + vmovdqu %xmm0, (IV) RET SYM_FUNC_END(aes_xts_encrypt_iv) // Below are the actual AES-XTS encryption and decryption functions, // instantiated from the above macro. 
They all have the following prototype:

From patchwork Tue Dec 10 23:58:32 2024
X-Patchwork-Submitter: Eric Biggers
X-Patchwork-Id: 848937
From: Eric Biggers
To: linux-crypto@vger.kernel.org
Cc: x86@kernel.org
Subject: [PATCH 5/7] crypto: x86/aes-xts - improve some comments
Date: Tue, 10 Dec 2024 15:58:32 -0800
Message-ID: <20241210235834.40862-6-ebiggers@kernel.org>
In-Reply-To: <20241210235834.40862-1-ebiggers@kernel.org>
References: <20241210235834.40862-1-ebiggers@kernel.org>

Improve some of the comments in aes-xts-avx-x86_64.S.

Signed-off-by: Eric Biggers
---
 arch/x86/crypto/aes-xts-avx-x86_64.S | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S index 77b3c265be30b..a94f02b7c5b47 100644 --- a/arch/x86/crypto/aes-xts-avx-x86_64.S +++ b/arch/x86/crypto/aes-xts-avx-x86_64.S @@ -341,13 +341,18 @@ .endif .endm // Do one step in computing the next set of tweaks using the VPCLMULQDQ method // (the same method _next_tweakvec uses for VL > 16). This means multiplying -// each tweak by x^(4*VL/16) independently.
Since 4*VL/16 is a multiple of 8 -// when VL > 16 (which it is here), the needed shift amounts are byte-aligned, -// which allows the use of vpsrldq and vpslldq to do 128-bit wide shifts. +// each tweak by x^(4*VL/16) independently. +// +// Since 4*VL/16 is a multiple of 8 when VL > 16 (which it is here), the needed +// shift amounts are byte-aligned, which allows the use of vpsrldq and vpslldq +// to do 128-bit wide shifts. The 128-bit left shift (vpslldq) saves +// instructions directly. The 128-bit right shift (vpsrldq) performs better +// than a 64-bit right shift on Intel CPUs in the context where it is used here, +// because it runs on a different execution port from the AES instructions. .macro _tweak_step_pclmul i .if \i == 0 vpsrldq $(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0 .elseif \i == 2 vpsrldq $(128 - 4*VL/16) / 8, TWEAK1, NEXT_TWEAK1 @@ -415,13 +420,14 @@ // easy to do AES-128 and AES-192 by skipping irrelevant rounds at the // beginning. Skipping rounds at the end doesn't work as well because // the last round needs different instructions. // // An alternative approach would be to roll up all the round loops. We - // don't do that because it isn't compatible with caching the round keys - // in registers which we do when possible (see below), and also because - // it seems unwise to rely *too* heavily on the CPU's branch predictor. + // don't do that because (a) it isn't compatible with caching the round + // keys in registers which we do when possible (see below), (b) we + // interleave the AES rounds with the XTS tweak computation, and (c) it + // seems unwise to rely *too* heavily on the CPU's branch predictor. lea OFFS-16(KEY, KEYLEN64, 4), KEY // If all 32 SIMD registers are available, cache all the round keys. .if USE_AVX10 cmp $24, KEYLEN @@ -731,10 +737,13 @@ jmp .Ldone\@ .endm // void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key, // u8 iv[AES_BLOCK_SIZE]); +// +// Encrypt |iv| using the AES key |tweak_key| to get the first tweak. Assumes +// that the CPU supports AES-NI and AVX, but not necessarily VAES or AVX10. SYM_TYPED_FUNC_START(aes_xts_encrypt_iv) .set TWEAK_KEY, %rdi .set IV, %rsi .set KEYLEN, %eax .set KEYLEN64, %rax @@ -767,13 +776,13 @@ SYM_TYPED_FUNC_START(aes_xts_encrypt_iv) SYM_FUNC_END(aes_xts_encrypt_iv) // Below are the actual AES-XTS encryption and decryption functions, // instantiated from the above macro. They all have the following prototype: // -// void (*xts_asm_func)(const struct crypto_aes_ctx *key, -// const u8 *src, u8 *dst, unsigned int len, -// u8 tweak[AES_BLOCK_SIZE]); +// void (*xts_crypt_func)(const struct crypto_aes_ctx *key, +// const u8 *src, u8 *dst, unsigned int len, +// u8 tweak[AES_BLOCK_SIZE]); // // |key| is the data key. |tweak| contains the next tweak; the encryption of // the original IV with the tweak key was already done. This function supports // incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and // |len| must be a multiple of 16 except on the last call. 
If |len| is a

From patchwork Tue Dec 10 23:58:33 2024
X-Patchwork-Submitter: Eric Biggers
X-Patchwork-Id: 849418
From: Eric Biggers
To: linux-crypto@vger.kernel.org
Cc: x86@kernel.org
Subject: [PATCH 6/7] crypto: x86/aes-xts - change len parameter to int
Date: Tue, 10 Dec 2024 15:58:33 -0800
Message-ID: <20241210235834.40862-7-ebiggers@kernel.org>
In-Reply-To: <20241210235834.40862-1-ebiggers@kernel.org>
References: <20241210235834.40862-1-ebiggers@kernel.org>

The AES-XTS assembly code currently treats the length as signed, since this
saves a few instructions in the loop compared to treating it as unsigned.
Therefore update the type to make this clear.  (It is not actually passed any
values larger than PAGE_SIZE.)
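[Illustrative note, not part of the original message: the "treats the length
as signed" refers to the existing loop structure, along these lines, in which
LEN is deliberately allowed to go negative and the jl/jge branches test the
signed condition flags:

	add	$-4*VL, LEN		// may make LEN negative
	jl	.Lhandle_remainder\@	// signed test: negative means < 4*VL left
.Lmain_loop\@:
	...
	add	$-4*VL, LEN
	jge	.Lmain_loop\@

Declaring the parameter as int in the C prototypes simply documents that
convention.]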
Signed-off-by: Eric Biggers
---
 arch/x86/crypto/aes-xts-avx-x86_64.S | 2 +-
 arch/x86/crypto/aesni-intel_glue.c | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S index a94f02b7c5b47..bb3d0dfeca575 100644 --- a/arch/x86/crypto/aes-xts-avx-x86_64.S +++ b/arch/x86/crypto/aes-xts-avx-x86_64.S @@ -777,11 +777,11 @@ SYM_FUNC_END(aes_xts_encrypt_iv) // Below are the actual AES-XTS encryption and decryption functions, // instantiated from the above macro. They all have the following prototype: // // void (*xts_crypt_func)(const struct crypto_aes_ctx *key, -// const u8 *src, u8 *dst, unsigned int len, +// const u8 *src, u8 *dst, int len, // u8 tweak[AES_BLOCK_SIZE]); // // |key| is the data key. |tweak| contains the next tweak; the encryption of // the original IV with the tweak key was already done. This function supports // incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index fbf43482e1f5e..11e95fc62636e 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -503,11 +503,11 @@ static int xts_setkey_aesni(struct crypto_skcipher *tfm, const u8 *key, } typedef void (*xts_encrypt_iv_func)(const struct crypto_aes_ctx *tweak_key, u8 iv[AES_BLOCK_SIZE]); typedef void (*xts_crypt_func)(const struct crypto_aes_ctx *key, - const u8 *src, u8 *dst, unsigned int len, + const u8 *src, u8 *dst, int len, u8 tweak[AES_BLOCK_SIZE]); /* This handles cases where the source and/or destination span pages. */ static noinline int xts_crypt_slowpath(struct skcipher_request *req, xts_crypt_func crypt_func) @@ -622,18 +622,18 @@ static void aesni_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key, { aesni_enc(tweak_key, iv, iv); } static void aesni_xts_encrypt(const struct crypto_aes_ctx *key, - const u8 *src, u8 *dst, unsigned int len, + const u8 *src, u8 *dst, int len, u8 tweak[AES_BLOCK_SIZE]) { aesni_xts_enc(key, dst, src, len, tweak); } static void aesni_xts_decrypt(const struct crypto_aes_ctx *key, - const u8 *src, u8 *dst, unsigned int len, + const u8 *src, u8 *dst, int len, u8 tweak[AES_BLOCK_SIZE]) { aesni_xts_dec(key, dst, src, len, tweak); } @@ -788,14 +788,14 @@ asmlinkage void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key, #define DEFINE_XTS_ALG(suffix, driver_name, priority) \ \ asmlinkage void \ aes_xts_encrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src, \ - u8 *dst, unsigned int len, u8 tweak[AES_BLOCK_SIZE]); \ + u8 *dst, int len, u8 tweak[AES_BLOCK_SIZE]); \ asmlinkage void \ aes_xts_decrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src, \ - u8 *dst, unsigned int len, u8 tweak[AES_BLOCK_SIZE]); \ + u8 *dst, int len, u8 tweak[AES_BLOCK_SIZE]); \ \ static int xts_encrypt_##suffix(struct skcipher_request *req) \ { \ return xts_crypt(req, aes_xts_encrypt_iv, aes_xts_encrypt_##suffix); \ } \

From patchwork Tue Dec 10 23:58:34 2024
X-Patchwork-Submitter: Eric Biggers
X-Patchwork-Id: 848936
From: Eric Biggers
To: linux-crypto@vger.kernel.org
Cc: x86@kernel.org
Subject: [PATCH 7/7] crypto: x86/aes-xts - more code size optimizations
Date: Tue, 10 Dec 2024 15:58:34 -0800
Message-ID: <20241210235834.40862-8-ebiggers@kernel.org>
In-Reply-To: <20241210235834.40862-1-ebiggers@kernel.org>
References: <20241210235834.40862-1-ebiggers@kernel.org>

Prefer immediates of -128 to 128, since the former fits in a signed byte,
saving 3 bytes per instruction.  Also prefer VEX-coded instructions to EVEX
where this is easy to do.

Signed-off-by: Eric Biggers
---
 arch/x86/crypto/aes-xts-avx-x86_64.S | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S index bb3d0dfeca575..a49637f0a729d 100644 --- a/arch/x86/crypto/aes-xts-avx-x86_64.S +++ b/arch/x86/crypto/aes-xts-avx-x86_64.S @@ -186,10 +186,11 @@ .endif // V30-V31 are currently unused. .endm // Move a vector between memory and a register. +// The register operand must be in the first 16 vector registers. .macro _vmovdqu src, dst .if VL < 64 vmovdqu \src, \dst .else vmovdqu8 \src, \dst @@ -206,15 +207,16 @@ vbroadcasti32x4 \src, \dst .endif .endm // XOR two vectors together. +// Any register operands must be in the first 16 vector registers. .macro _vpxor src1, src2, dst -.if USE_AVX10 - vpxord \src1, \src2, \dst -.else +.if VL < 64 vpxor \src1, \src2, \dst +.else + vpxord \src1, \src2, \dst .endif .endm // XOR three vectors together. .macro _xor3 src1, src2, src3_and_dst @@ -559,22 +561,22 @@ _setup_round_keys \enc // Compute the first set of tweaks TWEAK[0-3].
_compute_first_set_of_tweaks - sub $4*VL, LEN + add $-4*VL, LEN // shorter than 'sub 4*VL' when VL=32 jl .Lhandle_remainder\@ .Lmain_loop\@: // This is the main loop, en/decrypting 4*VL bytes per iteration. // XOR each source block with its tweak and the zero-th round key. .if USE_AVX10 - vmovdqu8 0*VL(SRC), V0 - vmovdqu8 1*VL(SRC), V1 - vmovdqu8 2*VL(SRC), V2 - vmovdqu8 3*VL(SRC), V3 + _vmovdqu 0*VL(SRC), V0 + _vmovdqu 1*VL(SRC), V1 + _vmovdqu 2*VL(SRC), V2 + _vmovdqu 3*VL(SRC), V3 vpternlogd $0x96, TWEAK0, KEY0, V0 vpternlogd $0x96, TWEAK1, KEY0, V1 vpternlogd $0x96, TWEAK2, KEY0, V2 vpternlogd $0x96, TWEAK3, KEY0, V3 .else @@ -616,13 +618,13 @@ _vmovdqu V3, 3*VL(DST) // Finish computing the next set of tweaks. _tweak_step 1000 - add $4*VL, SRC - add $4*VL, DST - sub $4*VL, LEN + sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32 + sub $-4*VL, DST + add $-4*VL, LEN jge .Lmain_loop\@ // Check for the uncommon case where the data length isn't a multiple of // 4*VL. Handle it out-of-line in order to optimize for the common // case. In the common case, just fall through to the ret.
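[Illustrative note, not part of the patch: a minimal sketch of the
immediate-size point made in patches 1 and 7.  x86 add/sub instructions have a
4-byte imm32 form and a 1-byte sign-extended imm8 form, and only values in
-128..127 fit the latter, so when VL=32 (4*VL = 128):

	sub	$128, %ebx	// needs the imm32 form (e.g. 81 eb 80 00 00 00)
	add	$-128, %ebx	// imm8 form (e.g. 83 c3 80), 3 bytes shorter

Both leave %ebx 128 lower; likewise 'sub $-128' is a shorter way to add 128,
which is why the loops above use add $-4*VL and sub $-4*VL.]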