diff mbox series

[2/3] crypto: x86/aes-xts - eliminate a few more instructions

Message ID 20240413031728.159495-3-ebiggers@kernel.org
State New
Headers show
Series crypto: x86/aes-xts - additional tuning | expand

Commit Message

Eric Biggers April 13, 2024, 3:17 a.m. UTC
From: Eric Biggers <ebiggers@google.com>

- For conditionally subtracting 16 from LEN when decrypting a message
  whose length isn't a multiple of 16, use the cmovnz instruction.

- Fold the addition of 4*VL to LEN into the sub of VL or 16 from LEN.

- Remove an unnecessary test instruction.

This results in slightly shorter code, both source and binary.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/x86/crypto/aes-xts-avx-x86_64.S | 39 ++++++++++------------------
 1 file changed, 13 insertions(+), 26 deletions(-)
diff mbox series

Patch

diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S
index f5e7ab739105..802d3b90d337 100644
--- a/arch/x86/crypto/aes-xts-avx-x86_64.S
+++ b/arch/x86/crypto/aes-xts-avx-x86_64.S
@@ -557,24 +557,24 @@ 
 .endm
 
 .macro	_aes_xts_crypt	enc
 	_define_aliases
 
-	// Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
-	movl		480(KEY), KEYLEN
-
 .if !\enc
 	// When decrypting a message whose length isn't a multiple of the AES
 	// block length, exclude the last full block from the main loop by
 	// subtracting 16 from LEN.  This is needed because ciphertext stealing
 	// decryption uses the last two tweaks in reverse order.  We'll handle
 	// the last full block and the partial block specially at the end.
+	lea		-16(LEN), %rax
 	test		$15, LEN
-	jnz		.Lneed_cts_dec\@
-.Lxts_init\@:
+	cmovnz		%rax, LEN
 .endif
 
+	// Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
+	movl		480(KEY), KEYLEN
+
 	// Setup the pointer to the round keys and cache as many as possible.
 	_setup_round_keys	\enc
 
 	// Compute the first set of tweaks TWEAK[0-3].
 	_compute_first_set_of_tweaks
@@ -659,15 +659,14 @@ 
 	vzeroupper
 .endif
 	RET
 
 .Lhandle_remainder\@:
-	add		$4*VL, LEN	// Undo the extra sub from earlier.
 
 	// En/decrypt any remaining full blocks, one vector at a time.
 .if VL > 16
-	sub		$VL, LEN
+	add		$3*VL, LEN	// Undo extra sub of 4*VL, then sub VL.
 	jl		.Lvec_at_a_time_done\@
 .Lvec_at_a_time\@:
 	_vmovdqu	(SRC), V0
 	_aes_crypt	\enc, , TWEAK0, V0
 	_vmovdqu	V0, (DST)
@@ -675,13 +674,13 @@ 
 	add		$VL, SRC
 	add		$VL, DST
 	sub		$VL, LEN
 	jge		.Lvec_at_a_time\@
 .Lvec_at_a_time_done\@:
-	add		$VL-16, LEN	// Undo the extra sub from earlier.
+	add		$VL-16, LEN	// Undo extra sub of VL, then sub 16.
 .else
-	sub		$16, LEN
+	add		$4*VL-16, LEN	// Undo extra sub of 4*VL, then sub 16.
 .endif
 
 	// En/decrypt any remaining full blocks, one at a time.
 	jl		.Lblock_at_a_time_done\@
 .Lblock_at_a_time\@:
@@ -692,28 +691,16 @@ 
 	add		$16, SRC
 	add		$16, DST
 	sub		$16, LEN
 	jge		.Lblock_at_a_time\@
 .Lblock_at_a_time_done\@:
-	add		$16, LEN	// Undo the extra sub from earlier.
-
-.Lfull_blocks_done\@:
-	// Now 0 <= LEN <= 15.  If LEN is nonzero, do ciphertext stealing to
-	// process the last 16 + LEN bytes.  If LEN is zero, we're done.
-	test		LEN, LEN
-	jnz		.Lcts\@
-	jmp		.Ldone\@
-
-.if !\enc
-.Lneed_cts_dec\@:
-	sub		$16, LEN
-	jmp		.Lxts_init\@
-.endif
+	add		$16, LEN	// Undo the extra sub of 16.
+	// Now 0 <= LEN <= 15.  If LEN is zero, we're done.
+	jz		.Ldone\@
 
-.Lcts\@:
-	// Do ciphertext stealing (CTS) to en/decrypt the last full block and
-	// the partial block.  TWEAK0_XMM contains the next tweak.
+	// Otherwise 1 <= LEN <= 15, but the real remaining length is 16 + LEN.
+	// Do ciphertext stealing to process the last 16 + LEN bytes.
 
 .if \enc
 	// If encrypting, the main loop already encrypted the last full block to
 	// create the CTS intermediate ciphertext.  Prepare for the rest of CTS
 	// by rewinding the pointers and loading the intermediate ciphertext.