
[2/6] crypto: x86/aes-xts - add AES-XTS assembly macro for modern CPUs

Message ID 20240326080305.402382-3-ebiggers@kernel.org
State New
Series Faster AES-XTS on modern x86_64 CPUs

Commit Message

Eric Biggers March 26, 2024, 8:03 a.m. UTC
From: Eric Biggers <ebiggers@google.com>

Add an assembly file aes-xts-avx-x86_64.S which contains a macro that
expands into AES-XTS implementations for x86_64 CPUs that support at
least AES-NI and AVX, optionally also taking advantage of VAES,
VPCLMULQDQ, and AVX512 or AVX10.

This patch doesn't expand the macro at all.  Later patches will do so,
adding each implementation individually so that the motivation and use
case for each individual implementation can be fully presented.
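
For example, a later patch might instantiate the macro roughly as follows
(the function name here is illustrative only):

	.set	VL, 16
	.set	USE_AVX10, 0
	SYM_TYPED_FUNC_START(aes_xts_encrypt_aesni_avx)
		aes_xts_crypt	1
	SYM_FUNC_END(aes_xts_encrypt_aesni_avx)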

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/x86/crypto/Makefile             |   3 +-
 arch/x86/crypto/aes-xts-avx-x86_64.S | 758 +++++++++++++++++++++++++++
 2 files changed, 760 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/crypto/aes-xts-avx-x86_64.S

Patch

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 9aa46093c91b..9c5ce5613738 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -46,11 +46,12 @@  obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o
 chacha-x86_64-y := chacha-avx2-x86_64.o chacha-ssse3-x86_64.o chacha_glue.o
 chacha-x86_64-$(CONFIG_AS_AVX512) += chacha-avx512vl-x86_64.o
 
 obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
-aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
+aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o \
+	aes_ctrby8_avx-x86_64.o aes-xts-avx-x86_64.o
 
 obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
 sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ssse3_glue.o
 sha1-ssse3-$(CONFIG_AS_SHA1_NI) += sha1_ni_asm.o
 
diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S
new file mode 100644
index 000000000000..92f1580e1eb0
--- /dev/null
+++ b/arch/x86/crypto/aes-xts-avx-x86_64.S
@@ -0,0 +1,758 @@ 
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * AES-XTS for modern x86_64 CPUs
+ *
+ * Copyright 2024 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+/*
+ * This file implements AES-XTS for modern x86_64 CPUs.  To handle the
+ * complexities of coding for x86 SIMD, e.g. where every vector length needs
+ * different code, it uses a macro to generate several implementations that
+ * share similar source code but are targeted at different CPUs, listed below:
+ *
+ * AES-NI + AVX
+ *    - 128-bit vectors (1 AES block per vector)
+ *    - VEX-coded instructions
+ *    - xmm0-xmm15
+ *    - This is for older CPUs that lack VAES but do have AVX.
+ *
+ * VAES + VPCLMULQDQ + AVX2
+ *    - 256-bit vectors (2 AES blocks per vector)
+ *    - VEX-coded instructions
+ *    - ymm0-ymm15
+ *    - This is for CPUs that have VAES but lack AVX512 or AVX10,
+ *      e.g. Intel's Alder Lake and AMD's Zen 3.
+ *
+ * VAES + VPCLMULQDQ + AVX10/256 + BMI2
+ *    - 256-bit vectors (2 AES blocks per vector)
+ *    - EVEX-coded instructions
+ *    - ymm0-ymm31
+ *    - This is for CPUs that have AVX512 but where using zmm registers causes
+ *      downclocking, and for CPUs that have AVX10/256 but not AVX10/512.
+ *    - By "AVX10/256" we really mean (AVX512BW + AVX512VL) || AVX10/256.
+ *      To avoid confusion with 512-bit, we just write AVX10/256.
+ *
+ * VAES + VPCLMULQDQ + AVX10/512 + BMI2
+ *    - Same as the previous one, but upgrades to 512-bit vectors
+ *      (4 AES blocks per vector) in zmm0-zmm31.
+ *    - This is for CPUs that have good AVX512 or AVX10/512 support.
+ *
+ * This file doesn't have an implementation for AES-NI alone (without AVX), as
+ * the lack of VEX would make all the assembly code different.
+ *
+ * When we use VAES, we also use VPCLMULQDQ to parallelize the computation of
+ * the XTS tweaks.  This prevents the tweak computation from becoming a
+ * bottleneck.  Currently there don't seem to be any CPUs that support VAES
+ * but not VPCLMULQDQ.  If that changes, we might need to start also providing
+ * an implementation using VAES alone.
+ *
+ * The AES-XTS implementations in this file support everything required by the
+ * crypto API, including support for arbitrary input lengths and multi-part
+ * processing.  However, they are most heavily optimized for the common case of
+ * power-of-2 length inputs that are processed in a single part (disk sectors).
+ */
+
+#include <linux/linkage.h>
+#include <linux/cfi_types.h>
+
+.section .rodata
+.p2align 4
+.Lgf_poly:
+	// The low 64 bits of this value represent the polynomial x^7 + x^2 + x
+	// + 1.  It is the value that must be XOR'd into the low 64 bits of the
+	// tweak each time a 1 is carried out of the high 64 bits.
+	//
+	// The high 64 bits of this value hold just the internal carry bit that
+	// exists when there's a carry out of the low 64 bits of the tweak.
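+	//
+	// For illustration, in C-like pseudocode, multiplying a tweak by 'x'
+	// (with the tweak viewed as two little-endian 64-bit halves) is:
+	//
+	//	carry = hi >> 63;
+	//	hi = (hi << 1) | (lo >> 63);
+	//	lo = (lo << 1) ^ (carry ? 0x87 : 0);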
+	.quad	0x87, 1
+
+	// This table contains constants for vpshufb and vpblendvb, used to
+	// handle variable byte shifts and blending during ciphertext stealing
+	// on CPUs that don't support AVX10-style masking.
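+	//
+	// A 16-byte load from .Lcts_permute_table + LEN gives a vpshufb mask
+	// that moves the first LEN bytes of a block to the end of the register
+	// (zeroing the rest), and a load from .Lcts_permute_table + 32 - LEN
+	// gives a mask that moves the last LEN bytes to the front.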
+.Lcts_permute_table:
+	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+	.byte	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+	.byte	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+.text
+
+// Function parameters
+.set	KEY,		%rdi	// Initially points to aesni_xts_ctx, then is
+				// advanced to point directly to the round keys
+.set	SRC,		%rsi	// Pointer to next source data
+.set	DST,		%rdx	// Pointer to next destination data
+.set	LEN,		%rcx	// Remaining length in bytes
+.set	IV,		%r8	// Pointer to IV
+.set	FLAGS,		%r9d	// XTS_* flags
+
+// Flags for the 'int flags' parameter.  Keep in sync with C file.
+#define XTS_FIRST	0x1
+#define XTS_UPDATE_IV	0x2
+
+// r10d holds the AES key length in bytes.
+.set	KEYLEN,		%r10d
+
+// %rax and %r11 are available as temporaries.
+
+// Move a vector between memory and a register.
+.macro	_vmovdqu	src, dst
+.if VL < 64
+	vmovdqu		\src, \dst
+.else
+	vmovdqu8	\src, \dst
+.endif
+.endm
+
+// Broadcast a 128-bit value into a vector.
+.macro	_vbroadcast128	src, dst
+.if VL == 16 && !USE_AVX10
+	vmovdqu		\src, \dst
+.elseif VL == 32 && !USE_AVX10
+	vbroadcasti128	\src, \dst
+.else
+	vbroadcasti32x4	\src, \dst
+.endif
+.endm
+
+// XOR two vectors together.
+.macro	_vpxor	src1, src2, dst
+.if USE_AVX10
+	vpxord		\src1, \src2, \dst
+.else
+	vpxor		\src1, \src2, \dst
+.endif
+.endm
+
+// XOR three vectors together.
+.macro	_xor3	src1, src2, src3_and_dst
+.if USE_AVX10
+	// vpternlogd with immediate 0x96 is a three-argument XOR.
+	vpternlogd	$0x96, \src1, \src2, \src3_and_dst
+.else
+	vpxor		\src1, \src3_and_dst, \src3_and_dst
+	vpxor		\src2, \src3_and_dst, \src3_and_dst
+.endif
+.endm
+
+.macro	_define_Vi	i
+.if VL == 16
+	.set	V\i,		%xmm\i
+.elseif VL == 32
+	.set	V\i,		%ymm\i
+.elseif VL == 64
+	.set	V\i,		%zmm\i
+.else
+	.error "Unsupported Vector Length (VL)"
+.endif
+.endm
+
+.macro _define_aliases
+	// Define register aliases V0-V15, or V0-V31 if all 32 SIMD registers
+	// are available, that map to the xmm, ymm, or zmm registers according
+	// to the selected Vector Length (VL).
+	_define_Vi	0
+	_define_Vi	1
+	_define_Vi	2
+	_define_Vi	3
+	_define_Vi	4
+	_define_Vi	5
+	_define_Vi	6
+	_define_Vi	7
+	_define_Vi	8
+	_define_Vi	9
+	_define_Vi	10
+	_define_Vi	11
+	_define_Vi	12
+	_define_Vi	13
+	_define_Vi	14
+	_define_Vi	15
+.if USE_AVX10
+	_define_Vi	16
+	_define_Vi	17
+	_define_Vi	18
+	_define_Vi	19
+	_define_Vi	20
+	_define_Vi	21
+	_define_Vi	22
+	_define_Vi	23
+	_define_Vi	24
+	_define_Vi	25
+	_define_Vi	26
+	_define_Vi	27
+	_define_Vi	28
+	_define_Vi	29
+	_define_Vi	30
+	_define_Vi	31
+.endif
+
+	// V0-V7 hold temporary values.
+
+	// V8-V11 hold XTS tweaks.  Each 128-bit lane holds one tweak.
+	.set	TWEAK0_XMM,	%xmm8
+	.set	TWEAK0,		V8
+	.set	TWEAK1_XMM,	%xmm9
+	.set	TWEAK1,		V9
+	.set	TWEAK2,		V10
+	.set	TWEAK3,		V11
+
+	// V12-V14 hold the first 3 AES round keys, copied to all 128-bit lanes.
+	.set	KEY0_XMM,	%xmm12
+	.set	KEY0,		V12
+	.set	KEY1_XMM,	%xmm13
+	.set	KEY1,		V13
+	.set	KEY2_XMM,	%xmm14
+	.set	KEY2,		V14
+
+	// V15 holds the constant from .Lgf_poly, copied to all 128-bit lanes.
+	.set	GF_POLY_XMM,	%xmm15
+	.set	GF_POLY,	V15
+
+	// If 32 SIMD registers are available, then V16-V27 hold the remaining
+	// AES round keys, copied to all 128-bit lanes.
+.if USE_AVX10
+	.set	KEY3_XMM,	%xmm16
+	.set	KEY3,		V16
+	.set	KEY4_XMM,	%xmm17
+	.set	KEY4,		V17
+	.set	KEY5_XMM,	%xmm18
+	.set	KEY5,		V18
+	.set	KEY6_XMM,	%xmm19
+	.set	KEY6,		V19
+	.set	KEY7_XMM,	%xmm20
+	.set	KEY7,		V20
+	.set	KEY8_XMM,	%xmm21
+	.set	KEY8,		V21
+	.set	KEY9_XMM,	%xmm22
+	.set	KEY9,		V22
+	.set	KEY10_XMM,	%xmm23
+	.set	KEY10,		V23
+	.set	KEY11_XMM,	%xmm24
+	.set	KEY11,		V24
+	.set	KEY12_XMM,	%xmm25
+	.set	KEY12,		V25
+	.set	KEY13_XMM,	%xmm26
+	.set	KEY13,		V26
+	.set	KEY14_XMM,	%xmm27
+	.set	KEY14,		V27
+.endif
+	// V28-V31 are currently unused.
+.endm
+
+// Do a single round of AES encryption (if \enc==1) or decryption (if \enc==0)
+// on the block(s) in \data using the round key(s) in \key.  The register length
+// determines the number of AES blocks en/decrypted.
+.macro	_vaes	enc, last, key, data
+.if \enc
+.if \last
+	vaesenclast	\key, \data, \data
+.else
+	vaesenc		\key, \data, \data
+.endif
+.else
+.if \last
+	vaesdeclast	\key, \data, \data
+.else
+	vaesdec		\key, \data, \data
+.endif
+.endif
+.endm
+
+// Do a single round of AES en/decryption on the block(s) in \data, using the
+// same key for all block(s).  The round key is loaded from the appropriate
+// register or memory location for round \i.  May clobber V4.
+.macro _vaes_1x		enc, last, i, xmm_suffix, data
+.if \i < NR_CACHED_ROUND_KEYS
+	_vaes		\enc, \last, KEY\i\xmm_suffix, \data
+.else
+.ifnb \xmm_suffix
+	_vaes		\enc, \last, \i*16(KEY), \data
+.else
+	_vbroadcast128	\i*16(KEY), V4
+	_vaes		\enc, \last, V4, \data
+.endif
+.endif
+.endm
+
+// Do a single round of AES en/decryption on the blocks in registers V0-V3,
+// using the same key for all blocks.  The round key is loaded from the
+// appropriate register or memory location for round \i.  May clobber V4.
+.macro	_vaes_4x	enc, last, i
+.if \i < NR_CACHED_ROUND_KEYS
+	_vaes		\enc, \last, KEY\i, V0
+	_vaes		\enc, \last, KEY\i, V1
+	_vaes		\enc, \last, KEY\i, V2
+	_vaes		\enc, \last, KEY\i, V3
+.else
+	_vbroadcast128	\i*16(KEY), V4
+	_vaes		\enc, \last, V4, V0
+	_vaes		\enc, \last, V4, V1
+	_vaes		\enc, \last, V4, V2
+	_vaes		\enc, \last, V4, V3
+.endif
+.endm
+
+// Do tweaked AES en/decryption (i.e., XOR with \tweak, then AES en/decrypt,
+// then XOR with \tweak again) of the block(s) in \data.  To process a single
+// block, use xmm registers and set \xmm_suffix=_XMM.  To process a vector of
+// length VL, use V* registers and leave \xmm_suffix empty.  May clobber V4.
+.macro	_aes_crypt	enc, xmm_suffix, tweak, data
+	_xor3		KEY0\xmm_suffix, \tweak, \data
+	_vaes_1x	\enc, 0, 1, \xmm_suffix, \data
+	_vaes_1x	\enc, 0, 2, \xmm_suffix, \data
+	_vaes_1x	\enc, 0, 3, \xmm_suffix, \data
+	_vaes_1x	\enc, 0, 4, \xmm_suffix, \data
+	_vaes_1x	\enc, 0, 5, \xmm_suffix, \data
+	_vaes_1x	\enc, 0, 6, \xmm_suffix, \data
+	_vaes_1x	\enc, 0, 7, \xmm_suffix, \data
+	_vaes_1x	\enc, 0, 8, \xmm_suffix, \data
+	_vaes_1x	\enc, 0, 9, \xmm_suffix, \data
+	cmp		$24, KEYLEN
+	jle		.Laes_128_or_192\@
+	_vaes_1x	\enc, 0, 10, \xmm_suffix, \data
+	_vaes_1x	\enc, 0, 11, \xmm_suffix, \data
+	_vaes_1x	\enc, 0, 12, \xmm_suffix, \data
+	_vaes_1x	\enc, 0, 13, \xmm_suffix, \data
+	_vaes_1x	\enc, 1, 14, \xmm_suffix, \data
+	jmp		.Laes_done\@
+.Laes_128_or_192\@:
+	je		.Laes_192\@
+	_vaes_1x	\enc, 1, 10, \xmm_suffix, \data
+	jmp		.Laes_done\@
+.Laes_192\@:
+	_vaes_1x	\enc, 0, 10, \xmm_suffix, \data
+	_vaes_1x	\enc, 0, 11, \xmm_suffix, \data
+	_vaes_1x	\enc, 1, 12, \xmm_suffix, \data
+.Laes_done\@:
+	_vpxor		\tweak, \data, \data
+.endm
+
+// Load the round keys: just the first 3 if !USE_AVX10, otherwise all of them.
+.macro	_load_round_keys
+	_vbroadcast128	0*16(KEY), KEY0
+	_vbroadcast128	1*16(KEY), KEY1
+	_vbroadcast128	2*16(KEY), KEY2
+	.set	NR_CACHED_ROUND_KEYS,	3
+.if USE_AVX10
+	_vbroadcast128	3*16(KEY), KEY3
+	_vbroadcast128	4*16(KEY), KEY4
+	_vbroadcast128	5*16(KEY), KEY5
+	_vbroadcast128	6*16(KEY), KEY6
+	_vbroadcast128	7*16(KEY), KEY7
+	_vbroadcast128	8*16(KEY), KEY8
+	_vbroadcast128	9*16(KEY), KEY9
+	_vbroadcast128	10*16(KEY), KEY10
+	// Note: if it's AES-128 or AES-192, the last several round keys won't
+	// be used.  We do the loads anyway to save a conditional jump.
+	_vbroadcast128	11*16(KEY), KEY11
+	_vbroadcast128	12*16(KEY), KEY12
+	_vbroadcast128	13*16(KEY), KEY13
+	_vbroadcast128	14*16(KEY), KEY14
+	.set	NR_CACHED_ROUND_KEYS,	15
+.endif
+.endm
+
+// Given a 128-bit XTS tweak in the xmm register \src, compute the next tweak
+// (by multiplying by the polynomial 'x') and write it to \dst.
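+//
+// Implementation note: vpaddq shifts each 64-bit half of \src left by one bit
+// (there is no carry between the halves).  The vpshufd/vpsrad/vpand sequence
+// computes the two correction terms: 0x87 ends up in the low half if bit 127
+// of \src was set (the GF(2^128) reduction), and 1 ends up in the high half
+// if bit 63 of \src was set (the carry from the low half into the high half).
+// The final vpxor applies both corrections.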
+.macro	_next_tweak	src, tmp, dst
+	vpshufd		$0x13, \src, \tmp
+	vpaddq		\src, \src, \dst
+	vpsrad		$31, \tmp, \tmp
+	vpand		GF_POLY_XMM, \tmp, \tmp
+	vpxor		\tmp, \dst, \dst
+.endm
+
+// Given the XTS tweak(s) in the vector \src, compute the next vector of
+// tweak(s) (by multiplying by the polynomial 'x^(VL/16)') and write it to \dst.
+//
+// If VL > 16, then there are multiple tweaks, and we use vpclmulqdq to compute
+// all tweaks in the vector in parallel.  If VL=16, we just do the regular
+// computation without vpclmulqdq, as it's the faster method for a single tweak.
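+//
+// Implementation note for the VL > 16 case: for each 128-bit tweak lane, the
+// bits shifted out of the low 64-bit half (isolated by vpsrlq, then moved up
+// by vpslldq) are carried into the high half, while the bits shifted out of
+// the high half (i.e. out of bit 127) are reduced modulo the XTS polynomial
+// by carry-lessly multiplying them by 0x87 with vpclmulqdq.  The final XOR
+// folds both correction terms into the left-shifted tweak.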
+.macro	_next_tweakvec	src, tmp1, tmp2, dst
+.if VL == 16
+	_next_tweak	\src, \tmp1, \dst
+.else
+	vpsrlq		$64 - VL/16, \src, \tmp1
+	vpclmulqdq	$0x01, GF_POLY, \tmp1, \tmp2
+	vpslldq		$8, \tmp1, \tmp1
+	vpsllq		$VL/16, \src, \dst
+	_xor3		\tmp1, \tmp2, \dst
+.endif
+.endm
+
+// Given an XTS tweak in TWEAK0_XMM, compute the following tweaks and store them
+// in the vector registers TWEAK0-TWEAK3.  Clobbers V0-V7.
+.macro	_compute_first_set_of_tweaks
+.if VL == 16
+	// With VL=16, multiplying by x serially is fastest.
+	_next_tweak	TWEAK0, %xmm0, TWEAK1
+	_next_tweak	TWEAK1, %xmm0, TWEAK2
+	_next_tweak	TWEAK2, %xmm0, TWEAK3
+.else
+.if VL == 32
+	// Compute the second block of TWEAK0.
+	_next_tweak	TWEAK0_XMM, %xmm0, %xmm1
+	vinserti128	$1, %xmm1, TWEAK0, TWEAK0
+.elseif VL == 64
+	// Compute the remaining blocks of TWEAK0.
+	_next_tweak	TWEAK0_XMM, %xmm0, %xmm1
+	_next_tweak	%xmm1, %xmm0, %xmm2
+	_next_tweak	%xmm2, %xmm0, %xmm3
+	vinserti32x4	$1, %xmm1, TWEAK0, TWEAK0
+	vinserti32x4	$2, %xmm2, TWEAK0, TWEAK0
+	vinserti32x4	$3, %xmm3, TWEAK0, TWEAK0
+.endif
+	// Compute TWEAK[1-3] from TWEAK0.
+	vpsrlq		$64 - 1*VL/16, TWEAK0, V0
+	vpsrlq		$64 - 2*VL/16, TWEAK0, V2
+	vpsrlq		$64 - 3*VL/16, TWEAK0, V4
+	vpclmulqdq	$0x01, GF_POLY, V0, V1
+	vpclmulqdq	$0x01, GF_POLY, V2, V3
+	vpclmulqdq	$0x01, GF_POLY, V4, V5
+	vpslldq		$8, V0, V0
+	vpslldq		$8, V2, V2
+	vpslldq		$8, V4, V4
+	vpsllq		$1*VL/16, TWEAK0, TWEAK1
+	vpsllq		$2*VL/16, TWEAK0, TWEAK2
+	vpsllq		$3*VL/16, TWEAK0, TWEAK3
+.if USE_AVX10
+	vpternlogd	$0x96, V0, V1, TWEAK1
+	vpternlogd	$0x96, V2, V3, TWEAK2
+	vpternlogd	$0x96, V4, V5, TWEAK3
+.else
+	vpxor		V0, TWEAK1, TWEAK1
+	vpxor		V2, TWEAK2, TWEAK2
+	vpxor		V4, TWEAK3, TWEAK3
+	vpxor		V1, TWEAK1, TWEAK1
+	vpxor		V3, TWEAK2, TWEAK2
+	vpxor		V5, TWEAK3, TWEAK3
+.endif
+.endif
+.endm
+
+// Advance the set of XTS tweaks in TWEAK0-TWEAK3 to the next set.
+.macro	_compute_next_set_of_tweaks
+.if VL == 16
+	// With VL=16, multiplying by x serially is fastest.
+	_next_tweak	TWEAK3, %xmm0, TWEAK0
+	_next_tweak	TWEAK0, %xmm0, TWEAK1
+	_next_tweak	TWEAK1, %xmm0, TWEAK2
+	_next_tweak	TWEAK2, %xmm0, TWEAK3
+.else
+	// Multiply each tweak by x^(4*VL/16) in parallel.
+	vpsrlq		$64 - 4*VL/16, TWEAK0, V0
+	vpsrlq		$64 - 4*VL/16, TWEAK1, V1
+	vpsrlq		$64 - 4*VL/16, TWEAK2, V2
+	vpsrlq		$64 - 4*VL/16, TWEAK3, V3
+	vpclmulqdq	$0x01, GF_POLY, V0, V4
+	vpclmulqdq	$0x01, GF_POLY, V1, V5
+	vpclmulqdq	$0x01, GF_POLY, V2, V6
+	vpclmulqdq	$0x01, GF_POLY, V3, V7
+	vpslldq		$8, V0, V0
+	vpslldq		$8, V1, V1
+	vpslldq		$8, V2, V2
+	vpslldq		$8, V3, V3
+	vpsllq		$4*VL/16, TWEAK0, TWEAK0
+	vpsllq		$4*VL/16, TWEAK1, TWEAK1
+	vpsllq		$4*VL/16, TWEAK2, TWEAK2
+	vpsllq		$4*VL/16, TWEAK3, TWEAK3
+.if USE_AVX10
+	vpternlogd	$0x96, V0, V4, TWEAK0
+	vpternlogd	$0x96, V1, V5, TWEAK1
+	vpternlogd	$0x96, V2, V6, TWEAK2
+	vpternlogd	$0x96, V3, V7, TWEAK3
+.else
+	vpxor		V0, TWEAK0, TWEAK0
+	vpxor		V1, TWEAK1, TWEAK1
+	vpxor		V2, TWEAK2, TWEAK2
+	vpxor		V3, TWEAK3, TWEAK3
+	vpxor		V4, TWEAK0, TWEAK0
+	vpxor		V5, TWEAK1, TWEAK1
+	vpxor		V6, TWEAK2, TWEAK2
+	vpxor		V7, TWEAK3, TWEAK3
+.endif
+.endif
+.endm
+
+.macro	aes_xts_crypt	enc
+	_define_aliases
+
+	// Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
+	mov		480(KEY), KEYLEN
+
+	// Check whether the data length is a multiple of the AES block length.
+	test		$15, LEN
+	jnz		.Lneed_cts\@
+
+.Lxts_init\@:
+	// Load the IV into TWEAK0_XMM, and if (flags & XTS_FIRST) encrypt it
+	// with the tweak key to get the first tweak.  If !(flags & XTS_FIRST),
+	// then this is a continuation call and the IV was already encrypted.
+	vmovdqu		(IV), TWEAK0_XMM
+	test		$XTS_FIRST, FLAGS
+	jz		.Lencrypt_iv_done\@
+	vpxor		0*16(KEY), TWEAK0_XMM, TWEAK0_XMM
+	vaesenc		1*16(KEY), TWEAK0_XMM, TWEAK0_XMM
+	vaesenc		2*16(KEY), TWEAK0_XMM, TWEAK0_XMM
+	vaesenc		3*16(KEY), TWEAK0_XMM, TWEAK0_XMM
+	vaesenc		4*16(KEY), TWEAK0_XMM, TWEAK0_XMM
+	vaesenc		5*16(KEY), TWEAK0_XMM, TWEAK0_XMM
+	vaesenc		6*16(KEY), TWEAK0_XMM, TWEAK0_XMM
+	vaesenc		7*16(KEY), TWEAK0_XMM, TWEAK0_XMM
+	vaesenc		8*16(KEY), TWEAK0_XMM, TWEAK0_XMM
+	vaesenc		9*16(KEY), TWEAK0_XMM, TWEAK0_XMM
+	cmp		$24, KEYLEN
+	jle		.Lencrypt_iv_aes_128_or_192\@
+	vaesenc		10*16(KEY), TWEAK0_XMM, TWEAK0_XMM
+	vaesenc		11*16(KEY), TWEAK0_XMM, TWEAK0_XMM
+	vaesenc		12*16(KEY), TWEAK0_XMM, TWEAK0_XMM
+	vaesenc		13*16(KEY), TWEAK0_XMM, TWEAK0_XMM
+	vaesenclast	14*16(KEY), TWEAK0_XMM, TWEAK0_XMM
+.Lencrypt_iv_done\@:
+
+	// Advance KEY from tweak_ctx to crypt_ctx::key_enc (for encryption)
+	// or crypt_ctx::key_dec (for decryption).
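+	// (These offsets assume the context layout used by the C glue code:
+	// 496 would be the size of the 16-byte aligned tweak context, and 240
+	// the offset of key_dec within struct crypto_aes_ctx.)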
+.if \enc
+	add		$496, KEY
+.else
+	add		$496 + 240, KEY
+.endif
+
+	// Load the gf_poly constant.
+	_vbroadcast128	.Lgf_poly(%rip), GF_POLY
+
+	// Compute the first set of tweaks TWEAK[0-3].
+	_compute_first_set_of_tweaks
+
+	// Cache as many round keys as possible.
+	_load_round_keys
+
+	sub		$4*VL, LEN
+	jl		.Lhandle_remainder_have_tweak0\@
+
+.Lmain_loop\@:
+	// This is the main loop, en/decrypting 4*VL bytes per iteration.
+
+	// Load the next set of source blocks.
+	_vmovdqu	0*VL(SRC), V0
+	_vmovdqu	1*VL(SRC), V1
+	_vmovdqu	2*VL(SRC), V2
+	_vmovdqu	3*VL(SRC), V3
+
+	// XOR each block with its tweak and the first round key.
+.if USE_AVX10
+	vpternlogd	$0x96, TWEAK0, KEY0, V0
+	vpternlogd	$0x96, TWEAK1, KEY0, V1
+	vpternlogd	$0x96, TWEAK2, KEY0, V2
+	vpternlogd	$0x96, TWEAK3, KEY0, V3
+.else
+	vpxor		TWEAK0, V0, V0
+	vpxor		TWEAK1, V1, V1
+	vpxor		TWEAK2, V2, V2
+	vpxor		TWEAK3, V3, V3
+	vpxor		KEY0, V0, V0
+	vpxor		KEY0, V1, V1
+	vpxor		KEY0, V2, V2
+	vpxor		KEY0, V3, V3
+.endif
+
+	// Do the AES rounds.
+	_vaes_4x	\enc, 0, 1
+	_vaes_4x	\enc, 0, 2
+	_vaes_4x	\enc, 0, 3
+	_vaes_4x	\enc, 0, 4
+	_vaes_4x	\enc, 0, 5
+	_vaes_4x	\enc, 0, 6
+	_vaes_4x	\enc, 0, 7
+	_vaes_4x	\enc, 0, 8
+	_vaes_4x	\enc, 0, 9
+	// Try to optimize for AES-256 by keeping the code for AES-128 and
+	// AES-192 out-of-line.
+	cmp		$24, KEYLEN
+	jle		.Lencrypt_4x_aes_128_or_192\@
+	_vaes_4x	\enc, 0, 10
+	_vaes_4x	\enc, 0, 11
+	_vaes_4x	\enc, 0, 12
+	_vaes_4x	\enc, 0, 13
+	_vaes_4x	\enc, 1, 14
+.Lencrypt_4x_done\@:
+
+	// XOR in the tweaks again.
+	_vpxor		TWEAK0, V0, V0
+	_vpxor		TWEAK1, V1, V1
+	_vpxor		TWEAK2, V2, V2
+	_vpxor		TWEAK3, V3, V3
+
+	// Store the destination blocks.
+	_vmovdqu	V0, 0*VL(DST)
+	_vmovdqu	V1, 1*VL(DST)
+	_vmovdqu	V2, 2*VL(DST)
+	_vmovdqu	V3, 3*VL(DST)
+
+	add		$4*VL, SRC
+	add		$4*VL, DST
+	sub		$4*VL, LEN
+	jl		.Lmain_loop_done\@
+
+	// Another iteration of the main loop is needed, so advance the tweaks.
+	_compute_next_set_of_tweaks
+
+	jmp		.Lmain_loop\@
+
+.Lmain_loop_done\@:
+	// Check for less common cases: the data length isn't a multiple of 4*VL
+	// and/or the caller needs the next tweak to be returned.  In the common
+	// case neither holds, and we simply fall through to the ret below.
+	test		$4*VL-1, LEN
+	jnz		.Lhandle_remainder\@
+	test		$XTS_UPDATE_IV, FLAGS
+	jnz		.Lhandle_remainder\@
+.Ldone\@:
+.if VL > 16
+	vzeroupper
+.endif
+	RET
+
+.Lhandle_remainder\@:
+	// Compute the next vector of tweaks and store it in TWEAK0.
+	_next_tweakvec	TWEAK3, V0, V1, TWEAK0
+.Lhandle_remainder_have_tweak0\@:
+	add		$4*VL, LEN	// Undo the extra sub from earlier.
+
+	// En/decrypt any remaining full blocks, one vector at a time.
+.if VL > 16
+	sub		$VL, LEN
+	jl		.Lvec_at_a_time_done\@
+.Lvec_at_a_time\@:
+	_vmovdqu	(SRC), V0
+	_aes_crypt	\enc, , TWEAK0, V0
+	_vmovdqu	V0, (DST)
+	_next_tweakvec	TWEAK0, V0, V1, TWEAK0
+	add		$VL, SRC
+	add		$VL, DST
+	sub		$VL, LEN
+	jge		.Lvec_at_a_time\@
+.Lvec_at_a_time_done\@:
+	add		$VL-16, LEN
+.else
+	sub		$16, LEN
+.endif
+
+	// En/decrypt any remaining full blocks, one at a time.
+	jl		.Lblock_at_a_time_done\@
+.Lblock_at_a_time\@:
+	vmovdqu		(SRC), %xmm0
+	_aes_crypt	\enc, _XMM, TWEAK0_XMM, %xmm0
+	vmovdqu		%xmm0, (DST)
+	_next_tweak	TWEAK0_XMM, %xmm0, TWEAK0_XMM
+	add		$16, SRC
+	add		$16, DST
+	sub		$16, LEN
+	jge		.Lblock_at_a_time\@
+.Lblock_at_a_time_done\@:
+	add		$16, LEN
+
+.Lfull_blocks_done\@:
+	// Now 0 <= LEN <= 15.  If LEN is nonzero, do ciphertext stealing to
+	// process the last 16 + LEN bytes.  If LEN is zero, we're done.
+	test		LEN, LEN
+	jnz		.Lcts\@
+
+	// Store the next tweak back to *IV to support continuation calls.
+	vmovdqu		TWEAK0_XMM, (IV)
+	jmp		.Ldone\@
+
+	// Out-of-line handling of AES-128 and AES-192
+.Lencrypt_iv_aes_128_or_192\@:
+	jz		.Lencrypt_iv_aes_192\@
+	vaesenclast	10*16(KEY), TWEAK0_XMM, TWEAK0_XMM
+	jmp		.Lencrypt_iv_done\@
+.Lencrypt_iv_aes_192\@:
+	vaesenc		10*16(KEY), TWEAK0_XMM, TWEAK0_XMM
+	vaesenc		11*16(KEY), TWEAK0_XMM, TWEAK0_XMM
+	vaesenclast	12*16(KEY), TWEAK0_XMM, TWEAK0_XMM
+	jmp		.Lencrypt_iv_done\@
+
+	// Out-of-line handling of AES-128 and AES-192
+.Lencrypt_4x_aes_128_or_192\@:
+	jz		.Lencrypt_4x_aes_192\@
+	_vaes_4x	\enc, 1, 10
+	jmp		.Lencrypt_4x_done\@
+.Lencrypt_4x_aes_192\@:
+	_vaes_4x	\enc, 0, 10
+	_vaes_4x	\enc, 0, 11
+	_vaes_4x	\enc, 1, 12
+	jmp		.Lencrypt_4x_done\@
+
+.Lneed_cts\@:
+	// The data length isn't a multiple of the AES block length, so
+	// ciphertext stealing (CTS) will be needed.  Subtract one block from
+	// LEN so that the main loop doesn't process the last full block.  The
+	// CTS step will process it specially along with the partial block.
+	sub		$16, LEN
+	jmp		.Lxts_init\@
+
+.Lcts\@:
+	// Do ciphertext stealing (CTS) to en/decrypt the last full block and
+	// the partial block.  CTS needs two tweaks.  TWEAK0_XMM contains the
+	// next tweak; compute the one after that.  Decryption uses these two
+	// tweaks in reverse order, so also define aliases to handle that.
+	_next_tweak	TWEAK0_XMM, %xmm0, TWEAK1_XMM
+.if \enc
+	.set		CTS_TWEAK0,	TWEAK0_XMM
+	.set		CTS_TWEAK1,	TWEAK1_XMM
+.else
+	.set		CTS_TWEAK0,	TWEAK1_XMM
+	.set		CTS_TWEAK1,	TWEAK0_XMM
+.endif
+
+	// En/decrypt the last full block.
+	vmovdqu		(SRC), %xmm0
+	_aes_crypt	\enc, _XMM, CTS_TWEAK0, %xmm0
+
+.if USE_AVX10
+	// Create a mask that has the first LEN bits set.
+	mov		$-1, %rax
+	bzhi		LEN, %rax, %rax
+	kmovq		%rax, %k1
+
+	// Swap the first LEN bytes of the above result with the partial block.
+	// Note that to support in-place en/decryption, the load from the src
+	// partial block must happen before the store to the dst partial block.
+	vmovdqa		%xmm0, %xmm1
+	vmovdqu8	16(SRC), %xmm0{%k1}
+	vmovdqu8	%xmm1, 16(DST){%k1}
+.else
+	lea		.Lcts_permute_table(%rip), %rax
+
+	// Load the src partial block, left-aligned.  Note that to support
+	// in-place en/decryption, this must happen before the store to the dst
+	// partial block.
+	vmovdqu		(SRC, LEN, 1), %xmm1
+
+	// Shift the first LEN bytes of the en/decryption of the last full block
+	// to the end of a register, then store it to DST+LEN.  This stores the
+	// dst partial block.  It also writes to the second part of the dst last
+	// full block, but that part is overwritten later.
+	vpshufb		(%rax, LEN, 1), %xmm0, %xmm2
+	vmovdqu		%xmm2, (DST, LEN, 1)
+
+	// Make xmm3 contain [16-LEN,16-LEN+1,...,14,15,0x80,0x80,...].
+	sub		LEN, %rax
+	vmovdqu		32(%rax), %xmm3
+
+	// Shift the src partial block to the beginning of its register.
+	vpshufb		%xmm3, %xmm1, %xmm1
+
+	// Do a blend to generate the src partial block followed by the second
+	// part of the en/decryption of the last full block.
+	vpblendvb	%xmm3, %xmm0, %xmm1, %xmm0
+.endif
+	// En/decrypt again and store the last full block.
+	_aes_crypt	\enc, _XMM, CTS_TWEAK1, %xmm0
+	vmovdqu		%xmm0, (DST)
+	jmp		.Ldone\@
+.endm