[v6,06/30] KVM: selftests: Add helper functions to create TDX VMs

Message ID 20250414214801.2693294-7-sagis@google.com
State New
Series TDX KVM selftests

Commit Message

Sagi Shahar April 14, 2025, 9:47 p.m. UTC
From: Erdem Aktas <erdemaktas@google.com>

TDX requires additional IOCTLs to initialize the VM and vCPUs, to add
private memory and to finalize the VM memory. Utility functions are
also provided to manipulate a TD, similar to those that manipulate a VM
in the existing selftest framework.
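
As a rough usage sketch (not part of this patch; the backing source,
attributes value and guest_code are placeholders), a selftest built on
these helpers would do:

  struct kvm_vcpu *vcpu;
  struct kvm_vm *vm;

  vm = td_create();
  td_initialize(vm, VM_MEM_SRC_ANONYMOUS, /* attributes */ 0);
  vcpu = td_vcpu_add(vm, 0, guest_code);
  td_finalize(vm);

  vcpu_run(vcpu);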

A TD's initial register state cannot be manipulated directly by the
VMM, so boot code is provided at the TD's reset vector. This boot code
reads boot parameters that have been loaded into the TD's memory and
uses them to set up the TD for the selftest.
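
Concretely, for each vCPU the host side fills a struct
td_per_vcpu_parameters in guest memory at TD_BOOT_PARAMETERS_GPA
(condensed from load_td_boot_parameters() below); the boot code then
loads esp from it and far-jumps to the 64-bit entry point:

  struct td_per_vcpu_parameters *p = &params->per_vcpu[vcpu->id];

  p->esp_gva = (uint32_t)(uint64_t)vcpu->initial_stack_addr;
  p->ljmp_target.eip_gva = (uint32_t)(uint64_t)guest_code;
  p->ljmp_target.code64_sel = sregs.cs.selector;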

Userspace needs to ensure consistency between KVM's CPUID and the TDX
module's view. Obtain the CPUID supported by KVM and adjust it to
reflect the features of interest and the limited set of KVM PV features
supported for TD guests. This involves masking feature bits in CPUID
entries and filtering out CPUID entries for features not supported by
TDX before initializing the TD.
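
In code (condensed from tdx_td_init() below), the adjustments are all
applied before KVM_TDX_INIT_VM:

  tmp = kvm_get_supported_cpuid();
  cpuid = allocate_kvm_cpuid2(KVM_MAX_CPUID_ENTRIES);
  memcpy(cpuid, tmp, kvm_cpuid2_size(tmp->nent));
  tdx_mask_cpuid_features(cpuid);            /* mask unsupported feature bits */

  memcpy(&init_vm->cpuid, cpuid, kvm_cpuid2_size(cpuid->nent));
  tdx_apply_cpuid_restrictions(&init_vm->cpuid);
  tdx_filter_cpuid(vm, &init_vm->cpuid);     /* drop leaves without a TDX CPUID config */
  tdx_ioctl(vm->fd, KVM_TDX_INIT_VM, 0, init_vm);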

Suggested-by: Isaku Yamahata <isaku.yamahata@intel.com>
Co-developed-by: Sagi Shahar <sagis@google.com>
Signed-off-by: Sagi Shahar <sagis@google.com>
Co-developed-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Co-developed-by: Isaku Yamahata <isaku.yamahata@intel.com>
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
Co-developed-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Erdem Aktas <erdemaktas@google.com>
Signed-off-by: Sagi Shahar <sagis@google.com>
---
 tools/testing/selftests/kvm/Makefile.kvm      |   2 +
 .../testing/selftests/kvm/include/kvm_util.h  |   6 +
 .../selftests/kvm/include/x86/kvm_util_arch.h |   1 +
 .../selftests/kvm/include/x86/tdx/td_boot.h   |  83 +++
 .../kvm/include/x86/tdx/td_boot_asm.h         |  16 +
 .../selftests/kvm/include/x86/tdx/tdx_util.h  |  19 +
 tools/testing/selftests/kvm/lib/kvm_util.c    |   6 +-
 .../testing/selftests/kvm/lib/x86/processor.c |  19 +-
 .../selftests/kvm/lib/x86/tdx/td_boot.S       | 100 ++++
 .../selftests/kvm/lib/x86/tdx/tdx_util.c      | 566 ++++++++++++++++++
 10 files changed, 807 insertions(+), 11 deletions(-)
 create mode 100644 tools/testing/selftests/kvm/include/x86/tdx/td_boot.h
 create mode 100644 tools/testing/selftests/kvm/include/x86/tdx/td_boot_asm.h
 create mode 100644 tools/testing/selftests/kvm/include/x86/tdx/tdx_util.h
 create mode 100644 tools/testing/selftests/kvm/lib/x86/tdx/td_boot.S
 create mode 100644 tools/testing/selftests/kvm/lib/x86/tdx/tdx_util.c

Patch

diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index f62b0a5aba35..8e7a12d74745 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -28,6 +28,8 @@  LIBKVM_x86 += lib/x86/sev.c
 LIBKVM_x86 += lib/x86/svm.c
 LIBKVM_x86 += lib/x86/ucall.c
 LIBKVM_x86 += lib/x86/vmx.c
+LIBKVM_x86 += lib/x86/tdx/tdx_util.c
+LIBKVM_x86 += lib/x86/tdx/td_boot.S
 
 LIBKVM_arm64 += lib/arm64/gic.c
 LIBKVM_arm64 += lib/arm64/gic_v3.c
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index 74ecfd8d7ae0..813ba634dc49 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -79,6 +79,7 @@  enum kvm_mem_region_type {
 	MEM_REGION_DATA,
 	MEM_REGION_PT,
 	MEM_REGION_TEST_DATA,
+	MEM_REGION_TDX_BOOT_PARAMS,
 	NR_MEM_REGIONS,
 };
 
@@ -985,6 +986,9 @@  unsigned long vm_compute_max_gfn(struct kvm_vm *vm);
 unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size);
 unsigned int vm_num_host_pages(enum vm_guest_mode mode, unsigned int num_guest_pages);
 unsigned int vm_num_guest_pages(enum vm_guest_mode mode, unsigned int num_host_pages);
+uint64_t vm_nr_pages_required(enum vm_guest_mode mode,
+			      uint32_t nr_runnable_vcpus,
+			      uint64_t extra_mem_pages);
 static inline unsigned int
 vm_adjust_num_guest_pages(enum vm_guest_mode mode, unsigned int num_guest_pages)
 {
@@ -1150,6 +1154,8 @@  static inline int __vm_disable_nx_huge_pages(struct kvm_vm *vm)
  */
 void kvm_selftest_arch_init(void);
 
+void vm_init_descriptor_tables(struct kvm_vm *vm);
+
 void kvm_arch_vm_post_create(struct kvm_vm *vm);
 
 bool vm_is_gpa_protected(struct kvm_vm *vm, vm_paddr_t paddr);
diff --git a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h
index 972bb1c4ab4c..80db1e4c38ba 100644
--- a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h
+++ b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h
@@ -19,6 +19,7 @@  struct kvm_vm_arch {
 	uint64_t s_bit;
 	int sev_fd;
 	bool is_pt_protected;
+	bool has_protected_regs;
 };
 
 static inline bool __vm_arch_has_protected_memory(struct kvm_vm_arch *arch)
diff --git a/tools/testing/selftests/kvm/include/x86/tdx/td_boot.h b/tools/testing/selftests/kvm/include/x86/tdx/td_boot.h
new file mode 100644
index 000000000000..94a50295f953
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/x86/tdx/td_boot.h
@@ -0,0 +1,83 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef SELFTEST_TDX_TD_BOOT_H
+#define SELFTEST_TDX_TD_BOOT_H
+
+#include <stdint.h>
+
+#include "tdx/td_boot_asm.h"
+
+/*
+ * Layout for boot section (not to scale)
+ *
+ *                                   GPA
+ * _________________________________ 0x1_0000_0000 (4GB)
+ * |   Boot code trampoline    |
+ * |___________________________|____ 0x0_ffff_fff0: Reset vector (16B below 4GB)
+ * |   Boot code               |
+ * |___________________________|____ td_boot will be copied here, so that the
+ * |                           |     jmp to td_boot is exactly at the reset vector
+ * |   Empty space             |
+ * |                           |
+ * |───────────────────────────|
+ * |                           |
+ * |                           |
+ * |   Boot parameters         |
+ * |                           |
+ * |                           |
+ * |___________________________|____ 0x0_ffff_0000: TD_BOOT_PARAMETERS_GPA
+ */
+#define FOUR_GIGABYTES_GPA (4ULL << 30)
+
+/*
+ * The exact memory layout for LGDT or LIDT instructions.
+ */
+struct __packed td_boot_parameters_dtr {
+	uint16_t limit;
+	uint32_t base;
+};
+
+/*
+ * The exact layout in memory required for a ljmp, including the selector for
+ * changing code segment.
+ */
+struct __packed td_boot_parameters_ljmp_target {
+	uint32_t eip_gva;
+	uint16_t code64_sel;
+};
+
+/*
+ * Allows each vCPU to be initialized with different eip and esp.
+ */
+struct __packed td_per_vcpu_parameters {
+	uint32_t esp_gva;
+	struct td_boot_parameters_ljmp_target ljmp_target;
+};
+
+/*
+ * Boot parameters for the TD.
+ *
+ * Unlike a regular VM, KVM cannot set registers such as esp, eip, etc.
+ * before boot, so to run selftests, these registers' values have to be
+ * initialized by the TD.
+ *
+ * This struct is loaded in TD private memory at TD_BOOT_PARAMETERS_GPA.
+ *
+ * The TD boot code will read off parameters from this struct and set up the
+ * vCPU for executing selftests.
+ */
+struct __packed td_boot_parameters {
+	uint32_t cr0;
+	uint32_t cr3;
+	uint32_t cr4;
+	struct td_boot_parameters_dtr gdtr;
+	struct td_boot_parameters_dtr idtr;
+	struct td_per_vcpu_parameters per_vcpu[];
+};
+
+void td_boot(void);
+void reset_vector(void);
+void td_boot_code_end(void);
+
+#define TD_BOOT_CODE_SIZE (td_boot_code_end - td_boot)
+
+#endif /* SELFTEST_TDX_TD_BOOT_H */
diff --git a/tools/testing/selftests/kvm/include/x86/tdx/td_boot_asm.h b/tools/testing/selftests/kvm/include/x86/tdx/td_boot_asm.h
new file mode 100644
index 000000000000..10b4b527595c
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/x86/tdx/td_boot_asm.h
@@ -0,0 +1,16 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef SELFTEST_TDX_TD_BOOT_ASM_H
+#define SELFTEST_TDX_TD_BOOT_ASM_H
+
+/*
+ * GPA where TD boot parameters will be loaded.
+ *
+ * TD_BOOT_PARAMETERS_GPA is arbitrarily chosen to
+ *
+ * + be within the 4GB address space
+ * + provide enough contiguous memory for the struct td_boot_parameters such
+ *   that there is one struct td_per_vcpu_parameters for KVM_MAX_VCPUS
+ */
+#define TD_BOOT_PARAMETERS_GPA 0xffff0000
+
+#endif  // SELFTEST_TDX_TD_BOOT_ASM_H
diff --git a/tools/testing/selftests/kvm/include/x86/tdx/tdx_util.h b/tools/testing/selftests/kvm/include/x86/tdx/tdx_util.h
new file mode 100644
index 000000000000..57a2f5893ffe
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/x86/tdx/tdx_util.h
@@ -0,0 +1,19 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef SELFTESTS_TDX_KVM_UTIL_H
+#define SELFTESTS_TDX_KVM_UTIL_H
+
+#include <stdint.h>
+
+#include "kvm_util.h"
+
+void tdx_filter_cpuid(struct kvm_vm *vm, struct kvm_cpuid2 *cpuid_data);
+void __tdx_mask_cpuid_features(struct kvm_cpuid_entry2 *entry);
+
+struct kvm_vcpu *td_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, void *guest_code);
+
+struct kvm_vm *td_create(void);
+void td_initialize(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
+		   uint64_t attributes);
+void td_finalize(struct kvm_vm *vm);
+
+#endif // SELFTESTS_TDX_KVM_UTIL_H
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 40dd63f2bd05..f8cf49794eed 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -373,9 +373,9 @@  struct kvm_vm *____vm_create(struct vm_shape shape)
 	return vm;
 }
 
-static uint64_t vm_nr_pages_required(enum vm_guest_mode mode,
-				     uint32_t nr_runnable_vcpus,
-				     uint64_t extra_mem_pages)
+uint64_t vm_nr_pages_required(enum vm_guest_mode mode,
+			      uint32_t nr_runnable_vcpus,
+			      uint64_t extra_mem_pages)
 {
 	uint64_t page_size = vm_guest_mode_params[mode].page_size;
 	uint64_t nr_pages;
diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c
index 1c42af328f19..9b2c236e723a 100644
--- a/tools/testing/selftests/kvm/lib/x86/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86/processor.c
@@ -590,7 +590,7 @@  void sync_exception_handlers_to_guest(struct kvm_vm *vm)
 	*(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers;
 }
 
-static void vm_init_descriptor_tables(struct kvm_vm *vm)
+void vm_init_descriptor_tables(struct kvm_vm *vm)
 {
 	extern void *idt_handlers;
 	struct kvm_segment seg;
@@ -697,16 +697,19 @@  struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id)
 
 	vcpu = __vm_vcpu_add(vm, vcpu_id);
 	vcpu_init_cpuid(vcpu, kvm_get_supported_cpuid());
-	vcpu_init_sregs(vm, vcpu);
-	vcpu_init_xcrs(vm, vcpu);
 
 	vcpu->initial_stack_addr = stack_vaddr;
 
-	/* Setup guest general purpose registers */
-	vcpu_regs_get(vcpu, &regs);
-	regs.rflags = regs.rflags | 0x2;
-	regs.rsp = stack_vaddr;
-	vcpu_regs_set(vcpu, &regs);
+	if (!vm->arch.has_protected_regs) {
+		vcpu_init_sregs(vm, vcpu);
+		vcpu_init_xcrs(vm, vcpu);
+
+		/* Setup guest general purpose registers */
+		vcpu_regs_get(vcpu, &regs);
+		regs.rflags = regs.rflags | 0x2;
+		regs.rsp = stack_vaddr;
+		vcpu_regs_set(vcpu, &regs);
+	}
 
 	/* Setup the MP state */
 	mp_state.mp_state = 0;
diff --git a/tools/testing/selftests/kvm/lib/x86/tdx/td_boot.S b/tools/testing/selftests/kvm/lib/x86/tdx/td_boot.S
new file mode 100644
index 000000000000..c8cbe214bba9
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/x86/tdx/td_boot.S
@@ -0,0 +1,100 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#include "tdx/td_boot_asm.h"
+
+/* Offsets for reading struct td_boot_parameters. */
+#define TD_BOOT_PARAMETERS_CR0         0
+#define TD_BOOT_PARAMETERS_CR3         4
+#define TD_BOOT_PARAMETERS_CR4         8
+#define TD_BOOT_PARAMETERS_GDT         12
+#define TD_BOOT_PARAMETERS_IDT         18
+#define TD_BOOT_PARAMETERS_PER_VCPU    24
+
+/* Offsets for reading struct td_per_vcpu_parameters. */
+#define TD_PER_VCPU_PARAMETERS_ESP_GVA     0
+#define TD_PER_VCPU_PARAMETERS_LJMP_TARGET 4
+
+#define SIZEOF_TD_PER_VCPU_PARAMETERS      10
+
+.code32
+
+.globl td_boot
+td_boot:
+	/* In this procedure, edi is used as a temporary register. */
+	cli
+
+	/* Paging is off. */
+
+	movl $TD_BOOT_PARAMETERS_GPA, %ebx
+
+	/*
+	 * Find the address of struct td_per_vcpu_parameters for this
+	 * vCPU based on esi (TDX spec: initialized with vCPU id). Put
+	 * struct address into register for indirect addressing.
+	 */
+	movl $SIZEOF_TD_PER_VCPU_PARAMETERS, %eax
+	mul %esi
+	leal TD_BOOT_PARAMETERS_PER_VCPU(%ebx), %edi
+	addl %edi, %eax
+
+	/* Setup stack. */
+	movl TD_PER_VCPU_PARAMETERS_ESP_GVA(%eax), %esp
+
+	/* Setup GDT. */
+	leal TD_BOOT_PARAMETERS_GDT(%ebx), %edi
+	lgdt (%edi)
+
+	/* Setup IDT. */
+	leal TD_BOOT_PARAMETERS_IDT(%ebx), %edi
+	lidt (%edi)
+
+	/*
+	 * Set up control registers (there is no instruction to mov from
+	 * memory into a control register, hence use edi as a scratch register).
+	 */
+	movl TD_BOOT_PARAMETERS_CR4(%ebx), %edi
+	movl %edi, %cr4
+	movl TD_BOOT_PARAMETERS_CR3(%ebx), %edi
+	movl %edi, %cr3
+	movl TD_BOOT_PARAMETERS_CR0(%ebx), %edi
+	movl %edi, %cr0
+
+	/* Paging is on after setting the most significant bit (PG) of cr0. */
+
+	/*
+	 * Jump to selftest guest code. Far jumps read <segment
+	 * selector:new eip> from <addr+4:addr>. This location has
+	 * already been set up in boot parameters, and boot parameters can
+	 * be read because boot code and boot parameters are loaded so
+	 * that GVA and GPA are mapped 1:1.
+	 */
+	ljmp *TD_PER_VCPU_PARAMETERS_LJMP_TARGET(%eax)
+
+.globl reset_vector
+reset_vector:
+	jmp td_boot
+	/*
+	 * Pad reset_vector to its full size of 16 bytes so that this
+	 * can be loaded with the end of reset_vector aligned to GPA=4G.
+	 */
+	int3
+	int3
+	int3
+	int3
+	int3
+	int3
+	int3
+	int3
+	int3
+	int3
+	int3
+	int3
+	int3
+	int3
+
+/* Leave marker so size of td_boot code can be computed. */
+.globl td_boot_code_end
+td_boot_code_end:
+
+/* Disable executable stack. */
+.section .note.GNU-stack,"",%progbits
diff --git a/tools/testing/selftests/kvm/lib/x86/tdx/tdx_util.c b/tools/testing/selftests/kvm/lib/x86/tdx/tdx_util.c
new file mode 100644
index 000000000000..392d6272d17e
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/x86/tdx/tdx_util.c
@@ -0,0 +1,566 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <asm/kvm.h>
+#include <errno.h>
+#include <linux/kvm.h>
+#include <stdint.h>
+#include <sys/ioctl.h>
+
+#include "kvm_util.h"
+#include "processor.h"
+#include "tdx/td_boot.h"
+#include "test_util.h"
+
+uint64_t tdx_s_bit;
+
+/*
+ * TDX ioctls
+ */
+
+static char *tdx_cmd_str[] = {
+	"KVM_TDX_CAPABILITIES",
+	"KVM_TDX_INIT_VM",
+	"KVM_TDX_INIT_VCPU",
+	"KVM_TDX_INIT_MEM_REGION",
+	"KVM_TDX_FINALIZE_VM",
+	"KVM_TDX_GET_CPUID"
+};
+
+#define TDX_MAX_CMD_STR (ARRAY_SIZE(tdx_cmd_str))
+
+static int _tdx_ioctl(int fd, int ioctl_no, uint32_t flags, void *data)
+{
+	struct kvm_tdx_cmd tdx_cmd;
+
+	TEST_ASSERT(ioctl_no < TDX_MAX_CMD_STR, "Unknown TDX CMD : %d\n",
+		    ioctl_no);
+
+	memset(&tdx_cmd, 0x0, sizeof(tdx_cmd));
+	tdx_cmd.id = ioctl_no;
+	tdx_cmd.flags = flags;
+	tdx_cmd.data = (uint64_t)data;
+
+	return ioctl(fd, KVM_MEMORY_ENCRYPT_OP, &tdx_cmd);
+}
+
+static void tdx_ioctl(int fd, int ioctl_no, uint32_t flags, void *data)
+{
+	int r;
+
+	r = _tdx_ioctl(fd, ioctl_no, flags, data);
+	TEST_ASSERT(r == 0, "%s failed: %d  %d", tdx_cmd_str[ioctl_no], r,
+		    errno);
+}
+
+static struct kvm_tdx_capabilities *tdx_read_capabilities(struct kvm_vm *vm)
+{
+	struct kvm_tdx_capabilities *tdx_cap = NULL;
+	int nr_cpuid_configs = 4;
+	int rc = -1;
+	int i;
+
+	do {
+		nr_cpuid_configs *= 2;
+
+		tdx_cap = realloc(tdx_cap, sizeof(*tdx_cap) +
+					   sizeof(tdx_cap->cpuid) +
+					   (sizeof(struct kvm_cpuid_entry2) * nr_cpuid_configs));
+		TEST_ASSERT(tdx_cap,
+			    "Could not allocate memory for tdx capability nr_cpuid_configs %d\n",
+			    nr_cpuid_configs);
+
+		tdx_cap->cpuid.nent = nr_cpuid_configs;
+		rc = _tdx_ioctl(vm->fd, KVM_TDX_CAPABILITIES, 0, tdx_cap);
+	} while (rc < 0 && errno == E2BIG);
+
+	TEST_ASSERT(rc == 0, "KVM_TDX_CAPABILITIES failed: %d %d",
+		    rc, errno);
+
+	pr_debug("tdx_cap: supported_attrs: 0x%016llx\n"
+		 "tdx_cap: supported_xfam 0x%016llx\n",
+		 tdx_cap->supported_attrs, tdx_cap->supported_xfam);
+
+	for (i = 0; i < tdx_cap->cpuid.nent; i++) {
+		const struct kvm_cpuid_entry2 *config = &tdx_cap->cpuid.entries[i];
+
+		pr_debug("cpuid config[%d]: leaf 0x%x sub_leaf 0x%x eax 0x%08x ebx 0x%08x ecx 0x%08x edx 0x%08x\n",
+			 i, config->function, config->index,
+			 config->eax, config->ebx, config->ecx, config->edx);
+	}
+
+	return tdx_cap;
+}
+
+static struct kvm_cpuid_entry2 *tdx_find_cpuid_config(struct kvm_tdx_capabilities *cap,
+						      uint32_t leaf, uint32_t sub_leaf)
+{
+	struct kvm_cpuid_entry2 *config;
+	uint32_t i;
+
+	for (i = 0; i < cap->cpuid.nent; i++) {
+		config = &cap->cpuid.entries[i];
+
+		if (config->function == leaf && config->index == sub_leaf)
+			return config;
+	}
+
+	return NULL;
+}
+
+#define XFEATURE_MASK_CET (XFEATURE_MASK_CET_USER | XFEATURE_MASK_CET_KERNEL)
+
+static void tdx_apply_cpuid_restrictions(struct kvm_cpuid2 *cpuid_data)
+{
+	for (int i = 0; i < cpuid_data->nent; i++) {
+		struct kvm_cpuid_entry2 *e = &cpuid_data->entries[i];
+
+		if (e->function == 0xd && e->index == 0) {
+			/*
+			 * The TDX module requires XTILE_{CFG, DATA} to be set together;
+			 * clear both if only one is set, since AMX needs both.
+			 */
+			if ((e->eax & XFEATURE_MASK_XTILE) !=
+			    XFEATURE_MASK_XTILE) {
+				e->eax &= ~XFEATURE_MASK_XTILE;
+			}
+		}
+		if (e->function == 0xd && e->index == 1) {
+			/*
+			 * TDX doesn't support LBR yet.
+			 * Disable bits from the XCR0 register.
+			 */
+			e->ecx &= ~XFEATURE_MASK_LBR;
+			/*
+			 * The TDX module requires both CET_{U, S} to be set even
+			 * if only one is supported.
+			 */
+			if (e->ecx & XFEATURE_MASK_CET)
+				e->ecx |= XFEATURE_MASK_CET;
+		}
+	}
+}
+
+#define KVM_MAX_CPUID_ENTRIES 256
+
+#define CPUID_EXT_VMX			BIT(5)
+#define CPUID_EXT_SMX			BIT(6)
+#define CPUID_PSE36			BIT(17)
+#define CPUID_7_0_EBX_TSC_ADJUST	BIT(1)
+#define CPUID_7_0_EBX_SGX		BIT(2)
+#define CPUID_7_0_EBX_INTEL_PT		BIT(25)
+#define CPUID_7_0_ECX_SGX_LC		BIT(30)
+#define CPUID_APM_INVTSC		BIT(8)
+#define CPUID_8000_0008_EBX_WBNOINVD	BIT(9)
+#define CPUID_EXT_PDCM			BIT(15)
+
+#define TDX_SUPPORTED_KVM_FEATURES  ((1U << KVM_FEATURE_NOP_IO_DELAY) | \
+				     (1U << KVM_FEATURE_PV_UNHALT) |	\
+				     (1U << KVM_FEATURE_PV_TLB_FLUSH) | \
+				     (1U << KVM_FEATURE_PV_SEND_IPI) |	\
+				     (1U << KVM_FEATURE_POLL_CONTROL) | \
+				     (1U << KVM_FEATURE_PV_SCHED_YIELD) | \
+				     (1U << KVM_FEATURE_MSI_EXT_DEST_ID))
+
+void __tdx_mask_cpuid_features(struct kvm_cpuid_entry2 *entry)
+{
+	/*
+	 * Only entries with sub-leaf zero need to be masked, but some of these
+	 * leaves have other sub-leaves defined. Bail on any non-zero sub-leaf,
+	 * so they don't get unintentionally modified.
+	 */
+	if (entry->index)
+		return;
+
+	switch (entry->function) {
+	case 0x1:
+		entry->ecx &= ~(CPUID_EXT_VMX | CPUID_EXT_SMX);
+		entry->edx &= ~CPUID_PSE36;
+		break;
+	case 0x7:
+		entry->ebx &= ~(CPUID_7_0_EBX_TSC_ADJUST | CPUID_7_0_EBX_SGX);
+		entry->ebx &= ~CPUID_7_0_EBX_INTEL_PT;
+		entry->ecx &= ~CPUID_7_0_ECX_SGX_LC;
+		break;
+	case 0x40000001:
+		entry->eax &= TDX_SUPPORTED_KVM_FEATURES;
+		break;
+	case 0x80000007:
+		entry->edx |= CPUID_APM_INVTSC;
+		break;
+	case 0x80000008:
+		entry->ebx &= CPUID_8000_0008_EBX_WBNOINVD;
+		break;
+	default:
+		break;
+	}
+}
+
+static void tdx_mask_cpuid_features(struct kvm_cpuid2 *cpuid_data)
+{
+	for (int i = 0; i < cpuid_data->nent; i++)
+		__tdx_mask_cpuid_features(&cpuid_data->entries[i]);
+}
+
+void tdx_filter_cpuid(struct kvm_vm *vm, struct kvm_cpuid2 *cpuid_data)
+{
+	struct kvm_tdx_capabilities *tdx_cap;
+	struct kvm_cpuid_entry2 *config;
+	struct kvm_cpuid_entry2 *e;
+	int i;
+
+	tdx_cap = tdx_read_capabilities(vm);
+
+	i = 0;
+	while (i < cpuid_data->nent) {
+		e = cpuid_data->entries + i;
+		config = tdx_find_cpuid_config(tdx_cap, e->function, e->index);
+
+		if (!config) {
+			int left = cpuid_data->nent - i - 1;
+
+			if (left > 0)
+				memmove(cpuid_data->entries + i,
+					cpuid_data->entries + i + 1,
+					sizeof(*cpuid_data->entries) * left);
+			cpuid_data->nent--;
+			continue;
+		}
+
+		e->eax &= config->eax;
+		e->ebx &= config->ebx;
+		e->ecx &= config->ecx;
+		e->edx &= config->edx;
+
+		i++;
+	}
+
+	free(tdx_cap);
+}
+
+static void tdx_td_init(struct kvm_vm *vm, uint64_t attributes)
+{
+	struct kvm_tdx_init_vm *init_vm;
+	const struct kvm_cpuid2 *tmp;
+	struct kvm_cpuid2 *cpuid;
+
+	tmp = kvm_get_supported_cpuid();
+
+	cpuid = allocate_kvm_cpuid2(KVM_MAX_CPUID_ENTRIES);
+	memcpy(cpuid, tmp, kvm_cpuid2_size(tmp->nent));
+	tdx_mask_cpuid_features(cpuid);
+
+	init_vm = calloc(1, sizeof(*init_vm) +
+			 sizeof(init_vm->cpuid.entries[0]) * cpuid->nent);
+	TEST_ASSERT(init_vm, "vm allocation failed");
+
+	memcpy(&init_vm->cpuid, cpuid, kvm_cpuid2_size(cpuid->nent));
+	free(cpuid);
+
+	init_vm->attributes = attributes;
+
+	tdx_apply_cpuid_restrictions(&init_vm->cpuid);
+	tdx_filter_cpuid(vm, &init_vm->cpuid);
+
+	tdx_ioctl(vm->fd, KVM_TDX_INIT_VM, 0, init_vm);
+	free(init_vm);
+}
+
+static void tdx_td_vcpu_init(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid2 *cpuid;
+
+	cpuid = allocate_kvm_cpuid2(KVM_MAX_CPUID_ENTRIES);
+	tdx_ioctl(vcpu->fd, KVM_TDX_GET_CPUID, 0, cpuid);
+	vcpu_init_cpuid(vcpu, cpuid);
+	free(cpuid);
+	tdx_ioctl(vcpu->fd, KVM_TDX_INIT_VCPU, 0, NULL);
+	/*
+	 * Refresh CPUID to get KVM's "runtime" updates which are done by
+	 * KVM_TDX_INIT_VCPU.
+	 */
+	vcpu_get_cpuid(vcpu);
+}
+
+static void tdx_init_mem_region(struct kvm_vm *vm, void *source_pages,
+				uint64_t gpa, uint64_t size)
+{
+	uint32_t metadata = KVM_TDX_MEASURE_MEMORY_REGION;
+	struct kvm_tdx_init_mem_region mem_region = {
+		.source_addr = (uint64_t)source_pages,
+		.gpa = gpa,
+		.nr_pages = size / PAGE_SIZE,
+	};
+	struct kvm_vcpu *vcpu;
+
+	vcpu = list_first_entry_or_null(&vm->vcpus, struct kvm_vcpu, list);
+
+	TEST_ASSERT((mem_region.nr_pages > 0) &&
+		    ((mem_region.nr_pages * PAGE_SIZE) == size),
+		    "Cannot add partial pages to the guest memory.\n");
+	TEST_ASSERT(((uint64_t)source_pages & (PAGE_SIZE - 1)) == 0,
+		    "Source memory buffer is not page aligned\n");
+	tdx_ioctl(vcpu->fd, KVM_TDX_INIT_MEM_REGION, metadata, &mem_region);
+}
+
+static void tdx_td_finalize_mr(struct kvm_vm *vm)
+{
+	tdx_ioctl(vm->fd, KVM_TDX_FINALIZE_VM, 0, NULL);
+}
+
+/*
+ * TD creation/setup/finalization
+ */
+
+static void tdx_enable_capabilities(struct kvm_vm *vm)
+{
+	int rc;
+
+	rc = kvm_check_cap(KVM_CAP_X2APIC_API);
+	TEST_ASSERT(rc, "TDX: KVM_CAP_X2APIC_API is not supported!");
+	rc = kvm_check_cap(KVM_CAP_SPLIT_IRQCHIP);
+	TEST_ASSERT(rc, "TDX: KVM_CAP_SPLIT_IRQCHIP is not supported!");
+
+	vm_enable_cap(vm, KVM_CAP_X2APIC_API,
+		      KVM_X2APIC_API_USE_32BIT_IDS |
+			      KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK);
+	vm_enable_cap(vm, KVM_CAP_SPLIT_IRQCHIP, 24);
+}
+
+static void tdx_apply_cr4_restrictions(struct kvm_sregs *sregs)
+{
+	/* TDX spec 11.6.2: CR4 bit MCE is fixed to 1 */
+	sregs->cr4 |= X86_CR4_MCE;
+
+	/* Set this because UEFI also sets this up, to handle XMM exceptions */
+	sregs->cr4 |= X86_CR4_OSXMMEXCPT;
+
+	/* TDX spec 11.6.2: CR4 bit VMXE and SMXE are fixed to 0 */
+	sregs->cr4 &= ~(X86_CR4_VMXE | X86_CR4_SMXE);
+}
+
+static void load_td_boot_code(struct kvm_vm *vm)
+{
+	void *boot_code_hva = addr_gpa2hva(vm, FOUR_GIGABYTES_GPA - TD_BOOT_CODE_SIZE);
+
+	TEST_ASSERT(td_boot_code_end - reset_vector == 16,
+		    "The reset vector must be 16 bytes in size.");
+	memcpy(boot_code_hva, td_boot, TD_BOOT_CODE_SIZE);
+}
+
+static void load_td_per_vcpu_parameters(struct td_boot_parameters *params,
+					struct kvm_sregs *sregs,
+					struct kvm_vcpu *vcpu,
+					void *guest_code)
+{
+	struct td_per_vcpu_parameters *vcpu_params = &params->per_vcpu[vcpu->id];
+
+	TEST_ASSERT(vcpu->initial_stack_addr != 0,
+		    "initial stack address should not be 0");
+	TEST_ASSERT(vcpu->initial_stack_addr <= 0xffffffff,
+		    "initial stack address must fit in 32 bits");
+	TEST_ASSERT((uint64_t)guest_code <= 0xffffffff,
+		    "guest_code must fit in 32 bits");
+	TEST_ASSERT(sregs->cs.selector != 0, "cs.selector should not be 0");
+
+	vcpu_params->esp_gva = (uint32_t)(uint64_t)vcpu->initial_stack_addr;
+	vcpu_params->ljmp_target.eip_gva = (uint32_t)(uint64_t)guest_code;
+	vcpu_params->ljmp_target.code64_sel = sregs->cs.selector;
+}
+
+static void load_td_common_parameters(struct td_boot_parameters *params,
+				      struct kvm_sregs *sregs)
+{
+	/* Set parameters! */
+	params->cr0 = sregs->cr0;
+	params->cr3 = sregs->cr3;
+	params->cr4 = sregs->cr4;
+	params->gdtr.limit = sregs->gdt.limit;
+	params->gdtr.base = sregs->gdt.base;
+	params->idtr.limit = sregs->idt.limit;
+	params->idtr.base = sregs->idt.base;
+
+	TEST_ASSERT(params->cr0 != 0, "cr0 should not be 0");
+	TEST_ASSERT(params->cr3 != 0, "cr3 should not be 0");
+	TEST_ASSERT(params->cr4 != 0, "cr4 should not be 0");
+	TEST_ASSERT(params->gdtr.base != 0, "gdt base address should not be 0");
+	TEST_ASSERT(params->idtr.base != 0, "idt base address should not be 0");
+}
+
+static void load_td_boot_parameters(struct td_boot_parameters *params,
+				    struct kvm_vcpu *vcpu, void *guest_code)
+{
+	struct kvm_sregs sregs;
+
+	/* Assemble parameters in sregs */
+	memset(&sregs, 0, sizeof(struct kvm_sregs));
+	vcpu_setup_mode_sregs(vcpu->vm, &sregs);
+	tdx_apply_cr4_restrictions(&sregs);
+
+	if (!params->cr0)
+		load_td_common_parameters(params, &sregs);
+
+	load_td_per_vcpu_parameters(params, &sregs, vcpu, guest_code);
+}
+
+/*
+ * Adds a vCPU to a TD (Trust Domain) with minimum defaults. It will not set
+ * up any general purpose registers, as they will be initialized by the TDX
+ * module. In TDX, a vCPU's RIP is set to 0xFFFFFFF0. See the Intel TDX EAS
+ * section "Initial State of Guest GPRs" for more information on a vCPU's
+ * initial register values when first entering the TD.
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   vcpu_id - The id of the vCPU to add to the VM.
+ */
+struct kvm_vcpu *td_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, void *guest_code)
+{
+	struct kvm_vcpu *vcpu;
+
+	vm->arch.has_protected_regs = true;
+	vcpu = vm_arch_vcpu_add(vm, vcpu_id);
+
+	tdx_td_vcpu_init(vcpu);
+
+	load_td_boot_parameters(addr_gpa2hva(vm, TD_BOOT_PARAMETERS_GPA),
+				vcpu, guest_code);
+
+	return vcpu;
+}
+
+static void load_td_memory_region(struct kvm_vm *vm,
+				  struct userspace_mem_region *region)
+{
+	const struct sparsebit *pages = region->protected_phy_pages;
+	const vm_paddr_t gpa_base = region->region.guest_phys_addr;
+	const uint64_t hva_base = region->region.userspace_addr;
+	const sparsebit_idx_t lowest_page_in_region = gpa_base >> vm->page_shift;
+
+	sparsebit_idx_t i;
+	sparsebit_idx_t j;
+
+	if (!sparsebit_any_set(pages))
+		return;
+
+	sparsebit_for_each_set_range(pages, i, j) {
+		const uint64_t size_to_load = (j - i + 1) * vm->page_size;
+		const uint64_t offset =
+			(i - lowest_page_in_region) * vm->page_size;
+		const uint64_t hva = hva_base + offset;
+		const uint64_t gpa = gpa_base + offset;
+		void *source_addr;
+
+		/*
+		 * The KVM_TDX_INIT_MEM_REGION ioctl cannot encrypt memory in place,
+		 * so copy the contents into a separate buffer to use as the source.
+		 */
+		source_addr = mmap(NULL, size_to_load, PROT_READ | PROT_WRITE,
+				   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+		TEST_ASSERT(source_addr,
+			    "Could not allocate memory for loading memory region");
+
+		memcpy(source_addr, (void *)hva, size_to_load);
+
+		tdx_init_mem_region(vm, source_addr, gpa, size_to_load);
+
+		munmap(source_addr, size_to_load);
+	}
+}
+
+static void load_td_private_memory(struct kvm_vm *vm)
+{
+	struct userspace_mem_region *region;
+	int ctr;
+
+	hash_for_each(vm->regions.slot_hash, ctr, region, slot_node) {
+		load_td_memory_region(vm, region);
+	}
+}
+
+struct kvm_vm *td_create(void)
+{
+	const struct vm_shape shape = {
+		.mode = VM_MODE_DEFAULT,
+		.type = KVM_X86_TDX_VM,
+	};
+
+	return ____vm_create(shape);
+}
+
+static void td_setup_boot_code(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type)
+{
+	size_t boot_code_allocation = round_up(TD_BOOT_CODE_SIZE, PAGE_SIZE);
+	vm_paddr_t boot_code_base_gpa = FOUR_GIGABYTES_GPA - boot_code_allocation;
+	size_t npages = DIV_ROUND_UP(boot_code_allocation, PAGE_SIZE);
+	vm_vaddr_t addr;
+
+	vm_userspace_mem_region_add(vm, src_type, boot_code_base_gpa, 1, npages,
+				    KVM_MEM_GUEST_MEMFD);
+	vm->memslots[MEM_REGION_CODE] = 1;
+	addr = vm_vaddr_identity_alloc(vm, boot_code_allocation,
+				       boot_code_base_gpa, MEM_REGION_CODE);
+	TEST_ASSERT_EQ(addr, boot_code_base_gpa);
+
+	load_td_boot_code(vm);
+}
+
+static size_t td_boot_parameters_size(void)
+{
+	int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS);
+	size_t total_per_vcpu_parameters_size =
+		max_vcpus * sizeof(struct td_per_vcpu_parameters);
+
+	return sizeof(struct td_boot_parameters) + total_per_vcpu_parameters_size;
+}
+
+static void td_setup_boot_parameters(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type)
+{
+	size_t boot_params_size = td_boot_parameters_size();
+	int npages = DIV_ROUND_UP(boot_params_size, PAGE_SIZE);
+	size_t total_size = npages * PAGE_SIZE;
+	vm_vaddr_t addr;
+
+	vm_userspace_mem_region_add(vm, src_type, TD_BOOT_PARAMETERS_GPA, 2,
+				    npages, KVM_MEM_GUEST_MEMFD);
+	vm->memslots[MEM_REGION_TDX_BOOT_PARAMS] = 2;
+	addr = vm_vaddr_identity_alloc(vm, total_size, TD_BOOT_PARAMETERS_GPA,
+				       MEM_REGION_TDX_BOOT_PARAMS);
+	TEST_ASSERT_EQ(addr, TD_BOOT_PARAMETERS_GPA);
+}
+
+void td_initialize(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
+		   uint64_t attributes)
+{
+	uint64_t nr_pages_required;
+
+	tdx_enable_capabilities(vm);
+
+	tdx_td_init(vm, attributes);
+
+	nr_pages_required = vm_nr_pages_required(VM_MODE_DEFAULT, 1, 0);
+
+	/*
+	 * Add memory (add 0th memslot) for TD. This will be used to setup the
+	 * CPU (provide stack space for the CPU) and to load the elf file.
+	 */
+	vm_userspace_mem_region_add(vm, src_type, 0, 0, nr_pages_required,
+				    KVM_MEM_GUEST_MEMFD);
+
+	kvm_vm_elf_load(vm, program_invocation_name);
+	tdx_s_bit = vm->arch.s_bit;
+	sync_global_to_guest(vm, tdx_s_bit);
+
+	vm_init_descriptor_tables(vm);
+
+	td_setup_boot_code(vm, src_type);
+	td_setup_boot_parameters(vm, src_type);
+}
+
+void td_finalize(struct kvm_vm *vm)
+{
+	sync_exception_handlers_to_guest(vm);
+
+	load_td_private_memory(vm);
+
+	tdx_td_finalize_mr(vm);
+}