[v1] KVM: x86/tdx: Have TDX handle VMXON during bringup

[RFC PATCH 3/4] KVM: x86/tdx: Do VMXON and TDX-Module initialization during tdx_init()

Posted by Sean Christopherson 4 months ago

TODO: Split this up, write changelogs.

Cc: Chao Gao <chao.gao@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Xin Li (Intel) <xin@zytor.com>
Cc: Kai Huang <kai.huang@intel.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 Documentation/arch/x86/tdx.rst              |  26 --
 arch/x86/include/asm/tdx.h                  |   4 -
 arch/x86/kvm/vmx/tdx.c                      | 187 +++----------
 arch/x86/virt/vmx/tdx/tdx.c                 | 292 +++++++++++---------
 arch/x86/virt/vmx/tdx/tdx.h                 |   8 -
 arch/x86/virt/vmx/tdx/tdx_global_metadata.c |  10 +-
 6 files changed, 201 insertions(+), 326 deletions(-)

diff --git a/Documentation/arch/x86/tdx.rst b/Documentation/arch/x86/tdx.rst
index 719043cd8b46..3c96f92051af 100644
--- a/Documentation/arch/x86/tdx.rst
+++ b/Documentation/arch/x86/tdx.rst
@@ -60,32 +60,6 @@ Besides initializing the TDX module, a per-cpu initialization SEAMCALL
 must be done on one cpu before any other SEAMCALLs can be made on that
 cpu.
 
-The kernel provides two functions, tdx_enable() and tdx_cpu_enable() to
-allow the user of TDX to enable the TDX module and enable TDX on local
-cpu respectively.
-
-Making SEAMCALL requires VMXON has been done on that CPU.  Currently only
-KVM implements VMXON.  For now both tdx_enable() and tdx_cpu_enable()
-don't do VMXON internally (not trivial), but depends on the caller to
-guarantee that.
-
-To enable TDX, the caller of TDX should: 1) temporarily disable CPU
-hotplug; 2) do VMXON and tdx_enable_cpu() on all online cpus; 3) call
-tdx_enable().  For example::
-
-        cpus_read_lock();
-        on_each_cpu(vmxon_and_tdx_cpu_enable());
-        ret = tdx_enable();
-        cpus_read_unlock();
-        if (ret)
-                goto no_tdx;
-        // TDX is ready to use
-
-And the caller of TDX must guarantee the tdx_cpu_enable() has been
-successfully done on any cpu before it wants to run any other SEAMCALL.
-A typical usage is do both VMXON and tdx_cpu_enable() in CPU hotplug
-online callback, and refuse to online if tdx_cpu_enable() fails.
-
 User can consult dmesg to see whether the TDX module has been initialized.
 
 If the TDX module is initialized successfully, dmesg shows something
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 7ddef3a69866..650917b49862 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -122,8 +122,6 @@ static __always_inline u64 sc_retry(sc_func_t func, u64 fn,
 #define seamcall(_fn, _args)		sc_retry(__seamcall, (_fn), (_args))
 #define seamcall_ret(_fn, _args)	sc_retry(__seamcall_ret, (_fn), (_args))
 #define seamcall_saved_ret(_fn, _args)	sc_retry(__seamcall_saved_ret, (_fn), (_args))
-int tdx_cpu_enable(void);
-int tdx_enable(void);
 const char *tdx_dump_mce_info(struct mce *m);
 const struct tdx_sys_info *tdx_get_sysinfo(void);
 
@@ -196,8 +194,6 @@ u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td);
 u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page);
 #else
 static inline void tdx_init(void) { }
-static inline int tdx_cpu_enable(void) { return -ENODEV; }
-static inline int tdx_enable(void)  { return -ENODEV; }
 static inline u32 tdx_get_nr_guest_keyids(void) { return 0; }
 static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; }
 static inline const struct tdx_sys_info *tdx_get_sysinfo(void) { return NULL; }
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index bbf5ee7ec6ba..d89382971076 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -46,7 +46,7 @@ module_param_named(tdx, enable_tdx, bool, 0444);
 #define TDX_SHARED_BIT_PWL_5 gpa_to_gfn(BIT_ULL(51))
 #define TDX_SHARED_BIT_PWL_4 gpa_to_gfn(BIT_ULL(47))
 
-static enum cpuhp_state tdx_cpuhp_state;
+static enum cpuhp_state tdx_cpuhp_state __ro_after_init;
 
 static const struct tdx_sys_info *tdx_sysinfo;
 
@@ -3345,17 +3345,7 @@ int tdx_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private)
 
 static int tdx_online_cpu(unsigned int cpu)
 {
-	unsigned long flags;
-	int r;
-
-	/* Sanity check CPU is already in post-VMXON */
-	WARN_ON_ONCE(!(cr4_read_shadow() & X86_CR4_VMXE));
-
-	local_irq_save(flags);
-	r = tdx_cpu_enable();
-	local_irq_restore(flags);
-
-	return r;
+	return 0;
 }
 
 static int tdx_offline_cpu(unsigned int cpu)
@@ -3394,51 +3384,6 @@ static int tdx_offline_cpu(unsigned int cpu)
 	return -EBUSY;
 }
 
-static void __do_tdx_cleanup(void)
-{
-	/*
-	 * Once TDX module is initialized, it cannot be disabled and
-	 * re-initialized again w/o runtime update (which isn't
-	 * supported by kernel).  Only need to remove the cpuhp here.
-	 * The TDX host core code tracks TDX status and can handle
-	 * 'multiple enabling' scenario.
-	 */
-	WARN_ON_ONCE(!tdx_cpuhp_state);
-	cpuhp_remove_state_nocalls_cpuslocked(tdx_cpuhp_state);
-	tdx_cpuhp_state = 0;
-}
-
-static void __tdx_cleanup(void)
-{
-	cpus_read_lock();
-	__do_tdx_cleanup();
-	cpus_read_unlock();
-}
-
-static int __init __do_tdx_bringup(void)
-{
-	int r;
-
-	/*
-	 * TDX-specific cpuhp callback to call tdx_cpu_enable() on all
-	 * online CPUs before calling tdx_enable(), and on any new
-	 * going-online CPU to make sure it is ready for TDX guest.
-	 */
-	r = cpuhp_setup_state_cpuslocked(CPUHP_AP_ONLINE_DYN,
-					 "kvm/cpu/tdx:online",
-					 tdx_online_cpu, tdx_offline_cpu);
-	if (r < 0)
-		return r;
-
-	tdx_cpuhp_state = r;
-
-	r = tdx_enable();
-	if (r)
-		__do_tdx_cleanup();
-
-	return r;
-}
-
 static int __init __tdx_bringup(void)
 {
 	const struct tdx_sys_info_td_conf *td_conf;
@@ -3462,34 +3407,18 @@ static int __init __tdx_bringup(void)
 		}
 	}
 
-	/*
-	 * Enabling TDX requires enabling hardware virtualization first,
-	 * as making SEAMCALLs requires CPU being in post-VMXON state.
-	 */
-	r = kvm_enable_virtualization();
-	if (r)
-		return r;
-
-	cpus_read_lock();
-	r = __do_tdx_bringup();
-	cpus_read_unlock();
-
-	if (r)
-		goto tdx_bringup_err;
-
-	r = -EINVAL;
 	/* Get TDX global information for later use */
 	tdx_sysinfo = tdx_get_sysinfo();
-	if (WARN_ON_ONCE(!tdx_sysinfo))
-		goto get_sysinfo_err;
+	if (!tdx_sysinfo)
+		return -EINVAL;
 
 	/* Check TDX module and KVM capabilities */
 	if (!tdx_get_supported_attrs(&tdx_sysinfo->td_conf) ||
 	    !tdx_get_supported_xfam(&tdx_sysinfo->td_conf))
-		goto get_sysinfo_err;
+		return -EINVAL;
 
 	if (!(tdx_sysinfo->features.tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM))
-		goto get_sysinfo_err;
+		return -EINVAL;
 
 	/*
 	 * TDX has its own limit of maximum vCPUs it can support for all
@@ -3524,34 +3453,31 @@ static int __init __tdx_bringup(void)
 	if (td_conf->max_vcpus_per_td < num_present_cpus()) {
 		pr_err("Disable TDX: MAX_VCPU_PER_TD (%u) smaller than number of logical CPUs (%u).\n",
 				td_conf->max_vcpus_per_td, num_present_cpus());
-		goto get_sysinfo_err;
+		return -EINVAL;
 	}
 
 	if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids()))
-		goto get_sysinfo_err;
+		return -EINVAL;
 
 	/*
-	 * Leave hardware virtualization enabled after TDX is enabled
-	 * successfully.  TDX CPU hotplug depends on this.
+	 * TDX-specific cpuhp callback to disallow offlining the last CPU in a
+	 * packing while KVM is running one or more TDs.  Reclaiming HKIDs
+	 * requires doing PAGE.WBINVD on every package, i.e. offlining all CPUs
+	 * of a package would prevent reclaiming the HKID.
 	 */
+	r = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "kvm/cpu/tdx:online",
+			      tdx_online_cpu, tdx_offline_cpu);
+	if (r < 0)
+		goto err_cpuhup;
+
+	tdx_cpuhp_state = r;
 	return 0;
 
-get_sysinfo_err:
-	__tdx_cleanup();
-tdx_bringup_err:
-	kvm_disable_virtualization();
+err_cpuhup:
+	misc_cg_set_capacity(MISC_CG_RES_TDX, 0);
 	return r;
 }
 
-void tdx_cleanup(void)
-{
-	if (enable_tdx) {
-		misc_cg_set_capacity(MISC_CG_RES_TDX, 0);
-		__tdx_cleanup();
-		kvm_disable_virtualization();
-	}
-}
-
 int __init tdx_bringup(void)
 {
 	int r, i;
@@ -3563,6 +3489,16 @@ int __init tdx_bringup(void)
 	if (!enable_tdx)
 		return 0;
 
+	if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) {
+		pr_err("TDX not supported by the host platform\n");
+		goto success_disable_tdx;
+	}
+
+	if (!cpu_feature_enabled(X86_FEATURE_OSXSAVE)) {
+		pr_err("OSXSAVE is required for TDX\n");
+		return -EINVAL;
+	}
+
 	if (!enable_ept) {
 		pr_err("EPT is required for TDX\n");
 		goto success_disable_tdx;
@@ -3578,61 +3514,9 @@ int __init tdx_bringup(void)
 		goto success_disable_tdx;
 	}
 
-	if (!cpu_feature_enabled(X86_FEATURE_OSXSAVE)) {
-		pr_err("tdx: OSXSAVE is required for TDX\n");
-		goto success_disable_tdx;
-	}
-
-	if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) {
-		pr_err("tdx: MOVDIR64B is required for TDX\n");
-		goto success_disable_tdx;
-	}
-
-	if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) {
-		pr_err("Self-snoop is required for TDX\n");
-		goto success_disable_tdx;
-	}
-
-	if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) {
-		pr_err("tdx: no TDX private KeyIDs available\n");
-		goto success_disable_tdx;
-	}
-
-	if (!enable_virt_at_load) {
-		pr_err("tdx: tdx requires kvm.enable_virt_at_load=1\n");
-		goto success_disable_tdx;
-	}
-
-	/*
-	 * Ideally KVM should probe whether TDX module has been loaded
-	 * first and then try to bring it up.  But TDX needs to use SEAMCALL
-	 * to probe whether the module is loaded (there is no CPUID or MSR
-	 * for that), and making SEAMCALL requires enabling virtualization
-	 * first, just like the rest steps of bringing up TDX module.
-	 *
-	 * So, for simplicity do everything in __tdx_bringup(); the first
-	 * SEAMCALL will return -ENODEV when the module is not loaded.  The
-	 * only complication is having to make sure that initialization
-	 * SEAMCALLs don't return TDX_SEAMCALL_VMFAILINVALID in other
-	 * cases.
-	 */
 	r = __tdx_bringup();
-	if (r) {
-		/*
-		 * Disable TDX only but don't fail to load module if the TDX
-		 * module could not be loaded.  No need to print message saying
-		 * "module is not loaded" because it was printed when the first
-		 * SEAMCALL failed.  Don't bother unwinding the S-EPT hooks or
-		 * vm_size, as kvm_x86_ops have already been finalized (and are
-		 * intentionally not exported).  The S-EPT code is unreachable,
-		 * and allocating a few more bytes per VM in a should-be-rare
-		 * failure scenario is a non-issue.
-		 */
-		if (r == -ENODEV)
-			goto success_disable_tdx;
-
+	if (r)
 		enable_tdx = 0;
-	}
 
 	return r;
 
@@ -3641,6 +3525,15 @@ int __init tdx_bringup(void)
 	return 0;
 }
 
+void tdx_cleanup(void)
+{
+	if (!enable_tdx)
+		return;
+
+	misc_cg_set_capacity(MISC_CG_RES_TDX, 0);
+	cpuhp_remove_state(tdx_cpuhp_state);
+}
+
 void __init tdx_hardware_setup(void)
 {
 	KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_tdx);
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index c7a9a087ccaf..bf1c1cdd9690 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -28,6 +28,7 @@
 #include <linux/log2.h>
 #include <linux/acpi.h>
 #include <linux/suspend.h>
+#include <linux/syscore_ops.h>
 #include <linux/idr.h>
 #include <asm/page.h>
 #include <asm/special_insns.h>
@@ -38,6 +39,7 @@
 #include <asm/cpu_device_id.h>
 #include <asm/processor.h>
 #include <asm/mce.h>
+#include <asm/virt.h>
 #include "tdx.h"
 
 static u32 tdx_global_keyid __ro_after_init;
@@ -50,13 +52,11 @@ static DEFINE_PER_CPU(bool, tdx_lp_initialized);
 
 static struct tdmr_info_list tdx_tdmr_list;
 
-static enum tdx_module_status_t tdx_module_status;
-static DEFINE_MUTEX(tdx_module_lock);
-
 /* All TDX-usable memory regions.  Protected by mem_hotplug_lock. */
 static LIST_HEAD(tdx_memlist);
 
-static struct tdx_sys_info tdx_sysinfo;
+static struct tdx_sys_info tdx_sysinfo __ro_after_init;
+static bool tdx_module_initialized __ro_after_init;
 
 typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args);
 
@@ -141,26 +141,15 @@ static int try_init_module_global(void)
 }
 
 /**
- * tdx_cpu_enable - Enable TDX on local cpu
- *
- * Do one-time TDX module per-cpu initialization SEAMCALL (and TDX module
- * global initialization SEAMCALL if not done) on local cpu to make this
- * cpu be ready to run any other SEAMCALLs.
- *
- * Always call this function via IPI function calls.
- *
- * Return 0 on success, otherwise errors.
+ * Enable VMXON and then do one-time TDX module per-cpu initialization SEAMCALL
+ * (and TDX module global initialization SEAMCALL if not done) on local cpu to
+ * make this cpu be ready to run any other SEAMCALLs.
  */
-int tdx_cpu_enable(void)
+static int tdx_cpu_enable(void)
 {
 	struct tdx_module_args args = {};
 	int ret;
 
-	if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
-		return -ENODEV;
-
-	lockdep_assert_irqs_disabled();
-
 	if (__this_cpu_read(tdx_lp_initialized))
 		return 0;
 
@@ -181,15 +170,65 @@ int tdx_cpu_enable(void)
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(tdx_cpu_enable);
+
+static int tdx_online_cpu(unsigned int cpu)
+{
+	int ret;
+
+	guard(irqsave)();
+
+	ret = x86_virt_get_cpu(X86_FEATURE_VMX);
+	if (ret)
+		return ret;
+
+	ret = tdx_cpu_enable();
+	if (ret)
+		x86_virt_put_cpu(X86_FEATURE_VMX);
+
+	return ret;
+}
+
+static int tdx_offline_cpu(unsigned int cpu)
+{
+	x86_virt_put_cpu(X86_FEATURE_VMX);
+	return 0;
+}
+
+static void tdx_shutdown_cpu(void *ign)
+{
+	x86_virt_put_cpu(X86_FEATURE_VMX);
+}
+
+static void tdx_shutdown(void)
+{
+	on_each_cpu(tdx_shutdown_cpu, NULL, 1);
+}
+
+static int tdx_suspend(void)
+{
+	x86_virt_put_cpu(X86_FEATURE_VMX);
+	return 0;
+}
+
+static void tdx_resume(void)
+{
+	WARN_ON_ONCE(x86_virt_get_cpu(X86_FEATURE_VMX));
+}
+
+static struct syscore_ops tdx_syscore_ops = {
+	.suspend = tdx_suspend,
+	.resume = tdx_resume,
+	.shutdown = tdx_shutdown,
+};
 
 /*
  * Add a memory region as a TDX memory block.  The caller must make sure
  * all memory regions are added in address ascending order and don't
  * overlap.
  */
-static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn,
-			    unsigned long end_pfn, int nid)
+static __init int add_tdx_memblock(struct list_head *tmb_list,
+				   unsigned long start_pfn,
+				   unsigned long end_pfn, int nid)
 {
 	struct tdx_memblock *tmb;
 
@@ -207,7 +246,7 @@ static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn,
 	return 0;
 }
 
-static void free_tdx_memlist(struct list_head *tmb_list)
+static __init void free_tdx_memlist(struct list_head *tmb_list)
 {
 	/* @tmb_list is protected by mem_hotplug_lock */
 	while (!list_empty(tmb_list)) {
@@ -225,7 +264,7 @@ static void free_tdx_memlist(struct list_head *tmb_list)
  * ranges off in a secondary structure because memblock is modified
  * in memory hotplug while TDX memory regions are fixed.
  */
-static int build_tdx_memlist(struct list_head *tmb_list)
+static __init int build_tdx_memlist(struct list_head *tmb_list)
 {
 	unsigned long start_pfn, end_pfn;
 	int i, nid, ret;
@@ -257,7 +296,7 @@ static int build_tdx_memlist(struct list_head *tmb_list)
 	return ret;
 }
 
-static int read_sys_metadata_field(u64 field_id, u64 *data)
+static __init int read_sys_metadata_field(u64 field_id, u64 *data)
 {
 	struct tdx_module_args args = {};
 	int ret;
@@ -279,7 +318,7 @@ static int read_sys_metadata_field(u64 field_id, u64 *data)
 
 #include "tdx_global_metadata.c"
 
-static int check_features(struct tdx_sys_info *sysinfo)
+static __init int check_features(struct tdx_sys_info *sysinfo)
 {
 	u64 tdx_features0 = sysinfo->features.tdx_features0;
 
@@ -292,7 +331,7 @@ static int check_features(struct tdx_sys_info *sysinfo)
 }
 
 /* Calculate the actual TDMR size */
-static int tdmr_size_single(u16 max_reserved_per_tdmr)
+static __init int tdmr_size_single(u16 max_reserved_per_tdmr)
 {
 	int tdmr_sz;
 
@@ -306,8 +345,8 @@ static int tdmr_size_single(u16 max_reserved_per_tdmr)
 	return ALIGN(tdmr_sz, TDMR_INFO_ALIGNMENT);
 }
 
-static int alloc_tdmr_list(struct tdmr_info_list *tdmr_list,
-			   struct tdx_sys_info_tdmr *sysinfo_tdmr)
+static __init int alloc_tdmr_list(struct tdmr_info_list *tdmr_list,
+				  struct tdx_sys_info_tdmr *sysinfo_tdmr)
 {
 	size_t tdmr_sz, tdmr_array_sz;
 	void *tdmr_array;
@@ -338,7 +377,7 @@ static int alloc_tdmr_list(struct tdmr_info_list *tdmr_list,
 	return 0;
 }
 
-static void free_tdmr_list(struct tdmr_info_list *tdmr_list)
+static __init void free_tdmr_list(struct tdmr_info_list *tdmr_list)
 {
 	free_pages_exact(tdmr_list->tdmrs,
 			tdmr_list->max_tdmrs * tdmr_list->tdmr_sz);
@@ -367,8 +406,8 @@ static inline u64 tdmr_end(struct tdmr_info *tdmr)
  * preallocated @tdmr_list, following all the special alignment
  * and size rules for TDMR.
  */
-static int fill_out_tdmrs(struct list_head *tmb_list,
-			  struct tdmr_info_list *tdmr_list)
+static __init int fill_out_tdmrs(struct list_head *tmb_list,
+				 struct tdmr_info_list *tdmr_list)
 {
 	struct tdx_memblock *tmb;
 	int tdmr_idx = 0;
@@ -444,8 +483,8 @@ static int fill_out_tdmrs(struct list_head *tmb_list,
  * Calculate PAMT size given a TDMR and a page size.  The returned
  * PAMT size is always aligned up to 4K page boundary.
  */
-static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr, int pgsz,
-				      u16 pamt_entry_size)
+static __init unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr, int pgsz,
+					     u16 pamt_entry_size)
 {
 	unsigned long pamt_sz, nr_pamt_entries;
 
@@ -476,7 +515,7 @@ static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr, int pgsz,
  * PAMT.  This node will have some memory covered by the TDMR.  The
  * relative amount of memory covered is not considered.
  */
-static int tdmr_get_nid(struct tdmr_info *tdmr, struct list_head *tmb_list)
+static __init int tdmr_get_nid(struct tdmr_info *tdmr, struct list_head *tmb_list)
 {
 	struct tdx_memblock *tmb;
 
@@ -505,9 +544,9 @@ static int tdmr_get_nid(struct tdmr_info *tdmr, struct list_head *tmb_list)
  * Allocate PAMTs from the local NUMA node of some memory in @tmb_list
  * within @tdmr, and set up PAMTs for @tdmr.
  */
-static int tdmr_set_up_pamt(struct tdmr_info *tdmr,
-			    struct list_head *tmb_list,
-			    u16 pamt_entry_size[])
+static __init int tdmr_set_up_pamt(struct tdmr_info *tdmr,
+				   struct list_head *tmb_list,
+				   u16 pamt_entry_size[])
 {
 	unsigned long pamt_base[TDX_PS_NR];
 	unsigned long pamt_size[TDX_PS_NR];
@@ -577,7 +616,7 @@ static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_base,
 	*pamt_size = pamt_sz;
 }
 
-static void tdmr_do_pamt_func(struct tdmr_info *tdmr,
+static __init void tdmr_do_pamt_func(struct tdmr_info *tdmr,
 		void (*pamt_func)(unsigned long base, unsigned long size))
 {
 	unsigned long pamt_base, pamt_size;
@@ -594,17 +633,17 @@ static void tdmr_do_pamt_func(struct tdmr_info *tdmr,
 	pamt_func(pamt_base, pamt_size);
 }
 
-static void free_pamt(unsigned long pamt_base, unsigned long pamt_size)
+static __init  void free_pamt(unsigned long pamt_base, unsigned long pamt_size)
 {
 	free_contig_range(pamt_base >> PAGE_SHIFT, pamt_size >> PAGE_SHIFT);
 }
 
-static void tdmr_free_pamt(struct tdmr_info *tdmr)
+static __init void tdmr_free_pamt(struct tdmr_info *tdmr)
 {
 	tdmr_do_pamt_func(tdmr, free_pamt);
 }
 
-static void tdmrs_free_pamt_all(struct tdmr_info_list *tdmr_list)
+static __init void tdmrs_free_pamt_all(struct tdmr_info_list *tdmr_list)
 {
 	int i;
 
@@ -613,9 +652,9 @@ static void tdmrs_free_pamt_all(struct tdmr_info_list *tdmr_list)
 }
 
 /* Allocate and set up PAMTs for all TDMRs */
-static int tdmrs_set_up_pamt_all(struct tdmr_info_list *tdmr_list,
-				 struct list_head *tmb_list,
-				 u16 pamt_entry_size[])
+static __init int tdmrs_set_up_pamt_all(struct tdmr_info_list *tdmr_list,
+					struct list_head *tmb_list,
+					u16 pamt_entry_size[])
 {
 	int i, ret = 0;
 
@@ -654,12 +693,12 @@ static void reset_tdx_pages(unsigned long base, unsigned long size)
 	mb();
 }
 
-static void tdmr_reset_pamt(struct tdmr_info *tdmr)
+static __init void tdmr_reset_pamt(struct tdmr_info *tdmr)
 {
 	tdmr_do_pamt_func(tdmr, reset_tdx_pages);
 }
 
-static void tdmrs_reset_pamt_all(struct tdmr_info_list *tdmr_list)
+static __init void tdmrs_reset_pamt_all(struct tdmr_info_list *tdmr_list)
 {
 	int i;
 
@@ -667,7 +706,7 @@ static void tdmrs_reset_pamt_all(struct tdmr_info_list *tdmr_list)
 		tdmr_reset_pamt(tdmr_entry(tdmr_list, i));
 }
 
-static unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list)
+static __init unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list)
 {
 	unsigned long pamt_size = 0;
 	int i;
@@ -682,8 +721,8 @@ static unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list)
 	return pamt_size / 1024;
 }
 
-static int tdmr_add_rsvd_area(struct tdmr_info *tdmr, int *p_idx, u64 addr,
-			      u64 size, u16 max_reserved_per_tdmr)
+static __init int tdmr_add_rsvd_area(struct tdmr_info *tdmr, int *p_idx,
+				     u64 addr, u64 size, u16 max_reserved_per_tdmr)
 {
 	struct tdmr_reserved_area *rsvd_areas = tdmr->reserved_areas;
 	int idx = *p_idx;
@@ -716,10 +755,10 @@ static int tdmr_add_rsvd_area(struct tdmr_info *tdmr, int *p_idx, u64 addr,
  * those holes fall within @tdmr, set up a TDMR reserved area to cover
  * the hole.
  */
-static int tdmr_populate_rsvd_holes(struct list_head *tmb_list,
-				    struct tdmr_info *tdmr,
-				    int *rsvd_idx,
-				    u16 max_reserved_per_tdmr)
+static __init int tdmr_populate_rsvd_holes(struct list_head *tmb_list,
+					   struct tdmr_info *tdmr,
+					   int *rsvd_idx,
+					   u16 max_reserved_per_tdmr)
 {
 	struct tdx_memblock *tmb;
 	u64 prev_end;
@@ -780,10 +819,10 @@ static int tdmr_populate_rsvd_holes(struct list_head *tmb_list,
  * overlaps with @tdmr, set up a TDMR reserved area to cover the
  * overlapping part.
  */
-static int tdmr_populate_rsvd_pamts(struct tdmr_info_list *tdmr_list,
-				    struct tdmr_info *tdmr,
-				    int *rsvd_idx,
-				    u16 max_reserved_per_tdmr)
+static __init int tdmr_populate_rsvd_pamts(struct tdmr_info_list *tdmr_list,
+					   struct tdmr_info *tdmr,
+					   int *rsvd_idx,
+					   u16 max_reserved_per_tdmr)
 {
 	int i, ret;
 
@@ -818,7 +857,7 @@ static int tdmr_populate_rsvd_pamts(struct tdmr_info_list *tdmr_list,
 }
 
 /* Compare function called by sort() for TDMR reserved areas */
-static int rsvd_area_cmp_func(const void *a, const void *b)
+static __init int rsvd_area_cmp_func(const void *a, const void *b)
 {
 	struct tdmr_reserved_area *r1 = (struct tdmr_reserved_area *)a;
 	struct tdmr_reserved_area *r2 = (struct tdmr_reserved_area *)b;
@@ -837,10 +876,10 @@ static int rsvd_area_cmp_func(const void *a, const void *b)
  * Populate reserved areas for the given @tdmr, including memory holes
  * (via @tmb_list) and PAMTs (via @tdmr_list).
  */
-static int tdmr_populate_rsvd_areas(struct tdmr_info *tdmr,
-				    struct list_head *tmb_list,
-				    struct tdmr_info_list *tdmr_list,
-				    u16 max_reserved_per_tdmr)
+static __init int tdmr_populate_rsvd_areas(struct tdmr_info *tdmr,
+					   struct list_head *tmb_list,
+					   struct tdmr_info_list *tdmr_list,
+					   u16 max_reserved_per_tdmr)
 {
 	int ret, rsvd_idx = 0;
 
@@ -865,9 +904,9 @@ static int tdmr_populate_rsvd_areas(struct tdmr_info *tdmr,
  * Populate reserved areas for all TDMRs in @tdmr_list, including memory
  * holes (via @tmb_list) and PAMTs.
  */
-static int tdmrs_populate_rsvd_areas_all(struct tdmr_info_list *tdmr_list,
-					 struct list_head *tmb_list,
-					 u16 max_reserved_per_tdmr)
+static __init int tdmrs_populate_rsvd_areas_all(struct tdmr_info_list *tdmr_list,
+						struct list_head *tmb_list,
+						u16 max_reserved_per_tdmr)
 {
 	int i;
 
@@ -888,9 +927,9 @@ static int tdmrs_populate_rsvd_areas_all(struct tdmr_info_list *tdmr_list,
  * to cover all TDX memory regions in @tmb_list based on the TDX module
  * TDMR global information in @sysinfo_tdmr.
  */
-static int construct_tdmrs(struct list_head *tmb_list,
-			   struct tdmr_info_list *tdmr_list,
-			   struct tdx_sys_info_tdmr *sysinfo_tdmr)
+static __init int construct_tdmrs(struct list_head *tmb_list,
+				  struct tdmr_info_list *tdmr_list,
+				  struct tdx_sys_info_tdmr *sysinfo_tdmr)
 {
 	u16 pamt_entry_size[TDX_PS_NR] = {
 		sysinfo_tdmr->pamt_4k_entry_size,
@@ -922,7 +961,8 @@ static int construct_tdmrs(struct list_head *tmb_list,
 	return ret;
 }
 
-static int config_tdx_module(struct tdmr_info_list *tdmr_list, u64 global_keyid)
+static __init int config_tdx_module(struct tdmr_info_list *tdmr_list,
+				    u64 global_keyid)
 {
 	struct tdx_module_args args = {};
 	u64 *tdmr_pa_array;
@@ -1015,7 +1055,7 @@ static int config_global_keyid(void)
 	return ret;
 }
 
-static int init_tdmr(struct tdmr_info *tdmr)
+static __init int init_tdmr(struct tdmr_info *tdmr)
 {
 	u64 next;
 
@@ -1046,7 +1086,7 @@ static int init_tdmr(struct tdmr_info *tdmr)
 	return 0;
 }
 
-static int init_tdmrs(struct tdmr_info_list *tdmr_list)
+static __init int init_tdmrs(struct tdmr_info_list *tdmr_list)
 {
 	int i;
 
@@ -1065,7 +1105,7 @@ static int init_tdmrs(struct tdmr_info_list *tdmr_list)
 	return 0;
 }
 
-static int init_tdx_module(void)
+static __init int init_tdx_module(void)
 {
 	int ret;
 
@@ -1154,68 +1194,45 @@ static int init_tdx_module(void)
 	goto out_put_tdxmem;
 }
 
-static int __tdx_enable(void)
+static __init int tdx_enable(void)
 {
+	enum cpuhp_state state;
 	int ret;
 
+	if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) {
+		pr_err("XSAVE is required for TDX\n");
+		return -EINVAL;
+	}
+
+	if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) {
+		pr_err("MOVDIR64B is required for TDX\n");
+		return -EINVAL;
+	}
+
+	if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) {
+		pr_err("Self-snoop is required for TDX\n");
+		return -ENODEV;
+	}
+
+	state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "virt/tdx:online",
+				  tdx_online_cpu, tdx_offline_cpu);
+	if (state < 0)
+		return state;
+
 	ret = init_tdx_module();
 	if (ret) {
-		pr_err("module initialization failed (%d)\n", ret);
-		tdx_module_status = TDX_MODULE_ERROR;
+		pr_err("TDX-Module initialization failed (%d)\n", ret);
+		cpuhp_remove_state(state);
 		return ret;
 	}
 
-	pr_info("module initialized\n");
-	tdx_module_status = TDX_MODULE_INITIALIZED;
+	register_syscore_ops(&tdx_syscore_ops);
 
+	tdx_module_initialized = true;
+	pr_info("TDX-Module initialized\n");
 	return 0;
 }
 
-/**
- * tdx_enable - Enable TDX module to make it ready to run TDX guests
- *
- * This function assumes the caller has: 1) held read lock of CPU hotplug
- * lock to prevent any new cpu from becoming online; 2) done both VMXON
- * and tdx_cpu_enable() on all online cpus.
- *
- * This function requires there's at least one online cpu for each CPU
- * package to succeed.
- *
- * This function can be called in parallel by multiple callers.
- *
- * Return 0 if TDX is enabled successfully, otherwise error.
- */
-int tdx_enable(void)
-{
-	int ret;
-
-	if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
-		return -ENODEV;
-
-	lockdep_assert_cpus_held();
-
-	mutex_lock(&tdx_module_lock);
-
-	switch (tdx_module_status) {
-	case TDX_MODULE_UNINITIALIZED:
-		ret = __tdx_enable();
-		break;
-	case TDX_MODULE_INITIALIZED:
-		/* Already initialized, great, tell the caller. */
-		ret = 0;
-		break;
-	default:
-		/* Failed to initialize in the previous attempts */
-		ret = -EINVAL;
-		break;
-	}
-
-	mutex_unlock(&tdx_module_lock);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(tdx_enable);
-
 static bool is_pamt_page(unsigned long phys)
 {
 	struct tdmr_info_list *tdmr_list = &tdx_tdmr_list;
@@ -1445,11 +1462,6 @@ void __init tdx_init(void)
 		return;
 	}
 
-#if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND)
-	pr_info("Disable ACPI S3. Turn off TDX in the BIOS to use ACPI S3.\n");
-	acpi_suspend_lowlevel = NULL;
-#endif
-
 	/*
 	 * Just use the first TDX KeyID as the 'global KeyID' and
 	 * leave the rest for TDX guests.
@@ -1458,22 +1470,30 @@ void __init tdx_init(void)
 	tdx_guest_keyid_start = tdx_keyid_start + 1;
 	tdx_nr_guest_keyids = nr_tdx_keyids - 1;
 
+	err = tdx_enable();
+	if (err)
+		goto err_enable;
+
+#if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND)
+	pr_info("Disable ACPI S3. Turn off TDX in the BIOS to use ACPI S3.\n");
+	acpi_suspend_lowlevel = NULL;
+#endif
+
 	setup_force_cpu_cap(X86_FEATURE_TDX_HOST_PLATFORM);
 
 	check_tdx_erratum();
+	return;
+
+err_enable:
+	unregister_memory_notifier(&tdx_memory_nb);
 }
 
 const struct tdx_sys_info *tdx_get_sysinfo(void)
 {
-	const struct tdx_sys_info *p = NULL;
+	if (!tdx_module_initialized)
+		return NULL;
 
-	/* Make sure all fields in @tdx_sysinfo have been populated */
-	mutex_lock(&tdx_module_lock);
-	if (tdx_module_status == TDX_MODULE_INITIALIZED)
-		p = (const struct tdx_sys_info *)&tdx_sysinfo;
-	mutex_unlock(&tdx_module_lock);
-
-	return p;
+	return (const struct tdx_sys_info *)&tdx_sysinfo;
 }
 EXPORT_SYMBOL_GPL(tdx_get_sysinfo);
 
diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h
index 82bb82be8567..dde219c823b4 100644
--- a/arch/x86/virt/vmx/tdx/tdx.h
+++ b/arch/x86/virt/vmx/tdx/tdx.h
@@ -91,14 +91,6 @@ struct tdmr_info {
  * Do not put any hardware-defined TDX structure representations below
  * this comment!
  */
-
-/* Kernel defined TDX module status during module initialization. */
-enum tdx_module_status_t {
-	TDX_MODULE_UNINITIALIZED,
-	TDX_MODULE_INITIALIZED,
-	TDX_MODULE_ERROR
-};
-
 struct tdx_memblock {
 	struct list_head list;
 	unsigned long start_pfn;
diff --git a/arch/x86/virt/vmx/tdx/tdx_global_metadata.c b/arch/x86/virt/vmx/tdx/tdx_global_metadata.c
index 13ad2663488b..360963bc9328 100644
--- a/arch/x86/virt/vmx/tdx/tdx_global_metadata.c
+++ b/arch/x86/virt/vmx/tdx/tdx_global_metadata.c
@@ -7,7 +7,7 @@
  * Include this file to other C file instead.
  */
 
-static int get_tdx_sys_info_features(struct tdx_sys_info_features *sysinfo_features)
+static __init int get_tdx_sys_info_features(struct tdx_sys_info_features *sysinfo_features)
 {
 	int ret = 0;
 	u64 val;
@@ -18,7 +18,7 @@ static int get_tdx_sys_info_features(struct tdx_sys_info_features *sysinfo_featu
 	return ret;
 }
 
-static int get_tdx_sys_info_tdmr(struct tdx_sys_info_tdmr *sysinfo_tdmr)
+static __init int get_tdx_sys_info_tdmr(struct tdx_sys_info_tdmr *sysinfo_tdmr)
 {
 	int ret = 0;
 	u64 val;
@@ -37,7 +37,7 @@ static int get_tdx_sys_info_tdmr(struct tdx_sys_info_tdmr *sysinfo_tdmr)
 	return ret;
 }
 
-static int get_tdx_sys_info_td_ctrl(struct tdx_sys_info_td_ctrl *sysinfo_td_ctrl)
+static __init int get_tdx_sys_info_td_ctrl(struct tdx_sys_info_td_ctrl *sysinfo_td_ctrl)
 {
 	int ret = 0;
 	u64 val;
@@ -52,7 +52,7 @@ static int get_tdx_sys_info_td_ctrl(struct tdx_sys_info_td_ctrl *sysinfo_td_ctrl
 	return ret;
 }
 
-static int get_tdx_sys_info_td_conf(struct tdx_sys_info_td_conf *sysinfo_td_conf)
+static __init int get_tdx_sys_info_td_conf(struct tdx_sys_info_td_conf *sysinfo_td_conf)
 {
 	int ret = 0;
 	u64 val;
@@ -85,7 +85,7 @@ static int get_tdx_sys_info_td_conf(struct tdx_sys_info_td_conf *sysinfo_td_conf
 	return ret;
 }
 
-static int get_tdx_sys_info(struct tdx_sys_info *sysinfo)
+static __init int get_tdx_sys_info(struct tdx_sys_info *sysinfo)
 {
 	int ret = 0;
 
-- 
2.51.0.740.g6adb054d12-goog

Re: [RFC PATCH 3/4] KVM: x86/tdx: Do VMXON and TDX-Module initialization during tdx_init()

Posted by Edgecombe, Rick P 3 months, 4 weeks ago

On Fri, 2025-10-10 at 15:04 -0700, Sean Christopherson wrote:
> TODO: Split this up, write changelogs.
> 
> Cc: Chao Gao <chao.gao@intel.com>
> Cc: Dan Williams <dan.j.williams@intel.com>
> Cc: Xin Li (Intel) <xin@zytor.com>
> Cc: Kai Huang <kai.huang@intel.com>
> Cc: Adrian Hunter <adrian.hunter@intel.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
> 
[snip]
> -
>  static int __init __tdx_bringup(void)
>  {
>  	const struct tdx_sys_info_td_conf *td_conf;
> @@ -3462,34 +3407,18 @@ static int __init __tdx_bringup(void)
>  		}
>  	}
>  
> -	/*
> -	 * Enabling TDX requires enabling hardware virtualization first,
> -	 * as making SEAMCALLs requires CPU being in post-VMXON state.
> -	 */
> -	r = kvm_enable_virtualization();
> -	if (r)
> -		return r;
> -
> -	cpus_read_lock();
> -	r = __do_tdx_bringup();
> -	cpus_read_unlock();
> -
> -	if (r)
> -		goto tdx_bringup_err;
> -
> -	r = -EINVAL;
>  	/* Get TDX global information for later use */
>  	tdx_sysinfo = tdx_get_sysinfo();
> -	if (WARN_ON_ONCE(!tdx_sysinfo))
> -		goto get_sysinfo_err;
> +	if (!tdx_sysinfo)
> +		return -EINVAL;
>  
>  	/* Check TDX module and KVM capabilities */
>  	if (!tdx_get_supported_attrs(&tdx_sysinfo->td_conf) ||
>  	    !tdx_get_supported_xfam(&tdx_sysinfo->td_conf))
> -		goto get_sysinfo_err;
> +		return -EINVAL;
>  
>  	if (!(tdx_sysinfo->features.tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM))
> -		goto get_sysinfo_err;
> +		return -EINVAL;
>  
>  	/*
>  	 * TDX has its own limit of maximum vCPUs it can support for all
> @@ -3524,34 +3453,31 @@ static int __init __tdx_bringup(void)
>  	if (td_conf->max_vcpus_per_td < num_present_cpus()) {
>  		pr_err("Disable TDX: MAX_VCPU_PER_TD (%u) smaller than number of logical CPUs (%u).\n",
>  				td_conf->max_vcpus_per_td, num_present_cpus());
> -		goto get_sysinfo_err;
> +		return -EINVAL;
>  	}
>  
>  	if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids()))
> -		goto get_sysinfo_err;
> +		return -EINVAL;
>  
>  	/*
> -	 * Leave hardware virtualization enabled after TDX is enabled
> -	 * successfully.  TDX CPU hotplug depends on this.
> +	 * TDX-specific cpuhp callback to disallow offlining the last CPU in a
> +	 * packing while KVM is running one or more TDs.  Reclaiming HKIDs
> +	 * requires doing PAGE.WBINVD on every package, i.e. offlining all CPUs
> +	 * of a package would prevent reclaiming the HKID.
>  	 */
> +	r = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "kvm/cpu/tdx:online",
> +			      tdx_online_cpu, tdx_offline_cpu);

Could pass NULL instead of tdx_online_cpu() and delete this version of
tdx_online_cpu(). Also could remove the error handling too.

Also, can we name the two tdx_offline_cpu()'s differently? This one is all about
keyid's being in use. tdx_hkid_offline_cpu()?

> +	if (r < 0)
> +		goto err_cpuhup;
> +
> +	tdx_cpuhp_state = r;
>  	return 0;
>  
> -get_sysinfo_err:
> -	__tdx_cleanup();
> -tdx_bringup_err:
> -	kvm_disable_virtualization();
> +err_cpuhup:
> +	misc_cg_set_capacity(MISC_CG_RES_TDX, 0);
>  	return r;
>  }
>  
> -void tdx_cleanup(void)
> -{
> -	if (enable_tdx) {
> -		misc_cg_set_capacity(MISC_CG_RES_TDX, 0);
> -		__tdx_cleanup();
> -		kvm_disable_virtualization();
> -	}
> -}
> -
>  int __init tdx_bringup(void)
>  {
>  	int r, i;
> @@ -3563,6 +3489,16 @@ int __init tdx_bringup(void)
>  	if (!enable_tdx)
>  		return 0;
>  
> +	if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) {
> +		pr_err("TDX not supported by the host platform\n");
> +		goto success_disable_tdx;
> +	}
> +
> +	if (!cpu_feature_enabled(X86_FEATURE_OSXSAVE)) {
> +		pr_err("OSXSAVE is required for TDX\n");
> +		return -EINVAL;

Why change this condition from goto success_disable_tdx?

> +	}
> +
>  	if (!enable_ept) {
>  		pr_err("EPT is required for TDX\n");
>  		goto success_disable_tdx;
> @@ -3578,61 +3514,9 @@ int __init tdx_bringup(void)
>  		goto success_disable_tdx;
>  	}
>  
> -	if (!cpu_feature_enabled(X86_FEATURE_OSXSAVE)) {
> -		pr_err("tdx: OSXSAVE is required for TDX\n");
> -		goto success_disable_tdx;
> -	}
> -
> -	if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) {
> -		pr_err("tdx: MOVDIR64B is required for TDX\n");
> -		goto success_disable_tdx;
> -	}
> -
> -	if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) {
> -		pr_err("Self-snoop is required for TDX\n");
> -		goto success_disable_tdx;
> -	}
> -
> -	if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) {
> -		pr_err("tdx: no TDX private KeyIDs available\n");
> -		goto success_disable_tdx;
> -	}
> -
> -	if (!enable_virt_at_load) {
> -		pr_err("tdx: tdx requires kvm.enable_virt_at_load=1\n");
> -		goto success_disable_tdx;
> -	}
> -
> -	/*
> -	 * Ideally KVM should probe whether TDX module has been loaded
> -	 * first and then try to bring it up.  But TDX needs to use SEAMCALL
> -	 * to probe whether the module is loaded (there is no CPUID or MSR
> -	 * for that), and making SEAMCALL requires enabling virtualization
> -	 * first, just like the rest steps of bringing up TDX module.
> -	 *
> -	 * So, for simplicity do everything in __tdx_bringup(); the first
> -	 * SEAMCALL will return -ENODEV when the module is not loaded.  The
> -	 * only complication is having to make sure that initialization
> -	 * SEAMCALLs don't return TDX_SEAMCALL_VMFAILINVALID in other
> -	 * cases.
> -	 */
>  	r = __tdx_bringup();
> -	if (r) {
> -		/*
> -		 * Disable TDX only but don't fail to load module if the TDX
> -		 * module could not be loaded.  No need to print message saying
> -		 * "module is not loaded" because it was printed when the first
> -		 * SEAMCALL failed.  Don't bother unwinding the S-EPT hooks or
> -		 * vm_size, as kvm_x86_ops have already been finalized (and are
> -		 * intentionally not exported).  The S-EPT code is unreachable,
> -		 * and allocating a few more bytes per VM in a should-be-rare
> -		 * failure scenario is a non-issue.
> -		 */
> -		if (r == -ENODEV)
> -			goto success_disable_tdx;
> -
> +	if (r)
>  		enable_tdx = 0;
> -	}
>  

I think the previous discussion was that there should be a probe and enable
step. We could not fail KVM init if TDX is not supported (probe), but not try to
cleanly handle any other unexpected error (fail enabled).

The existing code mostly has the "probe" type checks in tdx_bringup(), and the
"enable" type checks in __tdx_bringup(). But now the gutted __tdx_bringup() is
very probe-y. Ideally we could separate these into named "install" and "probe"
functions. I don't know if this would be good to do this as part of this series
or later though.

>  	return r;
>  
> 
> 

[snip]

>  
>  /*
>   * Add a memory region as a TDX memory block.  The caller must make sure
>   * all memory regions are added in address ascending order and don't
>   * overlap.
>   */
> -static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn,
> -			    unsigned long end_pfn, int nid)
> +static __init int add_tdx_memblock(struct list_head *tmb_list,
> +				   unsigned long start_pfn,
> +				   unsigned long end_pfn, int nid)

One easy thing to break this up would be to do this __init adjustments in a
follow on patch.

Re: [RFC PATCH 3/4] KVM: x86/tdx: Do VMXON and TDX-Module initialization during tdx_init()

Posted by Sean Christopherson 3 months, 3 weeks ago

On Mon, Oct 13, 2025, Rick P Edgecombe wrote:
> On Fri, 2025-10-10 at 15:04 -0700, Sean Christopherson wrote:
> > @@ -3524,34 +3453,31 @@ static int __init __tdx_bringup(void)
> >  	if (td_conf->max_vcpus_per_td < num_present_cpus()) {
> >  		pr_err("Disable TDX: MAX_VCPU_PER_TD (%u) smaller than number of logical CPUs (%u).\n",
> >  				td_conf->max_vcpus_per_td, num_present_cpus());
> > -		goto get_sysinfo_err;
> > +		return -EINVAL;
> >  	}
> >  
> >  	if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids()))
> > -		goto get_sysinfo_err;
> > +		return -EINVAL;
> >  
> >  	/*
> > -	 * Leave hardware virtualization enabled after TDX is enabled
> > -	 * successfully.  TDX CPU hotplug depends on this.
> > +	 * TDX-specific cpuhp callback to disallow offlining the last CPU in a
> > +	 * packing while KVM is running one or more TDs.  Reclaiming HKIDs
> > +	 * requires doing PAGE.WBINVD on every package, i.e. offlining all CPUs
> > +	 * of a package would prevent reclaiming the HKID.
> >  	 */
> > +	r = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "kvm/cpu/tdx:online",
> > +			      tdx_online_cpu, tdx_offline_cpu);
> 
> Could pass NULL instead of tdx_online_cpu() and delete this version of
> tdx_online_cpu().

Oh, nice, I didn't realize (or forgot) the startup call is optional.
 
> Also could remove the error handling too.

No.  Partly on prinicple, but also because CPUHP_AP_ONLINE_DYN can fail if the
kernel runs out of dynamic entries (currently limited to 40).  The kernel WARNs
if it runs out of entries, but KVM should still do the right thing.

> Also, can we name the two tdx_offline_cpu()'s differently? This one is all about
> keyid's being in use. tdx_hkid_offline_cpu()?

Ya.  And change the description to "kvm/cpu/tdx:hkid_packages"?  Or something
like that.

> > +	if (r < 0)
> > +		goto err_cpuhup;
> > +
> > +	tdx_cpuhp_state = r;
> >  	return 0;
> >  
> > -get_sysinfo_err:
> > -	__tdx_cleanup();
> > -tdx_bringup_err:
> > -	kvm_disable_virtualization();
> > +err_cpuhup:
> > +	misc_cg_set_capacity(MISC_CG_RES_TDX, 0);
> >  	return r;
> >  }
> >  
> > -void tdx_cleanup(void)
> > -{
> > -	if (enable_tdx) {
> > -		misc_cg_set_capacity(MISC_CG_RES_TDX, 0);
> > -		__tdx_cleanup();
> > -		kvm_disable_virtualization();
> > -	}
> > -}
> > -
> >  int __init tdx_bringup(void)
> >  {
> >  	int r, i;
> > @@ -3563,6 +3489,16 @@ int __init tdx_bringup(void)
> >  	if (!enable_tdx)
> >  		return 0;
> >  
> > +	if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) {
> > +		pr_err("TDX not supported by the host platform\n");
> > +		goto success_disable_tdx;
> > +	}
> > +
> > +	if (!cpu_feature_enabled(X86_FEATURE_OSXSAVE)) {
> > +		pr_err("OSXSAVE is required for TDX\n");
> > +		return -EINVAL;
> 
> Why change this condition from goto success_disable_tdx?

Ah, a copy+paste goof.  I originally moved the code to tdx_enable(), then realized
it as checking OSXSAVE, not XSAVE, and so needed to be done later in boot.  When
I copied it back to KVM, I forgot to restore the goto.

> >  	r = __tdx_bringup();
> > -	if (r) {
> > -		/*
> > -		 * Disable TDX only but don't fail to load module if the TDX
> > -		 * module could not be loaded.  No need to print message saying
> > -		 * "module is not loaded" because it was printed when the first
> > -		 * SEAMCALL failed.  Don't bother unwinding the S-EPT hooks or
> > -		 * vm_size, as kvm_x86_ops have already been finalized (and are
> > -		 * intentionally not exported).  The S-EPT code is unreachable,
> > -		 * and allocating a few more bytes per VM in a should-be-rare
> > -		 * failure scenario is a non-issue.
> > -		 */
> > -		if (r == -ENODEV)
> > -			goto success_disable_tdx;
> > -
> > +	if (r)
> >  		enable_tdx = 0;
> > -	}
> >  
> 
> I think the previous discussion was that there should be a probe and enable
> step. We could not fail KVM init if TDX is not supported (probe), but not try to
> cleanly handle any other unexpected error (fail enabled).
> 
> The existing code mostly has the "probe" type checks in tdx_bringup(), and the
> "enable" type checks in __tdx_bringup(). But now the gutted __tdx_bringup() is
> very probe-y. Ideally we could separate these into named "install" and "probe"
> functions. I don't know if this would be good to do this as part of this series
> or later though.
> 
> >  	return r;
> >  
> > 
> > 
> 
> [snip]
> 
> >  
> >  /*
> >   * Add a memory region as a TDX memory block.  The caller must make sure
> >   * all memory regions are added in address ascending order and don't
> >   * overlap.
> >   */
> > -static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn,
> > -			    unsigned long end_pfn, int nid)
> > +static __init int add_tdx_memblock(struct list_head *tmb_list,
> > +				   unsigned long start_pfn,
> > +				   unsigned long end_pfn, int nid)
> 
> One easy thing to break this up would be to do this __init adjustments in a
> follow on patch.

Ya, for sure.

Re: [RFC PATCH 3/4] KVM: x86/tdx: Do VMXON and TDX-Module initialization during tdx_init()

Posted by Chao Gao 3 months, 3 weeks ago

On Mon, Oct 13, 2025 at 01:59:21PM -0700, Sean Christopherson wrote:
>On Mon, Oct 13, 2025, Rick P Edgecombe wrote:
>> On Fri, 2025-10-10 at 15:04 -0700, Sean Christopherson wrote:
>> > @@ -3524,34 +3453,31 @@ static int __init __tdx_bringup(void)
>> >  	if (td_conf->max_vcpus_per_td < num_present_cpus()) {
>> >  		pr_err("Disable TDX: MAX_VCPU_PER_TD (%u) smaller than number of logical CPUs (%u).\n",
>> >  				td_conf->max_vcpus_per_td, num_present_cpus());
>> > -		goto get_sysinfo_err;
>> > +		return -EINVAL;
>> >  	}
>> >  
>> >  	if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids()))
>> > -		goto get_sysinfo_err;
>> > +		return -EINVAL;
>> >  
>> >  	/*
>> > -	 * Leave hardware virtualization enabled after TDX is enabled
>> > -	 * successfully.  TDX CPU hotplug depends on this.
>> > +	 * TDX-specific cpuhp callback to disallow offlining the last CPU in a
>> > +	 * packing while KVM is running one or more TDs.  Reclaiming HKIDs
>> > +	 * requires doing PAGE.WBINVD on every package, i.e. offlining all CPUs
>> > +	 * of a package would prevent reclaiming the HKID.
>> >  	 */
>> > +	r = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "kvm/cpu/tdx:online",
>> > +			      tdx_online_cpu, tdx_offline_cpu);
>> 
>> Could pass NULL instead of tdx_online_cpu() and delete this version of
>> tdx_online_cpu().
>
>Oh, nice, I didn't realize (or forgot) the startup call is optional.
> 
>> Also could remove the error handling too.
>
>No.  Partly on prinicple, but also because CPUHP_AP_ONLINE_DYN can fail if the
>kernel runs out of dynamic entries (currently limited to 40).  The kernel WARNs
>if it runs out of entries, but KVM should still do the right thing.
>
>> Also, can we name the two tdx_offline_cpu()'s differently? This one is all about
>> keyid's being in use. tdx_hkid_offline_cpu()?
>
>Ya.  And change the description to "kvm/cpu/tdx:hkid_packages"?  Or something
>like that.
>

Is it a good idea to consolidate the two tdx_offline_cpu() functions, i.e.,
integrate KVM's version into x86 core?

From 97165f9933f48d588f5390e2d543d9880c03532d Mon Sep 17 00:00:00 2001
From: Chao Gao <chao.gao@intel.com>
Date: Tue, 14 Oct 2025 01:00:06 -0700
Subject: [PATCH] x86/virt/tdx: Consolidate TDX CPU hotplug handling

The core kernel registers a CPU hotplug callback to do VMX and TDX init
and deinit while KVM registers a separate CPU offline callback to block
offlining the last online CPU in a socket.

Splitting TDX-related CPU hotplug handling across two components is odd
and adds unnecessary complexity.

Consolidate TDX-related CPU hotplug handling by integrating KVM's
tdx_offline_cpu() to the one in the core kernel.

Also move nr_configured_hkid to the core kernel because tdx_offline_cpu()
references it. Since HKID allocation and free are handled in the core
kernel, it's more natural to track used HKIDs there.

Signed-off-by: Chao Gao <chao.gao@intel.com>
---
 arch/x86/kvm/vmx/tdx.c      | 67 +------------------------------------
 arch/x86/virt/vmx/tdx/tdx.c | 49 +++++++++++++++++++++++++--
 2 files changed, 47 insertions(+), 69 deletions(-)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index d89382971076..beac8ab4cbc1 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -46,8 +46,6 @@ module_param_named(tdx, enable_tdx, bool, 0444);
 #define TDX_SHARED_BIT_PWL_5 gpa_to_gfn(BIT_ULL(51))
 #define TDX_SHARED_BIT_PWL_4 gpa_to_gfn(BIT_ULL(47))
 
-static enum cpuhp_state tdx_cpuhp_state __ro_after_init;
-
 static const struct tdx_sys_info *tdx_sysinfo;
 
 void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err)
@@ -206,8 +204,6 @@ static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf,
  */
 static DEFINE_MUTEX(tdx_lock);
 
-static atomic_t nr_configured_hkid;
-
 static bool tdx_operand_busy(u64 err)
 {
	return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY;
@@ -255,7 +251,6 @@ static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx)
 {
	tdx_guest_keyid_free(kvm_tdx->hkid);
	kvm_tdx->hkid = -1;
-	atomic_dec(&nr_configured_hkid);
	misc_cg_uncharge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
	put_misc_cg(kvm_tdx->misc_cg);
	kvm_tdx->misc_cg = NULL;
@@ -2487,8 +2482,6 @@ static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params,
 
	ret = -ENOMEM;
 
-	atomic_inc(&nr_configured_hkid);
-
	tdr_page = alloc_page(GFP_KERNEL);
	if (!tdr_page)
		goto free_hkid;
@@ -3343,51 +3336,10 @@ int tdx_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private)
	return PG_LEVEL_4K;
 }
 
-static int tdx_online_cpu(unsigned int cpu)
-{
-	return 0;
-}
-
-static int tdx_offline_cpu(unsigned int cpu)
-{
-	int i;
-
-	/* No TD is running.  Allow any cpu to be offline. */
-	if (!atomic_read(&nr_configured_hkid))
-		return 0;
-
-	/*
-	 * In order to reclaim TDX HKID, (i.e. when deleting guest TD), need to
-	 * call TDH.PHYMEM.PAGE.WBINVD on all packages to program all memory
-	 * controller with pconfig.  If we have active TDX HKID, refuse to
-	 * offline the last online cpu.
-	 */
-	for_each_online_cpu(i) {
-		/*
-		 * Found another online cpu on the same package.
-		 * Allow to offline.
-		 */
-		if (i != cpu && topology_physical_package_id(i) ==
-				topology_physical_package_id(cpu))
-			return 0;
-	}
-
-	/*
-	 * This is the last cpu of this package.  Don't offline it.
-	 *
-	 * Because it's hard for human operator to understand the
-	 * reason, warn it.
-	 */
-#define MSG_ALLPKG_ONLINE \
-	"TDX requires all packages to have an online CPU. Delete all TDs in order to offline all CPUs of a package.\n"
-	pr_warn_ratelimited(MSG_ALLPKG_ONLINE);
-	return -EBUSY;
-}
-
 static int __init __tdx_bringup(void)
 {
	const struct tdx_sys_info_td_conf *td_conf;
-	int r, i;
+	int i;
 
	for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) {
		/*
@@ -3459,23 +3411,7 @@ static int __init __tdx_bringup(void)
	if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids()))
		return -EINVAL;
 
-	/*
-	 * TDX-specific cpuhp callback to disallow offlining the last CPU in a
-	 * packing while KVM is running one or more TDs.  Reclaiming HKIDs
-	 * requires doing PAGE.WBINVD on every package, i.e. offlining all CPUs
-	 * of a package would prevent reclaiming the HKID.
-	 */
-	r = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "kvm/cpu/tdx:online",
-			      tdx_online_cpu, tdx_offline_cpu);
-	if (r < 0)
-		goto err_cpuhup;
-
-	tdx_cpuhp_state = r;
	return 0;
-
-err_cpuhup:
-	misc_cg_set_capacity(MISC_CG_RES_TDX, 0);
-	return r;
 }
 
 int __init tdx_bringup(void)
@@ -3531,7 +3467,6 @@ void tdx_cleanup(void)
		return;
 
	misc_cg_set_capacity(MISC_CG_RES_TDX, 0);
-	cpuhp_remove_state(tdx_cpuhp_state);
 }
 
 void __init tdx_hardware_setup(void)
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index bf1c1cdd9690..201ecb4ad20d 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -58,6 +58,8 @@ static LIST_HEAD(tdx_memlist);
 static struct tdx_sys_info tdx_sysinfo __ro_after_init;
 static bool tdx_module_initialized __ro_after_init;
 
+static atomic_t nr_configured_hkid;
+
 typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args);
 
 static inline void seamcall_err(u64 fn, u64 err, struct tdx_module_args *args)
@@ -190,6 +192,40 @@ static int tdx_online_cpu(unsigned int cpu)
 
 static int tdx_offline_cpu(unsigned int cpu)
 {
+	int i;
+
+	/* No TD is running.  Allow any cpu to be offline. */
+	if (!atomic_read(&nr_configured_hkid))
+		goto done;
+
+	/*
+	 * In order to reclaim TDX HKID, (i.e. when deleting guest TD), need to
+	 * call TDH.PHYMEM.PAGE.WBINVD on all packages to program all memory
+	 * controller with pconfig.  If we have active TDX HKID, refuse to
+	 * offline the last online cpu.
+	 */
+	for_each_online_cpu(i) {
+		/*
+		 * Found another online cpu on the same package.
+		 * Allow to offline.
+		 */
+		if (i != cpu && topology_physical_package_id(i) ==
+				topology_physical_package_id(cpu))
+			goto done;
+	}
+
+	/*
+	 * This is the last cpu of this package.  Don't offline it.
+	 *
+	 * Because it's hard for human operator to understand the
+	 * reason, warn it.
+	 */
+#define MSG_ALLPKG_ONLINE \
+	"TDX requires all packages to have an online CPU. Delete all TDs in order to offline all CPUs of a package.\n"
+	pr_warn_ratelimited(MSG_ALLPKG_ONLINE);
+	return -EBUSY;
+
+done:
	x86_virt_put_cpu(X86_FEATURE_VMX);
	return 0;
 }
@@ -1505,15 +1541,22 @@ EXPORT_SYMBOL_GPL(tdx_get_nr_guest_keyids);
 
 int tdx_guest_keyid_alloc(void)
 {
-	return ida_alloc_range(&tdx_guest_keyid_pool, tdx_guest_keyid_start,
-			       tdx_guest_keyid_start + tdx_nr_guest_keyids - 1,
-			       GFP_KERNEL);
+	int ret;
+
+	ret = ida_alloc_range(&tdx_guest_keyid_pool, tdx_guest_keyid_start,
+			      tdx_guest_keyid_start + tdx_nr_guest_keyids - 1,
+			      GFP_KERNEL);
+	if (ret >= 0)
+		atomic_inc(&nr_configured_hkid);
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(tdx_guest_keyid_alloc);
 
 void tdx_guest_keyid_free(unsigned int keyid)
 {
	ida_free(&tdx_guest_keyid_pool, keyid);
+	atomic_dec(&nr_configured_hkid);
 }
 EXPORT_SYMBOL_GPL(tdx_guest_keyid_free);
 
-- 
2.47.3

Re: [RFC PATCH 3/4] KVM: x86/tdx: Do VMXON and TDX-Module initialization during tdx_init()

Posted by Chao Gao 3 months, 4 weeks ago

>-static int __tdx_enable(void)
>+static __init int tdx_enable(void)
> {
>+	enum cpuhp_state state;
> 	int ret;
> 
>+	if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) {
>+		pr_err("XSAVE is required for TDX\n");
>+		return -EINVAL;
>+	}
>+
>+	if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) {
>+		pr_err("MOVDIR64B is required for TDX\n");
>+		return -EINVAL;
>+	}
>+
>+	if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) {
>+		pr_err("Self-snoop is required for TDX\n");
>+		return -ENODEV;
>+	}
>+
>+	state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "virt/tdx:online",
>+				  tdx_online_cpu, tdx_offline_cpu);
>+	if (state < 0)
>+		return state;
>+
> 	ret = init_tdx_module();

...

>@@ -1445,11 +1462,6 @@ void __init tdx_init(void)
> 		return;
> 	}
> 
>-#if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND)
>-	pr_info("Disable ACPI S3. Turn off TDX in the BIOS to use ACPI S3.\n");
>-	acpi_suspend_lowlevel = NULL;
>-#endif
>-
> 	/*
> 	 * Just use the first TDX KeyID as the 'global KeyID' and
> 	 * leave the rest for TDX guests.
>@@ -1458,22 +1470,30 @@ void __init tdx_init(void)
> 	tdx_guest_keyid_start = tdx_keyid_start + 1;
> 	tdx_nr_guest_keyids = nr_tdx_keyids - 1;
> 
>+	err = tdx_enable();
>+	if (err)
>+		goto err_enable;

IIRC, existing TDX modules require all CPUs to have completed per-CPU
initialization before TDMR/PAMT initialization.

But at this point, APs are not online, so tdx_enable() will fail here.

Re: [RFC PATCH 3/4] KVM: x86/tdx: Do VMXON and TDX-Module initialization during tdx_init()

Posted by Sean Christopherson 3 months, 4 weeks ago

On Mon, Oct 13, 2025, Chao Gao wrote:
> >-static int __tdx_enable(void)
> >+static __init int tdx_enable(void)
> > {
> >+	enum cpuhp_state state;
> > 	int ret;
> > 
> >+	if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) {
> >+		pr_err("XSAVE is required for TDX\n");
> >+		return -EINVAL;
> >+	}
> >+
> >+	if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) {
> >+		pr_err("MOVDIR64B is required for TDX\n");
> >+		return -EINVAL;
> >+	}
> >+
> >+	if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) {
> >+		pr_err("Self-snoop is required for TDX\n");
> >+		return -ENODEV;
> >+	}
> >+
> >+	state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "virt/tdx:online",
> >+				  tdx_online_cpu, tdx_offline_cpu);
> >+	if (state < 0)
> >+		return state;
> >+
> > 	ret = init_tdx_module();
> 
> ...
> 
> >@@ -1445,11 +1462,6 @@ void __init tdx_init(void)
> > 		return;
> > 	}
> > 
> >-#if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND)
> >-	pr_info("Disable ACPI S3. Turn off TDX in the BIOS to use ACPI S3.\n");
> >-	acpi_suspend_lowlevel = NULL;
> >-#endif
> >-
> > 	/*
> > 	 * Just use the first TDX KeyID as the 'global KeyID' and
> > 	 * leave the rest for TDX guests.
> >@@ -1458,22 +1470,30 @@ void __init tdx_init(void)
> > 	tdx_guest_keyid_start = tdx_keyid_start + 1;
> > 	tdx_nr_guest_keyids = nr_tdx_keyids - 1;
> > 
> >+	err = tdx_enable();
> >+	if (err)
> >+		goto err_enable;
> 
> IIRC, existing TDX modules require all CPUs to have completed per-CPU
> initialization before TDMR/PAMT initialization.
> 
> But at this point, APs are not online, so tdx_enable() will fail here.

Ah.  Maybe invoke tdx_enable() through a subsys_initcall() then?

[RFC PATCH 1/4] KVM: x86: Move kvm_rebooting to x86
[RFC PATCH 2/4] KVM: x86: Extract VMXON and EFER.SVME enablement to kernel
[RFC PATCH 3/4] KVM: x86/tdx: Do VMXON and TDX-Module initialization during tdx_init()
[RFC PATCH 4/4] KVM: Bury kvm_{en,dis}able_virtualization() in kvm_main.c once more