[v2] KVM: x86/tdx: Have TDX handle VMXON during bringup

[PATCH v2 2/7] KVM: x86: Extract VMXON and EFER.SVME enablement to kernel

Posted by Sean Christopherson 2 months ago

Move the innermost VMXON and EFER.SVME management logic out of KVM and
into to core x86 so that TDX can force VMXON without having to rely on
KVM being loaded, e.g. to do SEAMCALLs during initialization.

Implement a per-CPU refcounting scheme so that "users", e.g. KVM and the
future TDX code, can co-exist without pulling the rug out from under each
other.

To avoid having to choose between SVM and VMX, simply refuse to enable
either if both are somehow supported.  No known CPU supports both SVM and
VMX, and it's comically unlikely such a CPU will ever exist.

For lack of a better name, call the new file "hw.c", to yield "virt
hardware" when combined with its parent directory.

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/events/intel/pt.c      |   1 -
 arch/x86/include/asm/kvm_host.h |   3 +-
 arch/x86/include/asm/reboot.h   |  11 --
 arch/x86/include/asm/virt.h     |  26 +++
 arch/x86/include/asm/vmx.h      |  11 ++
 arch/x86/kernel/cpu/common.c    |   2 +
 arch/x86/kernel/crash.c         |   3 +-
 arch/x86/kernel/reboot.c        |  63 +-----
 arch/x86/kernel/smp.c           |   5 +-
 arch/x86/kvm/svm/svm.c          |  34 +---
 arch/x86/kvm/svm/vmenter.S      |  10 +-
 arch/x86/kvm/vmx/tdx.c          |   3 +-
 arch/x86/kvm/vmx/vmcs.h         |  11 --
 arch/x86/kvm/vmx/vmenter.S      |   2 +-
 arch/x86/kvm/vmx/vmx.c          | 127 +-----------
 arch/x86/kvm/x86.c              |  17 +-
 arch/x86/kvm/x86.h              |   1 -
 arch/x86/virt/Makefile          |   2 +
 arch/x86/virt/hw.c              | 340 ++++++++++++++++++++++++++++++++
 19 files changed, 422 insertions(+), 250 deletions(-)
 create mode 100644 arch/x86/include/asm/virt.h
 create mode 100644 arch/x86/virt/hw.c

diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index e8cf29d2b10c..9092f0f9de72 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -1590,7 +1590,6 @@ void intel_pt_handle_vmx(int on)
 
 	local_irq_restore(flags);
 }
-EXPORT_SYMBOL_GPL(intel_pt_handle_vmx);
 
 /*
  * PMU callbacks
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5a3bfa293e8b..47b535c1c3bd 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -40,7 +40,8 @@
 #include <asm/irq_remapping.h>
 #include <asm/kvm_page_track.h>
 #include <asm/kvm_vcpu_regs.h>
-#include <asm/reboot.h>
+#include <asm/virt.h>
+
 #include <hyperv/hvhdk.h>
 
 #define __KVM_HAVE_ARCH_VCPU_DEBUGFS
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
index ecd58ea9a837..a671a1145906 100644
--- a/arch/x86/include/asm/reboot.h
+++ b/arch/x86/include/asm/reboot.h
@@ -25,17 +25,6 @@ void __noreturn machine_real_restart(unsigned int type);
 #define MRR_BIOS	0
 #define MRR_APM		1
 
-typedef void (cpu_emergency_virt_cb)(void);
-#if IS_ENABLED(CONFIG_KVM_X86)
-void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback);
-void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback);
-void cpu_emergency_disable_virtualization(void);
-#else
-static inline void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback) {}
-static inline void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback) {}
-static inline void cpu_emergency_disable_virtualization(void) {}
-#endif /* CONFIG_KVM_X86 */
-
 typedef void (*nmi_shootdown_cb)(int, struct pt_regs*);
 void nmi_shootdown_cpus(nmi_shootdown_cb callback);
 void run_crash_ipi_callback(struct pt_regs *regs);
diff --git a/arch/x86/include/asm/virt.h b/arch/x86/include/asm/virt.h
new file mode 100644
index 000000000000..77a366afd9f7
--- /dev/null
+++ b/arch/x86/include/asm/virt.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _ASM_X86_VIRT_H
+#define _ASM_X86_VIRT_H
+
+#include <asm/reboot.h>
+
+typedef void (cpu_emergency_virt_cb)(void);
+
+#if IS_ENABLED(CONFIG_KVM_X86)
+extern bool virt_rebooting;
+
+void __init x86_virt_init(void);
+
+int x86_virt_get_cpu(int feat);
+void x86_virt_put_cpu(int feat);
+
+int x86_virt_emergency_disable_virtualization_cpu(void);
+
+void x86_virt_register_emergency_callback(cpu_emergency_virt_cb *callback);
+void x86_virt_unregister_emergency_callback(cpu_emergency_virt_cb *callback);
+#else
+static __always_inline void x86_virt_init(void) {}
+static inline int x86_virt_emergency_disable_virtualization_cpu(void) { return -ENOENT; }
+#endif
+
+#endif /* _ASM_X86_VIRT_H */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index c85c50019523..d2c7eb1c5f12 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -20,6 +20,17 @@
 #include <asm/trapnr.h>
 #include <asm/vmxfeatures.h>
 
+struct vmcs_hdr {
+	u32 revision_id:31;
+	u32 shadow_vmcs:1;
+};
+
+struct vmcs {
+	struct vmcs_hdr hdr;
+	u32 abort;
+	char data[];
+};
+
 #define VMCS_CONTROL_BIT(x)	BIT(VMX_FEATURE_##x & 0x1f)
 
 /*
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 02d97834a1d4..a55cb572d2b4 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -70,6 +70,7 @@
 #include <asm/traps.h>
 #include <asm/sev.h>
 #include <asm/tdx.h>
+#include <asm/virt.h>
 #include <asm/posted_intr.h>
 #include <asm/runtime-const.h>
 
@@ -2124,6 +2125,7 @@ static __init void identify_boot_cpu(void)
 	cpu_detect_tlb(&boot_cpu_data);
 	setup_cr_pinning();
 
+	x86_virt_init();
 	tsx_init();
 	tdx_init();
 	lkgs_init();
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 335fd2ee9766..cd796818d94d 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -42,6 +42,7 @@
 #include <asm/crash.h>
 #include <asm/cmdline.h>
 #include <asm/sev.h>
+#include <asm/virt.h>
 
 /* Used while preparing memory map entries for second kernel */
 struct crash_memmap_data {
@@ -111,7 +112,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
 
 	crash_smp_send_stop();
 
-	cpu_emergency_disable_virtualization();
+	x86_virt_emergency_disable_virtualization_cpu();
 
 	/*
 	 * Disable Intel PT to stop its logging
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 964f6b0a3d68..0f1d14ed955b 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -26,6 +26,7 @@
 #include <asm/cpu.h>
 #include <asm/nmi.h>
 #include <asm/smp.h>
+#include <asm/virt.h>
 
 #include <linux/ctype.h>
 #include <linux/mc146818rtc.h>
@@ -531,51 +532,6 @@ static inline void kb_wait(void)
 static inline void nmi_shootdown_cpus_on_restart(void);
 
 #if IS_ENABLED(CONFIG_KVM_X86)
-/* RCU-protected callback to disable virtualization prior to reboot. */
-static cpu_emergency_virt_cb __rcu *cpu_emergency_virt_callback;
-
-void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback)
-{
-	if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback)))
-		return;
-
-	rcu_assign_pointer(cpu_emergency_virt_callback, callback);
-}
-EXPORT_SYMBOL_GPL(cpu_emergency_register_virt_callback);
-
-void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback)
-{
-	if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback) != callback))
-		return;
-
-	rcu_assign_pointer(cpu_emergency_virt_callback, NULL);
-	synchronize_rcu();
-}
-EXPORT_SYMBOL_GPL(cpu_emergency_unregister_virt_callback);
-
-/*
- * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during
- * reboot.  VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if
- * GIF=0, i.e. if the crash occurred between CLGI and STGI.
- */
-void cpu_emergency_disable_virtualization(void)
-{
-	cpu_emergency_virt_cb *callback;
-
-	/*
-	 * IRQs must be disabled as KVM enables virtualization in hardware via
-	 * function call IPIs, i.e. IRQs need to be disabled to guarantee
-	 * virtualization stays disabled.
-	 */
-	lockdep_assert_irqs_disabled();
-
-	rcu_read_lock();
-	callback = rcu_dereference(cpu_emergency_virt_callback);
-	if (callback)
-		callback();
-	rcu_read_unlock();
-}
-
 static void emergency_reboot_disable_virtualization(void)
 {
 	local_irq_disable();
@@ -587,16 +543,11 @@ static void emergency_reboot_disable_virtualization(void)
 	 * We can't take any locks and we may be on an inconsistent state, so
 	 * use NMIs as IPIs to tell the other CPUs to disable VMX/SVM and halt.
 	 *
-	 * Do the NMI shootdown even if virtualization is off on _this_ CPU, as
-	 * other CPUs may have virtualization enabled.
+	 * Safely force _this_ CPU out of VMX/SVM operation, and if necessary,
+	 * blast NMIs to force other CPUs out of VMX/SVM as well.k
 	 */
-	if (rcu_access_pointer(cpu_emergency_virt_callback)) {
-		/* Safely force _this_ CPU out of VMX/SVM operation. */
-		cpu_emergency_disable_virtualization();
-
-		/* Disable VMX/SVM and halt on other CPUs. */
+	if (!x86_virt_emergency_disable_virtualization_cpu())
 		nmi_shootdown_cpus_on_restart();
-	}
 }
 #else
 static void emergency_reboot_disable_virtualization(void) { }
@@ -874,10 +825,10 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
 		shootdown_callback(cpu, regs);
 
 	/*
-	 * Prepare the CPU for reboot _after_ invoking the callback so that the
-	 * callback can safely use virtualization instructions, e.g. VMCLEAR.
+	 * Disable virtualization, as both VMX and SVM can block INIT and thus
+	 * prevent AP bringup, e.g. in a kdump kernel or in firmware.
 	 */
-	cpu_emergency_disable_virtualization();
+	x86_virt_emergency_disable_virtualization_cpu();
 
 	atomic_dec(&waiting_for_crash_ipi);
 
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index b014e6d229f9..cbf95fe2b207 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -35,6 +35,7 @@
 #include <asm/trace/irq_vectors.h>
 #include <asm/kexec.h>
 #include <asm/reboot.h>
+#include <asm/virt.h>
 
 /*
  *	Some notes on x86 processor bugs affecting SMP operation:
@@ -124,7 +125,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
 	if (raw_smp_processor_id() == atomic_read(&stopping_cpu))
 		return NMI_HANDLED;
 
-	cpu_emergency_disable_virtualization();
+	x86_virt_emergency_disable_virtualization_cpu();
 	stop_this_cpu(NULL);
 
 	return NMI_HANDLED;
@@ -136,7 +137,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
 DEFINE_IDTENTRY_SYSVEC(sysvec_reboot)
 {
 	apic_eoi();
-	cpu_emergency_disable_virtualization();
+	x86_virt_emergency_disable_virtualization_cpu();
 	stop_this_cpu(NULL);
 }
 
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 24d59ccfa40d..c09648cc3bd2 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -44,6 +44,7 @@
 #include <asm/traps.h>
 #include <asm/reboot.h>
 #include <asm/fpu/api.h>
+#include <asm/virt.h>
 
 #include <trace/events/ipi.h>
 
@@ -476,27 +477,9 @@ static __always_inline struct sev_es_save_area *sev_es_host_save_area(struct svm
 	return &sd->save_area->host_sev_es_save;
 }
 
-static inline void kvm_cpu_svm_disable(void)
-{
-	uint64_t efer;
-
-	wrmsrq(MSR_VM_HSAVE_PA, 0);
-	rdmsrq(MSR_EFER, efer);
-	if (efer & EFER_SVME) {
-		/*
-		 * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and
-		 * NMI aren't blocked.
-		 */
-		stgi();
-		wrmsrq(MSR_EFER, efer & ~EFER_SVME);
-	}
-}
-
 static void svm_emergency_disable_virtualization_cpu(void)
 {
-	kvm_rebooting = true;
-
-	kvm_cpu_svm_disable();
+	wrmsrq(MSR_VM_HSAVE_PA, 0);
 }
 
 static void svm_disable_virtualization_cpu(void)
@@ -505,7 +488,7 @@ static void svm_disable_virtualization_cpu(void)
 	if (tsc_scaling)
 		__svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
 
-	kvm_cpu_svm_disable();
+	x86_virt_put_cpu(X86_FEATURE_SVM);
 
 	amd_pmu_disable_virt();
 }
@@ -514,12 +497,12 @@ static int svm_enable_virtualization_cpu(void)
 {
 
 	struct svm_cpu_data *sd;
-	uint64_t efer;
 	int me = raw_smp_processor_id();
+	int r;
 
-	rdmsrq(MSR_EFER, efer);
-	if (efer & EFER_SVME)
-		return -EBUSY;
+	r = x86_virt_get_cpu(X86_FEATURE_SVM);
+	if (r)
+		return r;
 
 	sd = per_cpu_ptr(&svm_data, me);
 	sd->asid_generation = 1;
@@ -527,8 +510,6 @@ static int svm_enable_virtualization_cpu(void)
 	sd->next_asid = sd->max_asid + 1;
 	sd->min_asid = max_sev_asid + 1;
 
-	wrmsrq(MSR_EFER, efer | EFER_SVME);
-
 	wrmsrq(MSR_VM_HSAVE_PA, sd->save_area_pa);
 
 	if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
@@ -539,7 +520,6 @@ static int svm_enable_virtualization_cpu(void)
 		__svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
 	}
 
-
 	/*
 	 * Get OSVW bits.
 	 *
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index 3392bcadfb89..d47c5c93c991 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -298,16 +298,16 @@ SYM_FUNC_START(__svm_vcpu_run)
 	RESTORE_GUEST_SPEC_CTRL_BODY
 	RESTORE_HOST_SPEC_CTRL_BODY (%_ASM_SP)
 
-10:	cmpb $0, _ASM_RIP(kvm_rebooting)
+10:	cmpb $0, _ASM_RIP(virt_rebooting)
 	jne 2b
 	ud2
-30:	cmpb $0, _ASM_RIP(kvm_rebooting)
+30:	cmpb $0, _ASM_RIP(virt_rebooting)
 	jne 4b
 	ud2
-50:	cmpb $0, _ASM_RIP(kvm_rebooting)
+50:	cmpb $0, _ASM_RIP(virt_rebooting)
 	jne 6b
 	ud2
-70:	cmpb $0, _ASM_RIP(kvm_rebooting)
+70:	cmpb $0, _ASM_RIP(virt_rebooting)
 	jne 8b
 	ud2
 
@@ -394,7 +394,7 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
 	RESTORE_GUEST_SPEC_CTRL_BODY
 	RESTORE_HOST_SPEC_CTRL_BODY %sil
 
-3:	cmpb $0, kvm_rebooting(%rip)
+3:	cmpb $0, virt_rebooting(%rip)
 	jne 2b
 	ud2
 
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 2d7a4d52ccfb..21e67a47ad4e 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -6,6 +6,7 @@
 #include <linux/misc_cgroup.h>
 #include <linux/mmu_context.h>
 #include <asm/tdx.h>
+#include <asm/virt.h>
 #include "capabilities.h"
 #include "mmu.h"
 #include "x86_ops.h"
@@ -1994,7 +1995,7 @@ int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath)
 	 * TDX_SEAMCALL_VMFAILINVALID.
 	 */
 	if (unlikely((vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) {
-		KVM_BUG_ON(!kvm_rebooting, vcpu->kvm);
+		KVM_BUG_ON(!virt_rebooting, vcpu->kvm);
 		goto unhandled_exit;
 	}
 
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index b25625314658..2ab6ade006c7 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -13,17 +13,6 @@
 
 #define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
 
-struct vmcs_hdr {
-	u32 revision_id:31;
-	u32 shadow_vmcs:1;
-};
-
-struct vmcs {
-	struct vmcs_hdr hdr;
-	u32 abort;
-	char data[];
-};
-
 DECLARE_PER_CPU(struct vmcs *, current_vmcs);
 
 /*
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 4426d34811fc..8a481dae9cae 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -310,7 +310,7 @@ SYM_INNER_LABEL_ALIGN(vmx_vmexit, SYM_L_GLOBAL)
 	RET
 
 .Lfixup:
-	cmpb $0, _ASM_RIP(kvm_rebooting)
+	cmpb $0, _ASM_RIP(virt_rebooting)
 	jne .Lvmfail
 	ud2
 .Lvmfail:
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 4cbe8c84b636..dd13bae22a1e 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -48,6 +48,7 @@
 #include <asm/msr.h>
 #include <asm/mwait.h>
 #include <asm/spec-ctrl.h>
+#include <asm/virt.h>
 #include <asm/vmx.h>
 
 #include <trace/events/ipi.h>
@@ -577,7 +578,6 @@ noinline void invept_error(unsigned long ext, u64 eptp)
 	vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx\n", ext, eptp);
 }
 
-static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 DEFINE_PER_CPU(struct vmcs *, current_vmcs);
 /*
  * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
@@ -784,53 +784,17 @@ static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
 	return ret;
 }
 
-/*
- * Disable VMX and clear CR4.VMXE (even if VMXOFF faults)
- *
- * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to
- * atomically track post-VMXON state, e.g. this may be called in NMI context.
- * Eat all faults as all other faults on VMXOFF faults are mode related, i.e.
- * faults are guaranteed to be due to the !post-VMXON check unless the CPU is
- * magically in RM, VM86, compat mode, or at CPL>0.
- */
-static int kvm_cpu_vmxoff(void)
-{
-	asm goto("1: vmxoff\n\t"
-			  _ASM_EXTABLE(1b, %l[fault])
-			  ::: "cc", "memory" : fault);
-
-	cr4_clear_bits(X86_CR4_VMXE);
-	return 0;
-
-fault:
-	cr4_clear_bits(X86_CR4_VMXE);
-	return -EIO;
-}
-
 void vmx_emergency_disable_virtualization_cpu(void)
 {
 	int cpu = raw_smp_processor_id();
 	struct loaded_vmcs *v;
 
-	kvm_rebooting = true;
-
-	/*
-	 * Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be
-	 * set in task context.  If this races with VMX is disabled by an NMI,
-	 * VMCLEAR and VMXOFF may #UD, but KVM will eat those faults due to
-	 * kvm_rebooting set.
-	 */
-	if (!(__read_cr4() & X86_CR4_VMXE))
-		return;
-
 	list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
 			    loaded_vmcss_on_cpu_link) {
 		vmcs_clear(v->vmcs);
 		if (v->shadow_vmcs)
 			vmcs_clear(v->shadow_vmcs);
 	}
-
-	kvm_cpu_vmxoff();
 }
 
 static void __loaded_vmcs_clear(void *arg)
@@ -2928,34 +2892,9 @@ int vmx_check_processor_compat(void)
 	return 0;
 }
 
-static int kvm_cpu_vmxon(u64 vmxon_pointer)
-{
-	u64 msr;
-
-	cr4_set_bits(X86_CR4_VMXE);
-
-	asm goto("1: vmxon %[vmxon_pointer]\n\t"
-			  _ASM_EXTABLE(1b, %l[fault])
-			  : : [vmxon_pointer] "m"(vmxon_pointer)
-			  : : fault);
-	return 0;
-
-fault:
-	WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
-		  rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
-	cr4_clear_bits(X86_CR4_VMXE);
-
-	return -EFAULT;
-}
-
 int vmx_enable_virtualization_cpu(void)
 {
 	int cpu = raw_smp_processor_id();
-	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
-	int r;
-
-	if (cr4_read_shadow() & X86_CR4_VMXE)
-		return -EBUSY;
 
 	/*
 	 * This can happen if we hot-added a CPU but failed to allocate
@@ -2964,15 +2903,7 @@ int vmx_enable_virtualization_cpu(void)
 	if (kvm_is_using_evmcs() && !hv_get_vp_assist_page(cpu))
 		return -EFAULT;
 
-	intel_pt_handle_vmx(1);
-
-	r = kvm_cpu_vmxon(phys_addr);
-	if (r) {
-		intel_pt_handle_vmx(0);
-		return r;
-	}
-
-	return 0;
+	return x86_virt_get_cpu(X86_FEATURE_VMX);
 }
 
 static void vmclear_local_loaded_vmcss(void)
@@ -2989,12 +2920,9 @@ void vmx_disable_virtualization_cpu(void)
 {
 	vmclear_local_loaded_vmcss();
 
-	if (kvm_cpu_vmxoff())
-		kvm_spurious_fault();
+	x86_virt_put_cpu(X86_FEATURE_VMX);
 
 	hv_reset_evmcs();
-
-	intel_pt_handle_vmx(0);
 }
 
 struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
@@ -3072,47 +3000,6 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
 	return -ENOMEM;
 }
 
-static void free_kvm_area(void)
-{
-	int cpu;
-
-	for_each_possible_cpu(cpu) {
-		free_vmcs(per_cpu(vmxarea, cpu));
-		per_cpu(vmxarea, cpu) = NULL;
-	}
-}
-
-static __init int alloc_kvm_area(void)
-{
-	int cpu;
-
-	for_each_possible_cpu(cpu) {
-		struct vmcs *vmcs;
-
-		vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL);
-		if (!vmcs) {
-			free_kvm_area();
-			return -ENOMEM;
-		}
-
-		/*
-		 * When eVMCS is enabled, alloc_vmcs_cpu() sets
-		 * vmcs->revision_id to KVM_EVMCS_VERSION instead of
-		 * revision_id reported by MSR_IA32_VMX_BASIC.
-		 *
-		 * However, even though not explicitly documented by
-		 * TLFS, VMXArea passed as VMXON argument should
-		 * still be marked with revision_id reported by
-		 * physical CPU.
-		 */
-		if (kvm_is_using_evmcs())
-			vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic);
-
-		per_cpu(vmxarea, cpu) = vmcs;
-	}
-	return 0;
-}
-
 static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
 		struct kvm_segment *save)
 {
@@ -8380,8 +8267,6 @@ void vmx_hardware_unsetup(void)
 
 	if (nested)
 		nested_vmx_hardware_unsetup();
-
-	free_kvm_area();
 }
 
 void vmx_vm_destroy(struct kvm *kvm)
@@ -8686,10 +8571,6 @@ __init int vmx_hardware_setup(void)
 			return r;
 	}
 
-	r = alloc_kvm_area();
-	if (r && nested)
-		nested_vmx_hardware_unsetup();
-
 	kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
 
 	/*
@@ -8715,7 +8596,7 @@ __init int vmx_hardware_setup(void)
 
 	kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT;
 
-	return r;
+	return 0;
 }
 
 void vmx_exit(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 80cb882f19e2..f650f79d3d5a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -83,6 +83,8 @@
 #include <asm/intel_pt.h>
 #include <asm/emulate_prefix.h>
 #include <asm/sgx.h>
+#include <asm/virt.h>
+
 #include <clocksource/hyperv_timer.h>
 
 #define CREATE_TRACE_POINTS
@@ -694,9 +696,6 @@ static void drop_user_return_notifiers(void)
 		kvm_on_user_return(&msrs->urn);
 }
 
-__visible bool kvm_rebooting;
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_rebooting);
-
 /*
  * Handle a fault on a hardware virtualization (VMX or SVM) instruction.
  *
@@ -707,7 +706,7 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_rebooting);
 noinstr void kvm_spurious_fault(void)
 {
 	/* Fault while not rebooting.  We want the trace. */
-	BUG_ON(!kvm_rebooting);
+	BUG_ON(!virt_rebooting);
 }
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_spurious_fault);
 
@@ -12999,12 +12998,12 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_deliver_sipi_vector);
 
 void kvm_arch_enable_virtualization(void)
 {
-	cpu_emergency_register_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu);
+	x86_virt_register_emergency_callback(kvm_x86_ops.emergency_disable_virtualization_cpu);
 }
 
 void kvm_arch_disable_virtualization(void)
 {
-	cpu_emergency_unregister_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu);
+	x86_virt_unregister_emergency_callback(kvm_x86_ops.emergency_disable_virtualization_cpu);
 }
 
 int kvm_arch_enable_virtualization_cpu(void)
@@ -13106,11 +13105,11 @@ int kvm_arch_enable_virtualization_cpu(void)
 void kvm_arch_shutdown(void)
 {
 	/*
-	 * Set kvm_rebooting to indicate that KVM has asynchronously disabled
+	 * Set virt_rebooting to indicate that KVM has asynchronously disabled
 	 * hardware virtualization, i.e. that relevant errors and exceptions
 	 * aren't entirely unexpected.
 	 */
-	kvm_rebooting = true;
+	virt_rebooting = true;
 }
 
 void kvm_arch_disable_virtualization_cpu(void)
@@ -13127,7 +13126,7 @@ void kvm_arch_disable_virtualization_cpu(void)
 	 * disable virtualization arrives.  Handle the extreme edge case here
 	 * instead of trying to account for it in the normal flows.
 	 */
-	if (in_task() || WARN_ON_ONCE(!kvm_rebooting))
+	if (in_task() || WARN_ON_ONCE(!virt_rebooting))
 		drop_user_return_notifiers();
 	else
 		__module_get(THIS_MODULE);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 40993348a967..fdab0ad49098 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -54,7 +54,6 @@ struct kvm_host_values {
 	u64 arch_capabilities;
 };
 
-extern bool kvm_rebooting;
 void kvm_spurious_fault(void);
 
 #define SIZE_OF_MEMSLOTS_HASHTABLE \
diff --git a/arch/x86/virt/Makefile b/arch/x86/virt/Makefile
index ea343fc392dc..6e485751650c 100644
--- a/arch/x86/virt/Makefile
+++ b/arch/x86/virt/Makefile
@@ -1,2 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-y	+= svm/ vmx/
+
+obj-$(subst m,y,$(CONFIG_KVM_X86)) += hw.o
\ No newline at end of file
diff --git a/arch/x86/virt/hw.c b/arch/x86/virt/hw.c
new file mode 100644
index 000000000000..986e780cf438
--- /dev/null
+++ b/arch/x86/virt/hw.c
@@ -0,0 +1,340 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/errno.h>
+#include <linux/kvm_types.h>
+#include <linux/list.h>
+#include <linux/percpu.h>
+
+#include <asm/perf_event.h>
+#include <asm/processor.h>
+#include <asm/virt.h>
+#include <asm/vmx.h>
+
+static int x86_virt_feature __ro_after_init;
+
+__visible bool virt_rebooting;
+EXPORT_SYMBOL_GPL(virt_rebooting);
+
+static DEFINE_PER_CPU(int, virtualization_nr_users);
+
+static cpu_emergency_virt_cb __rcu *kvm_emergency_callback;
+
+void x86_virt_register_emergency_callback(cpu_emergency_virt_cb *callback)
+{
+	if (WARN_ON_ONCE(rcu_access_pointer(kvm_emergency_callback)))
+		return;
+
+	rcu_assign_pointer(kvm_emergency_callback, callback);
+}
+EXPORT_SYMBOL_FOR_MODULES(x86_virt_register_emergency_callback, "kvm");
+
+void x86_virt_unregister_emergency_callback(cpu_emergency_virt_cb *callback)
+{
+	if (WARN_ON_ONCE(rcu_access_pointer(kvm_emergency_callback) != callback))
+		return;
+
+	rcu_assign_pointer(kvm_emergency_callback, NULL);
+	synchronize_rcu();
+}
+EXPORT_SYMBOL_FOR_MODULES(x86_virt_unregister_emergency_callback, "kvm");
+
+static void x86_virt_invoke_kvm_emergency_callback(void)
+{
+	cpu_emergency_virt_cb *kvm_callback;
+
+	kvm_callback = rcu_dereference(kvm_emergency_callback);
+	if (kvm_callback)
+		kvm_callback();
+}
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+static DEFINE_PER_CPU(struct vmcs *, root_vmcs);
+
+static int x86_virt_cpu_vmxon(void)
+{
+	u64 vmxon_pointer = __pa(per_cpu(root_vmcs, raw_smp_processor_id()));
+	u64 msr;
+
+	cr4_set_bits(X86_CR4_VMXE);
+
+	asm goto("1: vmxon %[vmxon_pointer]\n\t"
+			  _ASM_EXTABLE(1b, %l[fault])
+			  : : [vmxon_pointer] "m"(vmxon_pointer)
+			  : : fault);
+	return 0;
+
+fault:
+	WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
+		  rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
+	cr4_clear_bits(X86_CR4_VMXE);
+
+	return -EFAULT;
+}
+
+static int x86_vmx_get_cpu(void)
+{
+	int r;
+
+	if (cr4_read_shadow() & X86_CR4_VMXE)
+		return -EBUSY;
+
+	intel_pt_handle_vmx(1);
+
+	r = x86_virt_cpu_vmxon();
+	if (r) {
+		intel_pt_handle_vmx(0);
+		return r;
+	}
+
+	return 0;
+}
+
+/*
+ * Disable VMX and clear CR4.VMXE (even if VMXOFF faults)
+ *
+ * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to
+ * atomically track post-VMXON state, e.g. this may be called in NMI context.
+ * Eat all faults as all other faults on VMXOFF faults are mode related, i.e.
+ * faults are guaranteed to be due to the !post-VMXON check unless the CPU is
+ * magically in RM, VM86, compat mode, or at CPL>0.
+ */
+static int x86_vmx_put_cpu(void)
+{
+	int r = -EIO;
+
+	asm goto("1: vmxoff\n\t"
+		 _ASM_EXTABLE(1b, %l[fault])
+		 ::: "cc", "memory" : fault);
+	r = 0;
+
+fault:
+	cr4_clear_bits(X86_CR4_VMXE);
+	intel_pt_handle_vmx(0);
+	return r;
+}
+
+static int x86_vmx_emergency_disable_virtualization_cpu(void)
+{
+	virt_rebooting = true;
+
+	/*
+	 * Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be
+	 * set in task context.  If this races with VMX being disabled via NMI,
+	 * VMCLEAR and VMXOFF may #UD, but the kernel will eat those faults due
+	 * to virt_rebooting being set.
+	 */
+	if (!(__read_cr4() & X86_CR4_VMXE))
+		return -ENOENT;
+
+	x86_virt_invoke_kvm_emergency_callback();
+
+	x86_vmx_put_cpu();
+	return 0;
+}
+
+static __init void x86_vmx_exit(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		free_page((unsigned long)per_cpu(root_vmcs, cpu));
+		per_cpu(root_vmcs, cpu) = NULL;
+	}
+}
+
+static __init int x86_vmx_init(void)
+{
+	u64 basic_msr;
+	u32 rev_id;
+	int cpu;
+
+	if (!cpu_feature_enabled(X86_FEATURE_VMX))
+		return -EOPNOTSUPP;
+
+	rdmsrq(MSR_IA32_VMX_BASIC, basic_msr);
+
+	/* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
+	if (WARN_ON_ONCE(vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE))
+		return -EIO;
+
+	/*
+	 * Even if eVMCS is enabled (or will be enabled?), and even though not
+	 * explicitly documented by TLFS, the root VMCS  passed to VMXON should
+	 * still be marked with the revision_id reported by the physical CPU.
+	 */
+	rev_id = vmx_basic_vmcs_revision_id(basic_msr);
+
+	for_each_possible_cpu(cpu) {
+		int node = cpu_to_node(cpu);
+		struct page *page;
+		struct vmcs *vmcs;
+
+		page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO , 0);
+		if (!page) {
+			x86_vmx_exit();
+			return -ENOMEM;
+		}
+
+		vmcs = page_address(page);
+		vmcs->hdr.revision_id = rev_id;
+		per_cpu(root_vmcs, cpu) = vmcs;
+	}
+
+	return 0;
+}
+#else
+static int x86_vmx_get_cpu(void) { BUILD_BUG_ON(1); return -EOPNOTSUPP; }
+static int x86_vmx_put_cpu(void) { BUILD_BUG_ON(1); return -EOPNOTSUPP; }
+static int x86_vmx_emergency_disable_virtualization_cpu(void) { BUILD_BUG_ON(1); return -EOPNOTSUPP; }
+static __init int x86_vmx_init(void) { return -EOPNOTSUPP; }
+#endif
+
+#if IS_ENABLED(CONFIG_KVM_AMD)
+static int x86_svm_get_cpu(void)
+{
+	u64 efer;
+
+	rdmsrq(MSR_EFER, efer);
+	if (efer & EFER_SVME)
+		return -EBUSY;
+
+	wrmsrq(MSR_EFER, efer | EFER_SVME);
+	return 0;
+}
+
+static int x86_svm_put_cpu(void)
+{
+	int r = -EIO;
+	u64 efer;
+
+	/*
+	 * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and
+	 * NMI aren't blocked.
+	 */
+	asm goto("1: stgi\n\t"
+		 _ASM_EXTABLE(1b, %l[fault])
+		 ::: "memory" : fault);
+	r = 0;
+
+fault:
+	rdmsrq(MSR_EFER, efer);
+	wrmsrq(MSR_EFER, efer & ~EFER_SVME);
+	return r;
+}
+
+static int x86_svm_emergency_disable_virtualization_cpu(void)
+{
+	u64 efer;
+
+	virt_rebooting = true;
+
+	rdmsrq(MSR_EFER, efer);
+	if (!(efer & EFER_SVME))
+		return -ENOENT;
+
+	x86_virt_invoke_kvm_emergency_callback();
+
+	return x86_svm_put_cpu();
+}
+
+static __init int x86_svm_init(void)
+{
+	if (!cpu_feature_enabled(X86_FEATURE_SVM))
+		return -EOPNOTSUPP;
+
+	return 0;
+}
+
+#else
+static int x86_svm_get_cpu(void) { BUILD_BUG_ON(1); return -EOPNOTSUPP; }
+static int x86_svm_put_cpu(void) { BUILD_BUG_ON(1); return -EOPNOTSUPP; }
+static int x86_svm_emergency_disable_virtualization_cpu(void) { BUILD_BUG_ON(1); return -EOPNOTSUPP; }
+static __init int x86_svm_init(void) { return -EOPNOTSUPP; }
+#endif
+
+#define x86_virt_call(fn)				\
+({							\
+	int __r;					\
+							\
+	if (IS_ENABLED(CONFIG_KVM_INTEL) &&		\
+	    cpu_feature_enabled(X86_FEATURE_VMX))	\
+		__r = x86_vmx_##fn();			\
+	else if (IS_ENABLED(CONFIG_KVM_AMD) &&		\
+		 cpu_feature_enabled(X86_FEATURE_SVM))	\
+		__r = x86_svm_##fn();			\
+	else						\
+		__r = -EOPNOTSUPP;			\
+							\
+	__r;						\
+})
+
+int x86_virt_get_cpu(int feat)
+{
+	int r;
+
+	if (!x86_virt_feature || x86_virt_feature != feat)
+		return -EOPNOTSUPP;
+
+	if (this_cpu_inc_return(virtualization_nr_users) > 1)
+		return 0;
+
+	r = x86_virt_call(get_cpu);
+	if (r)
+		WARN_ON_ONCE(this_cpu_dec_return(virtualization_nr_users));
+
+	return r;
+}
+EXPORT_SYMBOL_GPL(x86_virt_get_cpu);
+
+void x86_virt_put_cpu(int feat)
+{
+	if (WARN_ON_ONCE(!this_cpu_read(virtualization_nr_users)) ||
+	    this_cpu_dec_return(virtualization_nr_users))
+		return;
+
+	BUG_ON(x86_virt_call(put_cpu) && !virt_rebooting);
+}
+EXPORT_SYMBOL_GPL(x86_virt_put_cpu);
+
+/*
+ * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during
+ * reboot.  VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if
+ * GIF=0, i.e. if the crash occurred between CLGI and STGI.
+ */
+int x86_virt_emergency_disable_virtualization_cpu(void)
+{
+	if (!x86_virt_feature)
+		return -EOPNOTSUPP;
+
+	/*
+	 * IRQs must be disabled as virtualization is enabled in hardware via
+	 * function call IPIs, i.e. IRQs need to be disabled to guarantee
+	 * virtualization stays disabled.
+	 */
+	lockdep_assert_irqs_disabled();
+
+	/*
+	 * Do the NMI shootdown even if virtualization is off on _this_ CPU, as
+	 * other CPUs may have virtualization enabled.
+	 *
+	 * TODO: Track whether or not virtualization might be enabled on other
+	 *	 CPUs?  May not be worth avoiding the NMI shootdown...
+	 */
+	(void)x86_virt_call(emergency_disable_virtualization_cpu);
+	return 0;
+}
+
+void __init x86_virt_init(void)
+{
+	bool is_vmx = !x86_vmx_init();
+	bool is_svm = !x86_svm_init();
+
+	if (!is_vmx && !is_svm)
+		return;
+
+	if (WARN_ON_ONCE(is_vmx && is_svm))
+		return;
+
+	x86_virt_feature = is_vmx ? X86_FEATURE_VMX : X86_FEATURE_SVM;
+}
-- 
2.52.0.223.gf5cc29aaa4-goog

Re: [PATCH v2 2/7] KVM: x86: Extract VMXON and EFER.SVME enablement to kernel

Posted by Dave Hansen 1 month, 3 weeks ago

On 12/5/25 17:10, Sean Christopherson wrote:
> Move the innermost VMXON and EFER.SVME management logic out of KVM and
> into to core x86 so that TDX can force VMXON without having to rely on
> KVM being loaded, e.g. to do SEAMCALLs during initialization.
> 
> Implement a per-CPU refcounting scheme so that "users", e.g. KVM and the
> future TDX code, can co-exist without pulling the rug out from under each
> other.
> 
> To avoid having to choose between SVM and VMX, simply refuse to enable
> either if both are somehow supported.  No known CPU supports both SVM and
> VMX, and it's comically unlikely such a CPU will ever exist.

Yeah, that's totally a "please share your drugs" moment, if it happens.

> +static int x86_vmx_get_cpu(void)
> +{
> +	int r;
> +
> +	if (cr4_read_shadow() & X86_CR4_VMXE)
> +		return -EBUSY;
> +
> +	intel_pt_handle_vmx(1);
> +
> +	r = x86_virt_cpu_vmxon();
> +	if (r) {
> +		intel_pt_handle_vmx(0);
> +		return r;
> +	}
> +
> +	return 0;
> +}
...> +#define x86_virt_call(fn)				\
> +({							\
> +	int __r;					\
> +							\
> +	if (IS_ENABLED(CONFIG_KVM_INTEL) &&		\
> +	    cpu_feature_enabled(X86_FEATURE_VMX))	\
> +		__r = x86_vmx_##fn();			\
> +	else if (IS_ENABLED(CONFIG_KVM_AMD) &&		\
> +		 cpu_feature_enabled(X86_FEATURE_SVM))	\
> +		__r = x86_svm_##fn();			\
> +	else						\
> +		__r = -EOPNOTSUPP;			\
> +							\
> +	__r;						\
> +})

I'm not a super big fan of this. I know you KVM folks love your macros
and wrapping function calls in them because you hate grep. ;)

I don't like a foo_get_cpu() call having such fundamentally different
semantics than good old get_cpu() itself. *Especially* when the calls
look like:

	r = x86_virt_call(get_cpu);

and get_cpu() itself it not invovled one bit. This 100% looks like it's
some kind of virt-specific call for get_cpu().

I think it's probably OK to make this get_hw_ref() or inc_hw_ref() or
something to get it away from getting confused with get_cpu().

IMNHO, the macro magic is overkill. A couple of global function pointers
would probably be fine because none of this code is even remotely
performance sensitive. A couple static_call()s would be fine too because
those at least make it blatantly obvious that the thing being called is
variable. A good ol' ops structure would also make things obvious, but
are probably also overkill-adjecent for this.

P.S. In a perfect world, the renames would also be in their own patches,
but I think I can live with it as-is.

Re: [PATCH v2 2/7] KVM: x86: Extract VMXON and EFER.SVME enablement to kernel

Posted by Sean Christopherson 1 month, 3 weeks ago

On Fri, Dec 19, 2025, Dave Hansen wrote:
> On 12/5/25 17:10, Sean Christopherson wrote:
> > +static int x86_vmx_get_cpu(void)
> > +{
> > +	int r;
> > +
> > +	if (cr4_read_shadow() & X86_CR4_VMXE)
> > +		return -EBUSY;
> > +
> > +	intel_pt_handle_vmx(1);
> > +
> > +	r = x86_virt_cpu_vmxon();
> > +	if (r) {
> > +		intel_pt_handle_vmx(0);
> > +		return r;
> > +	}
> > +
> > +	return 0;
> > +}
> ...> +#define x86_virt_call(fn)				\
> > +({							\
> > +	int __r;					\
> > +							\
> > +	if (IS_ENABLED(CONFIG_KVM_INTEL) &&		\
> > +	    cpu_feature_enabled(X86_FEATURE_VMX))	\
> > +		__r = x86_vmx_##fn();			\
> > +	else if (IS_ENABLED(CONFIG_KVM_AMD) &&		\
> > +		 cpu_feature_enabled(X86_FEATURE_SVM))	\
> > +		__r = x86_svm_##fn();			\
> > +	else						\
> > +		__r = -EOPNOTSUPP;			\
> > +							\
> > +	__r;						\
> > +})
> 
> I'm not a super big fan of this. I know you KVM folks love your macros
> and wrapping function calls in them because you hate grep. ;)

Heh, kvm_x86_call() exists _because_ I like grep and search functionality.  The
number of times I couldn't find something because I was searching for full words
and forgot about the kvm_x86_ prefix...

> I don't like a foo_get_cpu() call having such fundamentally different
> semantics than good old get_cpu() itself. *Especially* when the calls
> look like:
> 
> 	r = x86_virt_call(get_cpu);
> 
> and get_cpu() itself it not invovled one bit. This 100% looks like it's
> some kind of virt-specific call for get_cpu().
> 
> I think it's probably OK to make this get_hw_ref() or inc_hw_ref() or
> something to get it away from getting confused with get_cpu().

Oof, yeah, didn't think about a collision with {get,put}_cpu().  How about 
x86_virt_{get,put}_ref()?  I like how the callers read, e.g. "get a reference to
VMX or SVM":

  x86_virt_get_ref(X86_FEATURE_VMX);
  x86_virt_put_ref(X86_FEATURE_VMX);

  x86_virt_get_ref(X86_FEATURE_SVM);
  x86_virt_put_ref(X86_FEATURE_SVM);

> IMNHO, the macro magic is overkill. A couple of global function pointers
> would probably be fine because none of this code is even remotely
> performance sensitive. A couple static_call()s would be fine too because
> those at least make it blatantly obvious that the thing being called is
> variable. A good ol' ops structure would also make things obvious, but
> are probably also overkill-adjecent for this.

Agreed.  I'm not even entirely sure why I took this approach.  I suspect I carried
over the basic concept from code that wanted to run before wiring up function
pointers, and never revisited the implementation once the dust settled.

I haven't tested yet, but I've got this:

  struct x86_virt_ops {
	int feature;
	int (*enable_virtualization_cpu)(void);
	int (*disable_virtualization_cpu)(void);
	void (*emergency_disable_virtualization_cpu)(void);
  };
  static struct x86_virt_ops virt_ops __ro_after_init;

and then usage like:

  int x86_virt_get_ref(int feat)
  {
	int r;

	if (!virt_ops.feature || virt_ops.feature != feat)
		return -EOPNOTSUPP;

	if (this_cpu_inc_return(virtualization_nr_users) > 1)
		return 0;

	r = virt_ops.enable_virtualization_cpu();
	if (r)
		WARN_ON_ONCE(this_cpu_dec_return(virtualization_nr_users));

	return r;
  }
  EXPORT_SYMBOL_GPL(x86_virt_get_ref);

  void x86_virt_put_ref(int feat)
  {
	if (WARN_ON_ONCE(!this_cpu_read(virtualization_nr_users)) ||
	    this_cpu_dec_return(virtualization_nr_users))
		return;

	BUG_ON(virt_ops.disable_virtualization_cpu() && !virt_rebooting);
  }
  EXPORT_SYMBOL_GPL(x86_virt_put_ref);

> P.S. In a perfect world, the renames would also be in their own patches,
> but I think I can live with it as-is.

Ya, I'll chunk the patch up.

Re: [PATCH v2 2/7] KVM: x86: Extract VMXON and EFER.SVME enablement to kernel

Posted by Dave Hansen 1 month, 3 weeks ago

On 12/19/25 10:35, Sean Christopherson wrote:
> I haven't tested yet, but I've got this:
> 
>   struct x86_virt_ops {
> 	int feature;
> 	int (*enable_virtualization_cpu)(void);
> 	int (*disable_virtualization_cpu)(void);
> 	void (*emergency_disable_virtualization_cpu)(void);
>   };
>   static struct x86_virt_ops virt_ops __ro_after_init;
> 
> and then usage like:
> 
>   int x86_virt_get_ref(int feat)
>   {
> 	int r;
> 
> 	if (!virt_ops.feature || virt_ops.feature != feat)
> 		return -EOPNOTSUPP;
> 
> 	if (this_cpu_inc_return(virtualization_nr_users) > 1)
> 		return 0;
> 
> 	r = virt_ops.enable_virtualization_cpu();
> 	if (r)
> 		WARN_ON_ONCE(this_cpu_dec_return(virtualization_nr_users));
> 
> 	return r;
>   }
>   EXPORT_SYMBOL_GPL(x86_virt_get_ref);

Yeah, that's much easier to follow, and fixes the naming collision thanks!

Re: [PATCH v2 2/7] KVM: x86: Extract VMXON and EFER.SVME enablement to kernel

Posted by Chao Gao 2 months ago

>--- /dev/null
>+++ b/arch/x86/include/asm/virt.h
>@@ -0,0 +1,26 @@
>+/* SPDX-License-Identifier: GPL-2.0-only */
>+#ifndef _ASM_X86_VIRT_H
>+#define _ASM_X86_VIRT_H
>+
>+#include <asm/reboot.h>

asm/reboot.h isn't used.

>+
>+typedef void (cpu_emergency_virt_cb)(void);
>+
>+#if IS_ENABLED(CONFIG_KVM_X86)
>+extern bool virt_rebooting;
>+
>+void __init x86_virt_init(void);
>+
>+int x86_virt_get_cpu(int feat);
>+void x86_virt_put_cpu(int feat);
>+
>+int x86_virt_emergency_disable_virtualization_cpu(void);
>+
>+void x86_virt_register_emergency_callback(cpu_emergency_virt_cb *callback);
>+void x86_virt_unregister_emergency_callback(cpu_emergency_virt_cb *callback);
>+#else
>+static __always_inline void x86_virt_init(void) {}

Why does this need to be "__always_inline" rather than just "inline"?

> static void emergency_reboot_disable_virtualization(void)
> {
> 	local_irq_disable();
>@@ -587,16 +543,11 @@ static void emergency_reboot_disable_virtualization(void)
> 	 * We can't take any locks and we may be on an inconsistent state, so
> 	 * use NMIs as IPIs to tell the other CPUs to disable VMX/SVM and halt.
> 	 *
>-	 * Do the NMI shootdown even if virtualization is off on _this_ CPU, as
>-	 * other CPUs may have virtualization enabled.
>+	 * Safely force _this_ CPU out of VMX/SVM operation, and if necessary,
>+	 * blast NMIs to force other CPUs out of VMX/SVM as well.k

								 ^ stray "k".

I don't understand the "if necessary" part. My understanding is this code
issues NMIs if CPUs support VMX or SVM. If so, I think the code snippet below
would be more readable:

	if (cpu_feature_enabled(X86_FEATURE_VMX) ||
	    cpu_feature_enabled(X86_FEATURE_SVM)) {
		x86_virt_emergency_disable_virtualization_cpu();
		nmi_shootdown_cpus_on_restart();
	}

Then x86_virt_emergency_disable_virtualization_cpu() wouldn't need to return
anything. And readers wouldn't need to trace down the function to understand
when NMIs are "necessary" and when they are not.

> 	 */
>-	if (rcu_access_pointer(cpu_emergency_virt_callback)) {
>-		/* Safely force _this_ CPU out of VMX/SVM operation. */
>-		cpu_emergency_disable_virtualization();
>-
>-		/* Disable VMX/SVM and halt on other CPUs. */
>+	if (!x86_virt_emergency_disable_virtualization_cpu())
> 		nmi_shootdown_cpus_on_restart();
>-	}
> }

<snip>

>+#define x86_virt_call(fn)				\
>+({							\
>+	int __r;					\
>+							\
>+	if (IS_ENABLED(CONFIG_KVM_INTEL) &&		\
>+	    cpu_feature_enabled(X86_FEATURE_VMX))	\
>+		__r = x86_vmx_##fn();			\
>+	else if (IS_ENABLED(CONFIG_KVM_AMD) &&		\
>+		 cpu_feature_enabled(X86_FEATURE_SVM))	\
>+		__r = x86_svm_##fn();			\
>+	else						\
>+		__r = -EOPNOTSUPP;			\
>+							\
>+	__r;						\
>+})
>+
>+int x86_virt_get_cpu(int feat)
>+{
>+	int r;
>+
>+	if (!x86_virt_feature || x86_virt_feature != feat)
>+		return -EOPNOTSUPP;
>+
>+	if (this_cpu_inc_return(virtualization_nr_users) > 1)
>+		return 0;

Should we assert that preemption is disabled? Calling this API when preemption
is enabled is wrong.

Maybe use __this_cpu_inc_return(), which already verifies preemption status.

<snip>

>+int x86_virt_emergency_disable_virtualization_cpu(void)
>+{
>+	if (!x86_virt_feature)
>+		return -EOPNOTSUPP;
>+
>+	/*
>+	 * IRQs must be disabled as virtualization is enabled in hardware via
>+	 * function call IPIs, i.e. IRQs need to be disabled to guarantee
>+	 * virtualization stays disabled.
>+	 */
>+	lockdep_assert_irqs_disabled();
>+
>+	/*
>+	 * Do the NMI shootdown even if virtualization is off on _this_ CPU, as
>+	 * other CPUs may have virtualization enabled.
>+	 *
>+	 * TODO: Track whether or not virtualization might be enabled on other
>+	 *	 CPUs?  May not be worth avoiding the NMI shootdown...
>+	 */

This comment is misplaced. NMIs are issued by the caller.

>+	(void)x86_virt_call(emergency_disable_virtualization_cpu);
>+	return 0;
>+}

Re: [PATCH v2 2/7] KVM: x86: Extract VMXON and EFER.SVME enablement to kernel

Posted by Xu Yilun 1 month, 3 weeks ago

> >+#define x86_virt_call(fn)				\
> >+({							\
> >+	int __r;					\
> >+							\
> >+	if (IS_ENABLED(CONFIG_KVM_INTEL) &&		\
> >+	    cpu_feature_enabled(X86_FEATURE_VMX))	\
> >+		__r = x86_vmx_##fn();			\
> >+	else if (IS_ENABLED(CONFIG_KVM_AMD) &&		\
> >+		 cpu_feature_enabled(X86_FEATURE_SVM))	\
> >+		__r = x86_svm_##fn();			\
> >+	else						\
> >+		__r = -EOPNOTSUPP;			\
> >+							\
> >+	__r;						\
> >+})
> >+
> >+int x86_virt_get_cpu(int feat)
> >+{
> >+	int r;
> >+
> >+	if (!x86_virt_feature || x86_virt_feature != feat)
> >+		return -EOPNOTSUPP;
> >+
> >+	if (this_cpu_inc_return(virtualization_nr_users) > 1)
> >+		return 0;
> 
> Should we assert that preemption is disabled? Calling this API when preemption
> is enabled is wrong.
> 
> Maybe use __this_cpu_inc_return(), which already verifies preemption status.
> 

Is it better we explicitly assert the preemption for x86_virt_get_cpu()
rather than embed the check in __this_cpu_inc_return()? We are not just
protecting the racing for the reference counter. We should ensure the
"counter increase + x86_virt_call(get_cpu)" can't be preempted.

Thanks,
Yilun

Re: [PATCH v2 2/7] KVM: x86: Extract VMXON and EFER.SVME enablement to kernel

Posted by Sean Christopherson 1 month, 3 weeks ago

On Wed, Dec 17, 2025, Xu Yilun wrote:
> > >+#define x86_virt_call(fn)				\
> > >+({							\
> > >+	int __r;					\
> > >+							\
> > >+	if (IS_ENABLED(CONFIG_KVM_INTEL) &&		\
> > >+	    cpu_feature_enabled(X86_FEATURE_VMX))	\
> > >+		__r = x86_vmx_##fn();			\
> > >+	else if (IS_ENABLED(CONFIG_KVM_AMD) &&		\
> > >+		 cpu_feature_enabled(X86_FEATURE_SVM))	\
> > >+		__r = x86_svm_##fn();			\
> > >+	else						\
> > >+		__r = -EOPNOTSUPP;			\
> > >+							\
> > >+	__r;						\
> > >+})
> > >+
> > >+int x86_virt_get_cpu(int feat)
> > >+{
> > >+	int r;
> > >+
> > >+	if (!x86_virt_feature || x86_virt_feature != feat)
> > >+		return -EOPNOTSUPP;
> > >+
> > >+	if (this_cpu_inc_return(virtualization_nr_users) > 1)
> > >+		return 0;
> > 
> > Should we assert that preemption is disabled? Calling this API when preemption
> > is enabled is wrong.
> > 
> > Maybe use __this_cpu_inc_return(), which already verifies preemption status.

I always forget that the double-underscores have the checks.  

> Is it better we explicitly assert the preemption for x86_virt_get_cpu()
> rather than embed the check in __this_cpu_inc_return()? We are not just
> protecting the racing for the reference counter. We should ensure the
> "counter increase + x86_virt_call(get_cpu)" can't be preempted.

I don't have a strong preference.  Using __this_cpu_inc_return() without any
nearby preemption_{enable,disable}() calls makes it quite clears that preemption
is expected to be disabled by the caller.  But I'm also ok being explicit.

Re: [PATCH v2 2/7] KVM: x86: Extract VMXON and EFER.SVME enablement to kernel

Posted by Xu Yilun 1 month, 3 weeks ago

On Wed, Dec 17, 2025 at 11:01:59AM -0800, Sean Christopherson wrote:
> On Wed, Dec 17, 2025, Xu Yilun wrote:
> > > >+#define x86_virt_call(fn)				\
> > > >+({							\
> > > >+	int __r;					\
> > > >+							\
> > > >+	if (IS_ENABLED(CONFIG_KVM_INTEL) &&		\
> > > >+	    cpu_feature_enabled(X86_FEATURE_VMX))	\
> > > >+		__r = x86_vmx_##fn();			\
> > > >+	else if (IS_ENABLED(CONFIG_KVM_AMD) &&		\
> > > >+		 cpu_feature_enabled(X86_FEATURE_SVM))	\
> > > >+		__r = x86_svm_##fn();			\
> > > >+	else						\
> > > >+		__r = -EOPNOTSUPP;			\
> > > >+							\
> > > >+	__r;						\
> > > >+})
> > > >+
> > > >+int x86_virt_get_cpu(int feat)
> > > >+{
> > > >+	int r;
> > > >+
> > > >+	if (!x86_virt_feature || x86_virt_feature != feat)
> > > >+		return -EOPNOTSUPP;
> > > >+
> > > >+	if (this_cpu_inc_return(virtualization_nr_users) > 1)
> > > >+		return 0;
> > > 
> > > Should we assert that preemption is disabled? Calling this API when preemption
> > > is enabled is wrong.
> > > 
> > > Maybe use __this_cpu_inc_return(), which already verifies preemption status.
> 
> I always forget that the double-underscores have the checks.  
> 
> > Is it better we explicitly assert the preemption for x86_virt_get_cpu()
> > rather than embed the check in __this_cpu_inc_return()? We are not just
> > protecting the racing for the reference counter. We should ensure the
> > "counter increase + x86_virt_call(get_cpu)" can't be preempted.
> 
> I don't have a strong preference.  Using __this_cpu_inc_return() without any
> nearby preemption_{enable,disable}() calls makes it quite clears that preemption
> is expected to be disabled by the caller.  But I'm also ok being explicit.

Looking into __this_cpu_inc_return(), it finally calls
check_preemption_disabled() which doesn't strictly requires preemption.
It only ensures the context doesn't switch to another CPU. If the caller
is in cpuhp context, preemption is possible.

But in this x86_virt_get_cpu(), we need to ensure preemption disabled,
otherwise caller A increases counter but hasn't do actual VMXON yet and
get preempted. Caller B opts in and get the wrong info that VMX is
already on, and fails on following vmx operations.

On a second thought, maybe we disable preemption inside
x86_virt_get_cpu() to protect the counter-vmxon racing, this is pure
internal thing for this kAPI.

Thanks,
Yilun

Re: [PATCH v2 2/7] KVM: x86: Extract VMXON and EFER.SVME enablement to kernel

Posted by Sean Christopherson 1 month, 3 weeks ago

On Fri, Dec 19, 2025, Xu Yilun wrote:
> On Wed, Dec 17, 2025 at 11:01:59AM -0800, Sean Christopherson wrote:
> > On Wed, Dec 17, 2025, Xu Yilun wrote:
> > > Is it better we explicitly assert the preemption for x86_virt_get_cpu()
> > > rather than embed the check in __this_cpu_inc_return()? We are not just
> > > protecting the racing for the reference counter. We should ensure the
> > > "counter increase + x86_virt_call(get_cpu)" can't be preempted.
> > 
> > I don't have a strong preference.  Using __this_cpu_inc_return() without any
> > nearby preemption_{enable,disable}() calls makes it quite clears that preemption
> > is expected to be disabled by the caller.  But I'm also ok being explicit.
> 
> Looking into __this_cpu_inc_return(), it finally calls
> check_preemption_disabled() which doesn't strictly requires preemption.
> It only ensures the context doesn't switch to another CPU. If the caller
> is in cpuhp context, preemption is possible.

Hmm, right, the cpuhp thread is is_percpu_thread(), and KVM's hooks aren't
considered atomic and so run with IRQs enabled.  In practice, it's "fine", because
TDX also exclusively does x86_virt_get_cpu() from cpuhp, i.e. the two users are
mutually exclusive, but relying on that behavior is gross.

> But in this x86_virt_get_cpu(), we need to ensure preemption disabled,
> otherwise caller A increases counter but hasn't do actual VMXON yet and
> get preempted. Caller B opts in and get the wrong info that VMX is
> already on, and fails on following vmx operations.
> 
> On a second thought, maybe we disable preemption inside
> x86_virt_get_cpu() to protect the counter-vmxon racing, this is pure
> internal thing for this kAPI.

Ya, that'd be my preference.

Kai, question for you (or anyone else that might know):

Is there any **need** for tdx_cpu_enable() and try_init_module_global() to run
with IRQs disabled?  AFAICT, the lockdep_assert_irqs_disabled() checks added by
commit 6162b310bc21 ("x86/virt/tdx: Add skeleton to enable TDX on demand") were
purely because, _when the code was written_, KVM enabled virtualization via IPI
function calls.

But by the time the KVM code landed upstream in commit fcdbdf63431c ("KVM: VMX:
Initialize TDX during KVM module load"), that was no longer true, thanks to
commit 9a798b1337af ("KVM: Register cpuhp and syscore callbacks when enabling
hardware") setting the stage for doing everything from task context.

However, rather than update the kernel side, e.g. to drop the lockdep assertions
and related comments, commit 9a798b1337af instead did this:

	local_irq_save(flags);
	r = tdx_cpu_enable();
	local_irq_restore(flags);

Somewhat frustratingly, I poked at this when the reworked code was first posted
(https://lore.kernel.org/all/ZyJOiPQnBz31qLZ7@google.com), but it just got swept
under the rug :-(

  : > > +	/* tdx_cpu_enable() must be called with IRQ disabled */
  : > 
  : > I don't find this comment helpfu.  If it explained _why_ tdx_cpu_enable() requires
  : > IRQs to be disabled, then I'd feel differently, but as is, IMO it doesn't add value.
  : 
  : I'll remove the comment.
  : 
  : > 
  : > > +	local_irq_save(flags);
  : > > +	r = tdx_cpu_enable();
  : > > +	local_irq_restore(flags);

Unless TDX _needs_ IRQs to be disabled, I would strongly prefer to drop that code
in prep patches so that it doesn't become even harder to disentagle the history
to figure out why tdx_online_cpu() disables IRQs:

  static int tdx_online_cpu(unsigned int cpu)
  {
	int ret;

	guard(irqsave)();  <=============== why is this here!?!?!

	ret = x86_virt_get_cpu(X86_FEATURE_VMX);
	if (ret)
		return ret;

	ret = tdx_cpu_enable();
	if (ret)
		x86_virt_put_cpu(X86_FEATURE_VMX);

	return ret;
  }

Side topic, KVM's change in behavior also means this comment is stale (though I
think it's worth keeping the assertion, but with a comment saying it's hardening
and paranoia, not a strick requirement).

	/*
	 * IRQs must be disabled as virtualization is enabled in hardware via
	 * function call IPIs, i.e. IRQs need to be disabled to guarantee
	 * virtualization stays disabled.
	 */
	lockdep_assert_irqs_disabled();

Re: [PATCH v2 2/7] KVM: x86: Extract VMXON and EFER.SVME enablement to kernel

Posted by Binbin Wu 1 week, 6 days ago


On 12/19/2025 11:40 PM, Sean Christopherson wrote:
> On Fri, Dec 19, 2025, Xu Yilun wrote:
>> On Wed, Dec 17, 2025 at 11:01:59AM -0800, Sean Christopherson wrote:
>>> On Wed, Dec 17, 2025, Xu Yilun wrote:
>>>> Is it better we explicitly assert the preemption for x86_virt_get_cpu()
>>>> rather than embed the check in __this_cpu_inc_return()? We are not just
>>>> protecting the racing for the reference counter. We should ensure the
>>>> "counter increase + x86_virt_call(get_cpu)" can't be preempted.
>>>
>>> I don't have a strong preference.  Using __this_cpu_inc_return() without any
>>> nearby preemption_{enable,disable}() calls makes it quite clears that preemption
>>> is expected to be disabled by the caller.  But I'm also ok being explicit.
>>
>> Looking into __this_cpu_inc_return(), it finally calls
>> check_preemption_disabled() which doesn't strictly requires preemption.
>> It only ensures the context doesn't switch to another CPU. If the caller
>> is in cpuhp context, preemption is possible.
> 
> Hmm, right, the cpuhp thread is is_percpu_thread(), and KVM's hooks aren't
> considered atomic and so run with IRQs enabled.  In practice, it's "fine", because
> TDX also exclusively does x86_virt_get_cpu() from cpuhp, i.e. the two users are
> mutually exclusive, but relying on that behavior is gross.
> 

Side topic:
Isn't the name of check_preemption_disabled() confusing and arguably misleading?

Re: [PATCH v2 2/7] KVM: x86: Extract VMXON and EFER.SVME enablement to kernel

Posted by Huang, Kai 1 month, 3 weeks ago

On Fri, 2025-12-19 at 07:40 -0800, Sean Christopherson wrote:
> Kai, question for you (or anyone else that might know):
> 
> Is there any **need** for tdx_cpu_enable() and try_init_module_global() to run
> with IRQs disabled?  AFAICT, the lockdep_assert_irqs_disabled() checks added by
> commit 6162b310bc21 ("x86/virt/tdx: Add skeleton to enable TDX on demand") were
> purely because, _when the code was written_, KVM enabled virtualization via IPI
> function calls.
> 
> But by the time the KVM code landed upstream in commit fcdbdf63431c ("KVM: VMX:
> Initialize TDX during KVM module load"), that was no longer true, thanks to
> commit 9a798b1337af ("KVM: Register cpuhp and syscore callbacks when enabling
> hardware") setting the stage for doing everything from task context.

The other intention was to make tdx_cpu_enable() as IRQ safe, since it was
supposed to be able to be called by multiple in-kernel users but not just
KVM (i.e., also for TDX connect).

But right currently we don't need to call them with IRQ disabled.

Re: [PATCH v2 2/7] KVM: x86: Extract VMXON and EFER.SVME enablement to kernel

Posted by Dave Hansen 1 month, 3 weeks ago

On 12/19/25 07:40, Sean Christopherson wrote:
> Is there any **need** for tdx_cpu_enable() and try_init_module_global() to run
> with IRQs disabled?  AFAICT, the lockdep_assert_irqs_disabled() checks added by
> commit 6162b310bc21 ("x86/virt/tdx: Add skeleton to enable TDX on demand") were
> purely because, _when the code was written_, KVM enabled virtualization via IPI
> function calls.

Yeah, the intent was to prevent "rmmod kvm_intel" from running while TDX
was being initialized. The way it ended up, though, I think TDX init
would have been called in a KVM path _anyway_, which makes this double
useless.

Re: [PATCH v2 2/7] KVM: x86: Extract VMXON and EFER.SVME enablement to kernel

Posted by dan.j.williams@intel.com 2 months ago

Sean Christopherson wrote:
> Move the innermost VMXON and EFER.SVME management logic out of KVM and
> into to core x86 so that TDX can force VMXON without having to rely on
> KVM being loaded, e.g. to do SEAMCALLs during initialization.
> 
> Implement a per-CPU refcounting scheme so that "users", e.g. KVM and the
> future TDX code, can co-exist without pulling the rug out from under each
> other.
> 
> To avoid having to choose between SVM and VMX, simply refuse to enable
> either if both are somehow supported.  No known CPU supports both SVM and
> VMX, and it's comically unlikely such a CPU will ever exist.
> 
> For lack of a better name, call the new file "hw.c", to yield "virt
> hardware" when combined with its parent directory.
> 
> Signed-off-by: Sean Christopherson <seanjc@google.com>
[..]

Only patch organization and naming choice questions from me. I only
looked at the VMX side of things since I previously made an attempt at
moving that. I gave a cursory look at the SVM details.

Reviewed-by: Dan Williams <dan.j.williams@intel.com>

> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 80cb882f19e2..f650f79d3d5a 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -83,6 +83,8 @@
>  #include <asm/intel_pt.h>
>  #include <asm/emulate_prefix.h>
>  #include <asm/sgx.h>
> +#include <asm/virt.h>
> +
>  #include <clocksource/hyperv_timer.h>
>  
>  #define CREATE_TRACE_POINTS
> @@ -694,9 +696,6 @@ static void drop_user_return_notifiers(void)
>  		kvm_on_user_return(&msrs->urn);
>  }
>  
> -__visible bool kvm_rebooting;
> -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_rebooting);

...a short stay for this symbol in kvm/x86.c? It raises my curiosity why
patch1 is separate. Patch1 looked like the start of a series of
incremental conversions, patch2 is a combo move. I am ok either way,
just questioning consistency. I.e. if combo move then patch1 folds in
here, if incremental, perhaps split out other combo conversions like
emergency_disable_virtualization_cpu()? The aspect of "this got moved
twice in the same patchset" is what poked me.

[..]
> diff --git a/arch/x86/virt/hw.c b/arch/x86/virt/hw.c
> new file mode 100644
> index 000000000000..986e780cf438
> --- /dev/null
> +++ b/arch/x86/virt/hw.c
> @@ -0,0 +1,340 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +#include <linux/cpu.h>
> +#include <linux/cpumask.h>
> +#include <linux/errno.h>
> +#include <linux/kvm_types.h>
> +#include <linux/list.h>
> +#include <linux/percpu.h>
> +
> +#include <asm/perf_event.h>
> +#include <asm/processor.h>
> +#include <asm/virt.h>
> +#include <asm/vmx.h>
> +
> +static int x86_virt_feature __ro_after_init;
> +
> +__visible bool virt_rebooting;
> +EXPORT_SYMBOL_GPL(virt_rebooting);
> +
> +static DEFINE_PER_CPU(int, virtualization_nr_users);
> +
> +static cpu_emergency_virt_cb __rcu *kvm_emergency_callback;

Hmm, why kvm_ and not virt_?

[..]
> +#if IS_ENABLED(CONFIG_KVM_INTEL)
> +static DEFINE_PER_CPU(struct vmcs *, root_vmcs);

Perhaps introduce a CONFIG_INTEL_VMX for this? For example, KVM need not
be enabled if all one wants to do is use TDX to setup PCIe Link
Encryption. ...or were you expecting?

#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(...<other VMX users>...)

Re: [PATCH v2 2/7] KVM: x86: Extract VMXON and EFER.SVME enablement to kernel

Posted by Sean Christopherson 2 months ago

On Sat, Dec 06, 2025, dan.j.williams@intel.com wrote:
> Sean Christopherson wrote:
> > @@ -694,9 +696,6 @@ static void drop_user_return_notifiers(void)
> >  		kvm_on_user_return(&msrs->urn);
> >  }
> >  
> > -__visible bool kvm_rebooting;
> > -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_rebooting);
> 
> ...a short stay for this symbol in kvm/x86.c? It raises my curiosity why
> patch1 is separate.

Because it affects non-x86 architectures.  It should be a complete nop, but I
wanted to isolate what I could.

> Patch1 looked like the start of a series of incremental conversions, patch2
> is a combo move. I am ok either way, just questioning consistency. I.e. if
> combo move then patch1 folds in here, if incremental, perhaps split out other
> combo conversions like emergency_disable_virtualization_cpu()? The aspect of
> "this got moved twice in the same patchset" is what poked me.

Yeah, I got lazy to a large extent.  I'm not super optimistic that we won't end
up with one big "move all this stuff" patch, but I agree it doesn't need to be
_this_ big.

> [..]
> > diff --git a/arch/x86/virt/hw.c b/arch/x86/virt/hw.c
> > new file mode 100644
> > index 000000000000..986e780cf438
> > --- /dev/null
> > +++ b/arch/x86/virt/hw.c
> > @@ -0,0 +1,340 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +#include <linux/cpu.h>
> > +#include <linux/cpumask.h>
> > +#include <linux/errno.h>
> > +#include <linux/kvm_types.h>
> > +#include <linux/list.h>
> > +#include <linux/percpu.h>
> > +
> > +#include <asm/perf_event.h>
> > +#include <asm/processor.h>
> > +#include <asm/virt.h>
> > +#include <asm/vmx.h>
> > +
> > +static int x86_virt_feature __ro_after_init;
> > +
> > +__visible bool virt_rebooting;
> > +EXPORT_SYMBOL_GPL(virt_rebooting);
> > +
> > +static DEFINE_PER_CPU(int, virtualization_nr_users);
> > +
> > +static cpu_emergency_virt_cb __rcu *kvm_emergency_callback;
> 
> Hmm, why kvm_ and not virt_?

I was trying to capture that this callback can _only_ be used by KVM, because
KVM is the only in-tree hypervisor.  That's also why the exports are only for
KVM (and will use EXPORT_SYMBOL_FOR_KVM() when I post the next version).

> [..]
> > +#if IS_ENABLED(CONFIG_KVM_INTEL)
> > +static DEFINE_PER_CPU(struct vmcs *, root_vmcs);
> 
> Perhaps introduce a CONFIG_INTEL_VMX for this? For example, KVM need not
> be enabled if all one wants to do is use TDX to setup PCIe Link
> Encryption. ...or were you expecting?
> 
> #if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(...<other VMX users>...)

I don't think we need anything at this time.  INTEL_TDX_HOST depends on KVM_INTEL,
and so without a user that needs VMXON without KVM_INTEL, I think we're good as-is.

 config INTEL_TDX_HOST
	bool "Intel Trust Domain Extensions (TDX) host support"
	depends on CPU_SUP_INTEL
	depends on X86_64
	depends on KVM_INTEL

Re: [PATCH v2 2/7] KVM: x86: Extract VMXON and EFER.SVME enablement to kernel

Posted by dan.j.williams@intel.com 2 months ago

Sean Christopherson wrote:
> On Sat, Dec 06, 2025, dan.j.williams@intel.com wrote:
> > Sean Christopherson wrote:
> > > @@ -694,9 +696,6 @@ static void drop_user_return_notifiers(void)
> > >  		kvm_on_user_return(&msrs->urn);
> > >  }
> > >  
> > > -__visible bool kvm_rebooting;
> > > -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_rebooting);
> > 
> > ...a short stay for this symbol in kvm/x86.c? It raises my curiosity why
> > patch1 is separate.
> 
> Because it affects non-x86 architectures.  It should be a complete nop, but I
> wanted to isolate what I could.

Ok.

[..]
> > > +static cpu_emergency_virt_cb __rcu *kvm_emergency_callback;
> > 
> > Hmm, why kvm_ and not virt_?
> 
> I was trying to capture that this callback can _only_ be used by KVM, because
> KVM is the only in-tree hypervisor.  That's also why the exports are only for
> KVM (and will use EXPORT_SYMBOL_FOR_KVM() when I post the next version).

Oh, true, that makes sense.

> > [..]
> > > +#if IS_ENABLED(CONFIG_KVM_INTEL)
> > > +static DEFINE_PER_CPU(struct vmcs *, root_vmcs);
> > 
> > Perhaps introduce a CONFIG_INTEL_VMX for this? For example, KVM need not
> > be enabled if all one wants to do is use TDX to setup PCIe Link
> > Encryption. ...or were you expecting?
> > 
> > #if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(...<other VMX users>...)
> 
> I don't think we need anything at this time.  INTEL_TDX_HOST depends on KVM_INTEL,
> and so without a user that needs VMXON without KVM_INTEL, I think we're good as-is.
> 
>  config INTEL_TDX_HOST
> 	bool "Intel Trust Domain Extensions (TDX) host support"
> 	depends on CPU_SUP_INTEL
> 	depends on X86_64
> 	depends on KVM_INTEL

...but INTEL_TDX_HOST, it turns out, does not have any functional
dependencies on KVM_INTEL. At least, not since I last checked. Yes, it
would be silly and result in dead code today to do a build with:

CONFIG_INTEL_TDX_HOST=y
CONFIG_KVM_INTEL=n

However, when the TDX Connect support arrives you could have:

CONFIG_INTEL_TDX_HOST=y
CONFIG_KVM_INTEL=n
CONFIG_TDX_HOST_SERVICES=y

Where "TDX Host Services" is a driver for PCIe Link Encryption and TDX
Module update. Whether such configuration freedom has any practical
value is a separate question.

I am ok if the answer is, "wait until someone shows up who really wants
PCIe Link Encryption without KVM".

Re: [PATCH v2 2/7] KVM: x86: Extract VMXON and EFER.SVME enablement to kernel

Posted by Sean Christopherson 2 months ago

On Wed, Dec 10, 2025, dan.j.williams@intel.com wrote:
> Sean Christopherson wrote:
> > On Sat, Dec 06, 2025, dan.j.williams@intel.com wrote:
> > I don't think we need anything at this time.  INTEL_TDX_HOST depends on KVM_INTEL,
> > and so without a user that needs VMXON without KVM_INTEL, I think we're good as-is.
> > 
> >  config INTEL_TDX_HOST
> > 	bool "Intel Trust Domain Extensions (TDX) host support"
> > 	depends on CPU_SUP_INTEL
> > 	depends on X86_64
> > 	depends on KVM_INTEL
> 
> ...but INTEL_TDX_HOST, it turns out, does not have any functional
> dependencies on KVM_INTEL. At least, not since I last checked. Yes, it
> would be silly and result in dead code today to do a build with:
> 
> CONFIG_INTEL_TDX_HOST=y
> CONFIG_KVM_INTEL=n
> 
> However, when the TDX Connect support arrives you could have:
> 
> CONFIG_INTEL_TDX_HOST=y
> CONFIG_KVM_INTEL=n
> CONFIG_TDX_HOST_SERVICES=y
> 
> Where "TDX Host Services" is a driver for PCIe Link Encryption and TDX
> Module update. Whether such configuration freedom has any practical
> value is a separate question.
> 
> I am ok if the answer is, "wait until someone shows up who really wants
> PCIe Link Encryption without KVM".

Ya, that's my answer.  At the very least, wait until TDX_HOST_SERVICES comes
along.

Re: [PATCH v2 2/7] KVM: x86: Extract VMXON and EFER.SVME enablement to kernel

Posted by Xu Yilun 1 month, 2 weeks ago

On Wed, Dec 10, 2025 at 06:20:17AM -0800, Sean Christopherson wrote:
> On Wed, Dec 10, 2025, dan.j.williams@intel.com wrote:
> > Sean Christopherson wrote:
> > > On Sat, Dec 06, 2025, dan.j.williams@intel.com wrote:
> > > I don't think we need anything at this time.  INTEL_TDX_HOST depends on KVM_INTEL,
> > > and so without a user that needs VMXON without KVM_INTEL, I think we're good as-is.
> > > 
> > >  config INTEL_TDX_HOST
> > > 	bool "Intel Trust Domain Extensions (TDX) host support"
> > > 	depends on CPU_SUP_INTEL
> > > 	depends on X86_64
> > > 	depends on KVM_INTEL
> > 
> > ...but INTEL_TDX_HOST, it turns out, does not have any functional
> > dependencies on KVM_INTEL. At least, not since I last checked. Yes, it
> > would be silly and result in dead code today to do a build with:
> > 
> > CONFIG_INTEL_TDX_HOST=y
> > CONFIG_KVM_INTEL=n
> > 
> > However, when the TDX Connect support arrives you could have:
> > 
> > CONFIG_INTEL_TDX_HOST=y
> > CONFIG_KVM_INTEL=n
> > CONFIG_TDX_HOST_SERVICES=y
> > 
> > Where "TDX Host Services" is a driver for PCIe Link Encryption and TDX
> > Module update. Whether such configuration freedom has any practical
> > value is a separate question.
> > 
> > I am ok if the answer is, "wait until someone shows up who really wants
> > PCIe Link Encryption without KVM".
> 
> Ya, that's my answer.  At the very least, wait until TDX_HOST_SERVICES comes
> along.

I've tested the PCIe Link Encryption without KVM, with the kernel
config:

  CONFIG_INTEL_TDX_HOST=y
  CONFIG_KVM_INTEL=n
  CONFIG_TDX_HOST_SERVICES=y

and

--- /dev/null
+++ b/drivers/virt/coco/tdx-host/Kconfig
@@ -0,0 +1,10 @@
+config TDX_HOST_SERVICES
+       tristate "TDX Host Services Driver"
+       depends on INTEL_TDX_HOST
+       default m

Finally I enabled the combination successfully with a patch below, do we
need the change when TDX_HOST_SERVICES comes?

------------8<----------------------

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 80527299f859..e3e90d1fcad3 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1898,7 +1898,6 @@ config INTEL_TDX_HOST
        bool "Intel Trust Domain Extensions (TDX) host support"
        depends on CPU_SUP_INTEL
        depends on X86_64
-       depends on KVM_INTEL
        depends on X86_X2APIC
        select ARCH_KEEP_MEMBLOCK
        depends on CONTIG_ALLOC
diff --git a/arch/x86/include/asm/virt.h b/arch/x86/include/asm/virt.h
index 77a366afd9f7..26bbf0f21575 100644
--- a/arch/x86/include/asm/virt.h
+++ b/arch/x86/include/asm/virt.h
@@ -6,7 +6,7 @@

 typedef void (cpu_emergency_virt_cb)(void);

-#if IS_ENABLED(CONFIG_KVM_X86)
+#if IS_ENABLED(CONFIG_KVM_X86) || IS_ENABLED(CONFIG_INTEL_TDX_HOST)
 extern bool virt_rebooting;

 void __init x86_virt_init(void);
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 278f08194ec8..28fe309093ed 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -134,7 +134,7 @@ config X86_SGX_KVM
 config KVM_INTEL_TDX
        bool "Intel Trust Domain Extensions (TDX) support"
        default y
-       depends on INTEL_TDX_HOST
+       depends on INTEL_TDX_HOST && KVM_INTEL
        select KVM_GENERIC_MEMORY_ATTRIBUTES
        select HAVE_KVM_ARCH_GMEM_POPULATE
        help
diff --git a/arch/x86/virt/Makefile b/arch/x86/virt/Makefile
index 6e485751650c..85ed7a06ed88 100644
--- a/arch/x86/virt/Makefile
+++ b/arch/x86/virt/Makefile
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-y  += svm/ vmx/

-obj-$(subst m,y,$(CONFIG_KVM_X86)) += hw.o
\ No newline at end of file
+obj-$(CONFIG_INTEL_TDX_HOST) += hw.o
+obj-$(subst m,y,$(CONFIG_KVM_X86)) += hw.o
diff --git a/arch/x86/virt/hw.c b/arch/x86/virt/hw.c
index 986e780cf438..31ea89069a93 100644
--- a/arch/x86/virt/hw.c
+++ b/arch/x86/virt/hw.c
@@ -48,7 +48,7 @@ static void x86_virt_invoke_kvm_emergency_callback(void)
                kvm_callback();
 }

-#if IS_ENABLED(CONFIG_KVM_INTEL)
+#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_INTEL_TDX_HOST)
 static DEFINE_PER_CPU(struct vmcs *, root_vmcs);

 static int x86_virt_cpu_vmxon(void)
@@ -257,7 +257,8 @@ static __init int x86_svm_init(void) { return -EOPNOTSUPP; }
 ({                                                     \
        int __r;                                        \
                                                        \
-       if (IS_ENABLED(CONFIG_KVM_INTEL) &&             \
+       if ((IS_ENABLED(CONFIG_KVM_INTEL) ||            \
+            IS_ENABLED(CONFIG_INTEL_TDX_HOST)) &&      \
            cpu_feature_enabled(X86_FEATURE_VMX))       \
                __r = x86_vmx_##fn();                   \
        else if (IS_ENABLED(CONFIG_KVM_AMD) &&          \

>

Re: [PATCH v2 2/7] KVM: x86: Extract VMXON and EFER.SVME enablement to kernel

Posted by Sean Christopherson 1 month, 1 week ago

On Wed, Dec 24, 2025, Xu Yilun wrote:
> On Wed, Dec 10, 2025 at 06:20:17AM -0800, Sean Christopherson wrote:
> > On Wed, Dec 10, 2025, dan.j.williams@intel.com wrote:
> > > Sean Christopherson wrote:
> > > > On Sat, Dec 06, 2025, dan.j.williams@intel.com wrote:
> > > > I don't think we need anything at this time.  INTEL_TDX_HOST depends on KVM_INTEL,
> > > > and so without a user that needs VMXON without KVM_INTEL, I think we're good as-is.
> > > > 
> > > >  config INTEL_TDX_HOST
> > > > 	bool "Intel Trust Domain Extensions (TDX) host support"
> > > > 	depends on CPU_SUP_INTEL
> > > > 	depends on X86_64
> > > > 	depends on KVM_INTEL
> > > 
> > > ...but INTEL_TDX_HOST, it turns out, does not have any functional
> > > dependencies on KVM_INTEL. At least, not since I last checked. Yes, it
> > > would be silly and result in dead code today to do a build with:
> > > 
> > > CONFIG_INTEL_TDX_HOST=y
> > > CONFIG_KVM_INTEL=n
> > > 
> > > However, when the TDX Connect support arrives you could have:
> > > 
> > > CONFIG_INTEL_TDX_HOST=y
> > > CONFIG_KVM_INTEL=n
> > > CONFIG_TDX_HOST_SERVICES=y
> > > 
> > > Where "TDX Host Services" is a driver for PCIe Link Encryption and TDX
> > > Module update. Whether such configuration freedom has any practical
> > > value is a separate question.
> > > 
> > > I am ok if the answer is, "wait until someone shows up who really wants
> > > PCIe Link Encryption without KVM".
> > 
> > Ya, that's my answer.  At the very least, wait until TDX_HOST_SERVICES comes
> > along.
> 
> I've tested the PCIe Link Encryption without KVM, with the kernel
> config:
> 
>   CONFIG_INTEL_TDX_HOST=y
>   CONFIG_KVM_INTEL=n
>   CONFIG_TDX_HOST_SERVICES=y
> 
> and
> 
> --- /dev/null
> +++ b/drivers/virt/coco/tdx-host/Kconfig
> @@ -0,0 +1,10 @@
> +config TDX_HOST_SERVICES
> +       tristate "TDX Host Services Driver"
> +       depends on INTEL_TDX_HOST
> +       default m
> 
> Finally I enabled the combination successfully with a patch below, do we
> need the change when TDX_HOST_SERVICES comes?

Ya, we'll need something along those lines.  What exactly we want the Kconfig
soup to look like is TBD though, e.g. it may or may not make sense to have a
common config that says "I want virtualization!"?

[PATCH v2 1/7] KVM: x86: Move kvm_rebooting to x86
[PATCH v2 2/7] KVM: x86: Extract VMXON and EFER.SVME enablement to kernel
[PATCH v2 3/7] KVM: x86/tdx: Do VMXON and TDX-Module initialization during subsys init
[PATCH v2 4/7] x86/virt/tdx: Tag a pile of functions as __init, and globals as __ro_after_init
[PATCH v2 5/7] x86/virt/tdx: KVM: Consolidate TDX CPU hotplug handling
[PATCH v2 6/7] x86/virt/tdx: Use ida_is_empty() to detect if any TDs may be running
[PATCH v2 7/7] KVM: Bury kvm_{en,dis}able_virtualization() in kvm_main.c once more