[RFC PATCH v1 1/5] x86/boot: Shift VMXON from KVM init to CPU startup phase

Posted by Xin Li (Intel) 3 weeks, 2 days ago
Move the VMXON setup from the KVM initialization path to the CPU startup
phase to guarantee that hardware virtualization is enabled early and
without interruption.

As a result, KVM, often loaded as a kernel module, no longer needs to worry
about whether or not VMXON has been executed on a CPU (e.g., CPU offline
events or system reboots while KVM is loading).

Signed-off-by: Xin Li (Intel) <xin@zytor.com>
---
 arch/x86/include/asm/processor.h |   1 +
 arch/x86/include/asm/vmx.h       |   5 ++
 arch/x86/kernel/cpu/common.c     |  91 ++++++++++++++++++++++++
 arch/x86/kvm/vmx/vmcs.h          |   5 --
 arch/x86/kvm/vmx/vmx.c           | 117 ++-----------------------------
 arch/x86/power/cpu.c             |   7 +-
 6 files changed, 107 insertions(+), 119 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index bde58f6510ac..59660428f46d 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -230,6 +230,7 @@ void init_cpu_devs(void);
 void get_cpu_vendor(struct cpuinfo_x86 *c);
 extern void early_cpu_init(void);
 extern void identify_secondary_cpu(unsigned int cpu);
+extern void cpu_enable_virtualization(void);
 extern void print_cpu_info(struct cpuinfo_x86 *);
 void print_cpu_msr(struct cpuinfo_x86 *);
 
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index cca7d6641287..736d04c1b2fc 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -20,6 +20,11 @@
 #include <asm/trapnr.h>
 #include <asm/vmxfeatures.h>
 
+struct vmcs_hdr {
+	u32 revision_id:31;
+	u32 shadow_vmcs:1;
+};
+
 #define VMCS_CONTROL_BIT(x)	BIT(VMX_FEATURE_##x & 0x1f)
 
 /*
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 34a054181c4d..e36877b5a240 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -72,6 +72,7 @@
 #include <asm/tdx.h>
 #include <asm/posted_intr.h>
 #include <asm/runtime-const.h>
+#include <asm/vmx.h>
 
 #include "cpu.h"
 
@@ -1923,6 +1924,84 @@ static void generic_identify(struct cpuinfo_x86 *c)
 #endif
 }
 
+static bool is_vmx_supported(void)
+{
+	int cpu = raw_smp_processor_id();
+
+	if (!(cpuid_ecx(1) & (1 << (X86_FEATURE_VMX & 31)))) {
+		/* May not be an Intel CPU */
+		pr_info("VMX not supported by CPU%d\n", cpu);
+		return false;
+	}
+
+	if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
+	    !this_cpu_has(X86_FEATURE_VMX)) {
+		pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU%d\n", cpu);
+		return false;
+	}
+
+	return true;
+}
+
+/* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
+union vmxon_vmcs {
+	struct vmcs_hdr hdr;
+	char data[PAGE_SIZE];
+};
+
+static DEFINE_PER_CPU_PAGE_ALIGNED(union vmxon_vmcs, vmxon_vmcs);
+
+/*
+ * Executed during the CPU startup phase to execute VMXON to enable VMX. This
+ * ensures that KVM, often loaded as a kernel module, no longer needs to worry
+ * about whether or not VMXON has been executed on a CPU (e.g., CPU offline
+ * events or system reboots while KVM is loading).
+ *
+ * VMXON is not expected to fault, but fault handling is kept as a precaution
+ * against any unexpected code paths that might trigger it and can be removed
+ * later if unnecessary.
+ */
+void cpu_enable_virtualization(void)
+{
+	u64 vmxon_pointer = __pa(this_cpu_ptr(&vmxon_vmcs));
+	int cpu = raw_smp_processor_id();
+	u64 basic_msr;
+
+	if (!is_vmx_supported())
+		return;
+
+	if (cr4_read_shadow() & X86_CR4_VMXE) {
+		pr_err("VMX already enabled on CPU%d\n", cpu);
+		return;
+	}
+
+	memset(this_cpu_ptr(&vmxon_vmcs), 0, PAGE_SIZE);
+
+	/*
+	 * Even though not explicitly documented by TLFS, VMXArea passed as
+	 * VMXON argument should still be marked with revision_id reported by
+	 * physical CPU.
+	 */
+	rdmsrq(MSR_IA32_VMX_BASIC, basic_msr);
+	this_cpu_ptr(&vmxon_vmcs)->hdr.revision_id = vmx_basic_vmcs_revision_id(basic_msr);
+
+	intel_pt_handle_vmx(1);
+
+	cr4_set_bits(X86_CR4_VMXE);
+
+	asm goto("1: vmxon %[vmxon_pointer]\n\t"
+		 _ASM_EXTABLE(1b, %l[fault])
+		 : : [vmxon_pointer] "m"(vmxon_pointer)
+		 : : fault);
+
+	return;
+
+fault:
+	pr_err("VMXON faulted on CPU%d\n", cpu);
+	cr4_clear_bits(X86_CR4_VMXE);
+	intel_pt_handle_vmx(0);
+}
+
 /*
  * This does the hard work of actually picking apart the CPU stuff...
  */
@@ -2120,6 +2199,12 @@ void identify_secondary_cpu(unsigned int cpu)
 
 	tsx_ap_init();
 	c->initialized = true;
+
+	/*
+	 * Enable AP virtualization immediately after initializing the per-CPU
+	 * cpuinfo_x86 structure, ensuring that this_cpu_has() operates correctly.
+	 */
+	cpu_enable_virtualization();
 }
 
 void print_cpu_info(struct cpuinfo_x86 *c)
@@ -2551,6 +2636,12 @@ void __init arch_cpu_finalize_init(void)
 	*c = boot_cpu_data;
 	c->initialized = true;
 
+	/*
+	 * Enable BSP virtualization right after the BSP cpuinfo_x86 structure
+	 * is initialized to ensure this_cpu_has() works as expected.
+	 */
+	cpu_enable_virtualization();
+
 	alternative_instructions();
 
 	if (IS_ENABLED(CONFIG_X86_64)) {
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index b25625314658..da5631924432 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -13,11 +13,6 @@
 
 #define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
 
-struct vmcs_hdr {
-	u32 revision_id:31;
-	u32 shadow_vmcs:1;
-};
-
 struct vmcs {
 	struct vmcs_hdr hdr;
 	u32 abort;
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index aa157fe5b7b3..f6742df0c4ff 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -468,7 +468,6 @@ noinline void invept_error(unsigned long ext, u64 eptp)
 	vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx\n", ext, eptp);
 }
 
-static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 DEFINE_PER_CPU(struct vmcs *, current_vmcs);
 /*
  * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
@@ -2736,43 +2735,14 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
 	return 0;
 }
 
-static bool __kvm_is_vmx_supported(void)
-{
-	int cpu = smp_processor_id();
-
-	if (!(cpuid_ecx(1) & feature_bit(VMX))) {
-		pr_err("VMX not supported by CPU %d\n", cpu);
-		return false;
-	}
-
-	if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
-	    !this_cpu_has(X86_FEATURE_VMX)) {
-		pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu);
-		return false;
-	}
-
-	return true;
-}
-
-static bool kvm_is_vmx_supported(void)
-{
-	bool supported;
-
-	migrate_disable();
-	supported = __kvm_is_vmx_supported();
-	migrate_enable();
-
-	return supported;
-}
-
 int vmx_check_processor_compat(void)
 {
 	int cpu = raw_smp_processor_id();
 	struct vmcs_config vmcs_conf;
 	struct vmx_capability vmx_cap;
 
-	if (!__kvm_is_vmx_supported())
-		return -EIO;
+	if (!(cr4_read_shadow() & X86_CR4_VMXE))
+		return -EOPNOTSUPP;
 
 	if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) {
 		pr_err("Failed to setup VMCS config on CPU %d\n", cpu);
@@ -2787,34 +2757,12 @@ int vmx_check_processor_compat(void)
 	return 0;
 }
 
-static int kvm_cpu_vmxon(u64 vmxon_pointer)
-{
-	u64 msr;
-
-	cr4_set_bits(X86_CR4_VMXE);
-
-	asm goto("1: vmxon %[vmxon_pointer]\n\t"
-			  _ASM_EXTABLE(1b, %l[fault])
-			  : : [vmxon_pointer] "m"(vmxon_pointer)
-			  : : fault);
-	return 0;
-
-fault:
-	WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
-		  rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
-	cr4_clear_bits(X86_CR4_VMXE);
-
-	return -EFAULT;
-}
-
 int vmx_enable_virtualization_cpu(void)
 {
 	int cpu = raw_smp_processor_id();
-	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
-	int r;
 
-	if (cr4_read_shadow() & X86_CR4_VMXE)
-		return -EBUSY;
+	if (!(cr4_read_shadow() & X86_CR4_VMXE))
+		return -EOPNOTSUPP;
 
 	/*
 	 * This can happen if we hot-added a CPU but failed to allocate
@@ -2823,14 +2771,6 @@ int vmx_enable_virtualization_cpu(void)
 	if (kvm_is_using_evmcs() && !hv_get_vp_assist_page(cpu))
 		return -EFAULT;
 
-	intel_pt_handle_vmx(1);
-
-	r = kvm_cpu_vmxon(phys_addr);
-	if (r) {
-		intel_pt_handle_vmx(0);
-		return r;
-	}
-
 	return 0;
 }
 
@@ -2931,47 +2871,6 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
 	return -ENOMEM;
 }
 
-static void free_kvm_area(void)
-{
-	int cpu;
-
-	for_each_possible_cpu(cpu) {
-		free_vmcs(per_cpu(vmxarea, cpu));
-		per_cpu(vmxarea, cpu) = NULL;
-	}
-}
-
-static __init int alloc_kvm_area(void)
-{
-	int cpu;
-
-	for_each_possible_cpu(cpu) {
-		struct vmcs *vmcs;
-
-		vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL);
-		if (!vmcs) {
-			free_kvm_area();
-			return -ENOMEM;
-		}
-
-		/*
-		 * When eVMCS is enabled, alloc_vmcs_cpu() sets
-		 * vmcs->revision_id to KVM_EVMCS_VERSION instead of
-		 * revision_id reported by MSR_IA32_VMX_BASIC.
-		 *
-		 * However, even though not explicitly documented by
-		 * TLFS, VMXArea passed as VMXON argument should
-		 * still be marked with revision_id reported by
-		 * physical CPU.
-		 */
-		if (kvm_is_using_evmcs())
-			vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic);
-
-		per_cpu(vmxarea, cpu) = vmcs;
-	}
-	return 0;
-}
-
 static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
 		struct kvm_segment *save)
 {
@@ -8204,8 +8103,6 @@ void vmx_hardware_unsetup(void)
 
 	if (nested)
 		nested_vmx_hardware_unsetup();
-
-	free_kvm_area();
 }
 
 void vmx_vm_destroy(struct kvm *kvm)
@@ -8499,10 +8396,6 @@ __init int vmx_hardware_setup(void)
 
 	vmx_set_cpu_caps();
 
-	r = alloc_kvm_area();
-	if (r && nested)
-		nested_vmx_hardware_unsetup();
-
 	kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
 
 	/*
@@ -8554,7 +8447,7 @@ int __init vmx_init(void)
 
 	KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_vmx);
 
-	if (!kvm_is_vmx_supported())
+	if (!(cr4_read_shadow() & X86_CR4_VMXE))
 		return -EOPNOTSUPP;
 
 	/*
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 916441f5e85c..0eec314b79c2 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -206,11 +206,11 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
 	/* cr4 was introduced in the Pentium CPU */
 #ifdef CONFIG_X86_32
 	if (ctxt->cr4)
-		__write_cr4(ctxt->cr4);
+		__write_cr4(ctxt->cr4 & ~X86_CR4_VMXE);
 #else
 /* CONFIG X86_64 */
 	wrmsrq(MSR_EFER, ctxt->efer);
-	__write_cr4(ctxt->cr4);
+	__write_cr4(ctxt->cr4 & ~X86_CR4_VMXE);
 #endif
 	write_cr3(ctxt->cr3);
 	write_cr2(ctxt->cr2);
@@ -291,6 +291,9 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
 	 * because some of the MSRs are "emulated" in microcode.
 	 */
 	msr_restore_context(ctxt);
+
+	if (ctxt->cr4 & X86_CR4_VMXE)
+		cpu_enable_virtualization();
 }
 
 /* Needed by apm.c */
-- 
2.51.0
Re: [RFC PATCH v1 1/5] x86/boot: Shift VMXON from KVM init to CPU startup phase
Posted by Huang, Kai 3 weeks, 1 day ago
On Tue, 2025-09-09 at 11:28 -0700, Xin Li (Intel) wrote:
> Move the VMXON setup from the KVM initialization path to the CPU startup
> phase to guarantee that hardware virtualization is enabled early and
> without interruption.
> 
> As a result, KVM, often loaded as a kernel module, no longer needs to worry
> about whether or not VMXON has been executed on a CPU (e.g., CPU offline
> events or system reboots while KVM is loading).

KVM has a module parameter 'enable_virt_at_load', which controls whether
to enable virtualization (in the case of VMX, VMXON) when loading KVM or
to defer the enabling until the first VM is created.

Changing to unconditionally do VMXON when bringing up a CPU will kinda
break this.  Maybe eventually we might switch to unconditional VMXON, but
for now it seems a dramatic move.

I was thinking the code change would be to have the core kernel only
provide VMXON/VMXOFF APIs for KVM (and other kernel components) to use,
i.e., more like "moving" the VMX code out of KVM.
 

[...]

> +static bool is_vmx_supported(void)
> +{
> +	int cpu = raw_smp_processor_id();
> +
> +	if (!(cpuid_ecx(1) & (1 << (X86_FEATURE_VMX & 31)))) {
> +		/* May not be an Intel CPU */
> +		pr_info("VMX not supported by CPU%d\n", cpu);
> +		return false;
> +	}
> +
> +	if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
> +	    !this_cpu_has(X86_FEATURE_VMX)) {
> +		pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU%d\n", cpu);
> +		return false;
> +	}
> +
> +	return true;
> +}
> +
> +/* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
> +union vmxon_vmcs {
> +	struct vmcs_hdr hdr;
> +	char data[PAGE_SIZE];
> +};
> +
> +static DEFINE_PER_CPU_PAGE_ALIGNED(union vmxon_vmcs, vmxon_vmcs);
> +
> +/*
> + * Executed during the CPU startup phase to execute VMXON to enable VMX. This
> + * ensures that KVM, often loaded as a kernel module, no longer needs to worry
> + * about whether or not VMXON has been executed on a CPU (e.g., CPU offline
> + * events or system reboots while KVM is loading).
> + *
> + * VMXON is not expected to fault, but fault handling is kept as a precaution
> + * against any unexpected code paths that might trigger it and can be removed
> + * later if unnecessary.
> + */
> +void cpu_enable_virtualization(void)
> +{
> +	u64 vmxon_pointer = __pa(this_cpu_ptr(&vmxon_vmcs));
> +	int cpu = raw_smp_processor_id();
> +	u64 basic_msr;
> +
> +	if (!is_vmx_supported())
> +		return;
> +
> +	if (cr4_read_shadow() & X86_CR4_VMXE) {
> +		pr_err("VMX already enabled on CPU%d\n", cpu);
> +		return;
> +	}
> +
> +	memset(this_cpu_ptr(&vmxon_vmcs), 0, PAGE_SIZE);
> +
> +	/*
> +	 * Even though not explicitly documented by TLFS, VMXArea passed as
> +	 * VMXON argument should still be marked with revision_id reported by
> +	 * physical CPU.
> +	 */
> +	rdmsrq(MSR_IA32_VMX_BASIC, basic_msr);
> +	this_cpu_ptr(&vmxon_vmcs)->hdr.revision_id = vmx_basic_vmcs_revision_id(basic_msr);
> +
> +	intel_pt_handle_vmx(1);
> +
> +	cr4_set_bits(X86_CR4_VMXE);
> +
> +	asm goto("1: vmxon %[vmxon_pointer]\n\t"
> +		 _ASM_EXTABLE(1b, %l[fault])
> +		 : : [vmxon_pointer] "m"(vmxon_pointer)
> +		 : : fault);
> +
> +	return;
> +
> +fault:
> +	pr_err("VMXON faulted on CPU%d\n", cpu);
> +	cr4_clear_bits(X86_CR4_VMXE);
> +	intel_pt_handle_vmx(0);
> +}
> +
>  /*
>   * This does the hard work of actually picking apart the CPU stuff...
>   */
> @@ -2120,6 +2199,12 @@ void identify_secondary_cpu(unsigned int cpu)
>  
>  	tsx_ap_init();
>  	c->initialized = true;
> +
> +	/*
> +	 * Enable AP virtualization immediately after initializing the per-CPU
> +	 * cpuinfo_x86 structure, ensuring that this_cpu_has() operates correctly.
> +	 */
> +	cpu_enable_virtualization();
>  }

AFAICT there's a functional drawback here: this implementation doesn't
handle VMXON failure, while the existing KVM code does via a CPUHP
callback.

>  
>  void print_cpu_info(struct cpuinfo_x86 *c)
> @@ -2551,6 +2636,12 @@ void __init arch_cpu_finalize_init(void)
>  	*c = boot_cpu_data;
>  	c->initialized = true;
>  
> +	/*
> +	 * Enable BSP virtualization right after the BSP cpuinfo_x86 structure
> +	 * is initialized to ensure this_cpu_has() works as expected.
> +	 */
> +	cpu_enable_virtualization();
> +
> 

Any reason you chose to do it in arch_cpu_finalize_init()?  Perhaps
just an arch_initcall() or similar?

KVM has a specific CPUHP_AP_KVM_ONLINE state to handle VMXON/OFF for CPU
online/offline.  And it's not in the STARTUP section (which is not allowed
to fail), so it can handle VMXON failure.

How about adding a VMX-specific CPUHP callback instead?

In this way, not only can we put all the VMX-related code together (e.g.,
arch/x86/virt/vmx/vmx.c), which is way easier to review/maintain, but we
can also still handle VMXON failure just like in KVM.

(btw, originally KVM's CPUHP callback was also in the STARTUP section, but
we later moved it to after that section in order to handle VMXON failure
and compatibility check failure gracefully.)
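
To make the idea a bit more concrete, below is a rough, untested sketch of
the registration I have in mind; the file location, the callback names and
the use of the dynamic CPUHP_AP_ONLINE_DYN state are all just placeholders
for illustration (a dedicated state in the ONLINE section could be added
instead):

#include <linux/cpuhotplug.h>
#include <linux/init.h>

/* e.g. in arch/x86/virt/vmx/vmx.c */
static int vmx_cpu_online(unsigned int cpu)
{
	/*
	 * Sanity checks, VMXON region setup and/or VMXON itself go here.
	 * Returning an error refuses to online the CPU, so failures can be
	 * handled gracefully, unlike in the STARTUP section.
	 */
	return 0;
}

static int vmx_cpu_offline(unsigned int cpu)
{
	/* Undo whatever vmx_cpu_online() did, e.g. VMXOFF. */
	return 0;
}

static int __init vmx_hp_init(void)
{
	int ret;

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/vmx:online",
				vmx_cpu_online, vmx_cpu_offline);
	return ret < 0 ? ret : 0;
}
arch_initcall(vmx_hp_init);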

[...]

> diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
> index 916441f5e85c..0eec314b79c2 100644
> --- a/arch/x86/power/cpu.c
> +++ b/arch/x86/power/cpu.c
> @@ -206,11 +206,11 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
>  	/* cr4 was introduced in the Pentium CPU */
>  #ifdef CONFIG_X86_32
>  	if (ctxt->cr4)
> -		__write_cr4(ctxt->cr4);
> +		__write_cr4(ctxt->cr4 & ~X86_CR4_VMXE);
>  #else
>  /* CONFIG X86_64 */
>  	wrmsrq(MSR_EFER, ctxt->efer);
> -	__write_cr4(ctxt->cr4);
> +	__write_cr4(ctxt->cr4 & ~X86_CR4_VMXE);
>  #endif
>  	write_cr3(ctxt->cr3);
>  	write_cr2(ctxt->cr2);
> @@ -291,6 +291,9 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
>  	 * because some of the MSRs are "emulated" in microcode.
>  	 */
>  	msr_restore_context(ctxt);
> +
> +	if (ctxt->cr4 & X86_CR4_VMXE)
> +		cpu_enable_virtualization();
>  }
> 

If we still leverage what KVM is doing -- using syscore_ops callback -- I
think we can avoid changing this function but keep all VMX code in a
dedicated file.
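
For reference, a minimal shape of that approach could look like the sketch
below (untested; the callback bodies are placeholders and the registration
point is just an example, loosely modeled on kvm_syscore_ops):

#include <linux/syscore_ops.h>

static int vmx_syscore_suspend(void)
{
	/* VMXOFF on the boot CPU, the only CPU still online at this point. */
	return 0;
}

static void vmx_syscore_resume(void)
{
	/* Redo VMXON on the boot CPU after resume or hibernation restore. */
}

static struct syscore_ops vmx_syscore_ops = {
	.suspend = vmx_syscore_suspend,
	.resume  = vmx_syscore_resume,
};

/* Registered once during VMX setup: register_syscore_ops(&vmx_syscore_ops); */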
Re: [RFC PATCH v1 1/5] x86/boot: Shift VMXON from KVM init to CPU startup phase
Posted by Chao Gao 3 weeks, 1 day ago
>> @@ -2551,6 +2636,12 @@ void __init arch_cpu_finalize_init(void)
>>  	*c = boot_cpu_data;
>>  	c->initialized = true;
>>  
>> +	/*
>> +	 * Enable BSP virtualization right after the BSP cpuinfo_x86 structure
>> +	 * is initialized to ensure this_cpu_has() works as expected.
>> +	 */
>> +	cpu_enable_virtualization();
>> +
>> 
>
>Any reason you chose to do it in arch_cpu_finalize_init()?  Perhaps
>just an arch_initcall() or similar?
>
>KVM has a specific CPUHP_AP_KVM_ONLINE state to handle VMXON/OFF for CPU
>online/offline.  And it's not in the STARTUP section (which is not allowed
>to fail), so it can handle VMXON failure.
>
>How about adding a VMX-specific CPUHP callback instead?
>
>In this way, not only can we put all the VMX-related code together (e.g.,
>arch/x86/virt/vmx/vmx.c), which is way easier to review/maintain, but we
>can also still handle VMXON failure just like in KVM.

KVM's policy is that a CPU can be online if there is no VM running. It is hard
to implement/move the same logic inside the core kernel because the core kernel
would need to refcount the running VMs. Any idea/suggestion on how to handle
VMXON failure in the core kernel?
Re: [RFC PATCH v1 1/5] x86/boot: Shift VMXON from KVM init to CPU startup phase
Posted by Huang, Kai 3 weeks, 1 day ago
On Wed, 2025-09-10 at 19:10 +0800, Chao Gao wrote:
> > > @@ -2551,6 +2636,12 @@ void __init arch_cpu_finalize_init(void)
> > >  	*c = boot_cpu_data;
> > >  	c->initialized = true;
> > >  
> > > 
> > > 
> > > 
> > > +	/*
> > > +	 * Enable BSP virtualization right after the BSP cpuinfo_x86 structure
> > > +	 * is initialized to ensure this_cpu_has() works as expected.
> > > +	 */
> > > +	cpu_enable_virtualization();
> > > +
> > > 
> > 
> > Any reason you chose to do it in arch_cpu_finalize_init()?  Perhaps
> > just an arch_initcall() or similar?
> > 
> > KVM has a specific CPUHP_AP_KVM_ONLINE state to handle VMXON/OFF for CPU
> > online/offline.  And it's not in the STARTUP section (which is not
> > allowed to fail), so it can handle VMXON failure.
> > 
> > How about adding a VMX-specific CPUHP callback instead?
> > 
> > In this way, not only can we put all the VMX-related code together (e.g.,
> > arch/x86/virt/vmx/vmx.c), which is way easier to review/maintain, but we
> > can also still handle VMXON failure just like in KVM.
> 
> KVM's policy is that a CPU can be online if there is no VM running. 
> 

This is when 'enable_virt_at_load' is off, right?  The default value is
true.

> It is hard
> to implement/move the same logic inside the core kernel because the core kernel
> would need to refcount the running VMs. Any idea/suggestion on how to handle
> VMXON failure in the core kernel?

Since I think doing VMXON unconditionally when bringing up a CPU is a
dramatic move at this stage, I was actually thinking we don't do VMXON in
the CPUHP callback, but only do preparation like the sanity checks and
VMXON region setup, etc.  If anything fails, we refuse to online the CPU,
or mark the CPU as VMX not supported, whatever.

The core kernel then provides two APIs to do VMXON/VMXOFF respectively,
and KVM can use them.  The APIs need to handle concurrent requests from
multiple users, though.  VMCLEAR could still stay in KVM since that is
kinda internal to how KVM manages vCPUs.

Does this make sense?
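
For the concurrency part, something as simple as a global refcount might
be enough; a rough sketch (untested, and vmxon_all_cpus()/vmxoff_all_cpus()
are made-up placeholders for the actual per-CPU enable/disable work):

#include <linux/mutex.h>

int vmxon_all_cpus(void);	/* placeholder: enable VMX on all online CPUs */
void vmxoff_all_cpus(void);	/* placeholder: disable VMX on all online CPUs */

static DEFINE_MUTEX(vmx_users_lock);
static int vmx_users;

int x86_vmx_get(void)
{
	int ret = 0;

	mutex_lock(&vmx_users_lock);
	if (!vmx_users)
		ret = vmxon_all_cpus();
	if (!ret)
		vmx_users++;
	mutex_unlock(&vmx_users_lock);

	return ret;
}

void x86_vmx_put(void)
{
	mutex_lock(&vmx_users_lock);
	if (!--vmx_users)
		vmxoff_all_cpus();
	mutex_unlock(&vmx_users_lock);
}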
Re: [RFC PATCH v1 1/5] x86/boot: Shift VMXON from KVM init to CPU startup phase
Posted by Arjan van de Ven 3 weeks, 1 day ago
> 
> Since I think doing VMXON unconditionally when bringing up a CPU is a
> dramatic move at this stage, I was actually thinking we don't do VMXON in
> the CPUHP callback, but only do preparation like the sanity checks and
> VMXON region setup, etc.  If anything fails, we refuse to online the CPU,
> or mark the CPU as VMX not supported, whatever.

the whole point is to always vmxon -- and simplify away all the complexity
that comes from doing this dynamically.
So yes, "dramatic" maybe, but needed -- especially as things like TDX and
TDX Connect need vmxon to be enabled outside of KVM context.


> 
> The core kernel then provides two APIs to do VMXON/VMXOFF respectively,
> and KVM can use them.  The APIs need to handle concurrent requests from
> multiple users, though.  VMCLEAR could still stay in KVM since that is
> kinda internal to how KVM manages vCPUs.
> 
> Does this make sense?

not to me -- the whole point is to not have this dynamic thing
RE: [RFC PATCH v1 1/5] x86/boot: Shift VMXON from KVM init to CPU startup phase
Posted by Huang, Kai 3 weeks, 1 day ago
> > Since I think doing VMXON unconditionally when bringing up a CPU is a
> > dramatic move at this stage, I was actually thinking we don't do VMXON
> > in the CPUHP callback, but only do preparation like the sanity checks
> > and VMXON region setup, etc.  If anything fails, we refuse to online
> > the CPU, or mark the CPU as VMX not supported, whatever.
> 
> the whole point is to always vmxon -- and simplify away all the complexity
> that comes from doing this dynamically.
> So yes, "dramatic" maybe, but needed -- especially as things like TDX and
> TDX Connect need vmxon to be enabled outside of KVM context.
> 
> 
> >
> > The core kernel then provides two APIs to do VMXON/VMXOFF
> > respectively, and KVM can use them.  The APIs need to handle
> > concurrent requests from multiple users, though.  VMCLEAR could still
> > stay in KVM since that is kinda internal to how KVM manages vCPUs.
> >
> > Does this make sense?
> 
> not to me -- the whole point is to not have this dynamic thing

Sure.  Fine by me to just always have it on.

Re: [RFC PATCH v1 1/5] x86/boot: Shift VMXON from KVM init to CPU startup phase
Posted by Chao Gao 3 weeks, 1 day ago
> void vmx_vm_destroy(struct kvm *kvm)
>@@ -8499,10 +8396,6 @@ __init int vmx_hardware_setup(void)
> 
> 	vmx_set_cpu_caps();
> 
>-	r = alloc_kvm_area();
>-	if (r && nested)
>-		nested_vmx_hardware_unsetup();
>-

There is a "return r" at the end of this function.  With the removal
of "r = alloc_kvm_area()", @r may be uninitialized.

> 	kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
> 
> 	/*
>@@ -8554,7 +8447,7 @@ int __init vmx_init(void)
> 
> 	KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_vmx);
> 
>-	if (!kvm_is_vmx_supported())
>+	if (!(cr4_read_shadow() & X86_CR4_VMXE))
> 		return -EOPNOTSUPP;
> 
> 	/*
>diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
>index 916441f5e85c..0eec314b79c2 100644
>--- a/arch/x86/power/cpu.c
>+++ b/arch/x86/power/cpu.c
>@@ -206,11 +206,11 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
> 	/* cr4 was introduced in the Pentium CPU */
> #ifdef CONFIG_X86_32
> 	if (ctxt->cr4)
>-		__write_cr4(ctxt->cr4);
>+		__write_cr4(ctxt->cr4 & ~X86_CR4_VMXE);

any reason to mask off X86_CR4_VMXE here?

I assume before suspend, VMXOFF is executed and CR4.VMXE is cleared. then
ctxt->cr4 here won't have CR4.VMXE set.

> #else
> /* CONFIG X86_64 */
> 	wrmsrq(MSR_EFER, ctxt->efer);
>-	__write_cr4(ctxt->cr4);
>+	__write_cr4(ctxt->cr4 & ~X86_CR4_VMXE);
Re: [RFC PATCH v1 1/5] x86/boot: Shift VMXON from KVM init to CPU startup phase
Posted by Xin Li 3 weeks ago
On 9/10/2025 12:25 AM, Chao Gao wrote:
>> void vmx_vm_destroy(struct kvm *kvm)
>> @@ -8499,10 +8396,6 @@ __init int vmx_hardware_setup(void)
>>
>> 	vmx_set_cpu_caps();
>>
>> -	r = alloc_kvm_area();
>> -	if (r && nested)
>> -		nested_vmx_hardware_unsetup();
>> -
> 
> There is a "return r" at the end of this function.  With the removal
> of "r = alloc_kvm_area()", @r may be uninitialized.

Good catch!

There’s no need for r to have function-wide scope anymore; just return 0 at
the end of vmx_hardware_setup() after changing the definition of r as
follows:

	if (nested) {
		int r = 0;
		...
	}


BTW, it's a good habit to always initialize local variables, which helps
avoid the kind of mistake I made here.


>> diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
>> index 916441f5e85c..0eec314b79c2 100644
>> --- a/arch/x86/power/cpu.c
>> +++ b/arch/x86/power/cpu.c
>> @@ -206,11 +206,11 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
>> 	/* cr4 was introduced in the Pentium CPU */
>> #ifdef CONFIG_X86_32
>> 	if (ctxt->cr4)
>> -		__write_cr4(ctxt->cr4);
>> +		__write_cr4(ctxt->cr4 & ~X86_CR4_VMXE);
> 
> any reason to mask off X86_CR4_VMXE here?

In this patch set, X86_CR4_VMXE is the indicator of whether VMX is on.  I
initially used a per-CPU variable to track that, but it later seemed better
to track X86_CR4_VMXE instead.

> 
> I assume before suspend, VMXOFF is executed and CR4.VMXE is cleared. then
> ctxt->cr4 here won't have CR4.VMXE set.

Per my understanding, what you said applies to the APs.

cpu_{enable,disable}_virtualization() in arch/x86/power/cpu.c are only used
to execute VMXON/VMXOFF on the BSP.

TBH, there are a lot of power management details I don't understand, e.g.,
AP state doesn't seem to be saved.  But the changes here are required to
make S4 work :)

Re: [RFC PATCH v1 1/5] x86/boot: Shift VMXON from KVM init to CPU startup phase
Posted by Adrian Hunter 3 weeks, 1 day ago
On 09/09/2025 21:28, Xin Li (Intel) wrote:
> +/*
> + * Executed during the CPU startup phase to execute VMXON to enable VMX. This
> + * ensures that KVM, often loaded as a kernel module, no longer needs to worry
> + * about whether or not VMXON has been executed on a CPU (e.g., CPU offline
> + * events or system reboots while KVM is loading).
> + *
> + * VMXON is not expected to fault, but fault handling is kept as a precaution
> + * against any unexpected code paths that might trigger it and can be removed
> + * later if unnecessary.
> + */
> +void cpu_enable_virtualization(void)
> +{
> +	u64 vmxon_pointer = __pa(this_cpu_ptr(&vmxon_vmcs));
> +	int cpu = raw_smp_processor_id();
> +	u64 basic_msr;
> +
> +	if (!is_vmx_supported())
> +		return;
> +
> +	if (cr4_read_shadow() & X86_CR4_VMXE) {
> +		pr_err("VMX already enabled on CPU%d\n", cpu);
> +		return;
> +	}
> +
> +	memset(this_cpu_ptr(&vmxon_vmcs), 0, PAGE_SIZE);
> +
> +	/*
> +	 * Even though not explicitly documented by TLFS, VMXArea passed as
> +	 * VMXON argument should still be marked with revision_id reported by
> +	 * physical CPU.
> +	 */
> +	rdmsrq(MSR_IA32_VMX_BASIC, basic_msr);
> +	this_cpu_ptr(&vmxon_vmcs)->hdr.revision_id = vmx_basic_vmcs_revision_id(basic_msr);
> +
> +	intel_pt_handle_vmx(1);

intel_pt_handle_vmx() depends on pt_pmu.vmx, which is not initialized
until arch_initcall(pt_init), but it looks like cpu_enable_virtualization()
is called earlier than that.

Also note, intel_pt_handle_vmx() exists because Intel PT and
VMX operation are not allowed together if MSR_IA32_VMX_MISC[14] == 0.
That only affects BDW AFAIK.

And note, moving intel_pt_handle_vmx() back to vmx_enable_virtualization_cpu()
does not look right.  It seems to belong with VMXON; refer to the SDM:

APPENDIX A VMX CAPABILITY REPORTING FACILITY
A.6 MISCELLANEOUS DATA
If bit 14 is read as 1, Intel® Processor Trace (Intel PT) can be used in VMX operation. If the processor supports
Intel PT but does not allow it to be used in VMX operation, execution of VMXON clears IA32_RTIT_CTL.TraceEn
(see “VMXON—Enter VMX Operation” in Chapter 32); any attempt to write IA32_RTIT_CTL while in VMX
operation (including VMX root operation) causes a general-protection exception.
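
So if the early VMXON path wants to keep the Intel PT handling without
depending on pt_pmu, it could presumably check the MISC bit directly.  A
rough, untested sketch (the helper name is made up, and the raw bit number
is used rather than assuming a particular define exists):

#include <linux/bits.h>
#include <asm/msr.h>

static bool vmx_allows_intel_pt(void)
{
	u64 misc;

	/*
	 * IA32_VMX_MISC bit 14 (SDM Vol 3, A.6): if set, Intel PT can be
	 * used in VMX operation; if clear, VMXON clears
	 * IA32_RTIT_CTL.TraceEn.
	 */
	rdmsrq(MSR_IA32_VMX_MISC, misc);

	return misc & BIT_ULL(14);
}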