Relocate VMXOFF from the KVM module unload path to the CPU shutdown phase.
This ensures proper virtualization cleanup during system shutdown, CPU
hotplug (online/offline cycles), and suspend-to-disk (S4) transitions.
Since INIT interrupts are blocked during VMX operation, VMXOFF must run
just before a CPU shuts down to allow it to be brought back online later.
As a result, VMX instructions are no longer expected to fault.
Signed-off-by: Xin Li (Intel) <xin@zytor.com>
---
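Reviewer note (below the '---', so not applied by git am): a rough user-space
mock of the control flow that cpu_disable_virtualization() introduces, for
illustration of the fault-tolerant cleanup ordering only. vmx_supported(),
cr4_has_vmxe() and try_vmxoff() are illustrative stand-ins for
is_vmx_supported(), the CR4 shadow check and the asm-goto VMXOFF with its
exception-table fixup; they are not real kernel APIs. The point is that a
VMXOFF fault is only logged, while the CR4.VMXE clear and
intel_pt_handle_vmx(0) still run.

/* Compile with: cc -Wall mock_vmxoff.c (hypothetical file name) */
#include <stdbool.h>
#include <stdio.h>

static bool vmx_supported(void) { return true; }  /* stands in for is_vmx_supported() */
static bool cr4_has_vmxe(void)  { return true; }  /* stands in for cr4_read_shadow() & X86_CR4_VMXE */
static bool try_vmxoff(void)    { return true; }  /* stands in for the asm-goto VMXOFF; false == fault */

static void mock_cpu_disable_virtualization(int cpu)
{
	if (!vmx_supported())
		return;

	if (!cr4_has_vmxe()) {
		fprintf(stderr, "VMX not enabled or already disabled on CPU%d\n", cpu);
		return;
	}

	if (!try_vmxoff())
		fprintf(stderr, "VMXOFF faulted on CPU%d\n", cpu);

	/* Cleanup runs whether or not VMXOFF faulted. */
	printf("CPU%d: clearing CR4.VMXE, intel_pt_handle_vmx(0)\n", cpu);
}

int main(void)
{
	mock_cpu_disable_virtualization(0);
	return 0;
}

The design choice mirrored here is the same one the old kvm_cpu_vmxoff()
made: even if VMXOFF faults, CR4.VMXE is still cleared so the CPU is left in
a consistent state.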
arch/x86/include/asm/processor.h | 1 +
arch/x86/kernel/cpu/common.c | 37 ++++++++++++++++++++++++++++++++
arch/x86/kernel/crash.c | 4 ++++
arch/x86/kernel/process.c | 3 +++
arch/x86/kernel/reboot.c | 11 +++++++----
arch/x86/kernel/smp.c | 5 +++++
arch/x86/kernel/smpboot.c | 6 ++++++
arch/x86/kvm/vmx/vmx.c | 30 --------------------------
arch/x86/power/cpu.c | 3 +++
9 files changed, 66 insertions(+), 34 deletions(-)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 59660428f46d..0bfd4eb1e9e2 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -231,6 +231,7 @@ void get_cpu_vendor(struct cpuinfo_x86 *c);
extern void early_cpu_init(void);
extern void identify_secondary_cpu(unsigned int cpu);
extern void cpu_enable_virtualization(void);
+extern void cpu_disable_virtualization(void);
extern void print_cpu_info(struct cpuinfo_x86 *);
void print_cpu_msr(struct cpuinfo_x86 *);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index e36877b5a240..39b9be9a2fb1 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -2002,6 +2002,43 @@ void cpu_enable_virtualization(void)
intel_pt_handle_vmx(0);
}
+/*
+ * Because INIT interrupts are blocked during VMX operation, this function
+ * must be called just before a CPU shuts down to ensure it can be brought
+ * back online later.
+ *
+ * Consequently, VMX instructions are no longer expected to fault.
+ *
+ * Although VMXOFF should not fault, fault handling is retained as a
+ * precaution against any unexpected code paths that might trigger it and
+ * can be removed later if unnecessary.
+ */
+void cpu_disable_virtualization(void)
+{
+ int cpu = raw_smp_processor_id();
+
+ if (!is_vmx_supported())
+ return;
+
+ if (!(cr4_read_shadow() & X86_CR4_VMXE)) {
+ pr_err("VMX not enabled or already disabled on CPU%d\n", cpu);
+ return;
+ }
+
+ asm goto("1: vmxoff\n\t"
+ _ASM_EXTABLE(1b, %l[fault])
+ ::: "cc", "memory" : fault);
+
+exit:
+ cr4_clear_bits(X86_CR4_VMXE);
+ intel_pt_handle_vmx(0);
+ return;
+
+fault:
+ pr_err("VMXOFF faulted on CPU%d\n", cpu);
+ goto exit;
+}
+
/*
* This does the hard work of actually picking apart the CPU stuff...
*/
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index c6b12bed173d..772c6d350b50 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -111,6 +111,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
crash_smp_send_stop();
+ /* Kept to VMCLEAR loaded VMCSs */
cpu_emergency_disable_virtualization();
/*
@@ -141,6 +142,9 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
x86_platform.guest.enc_kexec_finish();
crash_save_cpu(regs, smp_processor_id());
+
+ /* Disable virtualization on the last running CPU, usually the BSP */
+ cpu_disable_virtualization();
}
#if defined(CONFIG_KEXEC_FILE) || defined(CONFIG_CRASH_HOTPLUG)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 1b7960cf6eb0..a0f6397b81ab 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -827,6 +827,9 @@ void __noreturn stop_this_cpu(void *dummy)
disable_local_APIC();
mcheck_cpu_clear(c);
+ /* Disable virtualization; this is usually an AP */
+ cpu_disable_virtualization();
+
/*
* Use wbinvd on processors that support SME. This provides support
* for performing a successful kexec when going from SME inactive
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 964f6b0a3d68..7433e634018f 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -764,6 +764,9 @@ void native_machine_shutdown(void)
if (kexec_in_progress)
x86_platform.guest.enc_kexec_finish();
+
+ /* Disable virtualization on the last running CPU, usually the BSP */
+ cpu_disable_virtualization();
}
static void __machine_emergency_restart(int emergency)
@@ -873,14 +876,14 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
if (shootdown_callback)
shootdown_callback(cpu, regs);
- /*
- * Prepare the CPU for reboot _after_ invoking the callback so that the
- * callback can safely use virtualization instructions, e.g. VMCLEAR.
- */
+ /* Kept to VMCLEAR loaded VMCSs */
cpu_emergency_disable_virtualization();
atomic_dec(&waiting_for_crash_ipi);
+ /* Disable virtualization; this is usually an AP */
+ cpu_disable_virtualization();
+
if (smp_ops.stop_this_cpu) {
smp_ops.stop_this_cpu();
BUG();
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index b014e6d229f9..eb6a389ba1a9 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -124,7 +124,9 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
if (raw_smp_processor_id() == atomic_read(&stopping_cpu))
return NMI_HANDLED;
+ /* Kept to VMCLEAR loaded VMCSs */
cpu_emergency_disable_virtualization();
+
stop_this_cpu(NULL);
return NMI_HANDLED;
@@ -136,7 +138,10 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
DEFINE_IDTENTRY_SYSVEC(sysvec_reboot)
{
apic_eoi();
+
+ /* Kept to VMCLEAR loaded VMCSs */
cpu_emergency_disable_virtualization();
+
stop_this_cpu(NULL);
}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 33e166f6ab12..fe3b04f33b3f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1229,6 +1229,12 @@ int native_cpu_disable(void)
*/
apic_soft_disable();
+ /*
+ * IPIs have been disabled as mentioned above, so virtualization
+ * can now be safely shut down.
+ */
+ cpu_disable_virtualization();
+
return 0;
}
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index f6742df0c4ff..26af0a8ae08f 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -674,29 +674,6 @@ static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
return ret;
}
-/*
- * Disable VMX and clear CR4.VMXE (even if VMXOFF faults)
- *
- * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to
- * atomically track post-VMXON state, e.g. this may be called in NMI context.
- * Eat all faults as all other faults on VMXOFF faults are mode related, i.e.
- * faults are guaranteed to be due to the !post-VMXON check unless the CPU is
- * magically in RM, VM86, compat mode, or at CPL>0.
- */
-static int kvm_cpu_vmxoff(void)
-{
- asm goto("1: vmxoff\n\t"
- _ASM_EXTABLE(1b, %l[fault])
- ::: "cc", "memory" : fault);
-
- cr4_clear_bits(X86_CR4_VMXE);
- return 0;
-
-fault:
- cr4_clear_bits(X86_CR4_VMXE);
- return -EIO;
-}
-
void vmx_emergency_disable_virtualization_cpu(void)
{
int cpu = raw_smp_processor_id();
@@ -719,8 +696,6 @@ void vmx_emergency_disable_virtualization_cpu(void)
if (v->shadow_vmcs)
vmcs_clear(v->shadow_vmcs);
}
-
- kvm_cpu_vmxoff();
}
static void __loaded_vmcs_clear(void *arg)
@@ -2788,12 +2763,7 @@ void vmx_disable_virtualization_cpu(void)
{
vmclear_local_loaded_vmcss();
- if (kvm_cpu_vmxoff())
- kvm_spurious_fault();
-
hv_reset_evmcs();
-
- intel_pt_handle_vmx(0);
}
struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 0eec314b79c2..d2c865fdb069 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -129,6 +129,9 @@ static void __save_processor_state(struct saved_context *ctxt)
ctxt->misc_enable_saved = !rdmsrq_safe(MSR_IA32_MISC_ENABLE,
&ctxt->misc_enable);
msr_save_context(ctxt);
+
+ /* Now that CR4 is saved, disable VMX and clear CR4.VMXE */
+ cpu_disable_virtualization();
}
/* Needed by apm.c */
--
2.51.0