KVM: x86: Allocate/free user_return_msrs at kvm.ko (un)loading time

[PATCH] KVM: x86: Allocate/free user_return_msrs at kvm.ko (un)loading time

Posted by Sean Christopherson 3 months ago

From: Chao Gao <chao.gao@intel.com>

Move user_return_msrs allocation/free from vendor module (kvm-intel.ko and
kvm-amd.ko) (un)loading time to kvm.ko (un)loading time to make it less
risky to access user_return_msrs in kvm.ko. Tying the lifetime of
user_return_msrs to vendor modules makes every access to user_return_msrs
prone to use-after-free issues, as vendor modules may be unloaded at any
time.

Opportunistically turn the per-CPU variable into full structs, as there's
no practical difference between statically allocating the memory and
allocating it unconditionally during module_init().

Zero out kvm_nr_uret_msrs on vendor module exit to further minimize the
chances of consuming stale data, and WARN on vendor module load if KVM
thinks there are existing user-return MSRs.

Note!  The user-return MSRs also need to be "destroyed" if
ops->hardware_setup() fails, as both SVM and VMX expect common KVM to
clean up (because common code, not vendor code, is responsible for
kvm_nr_uret_msrs).

Signed-off-by: Chao Gao <chao.gao@intel.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/x86.c | 46 ++++++++++++++++------------------------------
 1 file changed, 16 insertions(+), 30 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9c2e28028c2b..24dba35f3217 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -212,7 +212,7 @@ struct kvm_user_return_msrs {
 u32 __read_mostly kvm_nr_uret_msrs;
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_nr_uret_msrs);
 static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS];
-static struct kvm_user_return_msrs __percpu *user_return_msrs;
+static DEFINE_PER_CPU(struct kvm_user_return_msrs, user_return_msrs);
 
 #define KVM_SUPPORTED_XCR0     (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
 				| XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
@@ -575,25 +575,14 @@ static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
 		vcpu->arch.apf.gfns[i] = ~0;
 }
 
-static int kvm_init_user_return_msrs(void)
+static void kvm_destroy_user_return_msrs(void)
 {
-	user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
-	if (!user_return_msrs) {
-		pr_err("failed to allocate percpu user_return_msrs\n");
-		return -ENOMEM;
-	}
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		WARN_ON_ONCE(per_cpu(user_return_msrs, cpu).registered);
+
 	kvm_nr_uret_msrs = 0;
-	return 0;
-}
-
-static void kvm_free_user_return_msrs(void)
-{
-	int cpu;
-
-	for_each_possible_cpu(cpu)
-		WARN_ON_ONCE(per_cpu_ptr(user_return_msrs, cpu)->registered);
-
-	free_percpu(user_return_msrs);
 }
 
 static void kvm_on_user_return(struct user_return_notifier *urn)
@@ -656,7 +645,7 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_find_user_return_msr);
 
 static void kvm_user_return_msr_cpu_online(void)
 {
-	struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);
+	struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs);
 	u64 value;
 	int i;
 
@@ -678,7 +667,7 @@ static void kvm_user_return_register_notifier(struct kvm_user_return_msrs *msrs)
 
 int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
 {
-	struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);
+	struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs);
 	int err;
 
 	value = (value & mask) | (msrs->values[slot].host & ~mask);
@@ -696,13 +685,13 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_user_return_msr);
 
 u64 kvm_get_user_return_msr(unsigned int slot)
 {
-	return this_cpu_ptr(user_return_msrs)->values[slot].curr;
+	return this_cpu_ptr(&user_return_msrs)->values[slot].curr;
 }
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_user_return_msr);
 
 static void drop_user_return_notifiers(void)
 {
-	struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);
+	struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs);
 
 	if (msrs->registered)
 		kvm_on_user_return(&msrs->urn);
@@ -10077,13 +10066,9 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
 		return -ENOMEM;
 	}
 
-	r = kvm_init_user_return_msrs();
-	if (r)
-		goto out_free_x86_emulator_cache;
-
 	r = kvm_mmu_vendor_module_init();
 	if (r)
-		goto out_free_percpu;
+		goto out_free_x86_emulator_cache;
 
 	kvm_caps.supported_vm_types = BIT(KVM_X86_DEFAULT_VM);
 	kvm_caps.supported_mce_cap = MCG_CTL_P | MCG_SER_P;
@@ -10108,6 +10093,8 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
 	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
 		rdmsrq(MSR_IA32_ARCH_CAPABILITIES, kvm_host.arch_capabilities);
 
+	WARN_ON_ONCE(kvm_nr_uret_msrs);
+
 	r = ops->hardware_setup();
 	if (r != 0)
 		goto out_mmu_exit;
@@ -10180,9 +10167,8 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
 	kvm_x86_ops.enable_virtualization_cpu = NULL;
 	kvm_x86_call(hardware_unsetup)();
 out_mmu_exit:
+	kvm_destroy_user_return_msrs();
 	kvm_mmu_vendor_module_exit();
-out_free_percpu:
-	kvm_free_user_return_msrs();
 out_free_x86_emulator_cache:
 	kmem_cache_destroy(x86_emulator_cache);
 	return r;
@@ -10210,8 +10196,8 @@ void kvm_x86_vendor_exit(void)
 	cancel_work_sync(&pvclock_gtod_work);
 #endif
 	kvm_x86_call(hardware_unsetup)();
+	kvm_destroy_user_return_msrs();
 	kvm_mmu_vendor_module_exit();
-	kvm_free_user_return_msrs();
 	kmem_cache_destroy(x86_emulator_cache);
 #ifdef CONFIG_KVM_XEN
 	static_key_deferred_flush(&kvm_xen_enabled);

base-commit: 9052f4f6c539ea1fb7b282a34e6bb33154ce0b63
-- 
2.51.2.1041.gc1ab5b90ca-goog

Re: [PATCH] KVM: x86: Allocate/free user_return_msrs at kvm.ko (un)loading time

Posted by Sean Christopherson 2 months, 2 weeks ago

On Fri, 07 Nov 2025 17:36:01 -0800, Sean Christopherson wrote:
> Move user_return_msrs allocation/free from vendor modules (kvm-intel.ko and
> kvm-amd.ko) (un)loading time to kvm.ko's to make it less risky to access
> user_return_msrs in kvm.ko. Tying the lifetime of user_return_msrs to
> vendor modules makes every access to user_return_msrs prone to
> use-after-free issues as vendor modules may be unloaded at any time.
> 
> Opportunistically turn the per-CPU variable into full structs, as there's
> no practical difference between statically allocating the memory and
> allocating it unconditionally during module_init().
> 
> [...]

Applied to kvm-x86 misc, thanks!

[1/1] KVM: x86: Allocate/free user_return_msrs at kvm.ko (un)loading time
      https://github.com/kvm-x86/linux/commit/11d984633f7f

--
https://github.com/kvm-x86/linux/tree/next

Re: [PATCH] KVM: x86: Allocate/free user_return_msrs at kvm.ko (un)loading time

Posted by Chao Gao 3 months ago

>-static int kvm_init_user_return_msrs(void)
>+static void kvm_destroy_user_return_msrs(void)
> {
>-	user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
>-	if (!user_return_msrs) {
>-		pr_err("failed to allocate percpu user_return_msrs\n");
>-		return -ENOMEM;
>-	}
>+	int cpu;
>+
>+	for_each_possible_cpu(cpu)
>+		WARN_ON_ONCE(per_cpu(user_return_msrs, cpu).registered);

Could this warning be triggered if the forced shutdown path didn't
unregister the user return callback (i.e., with the patch [*] applied),
and the vendor modules were then unloaded immediately after the forced
shutdown (before the CPU exits to userspace)?

[*]: https://lore.kernel.org/kvm/20251030191528.3380553-4-seanjc@google.com/

Re: [PATCH] KVM: x86: Allocate/free user_return_msrs at kvm.ko (un)loading time

Posted by Sean Christopherson 2 months, 3 weeks ago

On Mon, Nov 10, 2025, Chao Gao wrote:
> >-static int kvm_init_user_return_msrs(void)
> >+static void kvm_destroy_user_return_msrs(void)
> > {
> >-	user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
> >-	if (!user_return_msrs) {
> >-		pr_err("failed to allocate percpu user_return_msrs\n");
> >-		return -ENOMEM;
> >-	}
> >+	int cpu;
> >+
> >+	for_each_possible_cpu(cpu)
> >+		WARN_ON_ONCE(per_cpu(user_return_msrs, cpu).registered);
> 
> Could this warning be triggered if the forced shutdown path didn't
> unregister the user return callback (i.e., with the patch [*] applied),
> and then vendor modules got unloaded immediately after the forced shutdown
> (before the CPU exits to the userspace)?

Probably?  But that's more of a feature than a bug, e.g. it gives the (privileged!)
user a heads-up as to exactly how they broke their system when they forced a
reboot.  I'd prefer not to condition it on e.g. !kvm_rebooting unless it's truly
necessary, because it's "just" a WARN, i.e. it shouldn't crash the system.

> [*]: https://lore.kernel.org/kvm/20251030191528.3380553-4-seanjc@google.com/