[v1] KVM: Register cpuhp/syscore callbacks when enabling virt

[PATCH 3/4] KVM: Register cpuhp and syscore callbacks when enabling hardware

Posted by Sean Christopherson 1 year, 9 months ago

Register KVM's cpuhp and syscore callback when enabling virtualization
in hardware instead of registering the callbacks during initialization,
and let the CPU up/down framework invoke the inner enable/disable
functions.  Registering the callbacks during initialization makes things
more complex than they need to be, as KVM needs to be very careful about
handling races between enabling CPUs being onlined/offlined and hardware
being enabled/disabled.

Intel TDX support will require KVM to enable virtualization during KVM
initialization, i.e. will add another wrinkle to things, at which point
sorting out the potential races with kvm_usage_count would become even
more complex.

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 virt/kvm/kvm_main.c | 170 +++++++++++++++-----------------------------
 1 file changed, 56 insertions(+), 114 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 10dad093b7fb..ad1b5a9e86d4 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -5490,7 +5490,7 @@ EXPORT_SYMBOL_GPL(kvm_rebooting);
 static DEFINE_PER_CPU(bool, hardware_enabled);
 static int kvm_usage_count;
 
-static int __hardware_enable_nolock(void)
+static int hardware_enable_nolock(void)
 {
 	if (__this_cpu_read(hardware_enabled))
 		return 0;
@@ -5505,34 +5505,18 @@ static int __hardware_enable_nolock(void)
 	return 0;
 }
 
-static void hardware_enable_nolock(void *failed)
-{
-	if (__hardware_enable_nolock())
-		atomic_inc(failed);
-}
-
 static int kvm_online_cpu(unsigned int cpu)
 {
-	int ret = 0;
-
 	/*
 	 * Abort the CPU online process if hardware virtualization cannot
 	 * be enabled. Otherwise running VMs would encounter unrecoverable
 	 * errors when scheduled to this CPU.
 	 */
-	mutex_lock(&kvm_lock);
-	if (kvm_usage_count)
-		ret = __hardware_enable_nolock();
-	mutex_unlock(&kvm_lock);
-	return ret;
+	return hardware_enable_nolock();
 }
 
-static void hardware_disable_nolock(void *junk)
+static void hardware_disable_nolock(void *ign)
 {
-	/*
-	 * Note, hardware_disable_all_nolock() tells all online CPUs to disable
-	 * hardware, not just CPUs that successfully enabled hardware!
-	 */
 	if (!__this_cpu_read(hardware_enabled))
 		return;
 
@@ -5543,78 +5527,10 @@ static void hardware_disable_nolock(void *junk)
 
 static int kvm_offline_cpu(unsigned int cpu)
 {
-	mutex_lock(&kvm_lock);
-	if (kvm_usage_count)
-		hardware_disable_nolock(NULL);
-	mutex_unlock(&kvm_lock);
+	hardware_disable_nolock(NULL);
 	return 0;
 }
 
-static void hardware_disable_all_nolock(void)
-{
-	BUG_ON(!kvm_usage_count);
-
-	kvm_usage_count--;
-	if (!kvm_usage_count)
-		on_each_cpu(hardware_disable_nolock, NULL, 1);
-}
-
-static void hardware_disable_all(void)
-{
-	cpus_read_lock();
-	mutex_lock(&kvm_lock);
-	hardware_disable_all_nolock();
-	mutex_unlock(&kvm_lock);
-	cpus_read_unlock();
-}
-
-static int hardware_enable_all(void)
-{
-	atomic_t failed = ATOMIC_INIT(0);
-	int r;
-
-	/*
-	 * Do not enable hardware virtualization if the system is going down.
-	 * If userspace initiated a forced reboot, e.g. reboot -f, then it's
-	 * possible for an in-flight KVM_CREATE_VM to trigger hardware enabling
-	 * after kvm_reboot() is called.  Note, this relies on system_state
-	 * being set _before_ kvm_reboot(), which is why KVM uses a syscore ops
-	 * hook instead of registering a dedicated reboot notifier (the latter
-	 * runs before system_state is updated).
-	 */
-	if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
-	    system_state == SYSTEM_RESTART)
-		return -EBUSY;
-
-	/*
-	 * When onlining a CPU, cpu_online_mask is set before kvm_online_cpu()
-	 * is called, and so on_each_cpu() between them includes the CPU that
-	 * is being onlined.  As a result, hardware_enable_nolock() may get
-	 * invoked before kvm_online_cpu(), which also enables hardware if the
-	 * usage count is non-zero.  Disable CPU hotplug to avoid attempting to
-	 * enable hardware multiple times.
-	 */
-	cpus_read_lock();
-	mutex_lock(&kvm_lock);
-
-	r = 0;
-
-	kvm_usage_count++;
-	if (kvm_usage_count == 1) {
-		on_each_cpu(hardware_enable_nolock, &failed, 1);
-
-		if (atomic_read(&failed)) {
-			hardware_disable_all_nolock();
-			r = -EBUSY;
-		}
-	}
-
-	mutex_unlock(&kvm_lock);
-	cpus_read_unlock();
-
-	return r;
-}
-
 static void kvm_shutdown(void)
 {
 	/*
@@ -5646,8 +5562,7 @@ static int kvm_suspend(void)
 	lockdep_assert_not_held(&kvm_lock);
 	lockdep_assert_irqs_disabled();
 
-	if (kvm_usage_count)
-		hardware_disable_nolock(NULL);
+	hardware_disable_nolock(NULL);
 	return 0;
 }
 
@@ -5656,8 +5571,7 @@ static void kvm_resume(void)
 	lockdep_assert_not_held(&kvm_lock);
 	lockdep_assert_irqs_disabled();
 
-	if (kvm_usage_count)
-		WARN_ON_ONCE(__hardware_enable_nolock());
+	WARN_ON_ONCE(hardware_enable_nolock());
 }
 
 static struct syscore_ops kvm_syscore_ops = {
@@ -5665,6 +5579,54 @@ static struct syscore_ops kvm_syscore_ops = {
 	.resume = kvm_resume,
 	.shutdown = kvm_shutdown,
 };
+
+static int hardware_enable_all(void)
+{
+	int r;
+
+	guard(mutex)(&kvm_lock);
+
+	if (kvm_usage_count++)
+		return 0;
+
+	r = cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
+			      kvm_online_cpu, kvm_offline_cpu);
+	if (r)
+		return r;
+
+	register_syscore_ops(&kvm_syscore_ops);
+
+	/*
+	 * Undo virtualization enabling and bail if the system is going down.
+	 * If userspace initiated a forced reboot, e.g. reboot -f, then it's
+	 * possible for an in-flight operation to enable virtualization after
+	 * syscore_shutdown() is called, i.e. without kvm_shutdown() being
+	 * invoked.  Note, this relies on system_state being set _before_
+	 * kvm_shutdown(), e.g. to ensure either kvm_shutdown() is invoked
+	 * or this CPU observes the impending shutdown.  Which is why KVM uses
+	 * a syscore ops hook instead of registering a dedicated reboot
+	 * notifier (the latter runs before system_state is updated).
+	 */
+	if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
+	    system_state == SYSTEM_RESTART) {
+		unregister_syscore_ops(&kvm_syscore_ops);
+		cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+static void hardware_disable_all(void)
+{
+	guard(mutex)(&kvm_lock);
+
+	if (--kvm_usage_count)
+		return;
+
+	unregister_syscore_ops(&kvm_syscore_ops);
+	cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
+}
 #else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
 static int hardware_enable_all(void)
 {
@@ -6370,15 +6332,6 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
 	int r;
 	int cpu;
 
-#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
-	r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
-				      kvm_online_cpu, kvm_offline_cpu);
-	if (r)
-		return r;
-
-	register_syscore_ops(&kvm_syscore_ops);
-#endif
-
 	/* A kmem cache lets us meet the alignment requirements of fx_save. */
 	if (!vcpu_align)
 		vcpu_align = __alignof__(struct kvm_vcpu);
@@ -6389,10 +6342,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
 					   offsetofend(struct kvm_vcpu, stats_id)
 					   - offsetof(struct kvm_vcpu, arch),
 					   NULL);
-	if (!kvm_vcpu_cache) {
-		r = -ENOMEM;
-		goto err_vcpu_cache;
-	}
+	if (!kvm_vcpu_cache)
+		return -ENOMEM;
 
 	for_each_possible_cpu(cpu) {
 		if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
@@ -6449,11 +6400,6 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
 	for_each_possible_cpu(cpu)
 		free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
 	kmem_cache_destroy(kvm_vcpu_cache);
-err_vcpu_cache:
-#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
-	unregister_syscore_ops(&kvm_syscore_ops);
-	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
-#endif
 	return r;
 }
 EXPORT_SYMBOL_GPL(kvm_init);
@@ -6475,10 +6421,6 @@ void kvm_exit(void)
 	kmem_cache_destroy(kvm_vcpu_cache);
 	kvm_vfio_ops_exit();
 	kvm_async_pf_deinit();
-#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
-	unregister_syscore_ops(&kvm_syscore_ops);
-	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
-#endif
 	kvm_irqfd_exit();
 }
 EXPORT_SYMBOL_GPL(kvm_exit);
-- 
2.44.0.769.g3c40516874-goog

Re: [PATCH 3/4] KVM: Register cpuhp and syscore callbacks when enabling hardware

Posted by Huang, Kai 1 year, 9 months ago

On Thu, 2024-04-25 at 16:39 -0700, Sean Christopherson wrote:
> -static void hardware_disable_nolock(void *junk)
> +static void hardware_disable_nolock(void *ign)

nit:

Not sure whether this is intended code change?

Re: [PATCH 3/4] KVM: Register cpuhp and syscore callbacks when enabling hardware

Posted by Sean Christopherson 1 year, 9 months ago

On Thu, Apr 25, 2024, Sean Christopherson wrote:
> Register KVM's cpuhp and syscore callback when enabling virtualization
> in hardware instead of registering the callbacks during initialization,
> and let the CPU up/down framework invoke the inner enable/disable
> functions.  Registering the callbacks during initialization makes things
> more complex than they need to be, as KVM needs to be very careful about
> handling races between enabling CPUs being onlined/offlined and hardware
> being enabled/disabled.
> 
> Intel TDX support will require KVM to enable virtualization during KVM
> initialization, i.e. will add another wrinkle to things, at which point
> sorting out the potential races with kvm_usage_count would become even
> more complex.
> +static int hardware_enable_all(void)
> +{
> +	int r;
> +
> +	guard(mutex)(&kvm_lock);
> +
> +	if (kvm_usage_count++)
> +		return 0;
> +
> +	r = cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
> +			      kvm_online_cpu, kvm_offline_cpu);
> +	if (r)
> +		return r;

There's a lock ordering issue here.  KVM currently takes kvm_lock inside
cpu_hotplug_lock, but this code does the opposite.  I need to take a closer look
at the locking, as I'm not entirely certain that the existing ordering is correct
or ideal.  E.g. cpu_hotplug_lock is taken when updating static keys, static calls,
etc., which makes taking cpu_hotplug_lock outside kvm_lock dicey, as flows that
take kvm_lock then need to be very careful to never trigger seemingly innocuous
updates.

And this lockdep splat that I've now hit twice with the current implementation
suggests that cpu_hotplug_lock => kvm_lock is already unsafe/broken (I need to
re-decipher the splat; I _think_ mostly figured it out last week, but then forgot
over the weekend).


[48419.244819] ======================================================
[48419.250999] WARNING: possible circular locking dependency detected
[48419.257179] 6.9.0-smp--04b1c6b4841d-next #301 Tainted: G S         O      
[48419.264050] ------------------------------------------------------
[48419.270229] tee/27085 is trying to acquire lock:
[48419.274849] ffffb5fdd4e430a8 (&kvm->slots_lock){+.+.}-{3:3}, at: set_nx_huge_pages+0x179/0x1e0 [kvm]
[48419.284085] 
               but task is already holding lock:
[48419.289918] ffffffffc06ccba8 (kvm_lock){+.+.}-{3:3}, at: set_nx_huge_pages+0x14a/0x1e0 [kvm]
[48419.298386] 
               which lock already depends on the new lock.

[48419.306559] 
               the existing dependency chain (in reverse order) is:
[48419.314040] 
               -> #3 (kvm_lock){+.+.}-{3:3}:
[48419.319523]        __mutex_lock+0x6a/0xb40
[48419.323622]        mutex_lock_nested+0x1f/0x30
[48419.328071]        kvm_dev_ioctl+0x4fb/0xe50 [kvm]
[48419.332898]        __se_sys_ioctl+0x7b/0xd0
[48419.337082]        __x64_sys_ioctl+0x21/0x30
[48419.341357]        do_syscall_64+0x8b/0x170
[48419.345540]        entry_SYSCALL_64_after_hwframe+0x71/0x79
[48419.351115] 
               -> #2 (cpu_hotplug_lock){++++}-{0:0}:
[48419.357294]        cpus_read_lock+0x2e/0xb0
[48419.361480]        static_key_slow_inc+0x16/0x30
[48419.366098]        kvm_lapic_set_base+0x6a/0x1c0 [kvm]
[48419.371298]        kvm_set_apic_base+0x8f/0xe0 [kvm]
[48419.376298]        kvm_set_msr_common+0xa29/0x1080 [kvm]
[48419.381645]        vmx_set_msr+0xa54/0xbe0 [kvm_intel]
[48419.386795]        __kvm_set_msr+0xb6/0x1a0 [kvm]
[48419.391535]        kvm_arch_vcpu_ioctl+0xf66/0x1150 [kvm]
[48419.396970]        kvm_vcpu_ioctl+0x485/0x5b0 [kvm]
[48419.401881]        __se_sys_ioctl+0x7b/0xd0
[48419.406067]        __x64_sys_ioctl+0x21/0x30
[48419.410342]        do_syscall_64+0x8b/0x170
[48419.414528]        entry_SYSCALL_64_after_hwframe+0x71/0x79
[48419.420099] 
               -> #1 (&kvm->srcu){.?.+}-{0:0}:
[48419.425758]        __synchronize_srcu+0x44/0x1a0
[48419.430378]        synchronize_srcu_expedited+0x21/0x30
[48419.435603]        kvm_swap_active_memslots+0x113/0x1c0 [kvm]
[48419.441385]        kvm_set_memslot+0x360/0x620 [kvm]
[48419.446387]        __kvm_set_memory_region+0x27b/0x300 [kvm]
[48419.452078]        kvm_vm_ioctl_set_memory_region+0x43/0x60 [kvm]
[48419.458207]        kvm_vm_ioctl+0x295/0x650 [kvm]
[48419.462945]        __se_sys_ioctl+0x7b/0xd0
[48419.467133]        __x64_sys_ioctl+0x21/0x30
[48419.471406]        do_syscall_64+0x8b/0x170
[48419.475590]        entry_SYSCALL_64_after_hwframe+0x71/0x79
[48419.481164] 
               -> #0 (&kvm->slots_lock){+.+.}-{3:3}:
[48419.487343]        __lock_acquire+0x15ef/0x2e40
[48419.491876]        lock_acquire+0xe0/0x260
[48419.495977]        __mutex_lock+0x6a/0xb40
[48419.500076]        mutex_lock_nested+0x1f/0x30
[48419.504521]        set_nx_huge_pages+0x179/0x1e0 [kvm]
[48419.509694]        param_attr_store+0x93/0x100
[48419.514142]        module_attr_store+0x22/0x40
[48419.518587]        sysfs_kf_write+0x81/0xb0
[48419.522774]        kernfs_fop_write_iter+0x133/0x1d0
[48419.527738]        vfs_write+0x317/0x380
[48419.531663]        ksys_write+0x70/0xe0
[48419.535505]        __x64_sys_write+0x1f/0x30
[48419.539777]        do_syscall_64+0x8b/0x170
[48419.543961]        entry_SYSCALL_64_after_hwframe+0x71/0x79
[48419.549534] 
               other info that might help us debug this:

[48419.557534] Chain exists of:
                 &kvm->slots_lock --> cpu_hotplug_lock --> kvm_lock

[48419.567873]  Possible unsafe locking scenario:

[48419.573793]        CPU0                    CPU1
[48419.578325]        ----                    ----
[48419.582859]   lock(kvm_lock);
[48419.585831]                                lock(cpu_hotplug_lock);
[48419.592011]                                lock(kvm_lock);
[48419.597499]   lock(&kvm->slots_lock);
[48419.601173] 
                *** DEADLOCK ***

[48419.607090] 5 locks held by tee/27085:
[48419.610841]  #0: ffffa0dcc92eb410 (sb_writers#4){.+.+}-{0:0}, at: vfs_write+0xe4/0x380
[48419.618765]  #1: ffffa0dce221ac88 (&of->mutex){+.+.}-{3:3}, at: kernfs_fop_write_iter+0xd8/0x1d0
[48419.627553]  #2: ffffa11c4d6bef28 (kn->active#257){.+.+}-{0:0}, at: kernfs_fop_write_iter+0xe0/0x1d0
[48419.636684]  #3: ffffffffc06e3c50 (&mod->param_lock){+.+.}-{3:3}, at: param_attr_store+0x57/0x100
[48419.645575]  #4: ffffffffc06ccba8 (kvm_lock){+.+.}-{3:3}, at: set_nx_huge_pages+0x14a/0x1e0 [kvm]
[48419.654564] 
               stack backtrace:
[48419.658925] CPU: 93 PID: 27085 Comm: tee Tainted: G S         O       6.9.0-smp--04b1c6b4841d-next #301
[48419.668312] Hardware name: Google Interlaken/interlaken, BIOS 0.20231025.0-0 10/25/2023
[48419.676314] Call Trace:
[48419.678770]  <TASK>
[48419.680878]  dump_stack_lvl+0x83/0xc0
[48419.684552]  dump_stack+0x14/0x20
[48419.687880]  print_circular_bug+0x2f0/0x300
[48419.692068]  check_noncircular+0x103/0x120
[48419.696163]  ? __lock_acquire+0x5e3/0x2e40
[48419.700266]  __lock_acquire+0x15ef/0x2e40
[48419.704286]  ? __lock_acquire+0x5e3/0x2e40
[48419.708387]  ? __lock_acquire+0x5e3/0x2e40
[48419.712486]  ? __lock_acquire+0x5e3/0x2e40
[48419.716586]  lock_acquire+0xe0/0x260
[48419.720164]  ? set_nx_huge_pages+0x179/0x1e0 [kvm]
[48419.725060]  ? lock_acquire+0xe0/0x260
[48419.728822]  ? param_attr_store+0x57/0x100
[48419.732921]  ? set_nx_huge_pages+0x179/0x1e0 [kvm]
[48419.737768]  __mutex_lock+0x6a/0xb40
[48419.741359]  ? set_nx_huge_pages+0x179/0x1e0 [kvm]
[48419.746207]  ? set_nx_huge_pages+0x14a/0x1e0 [kvm]
[48419.751054]  ? param_attr_store+0x57/0x100
[48419.755158]  ? __mutex_lock+0x240/0xb40
[48419.759005]  ? param_attr_store+0x57/0x100
[48419.763107]  mutex_lock_nested+0x1f/0x30
[48419.767031]  set_nx_huge_pages+0x179/0x1e0 [kvm]
[48419.771705]  param_attr_store+0x93/0x100
[48419.775629]  module_attr_store+0x22/0x40
[48419.779556]  sysfs_kf_write+0x81/0xb0
[48419.783222]  kernfs_fop_write_iter+0x133/0x1d0
[48419.787668]  vfs_write+0x317/0x380
[48419.791084]  ksys_write+0x70/0xe0
[48419.794401]  __x64_sys_write+0x1f/0x30
[48419.798152]  do_syscall_64+0x8b/0x170
[48419.801819]  entry_SYSCALL_64_after_hwframe+0x71/0x79
[48419.806872] RIP: 0033:0x7f98eff1ee0d
[48419.810465] Code: e5 41 57 41 56 53 50 49 89 d6 49 89 f7 89 fb 48 8b 05 37 7c 07 00 83 38 00 75 28 b8 01 00 00 00 89 df 4c 89 fe 4c 89 f2 0f 05 <48> 89 c3 48 3d 01 f0 ff ff 73 3a 48 89 d8 48 83 c4 08 5b 41 5e 41
[48419.829213] RSP: 002b:00007ffd5dee15c0 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
[48419.836792] RAX: ffffffffffffffda RBX: 0000000000000005 RCX: 00007f98eff1ee0d
[48419.843931] RDX: 0000000000000002 RSI: 00007ffd5dee16d0 RDI: 0000000000000005
[48419.851065] RBP: 00007ffd5dee15e0 R08: 00007f98efe161d2 R09: 0000000000000001
[48419.858196] R10: 00000000000001b6 R11: 0000000000000246 R12: 0000000000000002
[48419.865328] R13: 00007ffd5dee16d0 R14: 0000000000000002 R15: 00007ffd5dee16d0
[48419.872472]  </TASK>

Re: [PATCH 3/4] KVM: Register cpuhp and syscore callbacks when enabling hardware

Posted by Huang, Kai 1 year, 9 months ago

On Tue, 2024-05-07 at 09:31 -0700, Sean Christopherson wrote:
> On Thu, Apr 25, 2024, Sean Christopherson wrote:
> > Register KVM's cpuhp and syscore callback when enabling virtualization
> > in hardware instead of registering the callbacks during initialization,
> > and let the CPU up/down framework invoke the inner enable/disable
> > functions.  Registering the callbacks during initialization makes things
> > more complex than they need to be, as KVM needs to be very careful about
> > handling races between enabling CPUs being onlined/offlined and hardware
> > being enabled/disabled.
> > 
> > Intel TDX support will require KVM to enable virtualization during KVM
> > initialization, i.e. will add another wrinkle to things, at which point
> > sorting out the potential races with kvm_usage_count would become even
> > more complex.
> > +static int hardware_enable_all(void)
> > +{
> > +	int r;
> > +
> > +	guard(mutex)(&kvm_lock);
> > +
> > +	if (kvm_usage_count++)
> > +		return 0;
> > +
> > +	r = cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
> > +			      kvm_online_cpu, kvm_offline_cpu);
> > +	if (r)
> > +		return r;
> 
> There's a lock ordering issue here.  KVM currently takes kvm_lock inside
> cpu_hotplug_lock, but this code does the opposite.  I need to take a closer look
> at the locking, as I'm not entirely certain that the existing ordering is correct
> or ideal.  
> 

Do you mean currently (upstream) hardware_enable_all() takes
cpus_read_lock() first and then kvm_lock?

For this one I think the cpus_read_lock() must be taken outside of
kvm_lock, because the kvm_online_cpu() also takes kvm_lock.  Switching the
order in hardware_enable_all() can result in deadlock.

For example, when CPU 0 is doing hardware_enable_all(), CPU 1 tries to
bring up CPU 2 between kvm_lock and cpus_read_lock() in CPU 0:

cpu 0 			   cpu 1 		cpu 2

(hardware_enable_all())	   (online cpu 2)	(kvm_online_cpu())

1) mutex_lock(&kvm_lock);	   

			   2) cpus_write_lock();
			      bringup cpu 2

						4) mutex_lock(&kvm_lock);

3) cpus_read_lock();				...

						mutex_unlock(&kvm_lock);

			   5) cpus_write_unlock();

   ...

6) mutex_unlock(&kvm_lock);

In this case, the cpus_read_lock() in step 3) will wait for the
cpus_write_unlock() in step 5) to complete, which will wait for CPU 2 to
complete kvm_online_cpu().  But kvm_online_cpu() on CPU 2 will in turn
wait for CPU 0 to release the kvm_lock, so deadlock.

But with the code change in this patch, the kvm_online_cpu() doesn't take
the kvm_lock anymore, so to me it looks it's OK to take cpus_read_lock()
inside kvm_lock.

Btw, even in the current upstream code, IIUC the cpus_read_lock() isn't
absolutely necessary.  It was introduced to prevent running
hardware_enable_nolock() from on_each_cpu() IPI call for the new cpu
before kvm_online_cpu() is invoked.  But due to both hardware_enable_all()
and kvm_online_cpu() both grabs kvm_lock, the hardware_enable_nolock()
inside the kvm_online_cpu() will always wait for hardware_enable_all() to
complete, so the worst case is hardware_enable_nolock() is called twice. 
But this is fine because the second call will basically do nothing due to
the @hardware_enabled per-cpu variable.

> E.g. cpu_hotplug_lock is taken when updating static keys, static calls,
> etc., which makes taking cpu_hotplug_lock outside kvm_lock dicey, as flows that
> take kvm_lock then need to be very careful to never trigger seemingly innocuous
> updates.
> 
> And this lockdep splat that I've now hit twice with the current implementation
> suggests that cpu_hotplug_lock => kvm_lock is already unsafe/broken (I need to
> re-decipher the splat; I _think_ mostly figured it out last week, but then forgot
> over the weekend).

I think if we remove the kvm_lock in kvm_online_cpu(), it's OK to hold
cpus_read_lock() (call the cpuhp_setup_state()) inside the kvm_lock.

If so, maybe we can just have a rule that cpus_read_lock() cannot be hold
outside of kvm_lock.

Re: [PATCH 3/4] KVM: Register cpuhp and syscore callbacks when enabling hardware

Posted by Chao Gao 1 year, 9 months ago

>+static int hardware_enable_all(void)
>+{
>+	int r;
>+
>+	guard(mutex)(&kvm_lock);
>+
>+	if (kvm_usage_count++)
>+		return 0;
>+
>+	r = cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
>+			      kvm_online_cpu, kvm_offline_cpu);

A subtle change is: cpuhp_setup_state() calls kvm_online_cpu() serially
on all CPUs. Previously, hardware enabling is done with on_each_cpu().
I assume performance isn't a concern here. Right?

>+	if (r)
>+		return r;

decrease kvm_usage_count on error?

>+
>+	register_syscore_ops(&kvm_syscore_ops);
>+
>+	/*
>+	 * Undo virtualization enabling and bail if the system is going down.
>+	 * If userspace initiated a forced reboot, e.g. reboot -f, then it's
>+	 * possible for an in-flight operation to enable virtualization after
>+	 * syscore_shutdown() is called, i.e. without kvm_shutdown() being
>+	 * invoked.  Note, this relies on system_state being set _before_
>+	 * kvm_shutdown(), e.g. to ensure either kvm_shutdown() is invoked
>+	 * or this CPU observes the impending shutdown.  Which is why KVM uses
>+	 * a syscore ops hook instead of registering a dedicated reboot
>+	 * notifier (the latter runs before system_state is updated).
>+	 */
>+	if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
>+	    system_state == SYSTEM_RESTART) {
>+		unregister_syscore_ops(&kvm_syscore_ops);
>+		cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
>+		return -EBUSY;

ditto

>+	}
>+
>+	return 0;
>+}

Re: [PATCH 3/4] KVM: Register cpuhp and syscore callbacks when enabling hardware

Posted by Sean Christopherson 1 year, 9 months ago

On Fri, Apr 26, 2024, Chao Gao wrote:
> >+static int hardware_enable_all(void)
> >+{
> >+	int r;
> >+
> >+	guard(mutex)(&kvm_lock);
> >+
> >+	if (kvm_usage_count++)
> >+		return 0;
> >+
> >+	r = cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
> >+			      kvm_online_cpu, kvm_offline_cpu);
> 
> A subtle change is: cpuhp_setup_state() calls kvm_online_cpu() serially
> on all CPUs. Previously, hardware enabling is done with on_each_cpu().

Ooh, I hadn't considered that side effect.

> I assume performance isn't a concern here. Right?

Hmm, performance isn't critical, but I wouldn't be at all surprised if it's
noticeable and problematic for large systems.  Assuming we do end up going this
route, maybe we can "solve" this by documenting KVM's behavior, e.g. so that if
userspace cares about the latency of VM creation, it can that fudge around the
potential latency problem by creating a dummy VM.  Hrm, that's pretty gross though.

Oh!  Hah!  What if we add a module param to let userspace force virtualization to
be enabled when KVM is loaded?  And then kvm_enable_virtualization() doesn't even
need to be exported/public, because KVM can simply require enable_virt_at_load
(or whatever is a good name) to be %true in order to enable TDX.

I don't think there's any real danger for abuse, e.g. *unprivileged* userspace
can effectively do this already by creating a dummy VM.

Am I missing something?  This seems too easy.

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7579bda0e310..1bfbb612a98f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -96,6 +96,9 @@ unsigned int halt_poll_ns_shrink;
 module_param(halt_poll_ns_shrink, uint, 0644);
 EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);

+static bool enable_virt_at_load;
+module_param(enable_virt_at_load, uint, 0444);
+
 /*
  * Ordering of locks:
  *
@@ -6377,6 +6380,12 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)

        kvm_gmem_init(module);

+       if (enable_virt_at_load) {
+               r = kvm_enable_virtualization();
+               if (r)
+                       goto err_virt;
+       }
+
        /*
         * Registration _must_ be the very last thing done, as this exposes
         * /dev/kvm to userspace, i.e. all infrastructure must be setup!
@@ -6390,6 +6399,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
        return 0;

 err_register:
+       kvm_disable_virtualization();
+err_virt:
        kvm_vfio_ops_exit();
 err_vfio:
        kvm_async_pf_deinit();
@@ -6415,6 +6426,11 @@ void kvm_exit(void)
         */
        misc_deregister(&kvm_dev);

+       if (enable_virt_at_load) {
+               kvm_disable_virtualization();
+               BUG_ON(kvm_usage_count);
+       }
+
        debugfs_remove_recursive(kvm_debugfs_dir);
        for_each_possible_cpu(cpu)
                free_cpumask_var(per_cpu(cpu_kick_mask, cpu));

> >+	if (r)
> >+		return r;
> 
> decrease kvm_usage_count on error?

/facepalm, yes.

[PATCH 1/4] x86/reboot: Unconditionally define cpu_emergency_virt_cb typedef
[PATCH 2/4] KVM: x86: Register emergency virt callback in common code, via kvm_x86_ops
[PATCH 3/4] KVM: Register cpuhp and syscore callbacks when enabling hardware
[PATCH 4/4] KVM: Rename functions related to enabling virtualization hardware