Use mutex_trylock_nest_lock instead of mutex_trylock when locking all vCPUs
of a VM, to avoid triggering a lockdep warning if the VM is configured to
have more than MAX_LOCK_DEPTH vCPUs.
This fixes the following false lockdep warning:
[ 328.171264] BUG: MAX_LOCK_DEPTH too low!
[ 328.175227] turning off the locking correctness validator.
[ 328.180726] Please attach the output of /proc/lock_stat to the bug report
[ 328.187531] depth: 48 max: 48!
[ 328.190678] 48 locks held by qemu-kvm/11664:
[ 328.194957] #0: ffff800086de5ba0 (&kvm->lock){+.+.}-{3:3}, at: kvm_ioctl_create_device+0x174/0x5b0
[ 328.204048] #1: ffff0800e78800b8 (&vcpu->mutex){+.+.}-{3:3}, at: lock_all_vcpus+0x16c/0x2a0
[ 328.212521] #2: ffff07ffeee51e98 (&vcpu->mutex){+.+.}-{3:3}, at: lock_all_vcpus+0x16c/0x2a0
[ 328.220991] #3: ffff0800dc7d80b8 (&vcpu->mutex){+.+.}-{3:3}, at: lock_all_vcpus+0x16c/0x2a0
[ 328.229463] #4: ffff07ffe0c980b8 (&vcpu->mutex){+.+.}-{3:3}, at: lock_all_vcpus+0x16c/0x2a0
[ 328.237934] #5: ffff0800a3883c78 (&vcpu->mutex){+.+.}-{3:3}, at: lock_all_vcpus+0x16c/0x2a0
[ 328.246405] #6: ffff07fffbe480b8 (&vcpu->mutex){+.+.}-{3:3}, at: lock_all_vcpus+0x16c/0x2a0
Since locking all vCPUs is a primitive that can be useful on other
architectures supported by KVM, also move the code to kvm_main.c.
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
---
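A note on the calling convention: the old arm64 lock_all_vcpus() returned
true on success, while kvm_trylock_all_vcpus() returns 0 on success and
-EINTR when a vCPU mutex is contended, hence the inverted conditions in the
hunks below. A minimal sketch of the caller conversion, mirroring the vgic
hunks:

	/* before: lock_all_vcpus() returns true when every vCPU mutex was taken */
	if (!lock_all_vcpus(kvm)) {
		mutex_unlock(&kvm->lock);
		return -EBUSY;
	}
	/* ... */
	unlock_all_vcpus(kvm);

	/* after: kvm_trylock_all_vcpus() returns 0 on success, -EINTR otherwise */
	if (kvm_trylock_all_vcpus(kvm)) {
		mutex_unlock(&kvm->lock);
		return -EBUSY;
	}
	/* ... */
	kvm_unlock_all_vcpus(kvm);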
arch/arm64/include/asm/kvm_host.h | 3 --
arch/arm64/kvm/arch_timer.c | 4 +--
arch/arm64/kvm/arm.c | 43 ---------------------------
arch/arm64/kvm/vgic/vgic-init.c | 4 +--
arch/arm64/kvm/vgic/vgic-its.c | 8 ++---
arch/arm64/kvm/vgic/vgic-kvm-device.c | 12 ++++----
include/linux/kvm_host.h | 3 ++
virt/kvm/kvm_main.c | 34 +++++++++++++++++++++
8 files changed, 51 insertions(+), 60 deletions(-)
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index e98cfe7855a6..96ce0b01a61e 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -1263,9 +1263,6 @@ int __init populate_sysreg_config(const struct sys_reg_desc *sr,
unsigned int idx);
int __init populate_nv_trap_config(void);
-bool lock_all_vcpus(struct kvm *kvm);
-void unlock_all_vcpus(struct kvm *kvm);
-
void kvm_calculate_traps(struct kvm_vcpu *vcpu);
/* MMIO helpers */
diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
index 5133dcbfe9f7..fdbc8beec930 100644
--- a/arch/arm64/kvm/arch_timer.c
+++ b/arch/arm64/kvm/arch_timer.c
@@ -1766,7 +1766,7 @@ int kvm_vm_ioctl_set_counter_offset(struct kvm *kvm,
mutex_lock(&kvm->lock);
- if (lock_all_vcpus(kvm)) {
+ if (!kvm_trylock_all_vcpus(kvm)) {
set_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &kvm->arch.flags);
/*
@@ -1778,7 +1778,7 @@ int kvm_vm_ioctl_set_counter_offset(struct kvm *kvm,
kvm->arch.timer_data.voffset = offset->counter_offset;
kvm->arch.timer_data.poffset = offset->counter_offset;
- unlock_all_vcpus(kvm);
+ kvm_unlock_all_vcpus(kvm);
} else {
ret = -EBUSY;
}
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 68fec8c95fee..d31f42a71bdc 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1914,49 +1914,6 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
}
}
-/* unlocks vcpus from @vcpu_lock_idx and smaller */
-static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx)
-{
- struct kvm_vcpu *tmp_vcpu;
-
- for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
- tmp_vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx);
- mutex_unlock(&tmp_vcpu->mutex);
- }
-}
-
-void unlock_all_vcpus(struct kvm *kvm)
-{
- lockdep_assert_held(&kvm->lock);
-
- unlock_vcpus(kvm, atomic_read(&kvm->online_vcpus) - 1);
-}
-
-/* Returns true if all vcpus were locked, false otherwise */
-bool lock_all_vcpus(struct kvm *kvm)
-{
- struct kvm_vcpu *tmp_vcpu;
- unsigned long c;
-
- lockdep_assert_held(&kvm->lock);
-
- /*
- * Any time a vcpu is in an ioctl (including running), the
- * core KVM code tries to grab the vcpu->mutex.
- *
- * By grabbing the vcpu->mutex of all VCPUs we ensure that no
- * other VCPUs can fiddle with the state while we access it.
- */
- kvm_for_each_vcpu(c, tmp_vcpu, kvm) {
- if (!mutex_trylock(&tmp_vcpu->mutex)) {
- unlock_vcpus(kvm, c - 1);
- return false;
- }
- }
-
- return true;
-}
-
static unsigned long nvhe_percpu_size(void)
{
return (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_end) -
diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index 1f33e71c2a73..6a426d403a6b 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -88,7 +88,7 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
lockdep_assert_held(&kvm->lock);
ret = -EBUSY;
- if (!lock_all_vcpus(kvm))
+ if (kvm_trylock_all_vcpus(kvm))
return ret;
mutex_lock(&kvm->arch.config_lock);
@@ -142,7 +142,7 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
out_unlock:
mutex_unlock(&kvm->arch.config_lock);
- unlock_all_vcpus(kvm);
+ kvm_unlock_all_vcpus(kvm);
return ret;
}
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index fb96802799c6..7454388e3646 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -1999,7 +1999,7 @@ static int vgic_its_attr_regs_access(struct kvm_device *dev,
mutex_lock(&dev->kvm->lock);
- if (!lock_all_vcpus(dev->kvm)) {
+ if (kvm_trylock_all_vcpus(dev->kvm)) {
mutex_unlock(&dev->kvm->lock);
return -EBUSY;
}
@@ -2034,7 +2034,7 @@ static int vgic_its_attr_regs_access(struct kvm_device *dev,
}
out:
mutex_unlock(&dev->kvm->arch.config_lock);
- unlock_all_vcpus(dev->kvm);
+ kvm_unlock_all_vcpus(dev->kvm);
mutex_unlock(&dev->kvm->lock);
return ret;
}
@@ -2704,7 +2704,7 @@ static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr)
mutex_lock(&kvm->lock);
- if (!lock_all_vcpus(kvm)) {
+ if (kvm_trylock_all_vcpus(kvm)) {
mutex_unlock(&kvm->lock);
return -EBUSY;
}
@@ -2726,7 +2726,7 @@ static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr)
mutex_unlock(&its->its_lock);
mutex_unlock(&kvm->arch.config_lock);
- unlock_all_vcpus(kvm);
+ kvm_unlock_all_vcpus(kvm);
mutex_unlock(&kvm->lock);
return ret;
}
diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c
index 359094f68c23..f9ae790163fb 100644
--- a/arch/arm64/kvm/vgic/vgic-kvm-device.c
+++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c
@@ -268,7 +268,7 @@ static int vgic_set_common_attr(struct kvm_device *dev,
return -ENXIO;
mutex_lock(&dev->kvm->lock);
- if (!lock_all_vcpus(dev->kvm)) {
+ if (kvm_trylock_all_vcpus(dev->kvm)) {
mutex_unlock(&dev->kvm->lock);
return -EBUSY;
}
@@ -276,7 +276,7 @@ static int vgic_set_common_attr(struct kvm_device *dev,
mutex_lock(&dev->kvm->arch.config_lock);
r = vgic_v3_save_pending_tables(dev->kvm);
mutex_unlock(&dev->kvm->arch.config_lock);
- unlock_all_vcpus(dev->kvm);
+ kvm_unlock_all_vcpus(dev->kvm);
mutex_unlock(&dev->kvm->lock);
return r;
}
@@ -390,7 +390,7 @@ static int vgic_v2_attr_regs_access(struct kvm_device *dev,
mutex_lock(&dev->kvm->lock);
- if (!lock_all_vcpus(dev->kvm)) {
+ if (kvm_trylock_all_vcpus(dev->kvm)) {
mutex_unlock(&dev->kvm->lock);
return -EBUSY;
}
@@ -415,7 +415,7 @@ static int vgic_v2_attr_regs_access(struct kvm_device *dev,
out:
mutex_unlock(&dev->kvm->arch.config_lock);
- unlock_all_vcpus(dev->kvm);
+ kvm_unlock_all_vcpus(dev->kvm);
mutex_unlock(&dev->kvm->lock);
if (!ret && !is_write)
@@ -554,7 +554,7 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev,
mutex_lock(&dev->kvm->lock);
- if (!lock_all_vcpus(dev->kvm)) {
+ if (kvm_trylock_all_vcpus(dev->kvm)) {
mutex_unlock(&dev->kvm->lock);
return -EBUSY;
}
@@ -611,7 +611,7 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev,
out:
mutex_unlock(&dev->kvm->arch.config_lock);
- unlock_all_vcpus(dev->kvm);
+ kvm_unlock_all_vcpus(dev->kvm);
mutex_unlock(&dev->kvm->lock);
if (!ret && uaccess && !is_write) {
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1dedc421b3e3..10d6652c7aa0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1015,6 +1015,9 @@ static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id)
void kvm_destroy_vcpus(struct kvm *kvm);
+int kvm_trylock_all_vcpus(struct kvm *kvm);
+void kvm_unlock_all_vcpus(struct kvm *kvm);
+
void vcpu_load(struct kvm_vcpu *vcpu);
void vcpu_put(struct kvm_vcpu *vcpu);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 69782df3617f..834f08dfa24c 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1368,6 +1368,40 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
return 0;
}
+/*
+ * Try to lock all of the VM's vCPUs.
+ * Assumes that the kvm->lock is held.
+ */
+int kvm_trylock_all_vcpus(struct kvm *kvm)
+{
+ struct kvm_vcpu *vcpu;
+ unsigned long i, j;
+
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ if (!mutex_trylock_nest_lock(&vcpu->mutex, &kvm->lock))
+ goto out_unlock;
+ return 0;
+
+out_unlock:
+ kvm_for_each_vcpu(j, vcpu, kvm) {
+ if (i == j)
+ break;
+ mutex_unlock(&vcpu->mutex);
+ }
+ return -EINTR;
+}
+EXPORT_SYMBOL_GPL(kvm_trylock_all_vcpus);
+
+void kvm_unlock_all_vcpus(struct kvm *kvm)
+{
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ mutex_unlock(&vcpu->mutex);
+}
+EXPORT_SYMBOL_GPL(kvm_unlock_all_vcpus);
+
/*
* Allocation size is twice as large as the actual dirty bitmap size.
* See kvm_vm_ioctl_get_dirty_log() why this is needed.
--
2.46.0
nit: in keeping with the existing arm64 patches, please write the
subject as "KVM: arm64: Use ..."
On Wed, 30 Apr 2025 21:30:10 +0100,
Maxim Levitsky <mlevitsk@redhat.com> wrote:
[...]
>
> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> index 68fec8c95fee..d31f42a71bdc 100644
> --- a/arch/arm64/kvm/arm.c
> +++ b/arch/arm64/kvm/arm.c
> @@ -1914,49 +1914,6 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
> }
> }
>
> -/* unlocks vcpus from @vcpu_lock_idx and smaller */
> -static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx)
> -{
> - struct kvm_vcpu *tmp_vcpu;
> -
> - for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
> - tmp_vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx);
> - mutex_unlock(&tmp_vcpu->mutex);
> - }
> -}
> -
> -void unlock_all_vcpus(struct kvm *kvm)
> -{
> - lockdep_assert_held(&kvm->lock);
Note this assertion...
> -
> - unlock_vcpus(kvm, atomic_read(&kvm->online_vcpus) - 1);
> -}
> -
> -/* Returns true if all vcpus were locked, false otherwise */
> -bool lock_all_vcpus(struct kvm *kvm)
> -{
> - struct kvm_vcpu *tmp_vcpu;
> - unsigned long c;
> -
> - lockdep_assert_held(&kvm->lock);
and this one...
> -
> - /*
> - * Any time a vcpu is in an ioctl (including running), the
> - * core KVM code tries to grab the vcpu->mutex.
> - *
> - * By grabbing the vcpu->mutex of all VCPUs we ensure that no
> - * other VCPUs can fiddle with the state while we access it.
> - */
> - kvm_for_each_vcpu(c, tmp_vcpu, kvm) {
> - if (!mutex_trylock(&tmp_vcpu->mutex)) {
> - unlock_vcpus(kvm, c - 1);
> - return false;
> - }
> - }
> -
> - return true;
> -}
> -
> static unsigned long nvhe_percpu_size(void)
> {
> return (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_end) -
[...]
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 69782df3617f..834f08dfa24c 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -1368,6 +1368,40 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
> return 0;
> }
>
> +/*
> + * Try to lock all of the VM's vCPUs.
> + * Assumes that the kvm->lock is held.
Assuming is not enough. These assertions have caught a number of bugs,
and I'm not prepared to drop them.
> + */
> +int kvm_trylock_all_vcpus(struct kvm *kvm)
> +{
> + struct kvm_vcpu *vcpu;
> + unsigned long i, j;
> +
> + kvm_for_each_vcpu(i, vcpu, kvm)
> + if (!mutex_trylock_nest_lock(&vcpu->mutex, &kvm->lock))
> + goto out_unlock;
> + return 0;
> +
> +out_unlock:
> + kvm_for_each_vcpu(j, vcpu, kvm) {
> + if (i == j)
> + break;
> + mutex_unlock(&vcpu->mutex);
> + }
> + return -EINTR;
> +}
> +EXPORT_SYMBOL_GPL(kvm_trylock_all_vcpus);
> +
> +void kvm_unlock_all_vcpus(struct kvm *kvm)
> +{
> + struct kvm_vcpu *vcpu;
> + unsigned long i;
> +
> + kvm_for_each_vcpu(i, vcpu, kvm)
> + mutex_unlock(&vcpu->mutex);
> +}
> +EXPORT_SYMBOL_GPL(kvm_unlock_all_vcpus);
I don't mind you not including the assertions in these helpers, but
then the existing primitives have to stay and call into the new stuff.
Which, from a simple patch volume, would be far preferable and help
with managing backports.
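For illustration, a minimal sketch of what such arm64 wrappers could look
like, keeping the existing names and lockdep assertions and calling into the
new generic helpers (a sketch of the suggestion, not a tested patch):

	/* arm64 wrappers kept for existing callers; assert kvm->lock as before */
	bool lock_all_vcpus(struct kvm *kvm)
	{
		lockdep_assert_held(&kvm->lock);

		/* preserve the historical bool-on-success convention */
		return !kvm_trylock_all_vcpus(kvm);
	}

	void unlock_all_vcpus(struct kvm *kvm)
	{
		lockdep_assert_held(&kvm->lock);

		kvm_unlock_all_vcpus(kvm);
	}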
I'd also expect the introduction of these new helpers to be done in
its own patch, so that we don't get cross architecture dependencies if
something needs to be backported for a reason or another.
Thanks,
M.
--
Without deviation from the norm, progress is not possible.
On Thu, May 01, 2025, Marc Zyngier wrote:
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index 69782df3617f..834f08dfa24c 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -1368,6 +1368,40 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
> > return 0;
> > }
> >
> > +/*
> > + * Try to lock all of the VM's vCPUs.
> > + * Assumes that the kvm->lock is held.
>
> Assuming is not enough. These assertions have caught a number of bugs,
> and I'm not prepared to drop them.
>
> > + */
> > +int kvm_trylock_all_vcpus(struct kvm *kvm)
> > +{
> > + struct kvm_vcpu *vcpu;
> > + unsigned long i, j;
> > +
> > + kvm_for_each_vcpu(i, vcpu, kvm)
> > + if (!mutex_trylock_nest_lock(&vcpu->mutex, &kvm->lock))
> > + goto out_unlock;
> > + return 0;
> > +
> > +out_unlock:
> > + kvm_for_each_vcpu(j, vcpu, kvm) {
> > + if (i == j)
> > + break;
> > + mutex_unlock(&vcpu->mutex);
> > + }
> > + return -EINTR;
> > +}
> > +EXPORT_SYMBOL_GPL(kvm_trylock_all_vcpus);
> > +
> > +void kvm_unlock_all_vcpus(struct kvm *kvm)
> > +{
> > + struct kvm_vcpu *vcpu;
> > + unsigned long i;
> > +
> > + kvm_for_each_vcpu(i, vcpu, kvm)
> > + mutex_unlock(&vcpu->mutex);
> > +}
> > +EXPORT_SYMBOL_GPL(kvm_unlock_all_vcpus);
>
> I don't mind you not including the assertions in these helpers,
I do :-) I see no reason not to add assertions here; if locking all vCPUs is
a hot path, we've probably got bigger problems.
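That would amount to folding the arm64 assertion into the new generic
helpers, roughly as sketched below (showing only the trylock side; the
unlock helper would gain the same assertion):

	int kvm_trylock_all_vcpus(struct kvm *kvm)
	{
		struct kvm_vcpu *vcpu;
		unsigned long i, j;

		lockdep_assert_held(&kvm->lock);	/* carried over from the arm64 code */

		kvm_for_each_vcpu(i, vcpu, kvm)
			if (!mutex_trylock_nest_lock(&vcpu->mutex, &kvm->lock))
				goto out_unlock;
		return 0;

	out_unlock:
		kvm_for_each_vcpu(j, vcpu, kvm) {
			if (i == j)
				break;
			mutex_unlock(&vcpu->mutex);
		}
		return -EINTR;
	}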
On Thu, May 01, 2025 at 09:24:11AM +0100, Marc Zyngier wrote:
> nit: in keeping with the existing arm64 patches, please write the
> subject as "KVM: arm64: Use ..."
>
> On Wed, 30 Apr 2025 21:30:10 +0100,
> Maxim Levitsky <mlevitsk@redhat.com> wrote:
>
> [...]
>
> >
> > diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> > index 68fec8c95fee..d31f42a71bdc 100644
> > --- a/arch/arm64/kvm/arm.c
> > +++ b/arch/arm64/kvm/arm.c
> > @@ -1914,49 +1914,6 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
> > }
> > }
> >
> > -/* unlocks vcpus from @vcpu_lock_idx and smaller */
> > -static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx)
> > -{
> > - struct kvm_vcpu *tmp_vcpu;
> > -
> > - for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
> > - tmp_vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx);
> > - mutex_unlock(&tmp_vcpu->mutex);
> > - }
> > -}
> > -
> > -void unlock_all_vcpus(struct kvm *kvm)
> > -{
> > - lockdep_assert_held(&kvm->lock);
>
> Note this assertion...
>
> > -
> > - unlock_vcpus(kvm, atomic_read(&kvm->online_vcpus) - 1);
> > -}
> > -
> > -/* Returns true if all vcpus were locked, false otherwise */
> > -bool lock_all_vcpus(struct kvm *kvm)
> > -{
> > - struct kvm_vcpu *tmp_vcpu;
> > - unsigned long c;
> > -
> > - lockdep_assert_held(&kvm->lock);
>
> and this one...
>
> > -
> > - /*
> > - * Any time a vcpu is in an ioctl (including running), the
> > - * core KVM code tries to grab the vcpu->mutex.
> > - *
> > - * By grabbing the vcpu->mutex of all VCPUs we ensure that no
> > - * other VCPUs can fiddle with the state while we access it.
> > - */
> > - kvm_for_each_vcpu(c, tmp_vcpu, kvm) {
> > - if (!mutex_trylock(&tmp_vcpu->mutex)) {
> > - unlock_vcpus(kvm, c - 1);
> > - return false;
> > - }
> > - }
> > -
> > - return true;
> > -}
> > -
> > static unsigned long nvhe_percpu_size(void)
> > {
> > return (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_end) -
>
> [...]
>
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index 69782df3617f..834f08dfa24c 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -1368,6 +1368,40 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
> > return 0;
> > }
> >
> > +/*
> > + * Try to lock all of the VM's vCPUs.
> > + * Assumes that the kvm->lock is held.
>
> Assuming is not enough. These assertions have caught a number of bugs,
> and I'm not prepared to drop them.
>
> > + */
> > +int kvm_trylock_all_vcpus(struct kvm *kvm)
> > +{
> > + struct kvm_vcpu *vcpu;
> > + unsigned long i, j;
> > +
> > + kvm_for_each_vcpu(i, vcpu, kvm)
> > + if (!mutex_trylock_nest_lock(&vcpu->mutex, &kvm->lock))
This one includes an assertion that kvm->lock is actually held.
That said, I'm not at all sure what the purpose of all this trylock
stuff is here.
Can someone explain? Last time I asked, someone said something about
multiple VMs, but I don't know enough about kvm to know what that means.
Are those vcpu->mutex another class for other VMs? Or what gives?
On Thu, 01 May 2025 12:15:52 +0100,
Peter Zijlstra <peterz@infradead.org> wrote:
>
> > > + */
> > > +int kvm_trylock_all_vcpus(struct kvm *kvm)
> > > +{
> > > + struct kvm_vcpu *vcpu;
> > > + unsigned long i, j;
> > > +
> > > + kvm_for_each_vcpu(i, vcpu, kvm)
> > > + if (!mutex_trylock_nest_lock(&vcpu->mutex, &kvm->lock))
>
> This one includes an assertion that kvm->lock is actually held.
Ah, cunning. Thanks.
> That said, I'm not at all sure what the purpose of all this trylock
> stuff is here.
>
> Can someone explain? Last time I asked someone said something about
> multiple VMs, but I don't know enough about kvm to know what that means.
Multiple VMs? That'd be real fun. Not.
> Are those vcpu->mutex another class for other VMs? Or what gives?
Nah. This is firmly single VM.
The purpose of this contraption is that there are some rare cases
where we need to make sure that if we update some global state, either
all the vcpus of a VM see it, or none of them do.
For these cases, the guarantee comes from luserspace, and it gives the
pinky promise that none of the vcpus are running at that point. But
being of a suspicious nature, we assert that this is true by trying to
take all the vcpu mutexes in one go. This will fail if a vcpu is
running, as KVM itself takes the vcpu mutex before doing anything.
Similar requirement exists if we need to synthesise some state for
userspace from all the individual vcpu states.
If the global locking fails, we return to userspace with a middle
finger indication, and all is well. Of course, this is pretty
expensive, which is why it is only done in setup phases, when the VMM
configures the guest.
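In code terms, a rough sketch of the two sides involved (the vCPU-ioctl path
is simplified; the VM-ioctl pattern matches the vgic/timer callers in the
patch):

	/*
	 * vCPU ioctl path (including KVM_RUN), simplified: the vCPU mutex is
	 * held for the duration of the ioctl, so a running vCPU makes the
	 * trylock below fail.
	 */
	if (mutex_lock_killable(&vcpu->mutex))
		return -EINTR;
	/* ... handle the vCPU ioctl ... */
	mutex_unlock(&vcpu->mutex);

	/* VM ioctl / setup path: */
	mutex_lock(&kvm->lock);
	if (kvm_trylock_all_vcpus(kvm)) {
		/* some vCPU is in an ioctl: "middle finger" back to userspace */
		mutex_unlock(&kvm->lock);
		return -EBUSY;
	}
	/* ... update state that all vCPUs must observe consistently ... */
	kvm_unlock_all_vcpus(kvm);
	mutex_unlock(&kvm->lock);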
The splat this is trying to address is that when you have more than 48
vcpus in a single VM, lockdep gets upset seeing up to 512 locks of a
similar class being taken.
Disclaimer: all the above is completely arm64-specific, and I didn't
even try to understand what other architectures are doing.
HTH,
M.
--
Without deviation from the norm, progress is not possible.
On Thu, May 01, 2025 at 01:44:28PM +0100, Marc Zyngier wrote:
> On Thu, 01 May 2025 12:15:52 +0100,
> Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > > > + */
> > > > +int kvm_trylock_all_vcpus(struct kvm *kvm)
> > > > +{
> > > > + struct kvm_vcpu *vcpu;
> > > > + unsigned long i, j;
> > > > +
> > > > + kvm_for_each_vcpu(i, vcpu, kvm)
> > > > + if (!mutex_trylock_nest_lock(&vcpu->mutex, &kvm->lock))
> >
> > This one includes an assertion that kvm->lock is actually held.
>
> Ah, cunning. Thanks.
>
> > That said, I'm not at all sure what the purpose of all this trylock
> > stuff is here.
> >
> > Can someone explain? Last time I asked someone said something about
> > multiple VMs, but I don't know enough about kvm to know what that means.
>
> Multiple VMs? That'd be real fun. Not.
>
> > Are those vcpu->mutex another class for other VMs? Or what gives?
>
> Nah. This is firmly single VM.
>
> The purpose of this contraption is that there are some rare cases
> where we need to make sure that if we update some global state, all
> the vcpus of a VM need to see, or none of them.
>
> For these cases, the guarantee comes from luserspace, and it gives the
> pinky promise that none of the vcpus are running at that point. But
> being of a suspicious nature, we assert that this is true by trying to
> take all the vcpu mutexes in one go. This will fail if a vcpu is
> running, as KVM itself takes the vcpu mutex before doing anything.
>
> Similar requirement exists if we need to synthesise some state for
> userspace from all the individual vcpu states.
Ah, okay. Because x86 is simply doing mutex_lock() instead of
mutex_trylock() -- which would end up waiting for this activity to
subside I suppose.
Hence the use of the killable variant I suppose, for when they get tired
of waiting.
If all the architectures are basically doing the same thing, it might
make sense to unify this particular behaviour. But what do I know.
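For comparison, a blocking variant along the lines of the x86 behaviour
could look roughly like the sketch below; it assumes a
mutex_lock_killable_nest_lock() counterpart to mutex_trylock_nest_lock()
(hypothetical here, not part of this patch):

	/* Hypothetical blocking counterpart: waits (killably) instead of failing */
	int kvm_lock_all_vcpus(struct kvm *kvm)
	{
		struct kvm_vcpu *vcpu;
		unsigned long i, j;
		int r;

		lockdep_assert_held(&kvm->lock);

		kvm_for_each_vcpu(i, vcpu, kvm) {
			/* sleep until the vCPU mutex is free, or the task is killed */
			r = mutex_lock_killable_nest_lock(&vcpu->mutex, &kvm->lock);
			if (r)
				goto out_unlock;
		}
		return 0;

	out_unlock:
		kvm_for_each_vcpu(j, vcpu, kvm) {
			if (i == j)
				break;
			mutex_unlock(&vcpu->mutex);
		}
		return r;
	}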
On Thu, 01 May 2025 14:41:26 +0100,
Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Thu, May 01, 2025 at 01:44:28PM +0100, Marc Zyngier wrote:
> > On Thu, 01 May 2025 12:15:52 +0100,
> > Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > > > + */
> > > > > +int kvm_trylock_all_vcpus(struct kvm *kvm)
> > > > > +{
> > > > > + struct kvm_vcpu *vcpu;
> > > > > + unsigned long i, j;
> > > > > +
> > > > > + kvm_for_each_vcpu(i, vcpu, kvm)
> > > > > + if (!mutex_trylock_nest_lock(&vcpu->mutex, &kvm->lock))
> > >
> > > This one includes an assertion that kvm->lock is actually held.
> >
> > Ah, cunning. Thanks.
> >
> > > That said, I'm not at all sure what the purpose of all this trylock
> > > stuff is here.
> > >
> > > Can someone explain? Last time I asked someone said something about
> > > multiple VMs, but I don't know enough about kvm to know what that means.
> >
> > Multiple VMs? That'd be real fun. Not.
> >
> > > Are those vcpu->mutex another class for other VMs? Or what gives?
> >
> > Nah. This is firmly single VM.
> >
> > The purpose of this contraption is that there are some rare cases
> > where we need to make sure that if we update some global state, either
> > all the vcpus of a VM see it, or none of them do.
> >
> > For these cases, the guarantee comes from luserspace, and it gives the
> > pinky promise that none of the vcpus are running at that point. But
> > being of a suspicious nature, we assert that this is true by trying to
> > take all the vcpu mutexes in one go. This will fail if a vcpu is
> > running, as KVM itself takes the vcpu mutex before doing anything.
> >
> > Similar requirement exists if we need to synthesise some state for
> > userspace from all the individual vcpu states.
>
> Ah, okay. Because x86 is simply doing mutex_lock() instead of
> mutex_trylock() -- which would end up waiting for this activity to
> subside I suppose.
>
> Hence the use of the killable variant I suppose, for when they get tired
> of waiting.
Yeah, I remember some debate around that when this refactoring was
first posted. I quickly paged it out.
> If all the architectures are basically doing the same thing, it might
> make sense to unify this particular behaviour. But what do I know.
I don't know either. The trylock behaviour has been there since day-1
on the arm side, and changing it would have userspace visible effects.
So I'm pretty keen on preserving it, warts and all. The last thing I
need is a VMM person hitting my inbox on the grounds that their toy is
broken.
On the other hand, we're talking about virtualisation, so everything
is more or less broken by design...
M.
--
Without deviation from the norm, progress is not possible.