Set the magic BP_SPEC_REDUCE bit to mitigate SRSO when running VMs if and
only if KVM has at least one active VM. Leaving the bit set at all times
unfortunately degrades performance by a wee bit more than expected.
Use a dedicated spinlock and counter instead of hooking virtualization
enablement, as changing the behavior of kvm.enable_virt_at_load based on
SRSO_BP_SPEC_REDUCE is painful, and has its own drawbacks, e.g. could
result in performance issues for flows that are sensitive to VM creation
latency.
Defer setting BP_SPEC_REDUCE until VMRUN is imminent to avoid impacting
performance on CPUs that aren't running VMs, e.g. if a setup is using
housekeeping CPUs. Setting BP_SPEC_REDUCE in task context, i.e. without
blasting IPIs to all CPUs, also helps avoid serializing 1<=>N transitions
without incurring a gross amount of complexity (see the Link for details
on how ugly coordinating via IPIs gets).
Link: https://lore.kernel.org/all/aBOnzNCngyS_pQIW@google.com
Fixes: 8442df2b49ed ("x86/bugs: KVM: Add support for SRSO_MSR_FIX")
Reported-by: Michael Larabel <Michael@michaellarabel.com>
Closes: https://www.phoronix.com/review/linux-615-amd-regression
Cc: Borislav Petkov <bp@alien8.de>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
v2: Defer setting BP_SPEC_REDUCE until VMRUN is imminent, which in turn
allows for eliding the lock on 0<=>1 transitions as there is no race
with CPUs doing VMRUN before receiving the IPI to set the bit, and
having multiple tasks take the lock during svm_srso_vm_init() is a-ok.
v1: https://lore.kernel.org/all/20250502223456.887618-1-seanjc@google.com
arch/x86/kvm/svm/svm.c | 71 ++++++++++++++++++++++++++++++++++++++----
arch/x86/kvm/svm/svm.h | 2 ++
2 files changed, 67 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index cc1c721ba067..15f7a0703c16 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -607,9 +607,6 @@ static void svm_disable_virtualization_cpu(void)
kvm_cpu_svm_disable();
amd_pmu_disable_virt();
-
- if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
- msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
}
static int svm_enable_virtualization_cpu(void)
@@ -687,9 +684,6 @@ static int svm_enable_virtualization_cpu(void)
rdmsr(MSR_TSC_AUX, sev_es_host_save_area(sd)->tsc_aux, msr_hi);
}
- if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
- msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
-
return 0;
}
@@ -1518,6 +1512,63 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu)
__free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
}
+#ifdef CONFIG_CPU_MITIGATIONS
+static DEFINE_SPINLOCK(srso_lock);
+static atomic_t srso_nr_vms;
+
+static void svm_srso_clear_bp_spec_reduce(void *ign)
+{
+ struct svm_cpu_data *sd = this_cpu_ptr(&svm_data);
+
+ if (!sd->bp_spec_reduce_set)
+ return;
+
+ msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
+ sd->bp_spec_reduce_set = false;
+}
+
+static void svm_srso_vm_destroy(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
+ return;
+
+ if (atomic_dec_return(&srso_nr_vms))
+ return;
+
+ guard(spinlock)(&srso_lock);
+
+ /*
+ * Verify a new VM didn't come along, acquire the lock, and increment
+ * the count before this task acquired the lock.
+ */
+ if (atomic_read(&srso_nr_vms))
+ return;
+
+ on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1);
+}
+
+static void svm_srso_vm_init(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
+ return;
+
+ /*
+ * Acquire the lock on 0 => 1 transitions to ensure a potential 1 => 0
+ * transition, i.e. destroying the last VM, is fully complete, e.g. so
+ * that a delayed IPI doesn't clear BP_SPEC_REDUCE after a vCPU runs.
+ */
+ if (atomic_inc_not_zero(&srso_nr_vms))
+ return;
+
+ guard(spinlock)(&srso_lock);
+
+ atomic_inc(&srso_nr_vms);
+}
+#else
+static void svm_srso_vm_init(void) { }
+static void svm_srso_vm_destroy(void) { }
+#endif
+
static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1550,6 +1601,11 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
(!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm)))
kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
+ if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE) &&
+ !sd->bp_spec_reduce_set) {
+ sd->bp_spec_reduce_set = true;
+ msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
+ }
svm->guest_state_loaded = true;
}
@@ -5036,6 +5092,8 @@ static void svm_vm_destroy(struct kvm *kvm)
{
avic_vm_destroy(kvm);
sev_vm_destroy(kvm);
+
+ svm_srso_vm_destroy();
}
static int svm_vm_init(struct kvm *kvm)
@@ -5061,6 +5119,7 @@ static int svm_vm_init(struct kvm *kvm)
return ret;
}
+ svm_srso_vm_init();
return 0;
}
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index d4490eaed55d..f16b068c4228 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -335,6 +335,8 @@ struct svm_cpu_data {
u32 next_asid;
u32 min_asid;
+ bool bp_spec_reduce_set;
+
struct vmcb *save_area;
unsigned long save_area_pa;
base-commit: 45eb29140e68ffe8e93a5471006858a018480a45
--
2.49.0.967.g6a0df3ecc3-goog
On Mon, 05 May 2025 11:03:00 -0700, Sean Christopherson wrote:
> Set the magic BP_SPEC_REDUCE bit to mitigate SRSO when running VMs if and
> only if KVM has at least one active VM. Leaving the bit set at all times
> unfortunately degrades performance by a wee bit more than expected.
>
> Use a dedicated spinlock and counter instead of hooking virtualization
> enablement, as changing the behavior of kvm.enable_virt_at_load based on
> SRSO_BP_SPEC_REDUCE is painful, and has its own drawbacks, e.g. could
> result in performance issues for flows that are sensitive to VM creation
> latency.
>
> [...]
Applied to kvm-x86 fixes. Assuming -next doesn't explode overnight, I'll get
a pull request sent to Paolo tomorrow.
[1/1] KVM: SVM: Set/clear SRSO's BP_SPEC_REDUCE on 0 <=> 1 VM count transitions
https://github.com/kvm-x86/linux/commit/e3417ab75ab2
--
https://github.com/kvm-x86/linux/tree/next
On Mon, May 05, 2025 at 11:03:00AM -0700, Sean Christopherson wrote:
> Set the magic BP_SPEC_REDUCE bit to mitigate SRSO when running VMs if and
> only if KVM has at least one active VM. Leaving the bit set at all times
> unfortunately degrades performance by a wee bit more than expected.
>
> Use a dedicated spinlock and counter instead of hooking virtualization
> enablement, as changing the behavior of kvm.enable_virt_at_load based on
> SRSO_BP_SPEC_REDUCE is painful, and has its own drawbacks, e.g. could
> result in performance issues for flows that are sensitive to VM creation
> latency.
>
> Defer setting BP_SPEC_REDUCE until VMRUN is imminent to avoid impacting
> performance on CPUs that aren't running VMs, e.g. if a setup is using
> housekeeping CPUs. Setting BP_SPEC_REDUCE in task context, i.e. without
> blasting IPIs to all CPUs, also helps avoid serializing 1<=>N transitions
> without incurring a gross amount of complexity (see the Link for details
> on how ugly coordinating via IPIs gets).
>
> Link: https://lore.kernel.org/all/aBOnzNCngyS_pQIW@google.com
> Fixes: 8442df2b49ed ("x86/bugs: KVM: Add support for SRSO_MSR_FIX")
I guess
Cc: <stable@kernel.org>
as the above is in 6.14.
> Reported-by: Michael Larabel <Michael@michaellarabel.com>
> Closes: https://www.phoronix.com/review/linux-615-amd-regression
> Cc: Borislav Petkov <bp@alien8.de>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
LGTM, seems to work too on my machine.
Tested-by: Borislav Petkov (AMD) <bp@alien8.de>
Thx for sticking with this and improving it!
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On Mon, May 05, 2025 at 11:03:00AM -0700, Sean Christopherson wrote:
> Set the magic BP_SPEC_REDUCE bit to mitigate SRSO when running VMs if and
> only if KVM has at least one active VM. Leaving the bit set at all times
> unfortunately degrades performance by a wee bit more than expected.
>
> Use a dedicated spinlock and counter instead of hooking virtualization
> enablement, as changing the behavior of kvm.enable_virt_at_load based on
> SRSO_BP_SPEC_REDUCE is painful, and has its own drawbacks, e.g. could
> result in performance issues for flows that are sensitive to VM creation
> latency.
>
> Defer setting BP_SPEC_REDUCE until VMRUN is imminent to avoid impacting
> performance on CPUs that aren't running VMs, e.g. if a setup is using
> housekeeping CPUs. Setting BP_SPEC_REDUCE in task context, i.e. without
> blasting IPIs to all CPUs, also helps avoid serializing 1<=>N transitions
> without incurring a gross amount of complexity (see the Link for details
> on how ugly coordinating via IPIs gets).
>
> Link: https://lore.kernel.org/all/aBOnzNCngyS_pQIW@google.com
> Fixes: 8442df2b49ed ("x86/bugs: KVM: Add support for SRSO_MSR_FIX")
> Reported-by: Michael Larabel <Michael@michaellarabel.com>
> Closes: https://www.phoronix.com/review/linux-615-amd-regression
> Cc: Borislav Petkov <bp@alien8.de>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>
> v2: Defer setting BP_SPEC_REDUCE until VMRUN is imminent, which in turn
> allows for eliding the lock on 0<=>1 transitions as there is no race
> with CPUs doing VMRUN before receiving the IPI to set the bit, and
> having multiple tasks take the lock during svm_srso_vm_init() is a-ok.
>
> v1: https://lore.kernel.org/all/20250502223456.887618-1-seanjc@google.com
>
> arch/x86/kvm/svm/svm.c | 71 ++++++++++++++++++++++++++++++++++++++----
> arch/x86/kvm/svm/svm.h | 2 ++
> 2 files changed, 67 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index cc1c721ba067..15f7a0703c16 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -607,9 +607,6 @@ static void svm_disable_virtualization_cpu(void)
> kvm_cpu_svm_disable();
>
> amd_pmu_disable_virt();
> -
> - if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
> - msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
> }
>
> static int svm_enable_virtualization_cpu(void)
> @@ -687,9 +684,6 @@ static int svm_enable_virtualization_cpu(void)
> rdmsr(MSR_TSC_AUX, sev_es_host_save_area(sd)->tsc_aux, msr_hi);
> }
>
> - if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
> - msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
> -
> return 0;
> }
>
> @@ -1518,6 +1512,63 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu)
> __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
> }
>
> +#ifdef CONFIG_CPU_MITIGATIONS
> +static DEFINE_SPINLOCK(srso_lock);
> +static atomic_t srso_nr_vms;
> +
> +static void svm_srso_clear_bp_spec_reduce(void *ign)
> +{
> + struct svm_cpu_data *sd = this_cpu_ptr(&svm_data);
> +
> + if (!sd->bp_spec_reduce_set)
> + return;
> +
> + msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
> + sd->bp_spec_reduce_set = false;
> +}
> +
> +static void svm_srso_vm_destroy(void)
> +{
> + if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
> + return;
> +
> + if (atomic_dec_return(&srso_nr_vms))
> + return;
> +
> + guard(spinlock)(&srso_lock);
> +
> + /*
> + * Verify a new VM didn't come along, acquire the lock, and increment
> + * the count before this task acquired the lock.
> + */
> + if (atomic_read(&srso_nr_vms))
> + return;
> +
> + on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1);
Just a passing-by comment. I get worried about sending IPIs while
holding a spinlock because if someone ever tries to hold that spinlock
with IRQs disabled, it may cause a deadlock.
This is not the case for this lock, but it's not obvious (at least to
me) that holding it in a different code path that doesn't send IPIs with
IRQs disabled could cause a problem.
You could add a comment, convert it to a mutex to make this scenario
impossible, or dismiss my comment as being too paranoid/ridiculous :)
On Tue, May 06, 2025, Yosry Ahmed wrote:
> On Mon, May 05, 2025 at 11:03:00AM -0700, Sean Christopherson wrote:
> > +static void svm_srso_vm_destroy(void)
> > +{
> > + if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
> > + return;
> > +
> > + if (atomic_dec_return(&srso_nr_vms))
> > + return;
> > +
> > + guard(spinlock)(&srso_lock);
> > +
> > + /*
> > + * Verify a new VM didn't come along, acquire the lock, and increment
> > + * the count before this task acquired the lock.
> > + */
> > + if (atomic_read(&srso_nr_vms))
> > + return;
> > +
> > + on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1);
>
> Just a passing-by comment. I get worried about sending IPIs while
> holding a spinlock because if someone ever tries to hold that spinlock
> with IRQs disabled, it may cause a deadlock.
>
> This is not the case for this lock, but it's not obvious (at least to
> me) that holding it in a different code path that doesn't send IPIs with
> IRQs disabled could cause a problem.
>
> You could add a comment, convert it to a mutex to make this scenario
> impossible,
Using a mutex doesn't make deadlock impossible, it's still perfectly legal to
disable IRQs while holding a mutex.
Similarly, I don't want to add a comment, because there is absolutely nothing
special/unique about this situation/lock. E.g. KVM has tens of calls to
smp_call_function_many_cond() while holding a spinlock equivalent, in the form
of kvm_make_all_cpus_request() while holding mmu_lock.
smp_call_function_many_cond() already asserts that IRQs are enabled, so I have
zero concerns about this flow breaking in the future.
> or dismiss my comment as being too paranoid/ridiculous :)
I wouldn't say your thought process is too paranoid; when writing the code, I had
to pause and think to remember whether or not using on_each_cpu() while holding a
spinlock is allowed. But I do think the conclusion is wrong :-)
On Tue, May 06, 2025 at 07:16:06AM -0700, Sean Christopherson wrote:
> On Tue, May 06, 2025, Yosry Ahmed wrote:
> > On Mon, May 05, 2025 at 11:03:00AM -0700, Sean Christopherson wrote:
> > > +static void svm_srso_vm_destroy(void)
> > > +{
> > > + if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
> > > + return;
> > > +
> > > + if (atomic_dec_return(&srso_nr_vms))
> > > + return;
> > > +
> > > + guard(spinlock)(&srso_lock);
> > > +
> > > + /*
> > > + * Verify a new VM didn't come along, acquire the lock, and increment
> > > + * the count before this task acquired the lock.
> > > + */
> > > + if (atomic_read(&srso_nr_vms))
> > > + return;
> > > +
> > > + on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1);
> >
> > Just a passing-by comment. I get worried about sending IPIs while
> > holding a spinlock because if someone ever tries to hold that spinlock
> > with IRQs disabled, it may cause a deadlock.
> >
> > This is not the case for this lock, but it's not obvious (at least to
> > me) that holding it in a different code path that doesn't send IPIs with
> > IRQs disabled could cause a problem.
> >
> > You could add a comment, convert it to a mutex to make this scenario
> > impossible,
>
> Using a mutex doesn't make deadlock impossible, it's still perfectly legal to
> disable IRQs while holding a mutex.
Right, but it's illegal to hold a mutex while disabling IRQs. In this
case, if the other CPU is already holding the lock then there's no risk
of deadlock, right?
>
> Similarly, I don't want to add a comment, because there is absolutely nothing
> special/unique about this situation/lock. E.g. KVM has tens of calls to
> smp_call_function_many_cond() while holding a spinlock equivalent, in the form
> of kvm_make_all_cpus_request() while holding mmu_lock.
Agreed that it's not a unique situation at all. Ideally we'd have some
debugging (lockdep?) magic that identifies that an IPI is being sent
while a lock is held, and that this specific lock is never spun on
with IRQs disabled.
>
> smp_call_function_many_cond() already asserts that IRQs are enabled, so I have
> zero concerns about this flow breaking in the future.
That doesn't really help tho, the problem is if another CPU spins on the
lock with IRQs disabled, regardless of whether the sender has IRQs enabled.
Basically if
CPU 1 acquires the lock and sends an IPI while CPU 2 disables IRQs and
spins on the lock.
>
> > or dismiss my comment as being too paranoid/ridiculous :)
>
> I wouldn't say your thought process is too paranoid; when writing the code, I had
> to pause and think to remember whether or not using on_each_cpu() while holding a
> spinlock is allowed. But I do think the conclusion is wrong :-)
That's fair. I think protection against this should be done more
generically as I mentioned earlier, but it felt like it would be
easy-ish to side-step it in this case.
On Tue, May 06, 2025, Yosry Ahmed wrote:
> On Tue, May 06, 2025 at 07:16:06AM -0700, Sean Christopherson wrote:
> > On Tue, May 06, 2025, Yosry Ahmed wrote:
> > > On Mon, May 05, 2025 at 11:03:00AM -0700, Sean Christopherson wrote:
> > > > +static void svm_srso_vm_destroy(void)
> > > > +{
> > > > + if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
> > > > + return;
> > > > +
> > > > + if (atomic_dec_return(&srso_nr_vms))
> > > > + return;
> > > > +
> > > > + guard(spinlock)(&srso_lock);
> > > > +
> > > > + /*
> > > > + * Verify a new VM didn't come along, acquire the lock, and increment
> > > > + * the count before this task acquired the lock.
> > > > + */
> > > > + if (atomic_read(&srso_nr_vms))
> > > > + return;
> > > > +
> > > > + on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1);
> > >
> > > Just a passing-by comment. I get worried about sending IPIs while
> > > holding a spinlock because if someone ever tries to hold that spinlock
> > > with IRQs disabled, it may cause a deadlock.
> > >
> > > This is not the case for this lock, but it's not obvious (at least to
> > > me) that holding it in a different code path that doesn't send IPIs with
> > > IRQs disabled could cause a problem.
> > >
> > > You could add a comment, convert it to a mutex to make this scenario
> > > impossible,
> >
> > Using a mutex doesn't make deadlock impossible, it's still perfectly legal to
> > disable IRQs while holding a mutex.
>
> Right, but it's illegal to hold a mutex while disabling IRQs.
Nit on the wording: it's illegal to take a mutex while IRQs are disabled. Disabling
IRQs while already holding a mutex is fine.
And it's also illegal to take a spinlock while IRQs are disabled, because spinlocks
become sleepable mutexes with PREEMPT_RT=y. While PREEMPT_RT=y isn't super common,
people do run KVM with PREEMPT_RT=y, and I'm guessing bots/CI would trip any such
violation quite quickly.
E.g. with IRQs disabled around the guard(spinlock)(&srso_lock):
BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48
in_atomic(): 0, irqs_disabled(): 1, non_block: 0, pid: 2799, name: qemu
preempt_count: 0, expected: 0
RCU nest depth: 0, expected: 0
1 lock held by qemu/2799:
#0: ffffffff8263f898 (srso_lock){....}-{3:3}, at: svm_vm_destroy+0x47/0xa0
irq event stamp: 9090
hardirqs last enabled at (9089): [<ffffffff81414087>] vprintk_store+0x467/0x4d0
hardirqs last disabled at (9090): [<ffffffff812fd1ce>] svm_vm_destroy+0x5e/0xa0
softirqs last enabled at (0): [<ffffffff8137585c>] copy_process+0xa1c/0x29f0
softirqs last disabled at (0): [<0000000000000000>] 0x0
Call Trace:
<TASK>
dump_stack_lvl+0x57/0x80
__might_resched.cold+0xcc/0xde
rt_spin_lock+0x5b/0x170
svm_vm_destroy+0x47/0xa0
kvm_destroy_vm+0x180/0x310
kvm_vm_release+0x1d/0x30
__fput+0x10d/0x2f0
task_work_run+0x58/0x90
do_exit+0x325/0xa80
do_group_exit+0x32/0xa0
get_signal+0xb5b/0xbb0
arch_do_signal_or_restart+0x29/0x230
syscall_exit_to_user_mode+0xea/0x180
do_syscall_64+0x7a/0x220
entry_SYSCALL_64_after_hwframe+0x76/0x7e
RIP: 0033:0x7fb50ae7fc4e
</TASK>
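(The splat was presumably generated with a temporary hack along these lines;
the snippet is illustrative only and exists nowhere in the patch:)

	/*
	 * Hypothetical hack: take srso_lock with IRQs disabled.  With
	 * PREEMPT_RT=y the guard expands to rt_spin_lock(), a sleeping
	 * lock, which trips the might_sleep() check shown above.
	 */
	local_irq_disable();
	guard(spinlock)(&srso_lock);
	local_irq_enable();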
> In this case, if the other CPU is already holding the lock then there's no
> risk of deadlock, right?
Not on srso_lock, but there's still deadlock potential on the locks used to protect
the call_function_data structure.
> > Similarly, I don't want to add a comment, because there is absolutely nothing
> > special/unique about this situation/lock. E.g. KVM has tens of calls to
> > smp_call_function_many_cond() while holding a spinlock equivalent, in the form
> > of kvm_make_all_cpus_request() while holding mmu_lock.
>
> Agreed that it's not a unique situation at all. Ideally we'd have some
> debugging (lockdep?) magic that identifies that an IPI is being sent
> while a lock is held, and that this specific lock is never spinned on
> with IRQs disabled.
Sleepable spinlocks aside, the lockdep_assert_irqs_enabled() in
smp_call_function_many_cond() already provides sufficient coverage for that
case. And if code is using some other form of IPI communication *and* taking raw
spinlocks, then I think it goes without saying that developers would need to be
very, very careful.
> > smp_call_function_many_cond() already asserts that IRQs are enabled, so I have
> > zero concerns about this flow breaking in the future.
>
> That doesn't really help tho, the problem is if another CPU spins on the
> lock with IRQs disabled, regardless of whether the sender has IRQs enabled.
> Basically if
> CPU 1 acquires the lock and sends an IPI while CPU 2 disables IRQs and
> spins on the lock.
Given that svm_srso_vm_destroy() is guaranteed to call on_each_cpu() with the
lock held at some point, I'm completely comfortable relying on its lockdep
assertion.
> > > or dismiss my comment as being too paranoid/ridiculous :)
> >
> > I wouldn't say your thought process is too paranoid; when writing the code, I had
> > to pause and think to remember whether or not using on_each_cpu() while holding a
> > spinlock is allowed. But I do think the conclusion is wrong :-)
>
> That's fair. I think protection against this should be done more generically
> as I mentioned earlier, but it felt like it would be easy-ish to side-step it
> in this case.
Eh, modifying this code in such a way that it could deadlock without lockdep
noticing would likely send up a comical number of red flags during code review.
On Tue, May 06, 2025 at 08:57:36AM -0700, Sean Christopherson wrote:
> On Tue, May 06, 2025, Yosry Ahmed wrote:
> > On Tue, May 06, 2025 at 07:16:06AM -0700, Sean Christopherson wrote:
> > > On Tue, May 06, 2025, Yosry Ahmed wrote:
> > > > On Mon, May 05, 2025 at 11:03:00AM -0700, Sean Christopherson wrote:
> > > > > +static void svm_srso_vm_destroy(void)
> > > > > +{
> > > > > + if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
> > > > > + return;
> > > > > +
> > > > > + if (atomic_dec_return(&srso_nr_vms))
> > > > > + return;
> > > > > +
> > > > > + guard(spinlock)(&srso_lock);
> > > > > +
> > > > > + /*
> > > > > + * Verify a new VM didn't come along, acquire the lock, and increment
> > > > > + * the count before this task acquired the lock.
> > > > > + */
> > > > > + if (atomic_read(&srso_nr_vms))
> > > > > + return;
> > > > > +
> > > > > + on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1);
> > > >
> > > > Just a passing-by comment. I get worried about sending IPIs while
> > > > holding a spinlock because if someone ever tries to hold that spinlock
> > > > with IRQs disabled, it may cause a deadlock.
> > > >
> > > > This is not the case for this lock, but it's not obvious (at least to
> > > > me) that holding it in a different code path that doesn't send IPIs with
> > > > IRQs disabled could cause a problem.
> > > >
> > > > You could add a comment, convert it to a mutex to make this scenario
> > > > impossible,
> > >
> > > Using a mutex doesn't make deadlock impossible, it's still perfectly legal to
> > > disable IRQs while holding a mutex.
> >
> > Right, but it's illegal to hold a mutex while disabling IRQs.
>
> Nit on the wording: it's illegal to take a mutex while IRQs are disabled. Disabling
> IRQs while already holding a mutex is fine.
Yes :)
>
> And it's also illegal to take a spinlock while IRQs are disabled, because spinlocks
> become sleepable mutexes with PREEMPT_RT=y. While PREEMPT_RT=y isn't super common,
> people do run KVM with PREEMPT_RT=y, and I'm guessing bots/CI would trip any such
> violation quite quickly.
But I think spin_lock_irq() and friends aren't a violation with
PREEMPT_RT=y, so these wouldn't trip bots/CI AFAICT.
>
> E.g. with IRQs disabled around the guard(spinlock)(&srso_lock):
>
> BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48
> in_atomic(): 0, irqs_disabled(): 1, non_block: 0, pid: 2799, name: qemu
> preempt_count: 0, expected: 0
> RCU nest depth: 0, expected: 0
> 1 lock held by qemu/2799:
> #0: ffffffff8263f898 (srso_lock){....}-{3:3}, at: svm_vm_destroy+0x47/0xa0
> irq event stamp: 9090
> hardirqs last enabled at (9089): [<ffffffff81414087>] vprintk_store+0x467/0x4d0
> hardirqs last disabled at (9090): [<ffffffff812fd1ce>] svm_vm_destroy+0x5e/0xa0
> softirqs last enabled at (0): [<ffffffff8137585c>] copy_process+0xa1c/0x29f0
> softirqs last disabled at (0): [<0000000000000000>] 0x0
> Call Trace:
> <TASK>
> dump_stack_lvl+0x57/0x80
> __might_resched.cold+0xcc/0xde
> rt_spin_lock+0x5b/0x170
> svm_vm_destroy+0x47/0xa0
> kvm_destroy_vm+0x180/0x310
> kvm_vm_release+0x1d/0x30
> __fput+0x10d/0x2f0
> task_work_run+0x58/0x90
> do_exit+0x325/0xa80
> do_group_exit+0x32/0xa0
> get_signal+0xb5b/0xbb0
> arch_do_signal_or_restart+0x29/0x230
> syscall_exit_to_user_mode+0xea/0x180
> do_syscall_64+0x7a/0x220
> entry_SYSCALL_64_after_hwframe+0x76/0x7e
> RIP: 0033:0x7fb50ae7fc4e
> </TASK>
>
> > In this case, if the other CPU is already holding the lock then there's no
> > risk of deadlock, right?
>
> Not on srso_lock, but there's still deadlock potential on the locks used to protect
> the call_function_data structure.
>
> > > Similarly, I don't want to add a comment, because there is absolutely nothing
> > > special/unique about this situation/lock. E.g. KVM has tens of calls to
> > > smp_call_function_many_cond() while holding a spinlock equivalent, in the form
> > > of kvm_make_all_cpus_request() while holding mmu_lock.
> >
> > Agreed that it's not a unique situation at all. Ideally we'd have some
> > debugging (lockdep?) magic that identifies that an IPI is being sent
> while a lock is held, and that this specific lock is never spun on
> > with IRQs disabled.
>
> Sleepable spinlocks aside, the lockdep_assert_irqs_enabled() in
> smp_call_function_many_cond() already provides sufficient coverage for that
> case. And if code is using some other form of IPI communication *and* taking raw
> spinlocks, then I think it goes without saying that developers would need to be
> very, very careful.
I think we are not talking about the same thing, or I am
misunderstanding you. The lockdep_assert_irqs_enabled() assertion in
smp_call_function_many_cond() does not protect against this case AFAICT.
Basically imagine that a new code path is added that does:
spin_lock_irq(&srso_lock);
/* Some trivial logic, no IPI sending */
spin_unlock_irq(&srso_lock);
I believe spin_lock_irq() will disable IRQs (at least on some setups)
then spin on the lock.
Now imagine svm_srso_vm_destroy() is already holding the lock and sends
the IPI from CPU 1, while CPU 2 is executing the above code with IRQs
already disabled and spinning on the lock.
This is the deadlock scenario I am talking about. The lockdep assertion
in smp_call_function_many_cond() doesn't help because IRQs are enabled
on CPU 1, the problem is that they are disabled on CPU 2.
Lockdep can detect this by keeping track of the fact that some code
paths acquire the lock with IRQs off while some code paths acquire the
lock and send IPIs, I think.
>
> > > smp_call_function_many_cond() already asserts that IRQs are enabled, so I have
> > > zero concerns about this flow breaking in the future.
> >
> > That doesn't really help tho, the problem is if another CPU spins on the
> > lock with IRQs disabled, regardless of whether the sender has IRQs enabled.
> > Basically if
> > CPU 1 acquires the lock and sends an IPI while CPU 2 disables IRQs and
> > spins on the lock.
>
> Given that svm_srso_vm_destroy() is guaranteed to call on_each_cpu() with the
> lock held at some point, I'm completely comfortable relying on its lockdep
> assertion.
>
> > > > or dismiss my comment as being too paranoid/ridiculous :)
> > >
> > > I wouldn't say your thought process is too paranoid; when writing the code, I had
> > > to pause and think to remember whether or not using on_each_cpu() while holding a
> > > spinlock is allowed. But I do think the conclusion is wrong :-)
> >
> > That's fair. I think protection against this should be done more generically
> > as I mentioned earlier, but it felt like it would be easy-ish to side-step it
> > in this case.
>
> Eh, modifying this code in such a way that it could deadlock without lockdep
> noticing would likely send up a comical number of red flags during code review.
It just takes another code path using spin_lock_irq() or friends to
deadlock AFAICT.
Anyway, this has gotten sidetracked and I am probably taking more of your time
than I originally intended, I was just making an observation more-or-less :)
On Wed, May 07, 2025, Yosry Ahmed wrote:
> On Tue, May 06, 2025 at 08:57:36AM -0700, Sean Christopherson wrote:
> > Sleepable spinlocks aside, the lockdep_assert_irqs_enabled() in
> > smp_call_function_many_cond() already provides sufficient coverage for that
> > case. And if code is using some other form of IPI communication *and* taking raw
> > spinlocks, then I think it goes without saying that developers would need to be
> > very, very careful.
>
> I think we are not talking about the same thing, or I am
> misunderstanding you. The lockdep_assert_irqs_enabled() assertion in
> smp_call_function_many_cond() does not protect against this case AFAICT.
>
> Basically imagine that a new code path is added that does:
>
> spin_lock_irq(&srso_lock);
> /* Some trivial logic, no IPI sending */
> spin_unlock_irq(&srso_lock);
>
> I believe spin_lock_irq() will disable IRQs (at least on some setups)
> then spin on the lock.
Yes, because the most common use case for spin_lock_irq() is to prevent
deadlock due to the lock being taken in IRQ context.
> Now imagine svm_srso_vm_destroy() is already holding the lock and sends
> the IPI from CPU 1, while CPU 2 is executing the above code with IRQs
> already disabled and spinning on the lock.
>
> This is the deadlock scenario I am talking about. The lockdep assertion
> in smp_call_function_many_cond() doesn't help because IRQs are enabled
> on CPU 1, the problem is that they are disabled on CPU 2.
>
> Lockdep can detect this by keeping track of the fact that some code
> paths acquire the lock with IRQs off while some code paths acquire the
> lock and send IPIs, I think.
I understand the scenario, I just don't see any meaningful risk in this case,
which in turn means I don't see any reason to use an inferior lock type (for
this particular case) to protect the count.
spin_lock_irq() isn't a tool that's used willy-nilly, and the usage of
srso_lock is extremely limited.  If we manage to merge code that does
spin_lock_irq(&srso_lock), you have my full permission to mock my
ineptitude :-)