So,
in the interest of finally making some progress here I'd like to commit this
below (will test it one more time just in case but it should work :-P). It is
simple and straight-forward and doesn't need an IBPB when the bit gets
cleared.
A potential future improvement is David's suggestion that there could be a way
to track when the first guest gets started: we set the bit then, we make
sure the bit gets set on each logical CPU when the guests migrate across the
machine, and when the *last* guest exits, that bit gets cleared again.
It all depends on how much machinery is additionally needed, how ugly it
becomes, and how noticeable the perf impact even is from simply setting
that bit blindly on every CPU...
But something we can chase later, once *some* fix is there first.
Thx.
---
Add support for
CPUID Fn8000_0021_EAX[31] (SRSO_MSR_FIX). If this bit is 1, it
indicates that software may use MSR BP_CFG[BpSpecReduce] to mitigate
SRSO.
Enable BpSpecReduce to mitigate SRSO across guest/host boundaries.
Switch back to setting the bit when virtualization is enabled and to
clearing it when virtualization is disabled: using an MSR slot would clear
the bit when the guest exits, and any training the guest has done could
then influence the host kernel once execution enters the kernel but hasn't
VMRUN the guest yet.
More detail in the public thread referenced in the Link tag.
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20241202120416.6054-1-bp@kernel.org
---
Documentation/admin-guide/hw-vuln/srso.rst | 13 ++++++++++++
arch/x86/include/asm/cpufeatures.h | 4 ++++
arch/x86/include/asm/msr-index.h | 1 +
arch/x86/kernel/cpu/bugs.c | 24 ++++++++++++++++++----
arch/x86/kvm/svm/svm.c | 6 ++++++
arch/x86/lib/msr.c | 2 ++
6 files changed, 46 insertions(+), 4 deletions(-)
diff --git a/Documentation/admin-guide/hw-vuln/srso.rst b/Documentation/admin-guide/hw-vuln/srso.rst
index 2ad1c05b8c88..66af95251a3d 100644
--- a/Documentation/admin-guide/hw-vuln/srso.rst
+++ b/Documentation/admin-guide/hw-vuln/srso.rst
@@ -104,7 +104,20 @@ The possible values in this file are:
(spec_rstack_overflow=ibpb-vmexit)
+ * 'Mitigation: Reduced Speculation':
+ This mitigation gets automatically enabled when the above one "IBPB on
+ VMEXIT" has been selected and the CPU supports the BpSpecReduce bit.
+
+ It gets automatically enabled on machines which have the
+ SRSO_USER_KERNEL_NO=1 CPUID bit. In that case, the code logic is to switch
+ to the above =ibpb-vmexit mitigation because the user/kernel boundary is
+ not affected anymore and thus "safe RET" is not needed.
+
+ After enabling the IBPB on VMEXIT mitigation option, the BpSpecReduce bit
+ is detected (functionality present on all such machines) and that
+ practically overrides IBPB on VMEXIT as it has a lot less performance
+ impact and takes care of the guest->host attack vector too.
In order to exploit vulnerability, an attacker needs to:
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 508c0dad116b..43653f2704c9 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -468,6 +468,10 @@
#define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* MSR_PRED_CMD[IBPB] flushes all branch type predictions */
#define X86_FEATURE_SRSO_NO (20*32+29) /* CPU is not affected by SRSO */
#define X86_FEATURE_SRSO_USER_KERNEL_NO (20*32+30) /* CPU is not affected by SRSO across user/kernel boundaries */
+#define X86_FEATURE_SRSO_BP_SPEC_REDUCE (20*32+31) /*
+ * BP_CFG[BpSpecReduce] can be used to mitigate SRSO for VMs.
+ * (SRSO_MSR_FIX in the official doc).
+ */
/*
* Extended auxiliary flags: Linux defined - for features scattered in various
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 72765b2fe0d8..d35519b337ba 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -721,6 +721,7 @@
/* Zen4 */
#define MSR_ZEN4_BP_CFG 0xc001102e
+#define MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT 4
#define MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT 5
/* Fam 19h MSRs */
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index a5d0998d7604..1d7afc40f227 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -2522,6 +2522,7 @@ enum srso_mitigation {
SRSO_MITIGATION_SAFE_RET,
SRSO_MITIGATION_IBPB,
SRSO_MITIGATION_IBPB_ON_VMEXIT,
+ SRSO_MITIGATION_BP_SPEC_REDUCE,
};
enum srso_mitigation_cmd {
@@ -2539,7 +2540,8 @@ static const char * const srso_strings[] = {
[SRSO_MITIGATION_MICROCODE] = "Vulnerable: Microcode, no safe RET",
[SRSO_MITIGATION_SAFE_RET] = "Mitigation: Safe RET",
[SRSO_MITIGATION_IBPB] = "Mitigation: IBPB",
- [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only"
+ [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only",
+ [SRSO_MITIGATION_BP_SPEC_REDUCE] = "Mitigation: Reduced Speculation"
};
static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_NONE;
@@ -2578,7 +2580,7 @@ static void __init srso_select_mitigation(void)
srso_cmd == SRSO_CMD_OFF) {
if (boot_cpu_has(X86_FEATURE_SBPB))
x86_pred_cmd = PRED_CMD_SBPB;
- return;
+ goto out;
}
if (has_microcode) {
@@ -2590,7 +2592,7 @@ static void __init srso_select_mitigation(void)
*/
if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) {
setup_force_cpu_cap(X86_FEATURE_SRSO_NO);
- return;
+ goto out;
}
if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB) {
@@ -2670,6 +2672,12 @@ static void __init srso_select_mitigation(void)
ibpb_on_vmexit:
case SRSO_CMD_IBPB_ON_VMEXIT:
+ if (boot_cpu_has(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) {
+ pr_notice("Reducing speculation to address VM/HV SRSO attack vector.\n");
+ srso_mitigation = SRSO_MITIGATION_BP_SPEC_REDUCE;
+ break;
+ }
+
if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) {
if (has_microcode) {
setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
@@ -2691,7 +2699,15 @@ static void __init srso_select_mitigation(void)
}
out:
- pr_info("%s\n", srso_strings[srso_mitigation]);
+ /*
+ * Clear the feature flag if this mitigation is not selected as that
+ * feature flag controls the BpSpecReduce MSR bit toggling in KVM.
+ */
+ if (srso_mitigation != SRSO_MITIGATION_BP_SPEC_REDUCE)
+ setup_clear_cpu_cap(X86_FEATURE_SRSO_BP_SPEC_REDUCE);
+
+ if (srso_mitigation != SRSO_MITIGATION_NONE)
+ pr_info("%s\n", srso_strings[srso_mitigation]);
}
#undef pr_fmt
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index a713c803a3a3..77ab66c5bb96 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -607,6 +607,9 @@ static void svm_disable_virtualization_cpu(void)
kvm_cpu_svm_disable();
amd_pmu_disable_virt();
+
+ if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
+ msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
}
static int svm_enable_virtualization_cpu(void)
@@ -684,6 +687,9 @@ static int svm_enable_virtualization_cpu(void)
rdmsr(MSR_TSC_AUX, sev_es_host_save_area(sd)->tsc_aux, msr_hi);
}
+ if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
+ msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
+
return 0;
}
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
index 4bf4fad5b148..5a18ecc04a6c 100644
--- a/arch/x86/lib/msr.c
+++ b/arch/x86/lib/msr.c
@@ -103,6 +103,7 @@ int msr_set_bit(u32 msr, u8 bit)
{
return __flip_bit(msr, bit, true);
}
+EXPORT_SYMBOL_GPL(msr_set_bit);
/**
* msr_clear_bit - Clear @bit in a MSR @msr.
@@ -118,6 +119,7 @@ int msr_clear_bit(u32 msr, u8 bit)
{
return __flip_bit(msr, bit, false);
}
+EXPORT_SYMBOL_GPL(msr_clear_bit);
#ifdef CONFIG_TRACEPOINTS
void do_trace_write_msr(unsigned int msr, u64 val, int failed)
--
2.43.0
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
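For reference (not part of the patch above), here is a minimal userspace sketch
for checking the relevant bits on a host: it reads CPUID Fn8000_0021_EAX[31]
(SRSO_MSR_FIX) and, assuming the msr driver is loaded and the tool runs as
root, the BpSpecReduce bit (bit 4) of MSR 0xc001102e on CPU 0. The program and
its names are illustrative only, not something from the patch.

/*
 * Hypothetical host-side check: SRSO_MSR_FIX in CPUID and the current
 * BP_CFG[BpSpecReduce] value on CPU 0 via /dev/cpu/0/msr.
 */
#include <cpuid.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define MSR_ZEN4_BP_CFG                 0xc001102e
#define BP_CFG_BP_SPEC_REDUCE_BIT       4

int main(void)
{
        unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
        uint64_t bp_cfg;
        int fd;

        /* Leaf 0x80000021, subleaf 0; bit 31 of EAX is SRSO_MSR_FIX. */
        if (!__get_cpuid_count(0x80000021, 0, &eax, &ebx, &ecx, &edx)) {
                fprintf(stderr, "CPUID leaf 0x80000021 not supported\n");
                return 1;
        }
        printf("SRSO_MSR_FIX (EAX[31]): %s\n", (eax & (1u << 31)) ? "yes" : "no");

        /* Reading the MSR needs root and CONFIG_X86_MSR (msr module). */
        fd = open("/dev/cpu/0/msr", O_RDONLY);
        if (fd < 0) {
                perror("open /dev/cpu/0/msr");
                return 1;
        }
        if (pread(fd, &bp_cfg, sizeof(bp_cfg), MSR_ZEN4_BP_CFG) != sizeof(bp_cfg)) {
                perror("rdmsr BP_CFG");
                close(fd);
                return 1;
        }
        close(fd);

        printf("BP_CFG[BpSpecReduce] on CPU0: %llu\n",
               (unsigned long long)((bp_cfg >> BP_CFG_BP_SPEC_REDUCE_BIT) & 1));
        return 0;
}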
On Tue, Feb 18, 2025 at 12:13:33PM +0100, Borislav Petkov wrote:
> So,
>
> in the interest of finally making some progress here I'd like to commit this
> below (will test it one more time just in case but it should work :-P). It is
> simple and straight-forward and doesn't need an IBPB when the bit gets
> cleared.
>
> A potential future improvement is David's suggestion that there could be a way
> for tracking when the first guest gets started, we set the bit then, we make
> sure the bit gets set on each logical CPU when the guests migrate across the
> machine and when the *last* guest exists, that bit gets cleared again.
Well, that "simplicity" was short-lived:
https://www.phoronix.com/review/linux-615-amd-regression
Sean, how about this below?
It is hacky and RFC-ish - i.e., don't look too hard at it - but basically
I'm pushing down into arch code the decision whether to enable virt on load.
And it has no effect on anything but machines which have this
SRSO_MSR_FIX (Zen5).
And it seems to work here - the MSR is set only when I create a VM - i.e., as
expected.
Thoughts? Better ideas?
Thx.
---
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 823c0434bbad..6cc8698df1a5 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -16,6 +16,7 @@ BUILD_BUG_ON(1)
KVM_X86_OP(check_processor_compatibility)
KVM_X86_OP(enable_virtualization_cpu)
KVM_X86_OP(disable_virtualization_cpu)
+KVM_X86_OP_OPTIONAL(enable_virt_on_load)
KVM_X86_OP(hardware_unsetup)
KVM_X86_OP(has_emulated_msr)
KVM_X86_OP(vcpu_after_set_cpuid)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3131abcac4f1..c1a29d7fee45 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1664,6 +1664,7 @@ struct kvm_x86_ops {
int (*enable_virtualization_cpu)(void);
void (*disable_virtualization_cpu)(void);
+ bool (*enable_virt_on_load)(void);
cpu_emergency_virt_cb *emergency_disable_virtualization_cpu;
void (*hardware_unsetup)(void);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 67657b3a36ce..dcbba55cb949 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -693,6 +693,16 @@ static int svm_enable_virtualization_cpu(void)
return 0;
}
+static bool svm_enable_virt_on_load(void)
+{
+ bool ret = true;
+
+ if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
+ ret = false;
+
+ return ret;
+}
+
static void svm_cpu_uninit(int cpu)
{
struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
@@ -5082,6 +5092,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.hardware_unsetup = svm_hardware_unsetup,
.enable_virtualization_cpu = svm_enable_virtualization_cpu,
.disable_virtualization_cpu = svm_disable_virtualization_cpu,
+ .enable_virt_on_load = svm_enable_virt_on_load,
.emergency_disable_virtualization_cpu = svm_emergency_disable_virtualization_cpu,
.has_emulated_msr = svm_has_emulated_msr,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4c6553985e75..a09dc8cbd59f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -12576,9 +12576,15 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
}
EXPORT_SYMBOL_GPL(kvm_vcpu_deliver_sipi_vector);
-void kvm_arch_enable_virtualization(void)
+bool kvm_arch_enable_virtualization(bool allow_arch_override)
{
+ if (allow_arch_override)
+ if (!kvm_x86_call(enable_virt_on_load)())
+ return false;
+
cpu_emergency_register_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu);
+
+ return true;
}
void kvm_arch_disable_virtualization(void)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 291d49b9bf05..4353ef54d45d 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1599,7 +1599,7 @@ static inline void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) {}
* kvm_usage_count, i.e. at the beginning of the generic hardware enabling
* sequence, and at the end of the generic hardware disabling sequence.
*/
-void kvm_arch_enable_virtualization(void);
+bool kvm_arch_enable_virtualization(bool);
void kvm_arch_disable_virtualization(void);
/*
* kvm_arch_{enable,disable}_virtualization_cpu() are called on "every" CPU to
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e85b33a92624..0009661dee1d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -143,8 +143,8 @@ static int kvm_no_compat_open(struct inode *inode, struct file *file)
#define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \
.open = kvm_no_compat_open
#endif
-static int kvm_enable_virtualization(void);
-static void kvm_disable_virtualization(void);
+static int kvm_enable_virtualization(bool allow_arch_override);
+static void kvm_disable_virtualization(bool allow_arch_override);
static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
@@ -1187,7 +1187,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
if (r)
goto out_err_no_arch_destroy_vm;
- r = kvm_enable_virtualization();
+ r = kvm_enable_virtualization(false);
if (r)
goto out_err_no_disable;
@@ -1224,7 +1224,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
#endif
out_err_no_mmu_notifier:
- kvm_disable_virtualization();
+ kvm_disable_virtualization(false);
out_err_no_disable:
kvm_arch_destroy_vm(kvm);
out_err_no_arch_destroy_vm:
@@ -1320,7 +1320,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
#endif
kvm_arch_free_vm(kvm);
preempt_notifier_dec();
- kvm_disable_virtualization();
+ kvm_disable_virtualization(false);
mmdrop(mm);
}
@@ -5489,9 +5489,9 @@ static DEFINE_PER_CPU(bool, virtualization_enabled);
static DEFINE_MUTEX(kvm_usage_lock);
static int kvm_usage_count;
-__weak void kvm_arch_enable_virtualization(void)
+__weak bool kvm_arch_enable_virtualization(bool)
{
-
+ return false;
}
__weak void kvm_arch_disable_virtualization(void)
@@ -5589,8 +5589,9 @@ static struct syscore_ops kvm_syscore_ops = {
.shutdown = kvm_shutdown,
};
-static int kvm_enable_virtualization(void)
+static int kvm_enable_virtualization(bool allow_arch_override)
{
+ bool do_init;
int r;
guard(mutex)(&kvm_usage_lock);
@@ -5598,7 +5599,9 @@ static int kvm_enable_virtualization(void)
if (kvm_usage_count++)
return 0;
- kvm_arch_enable_virtualization();
+ do_init = kvm_arch_enable_virtualization(allow_arch_override);
+ if (!do_init)
+ goto out;
r = cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
kvm_online_cpu, kvm_offline_cpu);
@@ -5631,11 +5634,13 @@ static int kvm_enable_virtualization(void)
cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
err_cpuhp:
kvm_arch_disable_virtualization();
+
+out:
--kvm_usage_count;
return r;
}
-static void kvm_disable_virtualization(void)
+static void kvm_disable_virtualization(bool allow_arch_override)
{
guard(mutex)(&kvm_usage_lock);
@@ -5650,7 +5655,7 @@ static void kvm_disable_virtualization(void)
static int kvm_init_virtualization(void)
{
if (enable_virt_at_load)
- return kvm_enable_virtualization();
+ return kvm_enable_virtualization(true);
return 0;
}
@@ -5658,10 +5663,10 @@ static int kvm_init_virtualization(void)
static void kvm_uninit_virtualization(void)
{
if (enable_virt_at_load)
- kvm_disable_virtualization();
+ kvm_disable_virtualization(true);
}
#else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
-static int kvm_enable_virtualization(void)
+static int kvm_enable_virtualization(bool allow_arch_override)
{
return 0;
}
@@ -5671,7 +5676,7 @@ static int kvm_init_virtualization(void)
return 0;
}
-static void kvm_disable_virtualization(void)
+static void kvm_disable_virtualization(bool allow_arch_override)
{
}
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On Tue, Apr 29, 2025, Borislav Petkov wrote:
> On Tue, Feb 18, 2025 at 12:13:33PM +0100, Borislav Petkov wrote:
> > So,
> >
> > in the interest of finally making some progress here I'd like to commit this
> > below (will test it one more time just in case but it should work :-P). It is
> > simple and straight-forward and doesn't need an IBPB when the bit gets
> > cleared.
> >
> > A potential future improvement is David's suggestion that there could be a way
> > for tracking when the first guest gets started, we set the bit then, we make
> > sure the bit gets set on each logical CPU when the guests migrate across the
> > machine and when the *last* guest exists, that bit gets cleared again.
>
> Well, that "simplicity" was short-lived:
>
> https://www.phoronix.com/review/linux-615-amd-regression
LOL.
> Sean, how about this below?
Eww. That's quite painful, and completely disallowing enable_virt_on_load is
undesirable, e.g. for use cases where the host is (almost) exclusively running
VMs.
Best idea I have is to throw in the towel on getting fancy, and just maintain a
dedicated count in SVM.
Alternatively, we could plumb an arch hook into kvm_create_vm() and kvm_destroy_vm()
that's called when KVM adds/deletes a VM from vm_list, and key off vm_list being
empty. But that adds a lot of boilerplate just to avoid a mutex+count.
I haven't tested on a system with X86_FEATURE_SRSO_BP_SPEC_REDUCE, but did verify
the mechanics by inverting the flag.
--
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 30 Apr 2025 15:34:50 -0700
Subject: [PATCH] KVM: SVM: Set/clear SRSO's BP_SPEC_REDUCE on 0 <=> 1 VM count
transitions
Set the magic BP_SPEC_REDUCE bit to mitigate SRSO when running VMs if and
only if KVM has at least one active VM. Leaving the bit set at all times
unfortunately degrades performance by a wee bit more than expected.
Use a dedicated mutex and counter instead of hooking virtualization
enablement, as changing the behavior of kvm.enable_virt_at_load based on
SRSO_BP_SPEC_REDUCE is painful, and has its own drawbacks, e.g. could
result in performance issues for flows that are sensitive to VM creation
latency.
Fixes: 8442df2b49ed ("x86/bugs: KVM: Add support for SRSO_MSR_FIX")
Reported-by: Michael Larabel <Michael@michaellarabel.com>
Closes: https://www.phoronix.com/review/linux-615-amd-regression
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kvm/svm/svm.c | 39 +++++++++++++++++++++++++++++++++------
1 file changed, 33 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index d5d0c5c3300b..fe8866572218 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -607,9 +607,6 @@ static void svm_disable_virtualization_cpu(void)
kvm_cpu_svm_disable();
amd_pmu_disable_virt();
-
- if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
- msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
}
static int svm_enable_virtualization_cpu(void)
@@ -687,9 +684,6 @@ static int svm_enable_virtualization_cpu(void)
rdmsr(MSR_TSC_AUX, sev_es_host_save_area(sd)->tsc_aux, msr_hi);
}
- if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
- msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
-
return 0;
}
@@ -5032,10 +5026,42 @@ static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
sev_vcpu_deliver_sipi_vector(vcpu, vector);
}
+static DEFINE_MUTEX(srso_lock);
+static int srso_nr_vms;
+
+static void svm_toggle_srso_spec_reduce(void *set)
+{
+ if (set)
+ msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
+ else
+ msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
+}
+
+static void svm_srso_add_remove_vm(int count)
+{
+ bool set;
+
+ if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
+ return;
+
+ guard(mutex)(&srso_lock);
+
+ set = !srso_nr_vms;
+ srso_nr_vms += count;
+
+ WARN_ON_ONCE(srso_nr_vms < 0);
+ if (!set && srso_nr_vms)
+ return;
+
+ on_each_cpu(svm_toggle_srso_spec_reduce, (void *)set, 1);
+}
+
static void svm_vm_destroy(struct kvm *kvm)
{
avic_vm_destroy(kvm);
sev_vm_destroy(kvm);
+
+ svm_srso_add_remove_vm(-1);
}
static int svm_vm_init(struct kvm *kvm)
@@ -5061,6 +5087,7 @@ static int svm_vm_init(struct kvm *kvm)
return ret;
}
+ svm_srso_add_remove_vm(1);
return 0;
}
base-commit: f158e1b145f73aae1d3b7e756eb129a15b2b7a90
--
On Wed, Apr 30, 2025 at 04:33:19PM -0700, Sean Christopherson wrote:
> Eww. That's quite painful, and completely disallowing enable_virt_on_load is
> undesirable, e.g. for use cases where the host is (almost) exclusively running
> VMs.
I wanted to stay generic... :-)
> Best idea I have is to throw in the towel on getting fancy, and just maintain a
> dedicated count in SVM.
>
> Alternatively, we could plumb an arch hook into kvm_create_vm() and kvm_destroy_vm()
> that's called when KVM adds/deletes a VM from vm_list, and key off vm_list being
> empty. But that adds a lot of boilerplate just to avoid a mutex+count.
FWIW, that was Tom's idea.
> +static void svm_srso_add_remove_vm(int count)
> +{
> + bool set;
> +
> + if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
> + return;
> +
> + guard(mutex)(&srso_lock);
> +
> + set = !srso_nr_vms;
> + srso_nr_vms += count;
> +
> + WARN_ON_ONCE(srso_nr_vms < 0);
> + if (!set && srso_nr_vms)
> + return;
So instead of doing this "by-foot", I would've used atomic_inc_return() and
atomic_dec_and_test() and acted upon the value when it becomes 0 or != 0,
instead of passing 1 and -1. Because the count is kinda implicit...
But yeah, not a biggie - that solves the issue too.
Thanks!
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On Thu, May 01, 2025, Borislav Petkov wrote:
> On Wed, Apr 30, 2025 at 04:33:19PM -0700, Sean Christopherson wrote:
> > +static void svm_srso_add_remove_vm(int count)
> > +{
> > + bool set;
> > +
> > + if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
> > + return;
> > +
> > + guard(mutex)(&srso_lock);
> > +
> > + set = !srso_nr_vms;
> > + srso_nr_vms += count;
> > +
> > + WARN_ON_ONCE(srso_nr_vms < 0);
> > + if (!set && srso_nr_vms)
> > + return;
>
> So instead of doing this "by-foot", I would've used any of those
> atomic_inc_return() and atomic_dec_and_test() and act upon the value when it
> becomes 0 or !0 instead of passing 1 and -1. Because the count is kinda
> implicit...
Heh, I considered that, and even tried it this morning because I thought it wouldn't
be as tricky as I first thought, but turns out, yeah, it's tricky. The complication
is that KVM needs to ensure BP_SPEC_REDUCE=1 on all CPUs before any VM is created.
I thought it wouldn't be _that_ tricky once I realized the 1=>0 case doesn't require
ordering, e.g. running host code while other CPUs have BP_SPEC_REDUCE=1 is totally
fine, KVM just needs to ensure no guest code is executed with BP_SPEC_REDUCE=0.
But guarding against all the possible edge cases is comically difficult.
For giggles, I did get it working, but it's a rather absurd amount of complexity
for very little gain. In practice, when srso_nr_vms >= 1, CPUs will hold the lock
for only a handful of cycles. The latency of VM creation is somewhat important,
but it's certainly not that latency sensitive, e.g. KVM already serializes VM
creation on kvm_lock (which is a mutex) in order to add VMs to vm_list.
The only truly slow path is the 0<=>1 transitions, and those *must* be slow to
get the ordering correct.
One thing I can/will do is change the mutex into a spinlock, so that on non-RT
systems there's zero chance of VM creation getting bogged down because a task
happens to get preempted at just the wrong time. I was thinking it's illegal to
use on_each_cpu() in a spinlock, but that's fine; it's using it with IRQs disabled
that's problematic.
I'll post a proper patch with the spinlock and CONFIG_CPU_MITIGATIONS #ifdef,
assuming testing comes back clean.
As for all the problematic scenarios... If KVM increments the counter before
sending IPIs, then racing VM creation can result in the second VM skipping the
mutex and doing KVM_RUN with BP_SPEC_REDUCE=0.
VMs  MSR  CPU-0         CPU-1
------------------------------
 0    0   CREATE
 0    0   lock()
 1    0   inc()
 1    0                 CREATE
 2    0                 inc()
 2    0                 KVM_RUN :-(
 2    0   IPI
 2    1   WRMSR         WRMSR
But simply incrementing the count after sending IPIs obviously doesn't work.
VMs  MSR  CPU-0         CPU-1
------------------------------
 0    0   CREATE
 0    0   lock()
 0    0   IPI
 0    0   WRMSR         WRMSR
 1    0   inc()
 1    0   KVM_RUN :-(
And passing in set/clear (or using separate IPI handlers) doesn't handle the case
where the IPI from destroy arrives after the IPI from create.
VMs  MSR  CPU-0         CPU-1
------------------------------
 1    1                 DESTROY
 0    1   CREATE        dec()
 0    1   lock()
 0    1   IPI(1)
 0    1   WRMSR         WRMSR
 0    1                 IPI(0)
 0    0   WRMSR         WRMSR
 1    0   inc()
 1    0   KVM_RUN :-(
Addressing that by adding a global flag to track that SRSO needs to be set
almost works, but there's still a race with a destroy IPI if the callback only
checks the "set" flag.
VMs  MSR  CPU-0         CPU-1
------------------------------
 1    1                 DESTROY
 0    1   CREATE        dec()
 0    1   lock()
 0    1   set=1
 0    1   IPI
 0    1   WRMSR         WRMSR
 1    0   inc()
 1    1   set=0
 1    1                 IPI
 1    0   WRMSR         WRMSR
 1    0   KVM_RUN :-(
To address all of those, I ended up with the below. It's not actually that much
code, but the amount of documentation needed to explain everything is ugly.
#ifdef CONFIG_CPU_MITIGATIONS
static DEFINE_MUTEX(srso_add_vm_lock);
static atomic_t srso_nr_vms;
static bool srso_set;

static void svm_toggle_srso_spec_reduce(void *ign)
{
        /*
         * Read both srso_set and the count (and don't pass in set/clear!) so
         * that BP_SPEC_REDUCE isn't incorrectly cleared if IPIs from destroying
         * the last VM arrive after IPIs from creating the first VM (in the new
         * "generation").
         */
        if (READ_ONCE(srso_set) || atomic_read(&srso_nr_vms))
                msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
        else
                msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
}

static void svm_srso_vm_init(void)
{
        if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
                return;

        /*
         * Acquire the mutex on a 0=>1 transition to ensure BP_SPEC_REDUCE is
         * set before any VM is fully created.
         */
        if (atomic_inc_not_zero(&srso_nr_vms))
                return;

        guard(mutex)(&srso_add_vm_lock);

        /*
         * Re-check the count before sending IPIs, only the first task needs to
         * toggle BP_SPEC_REDUCE, other tasks just need to wait.  For the 0=>1
         * case, update the count *after* BP_SPEC_REDUCE is set on all CPUs to
         * ensure creating multiple VMs concurrently doesn't result in a task
         * skipping the mutex before BP_SPEC_REDUCE is set.
         *
         * Atomically increment the count in all cases as the mutex only guards
         * 0=>1 transitions, e.g. another task can decrement the count if a VM
         * was created (0=>1) *and* destroyed (1=>0) between observing a count
         * of '0' and acquiring the mutex, and another task can increment the
         * count if the count is already >= 1.
         */
        if (!atomic_inc_not_zero(&srso_nr_vms)) {
                WRITE_ONCE(srso_set, true);
                on_each_cpu(svm_toggle_srso_spec_reduce, NULL, 1);
                atomic_inc(&srso_nr_vms);
                smp_mb__after_atomic();
                WRITE_ONCE(srso_set, false);
        }
}

static void svm_srso_vm_destroy(void)
{
        if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
                return;

        /*
         * If the last VM is being destroyed, clear BP_SPEC_REDUCE on all CPUs.
         * Unlike the creation case, there is no need to wait on other CPUs as
         * running code with BP_SPEC_REDUCE=1 is always safe, KVM just needs to
         * ensure guest code never runs with BP_SPEC_REDUCE=0.
         */
        if (atomic_dec_and_test(&srso_nr_vms))
                on_each_cpu(svm_toggle_srso_spec_reduce, NULL, 0);
}
#else
static void svm_srso_vm_init(void) { }
static void svm_srso_vm_destroy(void) { }
#endif /* CONFIG_CPU_MITIGATIONS */
On Thu, May 01, 2025 at 09:56:44AM -0700, Sean Christopherson wrote:
> Heh, I considered that, and even tried it this morning because I thought it wouldn't
> be as tricky as I first thought, but turns out, yeah, it's tricky. The complication
> is that KVM needs to ensure BP_SPEC_REDUCE=1 on all CPUs before any VM is created.
>
> I thought it wouldn't be _that_ tricky once I realized the 1=>0 case doesn't require
> ordering, e.g. running host code while other CPUs have BP_SPEC_REDUCE=1 is totally
> fine, KVM just needs to ensure no guest code is executed with BP_SPEC_REDUCE=0.
> But guarding against all the possible edge cases is comically difficult.
>
> For giggles, I did get it working, but it's a rather absurd amount of complexity
Thanks for taking the time to explain - that's, well, funky. :-\
Btw, in talking about this, David had this other idea which sounds
interesting:
How about we do a per-CPU var which tracks whether BP_SPEC_REDUCE is
enabled on the CPU?
It'll toggle the MSR bit before VMRUN on the CPU when the VM count goes 0=>1.
This way you avoid the IPIs and you set the bit in time.
You'd still need to do an IPI on VMEXIT when the VM count goes 1=>0 but that's
easy.
Dunno, there probably already is a per-CPU setting in KVM so you could add
that to it...
Anyway, something along those lines...
Thx.
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
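To make the shape of that per-CPU idea concrete, here is a rough, untested
sketch (hypothetical names, not actual KVM code) under the assumptions that a
VM counter already exists and that the pre-VMRUN hook runs with preemption
disabled; it deliberately glosses over the create/destroy ordering races Sean
walks through above.

static DEFINE_PER_CPU(bool, srso_bp_spec_reduce_set);
static atomic_t srso_nr_vms;

/* VM creation: only bump the count; CPUs set the bit lazily before VMRUN. */
static void srso_vm_init(void)
{
        if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
                atomic_inc(&srso_nr_vms);
}

/* Called with preemption disabled, shortly before VMRUN on this CPU. */
static void srso_maybe_set_bp_spec_reduce(void)
{
        if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
                return;

        if (this_cpu_read(srso_bp_spec_reduce_set))
                return;

        msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
        this_cpu_write(srso_bp_spec_reduce_set, true);
}

static void srso_clear_bp_spec_reduce(void *ign)
{
        /* A new VM may have shown up in the meantime; keep the bit then. */
        if (atomic_read(&srso_nr_vms))
                return;

        msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
        this_cpu_write(srso_bp_spec_reduce_set, false);
}

/* VM destruction: clear the bit everywhere once the last VM goes away. */
static void srso_vm_destroy(void)
{
        if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
                return;

        if (atomic_dec_and_test(&srso_nr_vms))
                on_each_cpu(srso_clear_bp_spec_reduce, NULL, 1);
}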
On 4/30/25 6:33 PM, Sean Christopherson wrote:
> On Tue, Apr 29, 2025, Borislav Petkov wrote:
>> On Tue, Feb 18, 2025 at 12:13:33PM +0100, Borislav Petkov wrote:
>>> So,
>>>
>>> in the interest of finally making some progress here I'd like to commit this
>>> below (will test it one more time just in case but it should work :-P). It is
>>> simple and straight-forward and doesn't need an IBPB when the bit gets
>>> cleared.
>>>
>>> A potential future improvement is David's suggestion that there could be a way
>>> for tracking when the first guest gets started, we set the bit then, we make
>>> sure the bit gets set on each logical CPU when the guests migrate across the
>>> machine and when the *last* guest exists, that bit gets cleared again.
>> Well, that "simplicity" was short-lived:
>>
>> https://www.phoronix.com/review/linux-615-amd-regression
> LOL.
>
>> Sean, how about this below?
> Eww. That's quite painful, and completely disallowing enable_virt_on_load is
> undesirable, e.g. for use cases where the host is (almost) exclusively running
> VMs.
>
> Best idea I have is to throw in the towel on getting fancy, and just maintain a
> dedicated count in SVM.
>
> Alternatively, we could plumb an arch hook into kvm_create_vm() and kvm_destroy_vm()
> that's called when KVM adds/deletes a VM from vm_list, and key off vm_list being
> empty. But that adds a lot of boilerplate just to avoid a mutex+count.
>
> I haven't tested on a system with X86_FEATURE_SRSO_BP_SPEC_REDUCE, but did verify
> the mechanics by inverting the flag.
Testing this patch on the same EPYC Turin server as my original tests, I
can confirm that on a clean boot without any VMs running, the
performance is back to where it was on v6.14. :)
Thanks,
Michael
>
> --
> From: Sean Christopherson <seanjc@google.com>
> Date: Wed, 30 Apr 2025 15:34:50 -0700
> Subject: [PATCH] KVM: SVM: Set/clear SRSO's BP_SPEC_REDUCE on 0 <=> 1 VM count
> transitions
>
> Set the magic BP_SPEC_REDUCE bit to mitigate SRSO when running VMs if and
> only if KVM has at least one active VM. Leaving the bit set at all times
> unfortunately degrades performance by a wee bit more than expected.
>
> Use a dedicated mutex and counter instead of hooking virtualization
> enablement, as changing the behavior of kvm.enable_virt_at_load based on
> SRSO_BP_SPEC_REDUCE is painful, and has its own drawbacks, e.g. could
> result in performance issues for flows that are sensity to VM creation
> latency.
>
> Fixes: 8442df2b49ed ("x86/bugs: KVM: Add support for SRSO_MSR_FIX")
> Reported-by: Michael Larabel <Michael@michaellarabel.com>
> Closes: https://www.phoronix.com/review/linux-615-amd-regression
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
> arch/x86/kvm/svm/svm.c | 39 +++++++++++++++++++++++++++++++++------
> 1 file changed, 33 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index d5d0c5c3300b..fe8866572218 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -607,9 +607,6 @@ static void svm_disable_virtualization_cpu(void)
> kvm_cpu_svm_disable();
>
> amd_pmu_disable_virt();
> -
> - if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
> - msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
> }
>
> static int svm_enable_virtualization_cpu(void)
> @@ -687,9 +684,6 @@ static int svm_enable_virtualization_cpu(void)
> rdmsr(MSR_TSC_AUX, sev_es_host_save_area(sd)->tsc_aux, msr_hi);
> }
>
> - if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
> - msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
> -
> return 0;
> }
>
> @@ -5032,10 +5026,42 @@ static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
> sev_vcpu_deliver_sipi_vector(vcpu, vector);
> }
>
> +static DEFINE_MUTEX(srso_lock);
> +static int srso_nr_vms;
> +
> +static void svm_toggle_srso_spec_reduce(void *set)
> +{
> + if (set)
> + msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
> + else
> + msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
> +}
> +
> +static void svm_srso_add_remove_vm(int count)
> +{
> + bool set;
> +
> + if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
> + return;
> +
> + guard(mutex)(&srso_lock);
> +
> + set = !srso_nr_vms;
> + srso_nr_vms += count;
> +
> + WARN_ON_ONCE(srso_nr_vms < 0);
> + if (!set && srso_nr_vms)
> + return;
> +
> + on_each_cpu(svm_toggle_srso_spec_reduce, (void *)set, 1);
> +}
> +
> static void svm_vm_destroy(struct kvm *kvm)
> {
> avic_vm_destroy(kvm);
> sev_vm_destroy(kvm);
> +
> + svm_srso_add_remove_vm(-1);
> }
>
> static int svm_vm_init(struct kvm *kvm)
> @@ -5061,6 +5087,7 @@ static int svm_vm_init(struct kvm *kvm)
> return ret;
> }
>
> + svm_srso_add_remove_vm(1);
> return 0;
> }
>
>
> base-commit: f158e1b145f73aae1d3b7e756eb129a15b2b7a90
> --
On Tue, Feb 18, 2025 at 12:13:33PM +0100, Borislav Petkov wrote:
> in the interest of finally making some progress here I'd like to commit this
> below (will test it one more time just in case but it should work :-P). It is
> simple and straight-forward and doesn't need an IBPB when the bit gets
> cleared.
That's indeed simple and straight-forward for the time being.
Maybe a small improvement we could add on top is to have a separate and
dedicated cmdline option?
Indeed, with `X86_FEATURE_SRSO_USER_KERNEL_NO` we are not effectively using an
IBPB on VM-Exit anymore. Something like the diff down below?
Best,
Patrick
---
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 1d7afc40f2272..7609d80eda123 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -2531,6 +2531,7 @@ enum srso_mitigation_cmd {
SRSO_CMD_SAFE_RET,
SRSO_CMD_IBPB,
SRSO_CMD_IBPB_ON_VMEXIT,
+ SRSO_CMD_BP_SPEC_REDUCE,
};
static const char * const srso_strings[] = {
@@ -2562,6 +2563,8 @@ static int __init srso_parse_cmdline(char *str)
srso_cmd = SRSO_CMD_IBPB;
else if (!strcmp(str, "ibpb-vmexit"))
srso_cmd = SRSO_CMD_IBPB_ON_VMEXIT;
+ else if (!strcmp(str, "spec-reduce"))
+ srso_cmd = SRSO_CMD_BP_SPEC_REDUCE;
else
pr_err("Ignoring unknown SRSO option (%s).", str);
@@ -2617,7 +2620,7 @@ static void __init srso_select_mitigation(void)
case SRSO_CMD_SAFE_RET:
if (boot_cpu_has(X86_FEATURE_SRSO_USER_KERNEL_NO))
- goto ibpb_on_vmexit;
+ goto spec_reduce;
if (IS_ENABLED(CONFIG_MITIGATION_SRSO)) {
/*
@@ -2670,14 +2673,7 @@ static void __init srso_select_mitigation(void)
}
break;
-ibpb_on_vmexit:
case SRSO_CMD_IBPB_ON_VMEXIT:
- if (boot_cpu_has(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) {
- pr_notice("Reducing speculation to address VM/HV SRSO attack vector.\n");
- srso_mitigation = SRSO_MITIGATION_BP_SPEC_REDUCE;
- break;
- }
-
if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) {
if (has_microcode) {
setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
@@ -2694,6 +2690,14 @@ static void __init srso_select_mitigation(void)
pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n");
}
break;
+
+spec_reduce:
+ case SRSO_CMD_BP_SPEC_REDUCE:
+ if (boot_cpu_has(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) {
+ pr_notice("Reducing speculation to address VM/HV SRSO attack vector.\n");
+ srso_mitigation = SRSO_MITIGATION_BP_SPEC_REDUCE;
+ break;
+ }
default:
break;
}
On Tue, Feb 18, 2025 at 02:42:57PM +0000, Patrick Bellasi wrote:
> Maybe a small improvement we could add on top is to have a separate and
> dedicated cmdline option?
>
> Indeed, with `X86_FEATURE_SRSO_USER_KERNEL_NO` we are not effectively using an
> IBPB on VM-Exit anymore. Something like the diff down below?
Except that I don't see the point of yet another cmdline option. Our
mitigation options space is a nightmare. Why do we want to add another one?
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette