x86/amd: Fix DE_CFG races and CPUID rescan issues

[PATCH 3/3] x86/amd: Fix race editing DE_CFG

Posted by Andrew Cooper 2 months, 2 weeks ago

We have two different functions explaining that DE_CFG is Core-scoped and that
writes are racy but happen to be safe.  This is only true when there's one of
them.

Introduce amd_init_de_cfg() to be the singular function which writes to
DE_CFG, modelled after the logic we already have for BP_CFG.

While reworking amd_check_zenbleed() into a simple predicate used by
amd_init_de_cfg(), fix the microcode table.  The 'good_rev' was specific to an
individual stepping and not valid to be matched by model, let alone a range.
The only CPUs incorrectly matched that I can locate appear to be
pre-production, and probably didn't get Zenbleed microcode.

Rework amd_init_lfence() to be amd_init_lfence_dispatch() with only the
purpose of configuring X86_FEATURE_LFENCE_DISPATCH in the case that it needs
synthesising.  Run it on the BSP only and use setup_force_cpu_cap() to prevent
the bit disappearing on a subseuqent CPUID rescan.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>

I've submitted a matching fix to Linux's Zenbleed table.
---
 xen/arch/x86/cpu/amd.c   | 227 +++++++++++++++++++--------------------
 xen/arch/x86/cpu/cpu.h   |   3 +-
 xen/arch/x86/cpu/hygon.c |   6 +-
 3 files changed, 118 insertions(+), 118 deletions(-)

diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c
index 7953261895ac..c9d0b55c8c8d 100644
--- a/xen/arch/x86/cpu/amd.c
+++ b/xen/arch/x86/cpu/amd.c
@@ -747,45 +747,6 @@ void amd_log_freq(const struct cpuinfo_x86 *c)
 		printk("CPU%u: %u MHz\n", smp_processor_id(), low_mhz);
 }
 
-void amd_init_lfence(struct cpuinfo_x86 *c)
-{
-	uint64_t value;
-
-	/*
-	 * Some hardware has LFENCE dispatch serialising always enabled,
-	 * nothing to do on that case.
-	 */
-	if (test_bit(X86_FEATURE_LFENCE_DISPATCH, c->x86_capability))
-		return;
-
-	/*
-	 * Attempt to set lfence to be Dispatch Serialising.  This MSR almost
-	 * certainly isn't virtualised (and Xen at least will leak the real
-	 * value in but silently discard writes), as well as being per-core
-	 * rather than per-thread, so do a full safe read/write/readback cycle
-	 * in the worst case.
-	 */
-	if (rdmsr_safe(MSR_AMD64_DE_CFG, &value))
-		/* Unable to read.  Assume the safer default. */
-		__clear_bit(X86_FEATURE_LFENCE_DISPATCH,
-			    c->x86_capability);
-	else if (value & AMD64_DE_CFG_LFENCE_SERIALISE)
-		/* Already dispatch serialising. */
-		__set_bit(X86_FEATURE_LFENCE_DISPATCH,
-			  c->x86_capability);
-	else if (wrmsr_safe(MSR_AMD64_DE_CFG,
-			    value | AMD64_DE_CFG_LFENCE_SERIALISE) ||
-		 rdmsr_safe(MSR_AMD64_DE_CFG, &value) ||
-		 !(value & AMD64_DE_CFG_LFENCE_SERIALISE))
-		/* Attempt to set failed.  Assume the safer default. */
-		__clear_bit(X86_FEATURE_LFENCE_DISPATCH,
-			    c->x86_capability);
-	else
-		/* Successfully enabled! */
-		__set_bit(X86_FEATURE_LFENCE_DISPATCH,
-			  c->x86_capability);
-}
-
 /*
  * Refer to the AMD Speculative Store Bypass whitepaper:
  * https://developer.amd.com/wp-content/resources/124441_AMD64_SpeculativeStoreBypassDisable_Whitepaper_final.pdf
@@ -979,76 +940,6 @@ void __init detect_zen2_null_seg_behaviour(void)
 
 }
 
-static void amd_check_zenbleed(void)
-{
-	const struct cpu_signature *sig = &this_cpu(cpu_sig);
-	unsigned int good_rev;
-	uint64_t val, old_val, chickenbit = (1 << 9);
-
-	/*
-	 * If we're virtualised, we can't do family/model checks safely, and
-	 * we likely wouldn't have access to DE_CFG even if we could see a
-	 * microcode revision.
-	 *
-	 * A hypervisor may hide AVX as a stopgap mitigation.  We're not in a
-	 * position to care either way.  An admin doesn't want to be disabling
-	 * AVX as a mitigation on any build of Xen with this logic present.
-	 */
-	if (cpu_has_hypervisor || boot_cpu_data.x86 != 0x17)
-		return;
-
-	switch (boot_cpu_data.x86_model) {
-	case 0x30 ... 0x3f: good_rev = 0x0830107a; break;
-	case 0x60 ... 0x67: good_rev = 0x0860010b; break;
-	case 0x68 ... 0x6f: good_rev = 0x08608105; break;
-	case 0x70 ... 0x7f: good_rev = 0x08701032; break;
-	case 0xa0 ... 0xaf: good_rev = 0x08a00008; break;
-	default:
-		/*
-		 * With the Fam17h check above, most parts getting here are
-		 * Zen1.  They're not affected.  Assume Zen2 ones making it
-		 * here are affected regardless of microcode version.
-		 */
-		if (is_zen1_uarch())
-			return;
-		good_rev = ~0U;
-		break;
-	}
-
-	rdmsrl(MSR_AMD64_DE_CFG, val);
-	old_val = val;
-
-	/*
-	 * Microcode is the preferred mitigation, in terms of performance.
-	 * However, without microcode, this chickenbit (specific to the Zen2
-	 * uarch) disables Floating Point Mov-Elimination to mitigate the
-	 * issue.
-	 */
-	val &= ~chickenbit;
-	if (sig->rev < good_rev)
-		val |= chickenbit;
-
-	if (val == old_val)
-		/* Nothing to change. */
-		return;
-
-	/*
-	 * DE_CFG is a Core-scoped MSR, and this write is racy during late
-	 * microcode load.  However, both threads calculate the new value from
-	 * state which is shared, and unrelated to the old value, so the
-	 * result should be consistent.
-	 */
-	wrmsrl(MSR_AMD64_DE_CFG, val);
-
-	/*
-	 * Inform the admin that we changed something, but don't spam,
-	 * especially during a late microcode load.
-	 */
-	if (smp_processor_id() == 0)
-		printk(XENLOG_INFO "Zenbleed mitigation - using %s\n",
-		       val & chickenbit ? "chickenbit" : "microcode");
-}
-
 static void cf_check fam17_disable_c6(void *arg)
 {
 	/* Disable C6 by clearing the CCR{0,1,2}_CC6EN bits. */
@@ -1075,6 +966,112 @@ static void cf_check fam17_disable_c6(void *arg)
 	wrmsrl(MSR_AMD_CSTATE_CFG, val & mask);
 }
 
+static bool zenbleed_use_chickenbit(void)
+{
+    unsigned int curr_rev;
+    uint8_t fixed_rev;
+
+    /*
+     * If we're virtualised, we can't do family/model checks safely, and
+     * we likely wouldn't have access to DE_CFG even if we could see a
+     * microcode revision.
+     *
+     * A hypervisor may hide AVX as a stopgap mitigation.  We're not in a
+     * position to care either way.  An admin doesn't want to be disabling
+     * AVX as a mitigation on any build of Xen with this logic present.
+     */
+    if ( cpu_has_hypervisor || boot_cpu_data.family != 0x17 )
+        return false;
+
+    curr_rev = this_cpu(cpu_sig).rev;
+    switch ( curr_rev >> 8 )
+    {
+    case 0x083010: fixed_rev = 0x7a; break;
+    case 0x086001: fixed_rev = 0x0b; break;
+    case 0x086081: fixed_rev = 0x05; break;
+    case 0x087010: fixed_rev = 0x32; break;
+    case 0x08a000: fixed_rev = 0x08; break;
+    default:
+        /*
+         * With the Fam17h check above, most parts getting here are Zen1.
+         * They're not affected.  Assume Zen2 ones making it here are affected
+         * regardless of microcode version.
+         */
+        return is_zen2_uarch();
+    }
+
+    return (uint8_t)curr_rev >= fixed_rev;
+}
+
+void amd_init_de_cfg(const struct cpuinfo_x86 *c)
+{
+    uint64_t val, new = 0;
+
+    /* The MSR doesn't exist on Fam 0xf/0x11. */
+    if ( c->family != 0xf && c->family != 0x11 )
+        return;
+
+    /*
+     * On Zen3 (Fam 0x19) and later CPUs, LFENCE is unconditionally dispatch
+     * serialising, and is enumerated in CPUID.  Hypervisors may also
+     * enumerate it when the setting is in place and MSR_AMD64_DE_CFG isn't
+     * available.
+     */
+    if ( !test_bit(X86_FEATURE_LFENCE_DISPATCH, c->x86_capability) )
+        new |= AMD64_DE_CFG_LFENCE_SERIALISE;
+
+    /*
+     * If vulnerable to Zenbleed and not mitigated in microcode, use the
+     * bigger hammer.
+     */
+    if ( zenbleed_use_chickenbit() )
+        new |= (1 << 9);
+
+    if ( !new )
+        return;
+
+    if ( rdmsr_safe(MSR_AMD64_DE_CFG, &val) ||
+         (val & new) == new )
+        return;
+
+    /*
+     * DE_CFG is a Core-scoped MSR, and this write is racy.  However, both
+     * threads calculate the new value from state which expected to be
+     * consistent across CPUs and unrelated to the old value, so the result
+     * should be consistent.
+     */
+    wrmsr_safe(MSR_AMD64_DE_CFG, val | new);
+}
+
+void __init amd_init_lfence_dispatch(void)
+{
+    struct cpuinfo_x86 *c = &boot_cpu_data;
+    uint64_t val;
+
+    if ( test_bit(X86_FEATURE_LFENCE_DISPATCH, c->x86_capability) )
+        /* LFENCE is forced dispatch serialising and we can't control it. */
+        return;
+
+    if ( c->family == 0xf || c->family == 0x11 )
+        /* MSR doesn't exist.  LFENCE is dispatch serialising on this hardare. */
+        goto set;
+
+    if ( rdmsr_safe(MSR_AMD64_DE_CFG, &val) )
+        /* Unable to read.  Assume the safer default. */
+        goto clear;
+
+    if ( val & AMD64_DE_CFG_LFENCE_SERIALISE )
+        /* Already dispatch serialising. */
+        goto set;
+
+ clear:
+    setup_clear_cpu_cap(X86_FEATURE_LFENCE_DISPATCH);
+    return;
+
+ set:
+    setup_force_cpu_cap(X86_FEATURE_LFENCE_DISPATCH);
+}
+
 static void amd_check_bp_cfg(void)
 {
 	uint64_t val, new = 0;
@@ -1118,6 +1115,11 @@ static void cf_check init_amd(struct cpuinfo_x86 *c)
 	u32 l, h;
 	uint64_t value;
 
+        amd_init_de_cfg(c);
+
+        if (c == &boot_cpu_data)
+            amd_init_lfence_dispatch(); /* Needs amd_init_de_cfg() */
+
 	/* Disable TLB flush filter by setting HWCR.FFDIS on K8
 	 * bit 6 of msr C001_0015
 	 *
@@ -1156,12 +1158,6 @@ static void cf_check init_amd(struct cpuinfo_x86 *c)
 	if (c == &boot_cpu_data && !cpu_has(c, X86_FEATURE_RSTR_FP_ERR_PTRS))
 		setup_force_cpu_cap(X86_BUG_FPU_PTRS);
 
-	if (c->x86 == 0x0f || c->x86 == 0x11)
-		/* Always dispatch serialising on this hardare. */
-		__set_bit(X86_FEATURE_LFENCE_DISPATCH, c->x86_capability);
-	else /* Implicily "== 0x10 || >= 0x12" by being 64bit. */
-		amd_init_lfence(c);
-
 	amd_init_ssbd(c);
 
 	if (c->x86 == 0x17)
@@ -1379,7 +1375,6 @@ static void cf_check init_amd(struct cpuinfo_x86 *c)
 	if ((smp_processor_id() == 1) && !cpu_has(c, X86_FEATURE_ITSC))
 		disable_c1_ramping();
 
-	amd_check_zenbleed();
 	amd_check_bp_cfg();
 
 	if (fam17_c6_disabled)
diff --git a/xen/arch/x86/cpu/cpu.h b/xen/arch/x86/cpu/cpu.h
index cbb434f3a23d..8bed3f52490f 100644
--- a/xen/arch/x86/cpu/cpu.h
+++ b/xen/arch/x86/cpu/cpu.h
@@ -24,7 +24,8 @@ extern bool detect_extended_topology(struct cpuinfo_x86 *c);
 
 void cf_check early_init_amd(struct cpuinfo_x86 *c);
 void amd_log_freq(const struct cpuinfo_x86 *c);
-void amd_init_lfence(struct cpuinfo_x86 *c);
+void amd_init_de_cfg(const struct cpuinfo_x86 *c);
+void amd_init_lfence_dispatch(void);
 void amd_init_ssbd(const struct cpuinfo_x86 *c);
 void amd_init_spectral_chicken(void);
 void detect_zen2_null_seg_behaviour(void);
diff --git a/xen/arch/x86/cpu/hygon.c b/xen/arch/x86/cpu/hygon.c
index f7508cc8fcb9..68e98651cb79 100644
--- a/xen/arch/x86/cpu/hygon.c
+++ b/xen/arch/x86/cpu/hygon.c
@@ -31,7 +31,11 @@ static void cf_check init_hygon(struct cpuinfo_x86 *c)
 {
 	unsigned long long value;
 
-	amd_init_lfence(c);
+        amd_init_de_cfg(c);
+
+        if (c == &boot_cpu_data)
+            amd_init_lfence_dispatch(); /* Needs amd_init_de_cfg() */
+
 	amd_init_ssbd(c);
 
 	/* Probe for NSCB on Zen2 CPUs when not virtualised */
-- 
2.39.5

Re: [PATCH 3/3] x86/amd: Fix race editing DE_CFG

Posted by Jan Beulich 2 months, 2 weeks ago

On 26.11.2025 14:22, Andrew Cooper wrote:
> @@ -1075,6 +966,112 @@ static void cf_check fam17_disable_c6(void *arg)
>  	wrmsrl(MSR_AMD_CSTATE_CFG, val & mask);
>  }
>  
> +static bool zenbleed_use_chickenbit(void)
> +{
> +    unsigned int curr_rev;
> +    uint8_t fixed_rev;
> +
> +    /*
> +     * If we're virtualised, we can't do family/model checks safely, and
> +     * we likely wouldn't have access to DE_CFG even if we could see a
> +     * microcode revision.
> +     *
> +     * A hypervisor may hide AVX as a stopgap mitigation.  We're not in a
> +     * position to care either way.  An admin doesn't want to be disabling
> +     * AVX as a mitigation on any build of Xen with this logic present.
> +     */
> +    if ( cpu_has_hypervisor || boot_cpu_data.family != 0x17 )
> +        return false;
> +
> +    curr_rev = this_cpu(cpu_sig).rev;
> +    switch ( curr_rev >> 8 )
> +    {
> +    case 0x083010: fixed_rev = 0x7a; break;
> +    case 0x086001: fixed_rev = 0x0b; break;
> +    case 0x086081: fixed_rev = 0x05; break;
> +    case 0x087010: fixed_rev = 0x32; break;
> +    case 0x08a000: fixed_rev = 0x08; break;
> +    default:
> +        /*
> +         * With the Fam17h check above, most parts getting here are Zen1.
> +         * They're not affected.  Assume Zen2 ones making it here are affected
> +         * regardless of microcode version.
> +         */
> +        return is_zen2_uarch();
> +    }
> +
> +    return (uint8_t)curr_rev >= fixed_rev;
> +}
> +
> +void amd_init_de_cfg(const struct cpuinfo_x86 *c)
> +{
> +    uint64_t val, new = 0;
> +
> +    /* The MSR doesn't exist on Fam 0xf/0x11. */
> +    if ( c->family != 0xf && c->family != 0x11 )
> +        return;

Comment and code don't match. Did you mean

    if ( c->family == 0xf || c->family == 0x11 )
        return;

(along the lines of what you have in amd_init_lfence_dispatch())?

> +    /*
> +     * On Zen3 (Fam 0x19) and later CPUs, LFENCE is unconditionally dispatch
> +     * serialising, and is enumerated in CPUID.  Hypervisors may also
> +     * enumerate it when the setting is in place and MSR_AMD64_DE_CFG isn't
> +     * available.
> +     */
> +    if ( !test_bit(X86_FEATURE_LFENCE_DISPATCH, c->x86_capability) )
> +        new |= AMD64_DE_CFG_LFENCE_SERIALISE;
> +
> +    /*
> +     * If vulnerable to Zenbleed and not mitigated in microcode, use the
> +     * bigger hammer.
> +     */
> +    if ( zenbleed_use_chickenbit() )
> +        new |= (1 << 9);
> +
> +    if ( !new )
> +        return;
> +
> +    if ( rdmsr_safe(MSR_AMD64_DE_CFG, &val) ||
> +         (val & new) == new )
> +        return;
> +
> +    /*
> +     * DE_CFG is a Core-scoped MSR, and this write is racy.  However, both
> +     * threads calculate the new value from state which expected to be
> +     * consistent across CPUs and unrelated to the old value, so the result
> +     * should be consistent.
> +     */
> +    wrmsr_safe(MSR_AMD64_DE_CFG, val | new);

Either of the bits may be the cause of #GP. In that case we wouldn't set the
other bit, even if it may be possible to set it.

> +}
> +
> +void __init amd_init_lfence_dispatch(void)
> +{
> +    struct cpuinfo_x86 *c = &boot_cpu_data;
> +    uint64_t val;
> +
> +    if ( test_bit(X86_FEATURE_LFENCE_DISPATCH, c->x86_capability) )
> +        /* LFENCE is forced dispatch serialising and we can't control it. */
> +        return;
> +
> +    if ( c->family == 0xf || c->family == 0x11 )
> +        /* MSR doesn't exist.  LFENCE is dispatch serialising on this hardare. */
> +        goto set;
> +
> +    if ( rdmsr_safe(MSR_AMD64_DE_CFG, &val) )
> +        /* Unable to read.  Assume the safer default. */
> +        goto clear;
> +
> +    if ( val & AMD64_DE_CFG_LFENCE_SERIALISE )
> +        /* Already dispatch serialising. */
> +        goto set;

Why "already", when this is now after we did the adjustment?

> + clear:
> +    setup_clear_cpu_cap(X86_FEATURE_LFENCE_DISPATCH);
> +    return;
> +
> + set:
> +    setup_force_cpu_cap(X86_FEATURE_LFENCE_DISPATCH);
> +}
> +
>  static void amd_check_bp_cfg(void)
>  {
>  	uint64_t val, new = 0;
> @@ -1118,6 +1115,11 @@ static void cf_check init_amd(struct cpuinfo_x86 *c)
>  	u32 l, h;
>  	uint64_t value;
>  
> +        amd_init_de_cfg(c);
> +
> +        if (c == &boot_cpu_data)
> +            amd_init_lfence_dispatch(); /* Needs amd_init_de_cfg() */
> +
>  	/* Disable TLB flush filter by setting HWCR.FFDIS on K8
>  	 * bit 6 of msr C001_0015
>  	 *

Nit: I don't think we want to mix indentation style within a function.

> --- a/xen/arch/x86/cpu/hygon.c
> +++ b/xen/arch/x86/cpu/hygon.c
> @@ -31,7 +31,11 @@ static void cf_check init_hygon(struct cpuinfo_x86 *c)
>  {
>  	unsigned long long value;
>  
> -	amd_init_lfence(c);
> +        amd_init_de_cfg(c);
> +
> +        if (c == &boot_cpu_data)
> +            amd_init_lfence_dispatch(); /* Needs amd_init_de_cfg() */
> +
>  	amd_init_ssbd(c);
>  
>  	/* Probe for NSCB on Zen2 CPUs when not virtualised */

Same here then.

Jan

Re: [PATCH 3/3] x86/amd: Fix race editing DE_CFG

Posted by Andrew Cooper 2 months, 2 weeks ago

On 26/11/2025 3:07 pm, Jan Beulich wrote:
> On 26.11.2025 14:22, Andrew Cooper wrote:
>> @@ -1075,6 +966,112 @@ static void cf_check fam17_disable_c6(void *arg)
>>  	wrmsrl(MSR_AMD_CSTATE_CFG, val & mask);
>>  }
>>  
>> +static bool zenbleed_use_chickenbit(void)
>> +{
>> +    unsigned int curr_rev;
>> +    uint8_t fixed_rev;
>> +
>> +    /*
>> +     * If we're virtualised, we can't do family/model checks safely, and
>> +     * we likely wouldn't have access to DE_CFG even if we could see a
>> +     * microcode revision.
>> +     *
>> +     * A hypervisor may hide AVX as a stopgap mitigation.  We're not in a
>> +     * position to care either way.  An admin doesn't want to be disabling
>> +     * AVX as a mitigation on any build of Xen with this logic present.
>> +     */
>> +    if ( cpu_has_hypervisor || boot_cpu_data.family != 0x17 )
>> +        return false;
>> +
>> +    curr_rev = this_cpu(cpu_sig).rev;
>> +    switch ( curr_rev >> 8 )
>> +    {
>> +    case 0x083010: fixed_rev = 0x7a; break;
>> +    case 0x086001: fixed_rev = 0x0b; break;
>> +    case 0x086081: fixed_rev = 0x05; break;
>> +    case 0x087010: fixed_rev = 0x32; break;
>> +    case 0x08a000: fixed_rev = 0x08; break;
>> +    default:
>> +        /*
>> +         * With the Fam17h check above, most parts getting here are Zen1.
>> +         * They're not affected.  Assume Zen2 ones making it here are affected
>> +         * regardless of microcode version.
>> +         */
>> +        return is_zen2_uarch();
>> +    }
>> +
>> +    return (uint8_t)curr_rev >= fixed_rev;
>> +}
>> +
>> +void amd_init_de_cfg(const struct cpuinfo_x86 *c)
>> +{
>> +    uint64_t val, new = 0;
>> +
>> +    /* The MSR doesn't exist on Fam 0xf/0x11. */
>> +    if ( c->family != 0xf && c->family != 0x11 )
>> +        return;
> Comment and code don't match. Did you mean
>
>     if ( c->family == 0xf || c->family == 0x11 )
>         return;
>
> (along the lines of what you have in amd_init_lfence_dispatch())?

Oh - that was a last minute refactor which I didn't do quite correctly. 
Yes, it should match amd_init_lfence_dispatch().

>
>> +    /*
>> +     * On Zen3 (Fam 0x19) and later CPUs, LFENCE is unconditionally dispatch
>> +     * serialising, and is enumerated in CPUID.  Hypervisors may also
>> +     * enumerate it when the setting is in place and MSR_AMD64_DE_CFG isn't
>> +     * available.
>> +     */
>> +    if ( !test_bit(X86_FEATURE_LFENCE_DISPATCH, c->x86_capability) )
>> +        new |= AMD64_DE_CFG_LFENCE_SERIALISE;
>> +
>> +    /*
>> +     * If vulnerable to Zenbleed and not mitigated in microcode, use the
>> +     * bigger hammer.
>> +     */
>> +    if ( zenbleed_use_chickenbit() )
>> +        new |= (1 << 9);
>> +
>> +    if ( !new )
>> +        return;
>> +
>> +    if ( rdmsr_safe(MSR_AMD64_DE_CFG, &val) ||
>> +         (val & new) == new )
>> +        return;
>> +
>> +    /*
>> +     * DE_CFG is a Core-scoped MSR, and this write is racy.  However, both
>> +     * threads calculate the new value from state which expected to be
>> +     * consistent across CPUs and unrelated to the old value, so the result
>> +     * should be consistent.
>> +     */
>> +    wrmsr_safe(MSR_AMD64_DE_CFG, val | new);
> Either of the bits may be the cause of #GP. In that case we wouldn't set the
> other bit, even if it may be possible to set it.

This MSR does not #GP on real hardware.

Also, both of these bits come from instructions AMD have provided,
saying "set $X in case $Y", which we have honoured as part of the
conditions for setting up new, which I consider to be a reasonable
guarantee that no #GP will ensue.

This wrmsr_safe() is covering the virt case, because older Xen and
Byhive used to disallow writes to it, and OpenBSD would explode as a
consequence.  Xen's fix was 4175fd3ccd17.

I toyed with the idea of having a tristate de_cfg_writeable, but that
got very ugly very quickly

The other option would be to ignore DE_CFG entirely under virt.  That's
what we do for BP_CFG already, and no hypervisor is going to really let
us have access to it, and it would downgrade to non-safe variants.

>> +}
>> +
>> +void __init amd_init_lfence_dispatch(void)
>> +{
>> +    struct cpuinfo_x86 *c = &boot_cpu_data;
>> +    uint64_t val;
>> +
>> +    if ( test_bit(X86_FEATURE_LFENCE_DISPATCH, c->x86_capability) )
>> +        /* LFENCE is forced dispatch serialising and we can't control it. */
>> +        return;
>> +
>> +    if ( c->family == 0xf || c->family == 0x11 )
>> +        /* MSR doesn't exist.  LFENCE is dispatch serialising on this hardare. */
>> +        goto set;
>> +
>> +    if ( rdmsr_safe(MSR_AMD64_DE_CFG, &val) )
>> +        /* Unable to read.  Assume the safer default. */
>> +        goto clear;
>> +
>> +    if ( val & AMD64_DE_CFG_LFENCE_SERIALISE )
>> +        /* Already dispatch serialising. */
>> +        goto set;
> Why "already", when this is now after we did the adjustment?

The old logic read it back to see if the write had stuck.  Again that
was the virt case.

With the new logic, I can probably just drop this particular comment. 
It's not terribly useful given the new layout.

~Andrew

Re: [PATCH 3/3] x86/amd: Fix race editing DE_CFG

Posted by Andrew Cooper 2 months, 2 weeks ago

On 26/11/2025 4:55 pm, Andrew Cooper wrote:
> On 26/11/2025 3:07 pm, Jan Beulich wrote:
>> On 26.11.2025 14:22, Andrew Cooper wrote:
>>> @@ -1075,6 +966,112 @@ static void cf_check fam17_disable_c6(void *arg)
>>>  	wrmsrl(MSR_AMD_CSTATE_CFG, val & mask);
>>>  }
>>>  
>>> +static bool zenbleed_use_chickenbit(void)
>>> +{
>>> +    unsigned int curr_rev;
>>> +    uint8_t fixed_rev;
>>> +
>>> +    /*
>>> +     * If we're virtualised, we can't do family/model checks safely, and
>>> +     * we likely wouldn't have access to DE_CFG even if we could see a
>>> +     * microcode revision.
>>> +     *
>>> +     * A hypervisor may hide AVX as a stopgap mitigation.  We're not in a
>>> +     * position to care either way.  An admin doesn't want to be disabling
>>> +     * AVX as a mitigation on any build of Xen with this logic present.
>>> +     */
>>> +    if ( cpu_has_hypervisor || boot_cpu_data.family != 0x17 )
>>> +        return false;
>>> +
>>> +    curr_rev = this_cpu(cpu_sig).rev;
>>> +    switch ( curr_rev >> 8 )
>>> +    {
>>> +    case 0x083010: fixed_rev = 0x7a; break;
>>> +    case 0x086001: fixed_rev = 0x0b; break;
>>> +    case 0x086081: fixed_rev = 0x05; break;
>>> +    case 0x087010: fixed_rev = 0x32; break;
>>> +    case 0x08a000: fixed_rev = 0x08; break;
>>> +    default:
>>> +        /*
>>> +         * With the Fam17h check above, most parts getting here are Zen1.
>>> +         * They're not affected.  Assume Zen2 ones making it here are affected
>>> +         * regardless of microcode version.
>>> +         */
>>> +        return is_zen2_uarch();
>>> +    }
>>> +
>>> +    return (uint8_t)curr_rev >= fixed_rev;
>>> +}
>>> +
>>> +void amd_init_de_cfg(const struct cpuinfo_x86 *c)
>>> +{
>>> +    uint64_t val, new = 0;
>>> +
>>> +    /* The MSR doesn't exist on Fam 0xf/0x11. */
>>> +    if ( c->family != 0xf && c->family != 0x11 )
>>> +        return;
>> Comment and code don't match. Did you mean
>>
>>     if ( c->family == 0xf || c->family == 0x11 )
>>         return;
>>
>> (along the lines of what you have in amd_init_lfence_dispatch())?
> Oh - that was a last minute refactor which I didn't do quite correctly. 
> Yes, it should match amd_init_lfence_dispatch().
>
>>> +    /*
>>> +     * On Zen3 (Fam 0x19) and later CPUs, LFENCE is unconditionally dispatch
>>> +     * serialising, and is enumerated in CPUID.  Hypervisors may also
>>> +     * enumerate it when the setting is in place and MSR_AMD64_DE_CFG isn't
>>> +     * available.
>>> +     */
>>> +    if ( !test_bit(X86_FEATURE_LFENCE_DISPATCH, c->x86_capability) )
>>> +        new |= AMD64_DE_CFG_LFENCE_SERIALISE;
>>> +
>>> +    /*
>>> +     * If vulnerable to Zenbleed and not mitigated in microcode, use the
>>> +     * bigger hammer.
>>> +     */
>>> +    if ( zenbleed_use_chickenbit() )
>>> +        new |= (1 << 9);
>>> +
>>> +    if ( !new )
>>> +        return;
>>> +
>>> +    if ( rdmsr_safe(MSR_AMD64_DE_CFG, &val) ||
>>> +         (val & new) == new )
>>> +        return;
>>> +
>>> +    /*
>>> +     * DE_CFG is a Core-scoped MSR, and this write is racy.  However, both
>>> +     * threads calculate the new value from state which expected to be
>>> +     * consistent across CPUs and unrelated to the old value, so the result
>>> +     * should be consistent.
>>> +     */
>>> +    wrmsr_safe(MSR_AMD64_DE_CFG, val | new);
>> Either of the bits may be the cause of #GP. In that case we wouldn't set the
>> other bit, even if it may be possible to set it.
> This MSR does not #GP on real hardware.
>
> Also, both of these bits come from instructions AMD have provided,
> saying "set $X in case $Y", which we have honoured as part of the
> conditions for setting up new, which I consider to be a reasonable
> guarantee that no #GP will ensue.
>
> This wrmsr_safe() is covering the virt case, because older Xen and
> Byhive used to disallow writes to it, and OpenBSD would explode as a
> consequence.  Xen's fix was 4175fd3ccd17.
>
> I toyed with the idea of having a tristate de_cfg_writeable, but that
> got very ugly very quickly
>
> The other option would be to ignore DE_CFG entirely under virt.  That's
> what we do for BP_CFG already, and no hypervisor is going to really let
> us have access to it, and it would downgrade to non-safe variants.

In fact, ignoring the virt case for DE_CFG makes this generally nicer.

~Andrew

Re: [PATCH 3/3] x86/amd: Fix race editing DE_CFG

Posted by Jan Beulich 2 months, 2 weeks ago

On 26.11.2025 18:56, Andrew Cooper wrote:
> On 26/11/2025 4:55 pm, Andrew Cooper wrote:
>> On 26/11/2025 3:07 pm, Jan Beulich wrote:
>>> On 26.11.2025 14:22, Andrew Cooper wrote:
>>>> @@ -1075,6 +966,112 @@ static void cf_check fam17_disable_c6(void *arg)
>>>>  	wrmsrl(MSR_AMD_CSTATE_CFG, val & mask);
>>>>  }
>>>>  
>>>> +static bool zenbleed_use_chickenbit(void)
>>>> +{
>>>> +    unsigned int curr_rev;
>>>> +    uint8_t fixed_rev;
>>>> +
>>>> +    /*
>>>> +     * If we're virtualised, we can't do family/model checks safely, and
>>>> +     * we likely wouldn't have access to DE_CFG even if we could see a
>>>> +     * microcode revision.
>>>> +     *
>>>> +     * A hypervisor may hide AVX as a stopgap mitigation.  We're not in a
>>>> +     * position to care either way.  An admin doesn't want to be disabling
>>>> +     * AVX as a mitigation on any build of Xen with this logic present.
>>>> +     */
>>>> +    if ( cpu_has_hypervisor || boot_cpu_data.family != 0x17 )
>>>> +        return false;
>>>> +
>>>> +    curr_rev = this_cpu(cpu_sig).rev;
>>>> +    switch ( curr_rev >> 8 )
>>>> +    {
>>>> +    case 0x083010: fixed_rev = 0x7a; break;
>>>> +    case 0x086001: fixed_rev = 0x0b; break;
>>>> +    case 0x086081: fixed_rev = 0x05; break;
>>>> +    case 0x087010: fixed_rev = 0x32; break;
>>>> +    case 0x08a000: fixed_rev = 0x08; break;
>>>> +    default:
>>>> +        /*
>>>> +         * With the Fam17h check above, most parts getting here are Zen1.
>>>> +         * They're not affected.  Assume Zen2 ones making it here are affected
>>>> +         * regardless of microcode version.
>>>> +         */
>>>> +        return is_zen2_uarch();
>>>> +    }
>>>> +
>>>> +    return (uint8_t)curr_rev >= fixed_rev;
>>>> +}
>>>> +
>>>> +void amd_init_de_cfg(const struct cpuinfo_x86 *c)
>>>> +{
>>>> +    uint64_t val, new = 0;
>>>> +
>>>> +    /* The MSR doesn't exist on Fam 0xf/0x11. */
>>>> +    if ( c->family != 0xf && c->family != 0x11 )
>>>> +        return;
>>> Comment and code don't match. Did you mean
>>>
>>>     if ( c->family == 0xf || c->family == 0x11 )
>>>         return;
>>>
>>> (along the lines of what you have in amd_init_lfence_dispatch())?
>> Oh - that was a last minute refactor which I didn't do quite correctly. 
>> Yes, it should match amd_init_lfence_dispatch().
>>
>>>> +    /*
>>>> +     * On Zen3 (Fam 0x19) and later CPUs, LFENCE is unconditionally dispatch
>>>> +     * serialising, and is enumerated in CPUID.  Hypervisors may also
>>>> +     * enumerate it when the setting is in place and MSR_AMD64_DE_CFG isn't
>>>> +     * available.
>>>> +     */
>>>> +    if ( !test_bit(X86_FEATURE_LFENCE_DISPATCH, c->x86_capability) )
>>>> +        new |= AMD64_DE_CFG_LFENCE_SERIALISE;
>>>> +
>>>> +    /*
>>>> +     * If vulnerable to Zenbleed and not mitigated in microcode, use the
>>>> +     * bigger hammer.
>>>> +     */
>>>> +    if ( zenbleed_use_chickenbit() )
>>>> +        new |= (1 << 9);
>>>> +
>>>> +    if ( !new )
>>>> +        return;
>>>> +
>>>> +    if ( rdmsr_safe(MSR_AMD64_DE_CFG, &val) ||
>>>> +         (val & new) == new )
>>>> +        return;
>>>> +
>>>> +    /*
>>>> +     * DE_CFG is a Core-scoped MSR, and this write is racy.  However, both
>>>> +     * threads calculate the new value from state which expected to be
>>>> +     * consistent across CPUs and unrelated to the old value, so the result
>>>> +     * should be consistent.
>>>> +     */
>>>> +    wrmsr_safe(MSR_AMD64_DE_CFG, val | new);
>>> Either of the bits may be the cause of #GP. In that case we wouldn't set the
>>> other bit, even if it may be possible to set it.
>> This MSR does not #GP on real hardware.

I consider this unexpected / inconsistent, at least as long as some of the
bits would be documented as reserved. "Would be" because the particular
Fam17 and Fam19 PPRs I'm looking at don't even mention DE_CFG (or BP_CFG,
for that matter).

>> Also, both of these bits come from instructions AMD have provided,
>> saying "set $X in case $Y", which we have honoured as part of the
>> conditions for setting up new, which I consider to be a reasonable
>> guarantee that no #GP will ensue.

The AMD instructions are for particular models, aren't they? While that
may mean the bits are fine to blindly (try to) set on other models, pretty
likely this can't be extended to other families. (While
zenbleed_use_chickenbit() is family-specific, the LFENCE bit is tried
without regard to family.)

>> This wrmsr_safe() is covering the virt case, because older Xen and
>> Byhive used to disallow writes to it, and OpenBSD would explode as a
>> consequence.  Xen's fix was 4175fd3ccd17.
>>
>> I toyed with the idea of having a tristate de_cfg_writeable, but that
>> got very ugly very quickly
>>
>> The other option would be to ignore DE_CFG entirely under virt.  That's
>> what we do for BP_CFG already, and no hypervisor is going to really let
>> us have access to it, and it would downgrade to non-safe variants.
> 
> In fact, ignoring the virt case for DE_CFG makes this generally nicer.

And being consistent with what we do with BP_CFG looks desirable to me as
well.

Jan

Re: [PATCH 3/3] x86/amd: Fix race editing DE_CFG

Posted by Andrew Cooper 2 months, 1 week ago

On 27/11/2025 7:58 am, Jan Beulich wrote:
> On 26.11.2025 18:56, Andrew Cooper wrote:
>> On 26/11/2025 4:55 pm, Andrew Cooper wrote:
>>> On 26/11/2025 3:07 pm, Jan Beulich wrote:
>>>> On 26.11.2025 14:22, Andrew Cooper wrote:
>>>>> @@ -1075,6 +966,112 @@ static void cf_check fam17_disable_c6(void *arg)
>>>>>  	wrmsrl(MSR_AMD_CSTATE_CFG, val & mask);
>>>>>  }
>>>>>  
>>>>> +static bool zenbleed_use_chickenbit(void)
>>>>> +{
>>>>> +    unsigned int curr_rev;
>>>>> +    uint8_t fixed_rev;
>>>>> +
>>>>> +    /*
>>>>> +     * If we're virtualised, we can't do family/model checks safely, and
>>>>> +     * we likely wouldn't have access to DE_CFG even if we could see a
>>>>> +     * microcode revision.
>>>>> +     *
>>>>> +     * A hypervisor may hide AVX as a stopgap mitigation.  We're not in a
>>>>> +     * position to care either way.  An admin doesn't want to be disabling
>>>>> +     * AVX as a mitigation on any build of Xen with this logic present.
>>>>> +     */
>>>>> +    if ( cpu_has_hypervisor || boot_cpu_data.family != 0x17 )
>>>>> +        return false;
>>>>> +
>>>>> +    curr_rev = this_cpu(cpu_sig).rev;
>>>>> +    switch ( curr_rev >> 8 )
>>>>> +    {
>>>>> +    case 0x083010: fixed_rev = 0x7a; break;
>>>>> +    case 0x086001: fixed_rev = 0x0b; break;
>>>>> +    case 0x086081: fixed_rev = 0x05; break;
>>>>> +    case 0x087010: fixed_rev = 0x32; break;
>>>>> +    case 0x08a000: fixed_rev = 0x08; break;
>>>>> +    default:
>>>>> +        /*
>>>>> +         * With the Fam17h check above, most parts getting here are Zen1.
>>>>> +         * They're not affected.  Assume Zen2 ones making it here are affected
>>>>> +         * regardless of microcode version.
>>>>> +         */
>>>>> +        return is_zen2_uarch();
>>>>> +    }
>>>>> +
>>>>> +    return (uint8_t)curr_rev >= fixed_rev;
>>>>> +}
>>>>> +
>>>>> +void amd_init_de_cfg(const struct cpuinfo_x86 *c)
>>>>> +{
>>>>> +    uint64_t val, new = 0;
>>>>> +
>>>>> +    /* The MSR doesn't exist on Fam 0xf/0x11. */
>>>>> +    if ( c->family != 0xf && c->family != 0x11 )
>>>>> +        return;
>>>> Comment and code don't match. Did you mean
>>>>
>>>>     if ( c->family == 0xf || c->family == 0x11 )
>>>>         return;
>>>>
>>>> (along the lines of what you have in amd_init_lfence_dispatch())?
>>> Oh - that was a last minute refactor which I didn't do quite correctly. 
>>> Yes, it should match amd_init_lfence_dispatch().
>>>
>>>>> +    /*
>>>>> +     * On Zen3 (Fam 0x19) and later CPUs, LFENCE is unconditionally dispatch
>>>>> +     * serialising, and is enumerated in CPUID.  Hypervisors may also
>>>>> +     * enumerate it when the setting is in place and MSR_AMD64_DE_CFG isn't
>>>>> +     * available.
>>>>> +     */
>>>>> +    if ( !test_bit(X86_FEATURE_LFENCE_DISPATCH, c->x86_capability) )
>>>>> +        new |= AMD64_DE_CFG_LFENCE_SERIALISE;
>>>>> +
>>>>> +    /*
>>>>> +     * If vulnerable to Zenbleed and not mitigated in microcode, use the
>>>>> +     * bigger hammer.
>>>>> +     */
>>>>> +    if ( zenbleed_use_chickenbit() )
>>>>> +        new |= (1 << 9);
>>>>> +
>>>>> +    if ( !new )
>>>>> +        return;
>>>>> +
>>>>> +    if ( rdmsr_safe(MSR_AMD64_DE_CFG, &val) ||
>>>>> +         (val & new) == new )
>>>>> +        return;
>>>>> +
>>>>> +    /*
>>>>> +     * DE_CFG is a Core-scoped MSR, and this write is racy.  However, both
>>>>> +     * threads calculate the new value from state which expected to be
>>>>> +     * consistent across CPUs and unrelated to the old value, so the result
>>>>> +     * should be consistent.
>>>>> +     */
>>>>> +    wrmsr_safe(MSR_AMD64_DE_CFG, val | new);
>>>> Either of the bits may be the cause of #GP. In that case we wouldn't set the
>>>> other bit, even if it may be possible to set it.
>>> This MSR does not #GP on real hardware.
> I consider this unexpected / inconsistent, at least as long as some of the
> bits would be documented as reserved. "Would be" because the particular
> Fam17 and Fam19 PPRs I'm looking at don't even mention DE_CFG (or BP_CFG,
> for that matter).

You need the even-more-NDA manual to find those details.

Reserved doesn't mean #GP. It means "don't rely on the behaviour".

>>> Also, both of these bits come from instructions AMD have provided,
>>> saying "set $X in case $Y", which we have honoured as part of the
>>> conditions for setting up new, which I consider to be a reasonable
>>> guarantee that no #GP will ensue.
> The AMD instructions are for particular models, aren't they? While that
> may mean the bits are fine to blindly (try to) set on other models, pretty
> likely this can't be extended to other families. (While
> zenbleed_use_chickenbit() is family-specific, the LFENCE bit is tried
> without regard to family.)

The Managing Speciation whitepaper says "set bit 1 on Fam 10, 12, 14,
15-17".

It also says that AMD will treat the MSR and bit 1 as architectural
moving forwards.  In reality, on Zen3 (post-dating the whitepaper) and
later, it's write-discard, read-as-1, and this is the behaviour we
provide to all VMs.

The Zenbleed instruction say "set bit 9 on Zen2".

So, the logic in this patch following AMD's written instructions.

~Andrew

[PATCH v1.9 3/3] x86/amd: Fix race editing DE_CFG

Posted by Andrew Cooper 2 months, 2 weeks ago

We have two different functions explaining that DE_CFG is Core-scoped and that
writes are racy but happen to be safe.  This is only true when there's one of
them.

Introduce amd_init_de_cfg() to be the singular function which writes to
DE_CFG, modelled after the logic we already have for BP_CFG.

While reworking amd_check_zenbleed() into a simple predicate used by
amd_init_de_cfg(), fix the microcode table.  The 'good_rev' was specific to an
individual stepping and not valid to be matched by model, let alone a range.
The only CPUs incorrectly matched that I can locate appear to be
pre-production, and probably didn't get Zenbleed microcode.

Rework amd_init_lfence() to be amd_init_lfence_dispatch() with only the
purpose of configuring X86_FEATURE_LFENCE_DISPATCH in the case that it needs
synthesising.  Run it on the BSP only and use setup_force_cpu_cap() to prevent
the bit disappearing on a subseuqent CPUID rescan.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>

v2:
 * Fix FamF/11h check in amd_init_de_cfg(), and exclude the virt case.
 * Fix tabs/spaces in init_{amd,hygon}()
 * Drop comment in amd_init_lfence_dispatch()
---
 xen/arch/x86/cpu/amd.c   | 223 +++++++++++++++++++--------------------
 xen/arch/x86/cpu/cpu.h   |   3 +-
 xen/arch/x86/cpu/hygon.c |   6 +-
 3 files changed, 114 insertions(+), 118 deletions(-)

diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c
index 7953261895ac..82b02df9a9d5 100644
--- a/xen/arch/x86/cpu/amd.c
+++ b/xen/arch/x86/cpu/amd.c
@@ -747,45 +747,6 @@ void amd_log_freq(const struct cpuinfo_x86 *c)
 		printk("CPU%u: %u MHz\n", smp_processor_id(), low_mhz);
 }
 
-void amd_init_lfence(struct cpuinfo_x86 *c)
-{
-	uint64_t value;
-
-	/*
-	 * Some hardware has LFENCE dispatch serialising always enabled,
-	 * nothing to do on that case.
-	 */
-	if (test_bit(X86_FEATURE_LFENCE_DISPATCH, c->x86_capability))
-		return;
-
-	/*
-	 * Attempt to set lfence to be Dispatch Serialising.  This MSR almost
-	 * certainly isn't virtualised (and Xen at least will leak the real
-	 * value in but silently discard writes), as well as being per-core
-	 * rather than per-thread, so do a full safe read/write/readback cycle
-	 * in the worst case.
-	 */
-	if (rdmsr_safe(MSR_AMD64_DE_CFG, &value))
-		/* Unable to read.  Assume the safer default. */
-		__clear_bit(X86_FEATURE_LFENCE_DISPATCH,
-			    c->x86_capability);
-	else if (value & AMD64_DE_CFG_LFENCE_SERIALISE)
-		/* Already dispatch serialising. */
-		__set_bit(X86_FEATURE_LFENCE_DISPATCH,
-			  c->x86_capability);
-	else if (wrmsr_safe(MSR_AMD64_DE_CFG,
-			    value | AMD64_DE_CFG_LFENCE_SERIALISE) ||
-		 rdmsr_safe(MSR_AMD64_DE_CFG, &value) ||
-		 !(value & AMD64_DE_CFG_LFENCE_SERIALISE))
-		/* Attempt to set failed.  Assume the safer default. */
-		__clear_bit(X86_FEATURE_LFENCE_DISPATCH,
-			    c->x86_capability);
-	else
-		/* Successfully enabled! */
-		__set_bit(X86_FEATURE_LFENCE_DISPATCH,
-			  c->x86_capability);
-}
-
 /*
  * Refer to the AMD Speculative Store Bypass whitepaper:
  * https://developer.amd.com/wp-content/resources/124441_AMD64_SpeculativeStoreBypassDisable_Whitepaper_final.pdf
@@ -979,76 +940,6 @@ void __init detect_zen2_null_seg_behaviour(void)
 
 }
 
-static void amd_check_zenbleed(void)
-{
-	const struct cpu_signature *sig = &this_cpu(cpu_sig);
-	unsigned int good_rev;
-	uint64_t val, old_val, chickenbit = (1 << 9);
-
-	/*
-	 * If we're virtualised, we can't do family/model checks safely, and
-	 * we likely wouldn't have access to DE_CFG even if we could see a
-	 * microcode revision.
-	 *
-	 * A hypervisor may hide AVX as a stopgap mitigation.  We're not in a
-	 * position to care either way.  An admin doesn't want to be disabling
-	 * AVX as a mitigation on any build of Xen with this logic present.
-	 */
-	if (cpu_has_hypervisor || boot_cpu_data.x86 != 0x17)
-		return;
-
-	switch (boot_cpu_data.x86_model) {
-	case 0x30 ... 0x3f: good_rev = 0x0830107a; break;
-	case 0x60 ... 0x67: good_rev = 0x0860010b; break;
-	case 0x68 ... 0x6f: good_rev = 0x08608105; break;
-	case 0x70 ... 0x7f: good_rev = 0x08701032; break;
-	case 0xa0 ... 0xaf: good_rev = 0x08a00008; break;
-	default:
-		/*
-		 * With the Fam17h check above, most parts getting here are
-		 * Zen1.  They're not affected.  Assume Zen2 ones making it
-		 * here are affected regardless of microcode version.
-		 */
-		if (is_zen1_uarch())
-			return;
-		good_rev = ~0U;
-		break;
-	}
-
-	rdmsrl(MSR_AMD64_DE_CFG, val);
-	old_val = val;
-
-	/*
-	 * Microcode is the preferred mitigation, in terms of performance.
-	 * However, without microcode, this chickenbit (specific to the Zen2
-	 * uarch) disables Floating Point Mov-Elimination to mitigate the
-	 * issue.
-	 */
-	val &= ~chickenbit;
-	if (sig->rev < good_rev)
-		val |= chickenbit;
-
-	if (val == old_val)
-		/* Nothing to change. */
-		return;
-
-	/*
-	 * DE_CFG is a Core-scoped MSR, and this write is racy during late
-	 * microcode load.  However, both threads calculate the new value from
-	 * state which is shared, and unrelated to the old value, so the
-	 * result should be consistent.
-	 */
-	wrmsrl(MSR_AMD64_DE_CFG, val);
-
-	/*
-	 * Inform the admin that we changed something, but don't spam,
-	 * especially during a late microcode load.
-	 */
-	if (smp_processor_id() == 0)
-		printk(XENLOG_INFO "Zenbleed mitigation - using %s\n",
-		       val & chickenbit ? "chickenbit" : "microcode");
-}
-
 static void cf_check fam17_disable_c6(void *arg)
 {
 	/* Disable C6 by clearing the CCR{0,1,2}_CC6EN bits. */
@@ -1075,6 +966,108 @@ static void cf_check fam17_disable_c6(void *arg)
 	wrmsrl(MSR_AMD_CSTATE_CFG, val & mask);
 }
 
+static bool zenbleed_use_chickenbit(void)
+{
+    unsigned int curr_rev;
+    uint8_t fixed_rev;
+
+    /* Zenbleed only affects Zen2.  Nothing to do on non-Fam17h systems. */
+    if ( boot_cpu_data.family != 0x17 )
+        return false;
+
+    curr_rev = this_cpu(cpu_sig).rev;
+    switch ( curr_rev >> 8 )
+    {
+    case 0x083010: fixed_rev = 0x7a; break;
+    case 0x086001: fixed_rev = 0x0b; break;
+    case 0x086081: fixed_rev = 0x05; break;
+    case 0x087010: fixed_rev = 0x32; break;
+    case 0x08a000: fixed_rev = 0x08; break;
+    default:
+        /*
+         * With the Fam17h check above, most parts getting here are Zen1.
+         * They're not affected.  Assume Zen2 ones making it here are affected
+         * regardless of microcode version.
+         */
+        return is_zen2_uarch();
+    }
+
+    return (uint8_t)curr_rev >= fixed_rev;
+}
+
+void amd_init_de_cfg(const struct cpuinfo_x86 *c)
+{
+    uint64_t val, new = 0;
+
+    /*
+     * The MSR doesn't exist on Fam 0xf/0x11.  If virtualised, we won't have
+     * mutable access even if we can read it.
+     */
+    if ( c->family == 0xf || c->family == 0x11 || cpu_has_hypervisor )
+        return;
+
+    /*
+     * On Zen3 (Fam 0x19) and later CPUs, LFENCE is unconditionally dispatch
+     * serialising, and is enumerated in CPUID.
+     *
+     * On older systems, LFENCE is unconditionally dispatch serialising (when
+     * the MSR doesn't exist), or can be made so by setting this bit.
+     */
+    if ( !test_bit(X86_FEATURE_LFENCE_DISPATCH, c->x86_capability) )
+        new |= AMD64_DE_CFG_LFENCE_SERIALISE;
+
+    /*
+     * If vulnerable to Zenbleed and not mitigated in microcode, use the
+     * bigger hammer.
+     */
+    if ( zenbleed_use_chickenbit() )
+        new |= (1 << 9);
+
+    if ( !new )
+        return;
+
+    val = rdmsr(MSR_AMD64_DE_CFG);
+
+    if ( (val & new) == new )
+        return;
+
+    /*
+     * DE_CFG is a Core-scoped MSR, and this write is racy.  However, both
+     * threads calculate the new value from state which expected to be
+     * consistent across CPUs and unrelated to the old value, so the result
+     * should be consistent.
+     */
+    wrmsr(MSR_AMD64_DE_CFG, val | new);
+}
+
+void __init amd_init_lfence_dispatch(void)
+{
+    struct cpuinfo_x86 *c = &boot_cpu_data;
+    uint64_t val;
+
+    if ( test_bit(X86_FEATURE_LFENCE_DISPATCH, c->x86_capability) )
+        /* LFENCE is forced dispatch serialising and we can't control it. */
+        return;
+
+    if ( c->family == 0xf || c->family == 0x11 )
+        /* MSR doesn't exist.  LFENCE is dispatch serialising on this hardare. */
+        goto set;
+
+    if ( rdmsr_safe(MSR_AMD64_DE_CFG, &val) )
+        /* Unable to read.  Assume the safer default. */
+        goto clear;
+
+    if ( val & AMD64_DE_CFG_LFENCE_SERIALISE )
+        goto set;
+
+ clear:
+    setup_clear_cpu_cap(X86_FEATURE_LFENCE_DISPATCH);
+    return;
+
+ set:
+    setup_force_cpu_cap(X86_FEATURE_LFENCE_DISPATCH);
+}
+
 static void amd_check_bp_cfg(void)
 {
 	uint64_t val, new = 0;
@@ -1118,6 +1111,11 @@ static void cf_check init_amd(struct cpuinfo_x86 *c)
 	u32 l, h;
 	uint64_t value;
 
+	amd_init_de_cfg(c);
+
+	if (c == &boot_cpu_data)
+		amd_init_lfence_dispatch(); /* Needs amd_init_de_cfg() */
+
 	/* Disable TLB flush filter by setting HWCR.FFDIS on K8
 	 * bit 6 of msr C001_0015
 	 *
@@ -1156,12 +1154,6 @@ static void cf_check init_amd(struct cpuinfo_x86 *c)
 	if (c == &boot_cpu_data && !cpu_has(c, X86_FEATURE_RSTR_FP_ERR_PTRS))
 		setup_force_cpu_cap(X86_BUG_FPU_PTRS);
 
-	if (c->x86 == 0x0f || c->x86 == 0x11)
-		/* Always dispatch serialising on this hardare. */
-		__set_bit(X86_FEATURE_LFENCE_DISPATCH, c->x86_capability);
-	else /* Implicily "== 0x10 || >= 0x12" by being 64bit. */
-		amd_init_lfence(c);
-
 	amd_init_ssbd(c);
 
 	if (c->x86 == 0x17)
@@ -1379,7 +1371,6 @@ static void cf_check init_amd(struct cpuinfo_x86 *c)
 	if ((smp_processor_id() == 1) && !cpu_has(c, X86_FEATURE_ITSC))
 		disable_c1_ramping();
 
-	amd_check_zenbleed();
 	amd_check_bp_cfg();
 
 	if (fam17_c6_disabled)
diff --git a/xen/arch/x86/cpu/cpu.h b/xen/arch/x86/cpu/cpu.h
index cbb434f3a23d..8bed3f52490f 100644
--- a/xen/arch/x86/cpu/cpu.h
+++ b/xen/arch/x86/cpu/cpu.h
@@ -24,7 +24,8 @@ extern bool detect_extended_topology(struct cpuinfo_x86 *c);
 
 void cf_check early_init_amd(struct cpuinfo_x86 *c);
 void amd_log_freq(const struct cpuinfo_x86 *c);
-void amd_init_lfence(struct cpuinfo_x86 *c);
+void amd_init_de_cfg(const struct cpuinfo_x86 *c);
+void amd_init_lfence_dispatch(void);
 void amd_init_ssbd(const struct cpuinfo_x86 *c);
 void amd_init_spectral_chicken(void);
 void detect_zen2_null_seg_behaviour(void);
diff --git a/xen/arch/x86/cpu/hygon.c b/xen/arch/x86/cpu/hygon.c
index f7508cc8fcb9..c3faabbdb764 100644
--- a/xen/arch/x86/cpu/hygon.c
+++ b/xen/arch/x86/cpu/hygon.c
@@ -31,7 +31,11 @@ static void cf_check init_hygon(struct cpuinfo_x86 *c)
 {
 	unsigned long long value;
 
-	amd_init_lfence(c);
+	amd_init_de_cfg(c);
+
+	if (c == &boot_cpu_data)
+		amd_init_lfence_dispatch(); /* Needs amd_init_de_cfg() */
+
 	amd_init_ssbd(c);
 
 	/* Probe for NSCB on Zen2 CPUs when not virtualised */

base-commit: fb0e37df71a31318c61e0715ffed3e149ca8a4aa
prerequisite-patch-id: 3598b7168c11c3dd68f825c75b77edc552fc231f
prerequisite-patch-id: e57b8bc06e86a4470b5bcf8e412d248e1f8f3331
-- 
2.39.5

Re: [PATCH v1.9 3/3] x86/amd: Fix race editing DE_CFG

Posted by Jan Beulich 2 months, 1 week ago

On 27.11.2025 18:18, Andrew Cooper wrote:
> We have two different functions explaining that DE_CFG is Core-scoped and that
> writes are racy but happen to be safe.  This is only true when there's one of
> them.
> 
> Introduce amd_init_de_cfg() to be the singular function which writes to
> DE_CFG, modelled after the logic we already have for BP_CFG.
> 
> While reworking amd_check_zenbleed() into a simple predicate used by
> amd_init_de_cfg(), fix the microcode table.  The 'good_rev' was specific to an
> individual stepping and not valid to be matched by model, let alone a range.
> The only CPUs incorrectly matched that I can locate appear to be
> pre-production, and probably didn't get Zenbleed microcode.
> 
> Rework amd_init_lfence() to be amd_init_lfence_dispatch() with only the
> purpose of configuring X86_FEATURE_LFENCE_DISPATCH in the case that it needs
> synthesising.  Run it on the BSP only and use setup_force_cpu_cap() to prevent
> the bit disappearing on a subseuqent CPUID rescan.
> 
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>

Reviewed-by: Jan Beulich <jbeulich@suse.com>
with one more request towards commentary:

> +void amd_init_de_cfg(const struct cpuinfo_x86 *c)
> +{
> +    uint64_t val, new = 0;
> +
> +    /*
> +     * The MSR doesn't exist on Fam 0xf/0x11.  If virtualised, we won't have
> +     * mutable access even if we can read it.
> +     */
> +    if ( c->family == 0xf || c->family == 0x11 || cpu_has_hypervisor )
> +        return;
> +
> +    /*
> +     * On Zen3 (Fam 0x19) and later CPUs, LFENCE is unconditionally dispatch
> +     * serialising, and is enumerated in CPUID.
> +     *
> +     * On older systems, LFENCE is unconditionally dispatch serialising (when
> +     * the MSR doesn't exist), or can be made so by setting this bit.
> +     */
> +    if ( !test_bit(X86_FEATURE_LFENCE_DISPATCH, c->x86_capability) )
> +        new |= AMD64_DE_CFG_LFENCE_SERIALISE;
> +
> +    /*
> +     * If vulnerable to Zenbleed and not mitigated in microcode, use the
> +     * bigger hammer.
> +     */
> +    if ( zenbleed_use_chickenbit() )
> +        new |= (1 << 9);
> +
> +    if ( !new )
> +        return;
> +
> +    val = rdmsr(MSR_AMD64_DE_CFG);
> +
> +    if ( (val & new) == new )
> +        return;
> +
> +    /*
> +     * DE_CFG is a Core-scoped MSR, and this write is racy.  However, both
> +     * threads calculate the new value from state which expected to be
> +     * consistent across CPUs and unrelated to the old value, so the result
> +     * should be consistent.
> +     */
> +    wrmsr(MSR_AMD64_DE_CFG, val | new);

The reason this isn't at risk of faulting when potentially trying to set a
reserved bit would better be at least briefly mentioned here, I think. Yes,
logic above tries to eliminate all cases where either bit may be reserved,
but we're still on somewhat thin ice here.

Jan

[PATCH 1/3] x86/amd: Use setup_force_cpu_cap() for BTC_NO
[PATCH 2/3] x86/amd: Stop updating the Zenbleed mitigation dynamically
[PATCH 3/3] x86/amd: Fix race editing DE_CFG