[PATCH] x86/amd: extend CPU errata #1474 affected models

Roger Pau Monne posted 1 patch 4 months, 2 weeks ago
Patches applied successfully (tree, apply log)
git fetch https://gitlab.com/xen-project/patchew/xen tags/patchew/20231220142218.22850-1-roger.pau@citrix.com
There is a newer version of this series
xen/arch/x86/cpu/amd.c | 27 ++++++++++++++-------------
1 file changed, 14 insertions(+), 13 deletions(-)
[PATCH] x86/amd: extend CPU errata #1474 affected models
Posted by Roger Pau Monne 4 months, 2 weeks ago
Errata #1474 does also affect models from family 17h ranges 00-2Fh, so the
errata now covers all the models released under Family 17h (Zen, Zen+ and
Zen2).

Additionally extend the workaround to Family 18h (Hygon), since it's based on
the Zen architecture and very likely affected.

Rename all the zen2 related symbols to plain zen, since the errata doesn't
exclusively affect Zen2 anymore.

Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
I'm unsure whether to just drop the zen_ prefix altogether, to not make it look
to only affect plain Zen (Zen1) models.
---
 xen/arch/x86/cpu/amd.c | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c
index 0f305312ff2a..099646dc3994 100644
--- a/xen/arch/x86/cpu/amd.c
+++ b/xen/arch/x86/cpu/amd.c
@@ -54,7 +54,7 @@ bool __read_mostly amd_acpi_c1e_quirk;
 bool __ro_after_init amd_legacy_ssbd;
 bool __initdata amd_virt_spec_ctrl;
 
-static bool __read_mostly zen2_c6_disabled;
+static bool __read_mostly zen_c6_disabled;
 
 static inline int rdmsr_amd_safe(unsigned int msr, unsigned int *lo,
 				 unsigned int *hi)
@@ -978,24 +978,24 @@ void amd_check_zenbleed(void)
 		       val & chickenbit ? "chickenbit" : "microcode");
 }
 
-static void cf_check zen2_disable_c6(void *arg)
+static void cf_check zen_disable_c6(void *arg)
 {
 	/* Disable C6 by clearing the CCR{0,1,2}_CC6EN bits. */
 	const uint64_t mask = ~((1ul << 6) | (1ul << 14) | (1ul << 22));
 	uint64_t val;
 
-	if (!zen2_c6_disabled) {
+	if (!zen_c6_disabled) {
 		printk(XENLOG_WARNING
     "Disabling C6 after 1000 days apparent uptime due to AMD errata 1474\n");
-		zen2_c6_disabled = true;
+		zen_c6_disabled = true;
 		/*
 		 * Prevent CPU hotplug so that started CPUs will either see
-		 * zen2_c6_disabled set, or will be handled by
+		 * zen_c6_disabled set, or will be handled by
 		 * smp_call_function().
 		 */
 		while (!get_cpu_maps())
 			process_pending_softirqs();
-		smp_call_function(zen2_disable_c6, NULL, 0);
+		smp_call_function(zen_disable_c6, NULL, 0);
 		put_cpu_maps();
 	}
 
@@ -1294,8 +1294,8 @@ static void cf_check init_amd(struct cpuinfo_x86 *c)
 	amd_check_zenbleed();
 	amd_check_erratum_1485();
 
-	if (zen2_c6_disabled)
-		zen2_disable_c6(NULL);
+	if (zen_c6_disabled)
+		zen_disable_c6(NULL);
 
 	check_syscfg_dram_mod_en();
 
@@ -1307,7 +1307,7 @@ const struct cpu_dev amd_cpu_dev = {
 	.c_init		= init_amd,
 };
 
-static int __init cf_check zen2_c6_errata_check(void)
+static int __init cf_check amd_check_erratum_1474(void)
 {
 	/*
 	 * Errata #1474: A Core May Hang After About 1044 Days
@@ -1315,7 +1315,8 @@ static int __init cf_check zen2_c6_errata_check(void)
 	 */
 	s_time_t delta;
 
-	if (cpu_has_hypervisor || boot_cpu_data.x86 != 0x17 || !is_zen2_uarch())
+	if (cpu_has_hypervisor ||
+	    (boot_cpu_data.x86 != 0x17 && boot_cpu_data.x86 != 0x18))
 		return 0;
 
 	/*
@@ -1330,10 +1331,10 @@ static int __init cf_check zen2_c6_errata_check(void)
 	if (delta > 0) {
 		static struct timer errata_c6;
 
-		init_timer(&errata_c6, zen2_disable_c6, NULL, 0);
+		init_timer(&errata_c6, zen_disable_c6, NULL, 0);
 		set_timer(&errata_c6, NOW() + delta);
 	} else
-		zen2_disable_c6(NULL);
+		zen_disable_c6(NULL);
 
 	return 0;
 }
@@ -1341,4 +1342,4 @@ static int __init cf_check zen2_c6_errata_check(void)
  * Must be executed after early_time_init() for tsc_ticks2ns() to have been
  * calibrated.  That prevents us doing the check in init_amd().
  */
-presmp_initcall(zen2_c6_errata_check);
+presmp_initcall(amd_check_erratum_1474);
-- 
2.43.0


Re: [PATCH] x86/amd: extend CPU errata #1474 affected models
Posted by Andrew Cooper 4 months, 2 weeks ago
On 20/12/2023 2:22 pm, Roger Pau Monne wrote:
> Errata #1474 does also affect models from family 17h ranges 00-2Fh, so the
> errata now covers all the models released under Family 17h (Zen, Zen+ and
> Zen2).

Perhaps "has now been extended to cover models from ..." ?

> Additionally extend the workaround to Family 18h (Hygon), since it's based on
> the Zen architecture and very likely affected.
>
> Rename all the zen2 related symbols to plain zen, since the errata doesn't
> exclusively affect Zen2 anymore.
>
> Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>

Thanks for doing this.

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

I was going to suggest linking to an example revision guide but I see
the AMD website is still broken.

> diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c
> index 0f305312ff2a..099646dc3994 100644
> --- a/xen/arch/x86/cpu/amd.c
> +++ b/xen/arch/x86/cpu/amd.c
> @@ -54,7 +54,7 @@ bool __read_mostly amd_acpi_c1e_quirk;
>  bool __ro_after_init amd_legacy_ssbd;
>  bool __initdata amd_virt_spec_ctrl;
>  
> -static bool __read_mostly zen2_c6_disabled;
> +static bool __read_mostly zen_c6_disabled;

amd_1474_c6_disable ?

That's about as general as I can make it, without losing precision.


>  
>  static inline int rdmsr_amd_safe(unsigned int msr, unsigned int *lo,
>  				 unsigned int *hi)
> @@ -978,24 +978,24 @@ void amd_check_zenbleed(void)
>  		       val & chickenbit ? "chickenbit" : "microcode");
>  }
>  
> -static void cf_check zen2_disable_c6(void *arg)
> +static void cf_check zen_disable_c6(void *arg)

fam17_disable_c6() ?  I know Hygon is 0x18 but it's also reasonably well
know to be the same uarch.

This particular algorithm is good for all Fam17 uarches, irrespective of
#1474, even if they happen to be the same set of CPUs in practice.

~Andrew

Re: [PATCH] x86/amd: extend CPU errata #1474 affected models
Posted by Roger Pau Monné 4 months, 2 weeks ago
On Wed, Dec 20, 2023 at 02:46:43PM +0000, Andrew Cooper wrote:
> On 20/12/2023 2:22 pm, Roger Pau Monne wrote:
> > Errata #1474 does also affect models from family 17h ranges 00-2Fh, so the
> > errata now covers all the models released under Family 17h (Zen, Zen+ and
> > Zen2).
> 
> Perhaps "has now been extended to cover models from ..." ?
> 
> > Additionally extend the workaround to Family 18h (Hygon), since it's based on
> > the Zen architecture and very likely affected.
> >
> > Rename all the zen2 related symbols to plain zen, since the errata doesn't
> > exclusively affect Zen2 anymore.
> >
> > Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
> > Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> 
> Thanks for doing this.
> 
> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
> 
> I was going to suggest linking to an example revision guide but I see
> the AMD website is still broken.
> 
> > diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c
> > index 0f305312ff2a..099646dc3994 100644
> > --- a/xen/arch/x86/cpu/amd.c
> > +++ b/xen/arch/x86/cpu/amd.c
> > @@ -54,7 +54,7 @@ bool __read_mostly amd_acpi_c1e_quirk;
> >  bool __ro_after_init amd_legacy_ssbd;
> >  bool __initdata amd_virt_spec_ctrl;
> >  
> > -static bool __read_mostly zen2_c6_disabled;
> > +static bool __read_mostly zen_c6_disabled;
> 
> amd_1474_c6_disable ?

Maybe just fam17h_c6_disabled, since the main usage of that variable
is to force calling fam17_disable_c6().

> That's about as general as I can make it, without losing precision.
> 
> 
> >  
> >  static inline int rdmsr_amd_safe(unsigned int msr, unsigned int *lo,
> >  				 unsigned int *hi)
> > @@ -978,24 +978,24 @@ void amd_check_zenbleed(void)
> >  		       val & chickenbit ? "chickenbit" : "microcode");
> >  }
> >  
> > -static void cf_check zen2_disable_c6(void *arg)
> > +static void cf_check zen_disable_c6(void *arg)
> 
> fam17_disable_c6() ?  I know Hygon is 0x18 but it's also reasonably well
> know to be the same uarch.
> 
> This particular algorithm is good for all Fam17 uarches, irrespective of
> #1474, even if they happen to be the same set of CPUs in practice.

Yeah, I was about to use fam17h prefix, but that wouldn't cover Hygon.
I we are fine with it I can send an adjusted v2 using fam17h prefix.

Thanks, Roger.

Re: [PATCH] x86/amd: extend CPU errata #1474 affected models
Posted by Andrew Cooper 4 months, 2 weeks ago
On 20/12/2023 3:10 pm, Roger Pau Monné wrote:
> On Wed, Dec 20, 2023 at 02:46:43PM +0000, Andrew Cooper wrote:
>> On 20/12/2023 2:22 pm, Roger Pau Monne wrote:
>>> Errata #1474 does also affect models from family 17h ranges 00-2Fh, so the
>>> errata now covers all the models released under Family 17h (Zen, Zen+ and
>>> Zen2).
>> Perhaps "has now been extended to cover models from ..." ?
>>
>>> Additionally extend the workaround to Family 18h (Hygon), since it's based on
>>> the Zen architecture and very likely affected.
>>>
>>> Rename all the zen2 related symbols to plain zen, since the errata doesn't
>>> exclusively affect Zen2 anymore.
>>>
>>> Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
>>> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
>> Thanks for doing this.
>>
>> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
>>
>> I was going to suggest linking to an example revision guide but I see
>> the AMD website is still broken.
>>
>>> diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c
>>> index 0f305312ff2a..099646dc3994 100644
>>> --- a/xen/arch/x86/cpu/amd.c
>>> +++ b/xen/arch/x86/cpu/amd.c
>>> @@ -54,7 +54,7 @@ bool __read_mostly amd_acpi_c1e_quirk;
>>>  bool __ro_after_init amd_legacy_ssbd;
>>>  bool __initdata amd_virt_spec_ctrl;
>>>  
>>> -static bool __read_mostly zen2_c6_disabled;
>>> +static bool __read_mostly zen_c6_disabled;
>> amd_1474_c6_disable ?
> Maybe just fam17h_c6_disabled, since the main usage of that variable
> is to force calling fam17_disable_c6().
>
>> That's about as general as I can make it, without losing precision.
>>
>>
>>>  
>>>  static inline int rdmsr_amd_safe(unsigned int msr, unsigned int *lo,
>>>  				 unsigned int *hi)
>>> @@ -978,24 +978,24 @@ void amd_check_zenbleed(void)
>>>  		       val & chickenbit ? "chickenbit" : "microcode");
>>>  }
>>>  
>>> -static void cf_check zen2_disable_c6(void *arg)
>>> +static void cf_check zen_disable_c6(void *arg)
>> fam17_disable_c6() ?  I know Hygon is 0x18 but it's also reasonably well
>> know to be the same uarch.
>>
>> This particular algorithm is good for all Fam17 uarches, irrespective of
>> #1474, even if they happen to be the same set of CPUs in practice.
> Yeah, I was about to use fam17h prefix, but that wouldn't cover Hygon.
> I we are fine with it I can send an adjusted v2 using fam17h prefix.

I think we're fine calling it fam17.  Happy to do that consistently.

~Andrew

Re: [PATCH] x86/amd: extend CPU errata #1474 affected models
Posted by Jan Beulich 4 months, 2 weeks ago
On 20.12.2023 16:42, Andrew Cooper wrote:
> On 20/12/2023 3:10 pm, Roger Pau Monné wrote:
>> On Wed, Dec 20, 2023 at 02:46:43PM +0000, Andrew Cooper wrote:
>>> On 20/12/2023 2:22 pm, Roger Pau Monne wrote:
>>>> @@ -978,24 +978,24 @@ void amd_check_zenbleed(void)
>>>>  		       val & chickenbit ? "chickenbit" : "microcode");
>>>>  }
>>>>  
>>>> -static void cf_check zen2_disable_c6(void *arg)
>>>> +static void cf_check zen_disable_c6(void *arg)
>>> fam17_disable_c6() ?  I know Hygon is 0x18 but it's also reasonably well
>>> know to be the same uarch.
>>>
>>> This particular algorithm is good for all Fam17 uarches, irrespective of
>>> #1474, even if they happen to be the same set of CPUs in practice.
>> Yeah, I was about to use fam17h prefix, but that wouldn't cover Hygon.
>> I we are fine with it I can send an adjusted v2 using fam17h prefix.
> 
> I think we're fine calling it fam17.  Happy to do that consistently.

I agree. And it clearly cannot be just "zen", to avoid it also wrongly
covering Zen3 / Zen4.

Jan