[PATCH v3 06/10] x86/mce: Break up __mcheck_cpu_apply_quirks()

Qiuxu Zhuo posted 10 patches 1 month ago
There is a newer version of this series
[PATCH v3 06/10] x86/mce: Break up __mcheck_cpu_apply_quirks()
Posted by Qiuxu Zhuo 1 month ago
From: Tony Luck <tony.luck@intel.com>

Split each vendor-specific part into its own helper function.

Tested-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
---
Changes in v3:
  - Newly added.

 arch/x86/kernel/cpu/mce/core.c | 194 ++++++++++++++++++---------------
 1 file changed, 106 insertions(+), 88 deletions(-)

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 57c05015f984..bb8b1000fa0a 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1880,101 +1880,119 @@ static void __mcheck_cpu_check_banks(void)
 	}
 }
 
+static void apply_quirks_amd(struct cpuinfo_x86 *c)
+{
+	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+	struct mca_config *cfg = &mca_cfg;
+
+	/* This should be disabled by the BIOS, but isn't always */
+	if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) {
+		/*
+		 * disable GART TBL walk error reporting, which
+		 * trips off incorrectly with the IOMMU & 3ware
+		 * & Cerberus:
+		 */
+		clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
+	}
+	if (c->x86 < 0x11 && cfg->bootlog < 0) {
+		/*
+		 * Lots of broken BIOS around that don't clear them
+		 * by default and leave crap in there. Don't log:
+		 */
+		cfg->bootlog = 0;
+	}
+	/*
+	 * Various K7s with broken bank 0 around. Always disable
+	 * by default.
+	 */
+	if (c->x86 == 6 && this_cpu_read(mce_num_banks) > 0)
+		mce_banks[0].ctl = 0;
+
+	/*
+	 * overflow_recov is supported for F15h Models 00h-0fh
+	 * even though we don't have a CPUID bit for it.
+	 */
+	if (c->x86 == 0x15 && c->x86_model <= 0xf)
+		mce_flags.overflow_recov = 1;
+
+	if (c->x86 >= 0x17 && c->x86 <= 0x1A)
+		mce_flags.zen_ifu_quirk = 1;
+}
+
+static void apply_quirks_intel(struct cpuinfo_x86 *c)
+{
+	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+	struct mca_config *cfg = &mca_cfg;
+
+	/*
+	 * SDM documents that on family 6 bank 0 should not be written
+	 * because it aliases to another special BIOS controlled
+	 * register.
+	 * But it's not aliased anymore on model 0x1a+
+	 * Don't ignore bank 0 completely because there could be a
+	 * valid event later, merely don't write CTL0.
+	 */
+	if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0)
+		mce_banks[0].init = false;
+
+	/*
+	 * All newer Intel systems support MCE broadcasting. Enable
+	 * synchronization with a one second timeout.
+	 */
+	if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
+	    cfg->monarch_timeout < 0)
+		cfg->monarch_timeout = USEC_PER_SEC;
+
+	/*
+	 * There are also broken BIOSes on some Pentium M and
+	 * earlier systems:
+	 */
+	if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
+		cfg->bootlog = 0;
+
+	if (c->x86_vfm == INTEL_SANDYBRIDGE_X)
+		mce_flags.snb_ifu_quirk = 1;
+
+	/*
+	 * Skylake, Cascade Lake and Cooper Lake require a quirk on
+	 * rep movs.
+	 */
+	if (c->x86_vfm == INTEL_SKYLAKE_X)
+		mce_flags.skx_repmov_quirk = 1;
+}
+
+static void apply_quirks_zhaoxin(struct cpuinfo_x86 *c)
+{
+	struct mca_config *cfg = &mca_cfg;
+
+	/*
+	 * All newer Zhaoxin CPUs support MCE broadcasting. Enable
+	 * synchronization with a one second timeout.
+	 */
+	if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
+		if (cfg->monarch_timeout < 0)
+			cfg->monarch_timeout = USEC_PER_SEC;
+	}
+}
+
 /* Add per CPU specific workarounds here */
 static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
 {
-	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
 	struct mca_config *cfg = &mca_cfg;
 
-	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
+	switch (c->x86_vendor) {
+	case X86_VENDOR_UNKNOWN:
 		pr_info("unknown CPU type - not enabling MCE support\n");
 		return -EOPNOTSUPP;
-	}
-
-	/* This should be disabled by the BIOS, but isn't always */
-	if (c->x86_vendor == X86_VENDOR_AMD) {
-		if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) {
-			/*
-			 * disable GART TBL walk error reporting, which
-			 * trips off incorrectly with the IOMMU & 3ware
-			 * & Cerberus:
-			 */
-			clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
-		}
-		if (c->x86 < 0x11 && cfg->bootlog < 0) {
-			/*
-			 * Lots of broken BIOS around that don't clear them
-			 * by default and leave crap in there. Don't log:
-			 */
-			cfg->bootlog = 0;
-		}
-		/*
-		 * Various K7s with broken bank 0 around. Always disable
-		 * by default.
-		 */
-		if (c->x86 == 6 && this_cpu_read(mce_num_banks) > 0)
-			mce_banks[0].ctl = 0;
-
-		/*
-		 * overflow_recov is supported for F15h Models 00h-0fh
-		 * even though we don't have a CPUID bit for it.
-		 */
-		if (c->x86 == 0x15 && c->x86_model <= 0xf)
-			mce_flags.overflow_recov = 1;
-
-		if (c->x86 >= 0x17 && c->x86 <= 0x1A)
-			mce_flags.zen_ifu_quirk = 1;
-
-	}
-
-	if (c->x86_vendor == X86_VENDOR_INTEL) {
-		/*
-		 * SDM documents that on family 6 bank 0 should not be written
-		 * because it aliases to another special BIOS controlled
-		 * register.
-		 * But it's not aliased anymore on model 0x1a+
-		 * Don't ignore bank 0 completely because there could be a
-		 * valid event later, merely don't write CTL0.
-		 */
-
-		if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0)
-			mce_banks[0].init = false;
-
-		/*
-		 * All newer Intel systems support MCE broadcasting. Enable
-		 * synchronization with a one second timeout.
-		 */
-		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
-			cfg->monarch_timeout < 0)
-			cfg->monarch_timeout = USEC_PER_SEC;
-
-		/*
-		 * There are also broken BIOSes on some Pentium M and
-		 * earlier systems:
-		 */
-		if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
-			cfg->bootlog = 0;
-
-		if (c->x86_vfm == INTEL_SANDYBRIDGE_X)
-			mce_flags.snb_ifu_quirk = 1;
-
-		/*
-		 * Skylake, Cascacde Lake and Cooper Lake require a quirk on
-		 * rep movs.
-		 */
-		if (c->x86_vfm == INTEL_SKYLAKE_X)
-			mce_flags.skx_repmov_quirk = 1;
-	}
-
-	if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
-		/*
-		 * All newer Zhaoxin CPUs support MCE broadcasting. Enable
-		 * synchronization with a one second timeout.
-		 */
-		if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
-			if (cfg->monarch_timeout < 0)
-				cfg->monarch_timeout = USEC_PER_SEC;
-		}
+	case X86_VENDOR_AMD:
+		apply_quirks_amd(c);
+		break;
+	case X86_VENDOR_INTEL:
+		apply_quirks_intel(c);
+		break;
+	case X86_VENDOR_ZHAOXIN:
+		apply_quirks_zhaoxin(c);
+		break;
 	}
 
 	if (cfg->monarch_timeout < 0)
-- 
2.17.1
Re: [PATCH v3 06/10] x86/mce: Break up __mcheck_cpu_apply_quirks()
Posted by Yazen Ghannam 3 weeks, 5 days ago
On Fri, Oct 25, 2024 at 10:45:58AM +0800, Qiuxu Zhuo wrote:
> From: Tony Luck <tony.luck@intel.com>
> 
> Split each vendor specific part into its own helper function.
> 
> Tested-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
> ---
> Changes in v3:
>   - Newly added.
> 
>  arch/x86/kernel/cpu/mce/core.c | 194 ++++++++++++++++++---------------
>  1 file changed, 106 insertions(+), 88 deletions(-)
> 
> diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
> index 57c05015f984..bb8b1000fa0a 100644
> --- a/arch/x86/kernel/cpu/mce/core.c
> +++ b/arch/x86/kernel/cpu/mce/core.c
> @@ -1880,101 +1880,119 @@ static void __mcheck_cpu_check_banks(void)
>  	}
>  }
>  
> +static void apply_quirks_amd(struct cpuinfo_x86 *c)
> +{
> +	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
> +	struct mca_config *cfg = &mca_cfg;
> +
> +	/* This should be disabled by the BIOS, but isn't always */
> +	if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) {
> +		/*
> +		 * disable GART TBL walk error reporting, which
> +		 * trips off incorrectly with the IOMMU & 3ware
> +		 * & Cerberus:
> +		 */
> +		clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
> +	}

Newline here please.

> +	if (c->x86 < 0x11 && cfg->bootlog < 0) {
> +		/*
> +		 * Lots of broken BIOS around that don't clear them
> +		 * by default and leave crap in there. Don't log:
> +		 */
> +		cfg->bootlog = 0;
> +	}

And here.

> +	/*
> +	 * Various K7s with broken bank 0 around. Always disable
> +	 * by default.
> +	 */
> +	if (c->x86 == 6 && this_cpu_read(mce_num_banks) > 0)
> +		mce_banks[0].ctl = 0;
> +
> +	/*
> +	 * overflow_recov is supported for F15h Models 00h-0fh
> +	 * even though we don't have a CPUID bit for it.
> +	 */
> +	if (c->x86 == 0x15 && c->x86_model <= 0xf)
> +		mce_flags.overflow_recov = 1;
> +
> +	if (c->x86 >= 0x17 && c->x86 <= 0x1A)
> +		mce_flags.zen_ifu_quirk = 1;
> +}
> +
> +static void apply_quirks_intel(struct cpuinfo_x86 *c)
> +{
> +	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
> +	struct mca_config *cfg = &mca_cfg;

Is there a benefit to this pointer? We use mca_cfg.FIELD in most other
places.

Thanks,
Yazen
RE: [PATCH v3 06/10] x86/mce: Break up __mcheck_cpu_apply_quirks()
Posted by Zhuo, Qiuxu 3 weeks, 5 days ago
Hi Yazen,

> From: Yazen Ghannam <yazen.ghannam@amd.com>
> [...]
> > +static void apply_quirks_amd(struct cpuinfo_x86 *c) {
> > +	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
> > +	struct mca_config *cfg = &mca_cfg;
> > +
> > +	/* This should be disabled by the BIOS, but isn't always */
> > +	if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) {
> > +		/*
> > +		 * disable GART TBL walk error reporting, which
> > +		 * trips off incorrectly with the IOMMU & 3ware
> > +		 * & Cerberus:
> > +		 */
> > +		clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
> > +	}
> 
> Newline here please.

OK. 
Will update it in the next version.

> > +	if (c->x86 < 0x11 && cfg->bootlog < 0) {
> > +		/*
> > +		 * Lots of broken BIOS around that don't clear them
> > +		 * by default and leave crap in there. Don't log:
> > +		 */
> > +		cfg->bootlog = 0;
> > +	}
> 
> And here.

Will also update it in the next version.

> [...]

> > +static void apply_quirks_intel(struct cpuinfo_x86 *c) {
> > +	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
> > +	struct mca_config *cfg = &mca_cfg;
> 
> Is there a benefit to this pointer? We use mca_cfg.FIELD in most other places.

This could make the diff smaller for easier review, and I also believe that fewer direct
uses of global variables in functions are better. Additionally, there are multiple uses of
'mca_cfg' in the function, so the local variable 'cfg' is shorter and more convenient to use.

[ Certainly, if the global variable 'mca_cfg' is only used once in the function, directly
  using it might be more convenient. ]

Just from my perspective, no strong preference. 😊

-Qiuxu
Re: [PATCH v3 06/10] x86/mce: Break up __mcheck_cpu_apply_quirks()
Posted by Yazen Ghannam 3 weeks, 5 days ago
On Wed, Oct 30, 2024 at 01:39:43AM +0000, Zhuo, Qiuxu wrote:

[...]

Thanks Qiuxu.

> 
> > > +static void apply_quirks_intel(struct cpuinfo_x86 *c) {
> > > +	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
> > > +	struct mca_config *cfg = &mca_cfg;
> > 
> > Is there a benefit to this pointer? We use mca_cfg.FIELD in most other places.
> 
> This could make the diff smaller for easier review, and I also believe that fewer direct
> uses of global variables in functions are better. Additionally, there are multiple uses of
> 'mca_cfg' in the function, the local variable 'cfg' is shorter and more convenient to use.
>

I don't think it would make the diff smaller here since the code is
already being moved.

Though you could say this is a separate logical change compared to just
moving the code as-is.

Also, I don't think the "shorter, more convenient" idea holds. It's not
that much shorter. And there are already cases of using the global
variables "mca_cfg" and "mce_flags".

Why is "...fewer direct uses of global variables in functions..." better?

> [ Certainly, if the global variable 'mca_cfg' is only used once in the function, directly
>   using it might be more convenient. ]
>

There is one such case in your patch.

> Just from my perspective, no strong preference. 😊
> 

Same here. I just figured this suggestion would be another possible
cleanup. :)

Thanks,
Yazen

RE: [PATCH v3 06/10] x86/mce: Break up __mcheck_cpu_apply_quirks()
Posted by Zhuo, Qiuxu 3 weeks, 4 days ago
Hi Yazen,

> From: Yazen Ghannam <yazen.ghannam@amd.com>
> [...]
> > Just from my perspective, no strong preference. 😊
> 
> Same here. I just figured this suggestion would be another possible
> cleanup. :)

Thanks for your suggestion. Yes, it does save 3 lines of code.
Either the current patch or your suggestion is OK with me.

Hi @Boris,
May I know which of the following options is OK with you?

    Option A (current patch): 
                    struct mca_config *cfg = &mca_cfg;
                    and then use 'cfg' in apply_quirks_{amd, intel, zhaoxin}()

    Option B (suggested by Yazen):
                    Directly use 'mca_cfg' in apply_quirks_{amd, intel, zhaoxin}()

Thanks!
-Qiuxu