[tip: ras/core] EDAC/mce_amd: Add support for FRU text in MCA

tip-bot2 for Yazen Ghannam posted 1 patch 3 weeks, 2 days ago
arch/x86/include/asm/mce.h |  1 +
drivers/edac/mce_amd.c     | 18 ++++++++++++------
2 files changed, 13 insertions(+), 6 deletions(-)
[tip: ras/core] EDAC/mce_amd: Add support for FRU text in MCA
Posted by tip-bot2 for Yazen Ghannam 3 weeks, 2 days ago
The following commit has been merged into the ras/core branch of tip:

Commit-ID:     612c2addff367ee461dc99ffca2bc786f105d2ec
Gitweb:        https://git.kernel.org/tip/612c2addff367ee461dc99ffca2bc786f105d2ec
Author:        Yazen Ghannam <yazen.ghannam@amd.com>
AuthorDate:    Tue, 22 Oct 2024 19:36:31 
Committer:     Borislav Petkov (AMD) <bp@alien8.de>
CommitterDate: Thu, 31 Oct 2024 10:53:04 +01:00

EDAC/mce_amd: Add support for FRU text in MCA

A new "FRU Text in MCA" feature is defined where the Field Replaceable
Unit (FRU) Text for a device is represented by a string in the new
MCA_SYND1 and MCA_SYND2 registers. This feature is supported per MCA
bank, and it is advertised by the McaFruTextInMca bit (MCA_CONFIG[9]).

The FRU Text is populated dynamically for each individual error state
(MCA_STATUS, MCA_ADDR, et al.). Handle the case where an MCA bank covers
multiple devices, for example, a Unified Memory Controller (UMC) bank
that manages two DIMMs.

  [ Yazen: Add Avadhut as co-developer for wrapper changes. ]
  [ bp: Do not expose MCA_CONFIG to userspace yet. ]

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Co-developed-by: Avadhut Naik <avadhut.naik@amd.com>
Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20241022194158.110073-6-avadhut.naik@amd.com
---
 arch/x86/include/asm/mce.h |  1 +
 drivers/edac/mce_amd.c     | 18 ++++++++++++------
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 4d936ee..4543cf2 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -61,6 +61,7 @@
  *  - TCC bit is present in MCx_STATUS.
  */
 #define MCI_CONFIG_MCAX		0x1
+#define MCI_CONFIG_FRUTEXT	BIT_ULL(9)
 #define MCI_IPID_MCATYPE	0xFFFF0000
 #define MCI_IPID_HWID		0xFFF
 
diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
index 194d9fd..50d74d3 100644
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c
@@ -795,6 +795,7 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
 	struct mce *m = (struct mce *)data;
 	struct mce_hw_err *err = to_mce_hw_err(m);
 	unsigned int fam = x86_family(m->cpuid);
+	u32 mca_config_lo = 0, dummy;
 	int ecc;
 
 	if (m->kflags & MCE_HANDLED_CEC)
@@ -814,11 +815,9 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
 		((m->status & MCI_STATUS_PCC)	? "PCC"	  : "-"));
 
 	if (boot_cpu_has(X86_FEATURE_SMCA)) {
-		u32 low, high;
-		u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
+		rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(m->bank), &mca_config_lo, &dummy);
 
-		if (!rdmsr_safe(addr, &low, &high) &&
-		    (low & MCI_CONFIG_MCAX))
+		if (mca_config_lo & MCI_CONFIG_MCAX)
 			pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
 
 		pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
@@ -853,8 +852,15 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
 
 		if (m->status & MCI_STATUS_SYNDV) {
 			pr_cont(", Syndrome: 0x%016llx\n", m->synd);
-			pr_emerg(HW_ERR "Syndrome1: 0x%016llx, Syndrome2: 0x%016llx",
-				 err->vendor.amd.synd1, err->vendor.amd.synd2);
+			if (mca_config_lo & MCI_CONFIG_FRUTEXT) {
+				char frutext[17];
+
+				frutext[16] = '\0';
+				memcpy(&frutext[0], &err->vendor.amd.synd1, 8);
+				memcpy(&frutext[8], &err->vendor.amd.synd2, 8);
+
+				pr_emerg(HW_ERR "FRU Text: %s", frutext);
+			}
 		}
 
 		pr_cont("\n");