[PATCH v4 2/2] EDAC/amd64: Include DRAM address in output

Yazen Ghannam posted 2 patches 2 weeks, 2 days ago
[PATCH v4 2/2] EDAC/amd64: Include DRAM address in output
Posted by Yazen Ghannam 2 weeks, 2 days ago
From: Avadhut Naik <avadhut.naik@amd.com>

The DRAM address of an error is used by tooling to find failure
patterns. This information can be used for general analysis off-system.
It can also be used on-system to take action, such as offlining a page
affected by a bad row.

Other EDAC modules (GHES and SKX) provide this information in their
output. The AMD64 EDAC module was not able to provide this information,
because system-specific translation is needed.

Recent AMD systems provide a PRM handler for DRAM address translation.

Use this PRM handler to get the DRAM address of an error, and include
the address in the EDAC "other_detail" field.

[Yazen: Reword commit message and reformat other_detail string]

Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
Co-developed-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
---
 drivers/edac/amd64_edac.c | 27 ++++++++++++++++++++++++++-
 drivers/edac/amd64_edac.h |  1 +
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 8908ab881c85..42acda4cfd59 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2704,11 +2704,16 @@ static int get_channel_from_ecc_syndrome(struct mem_ctl_info *mci, u16 syndrome)
 	return map_err_sym_to_channel(err_sym, pvt->ecc_sym_sz);
 }
 
+#define MSG_SIZE 512
+
 static void __log_ecc_error(struct mem_ctl_info *mci, struct err_info *err,
 			    u8 ecc_type)
 {
 	enum hw_event_mc_err_type err_type;
 	const char *string;
+	char s[MSG_SIZE];
+
+	memset(s, 0, sizeof(s));
 
 	if (ecc_type == 2)
 		err_type = HW_EVENT_ERR_CORRECTED;
@@ -2723,6 +2728,21 @@ static void __log_ecc_error(struct mem_ctl_info *mci, struct err_info *err,
 
 	switch (err->err_code) {
 	case DECODE_OK:
+		if (err->dram_addr) {
+			struct atl_dram_addr *da = err->dram_addr;
+			char *p = s, *end = p + sizeof(s);
+
+			/* Include a version prefix in case the format needs to change later. */
+			p += scnprintf(p, end - p, " [AMDv1]");
+			p += scnprintf(p, end - p, " %s:0x%x", "ChipSelect",	da->chip_select);
+			p += scnprintf(p, end - p, " %s:0x%x", "Row",		da->row_addr);
+			p += scnprintf(p, end - p, " %s:0x%x", "Column",	da->col_addr);
+			p += scnprintf(p, end - p, " %s:0x%x", "Bank",		da->bank_addr);
+			p += scnprintf(p, end - p, " %s:0x%x", "BankGroup",	da->bank_group);
+			p += scnprintf(p, end - p, " %s:0x%x", "RankMul",	da->rank_mul);
+			p += scnprintf(p, end - p, " %s:0x%x", "SubChannel",	da->sub_ch);
+		}
+
 		string = "";
 		break;
 	case ERR_NODE:
@@ -2748,7 +2768,7 @@ static void __log_ecc_error(struct mem_ctl_info *mci, struct err_info *err,
 	edac_mc_handle_error(err_type, mci, 1,
 			     err->page, err->offset, err->syndrome,
 			     err->csrow, err->channel, -1,
-			     string, "");
+			     string, s);
 }
 
 static inline void decode_bus_error(int node_id, struct mce *m)
@@ -2808,6 +2828,7 @@ static void umc_get_err_info(struct mce *m, struct err_info *err)
 static void decode_umc_error(int node_id, struct mce *m)
 {
 	u8 ecc_type = (m->status >> 45) & 0x3;
+	struct atl_dram_addr dram_addr;
 	struct mem_ctl_info *mci;
 	unsigned long sys_addr;
 	struct amd64_pvt *pvt;
@@ -2822,6 +2843,7 @@ static void decode_umc_error(int node_id, struct mce *m)
 
 	pvt = mci->pvt_info;
 
+	memset(&dram_addr, 0, sizeof(dram_addr));
 	memset(&err, 0, sizeof(err));
 
 	if (m->status & MCI_STATUS_DEFERRED)
@@ -2853,6 +2875,9 @@ static void decode_umc_error(int node_id, struct mce *m)
 		goto log_error;
 	}
 
+	if (!amd_convert_umc_mca_addr_to_dram_addr(&a_err, &dram_addr))
+		err.dram_addr = &dram_addr;
+
 	error_address_to_page_and_offset(sys_addr, &err);
 
 log_error:
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index 1757c1b99fc8..e0ad1c0fc1c3 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -400,6 +400,7 @@ struct err_info {
 	u16 syndrome;
 	u32 page;
 	u32 offset;
+	struct atl_dram_addr *dram_addr;
 };
 
 static inline u32 get_umc_base(u8 channel)
-- 
2.53.0