[PATCH 6/6 v6] ACPI: extlog: Trace CPER CXL Protocol Error Section

Fabio M. De Francesco posted 6 patches 3 months, 2 weeks ago
There is a newer version of this series
[PATCH 6/6 v6] ACPI: extlog: Trace CPER CXL Protocol Error Section
Posted by Fabio M. De Francesco 3 months, 2 weeks ago
When Firmware First is enabled, BIOS handles errors first and then it makes
them available to the kernel via the Common Platform Error Record (CPER)
sections (UEFI 2.10 Appendix N). Linux parses the CPER sections via one of
two similar paths, either ELOG or GHES. The errors managed by ELOG are
signaled to the BIOS by the I/O Machine Check Architecture (I/O MCA).

Currently, ELOG and GHES show some inconsistencies in how they report to
userspace via trace events.

Therefore, make the two mentioned paths act similarly by tracing the CPER
CXL Protocol Error Section (UEFI v2.10, Appendix N.2.13).

Cc: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Signed-off-by: Fabio M. De Francesco <fabio.m.de.francesco@linux.intel.com>
---
 drivers/acpi/acpi_extlog.c | 22 ++++++++++++++++++++++
 drivers/cxl/core/ras.c     |  6 ++++++
 include/cxl/event.h        |  2 ++
 3 files changed, 30 insertions(+)

diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c
index cefe8d2d8aff..b005918517d1 100644
--- a/drivers/acpi/acpi_extlog.c
+++ b/drivers/acpi/acpi_extlog.c
@@ -12,6 +12,7 @@
 #include <linux/ratelimit.h>
 #include <linux/edac.h>
 #include <linux/ras.h>
+#include <cxl/event.h>
 #include <acpi/ghes.h>
 #include <asm/cpu.h>
 #include <asm/mce.h>
@@ -160,6 +161,21 @@ static void extlog_print_pcie(struct cper_sec_pcie *pcie_err,
 	pci_dev_put(pdev);
 }
 
+static void
+extlog_cxl_cper_handle_prot_err(struct cxl_cper_sec_prot_err *prot_err,
+				int severity)
+{
+	struct cxl_cper_prot_err_work_data wd;
+
+	if (cxl_cper_sec_prot_err_valid(prot_err))
+		return;
+
+	if (cxl_cper_setup_prot_err_work_data(&wd, prot_err, severity))
+		return;
+
+	cxl_cper_ras_handle_prot_err(&wd);
+}
+
 static int extlog_print(struct notifier_block *nb, unsigned long val,
 			void *data)
 {
@@ -211,6 +227,12 @@ static int extlog_print(struct notifier_block *nb, unsigned long val,
 			if (gdata->error_data_length >= sizeof(*mem))
 				trace_extlog_mem_event(mem, err_seq, fru_id, fru_text,
 						       (u8)gdata->error_severity);
+		} else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
+			struct cxl_cper_sec_prot_err *prot_err =
+				acpi_hest_get_payload(gdata);
+
+			extlog_cxl_cper_handle_prot_err(prot_err,
+							gdata->error_severity);
 		} else if (guid_equal(sec_type, &CPER_SEC_PCIE)) {
 			struct cper_sec_pcie *pcie_err = acpi_hest_get_payload(gdata);
 
diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
index 2731ba3a0799..3f527b0c6509 100644
--- a/drivers/cxl/core/ras.c
+++ b/drivers/cxl/core/ras.c
@@ -105,6 +105,12 @@ static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
 		cxl_cper_trace_uncorr_prot_err(cxlmd, data->ras_cap);
 }
 
+void cxl_cper_ras_handle_prot_err(struct cxl_cper_prot_err_work_data *wd)
+{
+	cxl_cper_handle_prot_err(wd);
+}
+EXPORT_SYMBOL_GPL(cxl_cper_ras_handle_prot_err);
+
 static void cxl_cper_prot_err_work_fn(struct work_struct *work)
 {
 	struct cxl_cper_prot_err_work_data wd;
diff --git a/include/cxl/event.h b/include/cxl/event.h
index 94081aec597a..a37eef112411 100644
--- a/include/cxl/event.h
+++ b/include/cxl/event.h
@@ -340,4 +340,6 @@ cxl_cper_setup_prot_err_work_data(struct cxl_cper_prot_err_work_data *wd,
 }
 #endif
 
+void cxl_cper_ras_handle_prot_err(struct cxl_cper_prot_err_work_data *wd);
+
 #endif /* _LINUX_CXL_EVENT_H */
-- 
2.51.0
Re: [PATCH 6/6 v6] ACPI: extlog: Trace CPER CXL Protocol Error Section
Posted by Jonathan Cameron 3 months, 1 week ago
On Thu, 23 Oct 2025 14:25:41 +0200
"Fabio M. De Francesco" <fabio.m.de.francesco@linux.intel.com> wrote:

> When Firmware First is enabled, BIOS handles errors first and then it makes
> them available to the kernel via the Common Platform Error Record (CPER)
> sections (UEFI 2.10 Appendix N). Linux parses the CPER sections via one of
> two similar paths, either ELOG or GHES. The errors managed by ELOG are
> signaled to the BIOS by the I/O Machine Check Architecture (I/O MCA).
> 
> Currently, ELOG and GHES show some inconsistencies in how they report to
> userspace via trace events.
> 
> Therefore, make the two mentioned paths act similarly by tracing the CPER
> CXL Protocol Error Section (UEFI v2.10, Appendix N.2.13).
> 
> Cc: Dan Williams <dan.j.williams@intel.com>
> Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
> Signed-off-by: Fabio M. De Francesco <fabio.m.de.francesco@linux.intel.com>

Just one small question.   With that addressed, 
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>

> diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
> index 2731ba3a0799..3f527b0c6509 100644
> --- a/drivers/cxl/core/ras.c
> +++ b/drivers/cxl/core/ras.c
> @@ -105,6 +105,12 @@ static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
>  		cxl_cper_trace_uncorr_prot_err(cxlmd, data->ras_cap);
>  }
>  
> +void cxl_cper_ras_handle_prot_err(struct cxl_cper_prot_err_work_data *wd)

Why do we need this wrapper? Its name is a bit more general, so if you
do need it, why not just rename cxl_cper_handle_prot_err() instead?

> +{
> +	cxl_cper_handle_prot_err(wd);
> +}
> +EXPORT_SYMBOL_GPL(cxl_cper_ras_handle_prot_err);
> +
>  static void cxl_cper_prot_err_work_fn(struct work_struct *work)
>  {
>  	struct cxl_cper_prot_err_work_data wd;
> diff --git a/include/cxl/event.h b/include/cxl/event.h
> index 94081aec597a..a37eef112411 100644
> --- a/include/cxl/event.h
> +++ b/include/cxl/event.h
> @@ -340,4 +340,6 @@ cxl_cper_setup_prot_err_work_data(struct cxl_cper_prot_err_work_data *wd,
>  }
>  #endif
>  
> +void cxl_cper_ras_handle_prot_err(struct cxl_cper_prot_err_work_data *wd);
> +
>  #endif /* _LINUX_CXL_EVENT_H */
Re: [PATCH 6/6 v6] ACPI: extlog: Trace CPER CXL Protocol Error Section
Posted by Fabio M. De Francesco 3 months ago
On Tuesday, October 28, 2025 4:06:09 PM Central European Standard Time Jonathan Cameron wrote:
> On Thu, 23 Oct 2025 14:25:41 +0200
> "Fabio M. De Francesco" <fabio.m.de.francesco@linux.intel.com> wrote:
> 
> > When Firmware First is enabled, BIOS handles errors first and then it makes
> > them available to the kernel via the Common Platform Error Record (CPER)
> > sections (UEFI 2.10 Appendix N). Linux parses the CPER sections via one of
> > two similar paths, either ELOG or GHES. The errors managed by ELOG are
> > signaled to the BIOS by the I/O Machine Check Architecture (I/O MCA).
> > 
> > Currently, ELOG and GHES show some inconsistencies in how they report to
> > userspace via trace events.
> > 
> > Therefore, make the two mentioned paths act similarly by tracing the CPER
> > CXL Protocol Error Section (UEFI v2.10, Appendix N.2.13).
> > 
> > Cc: Dan Williams <dan.j.williams@intel.com>
> > Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
> > Signed-off-by: Fabio M. De Francesco <fabio.m.de.francesco@linux.intel.com>
> 
> Just one small question.   With that addressed, 
> Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
> 
> > diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
> > index 2731ba3a0799..3f527b0c6509 100644
> > --- a/drivers/cxl/core/ras.c
> > +++ b/drivers/cxl/core/ras.c
> > @@ -105,6 +105,12 @@ static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
> >  		cxl_cper_trace_uncorr_prot_err(cxlmd, data->ras_cap);
> >  }
> >  
> > +void cxl_cper_ras_handle_prot_err(struct cxl_cper_prot_err_work_data *wd)
> 
> Why do we need this wrapper?  The name is a bit more general, so if you
> do need it, then why not instead just rename cxl_cper_handle_prot_err()
> 
Actually, on second thought, I believe that we need neither this wrapper
nor a rename of cxl_cper_handle_prot_err(). I'll export the latter as it
is.

Fabio
> > +{
> > +	cxl_cper_handle_prot_err(wd);
> > +}
> > +EXPORT_SYMBOL_GPL(cxl_cper_ras_handle_prot_err);
> > +
> >  static void cxl_cper_prot_err_work_fn(struct work_struct *work)
> >  {
> >  	struct cxl_cper_prot_err_work_data wd;
> > diff --git a/include/cxl/event.h b/include/cxl/event.h
> > index 94081aec597a..a37eef112411 100644
> > --- a/include/cxl/event.h
> > +++ b/include/cxl/event.h
> > @@ -340,4 +340,6 @@ cxl_cper_setup_prot_err_work_data(struct cxl_cper_prot_err_work_data *wd,
> >  }
> >  #endif
> >  
> > +void cxl_cper_ras_handle_prot_err(struct cxl_cper_prot_err_work_data *wd);
> > +
> >  #endif /* _LINUX_CXL_EVENT_H */
> 
>