Create cxl_do_recovery() to provide uncorrectable protocol error (UCE)
handling. It follows a design similar to the PCIe error driver's
pcie_do_recovery(). One difference is that cxl_do_recovery() will treat all
UCEs as fatal and panic the kernel. This is to prevent corruption of CXL memory.
Export the PCI error driver's merge_result() to the CXL namespace. Introduce
PCI_ERS_RESULT_PANIC and add support for it in the merge_result() routine.
This will be used by CXL to panic the system in the case of uncorrectable
protocol errors. PCI error handling is not currently expected to use
PCI_ERS_RESULT_PANIC.
Copy pci_walk_bridge() to cxl_walk_bridge(). Change it to always invoke the
callback on the first device (the bridge itself) before walking its
subordinate bus, if any.
Copy the PCI error driver's report_error_detected() to cxl_report_error_detected().
Note that only CXL Endpoints and RCH Downstream Ports (RCH DSP) are currently
supported. Take the PCI device lock, as is done in PCI's report_error_detected().
This is necessary to prevent the RAS registers from disappearing before
logging is completed.
Call panic() to halt the system in the case of uncorrectable errors (UCE)
in cxl_do_recovery(). Export pci_aer_clear_fatal_status() so that CXL can
use it when a UCE is not found. In that case the AER status must still be
cleared, which is done with pci_aer_clear_fatal_status().
Signed-off-by: Terry Bowman <terry.bowman@amd.com>
---
drivers/cxl/core/native_ras.c | 44 +++++++++++++++++++++++++++++++++++
drivers/pci/pcie/cxl_aer.c | 3 ++-
drivers/pci/pcie/err.c | 8 +++++--
include/linux/aer.h | 11 +++++++++
include/linux/pci.h | 3 +++
5 files changed, 66 insertions(+), 3 deletions(-)
diff --git a/drivers/cxl/core/native_ras.c b/drivers/cxl/core/native_ras.c
index 5bd79d5019e7..19f8f2ac8376 100644
--- a/drivers/cxl/core/native_ras.c
+++ b/drivers/cxl/core/native_ras.c
@@ -8,8 +8,52 @@
#include <core/core.h>
#include <cxlpci.h>
+static int cxl_report_error_detected(struct pci_dev *pdev, void *data)
+{
+ pci_ers_result_t vote, *result = data;
+
+ if ((pci_pcie_type(pdev) != PCI_EXP_TYPE_ENDPOINT) &&
+ (pci_pcie_type(pdev) != PCI_EXP_TYPE_RC_END))
+ return 0;
+
+ guard(device)(&pdev->dev);
+
+ vote = cxl_error_detected(pdev, pci_channel_io_frozen);
+ *result = merge_result(*result, vote);
+
+ return 0;
+}
+
+static void cxl_walk_bridge(struct pci_dev *bridge,
+ int (*cb)(struct pci_dev *, void *),
+ void *userdata)
+{
+ if (cb(bridge, userdata))
+ return;
+
+ if (bridge->subordinate)
+ pci_walk_bus(bridge->subordinate, cb, userdata);
+}
+
static void cxl_do_recovery(struct pci_dev *pdev)
{
+ pci_ers_result_t status = PCI_ERS_RESULT_CAN_RECOVER;
+
+ cxl_walk_bridge(pdev, cxl_report_error_detected, &status);
+ if (status == PCI_ERS_RESULT_PANIC)
+ panic("CXL cachemem error.");
+
+ /*
+ * If we have native control of AER, clear error status in the device
+ * that detected the error. If the platform retained control of AER,
+ * it is responsible for clearing this status. In that case, the
+ * signaling device may not even be visible to the OS.
+ */
+ if (cxl_error_is_native(pdev)) {
+ pcie_clear_device_status(pdev);
+ pci_aer_clear_nonfatal_status(pdev);
+ pci_aer_clear_fatal_status(pdev);
+ }
}
static bool is_cxl_rcd(struct pci_dev *pdev)
diff --git a/drivers/pci/pcie/cxl_aer.c b/drivers/pci/pcie/cxl_aer.c
index 939438a7161a..b238791b7101 100644
--- a/drivers/pci/pcie/cxl_aer.c
+++ b/drivers/pci/pcie/cxl_aer.c
@@ -52,12 +52,13 @@ static bool is_cxl_mem_dev(struct pci_dev *dev)
return true;
}
-static bool cxl_error_is_native(struct pci_dev *dev)
+bool cxl_error_is_native(struct pci_dev *dev)
{
struct pci_host_bridge *host = pci_find_host_bridge(dev->bus);
return (pcie_ports_native || host->native_aer);
}
+EXPORT_SYMBOL_NS_GPL(cxl_error_is_native, "CXL");
static bool is_internal_error(struct aer_err_info *info)
{
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index de6381c690f5..63fceb3e8613 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -21,9 +21,12 @@
#include "portdrv.h"
#include "../pci.h"
-static pci_ers_result_t merge_result(enum pci_ers_result orig,
- enum pci_ers_result new)
+pci_ers_result_t merge_result(enum pci_ers_result orig,
+ enum pci_ers_result new)
{
+ if (new == PCI_ERS_RESULT_PANIC)
+ return PCI_ERS_RESULT_PANIC;
+
if (new == PCI_ERS_RESULT_NO_AER_DRIVER)
return PCI_ERS_RESULT_NO_AER_DRIVER;
@@ -45,6 +48,7 @@ static pci_ers_result_t merge_result(enum pci_ers_result orig,
return orig;
}
+EXPORT_SYMBOL_NS_GPL(merge_result, "CXL");
static int report_error_detected(struct pci_dev *dev,
pci_channel_state_t state,
diff --git a/include/linux/aer.h b/include/linux/aer.h
index 0aafcc678e45..f14db635ef90 100644
--- a/include/linux/aer.h
+++ b/include/linux/aer.h
@@ -10,6 +10,7 @@
#include <linux/errno.h>
#include <linux/types.h>
+#include <linux/pci.h>
#include <linux/workqueue_types.h>
#define AER_NONFATAL 0
@@ -78,6 +79,8 @@ struct cxl_proto_err_work_data {
int pci_aer_clear_nonfatal_status(struct pci_dev *dev);
void pci_aer_clear_fatal_status(struct pci_dev *dev);
int pcie_aer_is_native(struct pci_dev *dev);
+pci_ers_result_t merge_result(enum pci_ers_result orig,
+ enum pci_ers_result new);
#else
static inline int pci_aer_clear_nonfatal_status(struct pci_dev *dev)
{
@@ -85,16 +88,24 @@ static inline int pci_aer_clear_nonfatal_status(struct pci_dev *dev)
}
static inline void pci_aer_clear_fatal_status(struct pci_dev *dev) { }
static inline int pcie_aer_is_native(struct pci_dev *dev) { return 0; }
+static inline pci_ers_result_t merge_result(enum pci_ers_result orig,
+ enum pci_ers_result new)
+{
+ return PCI_ERS_RESULT_NONE;
+}
+
#endif
#if defined(CONFIG_PCIEAER_CXL)
void cxl_register_proto_err_work(struct work_struct *work);
void cxl_unregister_proto_err_work(void);
int cxl_proto_err_kfifo_get(struct cxl_proto_err_work_data *wd);
+bool cxl_error_is_native(struct pci_dev *dev);
#else
static inline void cxl_register_proto_err_work(struct work_struct *work) { }
static inline void cxl_unregister_proto_err_work(void) { }
static inline int cxl_proto_err_kfifo_get(struct cxl_proto_err_work_data *wd) { return 0; }
+static inline bool cxl_error_is_native(struct pci_dev *dev) { return 0; }
#endif
void pci_print_aer(struct pci_dev *dev, int aer_severity,
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 79326358f641..16a8310e0373 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -868,6 +868,9 @@ enum pci_ers_result {
/* No AER capabilities registered for the driver */
PCI_ERS_RESULT_NO_AER_DRIVER = (__force pci_ers_result_t) 6,
+
+ /* System is unstable, panic. Is CXL specific */
+ PCI_ERS_RESULT_PANIC = (__force pci_ers_result_t) 7,
};
/* PCI bus error event callbacks */
--
2.34.1
>-----Original Message----- >From: Terry Bowman <terry.bowman@amd.com> >Sent: 26 June 2025 23:43 >To: dave@stgolabs.net; Jonathan Cameron <jonathan.cameron@huawei.com>; >dave.jiang@intel.com; alison.schofield@intel.com; dan.j.williams@intel.com; >bhelgaas@google.com; Shiju Jose <shiju.jose@huawei.com>; >ming.li@zohomail.com; Smita.KoralahalliChannabasappa@amd.com; >rrichter@amd.com; dan.carpenter@linaro.org; >PradeepVineshReddy.Kodamati@amd.com; lukas@wunner.de; >Benjamin.Cheatham@amd.com; >sathyanarayanan.kuppuswamy@linux.intel.com; terry.bowman@amd.com; >linux-cxl@vger.kernel.org >Cc: linux-kernel@vger.kernel.org; linux-pci@vger.kernel.org >Subject: [PATCH v10 07/17] CXL/PCI: Introduce CXL uncorrectable protocol error >recovery > >Create cxl_do_recovery() to provide uncorrectable protocol error (UCE) >handling. Follow similar design as found in PCIe error driver, >pcie_do_recovery(). One difference is cxl_do_recovery() will treat all UCEs as >fatal with a kernel panic. This is to prevent corruption on CXL memory. > >Export the PCI error driver's merge_result() to CXL namespace. Introduce >PCI_ERS_RESULT_PANIC and add support in merge_result() routine. This will be >used by CXL to panic the system in the case of uncorrectable protocol errors. PCI >error handling is not currently expected to use the PCI_ERS_RESULT_PANIC. > >Copy pci_walk_bridge() to cxl_walk_bridge(). Make a change to walk the first >device in all cases. > >Copy the PCI error driver's report_error_detected() to >cxl_report_error_detected(). >Note, only CXL Endpoints and RCH Downstream Ports(RCH DSP) are currently >supported. Add locking for PCI device as done in PCI's report_error_detected(). >This is necessary to prevent the RAS registers from disappearing before logging >is completed. > >Call panic() to halt the system in the case of uncorrectable errors (UCE) in >cxl_do_recovery(). Export pci_aer_clear_fatal_status() for CXL to use if a UCE is >not found. 
In this case the AER status must be cleared and uses >pci_aer_clear_fatal_status(). > >Signed-off-by: Terry Bowman <terry.bowman@amd.com> >--- > drivers/cxl/core/native_ras.c | 44 +++++++++++++++++++++++++++++++++++ > drivers/pci/pcie/cxl_aer.c | 3 ++- > drivers/pci/pcie/err.c | 8 +++++-- > include/linux/aer.h | 11 +++++++++ > include/linux/pci.h | 3 +++ > 5 files changed, 66 insertions(+), 3 deletions(-) > [...] > > void pci_print_aer(struct pci_dev *dev, int aer_severity, diff --git >a/include/linux/pci.h b/include/linux/pci.h index 79326358f641..16a8310e0373 >100644 >--- a/include/linux/pci.h >+++ b/include/linux/pci.h >@@ -868,6 +868,9 @@ enum pci_ers_result { > > /* No AER capabilities registered for the driver */ > PCI_ERS_RESULT_NO_AER_DRIVER = (__force pci_ers_result_t) 6, >+ >+ /* System is unstable, panic. Is CXL specific */ >+ PCI_ERS_RESULT_PANIC = (__force pci_ers_result_t) 7, Extra space is present after casting? > }; > > /* PCI bus error event callbacks */ >-- >2.34.1
On 6/27/2025 7:27 AM, Shiju Jose wrote: >> -----Original Message----- >> From: Terry Bowman <terry.bowman@amd.com> >> Sent: 26 June 2025 23:43 >> To: dave@stgolabs.net; Jonathan Cameron <jonathan.cameron@huawei.com>; >> dave.jiang@intel.com; alison.schofield@intel.com; dan.j.williams@intel.com; >> bhelgaas@google.com; Shiju Jose <shiju.jose@huawei.com>; >> ming.li@zohomail.com; Smita.KoralahalliChannabasappa@amd.com; >> rrichter@amd.com; dan.carpenter@linaro.org; >> PradeepVineshReddy.Kodamati@amd.com; lukas@wunner.de; >> Benjamin.Cheatham@amd.com; >> sathyanarayanan.kuppuswamy@linux.intel.com; terry.bowman@amd.com; >> linux-cxl@vger.kernel.org >> Cc: linux-kernel@vger.kernel.org; linux-pci@vger.kernel.org >> Subject: [PATCH v10 07/17] CXL/PCI: Introduce CXL uncorrectable protocol error >> recovery >> >> Create cxl_do_recovery() to provide uncorrectable protocol error (UCE) >> handling. Follow similar design as found in PCIe error driver, >> pcie_do_recovery(). One difference is cxl_do_recovery() will treat all UCEs as >> fatal with a kernel panic. This is to prevent corruption on CXL memory. >> >> Export the PCI error driver's merge_result() to CXL namespace. Introduce >> PCI_ERS_RESULT_PANIC and add support in merge_result() routine. This will be >> used by CXL to panic the system in the case of uncorrectable protocol errors. PCI >> error handling is not currently expected to use the PCI_ERS_RESULT_PANIC. >> >> Copy pci_walk_bridge() to cxl_walk_bridge(). Make a change to walk the first >> device in all cases. >> >> Copy the PCI error driver's report_error_detected() to >> cxl_report_error_detected(). >> Note, only CXL Endpoints and RCH Downstream Ports(RCH DSP) are currently >> supported. Add locking for PCI device as done in PCI's report_error_detected(). >> This is necessary to prevent the RAS registers from disappearing before logging >> is completed. >> >> Call panic() to halt the system in the case of uncorrectable errors (UCE) in >> cxl_do_recovery(). 
Export pci_aer_clear_fatal_status() for CXL to use if a UCE is >> not found. In this case the AER status must be cleared and uses >> pci_aer_clear_fatal_status(). >> >> Signed-off-by: Terry Bowman <terry.bowman@amd.com> >> --- >> drivers/cxl/core/native_ras.c | 44 +++++++++++++++++++++++++++++++++++ >> drivers/pci/pcie/cxl_aer.c | 3 ++- >> drivers/pci/pcie/err.c | 8 +++++-- >> include/linux/aer.h | 11 +++++++++ >> include/linux/pci.h | 3 +++ >> 5 files changed, 66 insertions(+), 3 deletions(-) >> > [...] >> void pci_print_aer(struct pci_dev *dev, int aer_severity, diff --git >> a/include/linux/pci.h b/include/linux/pci.h index 79326358f641..16a8310e0373 >> 100644 >> --- a/include/linux/pci.h >> +++ b/include/linux/pci.h >> @@ -868,6 +868,9 @@ enum pci_ers_result { >> >> /* No AER capabilities registered for the driver */ >> PCI_ERS_RESULT_NO_AER_DRIVER = (__force pci_ers_result_t) 6, >> + >> + /* System is unstable, panic. Is CXL specific */ >> + PCI_ERS_RESULT_PANIC = (__force pci_ers_result_t) 7, > Extra space is present after casting? >> }; Hi Shiju, I see the existing PCIE_ERS_RESULT entries have a space before the number. For example, PCI_ERS_RESULT_NO_AER_DRIVER = (__force pci_ers_result_t) 6, ^ I do see that I had an extra space in my comment that I will fix. Please let me know if you agree or if I'm missing something? -Terry >> >> /* PCI bus error event callbacks */ >> -- >> 2.34.1
On Thu, 26 Jun 2025 17:42:42 -0500 Terry Bowman <terry.bowman@amd.com> wrote: > Create cxl_do_recovery() to provide uncorrectable protocol error (UCE) > handling. Follow similar design as found in PCIe error driver, > pcie_do_recovery(). One difference is cxl_do_recovery() will treat all UCEs > as fatal with a kernel panic. This is to prevent corruption on CXL memory. > > Export the PCI error driver's merge_result() to CXL namespace. I think this may be a confusion from earlier review. Anyhow, it should be namespaced in the sense of not exporting something the vague name of merge_result but it's PCI code, not CXL code and we don't have the dangerous interface argument to justify putting it in the CXL namespace so I think a namespaced EXPORT makes little sense for this one. Jonathan > Introduce > PCI_ERS_RESULT_PANIC and add support in merge_result() routine. This will > be used by CXL to panic the system in the case of uncorrectable protocol > errors. PCI error handling is not currently expected to use the > PCI_ERS_RESULT_PANIC. > > Copy pci_walk_bridge() to cxl_walk_bridge(). Make a change to walk the > first device in all cases. > > Copy the PCI error driver's report_error_detected() to cxl_report_error_detected(). > Note, only CXL Endpoints and RCH Downstream Ports(RCH DSP) are currently > supported. Add locking for PCI device as done in PCI's report_error_detected(). > This is necessary to prevent the RAS registers from disappearing before > logging is completed. > > Call panic() to halt the system in the case of uncorrectable errors (UCE) > in cxl_do_recovery(). Export pci_aer_clear_fatal_status() for CXL to use > if a UCE is not found. In this case the AER status must be cleared and > uses pci_aer_clear_fatal_status(). 
> > Signed-off-by: Terry Bowman <terry.bowman@amd.com> > diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c > index de6381c690f5..63fceb3e8613 100644 > --- a/drivers/pci/pcie/err.c > +++ b/drivers/pci/pcie/err.c > @@ -21,9 +21,12 @@ > #include "portdrv.h" > #include "../pci.h" > > -static pci_ers_result_t merge_result(enum pci_ers_result orig, > - enum pci_ers_result new) > +pci_ers_result_t merge_result(enum pci_ers_result orig, > + enum pci_ers_result new) > { > + if (new == PCI_ERS_RESULT_PANIC) > + return PCI_ERS_RESULT_PANIC; > + > if (new == PCI_ERS_RESULT_NO_AER_DRIVER) > return PCI_ERS_RESULT_NO_AER_DRIVER; > > @@ -45,6 +48,7 @@ static pci_ers_result_t merge_result(enum pci_ers_result orig, > > return orig; > } > +EXPORT_SYMBOL_NS_GPL(merge_result, "CXL"); Do we care about namespacing this? I think not given it is PCIe code and hardly destructive for other drivers to mess with it if they like. I would namespace it in the sense of renaming it to make it clear it's about pci errors though. pci_ers_merge_result() perhaps? Do that as a percursor patch. > > static int report_error_detected(struct pci_dev *dev, > pci_channel_state_t state,
On 6/27/2025 6:05 AM, Jonathan Cameron wrote: > On Thu, 26 Jun 2025 17:42:42 -0500 > Terry Bowman <terry.bowman@amd.com> wrote: > >> Create cxl_do_recovery() to provide uncorrectable protocol error (UCE) >> handling. Follow similar design as found in PCIe error driver, >> pcie_do_recovery(). One difference is cxl_do_recovery() will treat all UCEs >> as fatal with a kernel panic. This is to prevent corruption on CXL memory. >> >> Export the PCI error driver's merge_result() to CXL namespace. > I think this may be a confusion from earlier review. Anyhow, it should > be namespaced in the sense of not exporting something the vague name of > merge_result but it's PCI code, not CXL code and we don't have the dangerous > interface argument to justify putting it in the CXL namespace so I think > a namespaced EXPORT makes little sense for this one. > > Jonathan > > >> Introduce >> PCI_ERS_RESULT_PANIC and add support in merge_result() routine. This will >> be used by CXL to panic the system in the case of uncorrectable protocol >> errors. PCI error handling is not currently expected to use the >> PCI_ERS_RESULT_PANIC. >> >> Copy pci_walk_bridge() to cxl_walk_bridge(). Make a change to walk the >> first device in all cases. >> >> Copy the PCI error driver's report_error_detected() to cxl_report_error_detected(). >> Note, only CXL Endpoints and RCH Downstream Ports(RCH DSP) are currently >> supported. Add locking for PCI device as done in PCI's report_error_detected(). >> This is necessary to prevent the RAS registers from disappearing before >> logging is completed. >> >> Call panic() to halt the system in the case of uncorrectable errors (UCE) >> in cxl_do_recovery(). Export pci_aer_clear_fatal_status() for CXL to use >> if a UCE is not found. In this case the AER status must be cleared and >> uses pci_aer_clear_fatal_status(). 
>> >> Signed-off-by: Terry Bowman <terry.bowman@amd.com> > >> diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c >> index de6381c690f5..63fceb3e8613 100644 >> --- a/drivers/pci/pcie/err.c >> +++ b/drivers/pci/pcie/err.c >> @@ -21,9 +21,12 @@ >> #include "portdrv.h" >> #include "../pci.h" >> >> -static pci_ers_result_t merge_result(enum pci_ers_result orig, >> - enum pci_ers_result new) >> +pci_ers_result_t merge_result(enum pci_ers_result orig, >> + enum pci_ers_result new) >> { >> + if (new == PCI_ERS_RESULT_PANIC) >> + return PCI_ERS_RESULT_PANIC; >> + >> if (new == PCI_ERS_RESULT_NO_AER_DRIVER) >> return PCI_ERS_RESULT_NO_AER_DRIVER; >> >> @@ -45,6 +48,7 @@ static pci_ers_result_t merge_result(enum pci_ers_result orig, >> >> return orig; >> } >> +EXPORT_SYMBOL_NS_GPL(merge_result, "CXL"); > Do we care about namespacing this? I think not given it is PCIe code > and hardly destructive for other drivers to mess with it if they like. > > I would namespace it in the sense of renaming it to make it clear > it's about pci errors though. > > pci_ers_merge_result() perhaps? > > Do that as a percursor patch. > Good idea. There is a lot of changes related to just exporting this and changing the name. I've changed the namespace export to be: EXPORT_SYMBOL(pci_ers_merge_result); I moved this and its related required changes into an earlier patch. -Terry >> >> static int report_error_detected(struct pci_dev *dev, >> pci_channel_state_t state,
© 2016 - 2025 Red Hat, Inc.