The CXL AER error handling logic currently resides in the AER driver file,
drivers/pci/pcie/aer.c. CXL specific changes are conditionally compiled
using #ifdefs.
Improve the AER driver maintainability by separating the CXL specific logic
from the AER driver's core functionality and removing the #ifdefs.
Introduce drivers/pci/pcie/cxl_aer.c and move the CXL AER logic into the
new file.
Update the makefile to conditionally compile the CXL file using the
existing CONFIG_PCIEAER_CXL Kconfig.
Signed-off-by: Terry Bowman <terry.bowman@amd.com>
---
drivers/pci/pci.h | 8 +++
drivers/pci/pcie/Makefile | 1 +
drivers/pci/pcie/aer.c | 138 -------------------------------------
drivers/pci/pcie/cxl_aer.c | 138 +++++++++++++++++++++++++++++++++++++
include/linux/pci_ids.h | 2 +
5 files changed, 149 insertions(+), 138 deletions(-)
create mode 100644 drivers/pci/pcie/cxl_aer.c
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index a0d1e59b5666..91b583cf18eb 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -1029,6 +1029,14 @@ static inline void pci_save_aer_state(struct pci_dev *dev) { }
static inline void pci_restore_aer_state(struct pci_dev *dev) { }
#endif
+#ifdef CONFIG_PCIEAER_CXL
+void cxl_rch_handle_error(struct pci_dev *dev, struct aer_err_info *info);
+void cxl_rch_enable_rcec(struct pci_dev *rcec);
+#else
+static inline void cxl_rch_handle_error(struct pci_dev *dev, struct aer_err_info *info) { }
+static inline void cxl_rch_enable_rcec(struct pci_dev *rcec) { }
+#endif
+
#ifdef CONFIG_ACPI
bool pci_acpi_preserve_config(struct pci_host_bridge *bridge);
int pci_acpi_program_hp_params(struct pci_dev *dev);
diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile
index 173829aa02e6..cd2cb925dbd5 100644
--- a/drivers/pci/pcie/Makefile
+++ b/drivers/pci/pcie/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_PCIEPORTBUS) += pcieportdrv.o bwctrl.o
obj-y += aspm.o
obj-$(CONFIG_PCIEAER) += aer.o err.o tlp.o
+obj-$(CONFIG_PCIEAER_CXL) += cxl_aer.o
obj-$(CONFIG_PCIEAER_INJECT) += aer_inject.o
obj-$(CONFIG_PCIE_PME) += pme.o
obj-$(CONFIG_PCIE_DPC) += dpc.o
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index a2df9456595a..0b4d721980ef 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -1094,144 +1094,6 @@ static bool find_source_device(struct pci_dev *parent,
return true;
}
-#ifdef CONFIG_PCIEAER_CXL
-
-/**
- * pci_aer_unmask_internal_errors - unmask internal errors
- * @dev: pointer to the pci_dev data structure
- *
- * Unmask internal errors in the Uncorrectable and Correctable Error
- * Mask registers.
- *
- * Note: AER must be enabled and supported by the device which must be
- * checked in advance, e.g. with pcie_aer_is_native().
- */
-static void pci_aer_unmask_internal_errors(struct pci_dev *dev)
-{
- int aer = dev->aer_cap;
- u32 mask;
-
- pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, &mask);
- mask &= ~PCI_ERR_UNC_INTN;
- pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, mask);
-
- pci_read_config_dword(dev, aer + PCI_ERR_COR_MASK, &mask);
- mask &= ~PCI_ERR_COR_INTERNAL;
- pci_write_config_dword(dev, aer + PCI_ERR_COR_MASK, mask);
-}
-
-static bool is_cxl_mem_dev(struct pci_dev *dev)
-{
- /*
- * The capability, status, and control fields in Device 0,
- * Function 0 DVSEC control the CXL functionality of the
- * entire device (CXL 3.0, 8.1.3).
- */
- if (dev->devfn != PCI_DEVFN(0, 0))
- return false;
-
- /*
- * CXL Memory Devices must have the 502h class code set (CXL
- * 3.0, 8.1.12.1).
- */
- if ((dev->class >> 8) != PCI_CLASS_MEMORY_CXL)
- return false;
-
- return true;
-}
-
-static bool cxl_error_is_native(struct pci_dev *dev)
-{
- struct pci_host_bridge *host = pci_find_host_bridge(dev->bus);
-
- return (pcie_ports_native || host->native_aer);
-}
-
-static bool is_internal_error(struct aer_err_info *info)
-{
- if (info->severity == AER_CORRECTABLE)
- return info->status & PCI_ERR_COR_INTERNAL;
-
- return info->status & PCI_ERR_UNC_INTN;
-}
-
-static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data)
-{
- struct aer_err_info *info = (struct aer_err_info *)data;
- const struct pci_error_handlers *err_handler;
-
- if (!is_cxl_mem_dev(dev) || !cxl_error_is_native(dev))
- return 0;
-
- /* Protect dev->driver */
- device_lock(&dev->dev);
-
- err_handler = dev->driver ? dev->driver->err_handler : NULL;
- if (!err_handler)
- goto out;
-
- if (info->severity == AER_CORRECTABLE) {
- if (err_handler->cor_error_detected)
- err_handler->cor_error_detected(dev);
- } else if (err_handler->error_detected) {
- if (info->severity == AER_NONFATAL)
- err_handler->error_detected(dev, pci_channel_io_normal);
- else if (info->severity == AER_FATAL)
- err_handler->error_detected(dev, pci_channel_io_frozen);
- }
-out:
- device_unlock(&dev->dev);
- return 0;
-}
-
-static void cxl_rch_handle_error(struct pci_dev *dev, struct aer_err_info *info)
-{
- /*
- * Internal errors of an RCEC indicate an AER error in an
- * RCH's downstream port. Check and handle them in the CXL.mem
- * device driver.
- */
- if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC &&
- is_internal_error(info))
- pcie_walk_rcec(dev, cxl_rch_handle_error_iter, info);
-}
-
-static int handles_cxl_error_iter(struct pci_dev *dev, void *data)
-{
- bool *handles_cxl = data;
-
- if (!*handles_cxl)
- *handles_cxl = is_cxl_mem_dev(dev) && cxl_error_is_native(dev);
-
- /* Non-zero terminates iteration */
- return *handles_cxl;
-}
-
-static bool handles_cxl_errors(struct pci_dev *rcec)
-{
- bool handles_cxl = false;
-
- if (pci_pcie_type(rcec) == PCI_EXP_TYPE_RC_EC &&
- pcie_aer_is_native(rcec))
- pcie_walk_rcec(rcec, handles_cxl_error_iter, &handles_cxl);
-
- return handles_cxl;
-}
-
-static void cxl_rch_enable_rcec(struct pci_dev *rcec)
-{
- if (!handles_cxl_errors(rcec))
- return;
-
- pci_aer_unmask_internal_errors(rcec);
- pci_info(rcec, "CXL: Internal errors unmasked");
-}
-
-#else
-static inline void cxl_rch_enable_rcec(struct pci_dev *dev) { }
-static inline void cxl_rch_handle_error(struct pci_dev *dev,
- struct aer_err_info *info) { }
-#endif
/**
* pci_aer_handle_error - handle logging error into an event log
diff --git a/drivers/pci/pcie/cxl_aer.c b/drivers/pci/pcie/cxl_aer.c
new file mode 100644
index 000000000000..b2ea14f70055
--- /dev/null
+++ b/drivers/pci/pcie/cxl_aer.c
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2025 AMD Corporation. All rights reserved. */
+
+#include <linux/pci.h>
+#include <linux/aer.h>
+#include "../pci.h"
+
+/**
+ * pci_aer_unmask_internal_errors - unmask internal errors
+ * @dev: pointer to the pci_dev data structure
+ *
+ * Unmask internal errors in the Uncorrectable and Correctable Error
+ * Mask registers.
+ *
+ * Note: AER must be enabled and supported by the device which must be
+ * checked in advance, e.g. with pcie_aer_is_native().
+ */
+static void pci_aer_unmask_internal_errors(struct pci_dev *dev)
+{
+ int aer = dev->aer_cap;
+ u32 mask;
+
+ pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, &mask);
+ mask &= ~PCI_ERR_UNC_INTN;
+ pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, mask);
+
+ pci_read_config_dword(dev, aer + PCI_ERR_COR_MASK, &mask);
+ mask &= ~PCI_ERR_COR_INTERNAL;
+ pci_write_config_dword(dev, aer + PCI_ERR_COR_MASK, mask);
+}
+
+static bool is_cxl_mem_dev(struct pci_dev *dev)
+{
+ /*
+ * The capability, status, and control fields in Device 0,
+ * Function 0 DVSEC control the CXL functionality of the
+ * entire device (CXL 3.2, 8.1.3).
+ */
+ if (dev->devfn != PCI_DEVFN(0, 0))
+ return false;
+
+ /*
+ * CXL Memory Devices must have the 502h class code set (CXL
+ * 3.2, 8.1.12.1).
+ */
+ if (FIELD_GET(PCI_CLASS_CODE_MASK, dev->class) != PCI_CLASS_MEMORY_CXL)
+ return false;
+
+ return true;
+}
+
+static bool cxl_error_is_native(struct pci_dev *dev)
+{
+ struct pci_host_bridge *host = pci_find_host_bridge(dev->bus);
+
+ return (pcie_ports_native || host->native_aer);
+}
+
+static bool is_internal_error(struct aer_err_info *info)
+{
+ if (info->severity == AER_CORRECTABLE)
+ return info->status & PCI_ERR_COR_INTERNAL;
+
+ return info->status & PCI_ERR_UNC_INTN;
+}
+
+static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data)
+{
+ struct aer_err_info *info = (struct aer_err_info *)data;
+ const struct pci_error_handlers *err_handler;
+
+ if (!is_cxl_mem_dev(dev) || !cxl_error_is_native(dev))
+ return 0;
+
+ /* Protect dev->driver */
+ device_lock(&dev->dev);
+
+ err_handler = dev->driver ? dev->driver->err_handler : NULL;
+ if (!err_handler)
+ goto out;
+
+ if (info->severity == AER_CORRECTABLE) {
+ if (err_handler->cor_error_detected)
+ err_handler->cor_error_detected(dev);
+ } else if (err_handler->error_detected) {
+ if (info->severity == AER_NONFATAL)
+ err_handler->error_detected(dev, pci_channel_io_normal);
+ else if (info->severity == AER_FATAL)
+ err_handler->error_detected(dev, pci_channel_io_frozen);
+ }
+out:
+ device_unlock(&dev->dev);
+ return 0;
+}
+
+void cxl_rch_handle_error(struct pci_dev *dev, struct aer_err_info *info)
+{
+ /*
+ * Internal errors of an RCEC indicate an AER error in an
+ * RCH's downstream port. Check and handle them in the CXL.mem
+ * device driver.
+ */
+ if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC &&
+ is_internal_error(info))
+ pcie_walk_rcec(dev, cxl_rch_handle_error_iter, info);
+}
+
+static int handles_cxl_error_iter(struct pci_dev *dev, void *data)
+{
+ bool *handles_cxl = data;
+
+ if (!*handles_cxl)
+ *handles_cxl = is_cxl_mem_dev(dev) && cxl_error_is_native(dev);
+
+ /* Non-zero terminates iteration */
+ return *handles_cxl;
+}
+
+static bool handles_cxl_errors(struct pci_dev *rcec)
+{
+ bool handles_cxl = false;
+
+ if (pci_pcie_type(rcec) == PCI_EXP_TYPE_RC_EC &&
+ pcie_aer_is_native(rcec))
+ pcie_walk_rcec(rcec, handles_cxl_error_iter, &handles_cxl);
+
+ return handles_cxl;
+}
+
+void cxl_rch_enable_rcec(struct pci_dev *rcec)
+{
+ if (!handles_cxl_errors(rcec))
+ return;
+
+ pci_aer_unmask_internal_errors(rcec);
+ pci_info(rcec, "CXL: Internal errors unmasked");
+}
+
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index e2d71b6fdd84..31b3935bf189 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -12,6 +12,8 @@
/* Device classes and subclasses */
+#define PCI_CLASS_CODE_MASK 0xFFFF00
+
#define PCI_CLASS_NOT_DEFINED 0x0000
#define PCI_CLASS_NOT_DEFINED_VGA 0x0001
--
2.34.1
Terry Bowman wrote: > The CXL AER error handling logic currently resides in the AER driver file, > drivers/pci/pcie/aer.c. CXL specific changes are conditionally compiled > using #ifdefs. > > Improve the AER driver maintainability by separating the CXL specific logic > from the AER driver's core functionality and removing the #ifdefs. > Introduce drivers/pci/pcie/cxl_aer.c and move the CXL AER logic into the > new file. > > Update the makefile to conditionally compile the CXL file using the > existing CONFIG_PCIEAER_CXL Kconfig. > > Signed-off-by: Terry Bowman <terry.bowman@amd.com> > --- After reading patch5 I want to qualify my Reviewed-by:... > drivers/pci/pci.h | 8 +++ > drivers/pci/pcie/Makefile | 1 + > drivers/pci/pcie/aer.c | 138 ------------------------------------- > drivers/pci/pcie/cxl_aer.c | 138 +++++++++++++++++++++++++++++++++++++ This is a poor name for this file because the functionality only relates to code that supports a dead-end generation of RCH / RCD hardware platforms. I do agree that it should be removed from aer.c so typical PCIe AER maintenance does not need to trip over that cruft. Please call it something like rch_aer.c so it is tucked out of the way, sticks out as odd in any future diffstat, and does not confuse from the CXL VH error handling that supports current and future generation hardware. Perhaps even move it to its own silent Kconfig symbol with a deprecation warning, something like below, so someone remembers to delete it. diff --git a/drivers/pci/pcie/Kconfig b/drivers/pci/pcie/Kconfig index 17919b99fa66..da88358bbb4f 100644 --- a/drivers/pci/pcie/Kconfig +++ b/drivers/pci/pcie/Kconfig @@ -58,6 +58,13 @@ config PCIEAER_CXL If unsure, say Y. +# Restricted CXL Host (RCH) error handling supports first generation CXL +# hardware and can be deprecated in 7-10 years when only CXL Virtual Host +# (CXL specification version 2+) hardware remains in service +config RCH_AER + def_bool y + depends on PCIEAER_CXL + # # PCI Express ECRC #
On 7/23/2025 8:16 PM, dan.j.williams@intel.com wrote: > Terry Bowman wrote: >> The CXL AER error handling logic currently resides in the AER driver file, >> drivers/pci/pcie/aer.c. CXL specific changes are conditionally compiled >> using #ifdefs. >> >> Improve the AER driver maintainability by separating the CXL specific logic >> from the AER driver's core functionality and removing the #ifdefs. >> Introduce drivers/pci/pcie/cxl_aer.c and move the CXL AER logic into the >> new file. >> >> Update the makefile to conditionally compile the CXL file using the >> existing CONFIG_PCIEAER_CXL Kconfig. >> >> Signed-off-by: Terry Bowman <terry.bowman@amd.com> >> --- > After reading patch5 I want to qualify my Reviewed-by:... > >> drivers/pci/pci.h | 8 +++ >> drivers/pci/pcie/Makefile | 1 + >> drivers/pci/pcie/aer.c | 138 ------------------------------------- >> drivers/pci/pcie/cxl_aer.c | 138 +++++++++++++++++++++++++++++++++++++ > This is a poor name for this file because the functionality only relates to > code that supports a dead-end generation of RCH / RCD hardware platforms. > > I do agree that it should be removed from aer.c so typical PCIe AER > maintenance does not need to trip over that cruft. > > Please call it something like rch_aer.c so it is tucked out of the way, > sticks out as odd in any future diffstat, and does not confuse from the > CXL VH error handling that supports current and future generation > hardware. > > Perhaps even move it to its own silent Kconfig symbol with a deprecation > warning, something like below, so someone remembers to delete it. cxl_rch_handle_error_iter() and cxl_rch_handle_error() need to be moved from pci/pcie/cxl_aer.c into cxl/core/native_ras.c introduced in this series. There is no RCH or VH handling in cxl_aer.c. cxl_aer.c serves to detect if an error is a CXL error and if it is then it forwards it to the CXL drivers using the kfifo introduced later. I will update the commit message stating more will be added later. Dave Jiang introduced cxl/core/pci_aer.c I understand the name is still up for possible change. The native_ras.c changes in this series is planned to be moved into cxl/core/pci_aer.c for v11. The files were created with the same purpose but we used different filenames and need to converge. Let me know if you still want the rename to rch_aer.c. -Terry > diff --git a/drivers/pci/pcie/Kconfig b/drivers/pci/pcie/Kconfig > index 17919b99fa66..da88358bbb4f 100644 > --- a/drivers/pci/pcie/Kconfig > +++ b/drivers/pci/pcie/Kconfig > @@ -58,6 +58,13 @@ config PCIEAER_CXL > > If unsure, say Y. > > +# Restricted CXL Host (RCH) error handling supports first generation CXL > +# hardware and can be deprecated in 7-10 years when only CXL Virtual Host > +# (CXL specification version 2+) hardware remains in service > +config RCH_AER > + def_bool y > + depends on PCIEAER_CXL > + > # > # PCI Express ECRC > #
Bowman, Terry wrote: > On 7/23/2025 8:16 PM, dan.j.williams@intel.com wrote: > > Terry Bowman wrote: > >> The CXL AER error handling logic currently resides in the AER driver file, > >> drivers/pci/pcie/aer.c. CXL specific changes are conditionally compiled > >> using #ifdefs. > >> > >> Improve the AER driver maintainability by separating the CXL specific logic > >> from the AER driver's core functionality and removing the #ifdefs. > >> Introduce drivers/pci/pcie/cxl_aer.c and move the CXL AER logic into the > >> new file. > >> > >> Update the makefile to conditionally compile the CXL file using the > >> existing CONFIG_PCIEAER_CXL Kconfig. > >> > >> Signed-off-by: Terry Bowman <terry.bowman@amd.com> > >> --- > > After reading patch5 I want to qualify my Reviewed-by:... > > > >> drivers/pci/pci.h | 8 +++ > >> drivers/pci/pcie/Makefile | 1 + > >> drivers/pci/pcie/aer.c | 138 ------------------------------------- > >> drivers/pci/pcie/cxl_aer.c | 138 +++++++++++++++++++++++++++++++++++++ > > This is a poor name for this file because the functionality only relates to > > code that supports a dead-end generation of RCH / RCD hardware platforms. > > > > I do agree that it should be removed from aer.c so typical PCIe AER > > maintenance does not need to trip over that cruft. > > > > Please call it something like rch_aer.c so it is tucked out of the way, > > sticks out as odd in any future diffstat, and does not confuse from the > > CXL VH error handling that supports current and future generation > > hardware. > > > > Perhaps even move it to its own silent Kconfig symbol with a deprecation > > warning, something like below, so someone remembers to delete it. > > cxl_rch_handle_error_iter() and cxl_rch_handle_error() need to be moved from pci/pcie/cxl_aer.c > into cxl/core/native_ras.c introduced in this series. There is no RCH or VH handling in cxl_aer.c. > cxl_aer.c serves to detect if an error is a CXL error and if it is then it forwards it to the > CXL drivers using the kfifo introduced later. I will update the commit message stating more > will be added later. Wait, this set moves the same function to a new file twice in the same set? I had not gotten that far along, but that's not acceptable. The reasons I had assumed that the rch bits would remain as a vestigial drivers/pci/pcie/rch_aer.c file to be cut from the kernel later are: - The goal of forwarding protocol errors to the cxl_core is that the cxl_core maintains a cxl_port hierarchy. For the RCH case there is no hierarchy and little to no value in being able disposition or decorate error reports with the cxl_port driver. - The RCH code requires a series of new PCI core exports for this one-off unfortunate mistake of history where the CXL specification tried way too hard to hide the presence of CXL. If this code is already on a deprecation path, that contraindicates new exports. > Dave Jiang introduced cxl/core/pci_aer.c I understand the name is still up for possible change. > The native_ras.c changes in this series is planned to be moved into cxl/core/pci_aer.c for v11. > The files were created with the same purpose but we used different filenames and need to converge. Why not put this stuff in the existing cxl/core/ras.c? I do expect that we want to route CPER reports to cxl_port objects at some point, so the "native" distinction is more confusing than beneficial as far as I can see.
Terry Bowman wrote: > The CXL AER error handling logic currently resides in the AER driver file, > drivers/pci/pcie/aer.c. CXL specific changes are conditionally compiled > using #ifdefs. > > Improve the AER driver maintainability by separating the CXL specific logic > from the AER driver's core functionality and removing the #ifdefs. > Introduce drivers/pci/pcie/cxl_aer.c and move the CXL AER logic into the > new file. > > Update the makefile to conditionally compile the CXL file using the > existing CONFIG_PCIEAER_CXL Kconfig. > > Signed-off-by: Terry Bowman <terry.bowman@amd.com> [..] > diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h > index e2d71b6fdd84..31b3935bf189 100644 > --- a/include/linux/pci_ids.h > +++ b/include/linux/pci_ids.h > @@ -12,6 +12,8 @@ > > /* Device classes and subclasses */ > > +#define PCI_CLASS_CODE_MASK 0xFFFF00 Per other comments do not add this updated in the same patch as the move. When / if you submit it separately it likely also belongs next to PCI_CLASS_REVISION in include/uapi/linux/pci_regs.h defined with __GENMASK(23, 8). Otherwise, with this change dropped you can add: Reviewed-by: Dan Williams <dan.j.williams@intel.com>
On 7/23/2025 7:01 PM, dan.j.williams@intel.com wrote: > Terry Bowman wrote: >> The CXL AER error handling logic currently resides in the AER driver file, >> drivers/pci/pcie/aer.c. CXL specific changes are conditionally compiled >> using #ifdefs. >> >> Improve the AER driver maintainability by separating the CXL specific logic >> from the AER driver's core functionality and removing the #ifdefs. >> Introduce drivers/pci/pcie/cxl_aer.c and move the CXL AER logic into the >> new file. >> >> Update the makefile to conditionally compile the CXL file using the >> existing CONFIG_PCIEAER_CXL Kconfig. >> >> Signed-off-by: Terry Bowman <terry.bowman@amd.com> > [..] >> diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h >> index e2d71b6fdd84..31b3935bf189 100644 >> --- a/include/linux/pci_ids.h >> +++ b/include/linux/pci_ids.h >> @@ -12,6 +12,8 @@ >> >> /* Device classes and subclasses */ >> >> +#define PCI_CLASS_CODE_MASK 0xFFFF00 > Per other comments do not add this updated in the same patch as the > move. > > When / if you submit it separately it likely also belongs next to > PCI_CLASS_REVISION in include/uapi/linux/pci_regs.h defined with > __GENMASK(23, 8). include/uapi/linux/pci_regs.h appears to use all values without using GENMASK(). Just adding as a note. I'm making the change. > Otherwise, with this change dropped you can add: > > Reviewed-by: Dan Williams <dan.j.williams@intel.com> Your next email pauses this. I'll respond there. -Terry
Bowman, Terry wrote: > > > On 7/23/2025 7:01 PM, dan.j.williams@intel.com wrote: > > Terry Bowman wrote: > >> The CXL AER error handling logic currently resides in the AER driver file, > >> drivers/pci/pcie/aer.c. CXL specific changes are conditionally compiled > >> using #ifdefs. > >> > >> Improve the AER driver maintainability by separating the CXL specific logic > >> from the AER driver's core functionality and removing the #ifdefs. > >> Introduce drivers/pci/pcie/cxl_aer.c and move the CXL AER logic into the > >> new file. > >> > >> Update the makefile to conditionally compile the CXL file using the > >> existing CONFIG_PCIEAER_CXL Kconfig. > >> > >> Signed-off-by: Terry Bowman <terry.bowman@amd.com> > > [..] > >> diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h > >> index e2d71b6fdd84..31b3935bf189 100644 > >> --- a/include/linux/pci_ids.h > >> +++ b/include/linux/pci_ids.h > >> @@ -12,6 +12,8 @@ > >> > >> /* Device classes and subclasses */ > >> > >> +#define PCI_CLASS_CODE_MASK 0xFFFF00 > > Per other comments do not add this updated in the same patch as the > > move. > > > > When / if you submit it separately it likely also belongs next to > > PCI_CLASS_REVISION in include/uapi/linux/pci_regs.h defined with > > __GENMASK(23, 8). > > include/uapi/linux/pci_regs.h appears to use all values without using GENMASK(). > Just adding as a note. I'm making the change. pci_regs.h is in include/uapi/. Historically that meant that it was unable to use GENMASK() from include/linux/. That changed "recently" (compared to the age of pci_regs.h) with commit: 3c7a8e190bc5 uapi: introduce uapi-friendly macros for GENMASK
On 6/26/25 3:42 PM, Terry Bowman wrote: > The CXL AER error handling logic currently resides in the AER driver file, > drivers/pci/pcie/aer.c. CXL specific changes are conditionally compiled > using #ifdefs. > > Improve the AER driver maintainability by separating the CXL specific logic > from the AER driver's core functionality and removing the #ifdefs. > Introduce drivers/pci/pcie/cxl_aer.c and move the CXL AER logic into the > new file. > > Update the makefile to conditionally compile the CXL file using the > existing CONFIG_PCIEAER_CXL Kconfig. > > Signed-off-by: Terry Bowman <terry.bowman@amd.com> > --- After moving the code , you seem to have updated it with your own changes. May be you split it into two patches. > drivers/pci/pci.h | 8 +++ > drivers/pci/pcie/Makefile | 1 + > drivers/pci/pcie/aer.c | 138 ------------------------------------- > drivers/pci/pcie/cxl_aer.c | 138 +++++++++++++++++++++++++++++++++++++ > include/linux/pci_ids.h | 2 + > 5 files changed, 149 insertions(+), 138 deletions(-) > create mode 100644 drivers/pci/pcie/cxl_aer.c > > diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h > index a0d1e59b5666..91b583cf18eb 100644 > --- a/drivers/pci/pci.h > +++ b/drivers/pci/pci.h > @@ -1029,6 +1029,14 @@ static inline void pci_save_aer_state(struct pci_dev *dev) { } > static inline void pci_restore_aer_state(struct pci_dev *dev) { } > #endif > > +#ifdef CONFIG_PCIEAER_CXL > +void cxl_rch_handle_error(struct pci_dev *dev, struct aer_err_info *info); > +void cxl_rch_enable_rcec(struct pci_dev *rcec); > +#else > +static inline void cxl_rch_handle_error(struct pci_dev *dev, struct aer_err_info *info) { } > +static inline void cxl_rch_enable_rcec(struct pci_dev *rcec) { } > +#endif > + > #ifdef CONFIG_ACPI > bool pci_acpi_preserve_config(struct pci_host_bridge *bridge); > int pci_acpi_program_hp_params(struct pci_dev *dev); > diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile > index 173829aa02e6..cd2cb925dbd5 100644 > --- a/drivers/pci/pcie/Makefile > +++ b/drivers/pci/pcie/Makefile > @@ -8,6 +8,7 @@ obj-$(CONFIG_PCIEPORTBUS) += pcieportdrv.o bwctrl.o > > obj-y += aspm.o > obj-$(CONFIG_PCIEAER) += aer.o err.o tlp.o > +obj-$(CONFIG_PCIEAER_CXL) += cxl_aer.o > obj-$(CONFIG_PCIEAER_INJECT) += aer_inject.o > obj-$(CONFIG_PCIE_PME) += pme.o > obj-$(CONFIG_PCIE_DPC) += dpc.o > diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c > index a2df9456595a..0b4d721980ef 100644 > --- a/drivers/pci/pcie/aer.c > +++ b/drivers/pci/pcie/aer.c > @@ -1094,144 +1094,6 @@ static bool find_source_device(struct pci_dev *parent, > return true; > } > > -#ifdef CONFIG_PCIEAER_CXL > - > -/** > - * pci_aer_unmask_internal_errors - unmask internal errors > - * @dev: pointer to the pci_dev data structure > - * > - * Unmask internal errors in the Uncorrectable and Correctable Error > - * Mask registers. > - * > - * Note: AER must be enabled and supported by the device which must be > - * checked in advance, e.g. with pcie_aer_is_native(). > - */ > -static void pci_aer_unmask_internal_errors(struct pci_dev *dev) > -{ > - int aer = dev->aer_cap; > - u32 mask; > - > - pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, &mask); > - mask &= ~PCI_ERR_UNC_INTN; > - pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, mask); > - > - pci_read_config_dword(dev, aer + PCI_ERR_COR_MASK, &mask); > - mask &= ~PCI_ERR_COR_INTERNAL; > - pci_write_config_dword(dev, aer + PCI_ERR_COR_MASK, mask); > -} > - > -static bool is_cxl_mem_dev(struct pci_dev *dev) > -{ > - /* > - * The capability, status, and control fields in Device 0, > - * Function 0 DVSEC control the CXL functionality of the > - * entire device (CXL 3.0, 8.1.3). > - */ > - if (dev->devfn != PCI_DEVFN(0, 0)) > - return false; > - > - /* > - * CXL Memory Devices must have the 502h class code set (CXL > - * 3.0, 8.1.12.1). > - */ > - if ((dev->class >> 8) != PCI_CLASS_MEMORY_CXL) > - return false; > - > - return true; > -} > - > -static bool cxl_error_is_native(struct pci_dev *dev) > -{ > - struct pci_host_bridge *host = pci_find_host_bridge(dev->bus); > - > - return (pcie_ports_native || host->native_aer); > -} > - > -static bool is_internal_error(struct aer_err_info *info) > -{ > - if (info->severity == AER_CORRECTABLE) > - return info->status & PCI_ERR_COR_INTERNAL; > - > - return info->status & PCI_ERR_UNC_INTN; > -} > - > -static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data) > -{ > - struct aer_err_info *info = (struct aer_err_info *)data; > - const struct pci_error_handlers *err_handler; > - > - if (!is_cxl_mem_dev(dev) || !cxl_error_is_native(dev)) > - return 0; > - > - /* Protect dev->driver */ > - device_lock(&dev->dev); > - > - err_handler = dev->driver ? dev->driver->err_handler : NULL; > - if (!err_handler) > - goto out; > - > - if (info->severity == AER_CORRECTABLE) { > - if (err_handler->cor_error_detected) > - err_handler->cor_error_detected(dev); > - } else if (err_handler->error_detected) { > - if (info->severity == AER_NONFATAL) > - err_handler->error_detected(dev, pci_channel_io_normal); > - else if (info->severity == AER_FATAL) > - err_handler->error_detected(dev, pci_channel_io_frozen); > - } > -out: > - device_unlock(&dev->dev); > - return 0; > -} > - > -static void cxl_rch_handle_error(struct pci_dev *dev, struct aer_err_info *info) > -{ > - /* > - * Internal errors of an RCEC indicate an AER error in an > - * RCH's downstream port. Check and handle them in the CXL.mem > - * device driver. > - */ > - if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC && > - is_internal_error(info)) > - pcie_walk_rcec(dev, cxl_rch_handle_error_iter, info); > -} > - > -static int handles_cxl_error_iter(struct pci_dev *dev, void *data) > -{ > - bool *handles_cxl = data; > - > - if (!*handles_cxl) > - *handles_cxl = is_cxl_mem_dev(dev) && cxl_error_is_native(dev); > - > - /* Non-zero terminates iteration */ > - return *handles_cxl; > -} > - > -static bool handles_cxl_errors(struct pci_dev *rcec) > -{ > - bool handles_cxl = false; > - > - if (pci_pcie_type(rcec) == PCI_EXP_TYPE_RC_EC && > - pcie_aer_is_native(rcec)) > - pcie_walk_rcec(rcec, handles_cxl_error_iter, &handles_cxl); > - > - return handles_cxl; > -} > - > -static void cxl_rch_enable_rcec(struct pci_dev *rcec) > -{ > - if (!handles_cxl_errors(rcec)) > - return; > - > - pci_aer_unmask_internal_errors(rcec); > - pci_info(rcec, "CXL: Internal errors unmasked"); > -} > - > -#else > -static inline void cxl_rch_enable_rcec(struct pci_dev *dev) { } > -static inline void cxl_rch_handle_error(struct pci_dev *dev, > - struct aer_err_info *info) { } > -#endif > > /** > * pci_aer_handle_error - handle logging error into an event log > diff --git a/drivers/pci/pcie/cxl_aer.c b/drivers/pci/pcie/cxl_aer.c > new file mode 100644 > index 000000000000..b2ea14f70055 > --- /dev/null > +++ b/drivers/pci/pcie/cxl_aer.c > @@ -0,0 +1,138 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +/* Copyright(c) 2025 AMD Corporation. All rights reserved. */ > + > +#include <linux/pci.h> > +#include <linux/aer.h> > +#include "../pci.h" > + > +/** > + * pci_aer_unmask_internal_errors - unmask internal errors > + * @dev: pointer to the pci_dev data structure > + * > + * Unmask internal errors in the Uncorrectable and Correctable Error > + * Mask registers. > + * > + * Note: AER must be enabled and supported by the device which must be > + * checked in advance, e.g. with pcie_aer_is_native(). > + */ > +static void pci_aer_unmask_internal_errors(struct pci_dev *dev) > +{ > + int aer = dev->aer_cap; > + u32 mask; > + > + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, &mask); > + mask &= ~PCI_ERR_UNC_INTN; > + pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, mask); > + > + pci_read_config_dword(dev, aer + PCI_ERR_COR_MASK, &mask); > + mask &= ~PCI_ERR_COR_INTERNAL; > + pci_write_config_dword(dev, aer + PCI_ERR_COR_MASK, mask); > +} > + > +static bool is_cxl_mem_dev(struct pci_dev *dev) > +{ > + /* > + * The capability, status, and control fields in Device 0, > + * Function 0 DVSEC control the CXL functionality of the > + * entire device (CXL 3.2, 8.1.3). > + */ > + if (dev->devfn != PCI_DEVFN(0, 0)) > + return false; > + > + /* > + * CXL Memory Devices must have the 502h class code set (CXL > + * 3.2, 8.1.12.1). > + */ > + if (FIELD_GET(PCI_CLASS_CODE_MASK, dev->class) != PCI_CLASS_MEMORY_CXL) > + return false; > + > + return true; > +} > + > +static bool cxl_error_is_native(struct pci_dev *dev) > +{ > + struct pci_host_bridge *host = pci_find_host_bridge(dev->bus); > + > + return (pcie_ports_native || host->native_aer); > +} > + > +static bool is_internal_error(struct aer_err_info *info) > +{ > + if (info->severity == AER_CORRECTABLE) > + return info->status & PCI_ERR_COR_INTERNAL; > + > + return info->status & PCI_ERR_UNC_INTN; > +} > + > +static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data) > +{ > + struct aer_err_info *info = (struct aer_err_info *)data; > + const struct pci_error_handlers *err_handler; > + > + if (!is_cxl_mem_dev(dev) || !cxl_error_is_native(dev)) > + return 0; > + > + /* Protect dev->driver */ > + device_lock(&dev->dev); > + > + err_handler = dev->driver ? dev->driver->err_handler : NULL; > + if (!err_handler) > + goto out; > + > + if (info->severity == AER_CORRECTABLE) { > + if (err_handler->cor_error_detected) > + err_handler->cor_error_detected(dev); > + } else if (err_handler->error_detected) { > + if (info->severity == AER_NONFATAL) > + err_handler->error_detected(dev, pci_channel_io_normal); > + else if (info->severity == AER_FATAL) > + err_handler->error_detected(dev, pci_channel_io_frozen); > + } > +out: > + device_unlock(&dev->dev); > + return 0; > +} > + > +void cxl_rch_handle_error(struct pci_dev *dev, struct aer_err_info *info) > +{ > + /* > + * Internal errors of an RCEC indicate an AER error in an > + * RCH's downstream port. Check and handle them in the CXL.mem > + * device driver. > + */ > + if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC && > + is_internal_error(info)) > + pcie_walk_rcec(dev, cxl_rch_handle_error_iter, info); > +} > + > +static int handles_cxl_error_iter(struct pci_dev *dev, void *data) > +{ > + bool *handles_cxl = data; > + > + if (!*handles_cxl) > + *handles_cxl = is_cxl_mem_dev(dev) && cxl_error_is_native(dev); > + > + /* Non-zero terminates iteration */ > + return *handles_cxl; > +} > + > +static bool handles_cxl_errors(struct pci_dev *rcec) > +{ > + bool handles_cxl = false; > + > + if (pci_pcie_type(rcec) == PCI_EXP_TYPE_RC_EC && > + pcie_aer_is_native(rcec)) > + pcie_walk_rcec(rcec, handles_cxl_error_iter, &handles_cxl); > + > + return handles_cxl; > +} > + > +void cxl_rch_enable_rcec(struct pci_dev *rcec) > +{ > + if (!handles_cxl_errors(rcec)) > + return; > + > + pci_aer_unmask_internal_errors(rcec); > + pci_info(rcec, "CXL: Internal errors unmasked"); > +} > + > diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h > index e2d71b6fdd84..31b3935bf189 100644 > --- a/include/linux/pci_ids.h > +++ b/include/linux/pci_ids.h > @@ -12,6 +12,8 @@ > > /* Device classes and subclasses */ > > +#define PCI_CLASS_CODE_MASK 0xFFFF00 > + > #define PCI_CLASS_NOT_DEFINED 0x0000 > #define PCI_CLASS_NOT_DEFINED_VGA 0x0001 > -- Sathyanarayanan Kuppuswamy Linux Kernel Developer
On 6/26/2025 6:42 PM, Sathyanarayanan Kuppuswamy wrote: > On 6/26/25 3:42 PM, Terry Bowman wrote: >> The CXL AER error handling logic currently resides in the AER driver file, >> drivers/pci/pcie/aer.c. CXL specific changes are conditionally compiled >> using #ifdefs. >> >> Improve the AER driver maintainability by separating the CXL specific logic >> from the AER driver's core functionality and removing the #ifdefs. >> Introduce drivers/pci/pcie/cxl_aer.c and move the CXL AER logic into the >> new file. >> >> Update the makefile to conditionally compile the CXL file using the >> existing CONFIG_PCIEAER_CXL Kconfig. >> >> Signed-off-by: Terry Bowman <terry.bowman@amd.com> >> --- > After moving the code , you seem to have updated it with your own > changes. May be you split it into two patches. Yes, I'll update to make a copy and add the changes in the following patch(es). -Terry >> drivers/pci/pci.h | 8 +++ >> drivers/pci/pcie/Makefile | 1 + >> drivers/pci/pcie/aer.c | 138 ------------------------------------- >> drivers/pci/pcie/cxl_aer.c | 138 +++++++++++++++++++++++++++++++++++++ >> include/linux/pci_ids.h | 2 + >> 5 files changed, 149 insertions(+), 138 deletions(-) >> create mode 100644 drivers/pci/pcie/cxl_aer.c >> >> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h >> index a0d1e59b5666..91b583cf18eb 100644 >> --- a/drivers/pci/pci.h >> +++ b/drivers/pci/pci.h >> @@ -1029,6 +1029,14 @@ static inline void pci_save_aer_state(struct pci_dev *dev) { } >> static inline void pci_restore_aer_state(struct pci_dev *dev) { } >> #endif >> >> +#ifdef CONFIG_PCIEAER_CXL >> +void cxl_rch_handle_error(struct pci_dev *dev, struct aer_err_info *info); >> +void cxl_rch_enable_rcec(struct pci_dev *rcec); >> +#else >> +static inline void cxl_rch_handle_error(struct pci_dev *dev, struct aer_err_info *info) { } >> +static inline void cxl_rch_enable_rcec(struct pci_dev *rcec) { } >> +#endif >> + >> #ifdef CONFIG_ACPI >> bool pci_acpi_preserve_config(struct pci_host_bridge *bridge); >> int pci_acpi_program_hp_params(struct pci_dev *dev); >> diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile >> index 173829aa02e6..cd2cb925dbd5 100644 >> --- a/drivers/pci/pcie/Makefile >> +++ b/drivers/pci/pcie/Makefile >> @@ -8,6 +8,7 @@ obj-$(CONFIG_PCIEPORTBUS) += pcieportdrv.o bwctrl.o >> >> obj-y += aspm.o >> obj-$(CONFIG_PCIEAER) += aer.o err.o tlp.o >> +obj-$(CONFIG_PCIEAER_CXL) += cxl_aer.o >> obj-$(CONFIG_PCIEAER_INJECT) += aer_inject.o >> obj-$(CONFIG_PCIE_PME) += pme.o >> obj-$(CONFIG_PCIE_DPC) += dpc.o >> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c >> index a2df9456595a..0b4d721980ef 100644 >> --- a/drivers/pci/pcie/aer.c >> +++ b/drivers/pci/pcie/aer.c >> @@ -1094,144 +1094,6 @@ static bool find_source_device(struct pci_dev *parent, >> return true; >> } >> >> -#ifdef CONFIG_PCIEAER_CXL >> - >> -/** >> - * pci_aer_unmask_internal_errors - unmask internal errors >> - * @dev: pointer to the pci_dev data structure >> - * >> - * Unmask internal errors in the Uncorrectable and Correctable Error >> - * Mask registers. >> - * >> - * Note: AER must be enabled and supported by the device which must be >> - * checked in advance, e.g. with pcie_aer_is_native(). >> - */ >> -static void pci_aer_unmask_internal_errors(struct pci_dev *dev) >> -{ >> - int aer = dev->aer_cap; >> - u32 mask; >> - >> - pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, &mask); >> - mask &= ~PCI_ERR_UNC_INTN; >> - pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, mask); >> - >> - pci_read_config_dword(dev, aer + PCI_ERR_COR_MASK, &mask); >> - mask &= ~PCI_ERR_COR_INTERNAL; >> - pci_write_config_dword(dev, aer + PCI_ERR_COR_MASK, mask); >> -} >> - >> -static bool is_cxl_mem_dev(struct pci_dev *dev) >> -{ >> - /* >> - * The capability, status, and control fields in Device 0, >> - * Function 0 DVSEC control the CXL functionality of the >> - * entire device (CXL 3.0, 8.1.3). >> - */ >> - if (dev->devfn != PCI_DEVFN(0, 0)) >> - return false; >> - >> - /* >> - * CXL Memory Devices must have the 502h class code set (CXL >> - * 3.0, 8.1.12.1). >> - */ >> - if ((dev->class >> 8) != PCI_CLASS_MEMORY_CXL) >> - return false; >> - >> - return true; >> -} >> - >> -static bool cxl_error_is_native(struct pci_dev *dev) >> -{ >> - struct pci_host_bridge *host = pci_find_host_bridge(dev->bus); >> - >> - return (pcie_ports_native || host->native_aer); >> -} >> - >> -static bool is_internal_error(struct aer_err_info *info) >> -{ >> - if (info->severity == AER_CORRECTABLE) >> - return info->status & PCI_ERR_COR_INTERNAL; >> - >> - return info->status & PCI_ERR_UNC_INTN; >> -} >> - >> -static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data) >> -{ >> - struct aer_err_info *info = (struct aer_err_info *)data; >> - const struct pci_error_handlers *err_handler; >> - >> - if (!is_cxl_mem_dev(dev) || !cxl_error_is_native(dev)) >> - return 0; >> - >> - /* Protect dev->driver */ >> - device_lock(&dev->dev); >> - >> - err_handler = dev->driver ? dev->driver->err_handler : NULL; >> - if (!err_handler) >> - goto out; >> - >> - if (info->severity == AER_CORRECTABLE) { >> - if (err_handler->cor_error_detected) >> - err_handler->cor_error_detected(dev); >> - } else if (err_handler->error_detected) { >> - if (info->severity == AER_NONFATAL) >> - err_handler->error_detected(dev, pci_channel_io_normal); >> - else if (info->severity == AER_FATAL) >> - err_handler->error_detected(dev, pci_channel_io_frozen); >> - } >> -out: >> - device_unlock(&dev->dev); >> - return 0; >> -} >> - >> -static void cxl_rch_handle_error(struct pci_dev *dev, struct aer_err_info *info) >> -{ >> - /* >> - * Internal errors of an RCEC indicate an AER error in an >> - * RCH's downstream port. Check and handle them in the CXL.mem >> - * device driver. >> - */ >> - if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC && >> - is_internal_error(info)) >> - pcie_walk_rcec(dev, cxl_rch_handle_error_iter, info); >> -} >> - >> -static int handles_cxl_error_iter(struct pci_dev *dev, void *data) >> -{ >> - bool *handles_cxl = data; >> - >> - if (!*handles_cxl) >> - *handles_cxl = is_cxl_mem_dev(dev) && cxl_error_is_native(dev); >> - >> - /* Non-zero terminates iteration */ >> - return *handles_cxl; >> -} >> - >> -static bool handles_cxl_errors(struct pci_dev *rcec) >> -{ >> - bool handles_cxl = false; >> - >> - if (pci_pcie_type(rcec) == PCI_EXP_TYPE_RC_EC && >> - pcie_aer_is_native(rcec)) >> - pcie_walk_rcec(rcec, handles_cxl_error_iter, &handles_cxl); >> - >> - return handles_cxl; >> -} >> - >> -static void cxl_rch_enable_rcec(struct pci_dev *rcec) >> -{ >> - if (!handles_cxl_errors(rcec)) >> - return; >> - >> - pci_aer_unmask_internal_errors(rcec); >> - pci_info(rcec, "CXL: Internal errors unmasked"); >> -} >> - >> -#else >> -static inline void cxl_rch_enable_rcec(struct pci_dev *dev) { } >> -static inline void cxl_rch_handle_error(struct pci_dev *dev, >> - struct aer_err_info *info) { } >> -#endif >> >> /** >> * pci_aer_handle_error - handle logging error into an event log >> diff --git a/drivers/pci/pcie/cxl_aer.c b/drivers/pci/pcie/cxl_aer.c >> new file mode 100644 >> index 000000000000..b2ea14f70055 >> --- /dev/null >> +++ b/drivers/pci/pcie/cxl_aer.c >> @@ -0,0 +1,138 @@ >> +// SPDX-License-Identifier: GPL-2.0-only >> +/* Copyright(c) 2025 AMD Corporation. All rights reserved. */ >> + >> +#include <linux/pci.h> >> +#include <linux/aer.h> >> +#include "../pci.h" >> + >> +/** >> + * pci_aer_unmask_internal_errors - unmask internal errors >> + * @dev: pointer to the pci_dev data structure >> + * >> + * Unmask internal errors in the Uncorrectable and Correctable Error >> + * Mask registers. >> + * >> + * Note: AER must be enabled and supported by the device which must be >> + * checked in advance, e.g. with pcie_aer_is_native(). >> + */ >> +static void pci_aer_unmask_internal_errors(struct pci_dev *dev) >> +{ >> + int aer = dev->aer_cap; >> + u32 mask; >> + >> + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, &mask); >> + mask &= ~PCI_ERR_UNC_INTN; >> + pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, mask); >> + >> + pci_read_config_dword(dev, aer + PCI_ERR_COR_MASK, &mask); >> + mask &= ~PCI_ERR_COR_INTERNAL; >> + pci_write_config_dword(dev, aer + PCI_ERR_COR_MASK, mask); >> +} >> + >> +static bool is_cxl_mem_dev(struct pci_dev *dev) >> +{ >> + /* >> + * The capability, status, and control fields in Device 0, >> + * Function 0 DVSEC control the CXL functionality of the >> + * entire device (CXL 3.2, 8.1.3). >> + */ >> + if (dev->devfn != PCI_DEVFN(0, 0)) >> + return false; >> + >> + /* >> + * CXL Memory Devices must have the 502h class code set (CXL >> + * 3.2, 8.1.12.1). >> + */ >> + if (FIELD_GET(PCI_CLASS_CODE_MASK, dev->class) != PCI_CLASS_MEMORY_CXL) >> + return false; >> + >> + return true; >> +} >> + >> +static bool cxl_error_is_native(struct pci_dev *dev) >> +{ >> + struct pci_host_bridge *host = pci_find_host_bridge(dev->bus); >> + >> + return (pcie_ports_native || host->native_aer); >> +} >> + >> +static bool is_internal_error(struct aer_err_info *info) >> +{ >> + if (info->severity == AER_CORRECTABLE) >> + return info->status & PCI_ERR_COR_INTERNAL; >> + >> + return info->status & PCI_ERR_UNC_INTN; >> +} >> + >> +static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data) >> +{ >> + struct aer_err_info *info = (struct aer_err_info *)data; >> + const struct pci_error_handlers *err_handler; >> + >> + if (!is_cxl_mem_dev(dev) || !cxl_error_is_native(dev)) >> + return 0; >> + >> + /* Protect dev->driver */ >> + device_lock(&dev->dev); >> + >> + err_handler = dev->driver ? dev->driver->err_handler : NULL; >> + if (!err_handler) >> + goto out; >> + >> + if (info->severity == AER_CORRECTABLE) { >> + if (err_handler->cor_error_detected) >> + err_handler->cor_error_detected(dev); >> + } else if (err_handler->error_detected) { >> + if (info->severity == AER_NONFATAL) >> + err_handler->error_detected(dev, pci_channel_io_normal); >> + else if (info->severity == AER_FATAL) >> + err_handler->error_detected(dev, pci_channel_io_frozen); >> + } >> +out: >> + device_unlock(&dev->dev); >> + return 0; >> +} >> + >> +void cxl_rch_handle_error(struct pci_dev *dev, struct aer_err_info *info) >> +{ >> + /* >> + * Internal errors of an RCEC indicate an AER error in an >> + * RCH's downstream port. Check and handle them in the CXL.mem >> + * device driver. >> + */ >> + if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC && >> + is_internal_error(info)) >> + pcie_walk_rcec(dev, cxl_rch_handle_error_iter, info); >> +} >> + >> +static int handles_cxl_error_iter(struct pci_dev *dev, void *data) >> +{ >> + bool *handles_cxl = data; >> + >> + if (!*handles_cxl) >> + *handles_cxl = is_cxl_mem_dev(dev) && cxl_error_is_native(dev); >> + >> + /* Non-zero terminates iteration */ >> + return *handles_cxl; >> +} >> + >> +static bool handles_cxl_errors(struct pci_dev *rcec) >> +{ >> + bool handles_cxl = false; >> + >> + if (pci_pcie_type(rcec) == PCI_EXP_TYPE_RC_EC && >> + pcie_aer_is_native(rcec)) >> + pcie_walk_rcec(rcec, handles_cxl_error_iter, &handles_cxl); >> + >> + return handles_cxl; >> +} >> + >> +void cxl_rch_enable_rcec(struct pci_dev *rcec) >> +{ >> + if (!handles_cxl_errors(rcec)) >> + return; >> + >> + pci_aer_unmask_internal_errors(rcec); >> + pci_info(rcec, "CXL: Internal errors unmasked"); >> +} >> + >> diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h >> index e2d71b6fdd84..31b3935bf189 100644 >> --- a/include/linux/pci_ids.h >> +++ b/include/linux/pci_ids.h >> @@ -12,6 +12,8 @@ >> >> /* Device classes and subclasses */ >> >> +#define PCI_CLASS_CODE_MASK 0xFFFF00 >> + >> #define PCI_CLASS_NOT_DEFINED 0x0000 >> #define PCI_CLASS_NOT_DEFINED_VGA 0x0001 >>
On Thu, 26 Jun 2025 16:42:09 -0700 Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@linux.intel.com> wrote: > On 6/26/25 3:42 PM, Terry Bowman wrote: > > The CXL AER error handling logic currently resides in the AER driver file, > > drivers/pci/pcie/aer.c. CXL specific changes are conditionally compiled > > using #ifdefs. > > > > Improve the AER driver maintainability by separating the CXL specific logic > > from the AER driver's core functionality and removing the #ifdefs. > > Introduce drivers/pci/pcie/cxl_aer.c and move the CXL AER logic into the > > new file. > > > > Update the makefile to conditionally compile the CXL file using the > > existing CONFIG_PCIEAER_CXL Kconfig. > > > > Signed-off-by: Terry Bowman <terry.bowman@amd.com> > > --- > > After moving the code , you seem to have updated it with your own > changes. May be you split it into two patches. Agreed. I think the changes are small but a direct code move followed by cleanup (I think it's the mask and the comment update only?) would be better. Assuming you do that, for both resulting patches: Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
© 2016 - 2025 Red Hat, Inc.