Restricted CXL Host (RCH) error handling is not currently supported by the
CXL Port error handling flow. Integrate the existing RCH error handling
into the new Port error handling.
Update cxl_rch_handle_error_iter() to forward the RCH protocol error using
the AER-CXL kfifo.
Update cxl_handle_proto_error() to begin the RCH error handling with a call
to cxl_handle_rdport_errors(). This function handles both correctable and
uncorrectable RCH protocol errors.
Change the cxl_handle_rdport_errors() function parameter from a CXL device
state to a PCI device.
Report the serial number of the RCD Endpoint in the RCH logging. This
is used to associate the RCH with the RCD in the logs.
Signed-off-by: Terry Bowman <terry.bowman@amd.com>
---
Changes in v16:
- New commit
---
drivers/cxl/core/core.h | 6 ++++--
drivers/cxl/core/ras.c | 15 ++++++++++++---
drivers/cxl/core/ras_rch.c | 13 +++++++------
drivers/pci/pcie/aer_cxl_rch.c | 17 +----------------
4 files changed, 24 insertions(+), 27 deletions(-)
diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
index 0eb2e28bb2c2..76d2593e68c6 100644
--- a/drivers/cxl/core/core.h
+++ b/drivers/cxl/core/core.h
@@ -186,8 +186,9 @@ void cxl_handle_cor_ras(struct device *dev, u64 serial,
void __iomem *ras_base);
void cxl_dport_map_rch_aer(struct cxl_dport *dport);
void cxl_disable_rch_root_ints(struct cxl_dport *dport);
-void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds);
+void cxl_handle_rdport_errors(struct pci_dev *pdev);
void devm_cxl_dport_ras_setup(struct cxl_dport *dport);
+u64 cxl_serial_number(struct device *dev);
#else
static inline int cxl_ras_init(void)
{
@@ -203,8 +204,9 @@ static inline void cxl_handle_cor_ras(struct device *dev, u64 serial,
void __iomem *ras_base) { }
static inline void cxl_dport_map_rch_aer(struct cxl_dport *dport) { }
static inline void cxl_disable_rch_root_ints(struct cxl_dport *dport) { }
-static inline void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { }
+static inline void cxl_handle_rdport_errors(struct pci_dev *pdev) { }
static inline void devm_cxl_dport_ras_setup(struct cxl_dport *dport) { }
+static inline u64 cxl_serial_number(struct device *dev) { return 0; }
#endif /* CONFIG_CXL_RAS */
int cxl_gpf_port_setup(struct cxl_dport *dport);
diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
index 1d4be2d78469..48d3ef7cbb92 100644
--- a/drivers/cxl/core/ras.c
+++ b/drivers/cxl/core/ras.c
@@ -218,7 +218,7 @@ static struct cxl_port *get_cxl_port(struct pci_dev *pdev)
return NULL;
}
-static u64 cxl_serial_number(struct device *dev)
+u64 cxl_serial_number(struct device *dev)
{
struct pci_dev *pdev = to_pci_dev(dev);
struct cxl_port *port __free(put_cxl_port) = get_cxl_port(pdev);
@@ -371,7 +371,7 @@ void cxl_cor_error_detected(struct pci_dev *pdev)
}
if (cxlds->rcd)
- cxl_handle_rdport_errors(cxlds);
+ cxl_handle_rdport_errors(pdev);
cxl_handle_cor_ras(&cxlds->cxlmd->dev, cxlds->serial,
cxlmd->endpoint->regs.ras);
@@ -396,7 +396,7 @@ pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
}
if (cxlds->rcd)
- cxl_handle_rdport_errors(cxlds);
+ cxl_handle_rdport_errors(pdev);
/*
* A frozen channel indicates an impending reset which is fatal to
* CXL.mem operation, and will likely crash the system. On the off
@@ -431,6 +431,15 @@ EXPORT_SYMBOL_NS_GPL(cxl_error_detected, "CXL");
static void cxl_handle_proto_error(struct pci_dev *pdev, int severity)
{
+ /*
+ * CXL RCD's AER error interrupt is used for reporting RCD and RCH
+ * Downstream Port protocol errors. RCH protocol errors are handled
+ * using a unique procedure separate from CXL Port devices.
+ * See CXL spec r4.0, 12.2 CXL Error Handling
+ */
+ if (pci_pcie_type(pdev) == PCI_EXP_TYPE_RC_END)
+ cxl_handle_rdport_errors(pdev);
+
if (severity == AER_CORRECTABLE) {
struct device *dev = &pdev->dev;
diff --git a/drivers/cxl/core/ras_rch.c b/drivers/cxl/core/ras_rch.c
index 5771abfc16de..184b7877f700 100644
--- a/drivers/cxl/core/ras_rch.c
+++ b/drivers/cxl/core/ras_rch.c
@@ -95,17 +95,20 @@ static bool cxl_rch_get_aer_severity(struct aer_capability_regs *aer_regs,
return false;
}
-void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds)
+void cxl_handle_rdport_errors(struct pci_dev *pdev)
{
- struct pci_dev *pdev = to_pci_dev(cxlds->dev);
struct aer_capability_regs aer_regs;
+ struct device *dev = &pdev->dev;
+ u64 serial = cxl_serial_number(dev);
struct cxl_dport *dport;
+ void __iomem *ras_base;
int severity;
struct cxl_port *port __free(put_cxl_port) =
cxl_pci_find_port(pdev, &dport);
if (!port)
return;
+ ras_base = dport->regs.ras;
if (!cxl_rch_get_aer_info(dport->regs.dport_aer, &aer_regs))
return;
@@ -115,9 +118,7 @@ void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds)
pci_print_aer(pdev, severity, &aer_regs);
if (severity == AER_CORRECTABLE)
- cxl_handle_cor_ras(&cxlds->cxlmd->dev, cxlds->serial,
- dport->regs.ras);
+ cxl_handle_cor_ras(dev, serial, ras_base);
else
- cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->serial,
- dport->regs.ras);
+ cxl_handle_ras(dev, serial, ras_base);
}
diff --git a/drivers/pci/pcie/aer_cxl_rch.c b/drivers/pci/pcie/aer_cxl_rch.c
index e471eefec9c4..83142eac0cab 100644
--- a/drivers/pci/pcie/aer_cxl_rch.c
+++ b/drivers/pci/pcie/aer_cxl_rch.c
@@ -37,26 +37,11 @@ static bool cxl_error_is_native(struct pci_dev *dev)
static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data)
{
struct aer_err_info *info = (struct aer_err_info *)data;
- const struct pci_error_handlers *err_handler;
if (!is_cxl_mem_dev(dev) || !cxl_error_is_native(dev))
return 0;
- guard(device)(&dev->dev);
-
- err_handler = dev->driver ? dev->driver->err_handler : NULL;
- if (!err_handler)
- return 0;
-
- if (info->severity == AER_CORRECTABLE) {
- if (err_handler->cor_error_detected)
- err_handler->cor_error_detected(dev);
- } else if (err_handler->error_detected) {
- if (info->severity == AER_NONFATAL)
- err_handler->error_detected(dev, pci_channel_io_normal);
- else if (info->severity == AER_FATAL)
- err_handler->error_detected(dev, pci_channel_io_frozen);
- }
+ cxl_forward_error(dev, info);
return 0;
}
--
2.34.1