CXL drivers now implement protocol RAS support. PCI protocol errors,
however, continue to be reported via the AER capability and must still be
handled by a PCI error recovery callback.
Replace the existing cxl_error_detected() callback in cxl/pci.c with a
new cxl_pci_error_detected() implementation that handles uncorrectable
AER PCI protocol errors. Changes for PCI Correctable protocol errors will
be added in a future patch.
Introduce function cxl_uncor_aer_present() to handle and log the CXL
Endpoint's AER errors. Endpoint fatal AER errors are not currently logged by
the AER driver and require logging here with a call to pci_print_aer().
This cleanly separates CXL protocol error handling from PCI AER handling
and ensures that each subsystem processes only the errors it is
responsible for.
Signed-off-by: Terry Bowman <terry.bowman@amd.com>
Assisted-by: Azure:gpt4.1-nano-key
---
Changes in v15->v16:
- Update commit message (DaveJ)
- s/cxl_handle_aer()/cxl_uncor_aer_present()/g (Jonathan)
- cxl_uncor_aer_present(): Leave original result calculation based on
whether a UCE is present and the provided state (Terry)
- Add call to pci_print_aer(). The AER driver fails to log the error
because it is on the upstream link (Terry)
Changes in v14->v15:
- Update commit message and title. Added Bjorn's ack.
- Move CE and UCE handling logic here
Changes in v13->v14:
- Add Dave Jiang's review-by
- Update commit message & headline (Bjorn)
- Refactor cxl_port_error_detected()/cxl_port_cor_error_detected() to
one line (Jonathan)
- Remove cxl_walk_port() (Dan)
- Remove cxl_pci_drv_bound(). Check for 'is_cxl' parent port is
sufficient (Dan)
- Remove device_lock_if()
- Combined CE and UCE here (Terry)
Changes in v12->v13:
- Move get_pci_cxl_host_dev() and cxl_handle_proto_error() to Dequeue
patch (Terry)
- Remove EP case in cxl_get_ras_base(), not used. (Terry)
- Remove check for dport->dport_dev (Dave)
- Remove whitespace (Terry)
Changes in v11->v12:
- Add call to cxl_pci_drv_bound() in cxl_handle_proto_error() and
pci_to_cxl_dev()
- Change cxl_error_detected() -> cxl_cor_error_detected()
- Remove NULL variable assignments
- Replace bus_find_device() with find_cxl_port_by_uport() for upstream
port searches.
Changes in v10->v11:
- None
---
drivers/cxl/core/ras.c | 57 ++++++++++++++++++++++++------------------
drivers/cxl/cxlpci.h | 9 +++----
drivers/cxl/pci.c | 6 ++---
3 files changed, 39 insertions(+), 33 deletions(-)
diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
index 254144d19764..884e40c66638 100644
--- a/drivers/cxl/core/ras.c
+++ b/drivers/cxl/core/ras.c
@@ -393,34 +393,41 @@ void cxl_cor_error_detected(struct pci_dev *pdev)
}
EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL");
-pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
- pci_channel_state_t state)
+static bool cxl_uncor_aer_present(struct pci_dev *pdev)
{
- struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
- struct cxl_memdev *cxlmd = cxlds->cxlmd;
- struct device *dev = &cxlmd->dev;
- bool ue;
-
- scoped_guard(device, dev) {
- if (!dev->driver) {
- dev_warn(&pdev->dev,
- "%s: memdev disabled, abort error handling\n",
- dev_name(dev));
- return PCI_ERS_RESULT_DISCONNECT;
- }
+ struct aer_capability_regs aer_regs = {}; /* unread fields stay 0 */
+ u32 fatal, aer_cap = pdev->aer_cap;
- if (cxlds->rcd)
- cxl_handle_rdport_errors(pdev);
- /*
- * A frozen channel indicates an impending reset which is fatal to
- * CXL.mem operation, and will likely crash the system. On the off
- * chance the situation is recoverable dump the status of the RAS
- * capability registers and bounce the active state of the memdev.
- */
- ue = cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->serial,
- cxlmd->endpoint->regs.ras);
+ if (!aer_cap) {
+ pr_warn_ratelimited("%s: AER capability isn't present\n",
+ pci_name(pdev));
+ return false;
}
+ pci_read_config_dword(pdev, aer_cap + PCI_ERR_UNCOR_STATUS,
+ &aer_regs.uncor_status);
+ pci_read_config_dword(pdev, aer_cap + PCI_ERR_UNCOR_MASK,
+ &aer_regs.uncor_mask);
+ pci_read_config_dword(pdev, aer_cap + PCI_ERR_UNCOR_SEVER,
+ &aer_regs.uncor_severity);
+
+ fatal = (aer_regs.uncor_status & aer_regs.uncor_severity);
+ pci_print_aer(pdev, fatal ? AER_FATAL : AER_NONFATAL, &aer_regs);
+
+ pci_aer_clear_nonfatal_status(pdev);
+ pci_aer_clear_fatal_status(pdev);
+
+ return aer_regs.uncor_status & ~aer_regs.uncor_mask;
+}
+
+pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
+ pci_channel_state_t state)
+{
+ bool ue = cxl_uncor_aer_present(pdev);
+ struct cxl_port *port = get_cxl_port(pdev);
+ struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport_dev);
+ struct device *dev = &cxlmd->dev;
+
switch (state) {
case pci_channel_io_normal:
if (ue) {
@@ -441,7 +448,7 @@ pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
}
return PCI_ERS_RESULT_NEED_RESET;
}
-EXPORT_SYMBOL_NS_GPL(cxl_error_detected, "CXL");
+EXPORT_SYMBOL_NS_GPL(cxl_pci_error_detected, "CXL");
static void cxl_handle_proto_error(struct pci_dev *pdev, int severity)
{
diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
index 0cf64218aa16..86029d96d6bb 100644
--- a/drivers/cxl/cxlpci.h
+++ b/drivers/cxl/cxlpci.h
@@ -79,15 +79,14 @@ void read_cdat_data(struct cxl_port *port);
#ifdef CONFIG_CXL_RAS
void cxl_cor_error_detected(struct pci_dev *pdev);
-pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
- pci_channel_state_t state);
void devm_cxl_dport_rch_ras_setup(struct cxl_dport *dport);
+pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
+ pci_channel_state_t state);
void devm_cxl_port_ras_setup(struct cxl_port *port);
#else
static inline void cxl_cor_error_detected(struct pci_dev *pdev) { }
-
-static inline pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
- pci_channel_state_t state)
+static inline pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
+ pci_channel_state_t state)
{
return PCI_ERS_RESULT_NONE;
}
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index fbb300a01830..b57f4727af53 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -1051,8 +1051,8 @@ static void cxl_reset_done(struct pci_dev *pdev)
}
}
-static const struct pci_error_handlers cxl_error_handlers = {
- .error_detected = cxl_error_detected,
+static const struct pci_error_handlers pci_error_handlers = {
+ .error_detected = cxl_pci_error_detected,
.slot_reset = cxl_slot_reset,
.resume = cxl_error_resume,
.cor_error_detected = cxl_cor_error_detected,
@@ -1063,7 +1063,7 @@ static struct pci_driver cxl_pci_driver = {
.name = KBUILD_MODNAME,
.id_table = cxl_mem_pci_tbl,
.probe = cxl_pci_probe,
- .err_handler = &cxl_error_handlers,
+ .err_handler = &pci_error_handlers,
.dev_groups = cxl_rcd_groups,
.driver = {
.probe_type = PROBE_PREFER_ASYNCHRONOUS,
--
2.34.1