[PATCH v16 08/10] cxl: Update Endpoint AER uncorrectable handler

Terry Bowman posted 10 patches 7 hours ago
[PATCH v16 08/10] cxl: Update Endpoint AER uncorrectable handler
Posted by Terry Bowman 7 hours ago
CXL drivers now implement protocol RAS support. PCI protocol errors,
however, continue to be reported via the AER capability and must still be
handled by a PCI error recovery callback.

Replace the existing cxl_error_detected() callback in cxl/pci.c with a
new cxl_pci_error_detected() implementation that handles uncorrectable
AER PCI protocol errors. Changes for PCI Correctable protocol errors will
be added in a future patch.

Introduce function cxl_uncor_aer_present() to handle and log the CXL
Endpoint's AER errors. Endpoint fatal AER errors are not currently logged by
the AER driver and require logging here with a call to pci_print_aer().

This cleanly separates CXL protocol error handling from PCI AER handling
and ensures that each subsystem processes only the errors it is
responsible for.

Signed-off-by: Terry Bowman <terry.bowman@amd.com>
Assisted-by: Azure:gpt4.1-nano-key

---

Changes in v15->v16:
- Update commit message (DaveJ)
- s/cxl_handle_aer()/cxl_uncor_aer_present()/g (Jonathan)
- cxl_uncor_aer_present(): Leave original result calculation based on
  if a UCE is present and the provided state (Terry)
- Add call to pci_print_aer(). AER fails to log because it is the upstream
  link (Terry)

Changes in v14->v15:
- Update commit message and title. Added Bjorn's ack.
- Move CE and UCE handling logic here

Changes in v13->v14:
- Add Dave Jiang's review-by
- Update commit message & headline (Bjorn)
- Refactor cxl_port_error_detected()/cxl_port_cor_error_detected() to
  one line (Jonathan)
- Remove cxl_walk_port() (Dan)
- Remove cxl_pci_drv_bound(). Check for 'is_cxl' parent port is
  sufficient (Dan)
- Remove device_lock_if()
- Combined CE and UCE here (Terry)

Changes in v12->v13:
- Move get_pci_cxl_host_dev() and cxl_handle_proto_error() to Dequeue
  patch (Terry)
- Remove EP case in cxl_get_ras_base(), not used. (Terry)
- Remove check for dport->dport_dev (Dave)
- Remove whitespace (Terry)

Changes in v11->v12:
- Add call to cxl_pci_drv_bound() in cxl_handle_proto_error() and
  pci_to_cxl_dev()
- Change cxl_error_detected() -> cxl_cor_error_detected()
- Remove NULL variable assignments
- Replace bus_find_device() with find_cxl_port_by_uport() for upstream
  port searches.

Changes in v10->v11:
- None
---
 drivers/cxl/core/ras.c | 57 ++++++++++++++++++++++++------------------
 drivers/cxl/cxlpci.h   |  9 +++----
 drivers/cxl/pci.c      |  6 ++---
 3 files changed, 39 insertions(+), 33 deletions(-)

diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
index 254144d19764..884e40c66638 100644
--- a/drivers/cxl/core/ras.c
+++ b/drivers/cxl/core/ras.c
@@ -393,34 +393,41 @@ void cxl_cor_error_detected(struct pci_dev *pdev)
 }
 EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL");
 
-pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
-				    pci_channel_state_t state)
+static bool cxl_uncor_aer_present(struct pci_dev *pdev)
 {
-	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
-	struct cxl_memdev *cxlmd = cxlds->cxlmd;
-	struct device *dev = &cxlmd->dev;
-	bool ue;
-
-	scoped_guard(device, dev) {
-		if (!dev->driver) {
-			dev_warn(&pdev->dev,
-				 "%s: memdev disabled, abort error handling\n",
-				 dev_name(dev));
-			return PCI_ERS_RESULT_DISCONNECT;
-		}
+	struct aer_capability_regs aer_regs = {};
+	u32 fatal, aer_cap = pdev->aer_cap;
 
-		if (cxlds->rcd)
-			cxl_handle_rdport_errors(pdev);
-		/*
-		 * A frozen channel indicates an impending reset which is fatal to
-		 * CXL.mem operation, and will likely crash the system. On the off
-		 * chance the situation is recoverable dump the status of the RAS
-		 * capability registers and bounce the active state of the memdev.
-		 */
-		ue = cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->serial,
-				    cxlmd->endpoint->regs.ras);
+	if (!aer_cap) {
+		pr_warn_ratelimited("%s: AER capability isn't present\n",
+				    pci_name(pdev));
+		return false;
 	}
 
+	pci_read_config_dword(pdev, aer_cap + PCI_ERR_UNCOR_STATUS,
+			      &aer_regs.uncor_status);
+	pci_read_config_dword(pdev, aer_cap + PCI_ERR_UNCOR_MASK,
+			      &aer_regs.uncor_mask);
+	pci_read_config_dword(pdev, aer_cap + PCI_ERR_UNCOR_SEVER,
+			      &aer_regs.uncor_severity);
+	/* Fatal only if a logged uncorrectable error has its severity bit set */
+	fatal = aer_regs.uncor_status & aer_regs.uncor_severity;
+	pci_print_aer(pdev, fatal ? AER_FATAL : AER_NONFATAL, &aer_regs);
+
+	pci_aer_clear_nonfatal_status(pdev);
+	pci_aer_clear_fatal_status(pdev);
+
+	return aer_regs.uncor_status & ~aer_regs.uncor_mask;
+}
+
+pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
+					pci_channel_state_t state)
+{
+	bool ue = cxl_uncor_aer_present(pdev);
+	struct cxl_port *port = get_cxl_port(pdev);
+	struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport_dev);
+	struct device *dev = &cxlmd->dev;
+
 	switch (state) {
 	case pci_channel_io_normal:
 		if (ue) {
@@ -441,7 +448,7 @@ pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
 	}
 	return PCI_ERS_RESULT_NEED_RESET;
 }
-EXPORT_SYMBOL_NS_GPL(cxl_error_detected, "CXL");
+EXPORT_SYMBOL_NS_GPL(cxl_pci_error_detected, "CXL");
 
 static void cxl_handle_proto_error(struct pci_dev *pdev, int severity)
 {
diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
index 0cf64218aa16..86029d96d6bb 100644
--- a/drivers/cxl/cxlpci.h
+++ b/drivers/cxl/cxlpci.h
@@ -79,15 +79,14 @@ void read_cdat_data(struct cxl_port *port);
 
 #ifdef CONFIG_CXL_RAS
 void cxl_cor_error_detected(struct pci_dev *pdev);
-pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
-				    pci_channel_state_t state);
 void devm_cxl_dport_rch_ras_setup(struct cxl_dport *dport);
+pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
+					pci_channel_state_t state);
 void devm_cxl_port_ras_setup(struct cxl_port *port);
 #else
 static inline void cxl_cor_error_detected(struct pci_dev *pdev) { }
-
-static inline pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
-						  pci_channel_state_t state)
+static inline pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
+						      pci_channel_state_t state)
 {
 	return PCI_ERS_RESULT_NONE;
 }
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index fbb300a01830..b57f4727af53 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -1051,8 +1051,8 @@ static void cxl_reset_done(struct pci_dev *pdev)
 	}
 }
 
-static const struct pci_error_handlers cxl_error_handlers = {
-	.error_detected	= cxl_error_detected,
+static const struct pci_error_handlers pci_error_handlers = {
+	.error_detected	= cxl_pci_error_detected,
 	.slot_reset	= cxl_slot_reset,
 	.resume		= cxl_error_resume,
 	.cor_error_detected	= cxl_cor_error_detected,
@@ -1063,7 +1063,7 @@ static struct pci_driver cxl_pci_driver = {
 	.name			= KBUILD_MODNAME,
 	.id_table		= cxl_mem_pci_tbl,
 	.probe			= cxl_pci_probe,
-	.err_handler		= &cxl_error_handlers,
+	.err_handler		= &pci_error_handlers,
 	.dev_groups		= cxl_rcd_groups,
 	.driver	= {
 		.probe_type	= PROBE_PREFER_ASYNCHRONOUS,
-- 
2.34.1