CXL Endpoint protocol errors are currently handled using PCI error
handlers. CXL Endpoints require CXL-specific uncorrectable error (UCE)
handling that the PCI handlers do not provide.
Add CXL-specific handlers for CXL Endpoints. Rename the existing
cxl_error_handlers to pci_error_handlers to more accurately indicate
the error type and keep the naming consistent.
The PCI handlers are called if the CXL device is not trained for the
alternate (CXL) protocol. Update the CXL Endpoint PCI handlers to call the
CXL UCE handlers.
The existing EP UCE handler includes checks for various results. These are
no longer needed because CXL UCE recovery will not be attempted. Implement
cxl_handle_ras() to return PCI_ERS_RESULT_NONE or PCI_ERS_RESULT_PANIC. The
CXL UCE handler is called by cxl_do_recovery(), which acts on the return
value. In the PCI handler path, call panic() if the result is
PCI_ERS_RESULT_PANIC.
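
For reference, a minimal caller-side sketch of the intended contract. This is
illustrative only; cxl_do_recovery() is not part of this patch and its actual
body may differ:

	/* Hypothetical sketch of a caller acting on the new return values. */
	pci_ers_result_t result = cxl_error_detected(&pdev->dev);

	if (result == PCI_ERS_RESULT_PANIC)
		panic("CXL cachemem error.");
	/* PCI_ERS_RESULT_NONE: no uncorrectable error logged, nothing to do. */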
Signed-off-by: Terry Bowman <terry.bowman@amd.com>
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
---
drivers/cxl/core/native_ras.c | 15 ++++---
drivers/cxl/core/pci.c | 77 ++++++++++++++++++-----------------
drivers/cxl/cxl.h | 4 ++
drivers/cxl/cxlpci.h | 6 +--
drivers/cxl/pci.c | 8 ++--
5 files changed, 59 insertions(+), 51 deletions(-)
diff --git a/drivers/cxl/core/native_ras.c b/drivers/cxl/core/native_ras.c
index 19f8f2ac8376..89b65a35f2c0 100644
--- a/drivers/cxl/core/native_ras.c
+++ b/drivers/cxl/core/native_ras.c
@@ -7,18 +7,20 @@
#include <cxlmem.h>
#include <core/core.h>
#include <cxlpci.h>
+#include <core/core.h>
static int cxl_report_error_detected(struct pci_dev *pdev, void *data)
{
pci_ers_result_t vote, *result = data;
+ struct device *dev = &pdev->dev;
if ((pci_pcie_type(pdev) != PCI_EXP_TYPE_ENDPOINT) &&
(pci_pcie_type(pdev) != PCI_EXP_TYPE_RC_END))
return 0;
- guard(device)(&pdev->dev);
+ guard(device)(dev);
- vote = cxl_error_detected(pdev, pci_channel_io_frozen);
+ vote = cxl_error_detected(dev);
*result = merge_result(*result, vote);
return 0;
@@ -82,16 +84,17 @@ static bool is_cxl_rcd(struct pci_dev *pdev)
static int cxl_rch_handle_error_iter(struct pci_dev *pdev, void *data)
{
struct cxl_proto_error_info *err_info = data;
+ struct device *dev = &pdev->dev;
- guard(device)(&pdev->dev);
+ guard(device)(dev);
if (!is_cxl_rcd(pdev) || !cxl_pci_drv_bound(pdev))
return 0;
if (err_info->severity == AER_CORRECTABLE)
- cxl_cor_error_detected(pdev);
+ cxl_cor_error_detected(dev);
else
- cxl_error_detected(pdev, pci_channel_io_frozen);
+ cxl_error_detected(dev);
return 1;
}
@@ -126,7 +129,7 @@ static void cxl_handle_proto_error(struct cxl_proto_error_info *err_info)
aer + PCI_ERR_COR_STATUS,
0, PCI_ERR_COR_INTERNAL);
- cxl_cor_error_detected(pdev);
+ cxl_cor_error_detected(&pdev->dev);
pcie_clear_device_status(pdev);
} else {
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 887b54cf3395..7209ffb5c2fe 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -705,8 +705,8 @@ static void header_log_copy(void __iomem *ras_base, u32 *log)
* Log the state of the RAS status registers and prepare them to log the
* next error status. Return 1 if reset needed.
*/
-static bool cxl_handle_ras(struct device *dev, u64 serial,
- void __iomem *ras_base)
+static pci_ers_result_t cxl_handle_ras(struct device *dev, u64 serial,
+ void __iomem *ras_base)
{
u32 hl[CXL_HEADERLOG_SIZE_U32];
void __iomem *addr;
@@ -715,13 +715,13 @@ static bool cxl_handle_ras(struct device *dev, u64 serial,
if (!ras_base) {
dev_warn_once(dev, "CXL RAS register block is not mapped");
- return false;
+ return PCI_ERS_RESULT_NONE;
}
addr = ras_base + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET;
status = readl(addr);
if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK))
- return false;
+ return PCI_ERS_RESULT_NONE;
/* If multiple errors, log header points to first error from ctrl reg */
if (hweight32(status) > 1) {
@@ -738,7 +738,7 @@ static bool cxl_handle_ras(struct device *dev, u64 serial,
trace_cxl_aer_uncorrectable_error(dev, serial, status, fe, hl);
writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr);
- return true;
+ return PCI_ERS_RESULT_PANIC;
}
#ifdef CONFIG_PCIEAER_CXL
@@ -833,13 +833,14 @@ static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds)
static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { }
#endif
-void cxl_cor_error_detected(struct pci_dev *pdev)
+void cxl_cor_error_detected(struct device *dev)
{
+ struct pci_dev *pdev = to_pci_dev(dev);
struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
- struct device *dev = &cxlds->cxlmd->dev;
+ struct device *cxlmd_dev = &cxlds->cxlmd->dev;
- scoped_guard(device, dev) {
- if (!dev->driver) {
+ scoped_guard(device, cxlmd_dev) {
+ if (!cxlmd_dev->driver) {
dev_warn(&pdev->dev,
"%s: memdev disabled, abort error handling\n",
dev_name(dev));
@@ -854,20 +855,26 @@ void cxl_cor_error_detected(struct pci_dev *pdev)
}
EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL");
-pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
- pci_channel_state_t state)
+void pci_cor_error_detected(struct pci_dev *pdev)
{
- struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
- struct cxl_memdev *cxlmd = cxlds->cxlmd;
- struct device *dev = &cxlmd->dev;
- bool ue;
+ cxl_cor_error_detected(&pdev->dev);
+}
+EXPORT_SYMBOL_NS_GPL(pci_cor_error_detected, "CXL");
- scoped_guard(device, dev) {
- if (!dev->driver) {
+pci_ers_result_t cxl_error_detected(struct device *dev)
+{
+ struct pci_dev *pdev = to_pci_dev(dev);
+ struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
+ struct device *cxlmd_dev = &cxlds->cxlmd->dev;
+ pci_ers_result_t ue;
+
+ scoped_guard(device, cxlmd_dev) {
+
+ if (!cxlmd_dev->driver) {
dev_warn(&pdev->dev,
"%s: memdev disabled, abort error handling\n",
dev_name(dev));
- return PCI_ERS_RESULT_DISCONNECT;
+ return PCI_ERS_RESULT_PANIC;
}
if (cxlds->rcd)
@@ -881,29 +888,23 @@ pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
ue = cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->serial, cxlds->regs.ras);
}
-
- switch (state) {
- case pci_channel_io_normal:
- if (ue) {
- device_release_driver(dev);
- return PCI_ERS_RESULT_NEED_RESET;
- }
- return PCI_ERS_RESULT_CAN_RECOVER;
- case pci_channel_io_frozen:
- dev_warn(&pdev->dev,
- "%s: frozen state error detected, disable CXL.mem\n",
- dev_name(dev));
- device_release_driver(dev);
- return PCI_ERS_RESULT_NEED_RESET;
- case pci_channel_io_perm_failure:
- dev_warn(&pdev->dev,
- "failure state error detected, request disconnect\n");
- return PCI_ERS_RESULT_DISCONNECT;
- }
- return PCI_ERS_RESULT_NEED_RESET;
+ return ue;
}
EXPORT_SYMBOL_NS_GPL(cxl_error_detected, "CXL");
+pci_ers_result_t pci_error_detected(struct pci_dev *pdev,
+ pci_channel_state_t error)
+{
+ pci_ers_result_t rc;
+
+ rc = cxl_error_detected(&pdev->dev);
+ if (rc == PCI_ERS_RESULT_PANIC)
+ panic("CXL cachemem error.");
+
+ return rc;
+}
+EXPORT_SYMBOL_NS_GPL(pci_error_detected, "CXL");
+
static int cxl_flit_size(struct pci_dev *pdev)
{
if (cxl_pci_flit_256(pdev))
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index d696d419bd5a..a2eedc8a82e8 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -11,6 +11,7 @@
#include <linux/log2.h>
#include <linux/node.h>
#include <linux/io.h>
+#include <linux/pci.h>
extern const struct nvdimm_security_ops *cxl_security_ops;
@@ -797,6 +798,9 @@ static inline int cxl_root_decoder_autoremove(struct device *host,
}
int cxl_endpoint_autoremove(struct cxl_memdev *cxlmd, struct cxl_port *endpoint);
+void cxl_cor_error_detected(struct device *dev);
+pci_ers_result_t cxl_error_detected(struct device *dev);
+
/**
* struct cxl_endpoint_dvsec_info - Cached DVSEC info
* @mem_enabled: cached value of mem_enabled in the DVSEC at init time
diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
index ed3c9701b79f..e69a47f0cd94 100644
--- a/drivers/cxl/cxlpci.h
+++ b/drivers/cxl/cxlpci.h
@@ -133,8 +133,8 @@ struct cxl_dev_state;
int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm,
struct cxl_endpoint_dvsec_info *info);
void read_cdat_data(struct cxl_port *port);
-void cxl_cor_error_detected(struct pci_dev *pdev);
-pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
- pci_channel_state_t state);
+void pci_cor_error_detected(struct pci_dev *pdev);
+pci_ers_result_t pci_error_detected(struct pci_dev *pdev,
+ pci_channel_state_t error);
bool cxl_pci_drv_bound(struct pci_dev *pdev);
#endif /* __CXL_PCI_H__ */
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index cae049f9ae3e..91fab33094a9 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -1112,11 +1112,11 @@ static void cxl_reset_done(struct pci_dev *pdev)
}
}
-static const struct pci_error_handlers cxl_error_handlers = {
- .error_detected = cxl_error_detected,
+static const struct pci_error_handlers pci_error_handlers = {
+ .error_detected = pci_error_detected,
.slot_reset = cxl_slot_reset,
.resume = cxl_error_resume,
- .cor_error_detected = cxl_cor_error_detected,
+ .cor_error_detected = pci_cor_error_detected,
.reset_done = cxl_reset_done,
};
@@ -1124,7 +1124,7 @@ static struct pci_driver cxl_pci_driver = {
.name = KBUILD_MODNAME,
.id_table = cxl_mem_pci_tbl,
.probe = cxl_pci_probe,
- .err_handler = &cxl_error_handlers,
+ .err_handler = &pci_error_handlers,
.dev_groups = cxl_rcd_groups,
.driver = {
.probe_type = PROBE_PREFER_ASYNCHRONOUS,
--
2.34.1
On 6/26/25 3:42 PM, Terry Bowman wrote:
> CXL Endpoint protocol errors are currently handled using PCI error
> handlers. CXL Endpoints require CXL-specific uncorrectable error (UCE)
> handling that the PCI handlers do not provide.
>
> Add CXL-specific handlers for CXL Endpoints. Rename the existing
> cxl_error_handlers to pci_error_handlers to more accurately indicate
> the error type and keep the naming consistent.
>
> The PCI handlers are called if the CXL device is not trained for the
> alternate (CXL) protocol. Update the CXL Endpoint PCI handlers to call the
> CXL UCE handlers.

Would the CXL device still be functional if it can't train the CXL protocols? Just wondering if we still need the standard PCI handlers in that case at all.

DJ

[snip]
On 7/21/2025 5:35 PM, Dave Jiang wrote:
> On 6/26/25 3:42 PM, Terry Bowman wrote:
>> The PCI handlers are called if the CXL device is not trained for the
>> alternate (CXL) protocol. Update the CXL Endpoint PCI handlers to call the
>> CXL UCE handlers.
>
> Would the CXL device still be functional if it can't train the CXL protocols? Just wondering if we still need the standard PCI handlers in that case at all.
>
> DJ

A CXL EP failing training will not support CXL functionality. Once training fails, the RAS registers may be unavailable. I'm concerned accesses to the MMIO RAS registers could cause an MCE if the PCIe device doesn't respond. It will depend on how the training fails. This is a reason to remove the PCIe handlers.

BTW, the AER status will be logged by the AER driver before a PCIe handler is called.

A while back Dan mentioned we should leave the PCIe EP handlers. He may have an opinion or more to add.

-Terry

[snip]
>-----Original Message-----
>From: Terry Bowman <terry.bowman@amd.com>
>Sent: 26 June 2025 23:43
>To: dave@stgolabs.net; Jonathan Cameron <jonathan.cameron@huawei.com>;
>dave.jiang@intel.com; alison.schofield@intel.com; dan.j.williams@intel.com;
>bhelgaas@google.com; Shiju Jose <shiju.jose@huawei.com>;
>ming.li@zohomail.com; Smita.KoralahalliChannabasappa@amd.com;
>rrichter@amd.com; dan.carpenter@linaro.org;
>PradeepVineshReddy.Kodamati@amd.com; lukas@wunner.de;
>Benjamin.Cheatham@amd.com;
>sathyanarayanan.kuppuswamy@linux.intel.com; terry.bowman@amd.com;
>linux-cxl@vger.kernel.org
>Cc: linux-kernel@vger.kernel.org; linux-pci@vger.kernel.org
>Subject: [PATCH v10 14/17] cxl/pci: Introduce CXL Endpoint protocol error
>handlers

[...]

>+pci_ers_result_t cxl_error_detected(struct device *dev)
>+{
>+	struct pci_dev *pdev = to_pci_dev(dev);
>+	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
>+	struct device *cxlmd_dev = &cxlds->cxlmd->dev;
>+	pci_ers_result_t ue;
>+
>+	scoped_guard(device, cxlmd_dev) {
>+

Please remove the extra blank line.

>+		if (!cxlmd_dev->driver) {
>			dev_warn(&pdev->dev,
>				 "%s: memdev disabled, abort error handling\n",
>				 dev_name(dev));

Thanks,
Shiju
On Thu, 26 Jun 2025 17:42:49 -0500
Terry Bowman <terry.bowman@amd.com> wrote:

> CXL Endpoint protocol errors are currently handled using PCI error
> handlers. CXL Endpoints require CXL-specific uncorrectable error (UCE)
> handling that the PCI handlers do not provide.
>
[...]
>
> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>

A few minor comments inline.

J

> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> index 887b54cf3395..7209ffb5c2fe 100644
> --- a/drivers/cxl/core/pci.c
> +++ b/drivers/cxl/core/pci.c
>
> -	scoped_guard(device, dev) {
> -		if (!dev->driver) {
> +pci_ers_result_t cxl_error_detected(struct device *dev)
> +{
> +	struct pci_dev *pdev = to_pci_dev(dev);
> +	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
> +	struct device *cxlmd_dev = &cxlds->cxlmd->dev;
> +	pci_ers_result_t ue;
> +
> +	scoped_guard(device, cxlmd_dev) {

I think there is nothing much happening after this (maybe introduced in later
patches in which case ignore this comment). So can you just use a guard and
reduce the indent of the rest?

> +
> +		if (!cxlmd_dev->driver) {
> 			dev_warn(&pdev->dev,
> 				 "%s: memdev disabled, abort error handling\n",
> 				 dev_name(dev));
> -			return PCI_ERS_RESULT_DISCONNECT;
> +			return PCI_ERS_RESULT_PANIC;
> 		}
>
> 		if (cxlds->rcd)
> @@ -881,29 +888,23 @@ pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
> 		ue = cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->serial, cxlds->regs.ras);

little hard to tell from this code blob but can you return here?

> 	}
>
> -
> -	switch (state) {
[...]
> -	return PCI_ERS_RESULT_NEED_RESET;
> +	return ue;
> }
> EXPORT_SYMBOL_NS_GPL(cxl_error_detected, "CXL");
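
For illustration, a guard()-based shape along the lines suggested here might
look roughly like the sketch below. Names are taken from the hunk above; this
assumes later patches in the series do not add more work inside the scoped
block:

	pci_ers_result_t cxl_error_detected(struct device *dev)
	{
		struct pci_dev *pdev = to_pci_dev(dev);
		struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
		struct device *cxlmd_dev = &cxlds->cxlmd->dev;

		/* Hold the memdev device lock for the rest of the function. */
		guard(device)(cxlmd_dev);

		if (!cxlmd_dev->driver) {
			dev_warn(&pdev->dev, "%s: memdev disabled, abort error handling\n",
				 dev_name(dev));
			return PCI_ERS_RESULT_PANIC;
		}

		if (cxlds->rcd)
			cxl_handle_rdport_errors(cxlds);

		/* Return cxl_handle_ras()'s vote directly; no 'ue' local needed. */
		return cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->serial, cxlds->regs.ras);
	}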