[PATCH v15 7/9] cxl: Update Endpoint AER uncorrectable handler

Terry Bowman posted 9 patches 5 days, 15 hours ago
[PATCH v15 7/9] cxl: Update Endpoint AER uncorrectable handler
Posted by Terry Bowman 5 days, 15 hours ago
CXL drivers now implement protocol RAS support. PCI protocol errors,
however, continue to be reported via the AER capability and must still be
handled by a PCI error recovery callback.

Replace the existing cxl_error_detected() callback in cxl/pci.c with a
new cxl_pci_error_detected() implementation that handles only uncorrectable
PCI protocol errors reported through AER.

Introduce a helper named cxl_handle_aer() and implement it to handle and
log the CXL device's AER error.

This cleanly separates CXL protocol error handling from PCI AER handling
and ensures that each subsystem processes only the errors it is
responsible for.

Signed-off-by: Terry Bowman <terry.bowman@amd.com>

---

Changes in v14->v15:
- Title update (Terry)
- Change cxl_pci_error_detected() to handle & log AER (Terry)
- Update commit message (Terry)
- Moved cxl_handle_ras()/cxl_handle_cor_ras() to earlier patch (Terry)

Changes in v13->v14:
- Update commit headline (Bjorn)
- Rename pci_error_detected()/pci_cor_error_detected() ->
  cxl_pci_error_detected/cxl_pci_cor_error_detected() (Jonathan)
- Remove now-invalid comment in cxl_error_detected() (Jonathan)
- Split into separate patches for UCE and CE (Terry)

Changes in v12->v13:
- Update commit message (Terry)
- Updated all the implementation and commit message. (Terry)
- Refactored cxl_cor_error_detected()/cxl_error_detected() to remove
  pdev (Dave Jiang)

Changes in v11->v12:
- None

Changes in v10->v11:
- cxl_error_detected() - Change handlers' scoped_guard() to guard() (Jonathan)
- cxl_error_detected() - Remove extra line (Shiju)
- Changes moved to core/ras.c (Terry)
- cxl_error_detected(), remove 'ue' and return with function call. (Jonathan)
- Remove extra space in documentation for PCI_ERS_RESULT_PANIC definition
- Move #include "pci.h" from cxl.h to core.h (Terry)
- Remove unnecessary includes of cxl.h and core.h in mem.c (Terry)
---
 drivers/cxl/core/ras.c | 68 +++++++++++++++---------------------------
 drivers/cxl/cxlpci.h   |  9 +++---
 drivers/cxl/pci.c      |  6 ++--
 3 files changed, 31 insertions(+), 52 deletions(-)

diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
index 970ff3df442c..061e6aaec176 100644
--- a/drivers/cxl/core/ras.c
+++ b/drivers/cxl/core/ras.c
@@ -441,55 +441,35 @@ void cxl_cor_error_detected(struct pci_dev *pdev)
 }
 EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL");
 
-pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
-				    pci_channel_state_t state)
+static bool cxl_handle_aer(struct pci_dev *pdev)
 {
-	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
-	struct cxl_memdev *cxlmd = cxlds->cxlmd;
-	struct device *dev = &cxlmd->dev;
-	bool ue;
-
-	scoped_guard(device, dev) {
-		if (!dev->driver) {
-			dev_warn(&pdev->dev,
-				 "%s: memdev disabled, abort error handling\n",
-				 dev_name(dev));
-			return PCI_ERS_RESULT_DISCONNECT;
-		}
+	struct aer_capability_regs aer;
+	u32 aer_cap = pdev->aer_cap;
 
-		if (cxlds->rcd)
-			cxl_handle_rdport_errors(cxlds);
-		/*
-		 * A frozen channel indicates an impending reset which is fatal to
-		 * CXL.mem operation, and will likely crash the system. On the off
-		 * chance the situation is recoverable dump the status of the RAS
-		 * capability registers and bounce the active state of the memdev.
-		 */
-		ue = cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->serial,
-				    cxlmd->endpoint->regs.ras);
+	if (!aer_cap) {
+		pr_warn_ratelimited("%s: AER capability isn't present\n",
+				    pci_name(pdev));
+		return false;
 	}
 
-	switch (state) {
-	case pci_channel_io_normal:
-		if (ue) {
-			device_release_driver(dev);
-			return PCI_ERS_RESULT_NEED_RESET;
-		}
-		return PCI_ERS_RESULT_CAN_RECOVER;
-	case pci_channel_io_frozen:
-		dev_warn(&pdev->dev,
-			 "%s: frozen state error detected, disable CXL.mem\n",
-			 dev_name(dev));
-		device_release_driver(dev);
-		return PCI_ERS_RESULT_NEED_RESET;
-	case pci_channel_io_perm_failure:
-		dev_warn(&pdev->dev,
-			 "failure state error detected, request disconnect\n");
-		return PCI_ERS_RESULT_DISCONNECT;
-	}
-	return PCI_ERS_RESULT_NEED_RESET;
+	pci_read_config_dword(pdev, aer_cap + PCI_ERR_UNCOR_STATUS, &aer.uncor_status);
+	pci_read_config_dword(pdev, aer_cap + PCI_ERR_UNCOR_MASK, &aer.uncor_mask);
+
+	/* The AER driver logged the error */
+	pci_aer_clear_nonfatal_status(pdev);
+	pci_aer_clear_fatal_status(pdev);
+
+	return (aer.uncor_status & aer.uncor_mask);
+}
+
+pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
+					pci_channel_state_t error)
+{
+	u32 rc = cxl_handle_aer(pdev);
+
+	return rc ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_CAN_RECOVER;
 }
-EXPORT_SYMBOL_NS_GPL(cxl_error_detected, "CXL");
+EXPORT_SYMBOL_NS_GPL(cxl_pci_error_detected, "CXL");
 
 static void cxl_handle_proto_error(struct cxl_proto_err_work_data *err_info)
 {
diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
index 970add0256e9..5534422b496c 100644
--- a/drivers/cxl/cxlpci.h
+++ b/drivers/cxl/cxlpci.h
@@ -79,15 +79,14 @@ void read_cdat_data(struct cxl_port *port);
 
 #ifdef CONFIG_CXL_RAS
 void cxl_cor_error_detected(struct pci_dev *pdev);
-pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
-				    pci_channel_state_t state);
 void devm_cxl_dport_rch_ras_setup(struct cxl_dport *dport);
+pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
+					pci_channel_state_t error);
 void devm_cxl_port_ras_setup(struct cxl_port *port);
 #else
 static inline void cxl_cor_error_detected(struct pci_dev *pdev) { }
-
-static inline pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
-						  pci_channel_state_t state)
+static inline pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
+						      pci_channel_state_t state)
 {
 	return PCI_ERS_RESULT_NONE;
 }
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index acb0eb2a13c3..ff741adc7c7f 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -1051,8 +1051,8 @@ static void cxl_reset_done(struct pci_dev *pdev)
 	}
 }
 
-static const struct pci_error_handlers cxl_error_handlers = {
-	.error_detected	= cxl_error_detected,
+static const struct pci_error_handlers pci_error_handlers = {
+	.error_detected	= cxl_pci_error_detected,
 	.slot_reset	= cxl_slot_reset,
 	.resume		= cxl_error_resume,
 	.cor_error_detected	= cxl_cor_error_detected,
@@ -1063,7 +1063,7 @@ static struct pci_driver cxl_pci_driver = {
 	.name			= KBUILD_MODNAME,
 	.id_table		= cxl_mem_pci_tbl,
 	.probe			= cxl_pci_probe,
-	.err_handler		= &cxl_error_handlers,
+	.err_handler		= &pci_error_handlers,
 	.dev_groups		= cxl_rcd_groups,
 	.driver	= {
 		.probe_type	= PROBE_PREFER_ASYNCHRONOUS,
-- 
2.34.1
Re: [PATCH v15 7/9] cxl: Update Endpoint AER uncorrectable handler
Posted by Dave Jiang 5 days ago

On 2/2/26 7:52 PM, Terry Bowman wrote:
> CXL drivers now implement protocol RAS support. PCI protocol errors,
> however, continue to be reported via the AER capability and must still be
> handled by a PCI error recovery callback.
> 
> Replace the existing cxl_error_detected() callback in cxl/pci.c with a
> new cxl_pci_error_detected() implementation that handles only uncorrectable
> PCI protocol errors reported through AER.

Do we need to explain why only uncorrectable is handled?

> 
> Introduce helper named cxl_handler_aer() amd implement to handle and
> log the CXL device's AER error.
> 
> This cleanly separates CXL protocol error handling from PCI AER handling
> and ensures that each subsystem processes only the errors it is
> responsible.
> 
> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> 
> ---
> 
> Changes in v14->v15:
> - Title update (Terry)
> - Change cxl_pci_error-detected() to handle & log AER (Terry)
> - Update commit message (Terry)
> - Moved cxl_handle_ras()/cxl_handle_cor_ras() to earlier patch (Terry)
> 
> Changes in v13->v14:
> - Update commit headline (Bjorn)
> - Rename pci_error_detected()/pci_cor_error_detected() ->
>   cxl_pci_error_detected/cxl_pci_cor_error_detected() (Jonathan)
> - Remove now-invalid comment in cxl_error_detected() (Jonathan)
> - Split into separate patches for UCE and CE (Terry)
> 
> Changes in v12->v13:
> - Update commit messaqge (Terry)
> - Updated all the implementation and commit message. (Terry)
> - Refactored cxl_cor_error_detected()/cxl_error_detected() to remove
>   pdev (Dave Jiang)
> 
> Changes in v11->v12:
> - None
> 
> Changes in v10->v11:
> - cxl_error_detected() - Change handlers' scoped_guard() to guard() (Jonathan)
> - cxl_error_detected() - Remove extra line (Shiju)
> - Changes moved to core/ras.c (Terry)
> - cxl_error_detected(), remove 'ue' and return with function call. (Jonathan)
> - Remove extra space in documentation for PCI_ERS_RESULT_PANIC definition
> - Move #include "pci.h from cxl.h to core.h (Terry)
> - Remove unnecessary includes of cxl.h and core.h in mem.c (Terry)
> ---
>  drivers/cxl/core/ras.c | 68 +++++++++++++++---------------------------
>  drivers/cxl/cxlpci.h   |  9 +++---
>  drivers/cxl/pci.c      |  6 ++--
>  3 files changed, 31 insertions(+), 52 deletions(-)
> 
> diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
> index 970ff3df442c..061e6aaec176 100644
> --- a/drivers/cxl/core/ras.c
> +++ b/drivers/cxl/core/ras.c
> @@ -441,55 +441,35 @@ void cxl_cor_error_detected(struct pci_dev *pdev)
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL");
>  
> -pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
> -				    pci_channel_state_t state)
> +static bool cxl_handle_aer(struct pci_dev *pdev)

For a function that returns a bool, the function name doesn't sound quite right. Maybe cxl_uncor_aer_present()?

DJ

>  {
> -	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
> -	struct cxl_memdev *cxlmd = cxlds->cxlmd;
> -	struct device *dev = &cxlmd->dev;
> -	bool ue;
> -
> -	scoped_guard(device, dev) {
> -		if (!dev->driver) {
> -			dev_warn(&pdev->dev,
> -				 "%s: memdev disabled, abort error handling\n",
> -				 dev_name(dev));
> -			return PCI_ERS_RESULT_DISCONNECT;
> -		}
> +	struct aer_capability_regs aer;
> +	u32 aer_cap = pdev->aer_cap;
>  
> -		if (cxlds->rcd)
> -			cxl_handle_rdport_errors(cxlds);
> -		/*
> -		 * A frozen channel indicates an impending reset which is fatal to
> -		 * CXL.mem operation, and will likely crash the system. On the off
> -		 * chance the situation is recoverable dump the status of the RAS
> -		 * capability registers and bounce the active state of the memdev.
> -		 */
> -		ue = cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->serial,
> -				    cxlmd->endpoint->regs.ras);
> +	if (!aer_cap) {
> +		pr_warn_ratelimited("%s: AER capability isn't present\n",
> +				    pci_name(pdev));
> +		return false;
>  	}
>  
> -	switch (state) {
> -	case pci_channel_io_normal:
> -		if (ue) {
> -			device_release_driver(dev);
> -			return PCI_ERS_RESULT_NEED_RESET;
> -		}
> -		return PCI_ERS_RESULT_CAN_RECOVER;
> -	case pci_channel_io_frozen:
> -		dev_warn(&pdev->dev,
> -			 "%s: frozen state error detected, disable CXL.mem\n",
> -			 dev_name(dev));
> -		device_release_driver(dev);
> -		return PCI_ERS_RESULT_NEED_RESET;
> -	case pci_channel_io_perm_failure:
> -		dev_warn(&pdev->dev,
> -			 "failure state error detected, request disconnect\n");
> -		return PCI_ERS_RESULT_DISCONNECT;
> -	}
> -	return PCI_ERS_RESULT_NEED_RESET;
> +	pci_read_config_dword(pdev, aer_cap + PCI_ERR_UNCOR_STATUS, &aer.uncor_status);
> +	pci_read_config_dword(pdev, aer_cap + PCI_ERR_UNCOR_MASK, &aer.uncor_mask);
> +
> +	/* The AER driver logged the error */
> +	pci_aer_clear_nonfatal_status(pdev);
> +	pci_aer_clear_fatal_status(pdev);
> +
> +	return (aer.uncor_status & aer.uncor_mask);
> +}
> +
> +pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
> +					pci_channel_state_t error)
> +{
> +	u32 rc = cxl_handle_aer(pdev);
> +
> +	return rc ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_CAN_RECOVER;
>  }
> -EXPORT_SYMBOL_NS_GPL(cxl_error_detected, "CXL");
> +EXPORT_SYMBOL_NS_GPL(cxl_pci_error_detected, "CXL");
>  
>  static void cxl_handle_proto_error(struct cxl_proto_err_work_data *err_info)
>  {
> diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
> index 970add0256e9..5534422b496c 100644
> --- a/drivers/cxl/cxlpci.h
> +++ b/drivers/cxl/cxlpci.h
> @@ -79,15 +79,14 @@ void read_cdat_data(struct cxl_port *port);
>  
>  #ifdef CONFIG_CXL_RAS
>  void cxl_cor_error_detected(struct pci_dev *pdev);
> -pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
> -				    pci_channel_state_t state);
>  void devm_cxl_dport_rch_ras_setup(struct cxl_dport *dport);
> +pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
> +					pci_channel_state_t error);
>  void devm_cxl_port_ras_setup(struct cxl_port *port);
>  #else
>  static inline void cxl_cor_error_detected(struct pci_dev *pdev) { }
> -
> -static inline pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
> -						  pci_channel_state_t state)
> +static inline pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
> +						      pci_channel_state_t state)
>  {
>  	return PCI_ERS_RESULT_NONE;
>  }
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index acb0eb2a13c3..ff741adc7c7f 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -1051,8 +1051,8 @@ static void cxl_reset_done(struct pci_dev *pdev)
>  	}
>  }
>  
> -static const struct pci_error_handlers cxl_error_handlers = {
> -	.error_detected	= cxl_error_detected,
> +static const struct pci_error_handlers pci_error_handlers = {
> +	.error_detected	= cxl_pci_error_detected,
>  	.slot_reset	= cxl_slot_reset,
>  	.resume		= cxl_error_resume,
>  	.cor_error_detected	= cxl_cor_error_detected,
> @@ -1063,7 +1063,7 @@ static struct pci_driver cxl_pci_driver = {
>  	.name			= KBUILD_MODNAME,
>  	.id_table		= cxl_mem_pci_tbl,
>  	.probe			= cxl_pci_probe,
> -	.err_handler		= &cxl_error_handlers,
> +	.err_handler		= &pci_error_handlers,
>  	.dev_groups		= cxl_rcd_groups,
>  	.driver	= {
>  		.probe_type	= PROBE_PREFER_ASYNCHRONOUS,
Re: [PATCH v15 7/9] cxl: Update Endpoint AER uncorrectable handler
Posted by Bowman, Terry 4 days, 23 hours ago
On 2/3/2026 11:31 AM, Dave Jiang wrote:
> 
> 
> On 2/2/26 7:52 PM, Terry Bowman wrote:
>> CXL drivers now implement protocol RAS support. PCI protocol errors,
>> however, continue to be reported via the AER capability and must still be
>> handled by a PCI error recovery callback.
>>
>> Replace the existing cxl_error_detected() callback in cxl/pci.c with a
>> new cxl_pci_error_detected() implementation that handles only uncorrectable
>> PCI protocol errors reported through AER.
> 
> Do we need to explain why only uncorrectable is handled?
> 

Would it be Ok if I removed "only" with s/only// ? 

Having mentioned an important detail, I should elaborate. But how about if I
remove it and do not refer to CE at all here? CE shouldn't be mentioned without
good reason in a primarily UCE patch.

- Terry

>>
>> Introduce helper named cxl_handler_aer() amd implement to handle and
>> log the CXL device's AER error.
>>
>> This cleanly separates CXL protocol error handling from PCI AER handling
>> and ensures that each subsystem processes only the errors it is
>> responsible.
>>
>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
>>
>> ---
>>
>> Changes in v14->v15:
>> - Title update (Terry)
>> - Change cxl_pci_error-detected() to handle & log AER (Terry)
>> - Update commit message (Terry)
>> - Moved cxl_handle_ras()/cxl_handle_cor_ras() to earlier patch (Terry)
>>
>> Changes in v13->v14:
>> - Update commit headline (Bjorn)
>> - Rename pci_error_detected()/pci_cor_error_detected() ->
>>   cxl_pci_error_detected/cxl_pci_cor_error_detected() (Jonathan)
>> - Remove now-invalid comment in cxl_error_detected() (Jonathan)
>> - Split into separate patches for UCE and CE (Terry)
>>
>> Changes in v12->v13:
>> - Update commit messaqge (Terry)
>> - Updated all the implementation and commit message. (Terry)
>> - Refactored cxl_cor_error_detected()/cxl_error_detected() to remove
>>   pdev (Dave Jiang)
>>
>> Changes in v11->v12:
>> - None
>>
>> Changes in v10->v11:
>> - cxl_error_detected() - Change handlers' scoped_guard() to guard() (Jonathan)
>> - cxl_error_detected() - Remove extra line (Shiju)
>> - Changes moved to core/ras.c (Terry)
>> - cxl_error_detected(), remove 'ue' and return with function call. (Jonathan)
>> - Remove extra space in documentation for PCI_ERS_RESULT_PANIC definition
>> - Move #include "pci.h from cxl.h to core.h (Terry)
>> - Remove unnecessary includes of cxl.h and core.h in mem.c (Terry)
>> ---
>>  drivers/cxl/core/ras.c | 68 +++++++++++++++---------------------------
>>  drivers/cxl/cxlpci.h   |  9 +++---
>>  drivers/cxl/pci.c      |  6 ++--
>>  3 files changed, 31 insertions(+), 52 deletions(-)
>>
>> diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
>> index 970ff3df442c..061e6aaec176 100644
>> --- a/drivers/cxl/core/ras.c
>> +++ b/drivers/cxl/core/ras.c
>> @@ -441,55 +441,35 @@ void cxl_cor_error_detected(struct pci_dev *pdev)
>>  }
>>  EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL");
>>  
>> -pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
>> -				    pci_channel_state_t state)
>> +static bool cxl_handle_aer(struct pci_dev *pdev)
> 
> For a function that returns a bool, the function name doesn't sound quite right. Maybe cxl_uncor_aer_present()?
> 
> DJ
> 

I was trying to follow the pattern where the detected() function calls the
handle() function, as is done for cxl_handle_ras() and cxl_handle_cor_ras().

I will change to cxl_uncor_aer_present().

-Terry

>>  {
>> -	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
>> -	struct cxl_memdev *cxlmd = cxlds->cxlmd;
>> -	struct device *dev = &cxlmd->dev;
>> -	bool ue;
>> -
>> -	scoped_guard(device, dev) {
>> -		if (!dev->driver) {
>> -			dev_warn(&pdev->dev,
>> -				 "%s: memdev disabled, abort error handling\n",
>> -				 dev_name(dev));
>> -			return PCI_ERS_RESULT_DISCONNECT;
>> -		}
>> +	struct aer_capability_regs aer;
>> +	u32 aer_cap = pdev->aer_cap;
>>  
>> -		if (cxlds->rcd)
>> -			cxl_handle_rdport_errors(cxlds);
>> -		/*
>> -		 * A frozen channel indicates an impending reset which is fatal to
>> -		 * CXL.mem operation, and will likely crash the system. On the off
>> -		 * chance the situation is recoverable dump the status of the RAS
>> -		 * capability registers and bounce the active state of the memdev.
>> -		 */
>> -		ue = cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->serial,
>> -				    cxlmd->endpoint->regs.ras);
>> +	if (!aer_cap) {
>> +		pr_warn_ratelimited("%s: AER capability isn't present\n",
>> +				    pci_name(pdev));
>> +		return false;
>>  	}
>>  
>> -	switch (state) {
>> -	case pci_channel_io_normal:
>> -		if (ue) {
>> -			device_release_driver(dev);
>> -			return PCI_ERS_RESULT_NEED_RESET;
>> -		}
>> -		return PCI_ERS_RESULT_CAN_RECOVER;
>> -	case pci_channel_io_frozen:
>> -		dev_warn(&pdev->dev,
>> -			 "%s: frozen state error detected, disable CXL.mem\n",
>> -			 dev_name(dev));
>> -		device_release_driver(dev);
>> -		return PCI_ERS_RESULT_NEED_RESET;
>> -	case pci_channel_io_perm_failure:
>> -		dev_warn(&pdev->dev,
>> -			 "failure state error detected, request disconnect\n");
>> -		return PCI_ERS_RESULT_DISCONNECT;
>> -	}
>> -	return PCI_ERS_RESULT_NEED_RESET;
>> +	pci_read_config_dword(pdev, aer_cap + PCI_ERR_UNCOR_STATUS, &aer.uncor_status);
>> +	pci_read_config_dword(pdev, aer_cap + PCI_ERR_UNCOR_MASK, &aer.uncor_mask);
>> +
>> +	/* The AER driver logged the error */
>> +	pci_aer_clear_nonfatal_status(pdev);
>> +	pci_aer_clear_fatal_status(pdev);
>> +
>> +	return (aer.uncor_status & aer.uncor_mask);
>> +}
>> +
>> +pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
>> +					pci_channel_state_t error)
>> +{
>> +	u32 rc = cxl_handle_aer(pdev);
>> +
>> +	return rc ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_CAN_RECOVER;
>>  }
>> -EXPORT_SYMBOL_NS_GPL(cxl_error_detected, "CXL");
>> +EXPORT_SYMBOL_NS_GPL(cxl_pci_error_detected, "CXL");
>>  
>>  static void cxl_handle_proto_error(struct cxl_proto_err_work_data *err_info)
>>  {
>> diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
>> index 970add0256e9..5534422b496c 100644
>> --- a/drivers/cxl/cxlpci.h
>> +++ b/drivers/cxl/cxlpci.h
>> @@ -79,15 +79,14 @@ void read_cdat_data(struct cxl_port *port);
>>  
>>  #ifdef CONFIG_CXL_RAS
>>  void cxl_cor_error_detected(struct pci_dev *pdev);
>> -pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
>> -				    pci_channel_state_t state);
>>  void devm_cxl_dport_rch_ras_setup(struct cxl_dport *dport);
>> +pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
>> +					pci_channel_state_t error);
>>  void devm_cxl_port_ras_setup(struct cxl_port *port);
>>  #else
>>  static inline void cxl_cor_error_detected(struct pci_dev *pdev) { }
>> -
>> -static inline pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
>> -						  pci_channel_state_t state)
>> +static inline pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
>> +						      pci_channel_state_t state)
>>  {
>>  	return PCI_ERS_RESULT_NONE;
>>  }
>> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
>> index acb0eb2a13c3..ff741adc7c7f 100644
>> --- a/drivers/cxl/pci.c
>> +++ b/drivers/cxl/pci.c
>> @@ -1051,8 +1051,8 @@ static void cxl_reset_done(struct pci_dev *pdev)
>>  	}
>>  }
>>  
>> -static const struct pci_error_handlers cxl_error_handlers = {
>> -	.error_detected	= cxl_error_detected,
>> +static const struct pci_error_handlers pci_error_handlers = {
>> +	.error_detected	= cxl_pci_error_detected,
>>  	.slot_reset	= cxl_slot_reset,
>>  	.resume		= cxl_error_resume,
>>  	.cor_error_detected	= cxl_cor_error_detected,
>> @@ -1063,7 +1063,7 @@ static struct pci_driver cxl_pci_driver = {
>>  	.name			= KBUILD_MODNAME,
>>  	.id_table		= cxl_mem_pci_tbl,
>>  	.probe			= cxl_pci_probe,
>> -	.err_handler		= &cxl_error_handlers,
>> +	.err_handler		= &pci_error_handlers,
>>  	.dev_groups		= cxl_rcd_groups,
>>  	.driver	= {
>>  		.probe_type	= PROBE_PREFER_ASYNCHRONOUS,
>
Re: [PATCH v15 7/9] cxl: Update Endpoint AER uncorrectable handler
Posted by Dave Jiang 4 days, 23 hours ago

On 2/3/26 11:35 AM, Bowman, Terry wrote:
> On 2/3/2026 11:31 AM, Dave Jiang wrote:
>>
>>
>> On 2/2/26 7:52 PM, Terry Bowman wrote:
>>> CXL drivers now implement protocol RAS support. PCI protocol errors,
>>> however, continue to be reported via the AER capability and must still be
>>> handled by a PCI error recovery callback.
>>>
>>> Replace the existing cxl_error_detected() callback in cxl/pci.c with a
>>> new cxl_pci_error_detected() implementation that handles only uncorrectable
>>> PCI protocol errors reported through AER.
>>
>> Do we need to explain why only uncorrectable is handled?
>>
> 
> Would it be Ok if I removed "only" with s/only// ? 
> 
> After mentioning an important detail I shoud elaborate. But, how about if 
> remove it and not refer to the CE at all here? CE shouldnt be mentioned unless 
> good reason in a primarily UCE patch.

Is CE handling added later? Maybe just say that.

DJ

> 
> - Terry
> 
>>>
>>> Introduce helper named cxl_handler_aer() amd implement to handle and
>>> log the CXL device's AER error.
>>>
>>> This cleanly separates CXL protocol error handling from PCI AER handling
>>> and ensures that each subsystem processes only the errors it is
>>> responsible.
>>>
>>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
>>>
>>> ---
>>>
>>> Changes in v14->v15:
>>> - Title update (Terry)
>>> - Change cxl_pci_error-detected() to handle & log AER (Terry)
>>> - Update commit message (Terry)
>>> - Moved cxl_handle_ras()/cxl_handle_cor_ras() to earlier patch (Terry)
>>>
>>> Changes in v13->v14:
>>> - Update commit headline (Bjorn)
>>> - Rename pci_error_detected()/pci_cor_error_detected() ->
>>>   cxl_pci_error_detected/cxl_pci_cor_error_detected() (Jonathan)
>>> - Remove now-invalid comment in cxl_error_detected() (Jonathan)
>>> - Split into separate patches for UCE and CE (Terry)
>>>
>>> Changes in v12->v13:
>>> - Update commit messaqge (Terry)
>>> - Updated all the implementation and commit message. (Terry)
>>> - Refactored cxl_cor_error_detected()/cxl_error_detected() to remove
>>>   pdev (Dave Jiang)
>>>
>>> Changes in v11->v12:
>>> - None
>>>
>>> Changes in v10->v11:
>>> - cxl_error_detected() - Change handlers' scoped_guard() to guard() (Jonathan)
>>> - cxl_error_detected() - Remove extra line (Shiju)
>>> - Changes moved to core/ras.c (Terry)
>>> - cxl_error_detected(), remove 'ue' and return with function call. (Jonathan)
>>> - Remove extra space in documentation for PCI_ERS_RESULT_PANIC definition
>>> - Move #include "pci.h from cxl.h to core.h (Terry)
>>> - Remove unnecessary includes of cxl.h and core.h in mem.c (Terry)
>>> ---
>>>  drivers/cxl/core/ras.c | 68 +++++++++++++++---------------------------
>>>  drivers/cxl/cxlpci.h   |  9 +++---
>>>  drivers/cxl/pci.c      |  6 ++--
>>>  3 files changed, 31 insertions(+), 52 deletions(-)
>>>
>>> diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
>>> index 970ff3df442c..061e6aaec176 100644
>>> --- a/drivers/cxl/core/ras.c
>>> +++ b/drivers/cxl/core/ras.c
>>> @@ -441,55 +441,35 @@ void cxl_cor_error_detected(struct pci_dev *pdev)
>>>  }
>>>  EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL");
>>>  
>>> -pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
>>> -				    pci_channel_state_t state)
>>> +static bool cxl_handle_aer(struct pci_dev *pdev)
>>
>> For a function that returns a bool, the function name doesn't sound quite right. Maybe cxl_uncor_aer_present()?
>>
>> DJ
>>
> 
> I was trying to follow the pattern of detected() function calls the 
> handle() function as done for cxl_handle_ras() and cxl_handle_cor_ras().
> 
> I will change to cxl_uncor_aer_present().
> 
> -Terry
> 
>>>  {
>>> -	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
>>> -	struct cxl_memdev *cxlmd = cxlds->cxlmd;
>>> -	struct device *dev = &cxlmd->dev;
>>> -	bool ue;
>>> -
>>> -	scoped_guard(device, dev) {
>>> -		if (!dev->driver) {
>>> -			dev_warn(&pdev->dev,
>>> -				 "%s: memdev disabled, abort error handling\n",
>>> -				 dev_name(dev));
>>> -			return PCI_ERS_RESULT_DISCONNECT;
>>> -		}
>>> +	struct aer_capability_regs aer;
>>> +	u32 aer_cap = pdev->aer_cap;
>>>  
>>> -		if (cxlds->rcd)
>>> -			cxl_handle_rdport_errors(cxlds);
>>> -		/*
>>> -		 * A frozen channel indicates an impending reset which is fatal to
>>> -		 * CXL.mem operation, and will likely crash the system. On the off
>>> -		 * chance the situation is recoverable dump the status of the RAS
>>> -		 * capability registers and bounce the active state of the memdev.
>>> -		 */
>>> -		ue = cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->serial,
>>> -				    cxlmd->endpoint->regs.ras);
>>> +	if (!aer_cap) {
>>> +		pr_warn_ratelimited("%s: AER capability isn't present\n",
>>> +				    pci_name(pdev));
>>> +		return false;
>>>  	}
>>>  
>>> -	switch (state) {
>>> -	case pci_channel_io_normal:
>>> -		if (ue) {
>>> -			device_release_driver(dev);
>>> -			return PCI_ERS_RESULT_NEED_RESET;
>>> -		}
>>> -		return PCI_ERS_RESULT_CAN_RECOVER;
>>> -	case pci_channel_io_frozen:
>>> -		dev_warn(&pdev->dev,
>>> -			 "%s: frozen state error detected, disable CXL.mem\n",
>>> -			 dev_name(dev));
>>> -		device_release_driver(dev);
>>> -		return PCI_ERS_RESULT_NEED_RESET;
>>> -	case pci_channel_io_perm_failure:
>>> -		dev_warn(&pdev->dev,
>>> -			 "failure state error detected, request disconnect\n");
>>> -		return PCI_ERS_RESULT_DISCONNECT;
>>> -	}
>>> -	return PCI_ERS_RESULT_NEED_RESET;
>>> +	pci_read_config_dword(pdev, aer_cap + PCI_ERR_UNCOR_STATUS, &aer.uncor_status);
>>> +	pci_read_config_dword(pdev, aer_cap + PCI_ERR_UNCOR_MASK, &aer.uncor_mask);
>>> +
>>> +	/* The AER driver logged the error */
>>> +	pci_aer_clear_nonfatal_status(pdev);
>>> +	pci_aer_clear_fatal_status(pdev);
>>> +
>>> +	return (aer.uncor_status & aer.uncor_mask);
>>> +}
>>> +
>>> +pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
>>> +					pci_channel_state_t error)
>>> +{
>>> +	u32 rc = cxl_handle_aer(pdev);
>>> +
>>> +	return rc ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_CAN_RECOVER;
>>>  }
>>> -EXPORT_SYMBOL_NS_GPL(cxl_error_detected, "CXL");
>>> +EXPORT_SYMBOL_NS_GPL(cxl_pci_error_detected, "CXL");
>>>  
>>>  static void cxl_handle_proto_error(struct cxl_proto_err_work_data *err_info)
>>>  {
>>> diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
>>> index 970add0256e9..5534422b496c 100644
>>> --- a/drivers/cxl/cxlpci.h
>>> +++ b/drivers/cxl/cxlpci.h
>>> @@ -79,15 +79,14 @@ void read_cdat_data(struct cxl_port *port);
>>>  
>>>  #ifdef CONFIG_CXL_RAS
>>>  void cxl_cor_error_detected(struct pci_dev *pdev);
>>> -pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
>>> -				    pci_channel_state_t state);
>>>  void devm_cxl_dport_rch_ras_setup(struct cxl_dport *dport);
>>> +pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
>>> +					pci_channel_state_t error);
>>>  void devm_cxl_port_ras_setup(struct cxl_port *port);
>>>  #else
>>>  static inline void cxl_cor_error_detected(struct pci_dev *pdev) { }
>>> -
>>> -static inline pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
>>> -						  pci_channel_state_t state)
>>> +static inline pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
>>> +						      pci_channel_state_t state)
>>>  {
>>>  	return PCI_ERS_RESULT_NONE;
>>>  }
>>> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
>>> index acb0eb2a13c3..ff741adc7c7f 100644
>>> --- a/drivers/cxl/pci.c
>>> +++ b/drivers/cxl/pci.c
>>> @@ -1051,8 +1051,8 @@ static void cxl_reset_done(struct pci_dev *pdev)
>>>  	}
>>>  }
>>>  
>>> -static const struct pci_error_handlers cxl_error_handlers = {
>>> -	.error_detected	= cxl_error_detected,
>>> +static const struct pci_error_handlers pci_error_handlers = {
>>> +	.error_detected	= cxl_pci_error_detected,
>>>  	.slot_reset	= cxl_slot_reset,
>>>  	.resume		= cxl_error_resume,
>>>  	.cor_error_detected	= cxl_cor_error_detected,
>>> @@ -1063,7 +1063,7 @@ static struct pci_driver cxl_pci_driver = {
>>>  	.name			= KBUILD_MODNAME,
>>>  	.id_table		= cxl_mem_pci_tbl,
>>>  	.probe			= cxl_pci_probe,
>>> -	.err_handler		= &cxl_error_handlers,
>>> +	.err_handler		= &pci_error_handlers,
>>>  	.dev_groups		= cxl_rcd_groups,
>>>  	.driver	= {
>>>  		.probe_type	= PROBE_PREFER_ASYNCHRONOUS,
>>
> 
>
Re: [PATCH v15 7/9] cxl: Update Endpoint AER uncorrectable handler
Posted by Dave Jiang 4 days, 22 hours ago

On 2/3/26 11:49 AM, Dave Jiang wrote:
> 
> 
> On 2/3/26 11:35 AM, Bowman, Terry wrote:
>> On 2/3/2026 11:31 AM, Dave Jiang wrote:
>>>
>>>
>>> On 2/2/26 7:52 PM, Terry Bowman wrote:
>>>> CXL drivers now implement protocol RAS support. PCI protocol errors,
>>>> however, continue to be reported via the AER capability and must still be
>>>> handled by a PCI error recovery callback.
>>>>
>>>> Replace the existing cxl_error_detected() callback in cxl/pci.c with a
>>>> new cxl_pci_error_detected() implementation that handles only uncorrectable
>>>> PCI protocol errors reported through AER.
>>>
>>> Do we need to explain why only uncorrectable is handled?
>>>
>>
>> Would it be Ok if I removed "only" with s/only// ? 
>>
>> After mentioning an important detail I shoud elaborate. But, how about if 
>> remove it and not refer to the CE at all here? CE shouldnt be mentioned unless 
>> good reason in a primarily UCE patch.
> 
> Is CE handling added later? Maybe just say that.

So it's explained in the commit log of patch 8/9. Maybe just add a line here and say that CE is not needed.

> 
> DJ
> 
>>
>> - Terry
>>
>>>>
>>>> Introduce helper named cxl_handler_aer() amd implement to handle and
>>>> log the CXL device's AER error.
>>>>
>>>> This cleanly separates CXL protocol error handling from PCI AER handling
>>>> and ensures that each subsystem processes only the errors it is
>>>> responsible.
>>>>
>>>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
>>>>
>>>> ---
>>>>
>>>> Changes in v14->v15:
>>>> - Title update (Terry)
>>>> - Change cxl_pci_error_detected() to handle & log AER (Terry)
>>>> - Update commit message (Terry)
>>>> - Moved cxl_handle_ras()/cxl_handle_cor_ras() to earlier patch (Terry)
>>>>
>>>> Changes in v13->v14:
>>>> - Update commit headline (Bjorn)
>>>> - Rename pci_error_detected()/pci_cor_error_detected() ->
>>>>   cxl_pci_error_detected/cxl_pci_cor_error_detected() (Jonathan)
>>>> - Remove now-invalid comment in cxl_error_detected() (Jonathan)
>>>> - Split into separate patches for UCE and CE (Terry)
>>>>
>>>> Changes in v12->v13:
>>>> - Update commit message (Terry)
>>>> - Updated all the implementation and commit message. (Terry)
>>>> - Refactored cxl_cor_error_detected()/cxl_error_detected() to remove
>>>>   pdev (Dave Jiang)
>>>>
>>>> Changes in v11->v12:
>>>> - None
>>>>
>>>> Changes in v10->v11:
>>>> - cxl_error_detected() - Change handlers' scoped_guard() to guard() (Jonathan)
>>>> - cxl_error_detected() - Remove extra line (Shiju)
>>>> - Changes moved to core/ras.c (Terry)
>>>> - cxl_error_detected(), remove 'ue' and return with function call. (Jonathan)
>>>> - Remove extra space in documentation for PCI_ERS_RESULT_PANIC definition
>>>> - Move #include "pci.h from cxl.h to core.h (Terry)
>>>> - Remove unnecessary includes of cxl.h and core.h in mem.c (Terry)
>>>> ---
>>>>  drivers/cxl/core/ras.c | 68 +++++++++++++++---------------------------
>>>>  drivers/cxl/cxlpci.h   |  9 +++---
>>>>  drivers/cxl/pci.c      |  6 ++--
>>>>  3 files changed, 31 insertions(+), 52 deletions(-)
>>>>
>>>> diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
>>>> index 970ff3df442c..061e6aaec176 100644
>>>> --- a/drivers/cxl/core/ras.c
>>>> +++ b/drivers/cxl/core/ras.c
>>>> @@ -441,55 +441,35 @@ void cxl_cor_error_detected(struct pci_dev *pdev)
>>>>  }
>>>>  EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL");
>>>>  
>>>> -pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
>>>> -				    pci_channel_state_t state)
>>>> +static bool cxl_handle_aer(struct pci_dev *pdev)
>>>
>>> For a function that returns a bool, the function name doesn't sound quite right. Maybe cxl_uncor_aer_present()?
>>>
>>> DJ
>>>
>>
>> I was trying to follow the pattern where the detected() function calls the
>> handle() function, as done for cxl_handle_ras() and cxl_handle_cor_ras().
>>
>> I will change to cxl_uncor_aer_present().
>>
>> -Terry
>>
>>>>  {
>>>> -	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
>>>> -	struct cxl_memdev *cxlmd = cxlds->cxlmd;
>>>> -	struct device *dev = &cxlmd->dev;
>>>> -	bool ue;
>>>> -
>>>> -	scoped_guard(device, dev) {
>>>> -		if (!dev->driver) {
>>>> -			dev_warn(&pdev->dev,
>>>> -				 "%s: memdev disabled, abort error handling\n",
>>>> -				 dev_name(dev));
>>>> -			return PCI_ERS_RESULT_DISCONNECT;
>>>> -		}
>>>> +	struct aer_capability_regs aer;
>>>> +	u32 aer_cap = pdev->aer_cap;
>>>>  
>>>> -		if (cxlds->rcd)
>>>> -			cxl_handle_rdport_errors(cxlds);
>>>> -		/*
>>>> -		 * A frozen channel indicates an impending reset which is fatal to
>>>> -		 * CXL.mem operation, and will likely crash the system. On the off
>>>> -		 * chance the situation is recoverable dump the status of the RAS
>>>> -		 * capability registers and bounce the active state of the memdev.
>>>> -		 */
>>>> -		ue = cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->serial,
>>>> -				    cxlmd->endpoint->regs.ras);
>>>> +	if (!aer_cap) {
>>>> +		pr_warn_ratelimited("%s: AER capability isn't present\n",
>>>> +				    pci_name(pdev));
>>>> +		return false;
>>>>  	}
>>>>  
>>>> -	switch (state) {
>>>> -	case pci_channel_io_normal:
>>>> -		if (ue) {
>>>> -			device_release_driver(dev);
>>>> -			return PCI_ERS_RESULT_NEED_RESET;
>>>> -		}
>>>> -		return PCI_ERS_RESULT_CAN_RECOVER;
>>>> -	case pci_channel_io_frozen:
>>>> -		dev_warn(&pdev->dev,
>>>> -			 "%s: frozen state error detected, disable CXL.mem\n",
>>>> -			 dev_name(dev));
>>>> -		device_release_driver(dev);
>>>> -		return PCI_ERS_RESULT_NEED_RESET;
>>>> -	case pci_channel_io_perm_failure:
>>>> -		dev_warn(&pdev->dev,
>>>> -			 "failure state error detected, request disconnect\n");
>>>> -		return PCI_ERS_RESULT_DISCONNECT;
>>>> -	}
>>>> -	return PCI_ERS_RESULT_NEED_RESET;
>>>> +	pci_read_config_dword(pdev, aer_cap + PCI_ERR_UNCOR_STATUS, &aer.uncor_status);
>>>> +	pci_read_config_dword(pdev, aer_cap + PCI_ERR_UNCOR_MASK, &aer.uncor_mask);
>>>> +
>>>> +	/* The AER driver logged the error */
>>>> +	pci_aer_clear_nonfatal_status(pdev);
>>>> +	pci_aer_clear_fatal_status(pdev);
>>>> +
>>>> +	return (aer.uncor_status & aer.uncor_mask);
>>>> +}
>>>> +
>>>> +pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
>>>> +					pci_channel_state_t error)
>>>> +{
>>>> +	u32 rc = cxl_handle_aer(pdev);
>>>> +
>>>> +	return rc ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_CAN_RECOVER;
>>>>  }
>>>> -EXPORT_SYMBOL_NS_GPL(cxl_error_detected, "CXL");
>>>> +EXPORT_SYMBOL_NS_GPL(cxl_pci_error_detected, "CXL");
>>>>  
>>>>  static void cxl_handle_proto_error(struct cxl_proto_err_work_data *err_info)
>>>>  {
>>>> diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
>>>> index 970add0256e9..5534422b496c 100644
>>>> --- a/drivers/cxl/cxlpci.h
>>>> +++ b/drivers/cxl/cxlpci.h
>>>> @@ -79,15 +79,14 @@ void read_cdat_data(struct cxl_port *port);
>>>>  
>>>>  #ifdef CONFIG_CXL_RAS
>>>>  void cxl_cor_error_detected(struct pci_dev *pdev);
>>>> -pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
>>>> -				    pci_channel_state_t state);
>>>>  void devm_cxl_dport_rch_ras_setup(struct cxl_dport *dport);
>>>> +pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
>>>> +					pci_channel_state_t error);
>>>>  void devm_cxl_port_ras_setup(struct cxl_port *port);
>>>>  #else
>>>>  static inline void cxl_cor_error_detected(struct pci_dev *pdev) { }
>>>> -
>>>> -static inline pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
>>>> -						  pci_channel_state_t state)
>>>> +static inline pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
>>>> +						      pci_channel_state_t state)
>>>>  {
>>>>  	return PCI_ERS_RESULT_NONE;
>>>>  }
>>>> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
>>>> index acb0eb2a13c3..ff741adc7c7f 100644
>>>> --- a/drivers/cxl/pci.c
>>>> +++ b/drivers/cxl/pci.c
>>>> @@ -1051,8 +1051,8 @@ static void cxl_reset_done(struct pci_dev *pdev)
>>>>  	}
>>>>  }
>>>>  
>>>> -static const struct pci_error_handlers cxl_error_handlers = {
>>>> -	.error_detected	= cxl_error_detected,
>>>> +static const struct pci_error_handlers pci_error_handlers = {
>>>> +	.error_detected	= cxl_pci_error_detected,
>>>>  	.slot_reset	= cxl_slot_reset,
>>>>  	.resume		= cxl_error_resume,
>>>>  	.cor_error_detected	= cxl_cor_error_detected,
>>>> @@ -1063,7 +1063,7 @@ static struct pci_driver cxl_pci_driver = {
>>>>  	.name			= KBUILD_MODNAME,
>>>>  	.id_table		= cxl_mem_pci_tbl,
>>>>  	.probe			= cxl_pci_probe,
>>>> -	.err_handler		= &cxl_error_handlers,
>>>> +	.err_handler		= &pci_error_handlers,
>>>>  	.dev_groups		= cxl_rcd_groups,
>>>>  	.driver	= {
>>>>  		.probe_type	= PROBE_PREFER_ASYNCHRONOUS,
>>>
>>
>>
> 
>
Re: [PATCH v15 7/9] cxl: Update Endpoint AER uncorrectable handler
Posted by Jonathan Cameron 5 days, 2 hours ago
On Mon, 2 Feb 2026 20:52:42 -0600
Terry Bowman <terry.bowman@amd.com> wrote:

> CXL drivers now implement protocol RAS support. PCI protocol errors,
> however, continue to be reported via the AER capability and must still be
> handled by a PCI error recovery callback.
> 
> Replace the existing cxl_error_detected() callback in cxl/pci.c with a
> new cxl_pci_error_detected() implementation that handles only uncorrectable
> PCI protocol errors reported through AER.
> 
> Introduce a helper named cxl_handle_aer() and implement it to handle and
> log the CXL device's AER error.
> 
> This cleanly separates CXL protocol error handling from PCI AER handling
> and ensures that each subsystem processes only the errors it is
> responsible for.
> 
> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> 
> ---
> 
> Changes in v14->v15:
> - Title update (Terry)
> - Change cxl_pci_error_detected() to handle & log AER (Terry)
> - Update commit message (Terry)
> - Moved cxl_handle_ras()/cxl_handle_cor_ras() to earlier patch (Terry)
> 
> Changes in v13->v14:
> - Update commit headline (Bjorn)
> - Rename pci_error_detected()/pci_cor_error_detected() ->
>   cxl_pci_error_detected/cxl_pci_cor_error_detected() (Jonathan)
> - Remove now-invalid comment in cxl_error_detected() (Jonathan)
> - Split into separate patches for UCE and CE (Terry)
> 
> Changes in v12->v13:
> - Update commit message (Terry)
> - Updated all the implementation and commit message. (Terry)
> - Refactored cxl_cor_error_detected()/cxl_error_detected() to remove
>   pdev (Dave Jiang)
> 
> Changes in v11->v12:
> - None
> 
> Changes in v10->v11:
> - cxl_error_detected() - Change handlers' scoped_guard() to guard() (Jonathan)
> - cxl_error_detected() - Remove extra line (Shiju)
> - Changes moved to core/ras.c (Terry)
> - cxl_error_detected(), remove 'ue' and return with function call. (Jonathan)
> - Remove extra space in documentation for PCI_ERS_RESULT_PANIC definition
> - Move #include "pci.h from cxl.h to core.h (Terry)
> - Remove unnecessary includes of cxl.h and core.h in mem.c (Terry)
> ---
>  drivers/cxl/core/ras.c | 68 +++++++++++++++---------------------------
>  drivers/cxl/cxlpci.h   |  9 +++---
>  drivers/cxl/pci.c      |  6 ++--
>  3 files changed, 31 insertions(+), 52 deletions(-)
> 
> diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
> index 970ff3df442c..061e6aaec176 100644
> --- a/drivers/cxl/core/ras.c
> +++ b/drivers/cxl/core/ras.c
> @@ -441,55 +441,35 @@ void cxl_cor_error_detected(struct pci_dev *pdev)
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL");
>  
> -pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
> -				    pci_channel_state_t state)
> +static bool cxl_handle_aer(struct pci_dev *pdev)
>  {
> -	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
> -	struct cxl_memdev *cxlmd = cxlds->cxlmd;
> -	struct device *dev = &cxlmd->dev;
> -	bool ue;
> -
> -	scoped_guard(device, dev) {
> -		if (!dev->driver) {
> -			dev_warn(&pdev->dev,
> -				 "%s: memdev disabled, abort error handling\n",
> -				 dev_name(dev));
> -			return PCI_ERS_RESULT_DISCONNECT;
> -		}
> +	struct aer_capability_regs aer;

I don't see a strong reason to use this structure given you just want two
of the registers and read into them one by one.

> +	u32 aer_cap = pdev->aer_cap;
>  
> -		if (cxlds->rcd)
> -			cxl_handle_rdport_errors(cxlds);
> -		/*
> -		 * A frozen channel indicates an impending reset which is fatal to
> -		 * CXL.mem operation, and will likely crash the system. On the off
> -		 * chance the situation is recoverable dump the status of the RAS
> -		 * capability registers and bounce the active state of the memdev.
> -		 */
> -		ue = cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->serial,
> -				    cxlmd->endpoint->regs.ras);
> +	if (!aer_cap) {
> +		pr_warn_ratelimited("%s: AER capability isn't present\n",
> +				    pci_name(pdev));

These could use dev_warn_ratelimited()
or even add a wrapper similar to pci_info_ratelimited()

> +		return false;
>  	}
>  
> -	switch (state) {
> -	case pci_channel_io_normal:
> -		if (ue) {
> -			device_release_driver(dev);
> -			return PCI_ERS_RESULT_NEED_RESET;
> -		}
> -		return PCI_ERS_RESULT_CAN_RECOVER;
> -	case pci_channel_io_frozen:
> -		dev_warn(&pdev->dev,
> -			 "%s: frozen state error detected, disable CXL.mem\n",
> -			 dev_name(dev));
> -		device_release_driver(dev);
> -		return PCI_ERS_RESULT_NEED_RESET;
> -	case pci_channel_io_perm_failure:
> -		dev_warn(&pdev->dev,
> -			 "failure state error detected, request disconnect\n");
> -		return PCI_ERS_RESULT_DISCONNECT;
> -	}
> -	return PCI_ERS_RESULT_NEED_RESET;
> +	pci_read_config_dword(pdev, aer_cap + PCI_ERR_UNCOR_STATUS, &aer.uncor_status);
> +	pci_read_config_dword(pdev, aer_cap + PCI_ERR_UNCOR_MASK, &aer.uncor_mask);
> +
> +	/* The AER driver logged the error */
> +	pci_aer_clear_nonfatal_status(pdev);
> +	pci_aer_clear_fatal_status(pdev);
> +
> +	return (aer.uncor_status & aer.uncor_mask);
> +}