[PATCH v9 03/16] CXL/AER: Introduce kfifo for forwarding CXL errors

Terry Bowman posted 16 patches 6 months, 2 weeks ago
[PATCH v9 03/16] CXL/AER: Introduce kfifo for forwarding CXL errors
Posted by Terry Bowman 6 months, 2 weeks ago
CXL error handling will soon be moved from the AER driver into the CXL
driver. This requires a notification mechanism for the AER driver to share
the AER interrupt with the CXL driver. The notification will be used
as an indication for the CXL drivers to handle and log the CXL RAS errors.

Add a kfifo work queue to be used by the AER driver and CXL driver. The AER
driver will be the sole kfifo producer adding work and the cxl_core will be
the sole kfifo consumer removing work. Add the boilerplate kfifo support.

Add CXL work queue handler registration functions in the AER driver. Export
the functions allowing CXL driver to access. Implement registration
functions for the CXL driver to assign or clear the work handler function.

Introduce function cxl_create_prot_err_info() and 'struct cxl_prot_err_info'.
Implement cxl_create_prot_err_info() to populate a 'struct cxl_prot_err_info'
instance with the AER severity and the erring device's PCI SBDF. The SBDF
details will be used to rediscover the erring device after the CXL driver
dequeues the kfifo work. The device rediscovery will be introduced along
with the CXL handling in future patches.

Signed-off-by: Terry Bowman <terry.bowman@amd.com>
---
 drivers/cxl/core/ras.c |  31 +++++++++-
 drivers/cxl/cxlpci.h   |   1 +
 drivers/pci/pcie/aer.c | 132 ++++++++++++++++++++++++++++-------------
 include/linux/aer.h    |  36 +++++++++++
 4 files changed, 157 insertions(+), 43 deletions(-)

diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
index 485a831695c7..d35525e79e04 100644
--- a/drivers/cxl/core/ras.c
+++ b/drivers/cxl/core/ras.c
@@ -5,6 +5,7 @@
 #include <linux/aer.h>
 #include <cxl/event.h>
 #include <cxlmem.h>
+#include <cxlpci.h>
 #include "trace.h"
 
 static void cxl_cper_trace_corr_port_prot_err(struct pci_dev *pdev,
@@ -107,13 +108,41 @@ static void cxl_cper_prot_err_work_fn(struct work_struct *work)
 }
 static DECLARE_WORK(cxl_cper_prot_err_work, cxl_cper_prot_err_work_fn);
 
+#ifdef CONFIG_PCIEAER_CXL
+
+static void cxl_prot_err_work_fn(struct work_struct *work)
+{
+}
+
+#else
+static void cxl_prot_err_work_fn(struct work_struct *work) { }
+#endif /* CONFIG_PCIEAER_CXL */
+
+static struct work_struct cxl_prot_err_work;
+static DECLARE_WORK(cxl_prot_err_work, cxl_prot_err_work_fn);
+
 int cxl_ras_init(void)
 {
-	return cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
+	int rc;
+
+	rc = cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
+	if (rc)
+		pr_err("Failed to register CPER AER kfifo (%x)", rc);
+
+	rc = cxl_register_prot_err_work(&cxl_prot_err_work);
+	if (rc) {
+		pr_err("Failed to register native AER kfifo (%x)", rc);
+		return rc;
+	}
+
+	return 0;
 }
 
 void cxl_ras_exit(void)
 {
 	cxl_cper_unregister_prot_err_work(&cxl_cper_prot_err_work);
 	cancel_work_sync(&cxl_cper_prot_err_work);
+
+	cxl_unregister_prot_err_work();
+	cancel_work_sync(&cxl_prot_err_work);
 }
diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
index 54e219b0049e..6f1396ef7b77 100644
--- a/drivers/cxl/cxlpci.h
+++ b/drivers/cxl/cxlpci.h
@@ -4,6 +4,7 @@
 #define __CXL_PCI_H__
 #include <linux/pci.h>
 #include "cxl.h"
+#include "linux/aer.h"
 
 #define CXL_MEMORY_PROGIF	0x10
 
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index adb4b1123b9b..5350fa5be784 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -114,6 +114,14 @@ struct aer_stats {
 static int pcie_aer_disable;
 static pci_ers_result_t aer_root_reset(struct pci_dev *dev);
 
+#if defined(CONFIG_PCIEAER_CXL)
+#define CXL_ERROR_SOURCES_MAX          128
+static DEFINE_KFIFO(cxl_prot_err_fifo, struct cxl_prot_err_work_data,
+		    CXL_ERROR_SOURCES_MAX);
+static DEFINE_SPINLOCK(cxl_prot_err_fifo_lock);
+struct work_struct *cxl_prot_err_work;
+#endif
+
 void pci_no_aer(void)
 {
 	pcie_aer_disable = 1;
@@ -1004,45 +1012,17 @@ static bool is_internal_error(struct aer_err_info *info)
 	return info->status & PCI_ERR_UNC_INTN;
 }
 
-static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data)
+static bool is_cxl_error(struct pci_dev *pdev, struct aer_err_info *info)
 {
-	struct aer_err_info *info = (struct aer_err_info *)data;
-	const struct pci_error_handlers *err_handler;
+	if (!info || !info->is_cxl)
+		return false;
 
-	if (!is_cxl_mem_dev(dev) || !cxl_error_is_native(dev))
-		return 0;
+	/* Only CXL Endpoints are currently supported */
+	if ((pci_pcie_type(pdev) != PCI_EXP_TYPE_ENDPOINT) &&
+	    (pci_pcie_type(pdev) != PCI_EXP_TYPE_RC_EC))
+		return false;
 
-	/* Protect dev->driver */
-	device_lock(&dev->dev);
-
-	err_handler = dev->driver ? dev->driver->err_handler : NULL;
-	if (!err_handler)
-		goto out;
-
-	if (info->severity == AER_CORRECTABLE) {
-		if (err_handler->cor_error_detected)
-			err_handler->cor_error_detected(dev);
-	} else if (err_handler->error_detected) {
-		if (info->severity == AER_NONFATAL)
-			err_handler->error_detected(dev, pci_channel_io_normal);
-		else if (info->severity == AER_FATAL)
-			err_handler->error_detected(dev, pci_channel_io_frozen);
-	}
-out:
-	device_unlock(&dev->dev);
-	return 0;
-}
-
-static void cxl_rch_handle_error(struct pci_dev *dev, struct aer_err_info *info)
-{
-	/*
-	 * Internal errors of an RCEC indicate an AER error in an
-	 * RCH's downstream port. Check and handle them in the CXL.mem
-	 * device driver.
-	 */
-	if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC &&
-	    is_internal_error(info))
-		pcie_walk_rcec(dev, cxl_rch_handle_error_iter, info);
+	return is_internal_error(info);
 }
 
 static int handles_cxl_error_iter(struct pci_dev *dev, void *data)
@@ -1056,13 +1036,17 @@ static int handles_cxl_error_iter(struct pci_dev *dev, void *data)
 	return *handles_cxl;
 }
 
-static bool handles_cxl_errors(struct pci_dev *rcec)
+static bool handles_cxl_errors(struct pci_dev *dev)
 {
 	bool handles_cxl = false;
 
-	if (pci_pcie_type(rcec) == PCI_EXP_TYPE_RC_EC &&
-	    pcie_aer_is_native(rcec))
-		pcie_walk_rcec(rcec, handles_cxl_error_iter, &handles_cxl);
+	if (!pcie_aer_is_native(dev))
+		return false;
+
+	if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC)
+		pcie_walk_rcec(dev, handles_cxl_error_iter, &handles_cxl);
+	else
+		handles_cxl = pcie_is_cxl(dev);
 
 	return handles_cxl;
 }
@@ -1076,10 +1060,46 @@ static void cxl_rch_enable_rcec(struct pci_dev *rcec)
 	pci_info(rcec, "CXL: Internal errors unmasked");
 }
 
+static int cxl_create_prot_error_info(struct pci_dev *pdev,
+				      struct aer_err_info *aer_err_info,
+				      struct cxl_prot_error_info *cxl_err_info)
+{
+	cxl_err_info->severity = aer_err_info->severity;
+
+	cxl_err_info->function = PCI_FUNC(pdev->devfn);
+	cxl_err_info->device = PCI_SLOT(pdev->devfn);
+	cxl_err_info->bus = pdev->bus->number;
+	cxl_err_info->segment = pci_domain_nr(pdev->bus);
+
+	return 0;
+}
+
+static void forward_cxl_error(struct pci_dev *pdev, struct aer_err_info *aer_err_info)
+{
+	struct cxl_prot_err_work_data wd;
+	struct cxl_prot_error_info *cxl_err_info = &wd.err_info;
+
+	cxl_create_prot_error_info(pdev, aer_err_info, cxl_err_info);
+
+	if (!kfifo_put(&cxl_prot_err_fifo, wd)) {
+		dev_err_ratelimited(&pdev->dev, "CXL kfifo overflow\n");
+		return;
+	}
+
+	schedule_work(cxl_prot_err_work);
+}
+
 #else
 static inline void cxl_rch_enable_rcec(struct pci_dev *dev) { }
 static inline void cxl_rch_handle_error(struct pci_dev *dev,
 					struct aer_err_info *info) { }
+static inline void forward_cxl_error(struct pci_dev *dev,
+				    struct aer_err_info *info) { }
+static inline bool handles_cxl_errors(struct pci_dev *dev)
+{
+	return false;
+}
+static bool is_cxl_error(struct pci_dev *pdev, struct aer_err_info *info) { return 0; };
 #endif
 
 /**
@@ -1117,8 +1137,11 @@ static void pci_aer_handle_error(struct pci_dev *dev, struct aer_err_info *info)
 
 static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info)
 {
-	cxl_rch_handle_error(dev, info);
-	pci_aer_handle_error(dev, info);
+	if (is_cxl_error(dev, info))
+		forward_cxl_error(dev, info);
+	else
+		pci_aer_handle_error(dev, info);
+
 	pci_dev_put(dev);
 }
 
@@ -1582,6 +1605,31 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev)
 	return rc ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
 }
 
+#if defined(CONFIG_PCIEAER_CXL)
+
+int cxl_register_prot_err_work(struct work_struct *work)
+{
+	guard(spinlock)(&cxl_prot_err_fifo_lock);
+	cxl_prot_err_work = work;
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_register_prot_err_work, "CXL");
+
+int cxl_unregister_prot_err_work(void)
+{
+	guard(spinlock)(&cxl_prot_err_fifo_lock);
+	cxl_prot_err_work = NULL;
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_unregister_prot_err_work, "CXL");
+
+int cxl_prot_err_kfifo_get(struct cxl_prot_err_work_data *wd)
+{
+	return kfifo_get(&cxl_prot_err_fifo, wd);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_prot_err_kfifo_get, "CXL");
+#endif
+
 static struct pcie_port_service_driver aerdriver = {
 	.name		= "aer",
 	.port_type	= PCIE_ANY_PORT,
diff --git a/include/linux/aer.h b/include/linux/aer.h
index 02940be66324..550407240ab5 100644
--- a/include/linux/aer.h
+++ b/include/linux/aer.h
@@ -10,6 +10,7 @@
 
 #include <linux/errno.h>
 #include <linux/types.h>
+#include <linux/workqueue_types.h>
 
 #define AER_NONFATAL			0
 #define AER_FATAL			1
@@ -53,6 +54,27 @@ struct aer_capability_regs {
 	u16 uncor_err_source;
 };
 
+/**
+ * struct cxl_prot_err_info - Error information used in CXL error handling
+ * @severity: AER severity
+ * @function: Device's PCI function
+ * @device: Device's PCI device
+ * @bus: Device's PCI bus
+ * @segment: Device's PCI segment
+ */
+struct cxl_prot_error_info {
+	int severity;
+
+	u8 function;
+	u8 device;
+	u8 bus;
+	u16 segment;
+};
+
+struct cxl_prot_err_work_data {
+	struct cxl_prot_error_info err_info;
+};
+
 #if defined(CONFIG_PCIEAER)
 int pci_aer_clear_nonfatal_status(struct pci_dev *dev);
 int pcie_aer_is_native(struct pci_dev *dev);
@@ -64,6 +86,20 @@ static inline int pci_aer_clear_nonfatal_status(struct pci_dev *dev)
 static inline int pcie_aer_is_native(struct pci_dev *dev) { return 0; }
 #endif
 
+#if defined(CONFIG_PCIEAER_CXL)
+int cxl_register_prot_err_work(struct work_struct *work);
+int cxl_unregister_prot_err_work(void);
+int cxl_prot_err_kfifo_get(struct cxl_prot_err_work_data *wd);
+#else
+static inline int
+cxl_register_prot_err_work(struct work_struct *work)
+{
+	return 0;
+}
+static inline int cxl_unregister_prot_err_work(void) { return 0; }
+static inline int cxl_prot_err_kfifo_get(struct cxl_prot_err_work_data *wd) { return 0; }
+#endif
+
 void pci_print_aer(struct pci_dev *dev, int aer_severity,
 		    struct aer_capability_regs *aer);
 int cper_severity_to_aer(int cper_severity);
-- 
2.34.1
Re: [PATCH v9 03/16] CXL/AER: Introduce kfifo for forwarding CXL errors
Posted by Jonathan Cameron 6 months, 1 week ago
On Tue, 3 Jun 2025 12:22:26 -0500
Terry Bowman <terry.bowman@amd.com> wrote:

> CXL error handling will soon be moved from the AER driver into the CXL
> driver. This requires a notification mechanism for the AER driver to share
> the AER interrupt with the CXL driver. The notification will be used
> as an indication for the CXL drivers to handle and log the CXL RAS errors.
> 
> Add a kfifo work queue to be used by the AER driver and CXL driver. The AER
> driver will be the sole kfifo producer adding work and the cxl_core will be
> the sole kfifo consumer removing work. Add the boilerplate kfifo support.
> 
> Add CXL work queue handler registration functions in the AER driver. Export
> the functions allowing CXL driver to access. Implement registration
> functions for the CXL driver to assign or clear the work handler function.
> 
> Introduce function cxl_create_prot_err_info() and 'struct cxl_prot_err_info'.
> Implement cxl_create_prot_err_info() to populate a 'struct cxl_prot_err_info'
> instance with the AER severity and the erring device's PCI SBDF. The SBDF
> details will be used to rediscover the erring device after the CXL driver
> dequeues the kfifo work. The device rediscovery will be introduced along
> with the CXL handling in future patches.
> 
> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
Hi Terry,

A few trivial bits of feedback.  I didn't find anything substantial that
hasn't already been covered by others.

Jonathan

> ---
>  drivers/cxl/core/ras.c |  31 +++++++++-
>  drivers/cxl/cxlpci.h   |   1 +
>  drivers/pci/pcie/aer.c | 132 ++++++++++++++++++++++++++++-------------
>  include/linux/aer.h    |  36 +++++++++++
>  4 files changed, 157 insertions(+), 43 deletions(-)
> 
> diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
> index 485a831695c7..d35525e79e04 100644
> --- a/drivers/cxl/core/ras.c
> +++ b/drivers/cxl/core/ras.c
> @@ -5,6 +5,7 @@

>  
>  void cxl_ras_exit(void)
>  {
>  	cxl_cper_unregister_prot_err_work(&cxl_cper_prot_err_work);
>  	cancel_work_sync(&cxl_cper_prot_err_work);
> +
> +	cxl_unregister_prot_err_work();
> +	cancel_work_sync(&cxl_prot_err_work);
>  }

> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index adb4b1123b9b..5350fa5be784 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c

> +static int cxl_create_prot_error_info(struct pci_dev *pdev,
> +				      struct aer_err_info *aer_err_info,
> +				      struct cxl_prot_error_info *cxl_err_info)
> +{
> +	cxl_err_info->severity = aer_err_info->severity;
> +
> +	cxl_err_info->function = PCI_FUNC(pdev->devfn);
> +	cxl_err_info->device = PCI_SLOT(pdev->devfn);
> +	cxl_err_info->bus = pdev->bus->number;
> +	cxl_err_info->segment = pci_domain_nr(pdev->bus);
Maybe 
	*cxl_err_info = (struct cxl_prot_error_info) {
		.severity = aer_err_info->severity,
		.function = PCI_FUNC(pdev->devfn),
		.device = PCI_SLOT(pdev->devfn),
		.bus = pdev->bus_number,
		.segment = pci_domain_nr(dev->nbus),
	};

Or if it isn't going to get more use later, just put that assignment in
forward_cxl_error as this helper doesn't seem to add a huge amount of
value wrt to code reability.


> +
> +	return 0;
> +}
> +
> +static void forward_cxl_error(struct pci_dev *pdev, struct aer_err_info *aer_err_info)
> +{
> +	struct cxl_prot_err_work_data wd;
> +	struct cxl_prot_error_info *cxl_err_info = &wd.err_info;
> +
> +	cxl_create_prot_error_info(pdev, aer_err_info, cxl_err_info);

	cxl_create_prot_error_info(pdev, aer_err_info, &wd.err_info);

Ignore if this gets more complicated in a later patch but for now I don't
see the local variable adding any value. If anything it slightly obscures
how this gets used via wd in the next line.  However given the code is short
that is fairly obvious either way.

Or as above
	wd.error_info = (struct cxl_prot_error_info) {
		.severity = ...
	};

> +
> +	if (!kfifo_put(&cxl_prot_err_fifo, wd)) {
> +		dev_err_ratelimited(&pdev->dev, "CXL kfifo overflow\n");
> +		return;
> +	}
> +
> +	schedule_work(cxl_prot_err_work);
> +}
> +
>  #else
>  static inline void cxl_rch_enable_rcec(struct pci_dev *dev) { }
>  static inline void cxl_rch_handle_error(struct pci_dev *dev,
>  					struct aer_err_info *info) { }
> +static inline void forward_cxl_error(struct pci_dev *dev,
> +				    struct aer_err_info *info) { }

Is this aligned right - looks one space short?

> +static inline bool handles_cxl_errors(struct pci_dev *dev)
> +{
> +	return false;
> +}
> +static bool is_cxl_error(struct pci_dev *pdev, struct aer_err_info *info) { return 0; };

wrap choices for the two stubs are a bit odd.   The first one is actually shorter
if not wrapped than the second one is that you chose not to wrap.
My slightly preference would be to wrap them both as the fist one.


>  #endif
Re: [PATCH v9 03/16] CXL/AER: Introduce kfifo for forwarding CXL errors
Posted by Bowman, Terry 6 months, 1 week ago

On 6/12/2025 6:04 AM, Jonathan Cameron wrote:
> On Tue, 3 Jun 2025 12:22:26 -0500
> Terry Bowman <terry.bowman@amd.com> wrote:
>
>> CXL error handling will soon be moved from the AER driver into the CXL
>> driver. This requires a notification mechanism for the AER driver to share
>> the AER interrupt with the CXL driver. The notification will be used
>> as an indication for the CXL drivers to handle and log the CXL RAS errors.
>>
>> Add a kfifo work queue to be used by the AER driver and CXL driver. The AER
>> driver will be the sole kfifo producer adding work and the cxl_core will be
>> the sole kfifo consumer removing work. Add the boilerplate kfifo support.
>>
>> Add CXL work queue handler registration functions in the AER driver. Export
>> the functions allowing CXL driver to access. Implement registration
>> functions for the CXL driver to assign or clear the work handler function.
>>
>> Introduce function cxl_create_prot_err_info() and 'struct cxl_prot_err_info'.
>> Implement cxl_create_prot_err_info() to populate a 'struct cxl_prot_err_info'
>> instance with the AER severity and the erring device's PCI SBDF. The SBDF
>> details will be used to rediscover the erring device after the CXL driver
>> dequeues the kfifo work. The device rediscovery will be introduced along
>> with the CXL handling in future patches.
>>
>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> Hi Terry,
>
> A few trivial bits of feedback.  I didn't find anything substantial that
> hasn't already been covered by others.
>
> Jonathan
>
>> ---
>>  drivers/cxl/core/ras.c |  31 +++++++++-
>>  drivers/cxl/cxlpci.h   |   1 +
>>  drivers/pci/pcie/aer.c | 132 ++++++++++++++++++++++++++++-------------
>>  include/linux/aer.h    |  36 +++++++++++
>>  4 files changed, 157 insertions(+), 43 deletions(-)
>>
>> diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
>> index 485a831695c7..d35525e79e04 100644
>> --- a/drivers/cxl/core/ras.c
>> +++ b/drivers/cxl/core/ras.c
>> @@ -5,6 +5,7 @@
>>  
>>  void cxl_ras_exit(void)
>>  {
>>  	cxl_cper_unregister_prot_err_work(&cxl_cper_prot_err_work);
>>  	cancel_work_sync(&cxl_cper_prot_err_work);
>> +
>> +	cxl_unregister_prot_err_work();
>> +	cancel_work_sync(&cxl_prot_err_work);
>>  }
>> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
>> index adb4b1123b9b..5350fa5be784 100644
>> --- a/drivers/pci/pcie/aer.c
>> +++ b/drivers/pci/pcie/aer.c
>> +static int cxl_create_prot_error_info(struct pci_dev *pdev,
>> +				      struct aer_err_info *aer_err_info,
>> +				      struct cxl_prot_error_info *cxl_err_info)
>> +{
>> +	cxl_err_info->severity = aer_err_info->severity;
>> +
>> +	cxl_err_info->function = PCI_FUNC(pdev->devfn);
>> +	cxl_err_info->device = PCI_SLOT(pdev->devfn);
>> +	cxl_err_info->bus = pdev->bus->number;
>> +	cxl_err_info->segment = pci_domain_nr(pdev->bus);
> Maybe 
> 	*cxl_err_info = (struct cxl_prot_error_info) {
> 		.severity = aer_err_info->severity,
> 		.function = PCI_FUNC(pdev->devfn),
> 		.device = PCI_SLOT(pdev->devfn),
> 		.bus = pdev->bus_number,
> 		.segment = pci_domain_nr(dev->nbus),
> 	};
>
Ok. And I'll take your other recommendation from patch [4/16] to not split devfn.
> Or if it isn't going to get more use later, just put that assignment in
> forward_cxl_error as this helper doesn't seem to add a huge amount of
> value wrt to code reability.
>
Good idea. This function started out doing more work but has been simplified that
it no longer needs be in a helper function.

>> +
>> +	return 0;
>> +}
>> +
>> +static void forward_cxl_error(struct pci_dev *pdev, struct aer_err_info *aer_err_info)
>> +{
>> +	struct cxl_prot_err_work_data wd;
>> +	struct cxl_prot_error_info *cxl_err_info = &wd.err_info;
>> +
>> +	cxl_create_prot_error_info(pdev, aer_err_info, cxl_err_info);
> 	cxl_create_prot_error_info(pdev, aer_err_info, &wd.err_info);
>
> Ignore if this gets more complicated in a later patch but for now I don't
> see the local variable adding any value. If anything it slightly obscures
> how this gets used via wd in the next line.  However given the code is short
> that is fairly obvious either way.
>
> Or as above
> 	wd.error_info = (struct cxl_prot_error_info) {
> 		.severity = ...
> 	};
Ok. I'll let you know if it doesn't work. ;)
>> +
>> +	if (!kfifo_put(&cxl_prot_err_fifo, wd)) {
>> +		dev_err_ratelimited(&pdev->dev, "CXL kfifo overflow\n");
>> +		return;
>> +	}
>> +
>> +	schedule_work(cxl_prot_err_work);
>> +}
>> +
>>  #else
>>  static inline void cxl_rch_enable_rcec(struct pci_dev *dev) { }
>>  static inline void cxl_rch_handle_error(struct pci_dev *dev,
>>  					struct aer_err_info *info) { }
>> +static inline void forward_cxl_error(struct pci_dev *dev,
>> +				    struct aer_err_info *info) { }
> Is this aligned right - looks one space short?
>
>> +static inline bool handles_cxl_errors(struct pci_dev *dev)
>> +{
>> +	return false;
>> +}
>> +static bool is_cxl_error(struct pci_dev *pdev, struct aer_err_info *info) { return 0; };
> wrap choices for the two stubs are a bit odd.   The first one is actually shorter
> if not wrapped than the second one is that you chose not to wrap.
> My slightly preference would be to wrap them both as the fist one.
>
Thanks for pointing out. Fortunately, this (and above alignment note) is moved into a new file
pci/pcie/cxl_aer.c in the next revision and no longer requires this source file #ifdef.

-Terry

>>  #endif
>
Re: [PATCH v9 03/16] CXL/AER: Introduce kfifo for forwarding CXL errors
Posted by Dave Jiang 6 months, 2 weeks ago

On 6/3/25 10:22 AM, Terry Bowman wrote:
> CXL error handling will soon be moved from the AER driver into the CXL
> driver. This requires a notification mechanism for the AER driver to share
> the AER interrupt with the CXL driver. The notification will be used
> as an indication for the CXL drivers to handle and log the CXL RAS errors.
> 
> Add a kfifo work queue to be used by the AER driver and CXL driver. The AER
> driver will be the sole kfifo producer adding work and the cxl_core will be
> the sole kfifo consumer removing work. Add the boilerplate kfifo support.
> 
> Add CXL work queue handler registration functions in the AER driver. Export
> the functions allowing CXL driver to access. Implement registration
> functions for the CXL driver to assign or clear the work handler function.
> 
> Introduce function cxl_create_prot_err_info() and 'struct cxl_prot_err_info'.
> Implement cxl_create_prot_err_info() to populate a 'struct cxl_prot_err_info'
> instance with the AER severity and the erring device's PCI SBDF. The SBDF
> details will be used to rediscover the erring device after the CXL driver
> dequeues the kfifo work. The device rediscovery will be introduced along
> with the CXL handling in future patches.
> 
> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> ---
>  drivers/cxl/core/ras.c |  31 +++++++++-
>  drivers/cxl/cxlpci.h   |   1 +
>  drivers/pci/pcie/aer.c | 132 ++++++++++++++++++++++++++++-------------
>  include/linux/aer.h    |  36 +++++++++++
>  4 files changed, 157 insertions(+), 43 deletions(-)
> 
> diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
> index 485a831695c7..d35525e79e04 100644
> --- a/drivers/cxl/core/ras.c
> +++ b/drivers/cxl/core/ras.c
> @@ -5,6 +5,7 @@
>  #include <linux/aer.h>
>  #include <cxl/event.h>
>  #include <cxlmem.h>
> +#include <cxlpci.h>
>  #include "trace.h"
>  
>  static void cxl_cper_trace_corr_port_prot_err(struct pci_dev *pdev,
> @@ -107,13 +108,41 @@ static void cxl_cper_prot_err_work_fn(struct work_struct *work)
>  }
>  static DECLARE_WORK(cxl_cper_prot_err_work, cxl_cper_prot_err_work_fn);
>  
> +#ifdef CONFIG_PCIEAER_CXL
> +
> +static void cxl_prot_err_work_fn(struct work_struct *work)
> +{
> +}
> +
> +#else
> +static void cxl_prot_err_work_fn(struct work_struct *work) { }
> +#endif /* CONFIG_PCIEAER_CXL */

I wonder instead of the ifdef block we can just do:

static void cxl_prot_err_work_fn(...)
{
	if (!IS_ENABLED(CONFIG_PCIEAER_CXL))
		return;

	....
}

In general we want to avoid ifdefs in C files. 

Also, where is CONFIG_PCIEAER_CXL defined? I'm having trouble finding the Kconfig that declares it.

$ git grep CONFIG_PCIEAER_CXL
drivers/cxl/core/pci.c:#ifdef CONFIG_PCIEAER_CXL
drivers/cxl/core/ras.c:#ifdef CONFIG_PCIEAER_CXL
drivers/cxl/core/ras.c:#endif /* CONFIG_PCIEAER_CXL */
drivers/cxl/cxl.h:#ifdef CONFIG_PCIEAER_CXL
drivers/cxl/port.c:#ifdef CONFIG_PCIEAER_CXL
drivers/cxl/port.c:#endif /* CONFIG_PCIEAER_CXL */
drivers/pci/pcie/aer.c:#if defined(CONFIG_PCIEAER_CXL)
drivers/pci/pcie/aer.c:#ifdef CONFIG_PCIEAER_CXL
drivers/pci/pcie/aer.c:#if defined(CONFIG_PCIEAER_CXL)
include/linux/aer.h:#if defined(CONFIG_PCIEAER_CXL)


> +
> +static struct work_struct cxl_prot_err_work;
> +static DECLARE_WORK(cxl_prot_err_work, cxl_prot_err_work_fn);
> +
>  int cxl_ras_init(void)
>  {
> -	return cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
> +	int rc;
> +
> +	rc = cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
> +	if (rc)
> +		pr_err("Failed to register CPER AER kfifo (%x)", rc);
> +
> +	rc = cxl_register_prot_err_work(&cxl_prot_err_work);
> +	if (rc) {
> +		pr_err("Failed to register native AER kfifo (%x)", rc);
> +		return rc;
> +	}
> +
> +	return 0;
>  }
>  
>  void cxl_ras_exit(void)
>  {
>  	cxl_cper_unregister_prot_err_work(&cxl_cper_prot_err_work);
>  	cancel_work_sync(&cxl_cper_prot_err_work);
> +
> +	cxl_unregister_prot_err_work();
> +	cancel_work_sync(&cxl_prot_err_work);
>  }
> diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
> index 54e219b0049e..6f1396ef7b77 100644
> --- a/drivers/cxl/cxlpci.h
> +++ b/drivers/cxl/cxlpci.h
> @@ -4,6 +4,7 @@
>  #define __CXL_PCI_H__
>  #include <linux/pci.h>
>  #include "cxl.h"
> +#include "linux/aer.h"
>  
>  #define CXL_MEMORY_PROGIF	0x10
>  
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index adb4b1123b9b..5350fa5be784 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -114,6 +114,14 @@ struct aer_stats {
>  static int pcie_aer_disable;
>  static pci_ers_result_t aer_root_reset(struct pci_dev *dev);
>  
> +#if defined(CONFIG_PCIEAER_CXL)

Would it make sense to move all the CXL bits to a cxl_aer.c instead of all the ifdefs in this C file?

DJ

> +#define CXL_ERROR_SOURCES_MAX          128
> +static DEFINE_KFIFO(cxl_prot_err_fifo, struct cxl_prot_err_work_data,
> +		    CXL_ERROR_SOURCES_MAX);
> +static DEFINE_SPINLOCK(cxl_prot_err_fifo_lock);
> +struct work_struct *cxl_prot_err_work;
> +#endif
> +
>  void pci_no_aer(void)
>  {
>  	pcie_aer_disable = 1;
> @@ -1004,45 +1012,17 @@ static bool is_internal_error(struct aer_err_info *info)
>  	return info->status & PCI_ERR_UNC_INTN;
>  }
>  
> -static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data)
> +static bool is_cxl_error(struct pci_dev *pdev, struct aer_err_info *info)
>  {
> -	struct aer_err_info *info = (struct aer_err_info *)data;
> -	const struct pci_error_handlers *err_handler;
> +	if (!info || !info->is_cxl)
> +		return false;
>  
> -	if (!is_cxl_mem_dev(dev) || !cxl_error_is_native(dev))
> -		return 0;
> +	/* Only CXL Endpoints are currently supported */
> +	if ((pci_pcie_type(pdev) != PCI_EXP_TYPE_ENDPOINT) &&
> +	    (pci_pcie_type(pdev) != PCI_EXP_TYPE_RC_EC))
> +		return false;
>  
> -	/* Protect dev->driver */
> -	device_lock(&dev->dev);
> -
> -	err_handler = dev->driver ? dev->driver->err_handler : NULL;
> -	if (!err_handler)
> -		goto out;
> -
> -	if (info->severity == AER_CORRECTABLE) {
> -		if (err_handler->cor_error_detected)
> -			err_handler->cor_error_detected(dev);
> -	} else if (err_handler->error_detected) {
> -		if (info->severity == AER_NONFATAL)
> -			err_handler->error_detected(dev, pci_channel_io_normal);
> -		else if (info->severity == AER_FATAL)
> -			err_handler->error_detected(dev, pci_channel_io_frozen);
> -	}
> -out:
> -	device_unlock(&dev->dev);
> -	return 0;
> -}
> -
> -static void cxl_rch_handle_error(struct pci_dev *dev, struct aer_err_info *info)
> -{
> -	/*
> -	 * Internal errors of an RCEC indicate an AER error in an
> -	 * RCH's downstream port. Check and handle them in the CXL.mem
> -	 * device driver.
> -	 */
> -	if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC &&
> -	    is_internal_error(info))
> -		pcie_walk_rcec(dev, cxl_rch_handle_error_iter, info);
> +	return is_internal_error(info);
>  }
>  
>  static int handles_cxl_error_iter(struct pci_dev *dev, void *data)
> @@ -1056,13 +1036,17 @@ static int handles_cxl_error_iter(struct pci_dev *dev, void *data)
>  	return *handles_cxl;
>  }
>  
> -static bool handles_cxl_errors(struct pci_dev *rcec)
> +static bool handles_cxl_errors(struct pci_dev *dev)
>  {
>  	bool handles_cxl = false;
>  
> -	if (pci_pcie_type(rcec) == PCI_EXP_TYPE_RC_EC &&
> -	    pcie_aer_is_native(rcec))
> -		pcie_walk_rcec(rcec, handles_cxl_error_iter, &handles_cxl);
> +	if (!pcie_aer_is_native(dev))
> +		return false;
> +
> +	if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC)
> +		pcie_walk_rcec(dev, handles_cxl_error_iter, &handles_cxl);
> +	else
> +		handles_cxl = pcie_is_cxl(dev);
>  
>  	return handles_cxl;
>  }
> @@ -1076,10 +1060,46 @@ static void cxl_rch_enable_rcec(struct pci_dev *rcec)
>  	pci_info(rcec, "CXL: Internal errors unmasked");
>  }
>  
> +static int cxl_create_prot_error_info(struct pci_dev *pdev,
> +				      struct aer_err_info *aer_err_info,
> +				      struct cxl_prot_error_info *cxl_err_info)
> +{
> +	cxl_err_info->severity = aer_err_info->severity;
> +
> +	cxl_err_info->function = PCI_FUNC(pdev->devfn);
> +	cxl_err_info->device = PCI_SLOT(pdev->devfn);
> +	cxl_err_info->bus = pdev->bus->number;
> +	cxl_err_info->segment = pci_domain_nr(pdev->bus);
> +
> +	return 0;
> +}
> +
> +static void forward_cxl_error(struct pci_dev *pdev, struct aer_err_info *aer_err_info)
> +{
> +	struct cxl_prot_err_work_data wd;
> +	struct cxl_prot_error_info *cxl_err_info = &wd.err_info;
> +
> +	cxl_create_prot_error_info(pdev, aer_err_info, cxl_err_info);
> +
> +	if (!kfifo_put(&cxl_prot_err_fifo, wd)) {
> +		dev_err_ratelimited(&pdev->dev, "CXL kfifo overflow\n");
> +		return;
> +	}
> +
> +	schedule_work(cxl_prot_err_work);
> +}
> +
>  #else
>  static inline void cxl_rch_enable_rcec(struct pci_dev *dev) { }
>  static inline void cxl_rch_handle_error(struct pci_dev *dev,
>  					struct aer_err_info *info) { }
> +static inline void forward_cxl_error(struct pci_dev *dev,
> +				    struct aer_err_info *info) { }
> +static inline bool handles_cxl_errors(struct pci_dev *dev)
> +{
> +	return false;
> +}
> +static bool is_cxl_error(struct pci_dev *pdev, struct aer_err_info *info) { return 0; };
>  #endif
>  
>  /**
> @@ -1117,8 +1137,11 @@ static void pci_aer_handle_error(struct pci_dev *dev, struct aer_err_info *info)
>  
>  static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info)
>  {
> -	cxl_rch_handle_error(dev, info);
> -	pci_aer_handle_error(dev, info);
> +	if (is_cxl_error(dev, info))
> +		forward_cxl_error(dev, info);
> +	else
> +		pci_aer_handle_error(dev, info);
> +
>  	pci_dev_put(dev);
>  }
>  
> @@ -1582,6 +1605,31 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev)
>  	return rc ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
>  }
>  
> +#if defined(CONFIG_PCIEAER_CXL)
> +
> +int cxl_register_prot_err_work(struct work_struct *work)
> +{
> +	guard(spinlock)(&cxl_prot_err_fifo_lock);
> +	cxl_prot_err_work = work;
> +	return 0;
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_register_prot_err_work, "CXL");
> +
> +int cxl_unregister_prot_err_work(void)
> +{
> +	guard(spinlock)(&cxl_prot_err_fifo_lock);
> +	cxl_prot_err_work = NULL;
> +	return 0;
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_unregister_prot_err_work, "CXL");
> +
> +int cxl_prot_err_kfifo_get(struct cxl_prot_err_work_data *wd)
> +{
> +	return kfifo_get(&cxl_prot_err_fifo, wd);
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_prot_err_kfifo_get, "CXL");
> +#endif
> +
>  static struct pcie_port_service_driver aerdriver = {
>  	.name		= "aer",
>  	.port_type	= PCIE_ANY_PORT,
> diff --git a/include/linux/aer.h b/include/linux/aer.h
> index 02940be66324..550407240ab5 100644
> --- a/include/linux/aer.h
> +++ b/include/linux/aer.h
> @@ -10,6 +10,7 @@
>  
>  #include <linux/errno.h>
>  #include <linux/types.h>
> +#include <linux/workqueue_types.h>
>  
>  #define AER_NONFATAL			0
>  #define AER_FATAL			1
> @@ -53,6 +54,27 @@ struct aer_capability_regs {
>  	u16 uncor_err_source;
>  };
>  
> +/**
> + * struct cxl_prot_err_info - Error information used in CXL error handling
> + * @severity: AER severity
> + * @function: Device's PCI function
> + * @device: Device's PCI device
> + * @bus: Device's PCI bus
> + * @segment: Device's PCI segment
> + */
> +struct cxl_prot_error_info {
> +	int severity;
> +
> +	u8 function;
> +	u8 device;
> +	u8 bus;
> +	u16 segment;
> +};
> +
> +struct cxl_prot_err_work_data {
> +	struct cxl_prot_error_info err_info;
> +};
> +
>  #if defined(CONFIG_PCIEAER)
>  int pci_aer_clear_nonfatal_status(struct pci_dev *dev);
>  int pcie_aer_is_native(struct pci_dev *dev);
> @@ -64,6 +86,20 @@ static inline int pci_aer_clear_nonfatal_status(struct pci_dev *dev)
>  static inline int pcie_aer_is_native(struct pci_dev *dev) { return 0; }
>  #endif
>  
> +#if defined(CONFIG_PCIEAER_CXL)
> +int cxl_register_prot_err_work(struct work_struct *work);
> +int cxl_unregister_prot_err_work(void);
> +int cxl_prot_err_kfifo_get(struct cxl_prot_err_work_data *wd);
> +#else
> +static inline int
> +cxl_register_prot_err_work(struct work_struct *work)
> +{
> +	return 0;
> +}
> +static inline int cxl_unregister_prot_err_work(void) { return 0; }
> +static inline int cxl_prot_err_kfifo_get(struct cxl_prot_err_work_data *wd) { return 0; }
> +#endif
> +
>  void pci_print_aer(struct pci_dev *dev, int aer_severity,
>  		    struct aer_capability_regs *aer);
>  int cper_severity_to_aer(int cper_severity);
Re: [PATCH v9 03/16] CXL/AER: Introduce kfifo for forwarding CXL errors
Posted by Bowman, Terry 6 months, 2 weeks ago

On 6/5/2025 7:27 PM, Dave Jiang wrote:
>
> On 6/3/25 10:22 AM, Terry Bowman wrote:
>> CXL error handling will soon be moved from the AER driver into the CXL
>> driver. This requires a notification mechanism for the AER driver to share
>> the AER interrupt with the CXL driver. The notification will be used
>> as an indication for the CXL drivers to handle and log the CXL RAS errors.
>>
>> Add a kfifo work queue to be used by the AER driver and CXL driver. The AER
>> driver will be the sole kfifo producer adding work and the cxl_core will be
>> the sole kfifo consumer removing work. Add the boilerplate kfifo support.
>>
>> Add CXL work queue handler registration functions in the AER driver. Export
>> the functions allowing CXL driver to access. Implement registration
>> functions for the CXL driver to assign or clear the work handler function.
>>
>> Introduce function cxl_create_prot_err_info() and 'struct cxl_prot_err_info'.
>> Implement cxl_create_prot_err_info() to populate a 'struct cxl_prot_err_info'
>> instance with the AER severity and the erring device's PCI SBDF. The SBDF
>> details will be used to rediscover the erring device after the CXL driver
>> dequeues the kfifo work. The device rediscovery will be introduced along
>> with the CXL handling in future patches.
>>
>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
>> ---
>>  drivers/cxl/core/ras.c |  31 +++++++++-
>>  drivers/cxl/cxlpci.h   |   1 +
>>  drivers/pci/pcie/aer.c | 132 ++++++++++++++++++++++++++++-------------
>>  include/linux/aer.h    |  36 +++++++++++
>>  4 files changed, 157 insertions(+), 43 deletions(-)
>>
>> diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
>> index 485a831695c7..d35525e79e04 100644
>> --- a/drivers/cxl/core/ras.c
>> +++ b/drivers/cxl/core/ras.c
>> @@ -5,6 +5,7 @@
>>  #include <linux/aer.h>
>>  #include <cxl/event.h>
>>  #include <cxlmem.h>
>> +#include <cxlpci.h>
>>  #include "trace.h"
>>  
>>  static void cxl_cper_trace_corr_port_prot_err(struct pci_dev *pdev,
>> @@ -107,13 +108,41 @@ static void cxl_cper_prot_err_work_fn(struct work_struct *work)
>>  }
>>  static DECLARE_WORK(cxl_cper_prot_err_work, cxl_cper_prot_err_work_fn);
>>  
>> +#ifdef CONFIG_PCIEAER_CXL
>> +
>> +static void cxl_prot_err_work_fn(struct work_struct *work)
>> +{
>> +}
>> +
>> +#else
>> +static void cxl_prot_err_work_fn(struct work_struct *work) { }
>> +#endif /* CONFIG_PCIEAER_CXL */
> I wonder instead of the ifdef block we can just do:
>
> static void cxl_prot_err_work_fn(...)
> {
> 	if (!IS_ENABLED(CONFIG_PCIEAER_CXL))
> 		return;
>
> 	....
> }
I have a TODO request from Jonathan Cameron in the previous series iteration to address the
same #ifdef cleanup. Jonathan recommended introducing drivers/cxl/core/aer.c and moving the
CXL related AER logic to the new file. Are you OK with that solution?

> In general we want to avoid ifdefs in C files. 
>
> Also, where is CONFIG_PCIEAER_CXL defined? I'm having trouble finding the Kconfig that declares it.
>
> $ git grep CONFIG_PCIEAER_CXL
> drivers/cxl/core/pci.c:#ifdef CONFIG_PCIEAER_CXL
> drivers/cxl/core/ras.c:#ifdef CONFIG_PCIEAER_CXL
> drivers/cxl/core/ras.c:#endif /* CONFIG_PCIEAER_CXL */
> drivers/cxl/cxl.h:#ifdef CONFIG_PCIEAER_CXL
> drivers/cxl/port.c:#ifdef CONFIG_PCIEAER_CXL
> drivers/cxl/port.c:#endif /* CONFIG_PCIEAER_CXL */
> drivers/pci/pcie/aer.c:#if defined(CONFIG_PCIEAER_CXL)
> drivers/pci/pcie/aer.c:#ifdef CONFIG_PCIEAER_CXL
> drivers/pci/pcie/aer.c:#if defined(CONFIG_PCIEAER_CXL)
> include/linux/aer.h:#if defined(CONFIG_PCIEAER_CXL)
>
CONFIG_PCIEAER_CXL is a Kconfig dependent on CONFIG_PCIEAER. When enabled the 
#define is found in include/generated/autoconf.h

>> +
>> +static struct work_struct cxl_prot_err_work;
>> +static DECLARE_WORK(cxl_prot_err_work, cxl_prot_err_work_fn);
>> +
>>  int cxl_ras_init(void)
>>  {
>> -	return cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
>> +	int rc;
>> +
>> +	rc = cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
>> +	if (rc)
>> +		pr_err("Failed to register CPER AER kfifo (%x)", rc);
>> +
>> +	rc = cxl_register_prot_err_work(&cxl_prot_err_work);
>> +	if (rc) {
>> +		pr_err("Failed to register native AER kfifo (%x)", rc);
>> +		return rc;
>> +	}
>> +
>> +	return 0;
>>  }
>>  
>>  void cxl_ras_exit(void)
>>  {
>>  	cxl_cper_unregister_prot_err_work(&cxl_cper_prot_err_work);
>>  	cancel_work_sync(&cxl_cper_prot_err_work);
>> +
>> +	cxl_unregister_prot_err_work();
>> +	cancel_work_sync(&cxl_prot_err_work);
>>  }
>> diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
>> index 54e219b0049e..6f1396ef7b77 100644
>> --- a/drivers/cxl/cxlpci.h
>> +++ b/drivers/cxl/cxlpci.h
>> @@ -4,6 +4,7 @@
>>  #define __CXL_PCI_H__
>>  #include <linux/pci.h>
>>  #include "cxl.h"
>> +#include "linux/aer.h"
>>  
>>  #define CXL_MEMORY_PROGIF	0x10
>>  
>> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
>> index adb4b1123b9b..5350fa5be784 100644
>> --- a/drivers/pci/pcie/aer.c
>> +++ b/drivers/pci/pcie/aer.c
>> @@ -114,6 +114,14 @@ struct aer_stats {
>>  static int pcie_aer_disable;
>>  static pci_ers_result_t aer_root_reset(struct pci_dev *dev);
>>  
>> +#if defined(CONFIG_PCIEAER_CXL)
> Would it make sense to move all the CXL bits to a cxl_aer.c instead of all the ifdefs in this C file?
>
> DJ

Yes, this is a good idea. I'll make the AER driver related change to separate the CXL logic.

Terry

>> +#define CXL_ERROR_SOURCES_MAX          128
>> +static DEFINE_KFIFO(cxl_prot_err_fifo, struct cxl_prot_err_work_data,
>> +		    CXL_ERROR_SOURCES_MAX);
>> +static DEFINE_SPINLOCK(cxl_prot_err_fifo_lock);
>> +struct work_struct *cxl_prot_err_work;
>> +#endif
>> +
>>  void pci_no_aer(void)
>>  {
>>  	pcie_aer_disable = 1;
>> @@ -1004,45 +1012,17 @@ static bool is_internal_error(struct aer_err_info *info)
>>  	return info->status & PCI_ERR_UNC_INTN;
>>  }
>>  
>> -static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data)
>> +static bool is_cxl_error(struct pci_dev *pdev, struct aer_err_info *info)
>>  {
>> -	struct aer_err_info *info = (struct aer_err_info *)data;
>> -	const struct pci_error_handlers *err_handler;
>> +	if (!info || !info->is_cxl)
>> +		return false;
>>  
>> -	if (!is_cxl_mem_dev(dev) || !cxl_error_is_native(dev))
>> -		return 0;
>> +	/* Only CXL Endpoints are currently supported */
>> +	if ((pci_pcie_type(pdev) != PCI_EXP_TYPE_ENDPOINT) &&
>> +	    (pci_pcie_type(pdev) != PCI_EXP_TYPE_RC_EC))
>> +		return false;
>>  
>> -	/* Protect dev->driver */
>> -	device_lock(&dev->dev);
>> -
>> -	err_handler = dev->driver ? dev->driver->err_handler : NULL;
>> -	if (!err_handler)
>> -		goto out;
>> -
>> -	if (info->severity == AER_CORRECTABLE) {
>> -		if (err_handler->cor_error_detected)
>> -			err_handler->cor_error_detected(dev);
>> -	} else if (err_handler->error_detected) {
>> -		if (info->severity == AER_NONFATAL)
>> -			err_handler->error_detected(dev, pci_channel_io_normal);
>> -		else if (info->severity == AER_FATAL)
>> -			err_handler->error_detected(dev, pci_channel_io_frozen);
>> -	}
>> -out:
>> -	device_unlock(&dev->dev);
>> -	return 0;
>> -}
>> -
>> -static void cxl_rch_handle_error(struct pci_dev *dev, struct aer_err_info *info)
>> -{
>> -	/*
>> -	 * Internal errors of an RCEC indicate an AER error in an
>> -	 * RCH's downstream port. Check and handle them in the CXL.mem
>> -	 * device driver.
>> -	 */
>> -	if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC &&
>> -	    is_internal_error(info))
>> -		pcie_walk_rcec(dev, cxl_rch_handle_error_iter, info);
>> +	return is_internal_error(info);
>>  }
>>  
>>  static int handles_cxl_error_iter(struct pci_dev *dev, void *data)
>> @@ -1056,13 +1036,17 @@ static int handles_cxl_error_iter(struct pci_dev *dev, void *data)
>>  	return *handles_cxl;
>>  }
>>  
>> -static bool handles_cxl_errors(struct pci_dev *rcec)
>> +static bool handles_cxl_errors(struct pci_dev *dev)
>>  {
>>  	bool handles_cxl = false;
>>  
>> -	if (pci_pcie_type(rcec) == PCI_EXP_TYPE_RC_EC &&
>> -	    pcie_aer_is_native(rcec))
>> -		pcie_walk_rcec(rcec, handles_cxl_error_iter, &handles_cxl);
>> +	if (!pcie_aer_is_native(dev))
>> +		return false;
>> +
>> +	if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC)
>> +		pcie_walk_rcec(dev, handles_cxl_error_iter, &handles_cxl);
>> +	else
>> +		handles_cxl = pcie_is_cxl(dev);
>>  
>>  	return handles_cxl;
>>  }
>> @@ -1076,10 +1060,46 @@ static void cxl_rch_enable_rcec(struct pci_dev *rcec)
>>  	pci_info(rcec, "CXL: Internal errors unmasked");
>>  }
>>  
>> +static int cxl_create_prot_error_info(struct pci_dev *pdev,
>> +				      struct aer_err_info *aer_err_info,
>> +				      struct cxl_prot_error_info *cxl_err_info)
>> +{
>> +	cxl_err_info->severity = aer_err_info->severity;
>> +
>> +	cxl_err_info->function = PCI_FUNC(pdev->devfn);
>> +	cxl_err_info->device = PCI_SLOT(pdev->devfn);
>> +	cxl_err_info->bus = pdev->bus->number;
>> +	cxl_err_info->segment = pci_domain_nr(pdev->bus);
>> +
>> +	return 0;
>> +}
>> +
>> +static void forward_cxl_error(struct pci_dev *pdev, struct aer_err_info *aer_err_info)
>> +{
>> +	struct cxl_prot_err_work_data wd;
>> +	struct cxl_prot_error_info *cxl_err_info = &wd.err_info;
>> +
>> +	cxl_create_prot_error_info(pdev, aer_err_info, cxl_err_info);
>> +
>> +	if (!kfifo_put(&cxl_prot_err_fifo, wd)) {
>> +		dev_err_ratelimited(&pdev->dev, "CXL kfifo overflow\n");
>> +		return;
>> +	}
>> +
>> +	schedule_work(cxl_prot_err_work);
>> +}
>> +
>>  #else
>>  static inline void cxl_rch_enable_rcec(struct pci_dev *dev) { }
>>  static inline void cxl_rch_handle_error(struct pci_dev *dev,
>>  					struct aer_err_info *info) { }
>> +static inline void forward_cxl_error(struct pci_dev *dev,
>> +				    struct aer_err_info *info) { }
>> +static inline bool handles_cxl_errors(struct pci_dev *dev)
>> +{
>> +	return false;
>> +}
>> +static bool is_cxl_error(struct pci_dev *pdev, struct aer_err_info *info) { return 0; };
>>  #endif
>>  
>>  /**
>> @@ -1117,8 +1137,11 @@ static void pci_aer_handle_error(struct pci_dev *dev, struct aer_err_info *info)
>>  
>>  static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info)
>>  {
>> -	cxl_rch_handle_error(dev, info);
>> -	pci_aer_handle_error(dev, info);
>> +	if (is_cxl_error(dev, info))
>> +		forward_cxl_error(dev, info);
>> +	else
>> +		pci_aer_handle_error(dev, info);
>> +
>>  	pci_dev_put(dev);
>>  }
>>  
>> @@ -1582,6 +1605,31 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev)
>>  	return rc ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
>>  }
>>  
>> +#if defined(CONFIG_PCIEAER_CXL)
>> +
>> +int cxl_register_prot_err_work(struct work_struct *work)
>> +{
>> +	guard(spinlock)(&cxl_prot_err_fifo_lock);
>> +	cxl_prot_err_work = work;
>> +	return 0;
>> +}
>> +EXPORT_SYMBOL_NS_GPL(cxl_register_prot_err_work, "CXL");
>> +
>> +int cxl_unregister_prot_err_work(void)
>> +{
>> +	guard(spinlock)(&cxl_prot_err_fifo_lock);
>> +	cxl_prot_err_work = NULL;
>> +	return 0;
>> +}
>> +EXPORT_SYMBOL_NS_GPL(cxl_unregister_prot_err_work, "CXL");
>> +
>> +int cxl_prot_err_kfifo_get(struct cxl_prot_err_work_data *wd)
>> +{
>> +	return kfifo_get(&cxl_prot_err_fifo, wd);
>> +}
>> +EXPORT_SYMBOL_NS_GPL(cxl_prot_err_kfifo_get, "CXL");
>> +#endif
>> +
>>  static struct pcie_port_service_driver aerdriver = {
>>  	.name		= "aer",
>>  	.port_type	= PCIE_ANY_PORT,
>> diff --git a/include/linux/aer.h b/include/linux/aer.h
>> index 02940be66324..550407240ab5 100644
>> --- a/include/linux/aer.h
>> +++ b/include/linux/aer.h
>> @@ -10,6 +10,7 @@
>>  
>>  #include <linux/errno.h>
>>  #include <linux/types.h>
>> +#include <linux/workqueue_types.h>
>>  
>>  #define AER_NONFATAL			0
>>  #define AER_FATAL			1
>> @@ -53,6 +54,27 @@ struct aer_capability_regs {
>>  	u16 uncor_err_source;
>>  };
>>  
>> +/**
>> + * struct cxl_prot_err_info - Error information used in CXL error handling
>> + * @severity: AER severity
>> + * @function: Device's PCI function
>> + * @device: Device's PCI device
>> + * @bus: Device's PCI bus
>> + * @segment: Device's PCI segment
>> + */
>> +struct cxl_prot_error_info {
>> +	int severity;
>> +
>> +	u8 function;
>> +	u8 device;
>> +	u8 bus;
>> +	u16 segment;
>> +};
>> +
>> +struct cxl_prot_err_work_data {
>> +	struct cxl_prot_error_info err_info;
>> +};
>> +
>>  #if defined(CONFIG_PCIEAER)
>>  int pci_aer_clear_nonfatal_status(struct pci_dev *dev);
>>  int pcie_aer_is_native(struct pci_dev *dev);
>> @@ -64,6 +86,20 @@ static inline int pci_aer_clear_nonfatal_status(struct pci_dev *dev)
>>  static inline int pcie_aer_is_native(struct pci_dev *dev) { return 0; }
>>  #endif
>>  
>> +#if defined(CONFIG_PCIEAER_CXL)
>> +int cxl_register_prot_err_work(struct work_struct *work);
>> +int cxl_unregister_prot_err_work(void);
>> +int cxl_prot_err_kfifo_get(struct cxl_prot_err_work_data *wd);
>> +#else
>> +static inline int
>> +cxl_register_prot_err_work(struct work_struct *work)
>> +{
>> +	return 0;
>> +}
>> +static inline int cxl_unregister_prot_err_work(void) { return 0; }
>> +static inline int cxl_prot_err_kfifo_get(struct cxl_prot_err_work_data *wd) { return 0; }
>> +#endif
>> +
>>  void pci_print_aer(struct pci_dev *dev, int aer_severity,
>>  		    struct aer_capability_regs *aer);
>>  int cper_severity_to_aer(int cper_severity);
Re: [PATCH v9 03/16] CXL/AER: Introduce kfifo for forwarding CXL errors
Posted by Dave Jiang 6 months, 2 weeks ago

On 6/6/25 7:27 AM, Bowman, Terry wrote:
> 
> 
> On 6/5/2025 7:27 PM, Dave Jiang wrote:
>>
>> On 6/3/25 10:22 AM, Terry Bowman wrote:
>>> CXL error handling will soon be moved from the AER driver into the CXL
>>> driver. This requires a notification mechanism for the AER driver to share
>>> the AER interrupt with the CXL driver. The notification will be used
>>> as an indication for the CXL drivers to handle and log the CXL RAS errors.
>>>
>>> Add a kfifo work queue to be used by the AER driver and CXL driver. The AER
>>> driver will be the sole kfifo producer adding work and the cxl_core will be
>>> the sole kfifo consumer removing work. Add the boilerplate kfifo support.
>>>
>>> Add CXL work queue handler registration functions in the AER driver. Export
>>> the functions allowing CXL driver to access. Implement registration
>>> functions for the CXL driver to assign or clear the work handler function.
>>>
>>> Introduce function cxl_create_prot_err_info() and 'struct cxl_prot_err_info'.
>>> Implement cxl_create_prot_err_info() to populate a 'struct cxl_prot_err_info'
>>> instance with the AER severity and the erring device's PCI SBDF. The SBDF
>>> details will be used to rediscover the erring device after the CXL driver
>>> dequeues the kfifo work. The device rediscovery will be introduced along
>>> with the CXL handling in future patches.
>>>
>>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
>>> ---
>>>  drivers/cxl/core/ras.c |  31 +++++++++-
>>>  drivers/cxl/cxlpci.h   |   1 +
>>>  drivers/pci/pcie/aer.c | 132 ++++++++++++++++++++++++++++-------------
>>>  include/linux/aer.h    |  36 +++++++++++
>>>  4 files changed, 157 insertions(+), 43 deletions(-)
>>>
>>> diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
>>> index 485a831695c7..d35525e79e04 100644
>>> --- a/drivers/cxl/core/ras.c
>>> +++ b/drivers/cxl/core/ras.c
>>> @@ -5,6 +5,7 @@
>>>  #include <linux/aer.h>
>>>  #include <cxl/event.h>
>>>  #include <cxlmem.h>
>>> +#include <cxlpci.h>
>>>  #include "trace.h"
>>>  
>>>  static void cxl_cper_trace_corr_port_prot_err(struct pci_dev *pdev,
>>> @@ -107,13 +108,41 @@ static void cxl_cper_prot_err_work_fn(struct work_struct *work)
>>>  }
>>>  static DECLARE_WORK(cxl_cper_prot_err_work, cxl_cper_prot_err_work_fn);
>>>  
>>> +#ifdef CONFIG_PCIEAER_CXL
>>> +
>>> +static void cxl_prot_err_work_fn(struct work_struct *work)
>>> +{
>>> +}
>>> +
>>> +#else
>>> +static void cxl_prot_err_work_fn(struct work_struct *work) { }
>>> +#endif /* CONFIG_PCIEAER_CXL */
>> I wonder instead of the ifdef block we can just do:
>>
>> static void cxl_prot_err_work_fn(...)
>> {
>> 	if (!IS_ENABLED(CONFIG_PCIEAER_CXL))
>> 		return;
>>
>> 	....
>> }
> I have a TODO request from Jonathan Cameron in the previous series iteration to address the
> same #ifdef cleanup. Jonathan recommended introducing drivers/cxl/core/aer.c and moving the
> CXL related AER logic to the new file. Are you OK with that solution?

Yes that works. Thanks!

DJ

> 
>> In general we want to avoid ifdefs in C files. 
>>
>> Also, where is CONFIG_PCIEAER_CXL defined? I'm having trouble finding the Kconfig that declares it.
>>
>> $ git grep CONFIG_PCIEAER_CXL
>> drivers/cxl/core/pci.c:#ifdef CONFIG_PCIEAER_CXL
>> drivers/cxl/core/ras.c:#ifdef CONFIG_PCIEAER_CXL
>> drivers/cxl/core/ras.c:#endif /* CONFIG_PCIEAER_CXL */
>> drivers/cxl/cxl.h:#ifdef CONFIG_PCIEAER_CXL
>> drivers/cxl/port.c:#ifdef CONFIG_PCIEAER_CXL
>> drivers/cxl/port.c:#endif /* CONFIG_PCIEAER_CXL */
>> drivers/pci/pcie/aer.c:#if defined(CONFIG_PCIEAER_CXL)
>> drivers/pci/pcie/aer.c:#ifdef CONFIG_PCIEAER_CXL
>> drivers/pci/pcie/aer.c:#if defined(CONFIG_PCIEAER_CXL)
>> include/linux/aer.h:#if defined(CONFIG_PCIEAER_CXL)
>>
> CONFIG_PCIEAER_CXL is a Kconfig dependent on CONFIG_PCIEAER. When enabled the 
> #define is found in include/generated/autoconf.h
> 
>>> +
>>> +static struct work_struct cxl_prot_err_work;
>>> +static DECLARE_WORK(cxl_prot_err_work, cxl_prot_err_work_fn);
>>> +
>>>  int cxl_ras_init(void)
>>>  {
>>> -	return cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
>>> +	int rc;
>>> +
>>> +	rc = cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
>>> +	if (rc)
>>> +		pr_err("Failed to register CPER AER kfifo (%x)", rc);
>>> +
>>> +	rc = cxl_register_prot_err_work(&cxl_prot_err_work);
>>> +	if (rc) {
>>> +		pr_err("Failed to register native AER kfifo (%x)", rc);
>>> +		return rc;
>>> +	}
>>> +
>>> +	return 0;
>>>  }
>>>  
>>>  void cxl_ras_exit(void)
>>>  {
>>>  	cxl_cper_unregister_prot_err_work(&cxl_cper_prot_err_work);
>>>  	cancel_work_sync(&cxl_cper_prot_err_work);
>>> +
>>> +	cxl_unregister_prot_err_work();
>>> +	cancel_work_sync(&cxl_prot_err_work);
>>>  }
>>> diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
>>> index 54e219b0049e..6f1396ef7b77 100644
>>> --- a/drivers/cxl/cxlpci.h
>>> +++ b/drivers/cxl/cxlpci.h
>>> @@ -4,6 +4,7 @@
>>>  #define __CXL_PCI_H__
>>>  #include <linux/pci.h>
>>>  #include "cxl.h"
>>> +#include "linux/aer.h"
>>>  
>>>  #define CXL_MEMORY_PROGIF	0x10
>>>  
>>> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
>>> index adb4b1123b9b..5350fa5be784 100644
>>> --- a/drivers/pci/pcie/aer.c
>>> +++ b/drivers/pci/pcie/aer.c
>>> @@ -114,6 +114,14 @@ struct aer_stats {
>>>  static int pcie_aer_disable;
>>>  static pci_ers_result_t aer_root_reset(struct pci_dev *dev);
>>>  
>>> +#if defined(CONFIG_PCIEAER_CXL)
>> Would it make sense to move all the CXL bits to a cxl_aer.c instead of all the ifdefs in this C file?
>>
>> DJ
> 
> Yes, this is a good idea. I'll make the AER driver related change to separate the CXL logic.
> 
> Terry
> 
>>> +#define CXL_ERROR_SOURCES_MAX          128
>>> +static DEFINE_KFIFO(cxl_prot_err_fifo, struct cxl_prot_err_work_data,
>>> +		    CXL_ERROR_SOURCES_MAX);
>>> +static DEFINE_SPINLOCK(cxl_prot_err_fifo_lock);
>>> +struct work_struct *cxl_prot_err_work;
>>> +#endif
>>> +
>>>  void pci_no_aer(void)
>>>  {
>>>  	pcie_aer_disable = 1;
>>> @@ -1004,45 +1012,17 @@ static bool is_internal_error(struct aer_err_info *info)
>>>  	return info->status & PCI_ERR_UNC_INTN;
>>>  }
>>>  
>>> -static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data)
>>> +static bool is_cxl_error(struct pci_dev *pdev, struct aer_err_info *info)
>>>  {
>>> -	struct aer_err_info *info = (struct aer_err_info *)data;
>>> -	const struct pci_error_handlers *err_handler;
>>> +	if (!info || !info->is_cxl)
>>> +		return false;
>>>  
>>> -	if (!is_cxl_mem_dev(dev) || !cxl_error_is_native(dev))
>>> -		return 0;
>>> +	/* Only CXL Endpoints are currently supported */
>>> +	if ((pci_pcie_type(pdev) != PCI_EXP_TYPE_ENDPOINT) &&
>>> +	    (pci_pcie_type(pdev) != PCI_EXP_TYPE_RC_EC))
>>> +		return false;
>>>  
>>> -	/* Protect dev->driver */
>>> -	device_lock(&dev->dev);
>>> -
>>> -	err_handler = dev->driver ? dev->driver->err_handler : NULL;
>>> -	if (!err_handler)
>>> -		goto out;
>>> -
>>> -	if (info->severity == AER_CORRECTABLE) {
>>> -		if (err_handler->cor_error_detected)
>>> -			err_handler->cor_error_detected(dev);
>>> -	} else if (err_handler->error_detected) {
>>> -		if (info->severity == AER_NONFATAL)
>>> -			err_handler->error_detected(dev, pci_channel_io_normal);
>>> -		else if (info->severity == AER_FATAL)
>>> -			err_handler->error_detected(dev, pci_channel_io_frozen);
>>> -	}
>>> -out:
>>> -	device_unlock(&dev->dev);
>>> -	return 0;
>>> -}
>>> -
>>> -static void cxl_rch_handle_error(struct pci_dev *dev, struct aer_err_info *info)
>>> -{
>>> -	/*
>>> -	 * Internal errors of an RCEC indicate an AER error in an
>>> -	 * RCH's downstream port. Check and handle them in the CXL.mem
>>> -	 * device driver.
>>> -	 */
>>> -	if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC &&
>>> -	    is_internal_error(info))
>>> -		pcie_walk_rcec(dev, cxl_rch_handle_error_iter, info);
>>> +	return is_internal_error(info);
>>>  }
>>>  
>>>  static int handles_cxl_error_iter(struct pci_dev *dev, void *data)
>>> @@ -1056,13 +1036,17 @@ static int handles_cxl_error_iter(struct pci_dev *dev, void *data)
>>>  	return *handles_cxl;
>>>  }
>>>  
>>> -static bool handles_cxl_errors(struct pci_dev *rcec)
>>> +static bool handles_cxl_errors(struct pci_dev *dev)
>>>  {
>>>  	bool handles_cxl = false;
>>>  
>>> -	if (pci_pcie_type(rcec) == PCI_EXP_TYPE_RC_EC &&
>>> -	    pcie_aer_is_native(rcec))
>>> -		pcie_walk_rcec(rcec, handles_cxl_error_iter, &handles_cxl);
>>> +	if (!pcie_aer_is_native(dev))
>>> +		return false;
>>> +
>>> +	if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC)
>>> +		pcie_walk_rcec(dev, handles_cxl_error_iter, &handles_cxl);
>>> +	else
>>> +		handles_cxl = pcie_is_cxl(dev);
>>>  
>>>  	return handles_cxl;
>>>  }
>>> @@ -1076,10 +1060,46 @@ static void cxl_rch_enable_rcec(struct pci_dev *rcec)
>>>  	pci_info(rcec, "CXL: Internal errors unmasked");
>>>  }
>>>  
>>> +static int cxl_create_prot_error_info(struct pci_dev *pdev,
>>> +				      struct aer_err_info *aer_err_info,
>>> +				      struct cxl_prot_error_info *cxl_err_info)
>>> +{
>>> +	cxl_err_info->severity = aer_err_info->severity;
>>> +
>>> +	cxl_err_info->function = PCI_FUNC(pdev->devfn);
>>> +	cxl_err_info->device = PCI_SLOT(pdev->devfn);
>>> +	cxl_err_info->bus = pdev->bus->number;
>>> +	cxl_err_info->segment = pci_domain_nr(pdev->bus);
>>> +
>>> +	return 0;
>>> +}
>>> +
>>> +static void forward_cxl_error(struct pci_dev *pdev, struct aer_err_info *aer_err_info)
>>> +{
>>> +	struct cxl_prot_err_work_data wd;
>>> +	struct cxl_prot_error_info *cxl_err_info = &wd.err_info;
>>> +
>>> +	cxl_create_prot_error_info(pdev, aer_err_info, cxl_err_info);
>>> +
>>> +	if (!kfifo_put(&cxl_prot_err_fifo, wd)) {
>>> +		dev_err_ratelimited(&pdev->dev, "CXL kfifo overflow\n");
>>> +		return;
>>> +	}
>>> +
>>> +	schedule_work(cxl_prot_err_work);
>>> +}
>>> +
>>>  #else
>>>  static inline void cxl_rch_enable_rcec(struct pci_dev *dev) { }
>>>  static inline void cxl_rch_handle_error(struct pci_dev *dev,
>>>  					struct aer_err_info *info) { }
>>> +static inline void forward_cxl_error(struct pci_dev *dev,
>>> +				    struct aer_err_info *info) { }
>>> +static inline bool handles_cxl_errors(struct pci_dev *dev)
>>> +{
>>> +	return false;
>>> +}
>>> +static bool is_cxl_error(struct pci_dev *pdev, struct aer_err_info *info) { return 0; };
>>>  #endif
>>>  
>>>  /**
>>> @@ -1117,8 +1137,11 @@ static void pci_aer_handle_error(struct pci_dev *dev, struct aer_err_info *info)
>>>  
>>>  static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info)
>>>  {
>>> -	cxl_rch_handle_error(dev, info);
>>> -	pci_aer_handle_error(dev, info);
>>> +	if (is_cxl_error(dev, info))
>>> +		forward_cxl_error(dev, info);
>>> +	else
>>> +		pci_aer_handle_error(dev, info);
>>> +
>>>  	pci_dev_put(dev);
>>>  }
>>>  
>>> @@ -1582,6 +1605,31 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev)
>>>  	return rc ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
>>>  }
>>>  
>>> +#if defined(CONFIG_PCIEAER_CXL)
>>> +
>>> +int cxl_register_prot_err_work(struct work_struct *work)
>>> +{
>>> +	guard(spinlock)(&cxl_prot_err_fifo_lock);
>>> +	cxl_prot_err_work = work;
>>> +	return 0;
>>> +}
>>> +EXPORT_SYMBOL_NS_GPL(cxl_register_prot_err_work, "CXL");
>>> +
>>> +int cxl_unregister_prot_err_work(void)
>>> +{
>>> +	guard(spinlock)(&cxl_prot_err_fifo_lock);
>>> +	cxl_prot_err_work = NULL;
>>> +	return 0;
>>> +}
>>> +EXPORT_SYMBOL_NS_GPL(cxl_unregister_prot_err_work, "CXL");
>>> +
>>> +int cxl_prot_err_kfifo_get(struct cxl_prot_err_work_data *wd)
>>> +{
>>> +	return kfifo_get(&cxl_prot_err_fifo, wd);
>>> +}
>>> +EXPORT_SYMBOL_NS_GPL(cxl_prot_err_kfifo_get, "CXL");
>>> +#endif
>>> +
>>>  static struct pcie_port_service_driver aerdriver = {
>>>  	.name		= "aer",
>>>  	.port_type	= PCIE_ANY_PORT,
>>> diff --git a/include/linux/aer.h b/include/linux/aer.h
>>> index 02940be66324..550407240ab5 100644
>>> --- a/include/linux/aer.h
>>> +++ b/include/linux/aer.h
>>> @@ -10,6 +10,7 @@
>>>  
>>>  #include <linux/errno.h>
>>>  #include <linux/types.h>
>>> +#include <linux/workqueue_types.h>
>>>  
>>>  #define AER_NONFATAL			0
>>>  #define AER_FATAL			1
>>> @@ -53,6 +54,27 @@ struct aer_capability_regs {
>>>  	u16 uncor_err_source;
>>>  };
>>>  
>>> +/**
>>> + * struct cxl_prot_err_info - Error information used in CXL error handling
>>> + * @severity: AER severity
>>> + * @function: Device's PCI function
>>> + * @device: Device's PCI device
>>> + * @bus: Device's PCI bus
>>> + * @segment: Device's PCI segment
>>> + */
>>> +struct cxl_prot_error_info {
>>> +	int severity;
>>> +
>>> +	u8 function;
>>> +	u8 device;
>>> +	u8 bus;
>>> +	u16 segment;
>>> +};
>>> +
>>> +struct cxl_prot_err_work_data {
>>> +	struct cxl_prot_error_info err_info;
>>> +};
>>> +
>>>  #if defined(CONFIG_PCIEAER)
>>>  int pci_aer_clear_nonfatal_status(struct pci_dev *dev);
>>>  int pcie_aer_is_native(struct pci_dev *dev);
>>> @@ -64,6 +86,20 @@ static inline int pci_aer_clear_nonfatal_status(struct pci_dev *dev)
>>>  static inline int pcie_aer_is_native(struct pci_dev *dev) { return 0; }
>>>  #endif
>>>  
>>> +#if defined(CONFIG_PCIEAER_CXL)
>>> +int cxl_register_prot_err_work(struct work_struct *work);
>>> +int cxl_unregister_prot_err_work(void);
>>> +int cxl_prot_err_kfifo_get(struct cxl_prot_err_work_data *wd);
>>> +#else
>>> +static inline int
>>> +cxl_register_prot_err_work(struct work_struct *work)
>>> +{
>>> +	return 0;
>>> +}
>>> +static inline int cxl_unregister_prot_err_work(void) { return 0; }
>>> +static inline int cxl_prot_err_kfifo_get(struct cxl_prot_err_work_data *wd) { return 0; }
>>> +#endif
>>> +
>>>  void pci_print_aer(struct pci_dev *dev, int aer_severity,
>>>  		    struct aer_capability_regs *aer);
>>>  int cper_severity_to_aer(int cper_severity);
>
Re: [PATCH v9 03/16] CXL/AER: Introduce kfifo for forwarding CXL errors
Posted by Sathyanarayanan Kuppuswamy 6 months, 2 weeks ago
On 6/3/25 10:22 AM, Terry Bowman wrote:
> CXL error handling will soon be moved from the AER driver into the CXL
> driver. This requires a notification mechanism for the AER driver to share
> the AER interrupt with the CXL driver. The notification will be used
> as an indication for the CXL drivers to handle and log the CXL RAS errors.
>
> Add a kfifo work queue to be used by the AER driver and CXL driver. The AER
> driver will be the sole kfifo producer adding work and the cxl_core will be
> the sole kfifo consumer removing work. Add the boilerplate kfifo support.
>
> Add CXL work queue handler registration functions in the AER driver. Export
> the functions allowing CXL driver to access. Implement registration
> functions for the CXL driver to assign or clear the work handler function.
>
> Introduce function cxl_create_prot_err_info() and 'struct cxl_prot_err_info'.
> Implement cxl_create_prot_err_info() to populate a 'struct cxl_prot_err_info'
> instance with the AER severity and the erring device's PCI SBDF. The SBDF
> details will be used to rediscover the erring device after the CXL driver
> dequeues the kfifo work. The device rediscovery will be introduced along
> with the CXL handling in future patches.
>
> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> ---
>   drivers/cxl/core/ras.c |  31 +++++++++-
>   drivers/cxl/cxlpci.h   |   1 +
>   drivers/pci/pcie/aer.c | 132 ++++++++++++++++++++++++++++-------------
>   include/linux/aer.h    |  36 +++++++++++
>   4 files changed, 157 insertions(+), 43 deletions(-)
>
> diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
> index 485a831695c7..d35525e79e04 100644
> --- a/drivers/cxl/core/ras.c
> +++ b/drivers/cxl/core/ras.c
> @@ -5,6 +5,7 @@
>   #include <linux/aer.h>
>   #include <cxl/event.h>
>   #include <cxlmem.h>
> +#include <cxlpci.h>
>   #include "trace.h"
>   
>   static void cxl_cper_trace_corr_port_prot_err(struct pci_dev *pdev,
> @@ -107,13 +108,41 @@ static void cxl_cper_prot_err_work_fn(struct work_struct *work)
>   }
>   static DECLARE_WORK(cxl_cper_prot_err_work, cxl_cper_prot_err_work_fn);
>   
> +#ifdef CONFIG_PCIEAER_CXL
> +
> +static void cxl_prot_err_work_fn(struct work_struct *work)
> +{
> +}
> +
> +#else
> +static void cxl_prot_err_work_fn(struct work_struct *work) { }
> +#endif /* CONFIG_PCIEAER_CXL */
> +
> +static struct work_struct cxl_prot_err_work;
> +static DECLARE_WORK(cxl_prot_err_work, cxl_prot_err_work_fn);
> +
>   int cxl_ras_init(void)
>   {
> -	return cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
> +	int rc;
> +
> +	rc = cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
> +	if (rc)
> +		pr_err("Failed to register CPER AER kfifo (%x)", rc);
> +
> +	rc = cxl_register_prot_err_work(&cxl_prot_err_work);
> +	if (rc) {
> +		pr_err("Failed to register native AER kfifo (%x)", rc);
> +		return rc;
> +	}
> +
> +	return 0;
>   }
>   
>   void cxl_ras_exit(void)
>   {
>   	cxl_cper_unregister_prot_err_work(&cxl_cper_prot_err_work);
>   	cancel_work_sync(&cxl_cper_prot_err_work);
> +
> +	cxl_unregister_prot_err_work();
> +	cancel_work_sync(&cxl_prot_err_work);
>   }
> diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
> index 54e219b0049e..6f1396ef7b77 100644
> --- a/drivers/cxl/cxlpci.h
> +++ b/drivers/cxl/cxlpci.h
> @@ -4,6 +4,7 @@
>   #define __CXL_PCI_H__
>   #include <linux/pci.h>
>   #include "cxl.h"
> +#include "linux/aer.h"
>   
>   #define CXL_MEMORY_PROGIF	0x10
>   
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index adb4b1123b9b..5350fa5be784 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -114,6 +114,14 @@ struct aer_stats {
>   static int pcie_aer_disable;
>   static pci_ers_result_t aer_root_reset(struct pci_dev *dev);
>   
> +#if defined(CONFIG_PCIEAER_CXL)
> +#define CXL_ERROR_SOURCES_MAX          128
> +static DEFINE_KFIFO(cxl_prot_err_fifo, struct cxl_prot_err_work_data,
> +		    CXL_ERROR_SOURCES_MAX);
> +static DEFINE_SPINLOCK(cxl_prot_err_fifo_lock);
> +struct work_struct *cxl_prot_err_work;
> +#endif
> +
>   void pci_no_aer(void)
>   {
>   	pcie_aer_disable = 1;
> @@ -1004,45 +1012,17 @@ static bool is_internal_error(struct aer_err_info *info)
>   	return info->status & PCI_ERR_UNC_INTN;
>   }
>   
> -static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data)
> +static bool is_cxl_error(struct pci_dev *pdev, struct aer_err_info *info)
>   {
> -	struct aer_err_info *info = (struct aer_err_info *)data;
> -	const struct pci_error_handlers *err_handler;
> +	if (!info || !info->is_cxl)
> +		return false;
>   
> -	if (!is_cxl_mem_dev(dev) || !cxl_error_is_native(dev))
> -		return 0;
> +	/* Only CXL Endpoints are currently supported */
> +	if ((pci_pcie_type(pdev) != PCI_EXP_TYPE_ENDPOINT) &&
> +	    (pci_pcie_type(pdev) != PCI_EXP_TYPE_RC_EC))
> +		return false;
>   
> -	/* Protect dev->driver */
> -	device_lock(&dev->dev);
> -
> -	err_handler = dev->driver ? dev->driver->err_handler : NULL;
> -	if (!err_handler)
> -		goto out;
> -
> -	if (info->severity == AER_CORRECTABLE) {
> -		if (err_handler->cor_error_detected)
> -			err_handler->cor_error_detected(dev);
> -	} else if (err_handler->error_detected) {
> -		if (info->severity == AER_NONFATAL)
> -			err_handler->error_detected(dev, pci_channel_io_normal);
> -		else if (info->severity == AER_FATAL)
> -			err_handler->error_detected(dev, pci_channel_io_frozen);
> -	}
> -out:
> -	device_unlock(&dev->dev);
> -	return 0;
> -}
> -
> -static void cxl_rch_handle_error(struct pci_dev *dev, struct aer_err_info *info)
> -{
> -	/*
> -	 * Internal errors of an RCEC indicate an AER error in an
> -	 * RCH's downstream port. Check and handle them in the CXL.mem
> -	 * device driver.
> -	 */
> -	if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC &&
> -	    is_internal_error(info))
> -		pcie_walk_rcec(dev, cxl_rch_handle_error_iter, info);
> +	return is_internal_error(info);
>   }
>   
>   static int handles_cxl_error_iter(struct pci_dev *dev, void *data)
> @@ -1056,13 +1036,17 @@ static int handles_cxl_error_iter(struct pci_dev *dev, void *data)
>   	return *handles_cxl;
>   }
>   
> -static bool handles_cxl_errors(struct pci_dev *rcec)
> +static bool handles_cxl_errors(struct pci_dev *dev)
>   {
>   	bool handles_cxl = false;
>   
> -	if (pci_pcie_type(rcec) == PCI_EXP_TYPE_RC_EC &&
> -	    pcie_aer_is_native(rcec))
> -		pcie_walk_rcec(rcec, handles_cxl_error_iter, &handles_cxl);
> +	if (!pcie_aer_is_native(dev))
> +		return false;
> +
> +	if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC)
> +		pcie_walk_rcec(dev, handles_cxl_error_iter, &handles_cxl);
> +	else
> +		handles_cxl = pcie_is_cxl(dev);
>   
>   	return handles_cxl;
>   }
> @@ -1076,10 +1060,46 @@ static void cxl_rch_enable_rcec(struct pci_dev *rcec)
>   	pci_info(rcec, "CXL: Internal errors unmasked");
>   }
>   
> +static int cxl_create_prot_error_info(struct pci_dev *pdev,
> +				      struct aer_err_info *aer_err_info,
> +				      struct cxl_prot_error_info *cxl_err_info)
> +{
> +	cxl_err_info->severity = aer_err_info->severity;
> +
> +	cxl_err_info->function = PCI_FUNC(pdev->devfn);
> +	cxl_err_info->device = PCI_SLOT(pdev->devfn);
> +	cxl_err_info->bus = pdev->bus->number;
> +	cxl_err_info->segment = pci_domain_nr(pdev->bus);
> +
> +	return 0;
> +}
> +
> +static void forward_cxl_error(struct pci_dev *pdev, struct aer_err_info *aer_err_info)
> +{
> +	struct cxl_prot_err_work_data wd;
> +	struct cxl_prot_error_info *cxl_err_info = &wd.err_info;
> +
> +	cxl_create_prot_error_info(pdev, aer_err_info, cxl_err_info);
> +
> +	if (!kfifo_put(&cxl_prot_err_fifo, wd)) {
> +		dev_err_ratelimited(&pdev->dev, "CXL kfifo overflow\n");
> +		return;
> +	}
> +
> +	schedule_work(cxl_prot_err_work);
> +}
> +
>   #else
>   static inline void cxl_rch_enable_rcec(struct pci_dev *dev) { }
>   static inline void cxl_rch_handle_error(struct pci_dev *dev,
>   					struct aer_err_info *info) { }
> +static inline void forward_cxl_error(struct pci_dev *dev,
> +				    struct aer_err_info *info) { }
> +static inline bool handles_cxl_errors(struct pci_dev *dev)
> +{
> +	return false;
> +}
> +static bool is_cxl_error(struct pci_dev *pdev, struct aer_err_info *info) { return 0; };
>   #endif
>   
>   /**
> @@ -1117,8 +1137,11 @@ static void pci_aer_handle_error(struct pci_dev *dev, struct aer_err_info *info)
>   
>   static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info)
>   {
> -	cxl_rch_handle_error(dev, info);
> -	pci_aer_handle_error(dev, info);
> +	if (is_cxl_error(dev, info))
> +		forward_cxl_error(dev, info);
> +	else
> +		pci_aer_handle_error(dev, info);
> +
>   	pci_dev_put(dev);
>   }
>   
> @@ -1582,6 +1605,31 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev)
>   	return rc ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
>   }
>   
> +#if defined(CONFIG_PCIEAER_CXL)
> +
> +int cxl_register_prot_err_work(struct work_struct *work)
> +{
> +	guard(spinlock)(&cxl_prot_err_fifo_lock);
> +	cxl_prot_err_work = work;
> +	return 0;
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_register_prot_err_work, "CXL");
> +
> +int cxl_unregister_prot_err_work(void)
> +{
> +	guard(spinlock)(&cxl_prot_err_fifo_lock);
> +	cxl_prot_err_work = NULL;
> +	return 0;
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_unregister_prot_err_work, "CXL");
> +

Above two functions can never fail, right? What not make them return void?

> +int cxl_prot_err_kfifo_get(struct cxl_prot_err_work_data *wd)
> +{
> +	return kfifo_get(&cxl_prot_err_fifo, wd);
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_prot_err_kfifo_get, "CXL");
> +#endif
> +
>   static struct pcie_port_service_driver aerdriver = {
>   	.name		= "aer",
>   	.port_type	= PCIE_ANY_PORT,
> diff --git a/include/linux/aer.h b/include/linux/aer.h
> index 02940be66324..550407240ab5 100644
> --- a/include/linux/aer.h
> +++ b/include/linux/aer.h
> @@ -10,6 +10,7 @@
>   
>   #include <linux/errno.h>
>   #include <linux/types.h>
> +#include <linux/workqueue_types.h>
>   
>   #define AER_NONFATAL			0
>   #define AER_FATAL			1
> @@ -53,6 +54,27 @@ struct aer_capability_regs {
>   	u16 uncor_err_source;
>   };
>   
> +/**
> + * struct cxl_prot_err_info - Error information used in CXL error handling
> + * @severity: AER severity
> + * @function: Device's PCI function
> + * @device: Device's PCI device
> + * @bus: Device's PCI bus
> + * @segment: Device's PCI segment
> + */
> +struct cxl_prot_error_info {
> +	int severity;
> +
> +	u8 function;
> +	u8 device;
> +	u8 bus;
> +	u16 segment;
> +};
> +
> +struct cxl_prot_err_work_data {
> +	struct cxl_prot_error_info err_info;
> +};
> +
>   #if defined(CONFIG_PCIEAER)
>   int pci_aer_clear_nonfatal_status(struct pci_dev *dev);
>   int pcie_aer_is_native(struct pci_dev *dev);
> @@ -64,6 +86,20 @@ static inline int pci_aer_clear_nonfatal_status(struct pci_dev *dev)
>   static inline int pcie_aer_is_native(struct pci_dev *dev) { return 0; }
>   #endif
>   
> +#if defined(CONFIG_PCIEAER_CXL)
> +int cxl_register_prot_err_work(struct work_struct *work);
> +int cxl_unregister_prot_err_work(void);
> +int cxl_prot_err_kfifo_get(struct cxl_prot_err_work_data *wd);
> +#else
> +static inline int
> +cxl_register_prot_err_work(struct work_struct *work)
> +{
> +	return 0;
> +}
> +static inline int cxl_unregister_prot_err_work(void) { return 0; }
> +static inline int cxl_prot_err_kfifo_get(struct cxl_prot_err_work_data *wd) { return 0; }
> +#endif
> +
>   void pci_print_aer(struct pci_dev *dev, int aer_severity,
>   		    struct aer_capability_regs *aer);
>   int cper_severity_to_aer(int cper_severity);

-- 
Sathyanarayanan Kuppuswamy
Linux Kernel Developer
Re: [PATCH v9 03/16] CXL/AER: Introduce kfifo for forwarding CXL errors
Posted by Bowman, Terry 6 months, 2 weeks ago

On 6/4/2025 5:50 PM, Sathyanarayanan Kuppuswamy wrote:
> On 6/3/25 10:22 AM, Terry Bowman wrote:
>> CXL error handling will soon be moved from the AER driver into the CXL
>> driver. This requires a notification mechanism for the AER driver to share
>> the AER interrupt with the CXL driver. The notification will be used
>> as an indication for the CXL drivers to handle and log the CXL RAS errors.
>>
>> Add a kfifo work queue to be used by the AER driver and CXL driver. The AER
>> driver will be the sole kfifo producer adding work and the cxl_core will be
>> the sole kfifo consumer removing work. Add the boilerplate kfifo support.
>>
>> Add CXL work queue handler registration functions in the AER driver. Export
>> the functions allowing CXL driver to access. Implement registration
>> functions for the CXL driver to assign or clear the work handler function.
>>
>> Introduce function cxl_create_prot_err_info() and 'struct cxl_prot_err_info'.
>> Implement cxl_create_prot_err_info() to populate a 'struct cxl_prot_err_info'
>> instance with the AER severity and the erring device's PCI SBDF. The SBDF
>> details will be used to rediscover the erring device after the CXL driver
>> dequeues the kfifo work. The device rediscovery will be introduced along
>> with the CXL handling in future patches.
>>
>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
>> ---
>>   drivers/cxl/core/ras.c |  31 +++++++++-
>>   drivers/cxl/cxlpci.h   |   1 +
>>   drivers/pci/pcie/aer.c | 132 ++++++++++++++++++++++++++++-------------
>>   include/linux/aer.h    |  36 +++++++++++
>>   4 files changed, 157 insertions(+), 43 deletions(-)
>>
>> diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
>> index 485a831695c7..d35525e79e04 100644
>> --- a/drivers/cxl/core/ras.c
>> +++ b/drivers/cxl/core/ras.c
>> @@ -5,6 +5,7 @@
>>   #include <linux/aer.h>
>>   #include <cxl/event.h>
>>   #include <cxlmem.h>
>> +#include <cxlpci.h>
>>   #include "trace.h"
>>   
>>   static void cxl_cper_trace_corr_port_prot_err(struct pci_dev *pdev,
>> @@ -107,13 +108,41 @@ static void cxl_cper_prot_err_work_fn(struct work_struct *work)
>>   }
>>   static DECLARE_WORK(cxl_cper_prot_err_work, cxl_cper_prot_err_work_fn);
>>   
>> +#ifdef CONFIG_PCIEAER_CXL
>> +
>> +static void cxl_prot_err_work_fn(struct work_struct *work)
>> +{
>> +}
>> +
>> +#else
>> +static void cxl_prot_err_work_fn(struct work_struct *work) { }
>> +#endif /* CONFIG_PCIEAER_CXL */
>> +
>> +static struct work_struct cxl_prot_err_work;
>> +static DECLARE_WORK(cxl_prot_err_work, cxl_prot_err_work_fn);
>> +
>>   int cxl_ras_init(void)
>>   {
>> -	return cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
>> +	int rc;
>> +
>> +	rc = cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
>> +	if (rc)
>> +		pr_err("Failed to register CPER AER kfifo (%x)", rc);
>> +
>> +	rc = cxl_register_prot_err_work(&cxl_prot_err_work);
>> +	if (rc) {
>> +		pr_err("Failed to register native AER kfifo (%x)", rc);
>> +		return rc;
>> +	}
>> +
>> +	return 0;
>>   }
>>   
>>   void cxl_ras_exit(void)
>>   {
>>   	cxl_cper_unregister_prot_err_work(&cxl_cper_prot_err_work);
>>   	cancel_work_sync(&cxl_cper_prot_err_work);
>> +
>> +	cxl_unregister_prot_err_work();
>> +	cancel_work_sync(&cxl_prot_err_work);
>>   }
>> diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
>> index 54e219b0049e..6f1396ef7b77 100644
>> --- a/drivers/cxl/cxlpci.h
>> +++ b/drivers/cxl/cxlpci.h
>> @@ -4,6 +4,7 @@
>>   #define __CXL_PCI_H__
>>   #include <linux/pci.h>
>>   #include "cxl.h"
>> +#include "linux/aer.h"
>>   
>>   #define CXL_MEMORY_PROGIF	0x10
>>   
>> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
>> index adb4b1123b9b..5350fa5be784 100644
>> --- a/drivers/pci/pcie/aer.c
>> +++ b/drivers/pci/pcie/aer.c
>> @@ -114,6 +114,14 @@ struct aer_stats {
>>   static int pcie_aer_disable;
>>   static pci_ers_result_t aer_root_reset(struct pci_dev *dev);
>>   
>> +#if defined(CONFIG_PCIEAER_CXL)
>> +#define CXL_ERROR_SOURCES_MAX          128
>> +static DEFINE_KFIFO(cxl_prot_err_fifo, struct cxl_prot_err_work_data,
>> +		    CXL_ERROR_SOURCES_MAX);
>> +static DEFINE_SPINLOCK(cxl_prot_err_fifo_lock);
>> +struct work_struct *cxl_prot_err_work;
>> +#endif
>> +
>>   void pci_no_aer(void)
>>   {
>>   	pcie_aer_disable = 1;
>> @@ -1004,45 +1012,17 @@ static bool is_internal_error(struct aer_err_info *info)
>>   	return info->status & PCI_ERR_UNC_INTN;
>>   }
>>   
>> -static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data)
>> +static bool is_cxl_error(struct pci_dev *pdev, struct aer_err_info *info)
>>   {
>> -	struct aer_err_info *info = (struct aer_err_info *)data;
>> -	const struct pci_error_handlers *err_handler;
>> +	if (!info || !info->is_cxl)
>> +		return false;
>>   
>> -	if (!is_cxl_mem_dev(dev) || !cxl_error_is_native(dev))
>> -		return 0;
>> +	/* Only CXL Endpoints are currently supported */
>> +	if ((pci_pcie_type(pdev) != PCI_EXP_TYPE_ENDPOINT) &&
>> +	    (pci_pcie_type(pdev) != PCI_EXP_TYPE_RC_EC))
>> +		return false;
>>   
>> -	/* Protect dev->driver */
>> -	device_lock(&dev->dev);
>> -
>> -	err_handler = dev->driver ? dev->driver->err_handler : NULL;
>> -	if (!err_handler)
>> -		goto out;
>> -
>> -	if (info->severity == AER_CORRECTABLE) {
>> -		if (err_handler->cor_error_detected)
>> -			err_handler->cor_error_detected(dev);
>> -	} else if (err_handler->error_detected) {
>> -		if (info->severity == AER_NONFATAL)
>> -			err_handler->error_detected(dev, pci_channel_io_normal);
>> -		else if (info->severity == AER_FATAL)
>> -			err_handler->error_detected(dev, pci_channel_io_frozen);
>> -	}
>> -out:
>> -	device_unlock(&dev->dev);
>> -	return 0;
>> -}
>> -
>> -static void cxl_rch_handle_error(struct pci_dev *dev, struct aer_err_info *info)
>> -{
>> -	/*
>> -	 * Internal errors of an RCEC indicate an AER error in an
>> -	 * RCH's downstream port. Check and handle them in the CXL.mem
>> -	 * device driver.
>> -	 */
>> -	if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC &&
>> -	    is_internal_error(info))
>> -		pcie_walk_rcec(dev, cxl_rch_handle_error_iter, info);
>> +	return is_internal_error(info);
>>   }
>>   
>>   static int handles_cxl_error_iter(struct pci_dev *dev, void *data)
>> @@ -1056,13 +1036,17 @@ static int handles_cxl_error_iter(struct pci_dev *dev, void *data)
>>   	return *handles_cxl;
>>   }
>>   
>> -static bool handles_cxl_errors(struct pci_dev *rcec)
>> +static bool handles_cxl_errors(struct pci_dev *dev)
>>   {
>>   	bool handles_cxl = false;
>>   
>> -	if (pci_pcie_type(rcec) == PCI_EXP_TYPE_RC_EC &&
>> -	    pcie_aer_is_native(rcec))
>> -		pcie_walk_rcec(rcec, handles_cxl_error_iter, &handles_cxl);
>> +	if (!pcie_aer_is_native(dev))
>> +		return false;
>> +
>> +	if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC)
>> +		pcie_walk_rcec(dev, handles_cxl_error_iter, &handles_cxl);
>> +	else
>> +		handles_cxl = pcie_is_cxl(dev);
>>   
>>   	return handles_cxl;
>>   }
>> @@ -1076,10 +1060,46 @@ static void cxl_rch_enable_rcec(struct pci_dev *rcec)
>>   	pci_info(rcec, "CXL: Internal errors unmasked");
>>   }
>>   
>> +static int cxl_create_prot_error_info(struct pci_dev *pdev,
>> +				      struct aer_err_info *aer_err_info,
>> +				      struct cxl_prot_error_info *cxl_err_info)
>> +{
>> +	cxl_err_info->severity = aer_err_info->severity;
>> +
>> +	cxl_err_info->function = PCI_FUNC(pdev->devfn);
>> +	cxl_err_info->device = PCI_SLOT(pdev->devfn);
>> +	cxl_err_info->bus = pdev->bus->number;
>> +	cxl_err_info->segment = pci_domain_nr(pdev->bus);
>> +
>> +	return 0;
>> +}
>> +
>> +static void forward_cxl_error(struct pci_dev *pdev, struct aer_err_info *aer_err_info)
>> +{
>> +	struct cxl_prot_err_work_data wd;
>> +	struct cxl_prot_error_info *cxl_err_info = &wd.err_info;
>> +
>> +	cxl_create_prot_error_info(pdev, aer_err_info, cxl_err_info);
>> +
>> +	if (!kfifo_put(&cxl_prot_err_fifo, wd)) {
>> +		dev_err_ratelimited(&pdev->dev, "CXL kfifo overflow\n");
>> +		return;
>> +	}
>> +
>> +	schedule_work(cxl_prot_err_work);
>> +}
>> +
>>   #else
>>   static inline void cxl_rch_enable_rcec(struct pci_dev *dev) { }
>>   static inline void cxl_rch_handle_error(struct pci_dev *dev,
>>   					struct aer_err_info *info) { }
>> +static inline void forward_cxl_error(struct pci_dev *dev,
>> +				    struct aer_err_info *info) { }
>> +static inline bool handles_cxl_errors(struct pci_dev *dev)
>> +{
>> +	return false;
>> +}
>> +static bool is_cxl_error(struct pci_dev *pdev, struct aer_err_info *info) { return 0; };
>>   #endif
>>   
>>   /**
>> @@ -1117,8 +1137,11 @@ static void pci_aer_handle_error(struct pci_dev *dev, struct aer_err_info *info)
>>   
>>   static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info)
>>   {
>> -	cxl_rch_handle_error(dev, info);
>> -	pci_aer_handle_error(dev, info);
>> +	if (is_cxl_error(dev, info))
>> +		forward_cxl_error(dev, info);
>> +	else
>> +		pci_aer_handle_error(dev, info);
>> +
>>   	pci_dev_put(dev);
>>   }
>>   
>> @@ -1582,6 +1605,31 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev)
>>   	return rc ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
>>   }
>>   
>> +#if defined(CONFIG_PCIEAER_CXL)
>> +
>> +int cxl_register_prot_err_work(struct work_struct *work)
>> +{
>> +	guard(spinlock)(&cxl_prot_err_fifo_lock);
>> +	cxl_prot_err_work = work;
>> +	return 0;
>> +}
>> +EXPORT_SYMBOL_NS_GPL(cxl_register_prot_err_work, "CXL");
>> +
>> +int cxl_unregister_prot_err_work(void)
>> +{
>> +	guard(spinlock)(&cxl_prot_err_fifo_lock);
>> +	cxl_prot_err_work = NULL;
>> +	return 0;
>> +}
>> +EXPORT_SYMBOL_NS_GPL(cxl_unregister_prot_err_work, "CXL");
>> +
> Above two functions can never fail, right? What not make them return void?

You are correct. Good point. Thanks, I'll make the change.

Terry
>> +int cxl_prot_err_kfifo_get(struct cxl_prot_err_work_data *wd)
>> +{
>> +	return kfifo_get(&cxl_prot_err_fifo, wd);
>> +}
>> +EXPORT_SYMBOL_NS_GPL(cxl_prot_err_kfifo_get, "CXL");
>> +#endif
>> +
>>   static struct pcie_port_service_driver aerdriver = {
>>   	.name		= "aer",
>>   	.port_type	= PCIE_ANY_PORT,
>> diff --git a/include/linux/aer.h b/include/linux/aer.h
>> index 02940be66324..550407240ab5 100644
>> --- a/include/linux/aer.h
>> +++ b/include/linux/aer.h
>> @@ -10,6 +10,7 @@
>>   
>>   #include <linux/errno.h>
>>   #include <linux/types.h>
>> +#include <linux/workqueue_types.h>
>>   
>>   #define AER_NONFATAL			0
>>   #define AER_FATAL			1
>> @@ -53,6 +54,27 @@ struct aer_capability_regs {
>>   	u16 uncor_err_source;
>>   };
>>   
>> +/**
>> + * struct cxl_prot_err_info - Error information used in CXL error handling
>> + * @severity: AER severity
>> + * @function: Device's PCI function
>> + * @device: Device's PCI device
>> + * @bus: Device's PCI bus
>> + * @segment: Device's PCI segment
>> + */
>> +struct cxl_prot_error_info {
>> +	int severity;
>> +
>> +	u8 function;
>> +	u8 device;
>> +	u8 bus;
>> +	u16 segment;
>> +};
>> +
>> +struct cxl_prot_err_work_data {
>> +	struct cxl_prot_error_info err_info;
>> +};
>> +
>>   #if defined(CONFIG_PCIEAER)
>>   int pci_aer_clear_nonfatal_status(struct pci_dev *dev);
>>   int pcie_aer_is_native(struct pci_dev *dev);
>> @@ -64,6 +86,20 @@ static inline int pci_aer_clear_nonfatal_status(struct pci_dev *dev)
>>   static inline int pcie_aer_is_native(struct pci_dev *dev) { return 0; }
>>   #endif
>>   
>> +#if defined(CONFIG_PCIEAER_CXL)
>> +int cxl_register_prot_err_work(struct work_struct *work);
>> +int cxl_unregister_prot_err_work(void);
>> +int cxl_prot_err_kfifo_get(struct cxl_prot_err_work_data *wd);
>> +#else
>> +static inline int
>> +cxl_register_prot_err_work(struct work_struct *work)
>> +{
>> +	return 0;
>> +}
>> +static inline int cxl_unregister_prot_err_work(void) { return 0; }
>> +static inline int cxl_prot_err_kfifo_get(struct cxl_prot_err_work_data *wd) { return 0; }
>> +#endif
>> +
>>   void pci_print_aer(struct pci_dev *dev, int aer_severity,
>>   		    struct aer_capability_regs *aer);
>>   int cper_severity_to_aer(int cper_severity);
Re: [PATCH v9 03/16] CXL/AER: Introduce kfifo for forwarding CXL errors
Posted by Dan Carpenter 6 months, 2 weeks ago
On Tue, Jun 03, 2025 at 12:22:26PM -0500, Terry Bowman wrote:
> +static struct work_struct cxl_prot_err_work;
> +static DECLARE_WORK(cxl_prot_err_work, cxl_prot_err_work_fn);
> +
>  int cxl_ras_init(void)
>  {
> -	return cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
> +	int rc;
> +
> +	rc = cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
> +	if (rc)
> +		pr_err("Failed to register CPER AER kfifo (%x)", rc);

This shouldn't return rc;?

> +
> +	rc = cxl_register_prot_err_work(&cxl_prot_err_work);
> +	if (rc) {
> +		pr_err("Failed to register native AER kfifo (%x)", rc);
> +		return rc;
> +	}
> +
> +	return 0;
>  }

regards,
dan carpenter
Re: [PATCH v9 03/16] CXL/AER: Introduce kfifo for forwarding CXL errors
Posted by Bowman, Terry 6 months, 2 weeks ago

On 6/4/2025 1:01 AM, Dan Carpenter wrote:
> On Tue, Jun 03, 2025 at 12:22:26PM -0500, Terry Bowman wrote:
>> +static struct work_struct cxl_prot_err_work;
>> +static DECLARE_WORK(cxl_prot_err_work, cxl_prot_err_work_fn);
>> +
>>  int cxl_ras_init(void)
>>  {
>> -	return cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
>> +	int rc;
>> +
>> +	rc = cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
>> +	if (rc)
>> +		pr_err("Failed to register CPER AER kfifo (%x)", rc);
> This shouldn't return rc;?

This was implemented to allow for native CXL handling initialization even if
FW-first (CPER) initialization fails. This can be changed to return rc.

Thanks for reviewing dan Carpenter.

-Terry

>
>> +
>> +	rc = cxl_register_prot_err_work(&cxl_prot_err_work);
>> +	if (rc) {
>> +		pr_err("Failed to register native AER kfifo (%x)", rc);
>> +		return rc;
>> +	}
>> +
>> +	return 0;
>>  }
> regards,
> dan carpenter
>
Re: [PATCH v9 03/16] CXL/AER: Introduce kfifo for forwarding CXL errors
Posted by Dan Carpenter 6 months, 2 weeks ago
On Wed, Jun 04, 2025 at 09:37:02AM -0500, Bowman, Terry wrote:
> 
> 
> On 6/4/2025 1:01 AM, Dan Carpenter wrote:
> > On Tue, Jun 03, 2025 at 12:22:26PM -0500, Terry Bowman wrote:
> >> +static struct work_struct cxl_prot_err_work;
> >> +static DECLARE_WORK(cxl_prot_err_work, cxl_prot_err_work_fn);
> >> +
> >>  int cxl_ras_init(void)
> >>  {
> >> -	return cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
> >> +	int rc;
> >> +
> >> +	rc = cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
> >> +	if (rc)
> >> +		pr_err("Failed to register CPER AER kfifo (%x)", rc);
> > This shouldn't return rc;?
> 
> This was implemented to allow for native CXL handling initialization even if
> FW-first (CPER) initialization fails. This can be changed to return rc.

No no.  I'm fine with it either way so long as it's deliberate.  But
maybe add a comment if we can continue.

	rc = cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
	if (rc) {
		pr_err("Failed to register CPER AER kfifo (%x)", rc);
		/* Continuing regardless.  Thanks. */
	}

	rc = cxl_register_prot_err_work(&cxl_prot_err_work);

regards,
dan carpenter
Re: [PATCH v9 03/16] CXL/AER: Introduce kfifo for forwarding CXL errors
Posted by Bowman, Terry 6 months, 2 weeks ago

On 6/4/2025 12:24 PM, Dan Carpenter wrote:
> On Wed, Jun 04, 2025 at 09:37:02AM -0500, Bowman, Terry wrote:
>>
>> On 6/4/2025 1:01 AM, Dan Carpenter wrote:
>>> On Tue, Jun 03, 2025 at 12:22:26PM -0500, Terry Bowman wrote:
>>>> +static struct work_struct cxl_prot_err_work;
>>>> +static DECLARE_WORK(cxl_prot_err_work, cxl_prot_err_work_fn);
>>>> +
>>>>  int cxl_ras_init(void)
>>>>  {
>>>> -	return cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
>>>> +	int rc;
>>>> +
>>>> +	rc = cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
>>>> +	if (rc)
>>>> +		pr_err("Failed to register CPER AER kfifo (%x)", rc);
>>> This shouldn't return rc;?
>> This was implemented to allow for native CXL handling initialization even if
>> FW-first (CPER) initialization fails. This can be changed to return rc.
> No no.  I'm fine with it either way so long as it's deliberate.  But
> maybe add a comment if we can continue.
>
> 	rc = cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
> 	if (rc) {
> 		pr_err("Failed to register CPER AER kfifo (%x)", rc);
> 		/* Continuing regardless.  Thanks. */
> 	}
>
> 	rc = cxl_register_prot_err_work(&cxl_prot_err_work);
>
> regards,
> dan carpenter
>
Good idea. I made the change as you recommended.

Terry