[v8] Enable CXL PCIe port protocol error handling and logging

[PATCH v8 04/16] cxl/aer: AER service driver forwards CXL error to CXL driver

Posted by Terry Bowman 10 months, 2 weeks ago

The AER service driver includes a CXL-specific kfifo, intended to forward
CXL errors to the CXL driver. However, the forwarding functionality is
currently unimplemented. Update the AER driver to enable error forwarding
to the CXL driver.

Modify the AER service driver's handle_error_source(), which is called from
process_aer_err_devices(), to distinguish between PCIe and CXL errors.

Rename and update is_internal_error() to is_cxl_error(). Ensuring it
checks both the 'struct aer_info::is_cxl' flag and the AER internal error
masks.

If the error is a standard PCIe error then continue calling pcie_aer_handle_error()
as done in the current AER driver.

If the error is a CXL-related error then forward it to the CXL driver for
handling using the kfifo mechanism.

Introduce a new function forward_cxl_error(), which constructs a CXL
protocol error context using cxl_create_prot_err_info(). This context is
then passed to the CXL driver via kfifo using a 'struct work_struct'.

Signed-off-by: Terry Bowman <terry.bowman@amd.com>
---
 drivers/pci/pcie/aer.c | 61 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 55 insertions(+), 6 deletions(-)

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 46123b70f496..d1df751cfe4b 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -1010,6 +1010,14 @@ static bool is_internal_error(struct aer_err_info *info)
 	return info->status & PCI_ERR_UNC_INTN;
 }
 
+static bool is_cxl_error(struct aer_err_info *info)
+{
+	if (!info || !info->is_cxl)
+		return false;
+
+	return is_internal_error(info);
+}
+
 static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data)
 {
 	struct aer_err_info *info = (struct aer_err_info *)data;
@@ -1062,13 +1070,17 @@ static int handles_cxl_error_iter(struct pci_dev *dev, void *data)
 	return *handles_cxl;
 }
 
-static bool handles_cxl_errors(struct pci_dev *rcec)
+static bool handles_cxl_errors(struct pci_dev *dev)
 {
 	bool handles_cxl = false;
 
-	if (pci_pcie_type(rcec) == PCI_EXP_TYPE_RC_EC &&
-	    pcie_aer_is_native(rcec))
-		pcie_walk_rcec(rcec, handles_cxl_error_iter, &handles_cxl);
+	if (!pcie_aer_is_native(dev))
+		return false;
+
+	if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC)
+		pcie_walk_rcec(dev, handles_cxl_error_iter, &handles_cxl);
+	else
+		handles_cxl = pcie_is_cxl(dev);
 
 	return handles_cxl;
 }
@@ -1082,10 +1094,44 @@ static void cxl_rch_enable_rcec(struct pci_dev *rcec)
 	pci_info(rcec, "CXL: Internal errors unmasked");
 }
 
+static void forward_cxl_error(struct pci_dev *_pdev, struct aer_err_info *info)
+{
+	int severity = info->severity;
+	struct cxl_prot_err_work_data wd;
+	struct cxl_prot_error_info *err_info = &wd.err_info;
+	struct pci_dev *pdev __free(pci_dev_put) = pci_dev_get(_pdev);
+
+	if (!cxl_create_prot_err_info) {
+		pci_err(pdev, "Failed. CXL-AER interface not initialized.");
+		return;
+	}
+
+	if (cxl_create_prot_err_info(pdev, severity, err_info)) {
+		pci_err(pdev, "Failed to create CXL protocol error information");
+		return;
+	}
+
+	struct device *cxl_dev __free(put_device) = get_device(err_info->dev);
+
+	if (!kfifo_put(&cxl_prot_err_fifo, wd)) {
+		pr_err_ratelimited("CXL kfifo overflow\n");
+		return;
+	}
+
+	schedule_work(cxl_prot_err_work);
+}
+
 #else
 static inline void cxl_rch_enable_rcec(struct pci_dev *dev) { }
 static inline void cxl_rch_handle_error(struct pci_dev *dev,
 					struct aer_err_info *info) { }
+static inline void forward_cxl_error(struct pci_dev *dev,
+				    struct aer_err_info *info) { }
+static inline bool handles_cxl_errors(struct pci_dev *dev)
+{
+	return false;
+}
+static bool is_cxl_error(struct aer_err_info *info) { return 0; };
 #endif
 
 /**
@@ -1123,8 +1169,11 @@ static void pci_aer_handle_error(struct pci_dev *dev, struct aer_err_info *info)
 
 static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info)
 {
-	cxl_rch_handle_error(dev, info);
-	pci_aer_handle_error(dev, info);
+	if (is_cxl_error(info))
+		forward_cxl_error(dev, info);
+	else
+		pci_aer_handle_error(dev, info);
+
 	pci_dev_put(dev);
 }
 
-- 
2.34.1

Re: [PATCH v8 04/16] cxl/aer: AER service driver forwards CXL error to CXL driver

Posted by Gregory Price 9 months, 3 weeks ago

On Wed, Mar 26, 2025 at 08:47:05PM -0500, Terry Bowman wrote:
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index 46123b70f496..d1df751cfe4b 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -1123,8 +1169,11 @@ static void pci_aer_handle_error(struct pci_dev *dev, struct aer_err_info *info)
>  
>  static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info)
>  {
> -	cxl_rch_handle_error(dev, info);
> -	pci_aer_handle_error(dev, info);


This appears to remove that last reference to cxl_rch_handle_error,
build throws a warning saying as such.

I see in patch 5 it's removed, should probably be removed in this patch
instead.

~Gregory

Re: [PATCH v8 04/16] cxl/aer: AER service driver forwards CXL error to CXL driver

Posted by Jonathan Cameron 9 months, 3 weeks ago

On Wed, 26 Mar 2025 20:47:05 -0500
Terry Bowman <terry.bowman@amd.com> wrote:

> The AER service driver includes a CXL-specific kfifo, intended to forward
> CXL errors to the CXL driver. However, the forwarding functionality is
> currently unimplemented. Update the AER driver to enable error forwarding
> to the CXL driver.
> 
> Modify the AER service driver's handle_error_source(), which is called from
> process_aer_err_devices(), to distinguish between PCIe and CXL errors.
> 
> Rename and update is_internal_error() to is_cxl_error(). Ensuring it
> checks both the 'struct aer_info::is_cxl' flag and the AER internal error
> masks.
> 
> If the error is a standard PCIe error then continue calling pcie_aer_handle_error()
> as done in the current AER driver.
> 
> If the error is a CXL-related error then forward it to the CXL driver for
> handling using the kfifo mechanism.
> 
> Introduce a new function forward_cxl_error(), which constructs a CXL
> protocol error context using cxl_create_prot_err_info(). This context is
> then passed to the CXL driver via kfifo using a 'struct work_struct'.
> 
> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
Hi Terry,

Finally got back to this.  I'm not following how some of the reference
counting in here is working.  It might be fine but there is a lot
taking then dropping device references - some of which are taken again later.

> @@ -1082,10 +1094,44 @@ static void cxl_rch_enable_rcec(struct pci_dev *rcec)
>  	pci_info(rcec, "CXL: Internal errors unmasked");
>  }
>  
> +static void forward_cxl_error(struct pci_dev *_pdev, struct aer_err_info *info)
> +{
> +	int severity = info->severity;

So far this variable isn't really justified.  Maybe it makes sense later in the
series?

> +	struct cxl_prot_err_work_data wd;
> +	struct cxl_prot_error_info *err_info = &wd.err_info;

Similarly. Why not just use this directly in the call below?

> +	struct pci_dev *pdev __free(pci_dev_put) = pci_dev_get(_pdev);
Can you talk me through the reference counting?  You take one pci device reference
here.... 
> +
> +	if (!cxl_create_prot_err_info) {
> +		pci_err(pdev, "Failed. CXL-AER interface not initialized.");
> +		return;
> +	}
> +
> +	if (cxl_create_prot_err_info(pdev, severity, err_info)) {

...but the implementation of this also takes once internally.  Can we skip that
internal one and document that it is always take by the caller?

> +		pci_err(pdev, "Failed to create CXL protocol error information");
> +		return;
> +	}
> +
> +	struct device *cxl_dev __free(put_device) = get_device(err_info->dev);

Also this one.  A reference was acquired and dropped in cxl_create_prot_err_info()
followed by retaking it here.  How do we know it is still about by this call
and once we pull it off the kfifo later?

> +
> +	if (!kfifo_put(&cxl_prot_err_fifo, wd)) {
> +		pr_err_ratelimited("CXL kfifo overflow\n");
> +		return;
> +	}
> +
> +	schedule_work(cxl_prot_err_work);
> +}
> +

Thanks,

Jonathan

Re: [PATCH v8 04/16] cxl/aer: AER service driver forwards CXL error to CXL driver

Posted by Bowman, Terry 9 months, 2 weeks ago


On 4/23/2025 10:04 AM, Jonathan Cameron wrote:
> On Wed, 26 Mar 2025 20:47:05 -0500
> Terry Bowman <terry.bowman@amd.com> wrote:
>
>> The AER service driver includes a CXL-specific kfifo, intended to forward
>> CXL errors to the CXL driver. However, the forwarding functionality is
>> currently unimplemented. Update the AER driver to enable error forwarding
>> to the CXL driver.
>>
>> Modify the AER service driver's handle_error_source(), which is called from
>> process_aer_err_devices(), to distinguish between PCIe and CXL errors.
>>
>> Rename and update is_internal_error() to is_cxl_error(). Ensuring it
>> checks both the 'struct aer_info::is_cxl' flag and the AER internal error
>> masks.
>>
>> If the error is a standard PCIe error then continue calling pcie_aer_handle_error()
>> as done in the current AER driver.
>>
>> If the error is a CXL-related error then forward it to the CXL driver for
>> handling using the kfifo mechanism.
>>
>> Introduce a new function forward_cxl_error(), which constructs a CXL
>> protocol error context using cxl_create_prot_err_info(). This context is
>> then passed to the CXL driver via kfifo using a 'struct work_struct'.
>>
>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> Hi Terry,
>
> Finally got back to this.  I'm not following how some of the reference
> counting in here is working.  It might be fine but there is a lot
> taking then dropping device references - some of which are taken again later.
>
>> @@ -1082,10 +1094,44 @@ static void cxl_rch_enable_rcec(struct pci_dev *rcec)
>>  	pci_info(rcec, "CXL: Internal errors unmasked");
>>  }
>>  
>> +static void forward_cxl_error(struct pci_dev *_pdev, struct aer_err_info *info)
>> +{
>> +	int severity = info->severity;
> So far this variable isn't really justified.  Maybe it makes sense later in the
> series?

This is used below in call to cxl_create_prot_err_info().

>> +	struct cxl_prot_err_work_data wd;
>> +	struct cxl_prot_error_info *err_info = &wd.err_info;
> Similarly. Why not just use this directly in the call below?

The reference assignment is made so that err_info can be initialized above and is then
used here to assign and later passed below as part of the work structure.

>> +	struct pci_dev *pdev __free(pci_dev_put) = pci_dev_get(_pdev);
> Can you talk me through the reference counting?  You take one pci device reference
> here.... 

This will add reference count to the PCI device (not the CXL device) to prevent it
from being destroyed until scope exit cleanup.

>> +
>> +	if (!cxl_create_prot_err_info) {
>> +		pci_err(pdev, "Failed. CXL-AER interface not initialized.");
>> +		return;
>> +	}
>> +
>> +	if (cxl_create_prot_err_info(pdev, severity, err_info)) {
> ...but the implementation of this also takes once internally.  Can we skip that
> internal one and document that it is always take by the caller?

Yes, it can. I will have to verify other the 5 or 6 calls do the same before calling.
I was wanting to make the reference incr as early as possible immediately after error
detection and also not forcing callers to do the same setup everywhere beforehand. I
see your point and will consolidate them.

>> +		pci_err(pdev, "Failed to create CXL protocol error information");
>> +		return;
>> +	}
>> +
>> +	struct device *cxl_dev __free(put_device) = get_device(err_info->dev);
> Also this one.  A reference was acquired and dropped in cxl_create_prot_err_info()
> followed by retaking it here.  How do we know it is still about by this call
> and once we pull it off the kfifo later?

Yes, this is a problem I realized after sending the series.

The device reference incr could be changed for all the devices to the non-cleanup
variety. Then would add the reference incr in the caller after calling cxl_create_prot_err_info().
I need to look at the other calls to to cxl_create_prot_err_info() as well.

In addition, I think we should consider adding the CXL RAS status into the struct cxl_prot_err_info.
This would eliminate the need for further accesses to the CXL device after being dequeued from the
fifo. Thoughts?

>> +
>> +	if (!kfifo_put(&cxl_prot_err_fifo, wd)) {
>> +		pr_err_ratelimited("CXL kfifo overflow\n");
>> +		return;
>> +	}
>> +
>> +	schedule_work(cxl_prot_err_work);
>> +}
>> +
> Thanks,
>
> Jonathan
>
Thanks for reviewing Jonathan.

-Terry

Re: [PATCH v8 04/16] cxl/aer: AER service driver forwards CXL error to CXL driver

Posted by Jonathan Cameron 9 months, 2 weeks ago

On Thu, 24 Apr 2025 09:17:45 -0500
"Bowman, Terry" <terry.bowman@amd.com> wrote:

> On 4/23/2025 10:04 AM, Jonathan Cameron wrote:
> > On Wed, 26 Mar 2025 20:47:05 -0500
> > Terry Bowman <terry.bowman@amd.com> wrote:
> >  
> >> The AER service driver includes a CXL-specific kfifo, intended to forward
> >> CXL errors to the CXL driver. However, the forwarding functionality is
> >> currently unimplemented. Update the AER driver to enable error forwarding
> >> to the CXL driver.
> >>
> >> Modify the AER service driver's handle_error_source(), which is called from
> >> process_aer_err_devices(), to distinguish between PCIe and CXL errors.
> >>
> >> Rename and update is_internal_error() to is_cxl_error(). Ensuring it
> >> checks both the 'struct aer_info::is_cxl' flag and the AER internal error
> >> masks.
> >>
> >> If the error is a standard PCIe error then continue calling pcie_aer_handle_error()
> >> as done in the current AER driver.
> >>
> >> If the error is a CXL-related error then forward it to the CXL driver for
> >> handling using the kfifo mechanism.
> >>
> >> Introduce a new function forward_cxl_error(), which constructs a CXL
> >> protocol error context using cxl_create_prot_err_info(). This context is
> >> then passed to the CXL driver via kfifo using a 'struct work_struct'.
> >>
> >> Signed-off-by: Terry Bowman <terry.bowman@amd.com>  
> > Hi Terry,
> >
> > Finally got back to this.  I'm not following how some of the reference
> > counting in here is working.  It might be fine but there is a lot
> > taking then dropping device references - some of which are taken again later.
> >  
> >> @@ -1082,10 +1094,44 @@ static void cxl_rch_enable_rcec(struct pci_dev *rcec)
> >>  	pci_info(rcec, "CXL: Internal errors unmasked");
> >>  }
> >>  
> >> +static void forward_cxl_error(struct pci_dev *_pdev, struct aer_err_info *info)
> >> +{
> >> +	int severity = info->severity;  
> > So far this variable isn't really justified.  Maybe it makes sense later in the
> > series?  
> 
> This is used below in call to cxl_create_prot_err_info().
Sure, but why not just do

if (cxl_create_prot_error_info(pdev, info->severity, &wd.err_info)) {

There isn't anything modifying info->severity in between so that local
variable is just padding out the code to no real benefit.


> 
> >> +		pci_err(pdev, "Failed to create CXL protocol error information");
> >> +		return;
> >> +	}
> >> +
> >> +	struct device *cxl_dev __free(put_device) = get_device(err_info->dev);  
> > Also this one.  A reference was acquired and dropped in cxl_create_prot_err_info()
> > followed by retaking it here.  How do we know it is still about by this call
> > and once we pull it off the kfifo later?  
> 
> Yes, this is a problem I realized after sending the series.
> 
> The device reference incr could be changed for all the devices to the non-cleanup
> variety. Then would add the reference incr in the caller after calling cxl_create_prot_err_info().
> I need to look at the other calls to to cxl_create_prot_err_info() as well.
> 
> In addition, I think we should consider adding the CXL RAS status into the struct cxl_prot_err_info.
> This would eliminate the need for further accesses to the CXL device after being dequeued from the
> fifo. Thoughts?

That sounds like a reasonable solution to me.

Jonathan

Re: [PATCH v8 04/16] cxl/aer: AER service driver forwards CXL error to CXL driver

Posted by Bowman, Terry 8 months, 4 weeks ago


On 4/25/2025 8:18 AM, Jonathan Cameron wrote:
> On Thu, 24 Apr 2025 09:17:45 -0500
> "Bowman, Terry" <terry.bowman@amd.com> wrote:
>
>> On 4/23/2025 10:04 AM, Jonathan Cameron wrote:
>>> On Wed, 26 Mar 2025 20:47:05 -0500
>>> Terry Bowman <terry.bowman@amd.com> wrote:
>>>  
>>>> The AER service driver includes a CXL-specific kfifo, intended to forward
>>>> CXL errors to the CXL driver. However, the forwarding functionality is
>>>> currently unimplemented. Update the AER driver to enable error forwarding
>>>> to the CXL driver.
>>>>
>>>> Modify the AER service driver's handle_error_source(), which is called from
>>>> process_aer_err_devices(), to distinguish between PCIe and CXL errors.
>>>>
>>>> Rename and update is_internal_error() to is_cxl_error(). Ensuring it
>>>> checks both the 'struct aer_info::is_cxl' flag and the AER internal error
>>>> masks.
>>>>
>>>> If the error is a standard PCIe error then continue calling pcie_aer_handle_error()
>>>> as done in the current AER driver.
>>>>
>>>> If the error is a CXL-related error then forward it to the CXL driver for
>>>> handling using the kfifo mechanism.
>>>>
>>>> Introduce a new function forward_cxl_error(), which constructs a CXL
>>>> protocol error context using cxl_create_prot_err_info(). This context is
>>>> then passed to the CXL driver via kfifo using a 'struct work_struct'.
>>>>
>>>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>  
>>> Hi Terry,
>>>
>>> Finally got back to this.  I'm not following how some of the reference
>>> counting in here is working.  It might be fine but there is a lot
>>> taking then dropping device references - some of which are taken again later.
>>>  
>>>> @@ -1082,10 +1094,44 @@ static void cxl_rch_enable_rcec(struct pci_dev *rcec)
>>>>  	pci_info(rcec, "CXL: Internal errors unmasked");
>>>>  }
>>>>  
>>>> +static void forward_cxl_error(struct pci_dev *_pdev, struct aer_err_info *info)
>>>> +{
>>>> +	int severity = info->severity;  
>>> So far this variable isn't really justified.  Maybe it makes sense later in the
>>> series?  
>> This is used below in call to cxl_create_prot_err_info().
> Sure, but why not just do
>
> if (cxl_create_prot_error_info(pdev, info->severity, &wd.err_info)) {
>
> There isn't anything modifying info->severity in between so that local
> variable is just padding out the code to no real benefit.
>
>
>>>> +		pci_err(pdev, "Failed to create CXL protocol error information");
>>>> +		return;
>>>> +	}
>>>> +
>>>> +	struct device *cxl_dev __free(put_device) = get_device(err_info->dev);  
>>> Also this one.  A reference was acquired and dropped in cxl_create_prot_err_info()
>>> followed by retaking it here.  How do we know it is still about by this call
>>> and once we pull it off the kfifo later?  
>> Yes, this is a problem I realized after sending the series.
>>
>> The device reference incr could be changed for all the devices to the non-cleanup
>> variety. Then would add the reference incr in the caller after calling cxl_create_prot_err_info().
>> I need to look at the other calls to to cxl_create_prot_err_info() as well.
>>
>> In addition, I think we should consider adding the CXL RAS status into the struct cxl_prot_err_info.
>> This would eliminate the need for further accesses to the CXL device after being dequeued from the
>> fifo. Thoughts?
> That sounds like a reasonable solution to me.
>
> Jonathan
Hi Jonathan,

Is it sufficient to rely on correctly implemented reference counting implementation instead
of caching the RAS status I mentioned earlier?

I have the next revision coded to 'get' the CXL erring device's reference count in the AER
driver before enqueuing in the kfifo and then added a reference count 'put' in the CXL driver
after dequeuing and handling/logging. This is an alternative to what I mentioned earlier reading
the RAS status and caching it. One more question: is it OK to implement the get and put (of
the same object) in different drivers?

If we need to read and cache the RAS status before the kfifo enqueue there will be some other
details to work through.

-Terry

Re: [PATCH v8 04/16] cxl/aer: AER service driver forwards CXL error to CXL driver

Posted by Jonathan Cameron 8 months, 3 weeks ago

On Thu, 15 May 2025 16:52:15 -0500
"Bowman, Terry" <terry.bowman@amd.com> wrote:

> On 4/25/2025 8:18 AM, Jonathan Cameron wrote:
> > On Thu, 24 Apr 2025 09:17:45 -0500
> > "Bowman, Terry" <terry.bowman@amd.com> wrote:
> >  
> >> On 4/23/2025 10:04 AM, Jonathan Cameron wrote:  
> >>> On Wed, 26 Mar 2025 20:47:05 -0500
> >>> Terry Bowman <terry.bowman@amd.com> wrote:
> >>>    
> >>>> The AER service driver includes a CXL-specific kfifo, intended to forward
> >>>> CXL errors to the CXL driver. However, the forwarding functionality is
> >>>> currently unimplemented. Update the AER driver to enable error forwarding
> >>>> to the CXL driver.
> >>>>
> >>>> Modify the AER service driver's handle_error_source(), which is called from
> >>>> process_aer_err_devices(), to distinguish between PCIe and CXL errors.
> >>>>
> >>>> Rename and update is_internal_error() to is_cxl_error(). Ensuring it
> >>>> checks both the 'struct aer_info::is_cxl' flag and the AER internal error
> >>>> masks.
> >>>>
> >>>> If the error is a standard PCIe error then continue calling pcie_aer_handle_error()
> >>>> as done in the current AER driver.
> >>>>
> >>>> If the error is a CXL-related error then forward it to the CXL driver for
> >>>> handling using the kfifo mechanism.
> >>>>
> >>>> Introduce a new function forward_cxl_error(), which constructs a CXL
> >>>> protocol error context using cxl_create_prot_err_info(). This context is
> >>>> then passed to the CXL driver via kfifo using a 'struct work_struct'.
> >>>>
> >>>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>    
> >>> Hi Terry,
> >>>
> >>> Finally got back to this.  I'm not following how some of the reference
> >>> counting in here is working.  It might be fine but there is a lot
> >>> taking then dropping device references - some of which are taken again later.
> >>>    
> >>>> @@ -1082,10 +1094,44 @@ static void cxl_rch_enable_rcec(struct pci_dev *rcec)
> >>>>  	pci_info(rcec, "CXL: Internal errors unmasked");
> >>>>  }
> >>>>  
> >>>> +static void forward_cxl_error(struct pci_dev *_pdev, struct aer_err_info *info)
> >>>> +{
> >>>> +	int severity = info->severity;    
> >>> So far this variable isn't really justified.  Maybe it makes sense later in the
> >>> series?    
> >> This is used below in call to cxl_create_prot_err_info().  
> > Sure, but why not just do
> >
> > if (cxl_create_prot_error_info(pdev, info->severity, &wd.err_info)) {
> >
> > There isn't anything modifying info->severity in between so that local
> > variable is just padding out the code to no real benefit.
> >
> >  
> >>>> +		pci_err(pdev, "Failed to create CXL protocol error information");
> >>>> +		return;
> >>>> +	}
> >>>> +
> >>>> +	struct device *cxl_dev __free(put_device) = get_device(err_info->dev);    
> >>> Also this one.  A reference was acquired and dropped in cxl_create_prot_err_info()
> >>> followed by retaking it here.  How do we know it is still about by this call
> >>> and once we pull it off the kfifo later?    
> >> Yes, this is a problem I realized after sending the series.
> >>
> >> The device reference incr could be changed for all the devices to the non-cleanup
> >> variety. Then would add the reference incr in the caller after calling cxl_create_prot_err_info().
> >> I need to look at the other calls to to cxl_create_prot_err_info() as well.
> >>
> >> In addition, I think we should consider adding the CXL RAS status into the struct cxl_prot_err_info.
> >> This would eliminate the need for further accesses to the CXL device after being dequeued from the
> >> fifo. Thoughts?  
> > That sounds like a reasonable solution to me.
> >
> > Jonathan  
> Hi Jonathan,
Hi Terry,

Sorry for delay - travel etc...

> 
> Is it sufficient to rely on correctly implemented reference counting implementation instead
> of caching the RAS status I mentioned earlier?
> 
> I have the next revision coded to 'get' the CXL erring device's reference count in the AER
> driver before enqueuing in the kfifo and then added a reference count 'put' in the CXL driver
> after dequeuing and handling/logging. This is an alternative to what I mentioned earlier reading
> the RAS status and caching it. One more question: is it OK to implement the get and put (of
> the same object) in different drivers?

It's definitely unusual.  If there is anything similar to point at I'd be happier than
this 'innovation' showing up here first. 

> 
> If we need to read and cache the RAS status before the kfifo enqueue there will be some other
> details to work through.
This still smells like the cleaner solution to me, but depends on those details..

Jonathan

> 
> -Terry
> 
>

Re: [PATCH v8 04/16] cxl/aer: AER service driver forwards CXL error to CXL driver

Posted by Bowman, Terry 8 months, 3 weeks ago


On 5/20/2025 6:04 AM, Jonathan Cameron wrote:
> On Thu, 15 May 2025 16:52:15 -0500
> "Bowman, Terry" <terry.bowman@amd.com> wrote:
>
>> On 4/25/2025 8:18 AM, Jonathan Cameron wrote:
>>> On Thu, 24 Apr 2025 09:17:45 -0500
>>> "Bowman, Terry" <terry.bowman@amd.com> wrote:
>>>  
>>>> On 4/23/2025 10:04 AM, Jonathan Cameron wrote:  
>>>>> On Wed, 26 Mar 2025 20:47:05 -0500
>>>>> Terry Bowman <terry.bowman@amd.com> wrote:
>>>>>    
>>>>>> The AER service driver includes a CXL-specific kfifo, intended to forward
>>>>>> CXL errors to the CXL driver. However, the forwarding functionality is
>>>>>> currently unimplemented. Update the AER driver to enable error forwarding
>>>>>> to the CXL driver.
>>>>>>
>>>>>> Modify the AER service driver's handle_error_source(), which is called from
>>>>>> process_aer_err_devices(), to distinguish between PCIe and CXL errors.
>>>>>>
>>>>>> Rename and update is_internal_error() to is_cxl_error(). Ensuring it
>>>>>> checks both the 'struct aer_info::is_cxl' flag and the AER internal error
>>>>>> masks.
>>>>>>
>>>>>> If the error is a standard PCIe error then continue calling pcie_aer_handle_error()
>>>>>> as done in the current AER driver.
>>>>>>
>>>>>> If the error is a CXL-related error then forward it to the CXL driver for
>>>>>> handling using the kfifo mechanism.
>>>>>>
>>>>>> Introduce a new function forward_cxl_error(), which constructs a CXL
>>>>>> protocol error context using cxl_create_prot_err_info(). This context is
>>>>>> then passed to the CXL driver via kfifo using a 'struct work_struct'.
>>>>>>
>>>>>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>    
>>>>> Hi Terry,
>>>>>
>>>>> Finally got back to this.  I'm not following how some of the reference
>>>>> counting in here is working.  It might be fine but there is a lot
>>>>> taking then dropping device references - some of which are taken again later.
>>>>>    
>>>>>> @@ -1082,10 +1094,44 @@ static void cxl_rch_enable_rcec(struct pci_dev *rcec)
>>>>>>  	pci_info(rcec, "CXL: Internal errors unmasked");
>>>>>>  }
>>>>>>  
>>>>>> +static void forward_cxl_error(struct pci_dev *_pdev, struct aer_err_info *info)
>>>>>> +{
>>>>>> +	int severity = info->severity;    
>>>>> So far this variable isn't really justified.  Maybe it makes sense later in the
>>>>> series?    
>>>> This is used below in call to cxl_create_prot_err_info().  
>>> Sure, but why not just do
>>>
>>> if (cxl_create_prot_error_info(pdev, info->severity, &wd.err_info)) {
>>>
>>> There isn't anything modifying info->severity in between so that local
>>> variable is just padding out the code to no real benefit.
>>>
>>>  
>>>>>> +		pci_err(pdev, "Failed to create CXL protocol error information");
>>>>>> +		return;
>>>>>> +	}
>>>>>> +
>>>>>> +	struct device *cxl_dev __free(put_device) = get_device(err_info->dev);    
>>>>> Also this one.  A reference was acquired and dropped in cxl_create_prot_err_info()
>>>>> followed by retaking it here.  How do we know it is still about by this call
>>>>> and once we pull it off the kfifo later?    
>>>> Yes, this is a problem I realized after sending the series.
>>>>
>>>> The device reference incr could be changed for all the devices to the non-cleanup
>>>> variety. Then would add the reference incr in the caller after calling cxl_create_prot_err_info().
>>>> I need to look at the other calls to to cxl_create_prot_err_info() as well.
>>>>
>>>> In addition, I think we should consider adding the CXL RAS status into the struct cxl_prot_err_info.
>>>> This would eliminate the need for further accesses to the CXL device after being dequeued from the
>>>> fifo. Thoughts?  
>>> That sounds like a reasonable solution to me.
>>>
>>> Jonathan  
>> Hi Jonathan,
> Hi Terry,
>
> Sorry for delay - travel etc...
>
>> Is it sufficient to rely on correctly implemented reference counting implementation instead
>> of caching the RAS status I mentioned earlier?
>>
>> I have the next revision coded to 'get' the CXL erring device's reference count in the AER
>> driver before enqueuing in the kfifo and then added a reference count 'put' in the CXL driver
>> after dequeuing and handling/logging. This is an alternative to what I mentioned earlier reading
>> the RAS status and caching it. One more question: is it OK to implement the get and put (of
>> the same object) in different drivers?
> It's definitely unusual.  If there is anything similar to point at I'd be happier than
> this 'innovation' showing up here first. 
>
>> If we need to read and cache the RAS status before the kfifo enqueue there will be some other
>> details to work through.
> This still smells like the cleaner solution to me, but depends on those details..
>
> Jonathan

In this case I believe we will need to move the CE handling (RAS status reading and clearing) before
the kfifo enqueue. I think this is necessary because CXL errors may continue to be received and we
don't want their status's combined when reading or clearing. I can refactor cxl_handle_ras()/
cxl_handle_cor_ras() to return the RAS status value and remove the trace logging (to instead be
called after kfifo dequeue).

This leaves the UCE case. It's worth mentioning the UCE flow is different than the the CE case
because it uses the top-bottom traversal starting at the erring device. Correct me if I'm wrong
this would be handled before the kfifo as well. The handling and logging in the UCE case are
baked together. The UCE flow would therefore need to include the trace logging during handling.

Another flow is the PCI EP errors. The PCIe EP CE and UCE handlers remain and can call the
the refactored cxl_handle_ras()/cxl_handle_cor_ras() and then trace log afterwards. This is no
issue.

This leaves only CE trace logging to be called after the kfifo dequeue. This is what doesn't
feel right and wanted to draw attention to.

All this to say: very little work will be done after the kfifo dequeue. Most of the work in
the kfifo implementation would be before the kfifo enqueuing in the CXL create_prot_error_info()
callback. I am concerned the balance of work done before and after the kfifo enqueue/dequeue
will be very asymmetric with little value provided from the kfifo.

-Terry

Re: [PATCH v8 04/16] cxl/aer: AER service driver forwards CXL error to CXL driver

Posted by Jonathan Cameron 8 months, 3 weeks ago

On Tue, 20 May 2025 08:21:18 -0500
"Bowman, Terry" <terry.bowman@amd.com> wrote:

> On 5/20/2025 6:04 AM, Jonathan Cameron wrote:
> > On Thu, 15 May 2025 16:52:15 -0500
> > "Bowman, Terry" <terry.bowman@amd.com> wrote:
> >  
> >> On 4/25/2025 8:18 AM, Jonathan Cameron wrote:  
> >>> On Thu, 24 Apr 2025 09:17:45 -0500
> >>> "Bowman, Terry" <terry.bowman@amd.com> wrote:
> >>>    
> >>>> On 4/23/2025 10:04 AM, Jonathan Cameron wrote:    
> >>>>> On Wed, 26 Mar 2025 20:47:05 -0500
> >>>>> Terry Bowman <terry.bowman@amd.com> wrote:
> >>>>>      
> >>>>>> The AER service driver includes a CXL-specific kfifo, intended to forward
> >>>>>> CXL errors to the CXL driver. However, the forwarding functionality is
> >>>>>> currently unimplemented. Update the AER driver to enable error forwarding
> >>>>>> to the CXL driver.
> >>>>>>
> >>>>>> Modify the AER service driver's handle_error_source(), which is called from
> >>>>>> process_aer_err_devices(), to distinguish between PCIe and CXL errors.
> >>>>>>
> >>>>>> Rename and update is_internal_error() to is_cxl_error(). Ensuring it
> >>>>>> checks both the 'struct aer_info::is_cxl' flag and the AER internal error
> >>>>>> masks.
> >>>>>>
> >>>>>> If the error is a standard PCIe error then continue calling pcie_aer_handle_error()
> >>>>>> as done in the current AER driver.
> >>>>>>
> >>>>>> If the error is a CXL-related error then forward it to the CXL driver for
> >>>>>> handling using the kfifo mechanism.
> >>>>>>
> >>>>>> Introduce a new function forward_cxl_error(), which constructs a CXL
> >>>>>> protocol error context using cxl_create_prot_err_info(). This context is
> >>>>>> then passed to the CXL driver via kfifo using a 'struct work_struct'.
> >>>>>>
> >>>>>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>      
> >>>>> Hi Terry,
> >>>>>
> >>>>> Finally got back to this.  I'm not following how some of the reference
> >>>>> counting in here is working.  It might be fine but there is a lot
> >>>>> taking then dropping device references - some of which are taken again later.
> >>>>>      
> >>>>>> @@ -1082,10 +1094,44 @@ static void cxl_rch_enable_rcec(struct pci_dev *rcec)
> >>>>>>  	pci_info(rcec, "CXL: Internal errors unmasked");
> >>>>>>  }
> >>>>>>  
> >>>>>> +static void forward_cxl_error(struct pci_dev *_pdev, struct aer_err_info *info)
> >>>>>> +{
> >>>>>> +	int severity = info->severity;      
> >>>>> So far this variable isn't really justified.  Maybe it makes sense later in the
> >>>>> series?      
> >>>> This is used below in call to cxl_create_prot_err_info().    
> >>> Sure, but why not just do
> >>>
> >>> if (cxl_create_prot_error_info(pdev, info->severity, &wd.err_info)) {
> >>>
> >>> There isn't anything modifying info->severity in between so that local
> >>> variable is just padding out the code to no real benefit.
> >>>
> >>>    
> >>>>>> +		pci_err(pdev, "Failed to create CXL protocol error information");
> >>>>>> +		return;
> >>>>>> +	}
> >>>>>> +
> >>>>>> +	struct device *cxl_dev __free(put_device) = get_device(err_info->dev);      
> >>>>> Also this one.  A reference was acquired and dropped in cxl_create_prot_err_info()
> >>>>> followed by retaking it here.  How do we know it is still about by this call
> >>>>> and once we pull it off the kfifo later?      
> >>>> Yes, this is a problem I realized after sending the series.
> >>>>
> >>>> The device reference incr could be changed for all the devices to the non-cleanup
> >>>> variety. Then would add the reference incr in the caller after calling cxl_create_prot_err_info().
> >>>> I need to look at the other calls to to cxl_create_prot_err_info() as well.
> >>>>
> >>>> In addition, I think we should consider adding the CXL RAS status into the struct cxl_prot_err_info.
> >>>> This would eliminate the need for further accesses to the CXL device after being dequeued from the
> >>>> fifo. Thoughts?    
> >>> That sounds like a reasonable solution to me.
> >>>
> >>> Jonathan    
> >> Hi Jonathan,  
> > Hi Terry,
> >
> > Sorry for delay - travel etc...
> >  
> >> Is it sufficient to rely on correctly implemented reference counting implementation instead
> >> of caching the RAS status I mentioned earlier?
> >>
> >> I have the next revision coded to 'get' the CXL erring device's reference count in the AER
> >> driver before enqueuing in the kfifo and then added a reference count 'put' in the CXL driver
> >> after dequeuing and handling/logging. This is an alternative to what I mentioned earlier reading
> >> the RAS status and caching it. One more question: is it OK to implement the get and put (of
> >> the same object) in different drivers?  
> > It's definitely unusual.  If there is anything similar to point at I'd be happier than
> > this 'innovation' showing up here first. 
> >  
> >> If we need to read and cache the RAS status before the kfifo enqueue there will be some other
> >> details to work through.  
> > This still smells like the cleaner solution to me, but depends on those details..
> >
> > Jonathan  
> 
> In this case I believe we will need to move the CE handling (RAS status reading and clearing) before
> the kfifo enqueue. I think this is necessary because CXL errors may continue to be received and we
> don't want their status's combined when reading or clearing. I can refactor cxl_handle_ras()/
> cxl_handle_cor_ras() to return the RAS status value and remove the trace logging (to instead be
> called after kfifo dequeue).
> 
> This leaves the UCE case. It's worth mentioning the UCE flow is different than the the CE case
> because it uses the top-bottom traversal starting at the erring device. Correct me if I'm wrong
> this would be handled before the kfifo as well. The handling and logging in the UCE case are
> baked together. The UCE flow would therefore need to include the trace logging during handling.
> 
> Another flow is the PCI EP errors. The PCIe EP CE and UCE handlers remain and can call the
> the refactored cxl_handle_ras()/cxl_handle_cor_ras() and then trace log afterwards. This is no
> issue.
> 
> This leaves only CE trace logging to be called after the kfifo dequeue. This is what doesn't
> feel right and wanted to draw attention to.
> 
> All this to say: very little work will be done after the kfifo dequeue. Most of the work in
> the kfifo implementation would be before the kfifo enqueuing in the CXL create_prot_error_info()
> callback. I am concerned the balance of work done before and after the kfifo enqueue/dequeue
> will be very asymmetric with little value provided from the kfifo.
> 
As per the discord chat - if you look up the device again from BDF or similar and get this
info once you have right locks post kfifo all should be fine as any race will be easy
to resolve by doing nothing if the driver has gone away.

Jonathan

> -Terry
>

Re: [PATCH v8 04/16] cxl/aer: AER service driver forwards CXL error to CXL driver

Posted by Bowman, Terry 8 months, 3 weeks ago

On 5/21/2025 1:34 PM, Jonathan Cameron wrote:
> On Tue, 20 May 2025 08:21:18 -0500
> "Bowman, Terry" <terry.bowman@amd.com> wrote:
> 
>> On 5/20/2025 6:04 AM, Jonathan Cameron wrote:
>>> On Thu, 15 May 2025 16:52:15 -0500
>>> "Bowman, Terry" <terry.bowman@amd.com> wrote:
>>>  
>>>> On 4/25/2025 8:18 AM, Jonathan Cameron wrote:  
>>>>> On Thu, 24 Apr 2025 09:17:45 -0500
>>>>> "Bowman, Terry" <terry.bowman@amd.com> wrote:
>>>>>    
>>>>>> On 4/23/2025 10:04 AM, Jonathan Cameron wrote:    
>>>>>>> On Wed, 26 Mar 2025 20:47:05 -0500
>>>>>>> Terry Bowman <terry.bowman@amd.com> wrote:
>>>>>>>      
>>>>>>>> The AER service driver includes a CXL-specific kfifo, intended to forward
>>>>>>>> CXL errors to the CXL driver. However, the forwarding functionality is
>>>>>>>> currently unimplemented. Update the AER driver to enable error forwarding
>>>>>>>> to the CXL driver.
>>>>>>>>
>>>>>>>> Modify the AER service driver's handle_error_source(), which is called from
>>>>>>>> process_aer_err_devices(), to distinguish between PCIe and CXL errors.
>>>>>>>>
>>>>>>>> Rename and update is_internal_error() to is_cxl_error(). Ensuring it
>>>>>>>> checks both the 'struct aer_info::is_cxl' flag and the AER internal error
>>>>>>>> masks.
>>>>>>>>
>>>>>>>> If the error is a standard PCIe error then continue calling pcie_aer_handle_error()
>>>>>>>> as done in the current AER driver.
>>>>>>>>
>>>>>>>> If the error is a CXL-related error then forward it to the CXL driver for
>>>>>>>> handling using the kfifo mechanism.
>>>>>>>>
>>>>>>>> Introduce a new function forward_cxl_error(), which constructs a CXL
>>>>>>>> protocol error context using cxl_create_prot_err_info(). This context is
>>>>>>>> then passed to the CXL driver via kfifo using a 'struct work_struct'.
>>>>>>>>
>>>>>>>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>      
>>>>>>> Hi Terry,
>>>>>>>
>>>>>>> Finally got back to this.  I'm not following how some of the reference
>>>>>>> counting in here is working.  It might be fine but there is a lot
>>>>>>> taking then dropping device references - some of which are taken again later.
>>>>>>>      
>>>>>>>> @@ -1082,10 +1094,44 @@ static void cxl_rch_enable_rcec(struct pci_dev *rcec)
>>>>>>>>  	pci_info(rcec, "CXL: Internal errors unmasked");
>>>>>>>>  }
>>>>>>>>  
>>>>>>>> +static void forward_cxl_error(struct pci_dev *_pdev, struct aer_err_info *info)
>>>>>>>> +{
>>>>>>>> +	int severity = info->severity;      
>>>>>>> So far this variable isn't really justified.  Maybe it makes sense later in the
>>>>>>> series?      
>>>>>> This is used below in call to cxl_create_prot_err_info().    
>>>>> Sure, but why not just do
>>>>>
>>>>> if (cxl_create_prot_error_info(pdev, info->severity, &wd.err_info)) {
>>>>>
>>>>> There isn't anything modifying info->severity in between so that local
>>>>> variable is just padding out the code to no real benefit.
>>>>>
>>>>>    
>>>>>>>> +		pci_err(pdev, "Failed to create CXL protocol error information");
>>>>>>>> +		return;
>>>>>>>> +	}
>>>>>>>> +
>>>>>>>> +	struct device *cxl_dev __free(put_device) = get_device(err_info->dev);      
>>>>>>> Also this one.  A reference was acquired and dropped in cxl_create_prot_err_info()
>>>>>>> followed by retaking it here.  How do we know it is still about by this call
>>>>>>> and once we pull it off the kfifo later?      
>>>>>> Yes, this is a problem I realized after sending the series.
>>>>>>
>>>>>> The device reference incr could be changed for all the devices to the non-cleanup
>>>>>> variety. Then would add the reference incr in the caller after calling cxl_create_prot_err_info().
>>>>>> I need to look at the other calls to to cxl_create_prot_err_info() as well.
>>>>>>
>>>>>> In addition, I think we should consider adding the CXL RAS status into the struct cxl_prot_err_info.
>>>>>> This would eliminate the need for further accesses to the CXL device after being dequeued from the
>>>>>> fifo. Thoughts?    
>>>>> That sounds like a reasonable solution to me.
>>>>>
>>>>> Jonathan    
>>>> Hi Jonathan,  
>>> Hi Terry,
>>>
>>> Sorry for delay - travel etc...
>>>  
>>>> Is it sufficient to rely on correctly implemented reference counting implementation instead
>>>> of caching the RAS status I mentioned earlier?
>>>>
>>>> I have the next revision coded to 'get' the CXL erring device's reference count in the AER
>>>> driver before enqueuing in the kfifo and then added a reference count 'put' in the CXL driver
>>>> after dequeuing and handling/logging. This is an alternative to what I mentioned earlier reading
>>>> the RAS status and caching it. One more question: is it OK to implement the get and put (of
>>>> the same object) in different drivers?  
>>> It's definitely unusual.  If there is anything similar to point at I'd be happier than
>>> this 'innovation' showing up here first. 
>>>  
>>>> If we need to read and cache the RAS status before the kfifo enqueue there will be some other
>>>> details to work through.  
>>> This still smells like the cleaner solution to me, but depends on those details..
>>>
>>> Jonathan  
>>
>> In this case I believe we will need to move the CE handling (RAS status reading and clearing) before
>> the kfifo enqueue. I think this is necessary because CXL errors may continue to be received and we
>> don't want their status's combined when reading or clearing. I can refactor cxl_handle_ras()/
>> cxl_handle_cor_ras() to return the RAS status value and remove the trace logging (to instead be
>> called after kfifo dequeue).
>>
>> This leaves the UCE case. It's worth mentioning the UCE flow is different than the the CE case
>> because it uses the top-bottom traversal starting at the erring device. Correct me if I'm wrong
>> this would be handled before the kfifo as well. The handling and logging in the UCE case are
>> baked together. The UCE flow would therefore need to include the trace logging during handling.
>>
>> Another flow is the PCI EP errors. The PCIe EP CE and UCE handlers remain and can call the
>> the refactored cxl_handle_ras()/cxl_handle_cor_ras() and then trace log afterwards. This is no
>> issue.
>>
>> This leaves only CE trace logging to be called after the kfifo dequeue. This is what doesn't
>> feel right and wanted to draw attention to.
>>
>> All this to say: very little work will be done after the kfifo dequeue. Most of the work in
>> the kfifo implementation would be before the kfifo enqueuing in the CXL create_prot_error_info()
>> callback. I am concerned the balance of work done before and after the kfifo enqueue/dequeue
>> will be very asymmetric with little value provided from the kfifo.
>>
> As per the discord chat - if you look up the device again from BDF or similar and get this
> info once you have right locks post kfifo all should be fine as any race will be easy
> to resolve by doing nothing if the driver has gone away.
> 
> Jonathan
> 
>> -Terry
>>
> 

Ok. I understand. Thanks.

-Terry

Re: [PATCH v8 04/16] cxl/aer: AER service driver forwards CXL error to CXL driver

Posted by Bowman, Terry 9 months, 2 weeks ago


On 4/25/2025 8:18 AM, Jonathan Cameron wrote:
> On Thu, 24 Apr 2025 09:17:45 -0500
> "Bowman, Terry" <terry.bowman@amd.com> wrote:
>
>> On 4/23/2025 10:04 AM, Jonathan Cameron wrote:
>>> On Wed, 26 Mar 2025 20:47:05 -0500
>>> Terry Bowman <terry.bowman@amd.com> wrote:
>>>  
>>>> The AER service driver includes a CXL-specific kfifo, intended to forward
>>>> CXL errors to the CXL driver. However, the forwarding functionality is
>>>> currently unimplemented. Update the AER driver to enable error forwarding
>>>> to the CXL driver.
>>>>
>>>> Modify the AER service driver's handle_error_source(), which is called from
>>>> process_aer_err_devices(), to distinguish between PCIe and CXL errors.
>>>>
>>>> Rename and update is_internal_error() to is_cxl_error(). Ensuring it
>>>> checks both the 'struct aer_info::is_cxl' flag and the AER internal error
>>>> masks.
>>>>
>>>> If the error is a standard PCIe error then continue calling pcie_aer_handle_error()
>>>> as done in the current AER driver.
>>>>
>>>> If the error is a CXL-related error then forward it to the CXL driver for
>>>> handling using the kfifo mechanism.
>>>>
>>>> Introduce a new function forward_cxl_error(), which constructs a CXL
>>>> protocol error context using cxl_create_prot_err_info(). This context is
>>>> then passed to the CXL driver via kfifo using a 'struct work_struct'.
>>>>
>>>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>  
>>> Hi Terry,
>>>
>>> Finally got back to this.  I'm not following how some of the reference
>>> counting in here is working.  It might be fine but there is a lot
>>> taking then dropping device references - some of which are taken again later.
>>>  
>>>> @@ -1082,10 +1094,44 @@ static void cxl_rch_enable_rcec(struct pci_dev *rcec)
>>>>  	pci_info(rcec, "CXL: Internal errors unmasked");
>>>>  }
>>>>  
>>>> +static void forward_cxl_error(struct pci_dev *_pdev, struct aer_err_info *info)
>>>> +{
>>>> +	int severity = info->severity;  
>>> So far this variable isn't really justified.  Maybe it makes sense later in the
>>> series?  
>> This is used below in call to cxl_create_prot_err_info().
> Sure, but why not just do
>
> if (cxl_create_prot_error_info(pdev, info->severity, &wd.err_info)) {
>
> There isn't anything modifying info->severity in between so that local
> variable is just padding out the code to no real benefit.
>

I was following a common pattern I observed where a local variable pointer is assigned
to a struct member reference when passing as a function call parameter. I suppose it helps
readability but not necessary here.

Sure, I'll make that change.

>>>> +		pci_err(pdev, "Failed to create CXL protocol error information");
>>>> +		return;
>>>> +	}
>>>> +
>>>> +	struct device *cxl_dev __free(put_device) = get_device(err_info->dev);  
>>> Also this one.  A reference was acquired and dropped in cxl_create_prot_err_info()
>>> followed by retaking it here.  How do we know it is still about by this call
>>> and once we pull it off the kfifo later?  
>> Yes, this is a problem I realized after sending the series.
>>
>> The device reference incr could be changed for all the devices to the non-cleanup
>> variety. Then would add the reference incr in the caller after calling cxl_create_prot_err_info().
>> I need to look at the other calls to to cxl_create_prot_err_info() as well.
>>
>> In addition, I think we should consider adding the CXL RAS status into the struct cxl_prot_err_info.
>> This would eliminate the need for further accesses to the CXL device after being dequeued from the
>> fifo. Thoughts?
> That sounds like a reasonable solution to me.
>
> Jonathan
>
Ok.

-Terry

Re: [PATCH v8 04/16] cxl/aer: AER service driver forwards CXL error to CXL driver

Posted by Bjorn Helgaas 10 months, 2 weeks ago

On Wed, Mar 26, 2025 at 08:47:05PM -0500, Terry Bowman wrote:
> The AER service driver includes a CXL-specific kfifo, intended to forward
> CXL errors to the CXL driver. However, the forwarding functionality is
> currently unimplemented. Update the AER driver to enable error forwarding
> to the CXL driver.
> 
> Modify the AER service driver's handle_error_source(), which is called from
> process_aer_err_devices(), to distinguish between PCIe and CXL errors.
> 
> Rename and update is_internal_error() to is_cxl_error(). Ensuring it
> checks both the 'struct aer_info::is_cxl' flag and the AER internal error
> masks.
> 
> If the error is a standard PCIe error then continue calling pcie_aer_handle_error()
> as done in the current AER driver.
> 
> If the error is a CXL-related error then forward it to the CXL driver for
> handling using the kfifo mechanism.
> 
> Introduce a new function forward_cxl_error(), which constructs a CXL
> protocol error context using cxl_create_prot_err_info(). This context is
> then passed to the CXL driver via kfifo using a 'struct work_struct'.

This only touches drivers/pci, so I would make the subject prefix be
"PCI/AER".

> +static void forward_cxl_error(struct pci_dev *_pdev, struct aer_err_info *info)
> +{
> +	int severity = info->severity;
> +	struct cxl_prot_err_work_data wd;
> +	struct cxl_prot_error_info *err_info = &wd.err_info;
> +	struct pci_dev *pdev __free(pci_dev_put) = pci_dev_get(_pdev);
> +
> +	if (!cxl_create_prot_err_info) {
> +		pci_err(pdev, "Failed. CXL-AER interface not initialized.");
> +		return;
> +	}
> +
> +	if (cxl_create_prot_err_info(pdev, severity, err_info)) {
> +		pci_err(pdev, "Failed to create CXL protocol error information");
> +		return;
> +	}
> +
> +	struct device *cxl_dev __free(put_device) = get_device(err_info->dev);
> +
> +	if (!kfifo_put(&cxl_prot_err_fifo, wd)) {
> +		pr_err_ratelimited("CXL kfifo overflow\n");

Needs a dev identifier here to anchor the message to something.

> +		return;
> +	}
> +
> +	schedule_work(cxl_prot_err_work);
> +}

Re: [PATCH v8 04/16] cxl/aer: AER service driver forwards CXL error to CXL driver

Posted by Bowman, Terry 10 months, 1 week ago


On 3/27/2025 12:13 PM, Bjorn Helgaas wrote:
> On Wed, Mar 26, 2025 at 08:47:05PM -0500, Terry Bowman wrote:
>> The AER service driver includes a CXL-specific kfifo, intended to forward
>> CXL errors to the CXL driver. However, the forwarding functionality is
>> currently unimplemented. Update the AER driver to enable error forwarding
>> to the CXL driver.
>>
>> Modify the AER service driver's handle_error_source(), which is called from
>> process_aer_err_devices(), to distinguish between PCIe and CXL errors.
>>
>> Rename and update is_internal_error() to is_cxl_error(). Ensuring it
>> checks both the 'struct aer_info::is_cxl' flag and the AER internal error
>> masks.
>>
>> If the error is a standard PCIe error then continue calling pcie_aer_handle_error()
>> as done in the current AER driver.
>>
>> If the error is a CXL-related error then forward it to the CXL driver for
>> handling using the kfifo mechanism.
>>
>> Introduce a new function forward_cxl_error(), which constructs a CXL
>> protocol error context using cxl_create_prot_err_info(). This context is
>> then passed to the CXL driver via kfifo using a 'struct work_struct'.
> This only touches drivers/pci, so I would make the subject prefix be
> "PCI/AER".
Got it. Thanks Bjorn.

>> +static void forward_cxl_error(struct pci_dev *_pdev, struct aer_err_info *info)
>> +{
>> +	int severity = info->severity;
>> +	struct cxl_prot_err_work_data wd;
>> +	struct cxl_prot_error_info *err_info = &wd.err_info;
>> +	struct pci_dev *pdev __free(pci_dev_put) = pci_dev_get(_pdev);
>> +
>> +	if (!cxl_create_prot_err_info) {
>> +		pci_err(pdev, "Failed. CXL-AER interface not initialized.");
>> +		return;
>> +	}
>> +
>> +	if (cxl_create_prot_err_info(pdev, severity, err_info)) {
>> +		pci_err(pdev, "Failed to create CXL protocol error information");
>> +		return;
>> +	}
>> +
>> +	struct device *cxl_dev __free(put_device) = get_device(err_info->dev);
>> +
>> +	if (!kfifo_put(&cxl_prot_err_fifo, wd)) {
>> +		pr_err_ratelimited("CXL kfifo overflow\n");
> Needs a dev identifier here to anchor the message to something.
Ok.

Regards,
Terry

>> +		return;
>> +	}
>> +
>> +	schedule_work(cxl_prot_err_work);
>> +}