[PATCH 04/15] cxl/aer/pci: Add CXL PCIe port correctable error support in AER service driver

Terry Bowman posted 15 patches 1 month, 2 weeks ago
There is a newer version of this series
[PATCH 04/15] cxl/aer/pci: Add CXL PCIe port correctable error support in AER service driver
Posted by Terry Bowman 1 month, 2 weeks ago
The AER service driver currently does not manage CXL PCIe port
protocol errors reported by CXL root ports, CXL upstream switch ports,
and CXL downstream switch ports. Consequently, RAS protocol errors
from CXL PCIe port devices are not properly logged or handled.

These errors are reported to the OS via the root port's AER correctable
and uncorrectable internal error fields. While the AER driver supports
handling downstream port protocol errors in restricted CXL host (RCH)
mode, also known as CXL1.1, it lacks the same functionality for CXL
PCIe ports operating in virtual hierarchy (VH) mode, introduced in
CXL2.0.

To address this gap, update the AER driver to handle CXL PCIe port
device protocol correctable errors (CE).

Uncorrectable error (UCE) handling will be added in a future
patch.

Make this update alongside the existing downstream port RCH error
handling logic, extending support to CXL PCIe ports in VH.

Signed-off-by: Terry Bowman <terry.bowman@amd.com>
---
 drivers/pci/pcie/aer.c | 54 +++++++++++++++++++++++++++++++++---------
 1 file changed, 43 insertions(+), 11 deletions(-)

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index dc8b17999001..1c996287d4ce 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -40,6 +40,8 @@
 #define AER_MAX_TYPEOF_COR_ERRS		16	/* as per PCI_ERR_COR_STATUS */
 #define AER_MAX_TYPEOF_UNCOR_ERRS	27	/* as per PCI_ERR_UNCOR_STATUS*/
 
+#define CXL_DVSEC_PORT_EXTENSIONS	3
+
 struct aer_err_source {
 	u32 status;			/* PCI_ERR_ROOT_STATUS */
 	u32 id;				/* PCI_ERR_ROOT_ERR_SRC */
@@ -941,6 +943,17 @@ static bool find_source_device(struct pci_dev *parent,
 	return true;
 }
 
+static bool is_pcie_cxl_port(struct pci_dev *dev)
+{
+	if ((pci_pcie_type(dev) != PCI_EXP_TYPE_ROOT_PORT) &&
+	    (pci_pcie_type(dev) != PCI_EXP_TYPE_UPSTREAM) &&
+	    (pci_pcie_type(dev) != PCI_EXP_TYPE_DOWNSTREAM))
+		return false;
+
+	return (!!pci_find_dvsec_capability(dev, PCI_VENDOR_ID_CXL,
+					    CXL_DVSEC_PORT_EXTENSIONS));
+}
+
 static bool is_internal_error(struct aer_err_info *info)
 {
 	if (info->severity == AER_CORRECTABLE)
@@ -1032,14 +1045,22 @@ static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data)
 
 static void cxl_handle_error(struct pci_dev *dev, struct aer_err_info *info)
 {
-	/*
-	 * Internal errors of an RCEC indicate an AER error in an
-	 * RCH's downstream port. Check and handle them in the CXL.mem
-	 * device driver.
-	 */
-	if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC &&
-	    is_internal_error(info))
+	if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC)
 		pcie_walk_rcec(dev, cxl_rch_handle_error_iter, info);
+
+	if (info->severity == AER_CORRECTABLE) {
+		struct cxl_port_err_hndlrs *cxl_port_hndlrs =
+			find_cxl_port_hndlrs();
+		int aer = dev->aer_cap;
+
+		if (aer)
+			pci_write_config_dword(dev, aer + PCI_ERR_COR_STATUS,
+					       info->status);
+
+		if (cxl_port_hndlrs && cxl_port_hndlrs->cor_error_detected)
+			cxl_port_hndlrs->cor_error_detected(dev);
+		pcie_clear_device_status(dev);
+	}
 }
 
 static int handles_cxl_error_iter(struct pci_dev *dev, void *data)
@@ -1057,9 +1078,13 @@ static bool handles_cxl_errors(struct pci_dev *dev)
 {
 	bool handles_cxl = false;
 
-	if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC &&
-	    pcie_aer_is_native(dev))
+	if (!pcie_aer_is_native(dev))
+		return false;
+
+	if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC)
 		pcie_walk_rcec(dev, handles_cxl_error_iter, &handles_cxl);
+	else
+		handles_cxl = is_pcie_cxl_port(dev);
 
 	return handles_cxl;
 }
@@ -1077,6 +1102,10 @@ static void cxl_enable_internal_errors(struct pci_dev *dev)
 static inline void cxl_enable_internal_errors(struct pci_dev *dev) { }
 static inline void cxl_handle_error(struct pci_dev *dev,
 				    struct aer_err_info *info) { }
+static bool handles_cxl_errors(struct pci_dev *dev)
+{
+	return false;
+}
 #endif
 
 void register_cxl_port_hndlrs(struct cxl_port_err_hndlrs *_cxl_port_hndlrs)
@@ -1134,8 +1163,11 @@ static void pci_aer_handle_error(struct pci_dev *dev, struct aer_err_info *info)
 
 static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info)
 {
-	cxl_handle_error(dev, info);
-	pci_aer_handle_error(dev, info);
+	if (is_internal_error(info) && handles_cxl_errors(dev))
+		cxl_handle_error(dev, info);
+	else
+		pci_aer_handle_error(dev, info);
+
 	pci_dev_put(dev);
 }
 
-- 
2.34.1
Re: [PATCH 04/15] cxl/aer/pci: Add CXL PCIe port correctable error support in AER service driver
Posted by Jonathan Cameron 1 month, 1 week ago
On Tue, 8 Oct 2024 17:16:46 -0500
Terry Bowman <terry.bowman@amd.com> wrote:

> The AER service driver currently does not manage CXL PCIe port
> protocol errors reported by CXL root ports, CXL upstream switch ports,
> and CXL downstream switch ports. Consequently, RAS protocol errors
> from CXL PCIe port devices are not properly logged or handled.
> 
> These errors are reported to the OS via the root port's AER correctable
> and uncorrectable internal error fields. While the AER driver supports
> handling downstream port protocol errors in restricted CXL host (RCH)
> mode also known as CXL1.1, it lacks the same functionality for CXL
> PCIe ports operating in virtual hierarchy (VH) mode, introduced in
> CXL2.0.
> 
> To address this gap, update the AER driver to handle CXL PCIe port
> device protocol correctable errors (CE).
> 
> The uncorrectable error handling (UCE) will be added in a future
> patch.
> 
> Make this update alongside the existing downstream port RCH error
> handling logic, extending support to CXL PCIe ports in VH.
> 
> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
Minor comments inline.

J
> ---
>  drivers/pci/pcie/aer.c | 54 +++++++++++++++++++++++++++++++++---------
>  1 file changed, 43 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index dc8b17999001..1c996287d4ce 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -40,6 +40,8 @@
>  #define AER_MAX_TYPEOF_COR_ERRS		16	/* as per PCI_ERR_COR_STATUS */
>  #define AER_MAX_TYPEOF_UNCOR_ERRS	27	/* as per PCI_ERR_UNCOR_STATUS*/
>  
> +#define CXL_DVSEC_PORT_EXTENSIONS	3

Duplicate of definition in drivers/cxl/cxlpci.h

Maybe wrap it up in an is_cxl_port() helper or similar? Or just
move the definition to a header that both places can include.


> +
>  struct aer_err_source {
>  	u32 status;			/* PCI_ERR_ROOT_STATUS */
>  	u32 id;				/* PCI_ERR_ROOT_ERR_SRC */
> @@ -941,6 +943,17 @@ static bool find_source_device(struct pci_dev *parent,
>  	return true;
>  }
>  
> +static bool is_pcie_cxl_port(struct pci_dev *dev)
> +{
> +	if ((pci_pcie_type(dev) != PCI_EXP_TYPE_ROOT_PORT) &&
> +	    (pci_pcie_type(dev) != PCI_EXP_TYPE_UPSTREAM) &&
> +	    (pci_pcie_type(dev) != PCI_EXP_TYPE_DOWNSTREAM))
> +		return false;
> +
> +	return (!!pci_find_dvsec_capability(dev, PCI_VENDOR_ID_CXL,
> +					    CXL_DVSEC_PORT_EXTENSIONS));

No need for the !!; it will return the same without that clamping to 1/0,
because any non-zero value converts to true when returned as a bool.

> +}
> +
Re: [PATCH 04/15] cxl/aer/pci: Add CXL PCIe port correctable error support in AER service driver
Posted by Terry Bowman 1 month, 1 week ago
Hi Jonathan,

On 10/16/24 11:22, Jonathan Cameron wrote:
> On Tue, 8 Oct 2024 17:16:46 -0500
> Terry Bowman <terry.bowman@amd.com> wrote:
> 
>> The AER service driver currently does not manage CXL PCIe port
>> protocol errors reported by CXL root ports, CXL upstream switch ports,
>> and CXL downstream switch ports. Consequently, RAS protocol errors
>> from CXL PCIe port devices are not properly logged or handled.
>>
>> These errors are reported to the OS via the root port's AER correctable
>> and uncorrectable internal error fields. While the AER driver supports
>> handling downstream port protocol errors in restricted CXL host (RCH)
>> mode also known as CXL1.1, it lacks the same functionality for CXL
>> PCIe ports operating in virtual hierarchy (VH) mode, introduced in
>> CXL2.0.
>>
>> To address this gap, update the AER driver to handle CXL PCIe port
>> device protocol correctable errors (CE).
>>
>> The uncorrectable error handling (UCE) will be added in a future
>> patch.
>>
>> Make this update alongside the existing downstream port RCH error
>> handling logic, extending support to CXL PCIe ports in VH.
>>
>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> Minor comments inline.
> 
> J
>> ---
>>  drivers/pci/pcie/aer.c | 54 +++++++++++++++++++++++++++++++++---------
>>  1 file changed, 43 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
>> index dc8b17999001..1c996287d4ce 100644
>> --- a/drivers/pci/pcie/aer.c
>> +++ b/drivers/pci/pcie/aer.c
>> @@ -40,6 +40,8 @@
>>  #define AER_MAX_TYPEOF_COR_ERRS		16	/* as per PCI_ERR_COR_STATUS */
>>  #define AER_MAX_TYPEOF_UNCOR_ERRS	27	/* as per PCI_ERR_UNCOR_STATUS*/
>>  
>> +#define CXL_DVSEC_PORT_EXTENSIONS	3
> 
> Duplicate of definition in drivers/cxl/cxlpci.h
> 
> Maybe wrap it up in an is_cxl_port() or similar? Or just 
> move that to a header both places can exercise.
> 
> 

Ok. I'll move the value '3' into the function call rather than use a #define.

>> +
>>  struct aer_err_source {
>>  	u32 status;			/* PCI_ERR_ROOT_STATUS */
>>  	u32 id;				/* PCI_ERR_ROOT_ERR_SRC */
>> @@ -941,6 +943,17 @@ static bool find_source_device(struct pci_dev *parent,
>>  	return true;
>>  }
>>  
>> +static bool is_pcie_cxl_port(struct pci_dev *dev)
>> +{
>> +	if ((pci_pcie_type(dev) != PCI_EXP_TYPE_ROOT_PORT) &&
>> +	    (pci_pcie_type(dev) != PCI_EXP_TYPE_UPSTREAM) &&
>> +	    (pci_pcie_type(dev) != PCI_EXP_TYPE_DOWNSTREAM))
>> +		return false;
>> +
>> +	return (!!pci_find_dvsec_capability(dev, PCI_VENDOR_ID_CXL,
>> +					    CXL_DVSEC_PORT_EXTENSIONS));
> 
> No need for the !! it will return the same without that clamping to 1/0
> because any non 0 value is true.
> 

Ok

Regards,
Terry
>> +}
>> +
Re: [PATCH 04/15] cxl/aer/pci: Add CXL PCIe port correctable error support in AER service driver
Posted by Jonathan Cameron 1 month, 1 week ago
On Wed, 16 Oct 2024 12:18:06 -0500
Terry Bowman <Terry.Bowman@amd.com> wrote:

> Hi Jonathan,
> 
> On 10/16/24 11:22, Jonathan Cameron wrote:
> > On Tue, 8 Oct 2024 17:16:46 -0500
> > Terry Bowman <terry.bowman@amd.com> wrote:
> >   
> >> The AER service driver currently does not manage CXL PCIe port
> >> protocol errors reported by CXL root ports, CXL upstream switch ports,
> >> and CXL downstream switch ports. Consequently, RAS protocol errors
> >> from CXL PCIe port devices are not properly logged or handled.
> >>
> >> These errors are reported to the OS via the root port's AER correctable
> >> and uncorrectable internal error fields. While the AER driver supports
> >> handling downstream port protocol errors in restricted CXL host (RCH)
> >> mode also known as CXL1.1, it lacks the same functionality for CXL
> >> PCIe ports operating in virtual hierarchy (VH) mode, introduced in
> >> CXL2.0.
> >>
> >> To address this gap, update the AER driver to handle CXL PCIe port
> >> device protocol correctable errors (CE).
> >>
> >> The uncorrectable error handling (UCE) will be added in a future
> >> patch.
> >>
> >> Make this update alongside the existing downstream port RCH error
> >> handling logic, extending support to CXL PCIe ports in VH.
> >>
> >> Signed-off-by: Terry Bowman <terry.bowman@amd.com>  
> > Minor comments inline.
> > 
> > J  
> >> ---
> >>  drivers/pci/pcie/aer.c | 54 +++++++++++++++++++++++++++++++++---------
> >>  1 file changed, 43 insertions(+), 11 deletions(-)
> >>
> >> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> >> index dc8b17999001..1c996287d4ce 100644
> >> --- a/drivers/pci/pcie/aer.c
> >> +++ b/drivers/pci/pcie/aer.c
> >> @@ -40,6 +40,8 @@
> >>  #define AER_MAX_TYPEOF_COR_ERRS		16	/* as per PCI_ERR_COR_STATUS */
> >>  #define AER_MAX_TYPEOF_UNCOR_ERRS	27	/* as per PCI_ERR_UNCOR_STATUS*/
> >>  
> >> +#define CXL_DVSEC_PORT_EXTENSIONS	3  
> > 
> > Duplicate of definition in drivers/cxl/cxlpci.h
> > 
> > Maybe wrap it up in an is_cxl_port() or similar? Or just 
> > move that to a header both places can exercise.
> > 
> >   
> 
> Ok. I'll move the value '3' into the function call rather than use a #define.
No, that's worse!

Find a way to have just one definition.

> 
> >> +
> >>  struct aer_err_source {
> >>  	u32 status;			/* PCI_ERR_ROOT_STATUS */
> >>  	u32 id;				/* PCI_ERR_ROOT_ERR_SRC */
> >> @@ -941,6 +943,17 @@ static bool find_source_device(struct pci_dev *parent,
> >>  	return true;
> >>  }
> >>  
> >> +static bool is_pcie_cxl_port(struct pci_dev *dev)
> >> +{
> >> +	if ((pci_pcie_type(dev) != PCI_EXP_TYPE_ROOT_PORT) &&
> >> +	    (pci_pcie_type(dev) != PCI_EXP_TYPE_UPSTREAM) &&
> >> +	    (pci_pcie_type(dev) != PCI_EXP_TYPE_DOWNSTREAM))
> >> +		return false;
> >> +
> >> +	return (!!pci_find_dvsec_capability(dev, PCI_VENDOR_ID_CXL,
> >> +					    CXL_DVSEC_PORT_EXTENSIONS));  
> > 
> > No need for the !! it will return the same without that clamping to 1/0
> > because any non 0 value is true.
> >   
> 
> Ok
> 
> Regards,
> Terry
> >> +}
> >> +