[PATCH v2] PCI/AER: Consolidate CXL, ACPI GHES and native AER reporting paths

Karolina Stolarek posted 1 patch 8 months, 3 weeks ago
drivers/cxl/core/pci.c |  2 +-
drivers/pci/pcie/aer.c | 64 ++++++++++++++++++++----------------------
include/linux/aer.h    |  4 +--
3 files changed, 33 insertions(+), 37 deletions(-)
[PATCH v2] PCI/AER: Consolidate CXL, ACPI GHES and native AER reporting paths
Posted by Karolina Stolarek 8 months, 3 weeks ago
Currently, CXL and GHES feature use pci_print_aer() function to
log AER errors. Its implementation is pretty similar to aer_print_error(),
duplicating the way how native PCIe devices report errors. We shouldn't
log messages differently only because they are coming from a different
code path.

Make CXL devices and GHES to call aer_print_error() when reporting
AER errors. Add a wrapper, aer_print_platform_error(), that translates
aer_capabilities_regs to aer_err_info so we can use pci_print_aer()
function.

Signed-off-by: Karolina Stolarek <karolina.stolarek@oracle.com>
---
v2:
  - Don't expose aer_err_info to the world; as aer_recover_queue()
    is tightly connected to the ghes code, introduce a wrapper for
    aer_print_error()
  - Move aer_err_info memset to the wrapper, don't expect the
    caller to clean it for us

  I'm still working on the logs; in the meantime, I think, we can
  continue reviewing the patch.

 drivers/cxl/core/pci.c |  2 +-
 drivers/pci/pcie/aer.c | 64 ++++++++++++++++++++----------------------
 include/linux/aer.h    |  4 +--
 3 files changed, 33 insertions(+), 37 deletions(-)

diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 013b869b66cb..9ba711365388 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -885,7 +885,7 @@ static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds)
 	if (!cxl_rch_get_aer_severity(&aer_regs, &severity))
 		return;
 
-	pci_print_aer(pdev, severity, &aer_regs);
+	aer_print_platform_error(pdev, severity, &aer_regs);
 
 	if (severity == AER_CORRECTABLE)
 		cxl_handle_rdport_cor_ras(cxlds, dport);
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index a1cf8c7ef628..ec34bc9b2332 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -760,47 +760,42 @@ int cper_severity_to_aer(int cper_severity)
 EXPORT_SYMBOL_GPL(cper_severity_to_aer);
 #endif
 
-void pci_print_aer(struct pci_dev *dev, int aer_severity,
-		   struct aer_capability_regs *aer)
+static void populate_aer_err_info(struct aer_err_info *info, int severity,
+				  struct aer_capability_regs *aer_regs)
 {
-	int layer, agent, tlp_header_valid = 0;
-	u32 status, mask;
-	struct aer_err_info info;
-
-	if (aer_severity == AER_CORRECTABLE) {
-		status = aer->cor_status;
-		mask = aer->cor_mask;
-	} else {
-		status = aer->uncor_status;
-		mask = aer->uncor_mask;
-		tlp_header_valid = status & AER_LOG_TLP_MASKS;
-	}
-
-	layer = AER_GET_LAYER_ERROR(aer_severity, status);
-	agent = AER_GET_AGENT(aer_severity, status);
+	int tlp_header_valid;
 
 	memset(&info, 0, sizeof(info));
-	info.severity = aer_severity;
-	info.status = status;
-	info.mask = mask;
-	info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
 
-	pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
-	__aer_print_error(dev, &info);
-	pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
-		aer_error_layer[layer], aer_agent_string[agent]);
+	info->severity = severity;
+	info->first_error = PCI_ERR_CAP_FEP(aer_regs->cap_control);
 
-	if (aer_severity != AER_CORRECTABLE)
-		pci_err(dev, "aer_uncor_severity: 0x%08x\n",
-			aer->uncor_severity);
+	if (severity == AER_CORRECTABLE) {
+		info->id = aer_regs->cor_err_source;
+		info->status = aer_regs->cor_status;
+		info->mask = aer_regs->cor_mask;
+	} else {
+		info->id = aer_regs->uncor_err_source;
+		info->status = aer_regs->uncor_status;
+		info->mask = aer_regs->uncor_mask;
+		tlp_header_valid = info->status & AER_LOG_TLP_MASKS;
+
+		if (tlp_header_valid) {
+			info->tlp_header_valid = tlp_header_valid;
+			info->tlp = aer_regs->header_log;
+		}
+	}
+}
 
-	if (tlp_header_valid)
-		pcie_print_tlp_log(dev, &aer->header_log, dev_fmt("  "));
+void aer_print_platform_error(struct pci_dev *pdev, int severity,
+			      struct aer_capability_regs *aer_regs)
+{
+	struct aer_err_info info;
 
-	trace_aer_event(dev_name(&dev->dev), (status & ~mask),
-			aer_severity, tlp_header_valid, &aer->header_log);
+	populate_aer_err_info(&info, severity, aer_regs);
+	aer_print_error(pdev, &info);
 }
-EXPORT_SYMBOL_NS_GPL(pci_print_aer, "CXL");
+EXPORT_SYMBOL_NS_GPL(aer_print_platform_error, "CXL");
 
 /**
  * add_error_device - list device to be handled
@@ -1146,7 +1141,8 @@ static void aer_recover_work_func(struct work_struct *work)
 			       PCI_SLOT(entry.devfn), PCI_FUNC(entry.devfn));
 			continue;
 		}
-		pci_print_aer(pdev, entry.severity, entry.regs);
+
+		aer_print_platform_error(pdev, entry.severity, entry.regs);
 
 		/*
 		 * Memory for aer_capability_regs(entry.regs) is being
diff --git a/include/linux/aer.h b/include/linux/aer.h
index 02940be66324..5593352dfb51 100644
--- a/include/linux/aer.h
+++ b/include/linux/aer.h
@@ -64,8 +64,8 @@ static inline int pci_aer_clear_nonfatal_status(struct pci_dev *dev)
 static inline int pcie_aer_is_native(struct pci_dev *dev) { return 0; }
 #endif
 
-void pci_print_aer(struct pci_dev *dev, int aer_severity,
-		    struct aer_capability_regs *aer);
+void aer_print_platform_error(struct pci_dev *pdev, int severity,
+			      struct aer_capability_regs *aer_regs);
 int cper_severity_to_aer(int cper_severity);
 void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn,
 		       int severity, struct aer_capability_regs *aer_regs);
-- 
2.43.5
Re: [PATCH v2] PCI/AER: Consolidate CXL, ACPI GHES and native AER reporting paths
Posted by Karolina Stolarek 7 months, 3 weeks ago
Hi Bjorn,

On 25/03/2025 16:07, Karolina Stolarek wrote:
> Currently, CXL and GHES feature use pci_print_aer() function to
> log AER errors. Its implementation is pretty similar to aer_print_error(),
> duplicating the way how native PCIe devices report errors. We shouldn't
> log messages differently only because they are coming from a different
> code path.
> 
> Make CXL devices and GHES to call aer_print_error() when reporting
> AER errors. Add a wrapper, aer_print_platform_error(), that translates
> aer_capabilities_regs to aer_err_info so we can use pci_print_aer()
> function.
> 
> Signed-off-by: Karolina Stolarek <karolina.stolarek@oracle.com>
> ---
> v2:
>    - Don't expose aer_err_info to the world; as aer_recover_queue()
>      is tightly connected to the ghes code, introduce a wrapper for
>      aer_print_error()
>    - Move aer_err_info memset to the wrapper, don't expect the
>      caller to clean it for us
> 
>    I'm still working on the logs; in the meantime, I think, we can
>    continue reviewing the patch.

I wasn't able to produce logs for the CXL path (that is, Restricted CXL 
Device, as CXL1.1 devices not supported by the driver due to a missing 
functionality; confirmed by Terry) and faced issues when trying to 
inject errors via GHES. Is the lack of logs a blocker for this patch? I 
tested other CXL scenarios and my changes didn't cause regression, as 
far as I know.

All the best,
Karolina

> 
>   drivers/cxl/core/pci.c |  2 +-
>   drivers/pci/pcie/aer.c | 64 ++++++++++++++++++++----------------------
>   include/linux/aer.h    |  4 +--
>   3 files changed, 33 insertions(+), 37 deletions(-)
> 
> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> index 013b869b66cb..9ba711365388 100644
> --- a/drivers/cxl/core/pci.c
> +++ b/drivers/cxl/core/pci.c
> @@ -885,7 +885,7 @@ static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds)
>   	if (!cxl_rch_get_aer_severity(&aer_regs, &severity))
>   		return;
>   
> -	pci_print_aer(pdev, severity, &aer_regs);
> +	aer_print_platform_error(pdev, severity, &aer_regs);
>   
>   	if (severity == AER_CORRECTABLE)
>   		cxl_handle_rdport_cor_ras(cxlds, dport);
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index a1cf8c7ef628..ec34bc9b2332 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -760,47 +760,42 @@ int cper_severity_to_aer(int cper_severity)
>   EXPORT_SYMBOL_GPL(cper_severity_to_aer);
>   #endif
>   
> -void pci_print_aer(struct pci_dev *dev, int aer_severity,
> -		   struct aer_capability_regs *aer)
> +static void populate_aer_err_info(struct aer_err_info *info, int severity,
> +				  struct aer_capability_regs *aer_regs)
>   {
> -	int layer, agent, tlp_header_valid = 0;
> -	u32 status, mask;
> -	struct aer_err_info info;
> -
> -	if (aer_severity == AER_CORRECTABLE) {
> -		status = aer->cor_status;
> -		mask = aer->cor_mask;
> -	} else {
> -		status = aer->uncor_status;
> -		mask = aer->uncor_mask;
> -		tlp_header_valid = status & AER_LOG_TLP_MASKS;
> -	}
> -
> -	layer = AER_GET_LAYER_ERROR(aer_severity, status);
> -	agent = AER_GET_AGENT(aer_severity, status);
> +	int tlp_header_valid;
>   
>   	memset(&info, 0, sizeof(info));
> -	info.severity = aer_severity;
> -	info.status = status;
> -	info.mask = mask;
> -	info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
>   
> -	pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
> -	__aer_print_error(dev, &info);
> -	pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
> -		aer_error_layer[layer], aer_agent_string[agent]);
> +	info->severity = severity;
> +	info->first_error = PCI_ERR_CAP_FEP(aer_regs->cap_control);
>   
> -	if (aer_severity != AER_CORRECTABLE)
> -		pci_err(dev, "aer_uncor_severity: 0x%08x\n",
> -			aer->uncor_severity);
> +	if (severity == AER_CORRECTABLE) {
> +		info->id = aer_regs->cor_err_source;
> +		info->status = aer_regs->cor_status;
> +		info->mask = aer_regs->cor_mask;
> +	} else {
> +		info->id = aer_regs->uncor_err_source;
> +		info->status = aer_regs->uncor_status;
> +		info->mask = aer_regs->uncor_mask;
> +		tlp_header_valid = info->status & AER_LOG_TLP_MASKS;
> +
> +		if (tlp_header_valid) {
> +			info->tlp_header_valid = tlp_header_valid;
> +			info->tlp = aer_regs->header_log;
> +		}
> +	}
> +}
>   
> -	if (tlp_header_valid)
> -		pcie_print_tlp_log(dev, &aer->header_log, dev_fmt("  "));
> +void aer_print_platform_error(struct pci_dev *pdev, int severity,
> +			      struct aer_capability_regs *aer_regs)
> +{
> +	struct aer_err_info info;
>   
> -	trace_aer_event(dev_name(&dev->dev), (status & ~mask),
> -			aer_severity, tlp_header_valid, &aer->header_log);
> +	populate_aer_err_info(&info, severity, aer_regs);
> +	aer_print_error(pdev, &info);
>   }
> -EXPORT_SYMBOL_NS_GPL(pci_print_aer, "CXL");
> +EXPORT_SYMBOL_NS_GPL(aer_print_platform_error, "CXL");
>   
>   /**
>    * add_error_device - list device to be handled
> @@ -1146,7 +1141,8 @@ static void aer_recover_work_func(struct work_struct *work)
>   			       PCI_SLOT(entry.devfn), PCI_FUNC(entry.devfn));
>   			continue;
>   		}
> -		pci_print_aer(pdev, entry.severity, entry.regs);
> +
> +		aer_print_platform_error(pdev, entry.severity, entry.regs);
>   
>   		/*
>   		 * Memory for aer_capability_regs(entry.regs) is being
> diff --git a/include/linux/aer.h b/include/linux/aer.h
> index 02940be66324..5593352dfb51 100644
> --- a/include/linux/aer.h
> +++ b/include/linux/aer.h
> @@ -64,8 +64,8 @@ static inline int pci_aer_clear_nonfatal_status(struct pci_dev *dev)
>   static inline int pcie_aer_is_native(struct pci_dev *dev) { return 0; }
>   #endif
>   
> -void pci_print_aer(struct pci_dev *dev, int aer_severity,
> -		    struct aer_capability_regs *aer);
> +void aer_print_platform_error(struct pci_dev *pdev, int severity,
> +			      struct aer_capability_regs *aer_regs);
>   int cper_severity_to_aer(int cper_severity);
>   void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn,
>   		       int severity, struct aer_capability_regs *aer_regs);
Re: [PATCH v2] PCI/AER: Consolidate CXL, ACPI GHES and native AER reporting paths
Posted by Bjorn Helgaas 7 months, 3 weeks ago
On Wed, Apr 23, 2025 at 03:52:27PM +0200, Karolina Stolarek wrote:
> On 25/03/2025 16:07, Karolina Stolarek wrote:
> > Currently, CXL and GHES feature use pci_print_aer() function to
> > log AER errors. Its implementation is pretty similar to aer_print_error(),
> > duplicating the way how native PCIe devices report errors. We shouldn't
> > log messages differently only because they are coming from a different
> > code path.
> > 
> > Make CXL devices and GHES to call aer_print_error() when reporting
> > AER errors. Add a wrapper, aer_print_platform_error(), that translates
> > aer_capabilities_regs to aer_err_info so we can use pci_print_aer()
> > function.
> > 
> > Signed-off-by: Karolina Stolarek <karolina.stolarek@oracle.com>
> > ---
> > v2:
> >    - Don't expose aer_err_info to the world; as aer_recover_queue()
> >      is tightly connected to the ghes code, introduce a wrapper for
> >      aer_print_error()
> >    - Move aer_err_info memset to the wrapper, don't expect the
> >      caller to clean it for us
> > 
> >    I'm still working on the logs; in the meantime, I think, we can
> >    continue reviewing the patch.
> 
> I wasn't able to produce logs for the CXL path (that is, Restricted CXL
> Device, as CXL1.1 devices not supported by the driver due to a missing
> functionality; confirmed by Terry) and faced issues when trying to inject
> errors via GHES. Is the lack of logs a blocker for this patch? I tested
> other CXL scenarios and my changes didn't cause regression, as far as I
> know.

Yes, I do think we need to say something about the output format
changes.

I assume you're trying GHES on machines that actually do
firmware-first error handling, right?  I found several GHES logs by
searching the web for "APEI Generic Hardware Error Source" "PCIe
error".  The majority were Dell boxes.

If you can't produce actual logs for comparison, I think we can take
info from a sample log somebody has posted and synthesize what the
changes would be after this patch.

> >   drivers/cxl/core/pci.c |  2 +-
> >   drivers/pci/pcie/aer.c | 64 ++++++++++++++++++++----------------------
> >   include/linux/aer.h    |  4 +--
> >   3 files changed, 33 insertions(+), 37 deletions(-)
> > 
> > diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> > index 013b869b66cb..9ba711365388 100644
> > --- a/drivers/cxl/core/pci.c
> > +++ b/drivers/cxl/core/pci.c
> > @@ -885,7 +885,7 @@ static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds)
> >   	if (!cxl_rch_get_aer_severity(&aer_regs, &severity))
> >   		return;
> > -	pci_print_aer(pdev, severity, &aer_regs);
> > +	aer_print_platform_error(pdev, severity, &aer_regs);
> >   	if (severity == AER_CORRECTABLE)
> >   		cxl_handle_rdport_cor_ras(cxlds, dport);
> > diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> > index a1cf8c7ef628..ec34bc9b2332 100644
> > --- a/drivers/pci/pcie/aer.c
> > +++ b/drivers/pci/pcie/aer.c
> > @@ -760,47 +760,42 @@ int cper_severity_to_aer(int cper_severity)
> >   EXPORT_SYMBOL_GPL(cper_severity_to_aer);
> >   #endif
> > -void pci_print_aer(struct pci_dev *dev, int aer_severity,
> > -		   struct aer_capability_regs *aer)
> > +static void populate_aer_err_info(struct aer_err_info *info, int severity,
> > +				  struct aer_capability_regs *aer_regs)
> >   {
> > -	int layer, agent, tlp_header_valid = 0;
> > -	u32 status, mask;
> > -	struct aer_err_info info;
> > -
> > -	if (aer_severity == AER_CORRECTABLE) {
> > -		status = aer->cor_status;
> > -		mask = aer->cor_mask;
> > -	} else {
> > -		status = aer->uncor_status;
> > -		mask = aer->uncor_mask;
> > -		tlp_header_valid = status & AER_LOG_TLP_MASKS;
> > -	}
> > -
> > -	layer = AER_GET_LAYER_ERROR(aer_severity, status);
> > -	agent = AER_GET_AGENT(aer_severity, status);
> > +	int tlp_header_valid;
> >   	memset(&info, 0, sizeof(info));
> > -	info.severity = aer_severity;
> > -	info.status = status;
> > -	info.mask = mask;
> > -	info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
> > -	pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
> > -	__aer_print_error(dev, &info);
> > -	pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
> > -		aer_error_layer[layer], aer_agent_string[agent]);
> > +	info->severity = severity;
> > +	info->first_error = PCI_ERR_CAP_FEP(aer_regs->cap_control);
> > -	if (aer_severity != AER_CORRECTABLE)
> > -		pci_err(dev, "aer_uncor_severity: 0x%08x\n",
> > -			aer->uncor_severity);
> > +	if (severity == AER_CORRECTABLE) {
> > +		info->id = aer_regs->cor_err_source;
> > +		info->status = aer_regs->cor_status;
> > +		info->mask = aer_regs->cor_mask;
> > +	} else {
> > +		info->id = aer_regs->uncor_err_source;
> > +		info->status = aer_regs->uncor_status;
> > +		info->mask = aer_regs->uncor_mask;
> > +		tlp_header_valid = info->status & AER_LOG_TLP_MASKS;
> > +
> > +		if (tlp_header_valid) {
> > +			info->tlp_header_valid = tlp_header_valid;
> > +			info->tlp = aer_regs->header_log;
> > +		}
> > +	}
> > +}
> > -	if (tlp_header_valid)
> > -		pcie_print_tlp_log(dev, &aer->header_log, dev_fmt("  "));
> > +void aer_print_platform_error(struct pci_dev *pdev, int severity,
> > +			      struct aer_capability_regs *aer_regs)
> > +{
> > +	struct aer_err_info info;
> > -	trace_aer_event(dev_name(&dev->dev), (status & ~mask),
> > -			aer_severity, tlp_header_valid, &aer->header_log);
> > +	populate_aer_err_info(&info, severity, aer_regs);
> > +	aer_print_error(pdev, &info);
> >   }
> > -EXPORT_SYMBOL_NS_GPL(pci_print_aer, "CXL");
> > +EXPORT_SYMBOL_NS_GPL(aer_print_platform_error, "CXL");
> >   /**
> >    * add_error_device - list device to be handled
> > @@ -1146,7 +1141,8 @@ static void aer_recover_work_func(struct work_struct *work)
> >   			       PCI_SLOT(entry.devfn), PCI_FUNC(entry.devfn));
> >   			continue;
> >   		}
> > -		pci_print_aer(pdev, entry.severity, entry.regs);
> > +
> > +		aer_print_platform_error(pdev, entry.severity, entry.regs);
> >   		/*
> >   		 * Memory for aer_capability_regs(entry.regs) is being
> > diff --git a/include/linux/aer.h b/include/linux/aer.h
> > index 02940be66324..5593352dfb51 100644
> > --- a/include/linux/aer.h
> > +++ b/include/linux/aer.h
> > @@ -64,8 +64,8 @@ static inline int pci_aer_clear_nonfatal_status(struct pci_dev *dev)
> >   static inline int pcie_aer_is_native(struct pci_dev *dev) { return 0; }
> >   #endif
> > -void pci_print_aer(struct pci_dev *dev, int aer_severity,
> > -		    struct aer_capability_regs *aer);
> > +void aer_print_platform_error(struct pci_dev *pdev, int severity,
> > +			      struct aer_capability_regs *aer_regs);
> >   int cper_severity_to_aer(int cper_severity);
> >   void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn,
> >   		       int severity, struct aer_capability_regs *aer_regs);
>
Re: [PATCH v2] PCI/AER: Consolidate CXL, ACPI GHES and native AER reporting paths
Posted by Karolina Stolarek 7 months, 3 weeks ago
On 23/04/2025 22:31, Bjorn Helgaas wrote:
> On Wed, Apr 23, 2025 at 03:52:27PM +0200, Karolina Stolarek wrote:
>>
>> I wasn't able to produce logs for the CXL path (that is, Restricted CXL
>> Device, as CXL1.1 devices not supported by the driver due to a missing
>> functionality; confirmed by Terry) and faced issues when trying to inject
>> errors via GHES. Is the lack of logs a blocker for this patch? I tested
>> other CXL scenarios and my changes didn't cause regression, as far as I
>> know.
> 
> Yes, I do think we need to say something about the output format
> changes.

I understand.

> I assume you're trying GHES on machines that actually do
> firmware-first error handling, right?  I found several GHES logs by
> searching the web for "APEI Generic Hardware Error Source" "PCIe
> error".  The majority were Dell boxes.

The only way to inject GHES errors I'm aware of is Mauro's patch for 
qemu[1], so I went down the virtualization path. As for working with the 
actual hardware, I'd need to ask around and learn more about the platform.

> If you can't produce actual logs for comparison, I think we can take
> info from a sample log somebody has posted and synthesize what the
> changes would be after this patch.

I also found some logs at some point, mostly from 2021 and 2023, but I 
felt bad about mocking up the messages and tried to produce actual logs. 
If I can't find a way to get this working in two weeks, I'll revisit 
this idea.

All the best,
Karolina

-------------------------------------------------------------
[1] - 
https://lore.kernel.org/lkml/76824dfc6bb5dd23a9f04607a907ac4ccf7cb147.1740653898.git.mchehab+huawei@kernel.org/
Re: [PATCH v2] PCI/AER: Consolidate CXL, ACPI GHES and native AER reporting paths
Posted by Bjorn Helgaas 7 months, 3 weeks ago
[+to Yijun @Dell in case there's some testing opportunity, thread at
https://lore.kernel.org/r/81c040d54209627de2d8b150822636b415834c7f.1742900213.git.karolina.stolarek@oracle.com]

On Thu, Apr 24, 2025 at 11:01:11AM +0200, Karolina Stolarek wrote:
> On 23/04/2025 22:31, Bjorn Helgaas wrote:
> > On Wed, Apr 23, 2025 at 03:52:27PM +0200, Karolina Stolarek wrote:
> > > 
> > > I wasn't able to produce logs for the CXL path (that is, Restricted CXL
> > > Device, as CXL1.1 devices not supported by the driver due to a missing
> > > functionality; confirmed by Terry) and faced issues when trying to inject
> > > errors via GHES. Is the lack of logs a blocker for this patch? I tested
> > > other CXL scenarios and my changes didn't cause regression, as far as I
> > > know.
> > 
> > Yes, I do think we need to say something about the output format
> > changes.
> 
> I understand.
> 
> > I assume you're trying GHES on machines that actually do
> > firmware-first error handling, right?  I found several GHES logs by
> > searching the web for "APEI Generic Hardware Error Source" "PCIe
> > error".  The majority were Dell boxes.
> 
> The only way to inject GHES errors I'm aware of is Mauro's patch for
> qemu[1], so I went down the virtualization path. As for working with the
> actual hardware, I'd need to ask around and learn more about the platform.

I'd be surprised if the qemu firmware supports firmware-first
handling, so I wouldn't expect to be able to exercise this path that
way.  I think there are some bits in HEST and similar tables that tell
us about this, e.g., ACPI r6.5, sec 18.3.2.4.

Unfortunately there are some typos in the spec (FIRMWARE_FIRST,
FIRMWAREFIRST in 18.4), so it's a little hard to find all the
references.

It's a long shot, but I added Yijun as a Dell contact that who might
have a pointer to someone who could possibly test GHES logging on a
Dell box with and without your patch so we could have a concrete
comparison of the dmesg log differences.

> > If you can't produce actual logs for comparison, I think we can take
> > info from a sample log somebody has posted and synthesize what the
> > changes would be after this patch.
> 
> I also found some logs at some point, mostly from 2021 and 2023, but I felt
> bad about mocking up the messages and tried to produce actual logs. If I
> can't find a way to get this working in two weeks, I'll revisit this idea.
> 
> All the best,
> Karolina
> 
> -------------------------------------------------------------
> [1] - https://lore.kernel.org/lkml/76824dfc6bb5dd23a9f04607a907ac4ccf7cb147.1740653898.git.mchehab+huawei@kernel.org/
Re: [PATCH v2] PCI/AER: Consolidate CXL, ACPI GHES and native AER reporting paths
Posted by Karolina Stolarek 7 months, 3 weeks ago
On 24/04/2025 19:28, Bjorn Helgaas wrote:
> [+to Yijun @Dell in case there's some testing opportunity, thread at
> https://lore.kernel.org/r/81c040d54209627de2d8b150822636b415834c7f.1742900213.git.karolina.stolarek@oracle.com]
> 
> On Thu, Apr 24, 2025 at 11:01:11AM +0200, Karolina Stolarek wrote:
 >>
>> The only way to inject GHES errors I'm aware of is Mauro's patch for
>> qemu[1], so I went down the virtualization path. As for working with the
>> actual hardware, I'd need to ask around and learn more about the platform.
> 
> I'd be surprised if the qemu firmware supports firmware-first
> handling, so I wouldn't expect to be able to exercise this path that
> way.  I think there are some bits in HEST and similar tables that tell
> us about this, e.g., ACPI r6.5, sec 18.3.2.4.

It's possible that some of the nuances of this escaped me. I decided to 
pick up the series, as I saw "PCI Express bus error injection via GHES" 
script and thought it might be useful.

> Unfortunately there are some typos in the spec (FIRMWARE_FIRST,
> FIRMWAREFIRST in 18.4), so it's a little hard to find all the
> references.

Thanks for the pointers, I'll take a look.

> It's a long shot, but I added Yijun as a Dell contact that who might
> have a pointer to someone who could possibly test GHES logging on a
> Dell box with and without your patch so we could have a concrete
> comparison of the dmesg log differences.

Thank you very much. Let's see, maybe we'll get lucky :)

All the best,
Karolina

> 
>>> If you can't produce actual logs for comparison, I think we can take
>>> info from a sample log somebody has posted and synthesize what the
>>> changes would be after this patch.
>>
>> I also found some logs at some point, mostly from 2021 and 2023, but I felt
>> bad about mocking up the messages and tried to produce actual logs. If I
>> can't find a way to get this working in two weeks, I'll revisit this idea.
>>
>> All the best,
>> Karolina
>>
>> -------------------------------------------------------------
>> [1] - https://lore.kernel.org/lkml/76824dfc6bb5dd23a9f04607a907ac4ccf7cb147.1740653898.git.mchehab+huawei@kernel.org/
Re: [PATCH v2] PCI/AER: Consolidate CXL, ACPI GHES and native AER reporting paths
Posted by Jonathan Cameron 7 months, 3 weeks ago
On Fri, 25 Apr 2025 12:32:10 +0200
Karolina Stolarek <karolina.stolarek@oracle.com> wrote:

> On 24/04/2025 19:28, Bjorn Helgaas wrote:
> > [+to Yijun @Dell in case there's some testing opportunity, thread at
> > https://lore.kernel.org/r/81c040d54209627de2d8b150822636b415834c7f.1742900213.git.karolina.stolarek@oracle.com]
> > 
> > On Thu, Apr 24, 2025 at 11:01:11AM +0200, Karolina Stolarek wrote:  
>  >>
> >> The only way to inject GHES errors I'm aware of is Mauro's patch for
> >> qemu[1], so I went down the virtualization path. As for working with the
> >> actual hardware, I'd need to ask around and learn more about the platform.  
> > 
> > I'd be surprised if the qemu firmware supports firmware-first
> > handling, so I wouldn't expect to be able to exercise this path that
> > way.  I think there are some bits in HEST and similar tables that tell
> > us about this, e.g., ACPI r6.5, sec 18.3.2.4.  
> 
> It's possible that some of the nuances of this escaped me. I decided to 
> pick up the series, as I saw "PCI Express bus error injection via GHES" 
> script and thought it might be useful.

With Mauro's series you can inject (on ARM64 virt) any CPER record you
like.  That doesn't synchronize the wider state of the system though
so may not exercise everything (PCI registers etc not updated as it
is only injecting the record).  Mostly it just works, as remarkably 
few error handlers actually take the state of the components on which
the error is reported into account.

The aim is specifically to allow exercising FW first error handling
paths because it's a pain to get real systems that have firmware to inject
the full range of what the kernel etc need to handle.

x86 support for emulated injection is a work in progress (more of a mess wrt
to the different ways the event signaling is handled than it is on arm64).

I did have an earlier version of that work wired up to the same
hooks as the native CXL error injection but I dropped it from my QEMU
CXL staging tree for now as it was a pain to rebase whilst Mauro was rapidly
revising the infrastructure.  I'll bring it back when I get time.

Jonathan

> 
> > Unfortunately there are some typos in the spec (FIRMWARE_FIRST,
> > FIRMWAREFIRST in 18.4), so it's a little hard to find all the
> > references.  
> 
> Thanks for the pointers, I'll take a look.
> 
> > It's a long shot, but I added Yijun as a Dell contact that who might
> > have a pointer to someone who could possibly test GHES logging on a
> > Dell box with and without your patch so we could have a concrete
> > comparison of the dmesg log differences.  
> 
> Thank you very much. Let's see, maybe we'll get lucky :)
> 
> All the best,
> Karolina
> 
> >   
> >>> If you can't produce actual logs for comparison, I think we can take
> >>> info from a sample log somebody has posted and synthesize what the
> >>> changes would be after this patch.  
> >>
> >> I also found some logs at some point, mostly from 2021 and 2023, but I felt
> >> bad about mocking up the messages and tried to produce actual logs. If I
> >> can't find a way to get this working in two weeks, I'll revisit this idea.
> >>
> >> All the best,
> >> Karolina
> >>
> >> -------------------------------------------------------------
> >> [1] - https://lore.kernel.org/lkml/76824dfc6bb5dd23a9f04607a907ac4ccf7cb147.1740653898.git.mchehab+huawei@kernel.org/  
> 
>
Re: [PATCH v2] PCI/AER: Consolidate CXL, ACPI GHES and native AER reporting paths
Posted by Karolina Stolarek 7 months, 3 weeks ago
On 25/04/2025 15:14, Jonathan Cameron wrote:
> On Fri, 25 Apr 2025 12:32:10 +0200
> Karolina Stolarek <karolina.stolarek@oracle.com> wrote:
>> 
>> It's possible that some of the nuances of this escaped me. I decided to
>> pick up the series, as I saw "PCI Express bus error injection via GHES"
>> script and thought it might be useful.
> 
> With Mauro's series you can inject (on ARM64 virt) any CPER record you
> like.  That doesn't synchronize the wider state of the system though
> so may not exercise everything (PCI registers etc not updated as it
> is only injecting the record).  Mostly it just works, as remarkably
> few error handlers actually take the state of the components on which
> the error is reported into account.

OK, that means even if we manage to inject a PCIe error, AER wouldn't be 
able to look up the Source ID and other values it needs to report an 
error, which is not quite the solution I was looking for.

> The aim is specifically to allow exercising FW first error handling
> paths because it's a pain to get real systems that have firmware to inject
> the full range of what the kernel etc need to handle.

Does this include PCIe errors? If so, that probably doesn't make sense 
to try to test my patch on an actual system?

> x86 support for emulated injection is a work in progress (more of a mess wrt
> to the different ways the event signaling is handled than it is on arm64).
> 
> I did have an earlier version of that work wired up to the same
> hooks as the native CXL error injection but I dropped it from my QEMU
> CXL staging tree for now as it was a pain to rebase whilst Mauro was rapidly
> revising the infrastructure.  I'll bring it back when I get time.

I understand, I saw some of your series while looking for ways to test 
my patch. Thank you very much for your work. As you can see, there are 
people actually looking forward to it :)


All the best,
Karolina

> 
> Jonathan
> 
>>
>>> Unfortunately there are some typos in the spec (FIRMWARE_FIRST,
>>> FIRMWAREFIRST in 18.4), so it's a little hard to find all the
>>> references.
>>
>> Thanks for the pointers, I'll take a look.
>>
>>> It's a long shot, but I added Yijun as a Dell contact that who might
>>> have a pointer to someone who could possibly test GHES logging on a
>>> Dell box with and without your patch so we could have a concrete
>>> comparison of the dmesg log differences.
>>
>> Thank you very much. Let's see, maybe we'll get lucky :)
>>
>> All the best,
>> Karolina
>>
>>>    
>>>>> If you can't produce actual logs for comparison, I think we can take
>>>>> info from a sample log somebody has posted and synthesize what the
>>>>> changes would be after this patch.
>>>>
>>>> I also found some logs at some point, mostly from 2021 and 2023, but I felt
>>>> bad about mocking up the messages and tried to produce actual logs. If I
>>>> can't find a way to get this working in two weeks, I'll revisit this idea.
>>>>
>>>> All the best,
>>>> Karolina
>>>>
>>>> -------------------------------------------------------------
>>>> [1] - https://lore.kernel.org/lkml/76824dfc6bb5dd23a9f04607a907ac4ccf7cb147.1740653898.git.mchehab+huawei@kernel.org/
>>
>>
>
Re: [PATCH v2] PCI/AER: Consolidate CXL, ACPI GHES and native AER reporting paths
Posted by Jonathan Cameron 7 months, 3 weeks ago
On Fri, 25 Apr 2025 16:12:26 +0200
Karolina Stolarek <karolina.stolarek@oracle.com> wrote:

> On 25/04/2025 15:14, Jonathan Cameron wrote:
> > On Fri, 25 Apr 2025 12:32:10 +0200
> > Karolina Stolarek <karolina.stolarek@oracle.com> wrote:  
> >> 
> >> It's possible that some of the nuances of this escaped me. I decided to
> >> pick up the series, as I saw "PCI Express bus error injection via GHES"
> >> script and thought it might be useful.  
> > 
> > With Mauro's series you can inject (on ARM64 virt) any CPER record you
> > like.  That doesn't synchronize the wider state of the system though
> > so may not exercise everything (PCI registers etc not updated as it
> > is only injecting the record).  Mostly it just works, as remarkably
> > few error handlers actually take the state of the components on which
> > the error is reported into account.  
> 
> OK, that means even if we manage to inject a PCIe error, AER wouldn't be 
> able to look up the Source ID and other values it needs to report an 
> error, which is not quite the solution I was looking for.

Isn't the source ID in the CPER record? (Device ID field) or do
you mean something else?

> 
> > The aim is specifically to allow exercising FW first error handling
> > paths because it's a pain to get real systems that have firmware to inject
> > the full range of what the kernel etc need to handle.  
> 
> Does this include PCIe errors? If so, that probably doesn't make sense 
> to try to test my patch on an actual system?

Ideally test it on a real system as well, but indeed the intent is to
allow testing of PCI errors on emulation.

> 
> > x86 support for emulated injection is a work in progress (more of a mess wrt
> > to the different ways the event signaling is handled than it is on arm64).
> > 
> > I did have an earlier version of that work wired up to the same
> > hooks as the native CXL error injection but I dropped it from my QEMU
> > CXL staging tree for now as it was a pain to rebase whilst Mauro was rapidly
> > revising the infrastructure.  I'll bring it back when I get time.  
> 
> I understand, I saw some of your series while looking for ways to test 
> my patch. Thank you very much for your work. As you can see, there are 
> people actually looking forward to it :)

Great!  I'll try and get back to wiring it all up again sometime soon.

Jonathan

> 
> 
> All the best,
> Karolina
> 
> > 
> > Jonathan
> >   
> >>  
> >>> Unfortunately there are some typos in the spec (FIRMWARE_FIRST,
> >>> FIRMWAREFIRST in 18.4), so it's a little hard to find all the
> >>> references.  
> >>
> >> Thanks for the pointers, I'll take a look.
> >>  
> >>> It's a long shot, but I added Yijun as a Dell contact that who might
> >>> have a pointer to someone who could possibly test GHES logging on a
> >>> Dell box with and without your patch so we could have a concrete
> >>> comparison of the dmesg log differences.  
> >>
> >> Thank you very much. Let's see, maybe we'll get lucky :)
> >>
> >> All the best,
> >> Karolina
> >>  
> >>>      
> >>>>> If you can't produce actual logs for comparison, I think we can take
> >>>>> info from a sample log somebody has posted and synthesize what the
> >>>>> changes would be after this patch.  
> >>>>
> >>>> I also found some logs at some point, mostly from 2021 and 2023, but I felt
> >>>> bad about mocking up the messages and tried to produce actual logs. If I
> >>>> can't find a way to get this working in two weeks, I'll revisit this idea.
> >>>>
> >>>> All the best,
> >>>> Karolina
> >>>>
> >>>> -------------------------------------------------------------
> >>>> [1] - https://lore.kernel.org/lkml/76824dfc6bb5dd23a9f04607a907ac4ccf7cb147.1740653898.git.mchehab+huawei@kernel.org/  
> >>
> >>  
> >   
> 
>
Re: [PATCH v2] PCI/AER: Consolidate CXL, ACPI GHES and native AER reporting paths
Posted by Karolina Stolarek 7 months, 2 weeks ago
On 29/04/2025 17:54, Jonathan Cameron wrote:
> On Fri, 25 Apr 2025 16:12:26 +0200
> Karolina Stolarek <karolina.stolarek@oracle.com> wrote:
>>
>> OK, that means even if we manage to inject a PCIe error, AER wouldn't be
>> able to look up the Source ID and other values it needs to report an
>> error, which is not quite the solution I was looking for.
> 
> Isn't the source ID in the CPER record? (Device ID field) or do
> you mean something else?

Ah, sorry, I got confused on the way. I meant that even if we have the 
Device ID in CPER set, the specific device has no data in aer_regs if we 
inject an error using the GHES error injection script. We probably would 
end up with !info->status in aer_print_error(), thus printing only a 
line about "Inaccessible" agent and return early.

>>> The aim is specifically to allow exercising FW first error handling
>>> paths because it's a pain to get real systems that have firmware to inject
>>> the full range of what the kernel etc need to handle.
>>
>> Does this include PCIe errors? If so, that probably doesn't make sense
>> to try to test my patch on an actual system?
> 
> Ideally test it on a real system as well, but indeed the intent is to
> allow testing of PCI errors on emulation.

I understand. Do you have pointers on how to inject it on a real system? 
All info I could find about FW error injection pointed to the qemu 
scripts I mentioned.

>>> x86 support for emulated injection is a work in progress (more of a mess wrt
>>> to the different ways the event signaling is handled than it is on arm64).
>>>
>>> I did have an earlier version of that work wired up to the same
>>> hooks as the native CXL error injection but I dropped it from my QEMU
>>> CXL staging tree for now as it was a pain to rebase whilst Mauro was rapidly
>>> revising the infrastructure.  I'll bring it back when I get time.
>>
>> I understand, I saw some of your series while looking for ways to test
>> my patch. Thank you very much for your work. As you can see, there are
>> people actually looking forward to it :)
> 
> Great!  I'll try and get back to wiring it all up again sometime soon.

Awesome, thanks.

Bjorn, is this patch blocking the ratelimiting series? Would it be 
acceptable to use public logs in the commit message? I'm asking because 
it looks like there's no easy way to trigger the GHES path, or it would 
take some time, further delaying the ratelimiting work.

All the best,
Karolina

> 
> Jonathan
>
Re: [PATCH v2] PCI/AER: Consolidate CXL, ACPI GHES and native AER reporting paths
Posted by Jonathan Cameron 7 months, 2 weeks ago
On Mon, 5 May 2025 11:58:25 +0200
Karolina Stolarek <karolina.stolarek@oracle.com> wrote:

> On 29/04/2025 17:54, Jonathan Cameron wrote:
> > On Fri, 25 Apr 2025 16:12:26 +0200
> > Karolina Stolarek <karolina.stolarek@oracle.com> wrote:  
> >>
> >> OK, that means even if we manage to inject a PCIe error, AER wouldn't be
> >> able to look up the Source ID and other values it needs to report an
> >> error, which is not quite the solution I was looking for.  
> > 
> > Isn't the source ID in the CPER record? (Device ID field) or do
> > you mean something else?  
> 
> Ah, sorry, I got confused on the way. I meant that even if we have the 
> Device ID in CPER set, the specific device has no data in aer_regs if we 
> inject an error using the GHES error injection script. We probably would 
> end up with !info->status in aer_print_error(), thus printing only a 
> line about "Inaccessible" agent and return early.

If you were feeling creative with scripts you might be able to make this
work today...  Qemu does allow native aer injection via pcie_aer_inject_error
which will fill in the stuff in the device and 'try' to trigger an interrupt.
That last bit will fail (I think) if we are doing fw first handling.
(you might need to just prevent the interrupt generation in a similar fashion
to this code did here:

https://gitlab.com/jic23/qemu/-/commit/ce801e4d5b5cc5417cc7c7e5ecdaaa2ca5d6efe3#8eeec1fb38fa7149cc37b7a56dc193d69281ee96_704_708

At that point if you were to inject GHES error using Mauro's stuff it will work
and find that pre injected hardware info.

If not we need a refresh of that patch to hook up record generation with
Mauro's new handling. That's what I plan to get to but will be a while yet.

J



> 
> >>> The aim is specifically to allow exercising FW first error handling
> >>> paths because it's a pain to get real systems that have firmware to inject
> >>> the full range of what the kernel etc need to handle.  
> >>
> >> Does this include PCIe errors? If so, that probably doesn't make sense
> >> to try to test my patch on an actual system?  
> > 
> > Ideally test it on a real system as well, but indeed the intent is to
> > allow testing of PCI errors on emulation.  
> 
> I understand. Do you have pointers on how to inject it on a real system? 
> All info I could find about FW error injection pointed to the qemu 
> scripts I mentioned.

Sorry no.  It maybe system specific and disabled on production bios.

> 
> >>> x86 support for emulated injection is a work in progress (more of a mess wrt
> >>> to the different ways the event signaling is handled than it is on arm64).
> >>>
> >>> I did have an earlier version of that work wired up to the same
> >>> hooks as the native CXL error injection but I dropped it from my QEMU
> >>> CXL staging tree for now as it was a pain to rebase whilst Mauro was rapidly
> >>> revising the infrastructure.  I'll bring it back when I get time.  
> >>
> >> I understand, I saw some of your series while looking for ways to test
> >> my patch. Thank you very much for your work. As you can see, there are
> >> people actually looking forward to it :)  
> > 
> > Great!  I'll try and get back to wiring it all up again sometime soon.  
> 
> Awesome, thanks.
> 
> Bjorn, is this patch blocking the ratelimiting series? Would it be 
> acceptable to use public logs in the commit message? I'm asking because 
> it looks like there's no easy way to trigger the GHES path, or it would 
> take some time, further delaying the ratelimiting work.
> 
> All the best,
> Karolina
> 
> > 
> > Jonathan
> >   
>
Re: [PATCH v2] PCI/AER: Consolidate CXL, ACPI GHES and native AER reporting paths
Posted by Bjorn Helgaas 7 months, 2 weeks ago
On Mon, May 05, 2025 at 11:58:25AM +0200, Karolina Stolarek wrote:
> On 29/04/2025 17:54, Jonathan Cameron wrote:
> > On Fri, 25 Apr 2025 16:12:26 +0200
> > Karolina Stolarek <karolina.stolarek@oracle.com> wrote:
> ...

> Bjorn, is this patch blocking the ratelimiting series? Would it be
> acceptable to use public logs in the commit message? I'm asking because it
> looks like there's no easy way to trigger the GHES path, or it would take
> some time, further delaying the ratelimiting work.

Nope, I think ratelimiting is more urgent, so I'm going to push harder
on that.  If we can do both this cycle, so much the better.
Re: [PATCH v2] PCI/AER: Consolidate CXL, ACPI GHES and native AER reporting paths
Posted by Jon Pan-Doh 8 months, 2 weeks ago
On Tue, Mar 25, 2025 at 8:08 AM Karolina Stolarek
<karolina.stolarek@oracle.com> wrote:
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index a1cf8c7ef628..ec34bc9b2332 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -760,47 +760,42 @@ int cper_severity_to_aer(int cper_severity)
>  EXPORT_SYMBOL_GPL(cper_severity_to_aer);
>  #endif
>
> -void pci_print_aer(struct pci_dev *dev, int aer_severity,
> -                  struct aer_capability_regs *aer)
> +static void populate_aer_err_info(struct aer_err_info *info, int severity,
> +                                 struct aer_capability_regs *aer_regs)
>  {
> [...]
> +       if (severity == AER_CORRECTABLE) {
> +               info->id = aer_regs->cor_err_source;
> +               info->status = aer_regs->cor_status;
> +               info->mask = aer_regs->cor_mask;
> +       } else {
> +               info->id = aer_regs->uncor_err_source;

info->id isn't filled in pci_print_aer(). Is the addition due to
aer_print_error() having a conditional/log for info->id == id? From a
brief look at the code, FW-first (e.g. errors from GHES), I think it
will always be true. However, that doesn't always seem to be the case
for CXL (e.g. when cxl_dev_state.rcd == true).

Disclaimer: not a CXL/GHES expert though.

> +void aer_print_platform_error(struct pci_dev *pdev, int severity,
> +                             struct aer_capability_regs *aer_regs)

I like the encapsulation.

Reviewed-by: Jon Pan-Doh <pandoh@google.com>

Thanks,
Jon
Re: [PATCH v2] PCI/AER: Consolidate CXL, ACPI GHES and native AER reporting paths
Posted by Karolina Stolarek 8 months, 2 weeks ago
On 01/04/2025 03:47, Jon Pan-Doh wrote:
> On Tue, Mar 25, 2025 at 8:08 AM Karolina Stolarek
> <karolina.stolarek@oracle.com> wrote:
>>
>> +static void populate_aer_err_info(struct aer_err_info *info, int severity,
>> +                                 struct aer_capability_regs *aer_regs)
>>   {
>> [...]
>> +       if (severity == AER_CORRECTABLE) {
>> +               info->id = aer_regs->cor_err_source;
>> +               info->status = aer_regs->cor_status;
>> +               info->mask = aer_regs->cor_mask;
>> +       } else {
>> +               info->id = aer_regs->uncor_err_source;
> 
> info->id isn't filled in pci_print_aer(). Is the addition due to
> aer_print_error() having a conditional/log for info->id == id? From a
> brief look at the code, FW-first (e.g. errors from GHES), I think it
> will always be true. However, that doesn't always seem to be the case
> for CXL (e.g. when cxl_dev_state.rcd == true).

I just went with what aer_print_error() expected to get from the 
aer_err_info struct. It's possible that this id check is not be needed 
for GHES, but I thought that as the values are already in the regs, it 
wouldn't hurt to extract them.

> 
> Disclaimer: not a CXL/GHES expert though.
> 
>> +void aer_print_platform_error(struct pci_dev *pdev, int severity,
>> +                             struct aer_capability_regs *aer_regs)
> 
> I like the encapsulation.
> 
> Reviewed-by: Jon Pan-Doh <pandoh@google.com>

Thanks a lot!

All the best,
Karolina

> 
> Thanks,
> Jon