From: Jon Pan-Doh <pandoh@google.com>

Spammy devices can flood kernel logs with AER errors and slow/stall
execution. Add per-device ratelimits for AER correctable and uncorrectable
errors that use the kernel defaults (10 per 5s).

There are two AER logging entry points:

- aer_print_error() is used by DPC and native AER

- pci_print_aer() is used by GHES and CXL

The native AER aer_print_error() case includes a loop that may log details
from multiple devices. This is ratelimited by the union of ratelimits for
these devices, set by add_error_device(), which collects the devices. If
no such device is found, the Error Source message is ratelimited by the
Root Port or RCEC that received the ERR_* message.

The DPC aer_print_error() case is currently not ratelimited.

The GHES and CXL pci_print_aer() cases are ratelimited by the Error Source
device.

Sargun at Meta reported internally that a flood of AER errors causes RCU
CPU stall warnings and CSD-lock warnings.

Tested using aer-inject[1]. Sent 11 AER errors. Observed 10 errors logged
while AER stats (cat /sys/bus/pci/devices/<dev>/aer_dev_correctable) show
true count of 11.

[1] https://git.kernel.org/pub/scm/linux/kernel/git/gong.chen/aer-inject.git

[bhelgaas: commit log, factor out trace_aer_event() and aer_print_rp_info()
changes to previous patches, collect single aer_err_info.ratelimit as union
of ratelimits of all error source devices]
Link: https://lore.kernel.org/r/20250321015806.954866-7-pandoh@google.com
Reported-by: Sargun Dhillon <sargun@meta.com>
Signed-off-by: Jon Pan-Doh <pandoh@google.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/pci.h      |  3 ++-
 drivers/pci/pcie/aer.c | 49 ++++++++++++++++++++++++++++++++++++------
 drivers/pci/pcie/dpc.c |  1 +
 3 files changed, 46 insertions(+), 7 deletions(-)

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 705f9ef58acc..65c466279ade 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -593,7 +593,8 @@ struct aer_err_info {
unsigned int id:16;
unsigned int severity:2; /* 0:NONFATAL | 1:FATAL | 2:COR */
- unsigned int __pad1:5;
+ unsigned int ratelimit:1; /* 0=skip, 1=print */
+ unsigned int __pad1:4;
unsigned int multi_error_valid:1;
unsigned int first_error:5;
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index da62032bf024..c335e0bb9f51 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -28,6 +28,7 @@
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/kfifo.h>
+#include <linux/ratelimit.h>
#include <linux/slab.h>
#include <acpi/apei.h>
#include <acpi/ghes.h>
@@ -88,6 +89,10 @@ struct aer_report {
u64 rootport_total_cor_errs;
u64 rootport_total_fatal_errs;
u64 rootport_total_nonfatal_errs;
+
+ /* Ratelimits for errors */
+ struct ratelimit_state cor_log_ratelimit;
+ struct ratelimit_state uncor_log_ratelimit;
};
#define AER_LOG_TLP_MASKS (PCI_ERR_UNC_POISON_TLP| \
@@ -379,6 +384,11 @@ void pci_aer_init(struct pci_dev *dev)
dev->aer_report = kzalloc(sizeof(*dev->aer_report), GFP_KERNEL);
+ ratelimit_state_init(&dev->aer_report->cor_log_ratelimit,
+ DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
+ ratelimit_state_init(&dev->aer_report->uncor_log_ratelimit,
+ DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
+
/*
* We save/restore PCI_ERR_UNCOR_MASK, PCI_ERR_UNCOR_SEVER,
* PCI_ERR_COR_MASK, and PCI_ERR_CAP. Root and Root Complex Event
@@ -672,6 +682,18 @@ static void pci_rootport_aer_stats_incr(struct pci_dev *pdev,
}
}
+static int aer_ratelimit(struct pci_dev *dev, unsigned int severity)
+{
+ struct ratelimit_state *ratelimit;
+
+ if (severity == AER_CORRECTABLE)
+ ratelimit = &dev->aer_report->cor_log_ratelimit;
+ else
+ ratelimit = &dev->aer_report->uncor_log_ratelimit;
+
+ return __ratelimit(ratelimit);
+}
+
static void __aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
{
const char **strings;
@@ -715,6 +737,9 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
pci_dev_aer_stats_incr(dev, info);
+ if (!info->ratelimit)
+ return;
+
if (!info->status) {
pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
aer_error_severity_string[info->severity]);
@@ -785,6 +810,9 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
pci_dev_aer_stats_incr(dev, &info);
+ if (!aer_ratelimit(dev, info.severity))
+ return;
+
layer = AER_GET_LAYER_ERROR(aer_severity, status);
agent = AER_GET_AGENT(aer_severity, status);
@@ -815,8 +843,14 @@ EXPORT_SYMBOL_NS_GPL(pci_print_aer, "CXL");
*/
static int add_error_device(struct aer_err_info *e_info, struct pci_dev *dev)
{
+ /*
+ * Ratelimit AER log messages. Generally we add the Error Source
+ * device, but there are is_error_source() cases that can result in
+ * multiple devices being added here, so we OR them all together.
+ */
if (e_info->error_dev_num < AER_MAX_MULTI_ERR_DEVICES) {
e_info->dev[e_info->error_dev_num] = pci_dev_get(dev);
+ e_info->ratelimit |= aer_ratelimit(dev, e_info->severity);
e_info->error_dev_num++;
return 0;
}
@@ -914,7 +948,7 @@ static int find_device_iter(struct pci_dev *dev, void *data)
* e_info->error_dev_num and e_info->dev[], based on the given information.
*/
static bool find_source_device(struct pci_dev *parent,
- struct aer_err_info *e_info)
+ struct aer_err_info *e_info)
{
struct pci_dev *dev = parent;
int result;
@@ -935,10 +969,12 @@ static bool find_source_device(struct pci_dev *parent,
/*
* If we didn't find any devices with errors logged in the AER
* Capability, just print the Error Source ID from the Root Port or
- * RCEC that received an ERR_* Message.
+ * RCEC that received an ERR_* Message, ratelimited by the RP or
+ * RCEC.
*/
if (!e_info->error_dev_num) {
- aer_print_source(parent, e_info, " (no details found)");
+ if (aer_ratelimit(parent, e_info->severity))
+ aer_print_source(parent, e_info, " (no details found)");
return false;
}
return true;
@@ -1147,9 +1183,10 @@ static void aer_recover_work_func(struct work_struct *work)
pdev = pci_get_domain_bus_and_slot(entry.domain, entry.bus,
entry.devfn);
if (!pdev) {
- pr_err("no pci_dev for %04x:%02x:%02x.%x\n",
- entry.domain, entry.bus,
- PCI_SLOT(entry.devfn), PCI_FUNC(entry.devfn));
+ pr_err_ratelimited("%04x:%02x:%02x.%x: no pci_dev found\n",
+ entry.domain, entry.bus,
+ PCI_SLOT(entry.devfn),
+ PCI_FUNC(entry.devfn));
continue;
}
pci_print_aer(pdev, entry.severity, entry.regs);
diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index 34af0ea45c0d..597df7790f36 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -301,6 +301,7 @@ void dpc_process_error(struct pci_dev *pdev)
else if (reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_UNCOR &&
dpc_get_aer_uncorrect_severity(pdev, &info) &&
aer_get_device_error_info(pdev, &info)) {
+ info.ratelimit = 1; /* no ratelimiting */
aer_print_error(pdev, &info);
pci_aer_clear_nonfatal_status(pdev);
pci_aer_clear_fatal_status(pdev);
--
2.43.0
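
As background for the "kernel defaults (10 per 5s)" mentioned in the commit
log above: those are DEFAULT_RATELIMIT_INTERVAL (5 * HZ) and
DEFAULT_RATELIMIT_BURST (10) from <linux/ratelimit.h>. Below is a minimal
sketch of how a ratelimit_state initialized with those defaults behaves;
the function and variable names other than the <linux/ratelimit.h> and
pci_err() helpers are purely illustrative, not part of the patch.

  #include <linux/ratelimit.h>
  #include <linux/pci.h>

  /* Allows at most 10 messages per 5-second interval, then suppresses */
  static DEFINE_RATELIMIT_STATE(example_rs, DEFAULT_RATELIMIT_INTERVAL,
                                DEFAULT_RATELIMIT_BURST);

  static void example_log_aer(struct pci_dev *dev)
  {
          /* __ratelimit() returns nonzero while the burst allowance remains */
          if (__ratelimit(&example_rs))
                  pci_err(dev, "example AER message\n");
  }

The patch applies the same idea per device and per severity class by
embedding two ratelimit_state instances in each device's aer_report.
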
On Mon, 19 May 2025, Bjorn Helgaas wrote:
> From: Jon Pan-Doh <pandoh@google.com>
>
> Spammy devices can flood kernel logs with AER errors and slow/stall
> execution. Add per-device ratelimits for AER correctable and uncorrectable
> errors that use the kernel defaults (10 per 5s).
>
> There are two AER logging entry points:
>
> - aer_print_error() is used by DPC and native AER
>
> - pci_print_aer() is used by GHES and CXL
>
> The native AER aer_print_error() case includes a loop that may log details
> from multiple devices. This is ratelimited by the union of ratelimits for
> these devices, set by add_error_device(), which collects the devices. If
> no such device is found, the Error Source message is ratelimited by the
> Root Port or RCEC that received the ERR_* message.
>
> The DPC aer_print_error() case is currently not ratelimited.
>
> The GHES and CXL pci_print_aer() cases are ratelimited by the Error Source
> device.
>
> Sargun at Meta reported internally that a flood of AER errors causes RCU
> CPU stall warnings and CSD-lock warnings.
>
> Tested using aer-inject[1]. Sent 11 AER errors. Observed 10 errors logged
> while AER stats (cat /sys/bus/pci/devices/<dev>/aer_dev_correctable) show
> true count of 11.
>
> [1] https://git.kernel.org/pub/scm/linux/kernel/git/gong.chen/aer-inject.git
>
> [bhelgaas: commit log, factor out trace_aer_event() and aer_print_rp_info()
> changes to previous patches, collect single aer_err_info.ratelimit as union
> of ratelimits of all error source devices]
> Link: https://lore.kernel.org/r/20250321015806.954866-7-pandoh@google.com
> Reported-by: Sargun Dhillon <sargun@meta.com>
> Signed-off-by: Jon Pan-Doh <pandoh@google.com>
> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
> ---
> drivers/pci/pci.h | 3 ++-
> drivers/pci/pcie/aer.c | 49 ++++++++++++++++++++++++++++++++++++------
> drivers/pci/pcie/dpc.c | 1 +
> 3 files changed, 46 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index 705f9ef58acc..65c466279ade 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -593,7 +593,8 @@ struct aer_err_info {
> unsigned int id:16;
>
> unsigned int severity:2; /* 0:NONFATAL | 1:FATAL | 2:COR */
> - unsigned int __pad1:5;
> + unsigned int ratelimit:1; /* 0=skip, 1=print */
> + unsigned int __pad1:4;
> unsigned int multi_error_valid:1;
>
> unsigned int first_error:5;
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index da62032bf024..c335e0bb9f51 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -28,6 +28,7 @@
> #include <linux/interrupt.h>
> #include <linux/delay.h>
> #include <linux/kfifo.h>
> +#include <linux/ratelimit.h>
> #include <linux/slab.h>
> #include <acpi/apei.h>
> #include <acpi/ghes.h>
> @@ -88,6 +89,10 @@ struct aer_report {
> u64 rootport_total_cor_errs;
> u64 rootport_total_fatal_errs;
> u64 rootport_total_nonfatal_errs;
> +
> + /* Ratelimits for errors */
> + struct ratelimit_state cor_log_ratelimit;
> + struct ratelimit_state uncor_log_ratelimit;
> };
>
> #define AER_LOG_TLP_MASKS (PCI_ERR_UNC_POISON_TLP| \
> @@ -379,6 +384,11 @@ void pci_aer_init(struct pci_dev *dev)
>
> dev->aer_report = kzalloc(sizeof(*dev->aer_report), GFP_KERNEL);
>
> + ratelimit_state_init(&dev->aer_report->cor_log_ratelimit,
> + DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
> + ratelimit_state_init(&dev->aer_report->uncor_log_ratelimit,
> + DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
> +
> /*
> * We save/restore PCI_ERR_UNCOR_MASK, PCI_ERR_UNCOR_SEVER,
> * PCI_ERR_COR_MASK, and PCI_ERR_CAP. Root and Root Complex Event
> @@ -672,6 +682,18 @@ static void pci_rootport_aer_stats_incr(struct pci_dev *pdev,
> }
> }
>
> +static int aer_ratelimit(struct pci_dev *dev, unsigned int severity)
> +{
> + struct ratelimit_state *ratelimit;
> +
> + if (severity == AER_CORRECTABLE)
> + ratelimit = &dev->aer_report->cor_log_ratelimit;
> + else
> + ratelimit = &dev->aer_report->uncor_log_ratelimit;
> +
> + return __ratelimit(ratelimit);
> +}
> +
> static void __aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
> {
> const char **strings;
> @@ -715,6 +737,9 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
>
> pci_dev_aer_stats_incr(dev, info);
>
> + if (!info->ratelimit)
> + return;
> +
> if (!info->status) {
> pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
> aer_error_severity_string[info->severity]);
> @@ -785,6 +810,9 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
>
> pci_dev_aer_stats_incr(dev, &info);
>
> + if (!aer_ratelimit(dev, info.severity))
> + return;
> +
> layer = AER_GET_LAYER_ERROR(aer_severity, status);
> agent = AER_GET_AGENT(aer_severity, status);
>
> @@ -815,8 +843,14 @@ EXPORT_SYMBOL_NS_GPL(pci_print_aer, "CXL");
> */
> static int add_error_device(struct aer_err_info *e_info, struct pci_dev *dev)
> {
> + /*
> + * Ratelimit AER log messages. Generally we add the Error Source
> + * device, but there are is_error_source() cases that can result in
> + * multiple devices being added here, so we OR them all together.
I can see the code uses OR ;-) but it wasn't helpful because this comment
didn't explain why at all. As this ratelimit thing is using reverse logic
to begin with, this is a very tricky bit.
Perhaps something less vague like:
... we ratelimit if all devices have reached their ratelimit.
Assuming that was the intention here? (I'm not sure.)
> + */
> if (e_info->error_dev_num < AER_MAX_MULTI_ERR_DEVICES) {
> e_info->dev[e_info->error_dev_num] = pci_dev_get(dev);
> + e_info->ratelimit |= aer_ratelimit(dev, e_info->severity);
> e_info->error_dev_num++;
> return 0;
> }
> @@ -914,7 +948,7 @@ static int find_device_iter(struct pci_dev *dev, void *data)
> * e_info->error_dev_num and e_info->dev[], based on the given information.
> */
> static bool find_source_device(struct pci_dev *parent,
> - struct aer_err_info *e_info)
> + struct aer_err_info *e_info)
> {
> struct pci_dev *dev = parent;
> int result;
> @@ -935,10 +969,12 @@ static bool find_source_device(struct pci_dev *parent,
> /*
> * If we didn't find any devices with errors logged in the AER
> * Capability, just print the Error Source ID from the Root Port or
> - * RCEC that received an ERR_* Message.
> + * RCEC that received an ERR_* Message, ratelimited by the RP or
> + * RCEC.
> */
> if (!e_info->error_dev_num) {
> - aer_print_source(parent, e_info, " (no details found)");
> + if (aer_ratelimit(parent, e_info->severity))
> + aer_print_source(parent, e_info, " (no details found)");
> return false;
> }
> return true;
> @@ -1147,9 +1183,10 @@ static void aer_recover_work_func(struct work_struct *work)
> pdev = pci_get_domain_bus_and_slot(entry.domain, entry.bus,
> entry.devfn);
> if (!pdev) {
> - pr_err("no pci_dev for %04x:%02x:%02x.%x\n",
> - entry.domain, entry.bus,
> - PCI_SLOT(entry.devfn), PCI_FUNC(entry.devfn));
> + pr_err_ratelimited("%04x:%02x:%02x.%x: no pci_dev found\n",
This case was not mentioned in the changelog.
> + entry.domain, entry.bus,
> + PCI_SLOT(entry.devfn),
> + PCI_FUNC(entry.devfn));
> continue;
> }
> pci_print_aer(pdev, entry.severity, entry.regs);
> diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
> index 34af0ea45c0d..597df7790f36 100644
> --- a/drivers/pci/pcie/dpc.c
> +++ b/drivers/pci/pcie/dpc.c
> @@ -301,6 +301,7 @@ void dpc_process_error(struct pci_dev *pdev)
> else if (reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_UNCOR &&
> dpc_get_aer_uncorrect_severity(pdev, &info) &&
> aer_get_device_error_info(pdev, &info)) {
> + info.ratelimit = 1; /* no ratelimiting */
> aer_print_error(pdev, &info);
> pci_aer_clear_nonfatal_status(pdev);
> pci_aer_clear_fatal_status(pdev);
>
--
i.
On Tue, May 20, 2025 at 02:55:32PM +0300, Ilpo Järvinen wrote:
> On Mon, 19 May 2025, Bjorn Helgaas wrote:
>
> > From: Jon Pan-Doh <pandoh@google.com>
> >
> > Spammy devices can flood kernel logs with AER errors and slow/stall
> > execution. Add per-device ratelimits for AER correctable and uncorrectable
> > errors that use the kernel defaults (10 per 5s).
> >
> > There are two AER logging entry points:
> >
> > - aer_print_error() is used by DPC and native AER
> >
> > - pci_print_aer() is used by GHES and CXL
> >
> > The native AER aer_print_error() case includes a loop that may log details
> > from multiple devices. This is ratelimited by the union of ratelimits for
> > these devices, set by add_error_device(), which collects the devices. If
> > no such device is found, the Error Source message is ratelimited by the
> > Root Port or RCEC that received the ERR_* message.
> >
> > The DPC aer_print_error() case is currently not ratelimited.
> >
> > The GHES and CXL pci_print_aer() cases are ratelimited by the Error Source
> > device.
> > static int add_error_device(struct aer_err_info *e_info, struct pci_dev *dev)
> > {
> > + /*
> > + * Ratelimit AER log messages. Generally we add the Error Source
> > + * device, but there are is_error_source() cases that can result in
> > + * multiple devices being added here, so we OR them all together.
>
> I can see the code uses OR ;-) but it wasn't helpful because this comment
> didn't explain why at all. As this ratelimit thing is using reverse logic
> to begin with, this is a very tricky bit.
>
> Perhaps something less vague like:
>
> ... we ratelimit if all devices have reached their ratelimit.
>
> Assuming that was the intention here? (I'm not sure.)
My intention was that if there's any downstream device that has an
unmasked error logged and it has not reached its ratelimit, we should
log messages for all devices with errors logged. Does something like
this help?
/*
* Ratelimit AER log messages. "dev" is either the source
* identified by the root's Error Source ID or it has an unmasked
* error logged in its own AER Capability. If any of these devices
* has not reached its ratelimit, log messages for all of them.
* Messages are emitted when e_info->ratelimit is non-zero.
*
* Note that e_info->ratelimit was already initialized to 1 for the
* ERR_FATAL case.
*/
The ERR_FATAL case is from this post-v6 change that I haven't posted
yet:
aer_isr_one_error(...)
{
...
if (status & PCI_ERR_ROOT_UNCOR_RCV) {
int fatal = status & PCI_ERR_ROOT_FATAL_RCV;
struct aer_err_info e_info = {
...
+ .ratelimit = fatal ? 1 : 0,
> > + */
> > if (e_info->error_dev_num < AER_MAX_MULTI_ERR_DEVICES) {
> > e_info->dev[e_info->error_dev_num] = pci_dev_get(dev);
> > + e_info->ratelimit |= aer_ratelimit(dev, e_info->severity);
> > e_info->error_dev_num++;
> > return 0;
> > }
> > @@ -1147,9 +1183,10 @@ static void aer_recover_work_func(struct work_struct *work)
> > pdev = pci_get_domain_bus_and_slot(entry.domain, entry.bus,
> > entry.devfn);
> > if (!pdev) {
> > - pr_err("no pci_dev for %04x:%02x:%02x.%x\n",
> > - entry.domain, entry.bus,
> > - PCI_SLOT(entry.devfn), PCI_FUNC(entry.devfn));
> > + pr_err_ratelimited("%04x:%02x:%02x.%x: no pci_dev found\n",
>
> This case was not mentioned in the changelog.
Sharp eyes! What do you think of this commit log text?
The CXL pci_print_aer() case is ratelimited by the Error Source device.
The GHES pci_print_aer() case is via aer_recover_work_func(), which
searches for the Error Source device. If the device is not found, there's
no per-device ratelimit, so we use a system-wide ratelimit that covers all
error types (correctable, non-fatal, and fatal).
This isn't really ideal because in pci_print_aer(), the struct
aer_capability_regs has already been filled by firmware and the
logging doesn't read any registers from the device at all.
However, pci_print_aer() *does* want the pci_dev for statistics and
tracing (pci_dev_aer_stats_incr()) and, of course, for the aer_printks
themselves.
We could leave this pr_err() completely alone; hopefully it's a rare
case. I think the CXL path just silently skips pci_print_aer() if
this happens.
Eventually I would really like the native AER path to start by doing
whatever firmware is doing, e.g., fill in struct aer_capability_regs,
so the core of the AER handling could be identical between native AER
and GHES/CXL. If we could do that, maybe we could figure out a
cleaner way to handle this corner case.
On Tue, 20 May 2025, Bjorn Helgaas wrote:
> On Tue, May 20, 2025 at 02:55:32PM +0300, Ilpo Järvinen wrote:
> > On Mon, 19 May 2025, Bjorn Helgaas wrote:
> >
> > > From: Jon Pan-Doh <pandoh@google.com>
> > >
> > > Spammy devices can flood kernel logs with AER errors and slow/stall
> > > execution. Add per-device ratelimits for AER correctable and uncorrectable
> > > errors that use the kernel defaults (10 per 5s).
> > >
> > > There are two AER logging entry points:
> > >
> > > - aer_print_error() is used by DPC and native AER
> > >
> > > - pci_print_aer() is used by GHES and CXL
> > >
> > > The native AER aer_print_error() case includes a loop that may log details
> > > from multiple devices. This is ratelimited by the union of ratelimits for
> > > these devices, set by add_error_device(), which collects the devices. If
> > > no such device is found, the Error Source message is ratelimited by the
> > > Root Port or RCEC that received the ERR_* message.
> > >
> > > The DPC aer_print_error() case is currently not ratelimited.
> > >
> > > The GHES and CXL pci_print_aer() cases are ratelimited by the Error Source
> > > device.
>
> > > static int add_error_device(struct aer_err_info *e_info, struct pci_dev *dev)
> > > {
> > > + /*
> > > + * Ratelimit AER log messages. Generally we add the Error Source
> > > + * device, but there are is_error_source() cases that can result in
> > > + * multiple devices being added here, so we OR them all together.
> >
> > I can see the code uses OR ;-) but it wasn't helpful because this comment
> > didn't explain why at all. As this ratelimit thing is using reverse logic
> > to begin with, this is a very tricky bit.
> >
> > Perhaps something less vague like:
> >
> > ... we ratelimit if all devices have reached their ratelimit.
> >
> > Assuming that was the intention here? (I'm not sure.)
>
> My intention was that if there's any downstream device that has an
> unmasked error logged and it has not reached its ratelimit, we should
> log messages for all devices with errors logged. Does something like
> this help?
>
> /*
> * Ratelimit AER log messages. "dev" is either the source
> * identified by the root's Error Source ID or it has an unmasked
> * error logged in its own AER Capability. If any of these devices
> * has not reached its ratelimit, log messages for all of them.
> * Messages are emitted when e_info->ratelimit is non-zero.
> *
> * Note that e_info->ratelimit was already initialized to 1 for the
> * ERR_FATAL case.
> */
Yes, this makes the intent much clearer, thanks.
> The ERR_FATAL case is from this post-v6 change that I haven't posted
> yet:
>
> aer_isr_one_error(...)
> {
> ...
> if (status & PCI_ERR_ROOT_UNCOR_RCV) {
> int fatal = status & PCI_ERR_ROOT_FATAL_RCV;
> struct aer_err_info e_info = {
> ...
> + .ratelimit = fatal ? 1 : 0,
>
>
> > > + */
> > > if (e_info->error_dev_num < AER_MAX_MULTI_ERR_DEVICES) {
> > > e_info->dev[e_info->error_dev_num] = pci_dev_get(dev);
> > > + e_info->ratelimit |= aer_ratelimit(dev, e_info->severity);
> > > e_info->error_dev_num++;
> > > return 0;
> > > }
>
> > > @@ -1147,9 +1183,10 @@ static void aer_recover_work_func(struct work_struct *work)
> > > pdev = pci_get_domain_bus_and_slot(entry.domain, entry.bus,
> > > entry.devfn);
> > > if (!pdev) {
> > > - pr_err("no pci_dev for %04x:%02x:%02x.%x\n",
> > > - entry.domain, entry.bus,
> > > - PCI_SLOT(entry.devfn), PCI_FUNC(entry.devfn));
> > > + pr_err_ratelimited("%04x:%02x:%02x.%x: no pci_dev found\n",
> >
> > This case was not mentioned in the changelog.
>
> Sharp eyes! What do you think of this commit log text?
>
> The CXL pci_print_aer() case is ratelimited by the Error Source device.
>
> The GHES pci_print_aer() case is via aer_recover_work_func(), which
> searches for the Error Source device. If the device is not found, there's
> no per-device ratelimit, so we use a system-wide ratelimit that covers all
> error types (correctable, non-fatal, and fatal).
Works for me as long as it is mentioned.
> This isn't really ideal because in pci_print_aer(), the struct
> aer_capability_regs has already been filled by firmware and the
> logging doesn't read any registers from the device at all.
>
> However, pci_print_aer() *does* want the pci_dev for statistics and
> tracing (pci_dev_aer_stats_incr()) and, of course, for the aer_printks
> themselves.
While not a perfect solution, this looks like yet another case where it
would help to create a dummy pci_dev struct with minimal setup, which
allows calling functions that take a pci_dev.

That solution is not perfect because it arms a trap: downstream functions
could get changed, and if the developer assumes they have a full pci_dev
at hand, it could cause issues with the dummy pci_dev. How likely that is
to happen is debatable, but for many cases where the call chain isn't
overly complex, such as here, a dummy pci_dev seems helpful.
> We could leave this pr_err() completely alone; hopefully it's a rare
> case. I think the CXL path just silently skips pci_print_aer() if
> this happens.
>
> Eventually I would really like the native AER path to start by doing
> whatever firmware is doing, e.g., fill in struct aer_capability_regs,
> so the core of the AER handling could be identical between native AER
> and GHES/CXL. If we could do that, maybe we could figure out a
> cleaner way to handle this corner case.
--
i.
Hi Bjorn,
On 5/19/25 2:35 PM, Bjorn Helgaas wrote:
> From: Jon Pan-Doh <pandoh@google.com>
>
> Spammy devices can flood kernel logs with AER errors and slow/stall
> execution. Add per-device ratelimits for AER correctable and uncorrectable
> errors that use the kernel defaults (10 per 5s).
>
> There are two AER logging entry points:
>
> - aer_print_error() is used by DPC and native AER
>
> - pci_print_aer() is used by GHES and CXL
>
> The native AER aer_print_error() case includes a loop that may log details
> from multiple devices. This is ratelimited by the union of ratelimits for
> these devices, set by add_error_device(), which collects the devices. If
> no such device is found, the Error Source message is ratelimited by the
> Root Port or RCEC that received the ERR_* message.
>
> The DPC aer_print_error() case is currently not ratelimited.
Can we also not rate limit fatal errors in AER driver?
>
> The GHES and CXL pci_print_aer() cases are ratelimited by the Error Source
> device.
>
> Sargun at Meta reported internally that a flood of AER errors causes RCU
> CPU stall warnings and CSD-lock warnings.
>
> Tested using aer-inject[1]. Sent 11 AER errors. Observed 10 errors logged
> while AER stats (cat /sys/bus/pci/devices/<dev>/aer_dev_correctable) show
> true count of 11.
>
> [1] https://git.kernel.org/pub/scm/linux/kernel/git/gong.chen/aer-inject.git
>
> [bhelgaas: commit log, factor out trace_aer_event() and aer_print_rp_info()
> changes to previous patches, collect single aer_err_info.ratelimit as union
> of ratelimits of all error source devices]
> Link: https://lore.kernel.org/r/20250321015806.954866-7-pandoh@google.com
> Reported-by: Sargun Dhillon <sargun@meta.com>
> Signed-off-by: Jon Pan-Doh <pandoh@google.com>
> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
> ---
> drivers/pci/pci.h | 3 ++-
> drivers/pci/pcie/aer.c | 49 ++++++++++++++++++++++++++++++++++++------
> drivers/pci/pcie/dpc.c | 1 +
> 3 files changed, 46 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index 705f9ef58acc..65c466279ade 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -593,7 +593,8 @@ struct aer_err_info {
> unsigned int id:16;
>
> unsigned int severity:2; /* 0:NONFATAL | 1:FATAL | 2:COR */
> - unsigned int __pad1:5;
> + unsigned int ratelimit:1; /* 0=skip, 1=print */
> + unsigned int __pad1:4;
> unsigned int multi_error_valid:1;
>
> unsigned int first_error:5;
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index da62032bf024..c335e0bb9f51 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -28,6 +28,7 @@
> #include <linux/interrupt.h>
> #include <linux/delay.h>
> #include <linux/kfifo.h>
> +#include <linux/ratelimit.h>
> #include <linux/slab.h>
> #include <acpi/apei.h>
> #include <acpi/ghes.h>
> @@ -88,6 +89,10 @@ struct aer_report {
> u64 rootport_total_cor_errs;
> u64 rootport_total_fatal_errs;
> u64 rootport_total_nonfatal_errs;
> +
> + /* Ratelimits for errors */
> + struct ratelimit_state cor_log_ratelimit;
> + struct ratelimit_state uncor_log_ratelimit;
> };
>
> #define AER_LOG_TLP_MASKS (PCI_ERR_UNC_POISON_TLP| \
> @@ -379,6 +384,11 @@ void pci_aer_init(struct pci_dev *dev)
>
> dev->aer_report = kzalloc(sizeof(*dev->aer_report), GFP_KERNEL);
>
> + ratelimit_state_init(&dev->aer_report->cor_log_ratelimit,
> + DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
> + ratelimit_state_init(&dev->aer_report->uncor_log_ratelimit,
> + DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
> +
> /*
> * We save/restore PCI_ERR_UNCOR_MASK, PCI_ERR_UNCOR_SEVER,
> * PCI_ERR_COR_MASK, and PCI_ERR_CAP. Root and Root Complex Event
> @@ -672,6 +682,18 @@ static void pci_rootport_aer_stats_incr(struct pci_dev *pdev,
> }
> }
>
> +static int aer_ratelimit(struct pci_dev *dev, unsigned int severity)
> +{
> + struct ratelimit_state *ratelimit;
> +
> + if (severity == AER_CORRECTABLE)
> + ratelimit = &dev->aer_report->cor_log_ratelimit;
> + else
> + ratelimit = &dev->aer_report->uncor_log_ratelimit;
> +
> + return __ratelimit(ratelimit);
> +}
> +
> static void __aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
> {
> const char **strings;
> @@ -715,6 +737,9 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
>
> pci_dev_aer_stats_incr(dev, info);
>
> + if (!info->ratelimit)
> + return;
> +
> if (!info->status) {
> pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
> aer_error_severity_string[info->severity]);
> @@ -785,6 +810,9 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
>
> pci_dev_aer_stats_incr(dev, &info);
>
> + if (!aer_ratelimit(dev, info.severity))
> + return;
> +
> layer = AER_GET_LAYER_ERROR(aer_severity, status);
> agent = AER_GET_AGENT(aer_severity, status);
>
> @@ -815,8 +843,14 @@ EXPORT_SYMBOL_NS_GPL(pci_print_aer, "CXL");
> */
> static int add_error_device(struct aer_err_info *e_info, struct pci_dev *dev)
> {
> + /*
> + * Ratelimit AER log messages. Generally we add the Error Source
> + * device, but there are is_error_source() cases that can result in
> + * multiple devices being added here, so we OR them all together.
> + */
> if (e_info->error_dev_num < AER_MAX_MULTI_ERR_DEVICES) {
> e_info->dev[e_info->error_dev_num] = pci_dev_get(dev);
> + e_info->ratelimit |= aer_ratelimit(dev, e_info->severity);
> e_info->error_dev_num++;
> return 0;
> }
> @@ -914,7 +948,7 @@ static int find_device_iter(struct pci_dev *dev, void *data)
> * e_info->error_dev_num and e_info->dev[], based on the given information.
> */
> static bool find_source_device(struct pci_dev *parent,
> - struct aer_err_info *e_info)
> + struct aer_err_info *e_info)
> {
> struct pci_dev *dev = parent;
> int result;
> @@ -935,10 +969,12 @@ static bool find_source_device(struct pci_dev *parent,
> /*
> * If we didn't find any devices with errors logged in the AER
> * Capability, just print the Error Source ID from the Root Port or
> - * RCEC that received an ERR_* Message.
> + * RCEC that received an ERR_* Message, ratelimited by the RP or
> + * RCEC.
> */
> if (!e_info->error_dev_num) {
> - aer_print_source(parent, e_info, " (no details found)");
> + if (aer_ratelimit(parent, e_info->severity))
> + aer_print_source(parent, e_info, " (no details found)");
> return false;
> }
> return true;
> @@ -1147,9 +1183,10 @@ static void aer_recover_work_func(struct work_struct *work)
> pdev = pci_get_domain_bus_and_slot(entry.domain, entry.bus,
> entry.devfn);
> if (!pdev) {
> - pr_err("no pci_dev for %04x:%02x:%02x.%x\n",
> - entry.domain, entry.bus,
> - PCI_SLOT(entry.devfn), PCI_FUNC(entry.devfn));
> + pr_err_ratelimited("%04x:%02x:%02x.%x: no pci_dev found\n",
> + entry.domain, entry.bus,
> + PCI_SLOT(entry.devfn),
> + PCI_FUNC(entry.devfn));
> continue;
> }
> pci_print_aer(pdev, entry.severity, entry.regs);
> diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
> index 34af0ea45c0d..597df7790f36 100644
> --- a/drivers/pci/pcie/dpc.c
> +++ b/drivers/pci/pcie/dpc.c
> @@ -301,6 +301,7 @@ void dpc_process_error(struct pci_dev *pdev)
> else if (reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_UNCOR &&
> dpc_get_aer_uncorrect_severity(pdev, &info) &&
> aer_get_device_error_info(pdev, &info)) {
> + info.ratelimit = 1; /* no ratelimiting */
> aer_print_error(pdev, &info);
> pci_aer_clear_nonfatal_status(pdev);
> pci_aer_clear_fatal_status(pdev);
--
Sathyanarayanan Kuppuswamy
Linux Kernel Developer
On Mon, May 19, 2025 at 09:59:29PM -0700, Sathyanarayanan Kuppuswamy wrote:
> On 5/19/25 2:35 PM, Bjorn Helgaas wrote:
> > From: Jon Pan-Doh <pandoh@google.com>
> >
> > Spammy devices can flood kernel logs with AER errors and slow/stall
> > execution. Add per-device ratelimits for AER correctable and uncorrectable
> > errors that use the kernel defaults (10 per 5s).
> >
> > There are two AER logging entry points:
> >
> > - aer_print_error() is used by DPC and native AER
> >
> > - pci_print_aer() is used by GHES and CXL
> >
> > The native AER aer_print_error() case includes a loop that may log details
> > from multiple devices. This is ratelimited by the union of ratelimits for
> > these devices, set by add_error_device(), which collects the devices. If
> > no such device is found, the Error Source message is ratelimited by the
> > Root Port or RCEC that received the ERR_* message.
> >
> > The DPC aer_print_error() case is currently not ratelimited.
>
> Can we also not rate limit fatal errors in AER driver?

In other words, only rate limit AER_CORRECTABLE and AER_NONFATAL for
AER? Seems plausible to me.
On 5/20/25 11:31 AM, Bjorn Helgaas wrote:
> On Mon, May 19, 2025 at 09:59:29PM -0700, Sathyanarayanan Kuppuswamy wrote:
>> On 5/19/25 2:35 PM, Bjorn Helgaas wrote:
>>> From: Jon Pan-Doh <pandoh@google.com>
>>>
>>> Spammy devices can flood kernel logs with AER errors and slow/stall
>>> execution. Add per-device ratelimits for AER correctable and uncorrectable
>>> errors that use the kernel defaults (10 per 5s).
>>>
>>> There are two AER logging entry points:
>>>
>>> - aer_print_error() is used by DPC and native AER
>>>
>>> - pci_print_aer() is used by GHES and CXL
>>>
>>> The native AER aer_print_error() case includes a loop that may log details
>>> from multiple devices. This is ratelimited by the union of ratelimits for
>>> these devices, set by add_error_device(), which collects the devices. If
>>> no such device is found, the Error Source message is ratelimited by the
>>> Root Port or RCEC that received the ERR_* message.
>>>
>>> The DPC aer_print_error() case is currently not ratelimited.
>>
>> Can we also not rate limit fatal errors in AER driver?
>
> In other words, only rate limit AER_CORRECTABLE and AER_NONFATAL for
> AER? Seems plausible to me.

Yes, we might lose important information by rate-limiting FATAL errors.
I believe FATAL errors should be infrequent, so it's reasonable to allow
them through without rate limiting.

Once you make this change, please also update the related SysFS
documentation and update code accordingly.

--
Sathyanarayanan Kuppuswamy
Linux Kernel Developer
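
A minimal sketch of what the change discussed above (not ratelimiting
fatal errors) could look like, building on the aer_ratelimit() helper
added by this patch; this is illustrative only and was not part of the
posted series:

  static int aer_ratelimit(struct pci_dev *dev, unsigned int severity)
  {
          /*
           * Illustrative: let fatal errors through unconditionally; they
           * should be rare and are the ones we least want to lose.
           */
          if (severity == AER_FATAL)
                  return 1;

          if (severity == AER_CORRECTABLE)
                  return __ratelimit(&dev->aer_report->cor_log_ratelimit);

          return __ratelimit(&dev->aer_report->uncor_log_ratelimit);
  }
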