[PATCH platform-next v4 2/2] platform/mellanox: mlxreg-hotplug: Enabling interrupt storm detection

Ciju Rajan K posted 2 patches 3 weeks, 4 days ago
[PATCH platform-next v4 2/2] platform/mellanox: mlxreg-hotplug: Enabling interrupt storm detection
Posted by Ciju Rajan K 3 weeks, 4 days ago
Enable the interrupt storm detection feature and add a per-device
counter for tracking faulty devices. Mask the faulty devices so that
they cannot generate any further interrupts.

Add fields for interrupt storm handling.
Extend structure mlxreg_core_data with the following field:
 'wmark_cntr'   - interrupt storm counter.

Extend structure mlxreg_core_item with the following field:
 'storming_bits' - interrupt storming bits mask.

Reviewed-by: Vadim Pasternak <vadimp@nvidia.com>
Signed-off-by: Ciju Rajan K <crajank@nvidia.com>
--
---
 drivers/platform/mellanox/mlxreg-hotplug.c | 74 +++++++++++++++++++++-
 include/linux/platform_data/mlxreg.h       |  4 ++
 2 files changed, 76 insertions(+), 2 deletions(-)

diff --git a/drivers/platform/mellanox/mlxreg-hotplug.c b/drivers/platform/mellanox/mlxreg-hotplug.c
index d246772aafd6..4752477207d4 100644
--- a/drivers/platform/mellanox/mlxreg-hotplug.c
+++ b/drivers/platform/mellanox/mlxreg-hotplug.c
@@ -30,6 +30,9 @@
 #define MLXREG_HOTPLUG_ATTRS_MAX	128
 #define MLXREG_HOTPLUG_NOT_ASSERT	3
 
+/* Interrupt storm frequency */
+#define MLXREG_HOTPLUG_INTR_FREQ_HZ	100
+
 /**
  * struct mlxreg_hotplug_priv_data - platform private data:
  * @irq: platform device interrupt number;
@@ -339,6 +342,57 @@ static int mlxreg_hotplug_attr_init(struct mlxreg_hotplug_priv_data *priv)
 	return 0;
 }
 
+/**
+ * mlxreg_hotplug_storm_handler - generic interrupt storm detection callback
+ * @irq: interrupt number experiencing storm
+ * @freq: detected frequency (interrupts per second)
+ * @dev_id: device data (mlxreg_hotplug_priv_data)
+ *
+ * Registered through irq_register_storm_detection(). Warn about the storm and
+ * walk the bits set in the cached status of the first platform data item: a
+ * bit whose counter reached MLXREG_HOTPLUG_INTR_FREQ_HZ - 1 is recorded in
+ * item->storming_bits. Counters of all visited bits are reset.
+ */
+static void mlxreg_hotplug_storm_handler(unsigned int irq, unsigned int freq, void *dev_id)
+{
+	struct mlxreg_hotplug_priv_data *priv = dev_id;
+	struct mlxreg_core_hotplug_platform_data *pdata;
+	struct mlxreg_core_item *item;
+	struct mlxreg_core_data *data;
+	unsigned long asserted;
+	u32 bit;
+
+	dev_warn(priv->dev,
+		 "Interrupt storm detected on IRQ %u (%u interrupts/sec)\n",
+		 irq, freq);
+
+	pdata = dev_get_platdata(&priv->pdev->dev);
+	item = pdata->items;
+	asserted = item->cache;
+
+	for_each_set_bit(bit, &asserted, 8) {
+		int pos;
+
+		pos = mlxreg_hotplug_item_label_index_get(item->mask, bit);
+		if (pos < 0) {
+			dev_err(priv->dev, "Failed to complete interrupt storm handler\n");
+			return;
+		}
+
+		data = item->data + pos;
+		/* Check per device interrupt counter */
+		if (data->wmark_cntr >= MLXREG_HOTPLUG_INTR_FREQ_HZ - 1) {
+			dev_err(priv->dev,
+				"Storming bit %u (label: %s) - interrupt masked permanently. Replace broken HW.\n",
+				bit, data->label);
+			/* Mark bit as storming. */
+			item->storming_bits |= BIT(bit);
+		}
+		/* Start counting from scratch after each storm report. */
+		data->wmark_cntr = 0;
+	}
+}
+
 static void
 mlxreg_hotplug_work_helper(struct mlxreg_hotplug_priv_data *priv,
 			   struct mlxreg_core_item *item)
@@ -371,6 +425,10 @@ mlxreg_hotplug_work_helper(struct mlxreg_hotplug_priv_data *priv,
 			goto out;
 
 		data = item->data + pos;
+
+		/* Counter to keep track of interrupt storm */
+		data->wmark_cntr++;
+
 		if (regval & BIT(bit)) {
 			if (item->inversed)
 				mlxreg_hotplug_device_destroy(priv, data, item->kind);
@@ -390,9 +448,9 @@ mlxreg_hotplug_work_helper(struct mlxreg_hotplug_priv_data *priv,
 	if (ret)
 		goto out;
 
-	/* Unmask event. */
+	/* Unmask event, exclude storming bits. */
 	ret = regmap_write(priv->regmap, item->reg + MLXREG_HOTPLUG_MASK_OFF,
-			   item->mask);
+			   item->mask & ~item->storming_bits);
 
  out:
 	if (ret)
@@ -767,6 +825,15 @@ static int mlxreg_hotplug_probe(struct platform_device *pdev)
 
 	/* Perform initial interrupts setup. */
 	mlxreg_hotplug_set_irq(priv);
+
+	/* Register with generic interrupt storm detection */
+	if (!irq_register_storm_detection(priv->irq, MLXREG_HOTPLUG_INTR_FREQ_HZ,
+					  mlxreg_hotplug_storm_handler, priv)) {
+		dev_warn(&pdev->dev, "Failed to register generic interrupt storm detection\n");
+	} else {
+		dev_info(&pdev->dev, "Registered generic storm detection for IRQ %d\n", priv->irq);
+	}
+
 	priv->after_probe = true;
 
 	return 0;
@@ -776,6 +843,9 @@ static void mlxreg_hotplug_remove(struct platform_device *pdev)
 {
 	struct mlxreg_hotplug_priv_data *priv = dev_get_drvdata(&pdev->dev);
 
+	/* Unregister generic interrupt storm detection */
+	irq_unregister_storm_detection(priv->irq);
+
 	/* Clean interrupts setup. */
 	mlxreg_hotplug_unset_irq(priv);
 	devm_free_irq(&pdev->dev, priv->irq, priv);
diff --git a/include/linux/platform_data/mlxreg.h b/include/linux/platform_data/mlxreg.h
index f6cca7a035c7..592256570175 100644
--- a/include/linux/platform_data/mlxreg.h
+++ b/include/linux/platform_data/mlxreg.h
@@ -131,6 +131,7 @@ struct mlxreg_hotplug_device {
  * @regnum: number of registers occupied by multi-register attribute;
  * @slot: slot number, at which device is located;
  * @secured: if set indicates that entry access is secured;
+ * @wmark_cntr: interrupt storm counter;
  */
 struct mlxreg_core_data {
 	char label[MLXREG_CORE_LABEL_MAX_SIZE];
@@ -151,6 +152,7 @@ struct mlxreg_core_data {
 	u8 regnum;
 	u8 slot;
 	u8 secured;
+	unsigned int wmark_cntr;
 };
 
 /**
@@ -167,6 +169,7 @@ struct mlxreg_core_data {
  * @ind: element's index inside the group;
  * @inversed: if 0: 0 for signal status is OK, if 1 - 1 is OK;
  * @health: true if device has health indication, false in other case;
+ * @storming_bits: interrupt storming bits mask;
  */
 struct mlxreg_core_item {
 	struct mlxreg_core_data *data;
@@ -180,6 +183,7 @@ struct mlxreg_core_item {
 	u8 ind;
 	u8 inversed;
 	u8 health;
+	u32 storming_bits;
 };
 
 /**
-- 
2.47.3
Re: [PATCH platform-next v4 2/2] platform/mellanox: mlxreg-hotplug: Enabling interrupt storm detection
Posted by kernel test robot 3 weeks, 3 days ago
Hi Ciju,

kernel test robot noticed the following build errors:

[auto build test ERROR on linus/master]
[also build test ERROR on v6.19-rc5]
[cannot apply to tip/irq/core next-20260115]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Ciju-Rajan-K/kernel-irq-Add-generic-interrupt-storm-detection-mechanism/20260115-155438
base:   linus/master
patch link:    https://lore.kernel.org/r/20260115074909.245852-3-crajank%40nvidia.com
patch subject: [PATCH platform-next v4 2/2] platform/mellanox: mlxreg-hotplug: Enabling interrupt storm detection
config: x86_64-randconfig-161-20260115 (https://download.01.org/0day-ci/archive/20260115/202601152235.2MC3FUQp-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
rustc: rustc 1.88.0 (6b00bc388 2025-06-23)
smatch version: v0.5.0-8985-g2614ff1a
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260115/202601152235.2MC3FUQp-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202601152235.2MC3FUQp-lkp@intel.com/

All errors (new ones prefixed by >>):

>> drivers/platform/mellanox/mlxreg-hotplug.c:830:7: error: call to undeclared function 'irq_register_storm_detection'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
     830 |         if (!irq_register_storm_detection(priv->irq, MLXREG_HOTPLUG_INTR_FREQ_HZ,
         |              ^
>> drivers/platform/mellanox/mlxreg-hotplug.c:847:2: error: call to undeclared function 'irq_unregister_storm_detection'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
     847 |         irq_unregister_storm_detection(priv->irq);
         |         ^
   2 errors generated.


vim +/irq_register_storm_detection +830 drivers/platform/mellanox/mlxreg-hotplug.c

   762	
   763	static int mlxreg_hotplug_probe(struct platform_device *pdev)
   764	{
   765		struct mlxreg_core_hotplug_platform_data *pdata;
   766		struct mlxreg_hotplug_priv_data *priv;
   767		struct i2c_adapter *deferred_adap;
   768		int err;
   769	
   770		pdata = dev_get_platdata(&pdev->dev);
   771		if (!pdata) {
   772			dev_err(&pdev->dev, "Failed to get platform data.\n");
   773			return -EINVAL;
   774		}
   775	
   776		/* Defer probing if the necessary adapter is not configured yet. */
   777		deferred_adap = i2c_get_adapter(pdata->deferred_nr);
   778		if (!deferred_adap)
   779			return -EPROBE_DEFER;
   780		i2c_put_adapter(deferred_adap);
   781	
   782		priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
   783		if (!priv)
   784			return -ENOMEM;
   785	
   786		if (pdata->irq) {
   787			priv->irq = pdata->irq;
   788		} else {
   789			priv->irq = platform_get_irq(pdev, 0);
   790			if (priv->irq < 0)
   791				return priv->irq;
   792		}
   793	
   794		priv->regmap = pdata->regmap;
   795		priv->dev = pdev->dev.parent;
   796		priv->pdev = pdev;
   797	
   798		err = devm_request_irq(&pdev->dev, priv->irq,
   799				       mlxreg_hotplug_irq_handler, IRQF_TRIGGER_FALLING
   800				       | IRQF_SHARED, "mlxreg-hotplug", priv);
   801		if (err) {
   802			dev_err(&pdev->dev, "Failed to request irq: %d\n", err);
   803			return err;
   804		}
   805	
   806		disable_irq(priv->irq);
   807		spin_lock_init(&priv->lock);
   808		INIT_DELAYED_WORK(&priv->dwork_irq, mlxreg_hotplug_work_handler);
   809		dev_set_drvdata(&pdev->dev, priv);
   810	
   811		err = mlxreg_hotplug_attr_init(priv);
   812		if (err) {
   813			dev_err(&pdev->dev, "Failed to allocate attributes: %d\n",
   814				err);
   815			return err;
   816		}
   817	
   818		priv->hwmon = devm_hwmon_device_register_with_groups(&pdev->dev,
   819						"mlxreg_hotplug", priv, priv->groups);
   820		if (IS_ERR(priv->hwmon)) {
   821			dev_err(&pdev->dev, "Failed to register hwmon device %ld\n",
   822				PTR_ERR(priv->hwmon));
   823			return PTR_ERR(priv->hwmon);
   824		}
   825	
   826		/* Perform initial interrupts setup. */
   827		mlxreg_hotplug_set_irq(priv);
   828	
   829		/* Register with generic interrupt storm detection */
 > 830		if (!irq_register_storm_detection(priv->irq, MLXREG_HOTPLUG_INTR_FREQ_HZ,
   831						  mlxreg_hotplug_storm_handler, priv)) {
   832			dev_warn(&pdev->dev, "Failed to register generic interrupt storm detection\n");
   833		} else {
   834			dev_info(&pdev->dev, "Registered generic storm detection for IRQ %d\n", priv->irq);
   835		}
   836	
   837		priv->after_probe = true;
   838	
   839		return 0;
   840	}
   841	
   842	static void mlxreg_hotplug_remove(struct platform_device *pdev)
   843	{
   844		struct mlxreg_hotplug_priv_data *priv = dev_get_drvdata(&pdev->dev);
   845	
   846		/* Unregister generic interrupt storm detection */
 > 847		irq_unregister_storm_detection(priv->irq);
   848	
   849		/* Clean interrupts setup. */
   850		mlxreg_hotplug_unset_irq(priv);
   851		devm_free_irq(&pdev->dev, priv->irq, priv);
   852	}
   853	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Re: [PATCH platform-next v4 2/2] platform/mellanox: mlxreg-hotplug: Enabling interrupt storm detection
Posted by Andy Shevchenko 3 weeks, 4 days ago
On Thu, Jan 15, 2026 at 09:49:09AM +0200, Ciju Rajan K wrote:
> This patch enables the interrupt storm detection feature and
> also adds the per device counter for tracking the faulty
> devices. It also masks the faulty devices from generating
> any further interrupts.
> 
> Add field for interrupt storm handling.
> Extend structure mlxreg_core_data with the following field:
>  'wmark_cntr'   - interrupt storm counter.
> 
> Extend structure mlxreg_core_item with the following field:
>  'storming_bits' - interrupt storming bits mask.

...

> +static void mlxreg_hotplug_storm_handler(unsigned int irq, unsigned int freq, void *dev_id)
> +{
> +	struct mlxreg_hotplug_priv_data *priv = dev_id;
> +	struct mlxreg_core_hotplug_platform_data *pdata;
> +	struct mlxreg_core_item *item;
> +	struct mlxreg_core_data *data;
> +	unsigned long asserted;
> +	u32 bit;
> +
> +	dev_warn(priv->dev,
> +		 "Interrupt storm detected on IRQ %u (%u interrupts/sec)",
> +		 irq, freq);

Below you use a long line, while here the text is wrapped at 80 columns. Why the inconsistency?
Please choose one style and use it everywhere (inside the same file).

> +	pdata = dev_get_platdata(&priv->pdev->dev);
> +	item = pdata->items;
> +	asserted = item->cache;
> +
> +	for_each_set_bit(bit, &asserted, 8) {
> +		int pos;
> +
> +		pos = mlxreg_hotplug_item_label_index_get(item->mask, bit);
> +		if (pos < 0)

> +			goto out;

Used only once. Just drop the label and move the related code under the branch.

> +		data = item->data + pos;
> +		/* Check per device interrupt counter */
> +		if (data->wmark_cntr >= MLXREG_HOTPLUG_INTR_FREQ_HZ - 1) {
> +			dev_err(priv->dev,
> +				"Storming bit %d (label: %s) - interrupt masked permanently. Replace broken HW.",
> +				bit, data->label);
> +			/* Mark bit as storming. */
> +			item->storming_bits |= BIT(bit);
> +		}
> +		data->wmark_cntr = 0;
> +	}
> +	return;
> + out:
> +	dev_err(priv->dev, "Failed to complete interrupt storm handler\n");
> +}

...

> +	/* Register with generic interrupt storm detection */
> +	if (!irq_register_storm_detection(priv->irq, MLXREG_HOTPLUG_INTR_FREQ_HZ,
> +					  mlxreg_hotplug_storm_handler, priv)) {
> +		dev_warn(&pdev->dev, "Failed to register generic interrupt storm detection\n");
> +	} else {
> +		dev_info(&pdev->dev, "Registered generic storm detection for IRQ %d\n", priv->irq);
> +	}

Invert the conditional, it will be slightly easier to parse.

...

>  struct mlxreg_core_data {
>  	char label[MLXREG_CORE_LABEL_MAX_SIZE];

>  	u8 regnum;
>  	u8 slot;
>  	u8 secured;
> +	unsigned int wmark_cntr;
>  };

Have you run `pahole`? No issues / room to improve this layout?

...

>  struct mlxreg_core_item {
>  	struct mlxreg_core_data *data;

>  	u8 ind;
>  	u8 inversed;
>  	u8 health;
> +	u32 storming_bits;
>  };

Ditto.

-- 
With Best Regards,
Andy Shevchenko