Enable the interrupt storm detection feature and add a per-device
counter for tracking faulty devices. Also mask the faulty devices
so that they cannot generate any further interrupts.
Add fields for interrupt storm handling.
Extend structure mlxreg_core_data with the following field:
'wmark_cntr' - interrupt storm counter.
Extend structure mlxreg_core_item with the following field:
'storming_bits' - interrupt storming bits mask.
Reviewed-by: Vadim Pasternak <vadimp@nvidia.com>
Signed-off-by: Ciju Rajan K <crajank@nvidia.com>
--
---
drivers/platform/mellanox/mlxreg-hotplug.c | 74 +++++++++++++++++++++-
include/linux/platform_data/mlxreg.h | 4 ++
2 files changed, 76 insertions(+), 2 deletions(-)
diff --git a/drivers/platform/mellanox/mlxreg-hotplug.c b/drivers/platform/mellanox/mlxreg-hotplug.c
index d246772aafd6..4752477207d4 100644
--- a/drivers/platform/mellanox/mlxreg-hotplug.c
+++ b/drivers/platform/mellanox/mlxreg-hotplug.c
@@ -30,6 +30,9 @@
#define MLXREG_HOTPLUG_ATTRS_MAX 128
#define MLXREG_HOTPLUG_NOT_ASSERT 3
+/* Interrupt storm frequency */
+#define MLXREG_HOTPLUG_INTR_FREQ_HZ 100
+
/**
* struct mlxreg_hotplug_priv_data - platform private data:
* @irq: platform device interrupt number;
@@ -339,6 +342,57 @@ static int mlxreg_hotplug_attr_init(struct mlxreg_hotplug_priv_data *priv)
return 0;
}
+/**
+ * mlxreg_hotplug_storm_handler - generic interrupt storm detection callback
+ * @irq: interrupt number experiencing storm
+ * @freq: detected frequency (interrupts per second)
+ * @dev_id: device data (mlxreg_hotplug_priv_data)
+ *
+ * This callback is invoked by the generic interrupt storm detection mechanism
+ * when an interrupt storm is detected on the shared IRQ line. The driver then
+ * analyzes per-device interrupt counters to identify which specific devices
+ * are causing excessive interrupts without blocking operations.
+ */
+static void mlxreg_hotplug_storm_handler(unsigned int irq, unsigned int freq, void *dev_id)
+{
+ struct mlxreg_hotplug_priv_data *priv = dev_id;
+ struct mlxreg_core_hotplug_platform_data *pdata;
+ struct mlxreg_core_item *item;
+ struct mlxreg_core_data *data;
+ unsigned long asserted;
+ u32 bit;
+
+ dev_warn(priv->dev,
+ "Interrupt storm detected on IRQ %u (%u interrupts/sec)",
+ irq, freq);
+
+ pdata = dev_get_platdata(&priv->pdev->dev);
+ item = pdata->items;
+ asserted = item->cache;
+
+ for_each_set_bit(bit, &asserted, 8) {
+ int pos;
+
+ pos = mlxreg_hotplug_item_label_index_get(item->mask, bit);
+ if (pos < 0)
+ goto out;
+
+ data = item->data + pos;
+ /* Check per device interrupt counter */
+ if (data->wmark_cntr >= MLXREG_HOTPLUG_INTR_FREQ_HZ - 1) {
+ dev_err(priv->dev,
+ "Storming bit %d (label: %s) - interrupt masked permanently. Replace broken HW.",
+ bit, data->label);
+ /* Mark bit as storming. */
+ item->storming_bits |= BIT(bit);
+ }
+ data->wmark_cntr = 0;
+ }
+ return;
+ out:
+ dev_err(priv->dev, "Failed to complete interrupt storm handler\n");
+}
+
static void
mlxreg_hotplug_work_helper(struct mlxreg_hotplug_priv_data *priv,
struct mlxreg_core_item *item)
@@ -371,6 +425,10 @@ mlxreg_hotplug_work_helper(struct mlxreg_hotplug_priv_data *priv,
goto out;
data = item->data + pos;
+
+ /* Counter to keep track of interrupt storm */
+ data->wmark_cntr++;
+
if (regval & BIT(bit)) {
if (item->inversed)
mlxreg_hotplug_device_destroy(priv, data, item->kind);
@@ -390,9 +448,9 @@ mlxreg_hotplug_work_helper(struct mlxreg_hotplug_priv_data *priv,
if (ret)
goto out;
- /* Unmask event. */
+ /* Unmask event, exclude storming bits. */
ret = regmap_write(priv->regmap, item->reg + MLXREG_HOTPLUG_MASK_OFF,
- item->mask);
+ item->mask & ~item->storming_bits);
out:
if (ret)
@@ -767,6 +825,15 @@ static int mlxreg_hotplug_probe(struct platform_device *pdev)
/* Perform initial interrupts setup. */
mlxreg_hotplug_set_irq(priv);
+
+ /* Register with generic interrupt storm detection */
+ if (!irq_register_storm_detection(priv->irq, MLXREG_HOTPLUG_INTR_FREQ_HZ,
+ mlxreg_hotplug_storm_handler, priv)) {
+ dev_warn(&pdev->dev, "Failed to register generic interrupt storm detection\n");
+ } else {
+ dev_info(&pdev->dev, "Registered generic storm detection for IRQ %d\n", priv->irq);
+ }
+
priv->after_probe = true;
return 0;
@@ -776,6 +843,9 @@ static void mlxreg_hotplug_remove(struct platform_device *pdev)
{
struct mlxreg_hotplug_priv_data *priv = dev_get_drvdata(&pdev->dev);
+ /* Unregister generic interrupt storm detection */
+ irq_unregister_storm_detection(priv->irq);
+
/* Clean interrupts setup. */
mlxreg_hotplug_unset_irq(priv);
devm_free_irq(&pdev->dev, priv->irq, priv);
diff --git a/include/linux/platform_data/mlxreg.h b/include/linux/platform_data/mlxreg.h
index f6cca7a035c7..592256570175 100644
--- a/include/linux/platform_data/mlxreg.h
+++ b/include/linux/platform_data/mlxreg.h
@@ -131,6 +131,7 @@ struct mlxreg_hotplug_device {
* @regnum: number of registers occupied by multi-register attribute;
* @slot: slot number, at which device is located;
* @secured: if set indicates that entry access is secured;
+ * @wmark_cntr: interrupt storm counter;
*/
struct mlxreg_core_data {
char label[MLXREG_CORE_LABEL_MAX_SIZE];
@@ -151,6 +152,7 @@ struct mlxreg_core_data {
u8 regnum;
u8 slot;
u8 secured;
+ unsigned int wmark_cntr;
};
/**
@@ -167,6 +169,7 @@ struct mlxreg_core_data {
* @ind: element's index inside the group;
* @inversed: if 0: 0 for signal status is OK, if 1 - 1 is OK;
* @health: true if device has health indication, false in other case;
+ * @storming_bits: interrupt storming bits mask;
*/
struct mlxreg_core_item {
struct mlxreg_core_data *data;
@@ -180,6 +183,7 @@ struct mlxreg_core_item {
u8 ind;
u8 inversed;
u8 health;
+ u32 storming_bits;
};
/**
--
2.47.3
Hi Ciju,
kernel test robot noticed the following build errors:
[auto build test ERROR on linus/master]
[also build test ERROR on v6.19-rc5]
[cannot apply to tip/irq/core next-20260115]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Ciju-Rajan-K/kernel-irq-Add-generic-interrupt-storm-detection-mechanism/20260115-155438
base: linus/master
patch link: https://lore.kernel.org/r/20260115074909.245852-3-crajank%40nvidia.com
patch subject: [PATCH platform-next v4 2/2] platform/mellanox: mlxreg-hotplug: Enabling interrupt storm detection
config: x86_64-randconfig-161-20260115 (https://download.01.org/0day-ci/archive/20260115/202601152235.2MC3FUQp-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
rustc: rustc 1.88.0 (6b00bc388 2025-06-23)
smatch version: v0.5.0-8985-g2614ff1a
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260115/202601152235.2MC3FUQp-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add the following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202601152235.2MC3FUQp-lkp@intel.com/
All errors (new ones prefixed by >>):
>> drivers/platform/mellanox/mlxreg-hotplug.c:830:7: error: call to undeclared function 'irq_register_storm_detection'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
830 | if (!irq_register_storm_detection(priv->irq, MLXREG_HOTPLUG_INTR_FREQ_HZ,
| ^
>> drivers/platform/mellanox/mlxreg-hotplug.c:847:2: error: call to undeclared function 'irq_unregister_storm_detection'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
847 | irq_unregister_storm_detection(priv->irq);
| ^
2 errors generated.
vim +/irq_register_storm_detection +830 drivers/platform/mellanox/mlxreg-hotplug.c
762
763 static int mlxreg_hotplug_probe(struct platform_device *pdev)
764 {
765 struct mlxreg_core_hotplug_platform_data *pdata;
766 struct mlxreg_hotplug_priv_data *priv;
767 struct i2c_adapter *deferred_adap;
768 int err;
769
770 pdata = dev_get_platdata(&pdev->dev);
771 if (!pdata) {
772 dev_err(&pdev->dev, "Failed to get platform data.\n");
773 return -EINVAL;
774 }
775
776 /* Defer probing if the necessary adapter is not configured yet. */
777 deferred_adap = i2c_get_adapter(pdata->deferred_nr);
778 if (!deferred_adap)
779 return -EPROBE_DEFER;
780 i2c_put_adapter(deferred_adap);
781
782 priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
783 if (!priv)
784 return -ENOMEM;
785
786 if (pdata->irq) {
787 priv->irq = pdata->irq;
788 } else {
789 priv->irq = platform_get_irq(pdev, 0);
790 if (priv->irq < 0)
791 return priv->irq;
792 }
793
794 priv->regmap = pdata->regmap;
795 priv->dev = pdev->dev.parent;
796 priv->pdev = pdev;
797
798 err = devm_request_irq(&pdev->dev, priv->irq,
799 mlxreg_hotplug_irq_handler, IRQF_TRIGGER_FALLING
800 | IRQF_SHARED, "mlxreg-hotplug", priv);
801 if (err) {
802 dev_err(&pdev->dev, "Failed to request irq: %d\n", err);
803 return err;
804 }
805
806 disable_irq(priv->irq);
807 spin_lock_init(&priv->lock);
808 INIT_DELAYED_WORK(&priv->dwork_irq, mlxreg_hotplug_work_handler);
809 dev_set_drvdata(&pdev->dev, priv);
810
811 err = mlxreg_hotplug_attr_init(priv);
812 if (err) {
813 dev_err(&pdev->dev, "Failed to allocate attributes: %d\n",
814 err);
815 return err;
816 }
817
818 priv->hwmon = devm_hwmon_device_register_with_groups(&pdev->dev,
819 "mlxreg_hotplug", priv, priv->groups);
820 if (IS_ERR(priv->hwmon)) {
821 dev_err(&pdev->dev, "Failed to register hwmon device %ld\n",
822 PTR_ERR(priv->hwmon));
823 return PTR_ERR(priv->hwmon);
824 }
825
826 /* Perform initial interrupts setup. */
827 mlxreg_hotplug_set_irq(priv);
828
829 /* Register with generic interrupt storm detection */
> 830 if (!irq_register_storm_detection(priv->irq, MLXREG_HOTPLUG_INTR_FREQ_HZ,
831 mlxreg_hotplug_storm_handler, priv)) {
832 dev_warn(&pdev->dev, "Failed to register generic interrupt storm detection\n");
833 } else {
834 dev_info(&pdev->dev, "Registered generic storm detection for IRQ %d\n", priv->irq);
835 }
836
837 priv->after_probe = true;
838
839 return 0;
840 }
841
842 static void mlxreg_hotplug_remove(struct platform_device *pdev)
843 {
844 struct mlxreg_hotplug_priv_data *priv = dev_get_drvdata(&pdev->dev);
845
846 /* Unregister generic interrupt storm detection */
> 847 irq_unregister_storm_detection(priv->irq);
848
849 /* Clean interrupts setup. */
850 mlxreg_hotplug_unset_irq(priv);
851 devm_free_irq(&pdev->dev, priv->irq, priv);
852 }
853
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
On Thu, Jan 15, 2026 at 09:49:09AM +0200, Ciju Rajan K wrote:
> This patch enables the interrupt storm detection feature and
> also adds the per device counter for tracking the faulty
> devices. It also masks the faulty devices from generating
> any further interrupts.
>
> Add field for interrupt storm handling.
> Extend structure mlxreg_core_data with the following field:
> 'wmark_cntr' - interrupt storm counter.
>
> Extend structure mlxreg_core_item with the following field:
> 'storming_bits' - interrupt storming bits mask.
...
> +static void mlxreg_hotplug_storm_handler(unsigned int irq, unsigned int freq, void *dev_id)
> +{
> + struct mlxreg_hotplug_priv_data *priv = dev_id;
> + struct mlxreg_core_hotplug_platform_data *pdata;
> + struct mlxreg_core_item *item;
> + struct mlxreg_core_data *data;
> + unsigned long asserted;
> + u32 bit;
> +
> + dev_warn(priv->dev,
> + "Interrupt storm detected on IRQ %u (%u interrupts/sec)",
> + irq, freq);
Below you use a long line, but here the call is wrapped at 80 columns; why the inconsistency?
Please choose one style and use it everywhere (within the same file).
> + pdata = dev_get_platdata(&priv->pdev->dev);
> + item = pdata->items;
> + asserted = item->cache;
> +
> + for_each_set_bit(bit, &asserted, 8) {
> + int pos;
> +
> + pos = mlxreg_hotplug_item_label_index_get(item->mask, bit);
> + if (pos < 0)
> + goto out;
The label is used only once. Just drop it and move the related code under this branch.
> + data = item->data + pos;
> + /* Check per device interrupt counter */
> + if (data->wmark_cntr >= MLXREG_HOTPLUG_INTR_FREQ_HZ - 1) {
> + dev_err(priv->dev,
> + "Storming bit %d (label: %s) - interrupt masked permanently. Replace broken HW.",
> + bit, data->label);
> + /* Mark bit as storming. */
> + item->storming_bits |= BIT(bit);
> + }
> + data->wmark_cntr = 0;
> + }
> + return;
> + out:
> + dev_err(priv->dev, "Failed to complete interrupt storm handler\n");
> +}
...
> + /* Register with generic interrupt storm detection */
> + if (!irq_register_storm_detection(priv->irq, MLXREG_HOTPLUG_INTR_FREQ_HZ,
> + mlxreg_hotplug_storm_handler, priv)) {
> + dev_warn(&pdev->dev, "Failed to register generic interrupt storm detection\n");
> + } else {
> + dev_info(&pdev->dev, "Registered generic storm detection for IRQ %d\n", priv->irq);
> + }
Invert the conditional; it will be slightly easier to parse.
...
> struct mlxreg_core_data {
> char label[MLXREG_CORE_LABEL_MAX_SIZE];
> u8 regnum;
> u8 slot;
> u8 secured;
> + unsigned int wmark_cntr;
> };
Have you run `pahole`? Does it show any issues, or any room to improve this layout?
...
> struct mlxreg_core_item {
> struct mlxreg_core_data *data;
> u8 ind;
> u8 inversed;
> u8 health;
> + u32 storming_bits;
> };
Ditto.
--
With Best Regards,
Andy Shevchenko
© 2016 - 2026 Red Hat, Inc.