Activate soft_moderation via the hooks in handle_irq_event()
and per-CPU and irq_desc initialization.
This change only implements fixed moderation. It needs to be
explicitly enabled at runtime on individual interrupts.
Example (kernel built with CONFIG_SOFT_IRQ_MODERATION=y)
# enable fixed moderation
echo "delay_us=400" > /proc/irq/soft_moderation
# enable on network interrupts (change name as appropriate)
echo on | tee /proc/irq/*/*eth*/../soft_moderation
# show it works by looking at counters in /proc/irq/soft_moderation
cat /proc/irq/soft_moderation
# Show runtime impact on ping times changing delay_us
ping -n -f -q -c 1000 ${some_nearby_host}
echo "delay_us=100" > /proc/irq/soft_moderation
ping -n -f -q -c 1000 ${some_nearby_host}
Configuration via module parameters (irq_moderation.${name}=${value}) or
echo "${name}=${value}" > /proc/irq/soft_moderation)
delay_us 0=off, range 1-500, default 100
how long an interrupt is disabled after it fires. Small values are
accumulated until they are large enough, e.g. 10us. As an example, a 2us value
means that the timer is set only every 5 interrupts.
timer_rounds 0-20, default 0
How many extra timer runs before re-enabling interrupts. This allows
reducing the number of MSI interrupts while keeping delay_us small.
This is similar to the "napi_defer_hard_irqs" option in NAPI, but with
some subtle differences (e.g. here the number of rounds is
deterministic, and interrupts are disabled at MSI level).
Change-Id: I47c5059ad537fcb9561f924620cf68e1d648aae6
---
arch/x86/kernel/cpu/common.c | 1 +
drivers/irqchip/irq-gic-v3.c | 2 ++
kernel/irq/handle.c | 3 +++
kernel/irq/irqdesc.c | 1 +
4 files changed, 7 insertions(+)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 02d97834a1d4d..1953419fde6ff 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -2440,6 +2440,7 @@ void cpu_init(void)
intel_posted_msi_init();
}
+ irq_moderation_percpu_init();
mmgrab(&init_mm);
cur->active_mm = &init_mm;
diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index 3de351e66ee84..902bcbf9d85d8 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -1226,6 +1226,8 @@ static void gic_cpu_sys_reg_init(void)
WARN_ON(gic_dist_security_disabled() != cpus_have_security_disabled);
}
+ irq_moderation_percpu_init();
+
/*
* Some firmwares hand over to the kernel with the BPR changed from
* its reset value (and with a value large enough to prevent
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index e103451243a0b..2cacceaaea9d0 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -12,6 +12,7 @@
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/interrupt.h>
+#include <linux/irq_moderation.h>
#include <linux/kernel_stat.h>
#include <asm/irq_regs.h>
@@ -254,9 +255,11 @@ irqreturn_t handle_irq_event(struct irq_desc *desc)
irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
raw_spin_unlock(&desc->lock);
+ irq_moderation_hook(desc); /* may disable irq so must run unlocked */
ret = handle_irq_event_percpu(desc);
raw_spin_lock(&desc->lock);
+ irq_moderation_epilogue(desc); /* start moderation timer if needed */
irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
return ret;
}
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index db714d3014b5f..e3efbecf5b937 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -134,6 +134,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
desc->tot_count = 0;
desc->name = NULL;
desc->owner = owner;
+ irq_moderation_init_fields(desc);
for_each_possible_cpu(cpu)
*per_cpu_ptr(desc->kstat_irqs, cpu) = (struct irqstat) { };
desc_smp_init(desc, node, affinity);
--
2.51.2.1041.gc1ab5b90ca-goog
On Wed, Nov 12 2025 at 19:24, Luigi Rizzo wrote:
Forgot to mention it on the earlier patches. The subject line is wrong
in multiple aspects. See documentation.
> arch/x86/kernel/cpu/common.c | 1 +
> drivers/irqchip/irq-gic-v3.c | 2 ++
How are those related to the subject?
> kernel/irq/handle.c | 3 +++
> kernel/irq/irqdesc.c | 1 +
> 4 files changed, 7 insertions(+)
>
> diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
> index 02d97834a1d4d..1953419fde6ff 100644
> --- a/arch/x86/kernel/cpu/common.c
> +++ b/arch/x86/kernel/cpu/common.c
> @@ -2440,6 +2440,7 @@ void cpu_init(void)
>
> intel_posted_msi_init();
> }
> + irq_moderation_percpu_init();
Why is this called in architecture specific code? There is absolutely
nothing architecture specific about this. The CPU hotplug infrastructure
can handle this just fine in a generic way.
> #include <asm/irq_regs.h>
> @@ -254,9 +255,11 @@ irqreturn_t handle_irq_event(struct irq_desc *desc)
> irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
> raw_spin_unlock(&desc->lock);
>
> + irq_moderation_hook(desc); /* may disable irq so must run unlocked */
That's just wrong. That can trivially be implemented in a way which
works with the lock held.
> ret = handle_irq_event_percpu(desc);
> raw_spin_lock(&desc->lock);
> + irq_moderation_epilogue(desc); /* start moderation timer if needed */
> irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
> return ret;
> }
> diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
> index db714d3014b5f..e3efbecf5b937 100644
> --- a/kernel/irq/irqdesc.c
> +++ b/kernel/irq/irqdesc.c
> @@ -134,6 +134,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
> desc->tot_count = 0;
> desc->name = NULL;
> desc->owner = owner;
> + irq_moderation_init_fields(desc);
That's clearly part of activation in handle_irq_event() ....
Thanks
tglx
On Thu, Nov 13, 2025 at 10:45 AM Thomas Gleixner <tglx@linutronix.de> wrote:
>
> On Wed, Nov 12 2025 at 19:24, Luigi Rizzo wrote:
> > [...]
> > @@ -254,9 +255,11 @@ irqreturn_t handle_irq_event(struct irq_desc *desc)
> > irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
> > raw_spin_unlock(&desc->lock);
> >
> > + irq_moderation_hook(desc); /* may disable irq so must run unlocked */
>
> That's just wrong. That can trivially be implemented in a way which
> works with the lock held.
Do you mean I can move the call before the unlock and use __disable_irq()
given that moderation is only enabled on interrupts that have
irq_bus_lock == NULL, so disable_irq_nosync() which is
scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) {
__disable_irq(scoped_irqdesc);
return 0;
}
effectively collapses to raw_spin_lock()
This would save a lock/unlock dance and a few conditionals?
> > @@ -134,6 +134,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
> > desc->tot_count = 0;
> > desc->name = NULL;
> > desc->owner = owner;
> > + irq_moderation_init_fields(desc);
>
> That's clearly part of activation in handle_irq_event() ....
I don't follow. As far as I understand desc_set_defaults() makes sure we
have a clean descriptor after release/before allocation. This includes
clearing the moderation enable flag from a previous user of this desc,
and the flag is only read in the hook in handle_irq_event().
cheers
luigi
© 2016 - 2026 Red Hat, Inc.