[PATCH v4 16/26] genirq: Allow per-cpu interrupt sharing for non-overlapping affinities

Marc Zyngier posted 26 patches 1 month, 3 weeks ago
[PATCH v4 16/26] genirq: Allow per-cpu interrupt sharing for non-overlapping affinities
Posted by Marc Zyngier 1 month, 3 weeks ago
Interrupt sharing for percpu-devid interrupts is forbidden, and
for good reasons. These are interrupts generated *from* a CPU and
handled by itself (timer, for example). Nobody in their right mind
would put two devices on the same pin (and if they have, they get to
keep the pieces...).

But this also prevents more benign cases, where devices are connected
to groups of CPUs, and for which the affinities are not overlapping.
Effectively, the only thing they share is the interrupt number, and
nothing else.

Let's tweak the definition of IRQF_SHARED applied to percpu_devid
interrupts to allow this particular case. This results in extra
validation at the point of the interrupt being setup and freed,
as well as a tiny bit of extra complexity for interrupts at handling
time (to pick the correct irqaction).

Tested-by: Will Deacon <will@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 kernel/irq/chip.c   |  8 ++++--
 kernel/irq/manage.c | 67 +++++++++++++++++++++++++++++++++++++--------
 2 files changed, 61 insertions(+), 14 deletions(-)

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 633e1f67bb6f4..19e0a87a2663e 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -897,8 +897,9 @@ void handle_percpu_irq(struct irq_desc *desc)
 void handle_percpu_devid_irq(struct irq_desc *desc)
 {
 	struct irq_chip *chip = irq_desc_get_chip(desc);
-	struct irqaction *action = desc->action;
 	unsigned int irq = irq_desc_get_irq(desc);
+	unsigned int cpu = smp_processor_id();
+	struct irqaction *action;
 	irqreturn_t res;
 
 	/*
@@ -910,12 +911,15 @@ void handle_percpu_devid_irq(struct irq_desc *desc)
 	if (chip->irq_ack)
 		chip->irq_ack(&desc->irq_data);
 
+	for (action = desc->action; action; action = action->next)
+		if (cpumask_test_cpu(cpu, action->affinity))
+			break;
+
 	if (likely(action)) {
 		trace_irq_handler_entry(irq, action);
 		res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id));
 		trace_irq_handler_exit(irq, action, res);
 	} else {
-		unsigned int cpu = smp_processor_id();
 		bool enabled = cpumask_test_cpu(cpu, desc->percpu_enabled);
 
 		if (enabled)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index b1a3140e5f3c9..ea3dbf6fee194 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1418,6 +1418,19 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
 	return 0;
 }
 
+static bool valid_percpu_irqaction(struct irqaction *old, struct irqaction *new)
+{
+	do {
+		if (cpumask_intersects(old->affinity, new->affinity) ||
+		    old->percpu_dev_id == new->percpu_dev_id)
+			return false;
+
+		old = old->next;
+	} while (old);
+
+	return true;
+}
+
 /*
  * Internal function to register an irqaction - typically used to
  * allocate special interrupts that are part of the architecture.
@@ -1438,6 +1451,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 	struct irqaction *old, **old_ptr;
 	unsigned long flags, thread_mask = 0;
 	int ret, nested, shared = 0;
+	bool per_cpu_devid;
 
 	if (!desc)
 		return -EINVAL;
@@ -1447,6 +1461,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 	if (!try_module_get(desc->owner))
 		return -ENODEV;
 
+	per_cpu_devid = irq_settings_is_per_cpu_devid(desc);
+
 	new->irq = irq;
 
 	/*
@@ -1554,13 +1570,20 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 		 */
 		unsigned int oldtype;
 
-		if (irq_is_nmi(desc)) {
+		if (irq_is_nmi(desc) && !per_cpu_devid) {
 			pr_err("Invalid attempt to share NMI for %s (irq %d) on irqchip %s.\n",
 				new->name, irq, desc->irq_data.chip->name);
 			ret = -EINVAL;
 			goto out_unlock;
 		}
 
+		if (per_cpu_devid && !valid_percpu_irqaction(old, new)) {
+			pr_err("Overlapping affinities for %s (irq %d) on irqchip %s.\n",
+				new->name, irq, desc->irq_data.chip->name);
+			ret = -EINVAL;
+			goto out_unlock;
+		}
+
 		/*
 		 * If nobody did set the configuration before, inherit
 		 * the one provided by the requester.
@@ -1711,7 +1734,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 		if (!(new->flags & IRQF_NO_AUTOEN) &&
 		    irq_settings_can_autoenable(desc)) {
 			irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
-		} else {
+		} else if (!per_cpu_devid) {
 			/*
 			 * Shared interrupts do not go well with disabling
 			 * auto enable. The sharing interrupt might request
@@ -2346,7 +2369,7 @@ void disable_percpu_nmi(unsigned int irq)
 static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_id)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
-	struct irqaction *action;
+	struct irqaction *action, **action_ptr;
 
 	WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
 
@@ -2354,21 +2377,33 @@ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_
 		return NULL;
 
 	scoped_guard(raw_spinlock_irqsave, &desc->lock) {
-		action = desc->action;
-		if (!action || action->percpu_dev_id != dev_id) {
-			WARN(1, "Trying to free already-free IRQ %d\n", irq);
-			return NULL;
+		action_ptr = &desc->action;
+		for (;;) {
+			action = *action_ptr;
+
+			if (!action) {
+				WARN(1, "Trying to free already-free IRQ %d\n", irq);
+				return NULL;
+			}
+
+			if (action->percpu_dev_id == dev_id)
+				break;
+
+			action_ptr = &action->next;
 		}
 
-		if (!cpumask_empty(desc->percpu_enabled)) {
-			WARN(1, "percpu IRQ %d still enabled on CPU%d!\n",
-			     irq, cpumask_first(desc->percpu_enabled));
+		if (cpumask_intersects(desc->percpu_enabled, action->affinity)) {
+			WARN(1, "percpu IRQ %d still enabled on CPU%d!\n", irq,
+			     cpumask_first_and(desc->percpu_enabled, action->affinity));
 			return NULL;
 		}
 
 		/* Found it - now remove it from the list of entries: */
-		desc->action = NULL;
-		desc->istate &= ~IRQS_NMI;
+		*action_ptr = action->next;
+
+		/* Demote from NMI if we killed the last action */
+		if (!desc->action)
+			desc->istate &= ~IRQS_NMI;
 	}
 
 	unregister_handler_proc(irq, action);
@@ -2462,6 +2497,14 @@ struct irqaction *create_percpu_irqaction(irq_handler_t handler, unsigned long f
 	action->percpu_dev_id = dev_id;
 	action->affinity = affinity;
 
+	/*
+	 * We allow some form of sharing for non-overlapping affinity
+	 * masks. Obviously, covering all CPUs prevents any sharing in
+	 * the first place.
+	 */
+	if (!cpumask_equal(affinity, cpu_possible_mask))
+		action->flags |= IRQF_SHARED;
+
 	return action;
 }
 
-- 
2.47.3
Re: [PATCH v4 16/26] genirq: Allow per-cpu interrupt sharing for non-overlapping affinities
Posted by Daniel Thompson 1 week, 5 days ago
On Mon, Oct 20, 2025 at 01:29:33PM +0100, Marc Zyngier wrote:
> Interrupt sharing for percpu-devid interrupts is forbidden, and
> for good reasons. These are interrupts generated *from* a CPU and
> handled by itself (timer, for example). Nobody in their right mind
> would put two devices on the same pin (and if they have, they get to
> keep the pieces...).
>
> But this also prevents more benign cases, where devices are connected
> to groups of CPUs, and for which the affinities are not overlapping.
> Effectively, the only thing they share is the interrupt number, and
> nothing else.
>
> Let's tweak the definition of IRQF_SHARED applied to percpu_devid
> interrupts to allow this particular case. This results in extra
> validation at the point of the interrupt being setup and freed,
> as well as a tiny bit of extra complexity for interrupts at handling
> time (to pick the correct irqaction).
>
> Tested-by: Will Deacon <will@kernel.org>
> Signed-off-by: Marc Zyngier <maz@kernel.org>

I picked up this patch via linux-next and it appears to be causing boot
regressions on MIPS/qemu. This patch was identified with a bisect and
a git revert of this patch from the linux-next tip resolves the problem
(specifically, next-20251204 with git revert bdf4e2ac295f).

I'm running the code as part of the kgdb test suite but the system
doesn't survive long enough for kgdb to be involved. In fact I was able
to reduce things to the following reproduction with all the kgdb pieces
removed:

    make malta_kvm_defconfig generic/64r6.config
    ../scripts/config \
        --enable WERROR --enable CPU_MIPS64_R6 --enable MIPS_CPS \
	--enable BLK_DEV_INITRD --set-val FRAME_WARN 2048
    make olddefconfig
    make -j$(nproc) all
    qemu-system-mips64el -cpu I6400 -M malta -m 1G -smp 2 \
        -kernel vmlinux -nographic \
	-append " console=ttyS0,115200 clk_ignore_unused"

The stack dumps continually, so I have to abridge the logs, but the logs
up to the second stack trace are:

~~~
Linux version 6.18.0-next-20251204 (drt@wychelm) (mips64el-linux-gcc.br_real (Buildroot 2025.02.8) 14.3.0, GNU ld (GNU Binutils) 2.44) #20 SMP Thu Dec  4 10:37:28 GMT 2025
earlycon: uart8250 at I/O port 0x3f8 (options '38400n8')
printk: legacy bootconsole [uart8250] enabled
CPU0 revision is: 0001a900 (MIPS I6400)
FPU revision is: 20f30300
MSA revision is: 00000300
MIPS: machine is mti,malta
Software DMA cache coherency enabled
Initial ramdisk at: 0x900000000fc40000 (3818338 bytes)
OF: reserved mem: Reserved memory: No reserved-memory node in the DT
VP topology {2} total 2
Primary instruction cache 64kB, VIPT, 4-way, linesize 64 bytes.
Primary data cache 64kB, 4-way, PIPT, no aliases, linesize 64 bytes
Zone ranges:
  DMA      [mem 0x0000000000000000-0x0000000000ffffff]
  DMA32    [mem 0x0000000001000000-0x00000000ffffffff]
  Normal   empty
Movable zone start for each node
Early memory node ranges
  node   0: [mem 0x0000000000000000-0x000000000fffffff]
  node   0: [mem 0x0000000090000000-0x00000000bfffffff]
Initmem setup node 0 [mem 0x0000000000000000-0x00000000bfffffff]
On node 0, zone DMA32: 131072 pages in unavailable ranges
random: crng init done
percpu: Embedded 7 pages/cpu s69728 r8192 d36768 u114688
Kernel command line: rd_start=0xffffffff8fc40000 rd_size=3818338  console=ttyS0,115200 kgdboc=ttyS0 clk_ignore_unused
Unknown kernel command line parameters "kgdboc=ttyS0", will be passed to user space.
printk: log buffer data + meta data: 32768 + 114688 = 147456 bytes
Dentry cache hash table entries: 131072 (order: 6, 1048576 bytes, linear)
Inode-cache hash table entries: 65536 (order: 5, 524288 bytes, linear)
Cache parity protection disabled
MAAR configuration:
  [0]: 0x0000000000000000-0x000000000fffffff speculate
  [1]: 0x0000000090000000-0x00000000bfffffff speculate
  [2]: disabled
  [3]: disabled
  [4]: disabled
  [5]: disabled
  [6]: disabled
  [7]: disabled
Built 1 zonelists, mobility grouping on.  Total pages: 65536
mem auto-init: stack:all(zero), heap alloc:off, heap free:off
SLUB: HWalign=64, Order=0-3, MinObjects=0, CPUs=2, Nodes=1
rcu: Hierarchical RCU implementation.
rcu: 	RCU event tracing is enabled.
	Tracing variant of Tasks RCU enabled.
rcu: RCU calculated value of scheduler-enlistment delay is 10 jiffies.
RCU Tasks Trace: Setting shift to 1 and lim to 1 rcu_task_cb_adjust=1 rcu_task_cpu_ids=2.
NR_IRQS: 256
rcu: srcu_init: Setting srcu_struct sizes based on contention.
CPU frequency 320.00 MHz
GIC frequency 100.00 MHz
clocksource: GIC: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 19113045595 ns
sched_clock: 32 bits at 100MHz, resolution 10ns, wraps every 21475332090ns
clocksource: MIPS: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 11945377117 ns
Console: colour dummy device 80x25
Calibrating delay loop...
CPU 0 Unable to handle kernel paging request at virtual address 0000000000000000, epc == ffffffff801c2398, ra == ffffffff801bab00
Oops[#1]:
CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 6.18.0-next-20251204 #20 NONE
Hardware name: mti,malta
$ 0   : 0000000000000000 0000000000000001 0000000000000000 0000000000000000
$ 4   : 0000000000000001 a8000000020e8008 0000000000000000 ffffffff80c23b80
$ 8   : 0000000000000004 0000000000000000 0000000000000000 000000000000002f
$12   : a8000000020f4000 0000000000003ff0 0000000000003000 0000000000000003
$16   : ffffffff80d095c0 ffffffff80ceb410 0000000000000019 ffffffff80c378c0
$20   : ffffffff80c4bec8 0000000000000000 ffffffff80e00000 ffffffff80de0000
$24   : 0000000000000000 0000000000000010
$28   : ffffffff80c20000 a8000000020f7ec0 a800000000e12fcd ffffffff801bab00
epc   : ffffffff801c2398 handle_percpu_devid_irq+0xb8/0x250
ra    : ffffffff801bab00 handle_irq_desc+0x48/0x88
Status: 1400a4e2	KX SX UX KERNEL EXL
Cause : 00800408 (ExcCode 02)
BadVA : 0000000000000000
PrId  : 0001a900 (MIPS I6400)
Modules linked in:
Process swapper/0 (pid: 0, threadinfo=(____ptrval____), task=(____ptrval____), tls=0000000000000000)
Stack : ffffffff80c35a2c 0000000000000002 ffffffff80e00000 0fffffffffffffff
        ffffffff80c50000 0000000000000001 0000000000000003 ffffffff801bab00
        0000000000000000 ffffffff805d82a8 0000000000000000 0000000000000008
        0000000000000000 0000000000000000 0000000000000000 5189d95a7a4f4800
        a800000002014300 0000000000000002 0000000000000001 000000000000001f
        ffffffff80e00000 0000000000000004 0000000000000000 ffffffff801bab00
        0000000000000000 ffffffff809ec128 0000000000000001 fffffffffffffffb
        0000000000000001 ffffffff805d7ebc 0000000000000000 0000000000000000
        ffffffff80c23c80 ffffffff80c50000 ffffffff80de0000 ffffffff80db0000
        0000000000000000 ffffffff80112f10 ffffffff80c23c80 0000000000000000
Call Trace:
[<ffffffff801c2398>] handle_percpu_devid_irq+0xb8/0x250
[<ffffffff801bab00>] handle_irq_desc+0x48/0x88
[<ffffffff805d82a8>] gic_irq_dispatch+0xc0/0x288
[<ffffffff801bab00>] handle_irq_desc+0x48/0x88
[<ffffffff809ec128>] do_domain_IRQ+0x28/0x40
[<ffffffff805d7ebc>] plat_irq_dispatch+0x64/0xe8
[<ffffffff80112f10>] handle_int+0x134/0x140
[<ffffffff80110dc8>] calibrate_delay+0x158/0x290
[<ffffffff80d58e48>] start_kernel+0x754/0x7a4

Code: da000048  de020010  0043102d <dc420000> 00c21016  30420001  d85ffff8  00000000  00000000

CPU 0 Unable to handle kernel paging request at virtual address 0000000000000000, epc == ffffffff801c2398, ra == ffffffff801bab00
Oops[#2]:
CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Tainted: G      D             6.18.0-next-20251204 #20 NONE
Tainted: [D]=DIE
Hardware name: mti,malta
$ 0   : 0000000000000000 0000000000000001 0000000000000000 0000000000000000
$ 4   : 0000000000000002 a8000000020e8008 0000000000000000 a8000000020f7ae0
$ 8   : 0000000000000004 0000000000000000 0000000000000000 0000000000000000
$12   : a8000000020f4000 a8000000020f4000 0000000000003000 0000000000000000
$16   : ffffffff80d095c0 ffffffff80ceb410 0000000000000019 ffffffff80c378c0
$20   : ffffffff80c4bec8 0000000000000000 ffffffff80e00000 0000000000000001
$24   : 0000000000000000 ffffffffffffffff
$28   : ffffffff80c20000 a8000000020f7ab0 a800000000e12fcd ffffffff801bab00
epc   : ffffffff801c2398 handle_percpu_devid_irq+0xb8/0x250
ra    : ffffffff801bab00 handle_irq_desc+0x48/0x88
Status: 1400a4e2	KX SX UX KERNEL EXL
Cause : 00800408 (ExcCode 02)
BadVA : 0000000000000000
PrId  : 0001a900 (MIPS I6400)
Modules linked in:
Process swapper/0 (pid: 0, threadinfo=(____ptrval____), task=(____ptrval____), tls=0000000000000000)
Stack : ffffffff80c50000 0000000000000002 ffffffff80e00000 0fffffffffffffff
        ffffffff80c50000 0000000000000001 0000000000000003 ffffffff801bab00
        0000000000000000 ffffffff805d82a8 a8000000020f7bd8 ffffffff801b94f4
        0000000000000000 0000000000000000 ffffffff80c4bc10 5189d95a7a4f4800
        a800000002014300 0000000000000002 0000000000000001 000000000000001f
        ffffffff80e00000 0000000000000004 0000000000000000 ffffffff801bab00
        0000000000000001 ffffffff809ec128 0000000000000000 fffffffffffffffb
        0000000000000001 ffffffff805d7ebc ffffffff80c7b558 ffffffff80c23c80
        a8000000020f7be0 ffffffff80b2fcb8 ffffffff80de0000 ffffffff80c4bec8
        ffffffff80c515c0 ffffffff80112f10 0000000000000000 0000000000000001
        ...
Call Trace:
[<ffffffff801c2398>] handle_percpu_devid_irq+0xb8/0x250
[<ffffffff801bab00>] handle_irq_desc+0x48/0x88
[<ffffffff805d82a8>] gic_irq_dispatch+0xc0/0x288
[<ffffffff801bab00>] handle_irq_desc+0x48/0x88
[<ffffffff809ec128>] do_domain_IRQ+0x28/0x40
[<ffffffff805d7ebc>] plat_irq_dispatch+0x64/0xe8
[<ffffffff80112f10>] handle_int+0x134/0x140
[<ffffffff8011a730>] die+0xa8/0xf8
[<ffffffff80134680>] do_page_fault+0x530/0x540
[<ffffffff8013a1a8>] tlb_do_page_fault_0+0x108/0x110
[<ffffffff801bab00>] handle_irq_desc+0x48/0x88

Code: da000048  de020010  0043102d <dc420000> 00c21016  30420001  d85ffff8  00000000  00000000
~~~

FWIW I'm using a buildroot gcc-14 toolchain but I've seen the same problem on
gcc-13 in previous kgdbtest logs.


Daniel.
Re: [PATCH v4 16/26] genirq: Allow per-cpu interrupt sharing for non-overlapping affinities
Posted by Marc Zyngier 1 week, 4 days ago
On Thu, 04 Dec 2025 10:56:13 +0000,
Daniel Thompson <danielt@kernel.org> wrote:
> 
> On Mon, Oct 20, 2025 at 01:29:33PM +0100, Marc Zyngier wrote:
> > Interrupt sharing for percpu-devid interrupts is forbidden, and
> > for good reasons. These are interrupts generated *from* a CPU and
> > handled by itself (timer, for example). Nobody in their right mind
> > would put two devices on the same pin (and if they have, they get to
> > keep the pieces...).
> >
> > But this also prevents more benign cases, where devices are connected
> > to groups of CPUs, and for which the affinities are not overlapping.
> > Effectively, the only thing they share is the interrupt number, and
> > nothing else.
> >
> > Let's tweak the definition of IRQF_SHARED applied to percpu_devid
> > interrupts to allow this particular case. This results in extra
> > validation at the point of the interrupt being setup and freed,
> > as well as a tiny bit of extra complexity for interrupts at handling
> > time (to pick the correct irqaction).
> >
> > Tested-by: Will Deacon <will@kernel.org>
> > Signed-off-by: Marc Zyngier <maz@kernel.org>
> 
> I picked up this patch via linux-next and it appears to be causing boot
> regressions on MIPS/qemu. This patch was identified with a bisect and
> a git revert of this patch from the linux-next tip resolves the problem
> (specifically, next-20251204 with git revert bdf4e2ac295f).
> 
> I'm running the code as part of the kgdb test suite but the system
> doesn't survive long enough for kgdb to be involved. In fact I was able
> to reduce things to the following reproduction with all the kgdb pieces
> removed:
> 
>     make malta_kvm_defconfig generic/64r6.config
>     ../scripts/config \
>         --enable WERROR --enable CPU_MIPS64_R6 --enable MIPS_CPS \
> 	--enable BLK_DEV_INITRD --set-val FRAME_WARN 2048
>     make olddefconfig
>     make -j$(nproc) all
>     qemu-system-mips64el -cpu I6400 -M malta -m 1G -smp 2 \
>         -kernel vmlinux -nographic \
> 	-append " console=ttyS0,115200 clk_ignore_unused"

Many thanks for the minimal reproducer, that really helped a lot!

[...]

> CPU 0 Unable to handle kernel paging request at virtual address 0000000000000000, epc == ffffffff801c2398, ra == ffffffff801bab00
> Oops[#1]:
> CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 6.18.0-next-20251204 #20 NONE
> Hardware name: mti,malta
> $ 0   : 0000000000000000 0000000000000001 0000000000000000 0000000000000000
> $ 4   : 0000000000000001 a8000000020e8008 0000000000000000 ffffffff80c23b80
> $ 8   : 0000000000000004 0000000000000000 0000000000000000 000000000000002f
> $12   : a8000000020f4000 0000000000003ff0 0000000000003000 0000000000000003
> $16   : ffffffff80d095c0 ffffffff80ceb410 0000000000000019 ffffffff80c378c0
> $20   : ffffffff80c4bec8 0000000000000000 ffffffff80e00000 ffffffff80de0000
> $24   : 0000000000000000 0000000000000010
> $28   : ffffffff80c20000 a8000000020f7ec0 a800000000e12fcd ffffffff801bab00
> epc   : ffffffff801c2398 handle_percpu_devid_irq+0xb8/0x250
> ra    : ffffffff801bab00 handle_irq_desc+0x48/0x88
> Status: 1400a4e2	KX SX UX KERNEL EXL
> Cause : 00800408 (ExcCode 02)
> BadVA : 0000000000000000
> PrId  : 0001a900 (MIPS I6400)
> Modules linked in:
> Process swapper/0 (pid: 0, threadinfo=(____ptrval____), task=(____ptrval____), tls=0000000000000000)
> Stack : ffffffff80c35a2c 0000000000000002 ffffffff80e00000 0fffffffffffffff
>         ffffffff80c50000 0000000000000001 0000000000000003 ffffffff801bab00
>         0000000000000000 ffffffff805d82a8 0000000000000000 0000000000000008
>         0000000000000000 0000000000000000 0000000000000000 5189d95a7a4f4800
>         a800000002014300 0000000000000002 0000000000000001 000000000000001f
>         ffffffff80e00000 0000000000000004 0000000000000000 ffffffff801bab00
>         0000000000000000 ffffffff809ec128 0000000000000001 fffffffffffffffb
>         0000000000000001 ffffffff805d7ebc 0000000000000000 0000000000000000
>         ffffffff80c23c80 ffffffff80c50000 ffffffff80de0000 ffffffff80db0000
>         0000000000000000 ffffffff80112f10 ffffffff80c23c80 0000000000000000
> Call Trace:
> [<ffffffff801c2398>] handle_percpu_devid_irq+0xb8/0x250
> [<ffffffff801bab00>] handle_irq_desc+0x48/0x88
> [<ffffffff805d82a8>] gic_irq_dispatch+0xc0/0x288
> [<ffffffff801bab00>] handle_irq_desc+0x48/0x88
> [<ffffffff809ec128>] do_domain_IRQ+0x28/0x40
> [<ffffffff805d7ebc>] plat_irq_dispatch+0x64/0xe8
> [<ffffffff80112f10>] handle_int+0x134/0x140
> [<ffffffff80110dc8>] calibrate_delay+0x158/0x290
> [<ffffffff80d58e48>] start_kernel+0x754/0x7a4

This hack fixes it for me, but really, mips needs to grow up and stop
using these antiquated APIs.

Can you please give it a go?

Thanks,

	M.

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0bb29316b4362..8b1b4c8a4f54c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -2470,6 +2470,9 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
 	if (retval < 0)
 		return retval;
 
+	if (!act->affinity)
+		act->affinity = cpu_online_mask;
+
 	retval = __setup_irq(irq, desc, act);
 
 	if (retval)

-- 
Without deviation from the norm, progress is not possible.
Re: [PATCH v4 16/26] genirq: Allow per-cpu interrupt sharing for non-overlapping affinities
Posted by Daniel Thompson 1 week, 4 days ago
On Thu, Dec 04, 2025 at 02:21:51PM +0000, Marc Zyngier wrote:
> On Thu, 04 Dec 2025 10:56:13 +0000,
> Daniel Thompson <danielt@kernel.org> wrote:
> > CPU 0 Unable to handle kernel paging request at virtual address 0000000000000000, epc == ffffffff801c2398, ra == ffffffff801bab00
> > Oops[#1]:
> > CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 6.18.0-next-20251204 #20 NONE
> > Hardware name: mti,malta
> > $ 0   : 0000000000000000 0000000000000001 0000000000000000 0000000000000000
> > $ 4   : 0000000000000001 a8000000020e8008 0000000000000000 ffffffff80c23b80
> > $ 8   : 0000000000000004 0000000000000000 0000000000000000 000000000000002f
> > $12   : a8000000020f4000 0000000000003ff0 0000000000003000 0000000000000003
> > $16   : ffffffff80d095c0 ffffffff80ceb410 0000000000000019 ffffffff80c378c0
> > $20   : ffffffff80c4bec8 0000000000000000 ffffffff80e00000 ffffffff80de0000
> > $24   : 0000000000000000 0000000000000010
> > $28   : ffffffff80c20000 a8000000020f7ec0 a800000000e12fcd ffffffff801bab00
> > epc   : ffffffff801c2398 handle_percpu_devid_irq+0xb8/0x250
> > ra    : ffffffff801bab00 handle_irq_desc+0x48/0x88
> > Status: 1400a4e2	KX SX UX KERNEL EXL
> > Cause : 00800408 (ExcCode 02)
> > BadVA : 0000000000000000
> > PrId  : 0001a900 (MIPS I6400)
> > Modules linked in:
> > Process swapper/0 (pid: 0, threadinfo=(____ptrval____), task=(____ptrval____), tls=0000000000000000)
> > Stack : ffffffff80c35a2c 0000000000000002 ffffffff80e00000 0fffffffffffffff
> >         ffffffff80c50000 0000000000000001 0000000000000003 ffffffff801bab00
> >         0000000000000000 ffffffff805d82a8 0000000000000000 0000000000000008
> >         0000000000000000 0000000000000000 0000000000000000 5189d95a7a4f4800
> >         a800000002014300 0000000000000002 0000000000000001 000000000000001f
> >         ffffffff80e00000 0000000000000004 0000000000000000 ffffffff801bab00
> >         0000000000000000 ffffffff809ec128 0000000000000001 fffffffffffffffb
> >         0000000000000001 ffffffff805d7ebc 0000000000000000 0000000000000000
> >         ffffffff80c23c80 ffffffff80c50000 ffffffff80de0000 ffffffff80db0000
> >         0000000000000000 ffffffff80112f10 ffffffff80c23c80 0000000000000000
> > Call Trace:
> > [<ffffffff801c2398>] handle_percpu_devid_irq+0xb8/0x250
> > [<ffffffff801bab00>] handle_irq_desc+0x48/0x88
> > [<ffffffff805d82a8>] gic_irq_dispatch+0xc0/0x288
> > [<ffffffff801bab00>] handle_irq_desc+0x48/0x88
> > [<ffffffff809ec128>] do_domain_IRQ+0x28/0x40
> > [<ffffffff805d7ebc>] plat_irq_dispatch+0x64/0xe8
> > [<ffffffff80112f10>] handle_int+0x134/0x140
> > [<ffffffff80110dc8>] calibrate_delay+0x158/0x290
> > [<ffffffff80d58e48>] start_kernel+0x754/0x7a4
>
> This hack fixes it for me, but really, mips needs to grow up and stop
> using these antiquated APIs.

I can't comment on that. Literally the only thing I ever do on MIPS is
test kgdb ;-) .


> Can you please give it a go?

Looks good to me.

Local testing on MIPS worked nicely and I got a successful build back
for the kgdb tests:
https://gitlab.com/daniel-thompson/linux/-/pipelines/2196473049


Daniel.
Re: [PATCH v4 16/26] genirq: Allow per-cpu interrupt sharing for non-overlapping affinities
Posted by Jonathan Cameron 1 month ago
On Mon, 20 Oct 2025 13:29:33 +0100
Marc Zyngier <maz@kernel.org> wrote:

> Interrupt sharing for percpu-devid interrupts is forbidden, and
> for good reasons. These are interrupts generated *from* a CPU and
> handled by itself (timer, for example). Nobody in their right mind
> would put two devices on the same pin (and if they have, they get to
> keep the pieces...).
> 
> But this also prevents more benign cases, where devices are connected
> to groups of CPUs, and for which the affinities are not overlapping.
> Effectively, the only thing they share is the interrupt number, and
> nothing else.
> 
> Let's tweak the definition of IRQF_SHARED applied to percpu_devid
> interrupts to allow this particular case. This results in extra
> validation at the point of the interrupt being setup and freed,
> as well as a tiny bit of extra complexity for interrupts at handling
> time (to pick the correct irqaction).
> 
> Tested-by: Will Deacon <will@kernel.org>
> Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
[tip: irq/core] genirq: Allow per-cpu interrupt sharing for non-overlapping affinities
Posted by tip-bot2 for Marc Zyngier 1 month, 2 weeks ago
The following commit has been merged into the irq/core branch of tip:

Commit-ID:     bdf4e2ac295fe77c94b570a1ad12c0882bc89b53
Gitweb:        https://git.kernel.org/tip/bdf4e2ac295fe77c94b570a1ad12c0882bc89b53
Author:        Marc Zyngier <maz@kernel.org>
AuthorDate:    Mon, 20 Oct 2025 13:29:33 +01:00
Committer:     Thomas Gleixner <tglx@linutronix.de>
CommitterDate: Mon, 27 Oct 2025 17:16:35 +01:00

genirq: Allow per-cpu interrupt sharing for non-overlapping affinities

Interrupt sharing for percpu-devid interrupts is forbidden, and for good
reasons. These are interrupts generated *from* a CPU and handled by itself
(timer, for example). Nobody in their right mind would put two devices on
the same pin (and if they have, they get to keep the pieces...).

But this also prevents more benign cases, where devices are connected
to groups of CPUs, and for which the affinities are not overlapping.
Effectively, the only thing they share is the interrupt number, and
nothing else.

Tweak the definition of IRQF_SHARED applied to percpu_devid interrupts to
allow this particular use case. This results in extra validation at the
point of the interrupt being setup and freed, as well as a tiny bit of
extra complexity for interrupts at handling time (to pick the correct
irqaction).

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Link: https://patch.msgid.link/20251020122944.3074811-17-maz@kernel.org
---
 kernel/irq/chip.c   |  8 +++--
 kernel/irq/manage.c | 67 ++++++++++++++++++++++++++++++++++++--------
 2 files changed, 61 insertions(+), 14 deletions(-)

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 633e1f6..19e0a87 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -897,8 +897,9 @@ void handle_percpu_irq(struct irq_desc *desc)
 void handle_percpu_devid_irq(struct irq_desc *desc)
 {
 	struct irq_chip *chip = irq_desc_get_chip(desc);
-	struct irqaction *action = desc->action;
 	unsigned int irq = irq_desc_get_irq(desc);
+	unsigned int cpu = smp_processor_id();
+	struct irqaction *action;
 	irqreturn_t res;
 
 	/*
@@ -910,12 +911,15 @@ void handle_percpu_devid_irq(struct irq_desc *desc)
 	if (chip->irq_ack)
 		chip->irq_ack(&desc->irq_data);
 
+	for (action = desc->action; action; action = action->next)
+		if (cpumask_test_cpu(cpu, action->affinity))
+			break;
+
 	if (likely(action)) {
 		trace_irq_handler_entry(irq, action);
 		res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id));
 		trace_irq_handler_exit(irq, action, res);
 	} else {
-		unsigned int cpu = smp_processor_id();
 		bool enabled = cpumask_test_cpu(cpu, desc->percpu_enabled);
 
 		if (enabled)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index b1a3140..7a09d96 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1418,6 +1418,19 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
 	return 0;
 }
 
+static bool valid_percpu_irqaction(struct irqaction *old, struct irqaction *new)
+{
+	do {
+		if (cpumask_intersects(old->affinity, new->affinity) ||
+		    old->percpu_dev_id == new->percpu_dev_id)
+			return false;
+
+		old = old->next;
+	} while (old);
+
+	return true;
+}
+
 /*
  * Internal function to register an irqaction - typically used to
  * allocate special interrupts that are part of the architecture.
@@ -1438,6 +1451,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 	struct irqaction *old, **old_ptr;
 	unsigned long flags, thread_mask = 0;
 	int ret, nested, shared = 0;
+	bool per_cpu_devid;
 
 	if (!desc)
 		return -EINVAL;
@@ -1447,6 +1461,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 	if (!try_module_get(desc->owner))
 		return -ENODEV;
 
+	per_cpu_devid = irq_settings_is_per_cpu_devid(desc);
+
 	new->irq = irq;
 
 	/*
@@ -1554,13 +1570,20 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 		 */
 		unsigned int oldtype;
 
-		if (irq_is_nmi(desc)) {
+		if (irq_is_nmi(desc) && !per_cpu_devid) {
 			pr_err("Invalid attempt to share NMI for %s (irq %d) on irqchip %s.\n",
 				new->name, irq, desc->irq_data.chip->name);
 			ret = -EINVAL;
 			goto out_unlock;
 		}
 
+		if (per_cpu_devid && !valid_percpu_irqaction(old, new)) {
+			pr_err("Overlapping affinities for %s (irq %d) on irqchip %s.\n",
+				new->name, irq, desc->irq_data.chip->name);
+			ret = -EINVAL;
+			goto out_unlock;
+		}
+
 		/*
 		 * If nobody did set the configuration before, inherit
 		 * the one provided by the requester.
@@ -1711,7 +1734,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 		if (!(new->flags & IRQF_NO_AUTOEN) &&
 		    irq_settings_can_autoenable(desc)) {
 			irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
-		} else {
+		} else if (!per_cpu_devid) {
 			/*
 			 * Shared interrupts do not go well with disabling
 			 * auto enable. The sharing interrupt might request
@@ -2346,7 +2369,7 @@ void disable_percpu_nmi(unsigned int irq)
 static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_id)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
-	struct irqaction *action;
+	struct irqaction *action, **action_ptr;
 
 	WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
 
@@ -2354,21 +2377,33 @@ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_
 		return NULL;
 
 	scoped_guard(raw_spinlock_irqsave, &desc->lock) {
-		action = desc->action;
-		if (!action || action->percpu_dev_id != dev_id) {
-			WARN(1, "Trying to free already-free IRQ %d\n", irq);
-			return NULL;
+		action_ptr = &desc->action;
+		for (;;) {
+			action = *action_ptr;
+
+			if (!action) {
+				WARN(1, "Trying to free already-free IRQ %d\n", irq);
+				return NULL;
+			}
+
+			if (action->percpu_dev_id == dev_id)
+				break;
+
+			action_ptr = &action->next;
 		}
 
-		if (!cpumask_empty(desc->percpu_enabled)) {
-			WARN(1, "percpu IRQ %d still enabled on CPU%d!\n",
-			     irq, cpumask_first(desc->percpu_enabled));
+		if (cpumask_intersects(desc->percpu_enabled, action->affinity)) {
+			WARN(1, "percpu IRQ %d still enabled on CPU%d!\n", irq,
+			     cpumask_first_and(desc->percpu_enabled, action->affinity));
 			return NULL;
 		}
 
 		/* Found it - now remove it from the list of entries: */
-		desc->action = NULL;
-		desc->istate &= ~IRQS_NMI;
+		*action_ptr = action->next;
+
+		/* Demote from NMI if we killed the last action */
+		if (!desc->action)
+			desc->istate &= ~IRQS_NMI;
 	}
 
 	unregister_handler_proc(irq, action);
@@ -2462,6 +2497,14 @@ struct irqaction *create_percpu_irqaction(irq_handler_t handler, unsigned long f
 	action->percpu_dev_id = dev_id;
 	action->affinity = affinity;
 
+	/*
+	 * We allow some form of sharing for non-overlapping affinity
+	 * masks. Obviously, covering all CPUs prevents any sharing in
+	 * the first place.
+	 */
+	if (!cpumask_equal(affinity, cpu_possible_mask))
+		action->flags |= IRQF_SHARED;
+
 	return action;
 }