[PATCH v2 1/3] genirq: Add interrupt redirection infrastructure

Posted by Radu Rendec 2 months, 1 week ago
From: Thomas Gleixner <tglx@linutronix.de>

Add infrastructure to redirect interrupt handler execution to a
different CPU when the current CPU is not part of the interrupt's CPU
affinity mask.

This is primarily aimed at (de)multiplexed interrupts, where the child
interrupt handler runs in the context of the parent interrupt handler,
and therefore CPU affinity control for the child interrupt is typically
not available.

With the new infrastructure, the child interrupt is allowed to freely
change its affinity setting, independently of the parent. If the
interrupt handler happens to be triggered on an "incompatible" CPU (a
CPU that's not part of the child interrupt's affinity mask), the handler
is redirected and runs in IRQ work context on a "compatible" CPU.

No functional change is made to any existing irqchip driver; to support
interrupt redirection, an irqchip driver must explicitly opt in to the
newly added infrastructure.
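
For illustration only (not part of this patch), a demultiplexing
driver's parent handler would feed child interrupts through the new
entry point roughly as follows. The chip structure, the pending-register
read, and all names here are made up for the example:

static irqreturn_t demux_parent_handler(int irq, void *dev_id)
{
	struct demux_chip *chip = dev_id;	/* hypothetical driver data */
	/* hypothetical read of the pending-bits register */
	unsigned long pending = demux_read_pending(chip);
	unsigned int hwirq;

	/* Let the core run, or redirect, each pending child interrupt */
	for_each_set_bit(hwirq, &pending, chip->nr_child_irqs)
		generic_handle_demux_domain_irq(chip->domain, hwirq);

	return IRQ_HANDLED;
}

The child irq_chip would then typically use
irq_chip_redirect_set_affinity() as its .irq_set_affinity callback, so
that fallback_cpu tracks the first CPU of the requested affinity mask.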

This is a direct follow-up to the patches that Thomas Gleixner proposed
in https://lore.kernel.org/linux-pci/878qpg4o4t.ffs@tglx/

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/linux-pci/878qpg4o4t.ffs@tglx/
Co-developed-by: Radu Rendec <rrendec@redhat.com>
Signed-off-by: Radu Rendec <rrendec@redhat.com>
---
 include/linux/irq.h     | 10 ++++++++
 include/linux/irqdesc.h | 11 ++++++++-
 kernel/irq/chip.c       | 22 +++++++++++++++++-
 kernel/irq/irqdesc.c    | 51 +++++++++++++++++++++++++++++++++++++++--
 kernel/irq/manage.c     | 16 +++++++++++--
 5 files changed, 104 insertions(+), 6 deletions(-)

diff --git a/include/linux/irq.h b/include/linux/irq.h
index c67e76fbcc077..3e5a6ffb1e551 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -459,6 +459,8 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d)
  *			checks against the supplied affinity mask are not
  *			required. This is used for CPU hotplug where the
  *			target CPU is not yet set in the cpu_online_mask.
+ * @irq_pre_redirect:	Optional function to be invoked before redirecting
+ *			an interrupt via irq_work.
  * @irq_retrigger:	resend an IRQ to the CPU
  * @irq_set_type:	set the flow type (IRQ_TYPE_LEVEL/etc.) of an IRQ
  * @irq_set_wake:	enable/disable power-management wake-on of an IRQ
@@ -503,6 +505,7 @@ struct irq_chip {
 	void		(*irq_eoi)(struct irq_data *data);
 
 	int		(*irq_set_affinity)(struct irq_data *data, const struct cpumask *dest, bool force);
+	void		(*irq_pre_redirect)(struct irq_data *data);
 	int		(*irq_retrigger)(struct irq_data *data);
 	int		(*irq_set_type)(struct irq_data *data, unsigned int flow_type);
 	int		(*irq_set_wake)(struct irq_data *data, unsigned int on);
@@ -688,6 +691,13 @@ extern int irq_chip_set_vcpu_affinity_parent(struct irq_data *data,
 extern int irq_chip_set_type_parent(struct irq_data *data, unsigned int type);
 extern int irq_chip_request_resources_parent(struct irq_data *data);
 extern void irq_chip_release_resources_parent(struct irq_data *data);
+#ifdef CONFIG_SMP
+void irq_chip_pre_redirect_parent(struct irq_data *data);
+#endif
+#endif
+
+#ifdef CONFIG_SMP
+int irq_chip_redirect_set_affinity(struct irq_data *data, const struct cpumask *dest, bool force);
 #endif
 
 /* Disable or mask interrupts during a kernel kexec */
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index fd091c35d5721..aeead91884668 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -2,9 +2,10 @@
 #ifndef _LINUX_IRQDESC_H
 #define _LINUX_IRQDESC_H
 
-#include <linux/rcupdate.h>
+#include <linux/irq_work.h>
 #include <linux/kobject.h>
 #include <linux/mutex.h>
+#include <linux/rcupdate.h>
 
 /*
  * Core internal functions to deal with irq descriptors
@@ -29,6 +30,11 @@ struct irqstat {
 #endif
 };
 
+struct irq_redirect {
+	struct irq_work	work;
+	unsigned int	fallback_cpu;
+};
+
 /**
  * struct irq_desc - interrupt descriptor
  * @irq_common_data:	per irq and chip data passed down to chip functions
@@ -46,6 +52,7 @@ struct irqstat {
  * @threads_handled:	stats field for deferred spurious detection of threaded handlers
  * @threads_handled_last: comparator field for deferred spurious detection of threaded handlers
  * @lock:		locking for SMP
+ * @redirect:		Facility for redirecting interrupts via irq_work
  * @affinity_hint:	hint to user space for preferred irq affinity
  * @affinity_notify:	context for notification of affinity changes
  * @pending_mask:	pending rebalanced interrupts
@@ -84,6 +91,7 @@ struct irq_desc {
 	struct cpumask		*percpu_enabled;
 	const struct cpumask	*percpu_affinity;
 #ifdef CONFIG_SMP
+	struct irq_redirect	redirect;
 	const struct cpumask	*affinity_hint;
 	struct irq_affinity_notify *affinity_notify;
 #ifdef CONFIG_GENERIC_PENDING_IRQ
@@ -186,6 +194,7 @@ int generic_handle_irq_safe(unsigned int irq);
 int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq);
 int generic_handle_domain_irq_safe(struct irq_domain *domain, unsigned int hwirq);
 int generic_handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq);
+bool generic_handle_demux_domain_irq(struct irq_domain *domain, unsigned int hwirq);
 #endif
 
 /* Test to see if a driver has successfully requested an irq */
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 3ffa0d80ddd19..b6b2ce29d0ce1 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -1143,7 +1143,7 @@ void irq_cpu_offline(void)
 }
 #endif
 
-#ifdef	CONFIG_IRQ_DOMAIN_HIERARCHY
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
 
 #ifdef CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS
 /**
@@ -1215,6 +1215,15 @@ EXPORT_SYMBOL_GPL(handle_fasteoi_mask_irq);
 
 #endif /* CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS */
 
+#ifdef CONFIG_SMP
+void irq_chip_pre_redirect_parent(struct irq_data *data)
+{
+	data = data->parent_data;
+	data->chip->irq_pre_redirect(data);
+}
+EXPORT_SYMBOL_GPL(irq_chip_pre_redirect_parent);
+#endif
+
 /**
  * irq_chip_set_parent_state - set the state of a parent interrupt.
  *
@@ -1497,6 +1506,17 @@ void irq_chip_release_resources_parent(struct irq_data *data)
 		data->chip->irq_release_resources(data);
 }
 EXPORT_SYMBOL_GPL(irq_chip_release_resources_parent);
+#endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */
+
+#ifdef CONFIG_SMP
+int irq_chip_redirect_set_affinity(struct irq_data *data, const struct cpumask *dest, bool force)
+{
+	struct irq_redirect *redir = &irq_data_to_desc(data)->redirect;
+
+	WRITE_ONCE(redir->fallback_cpu, cpumask_first(dest));
+	return IRQ_SET_MASK_OK;
+}
+EXPORT_SYMBOL_GPL(irq_chip_redirect_set_affinity);
 #endif
 
 /**
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index db714d3014b5f..d704025751315 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -78,8 +78,12 @@ static int alloc_masks(struct irq_desc *desc, int node)
 	return 0;
 }
 
-static void desc_smp_init(struct irq_desc *desc, int node,
-			  const struct cpumask *affinity)
+static void irq_redirect_work(struct irq_work *work)
+{
+	handle_irq_desc(container_of(work, struct irq_desc, redirect.work));
+}
+
+static void desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity)
 {
 	if (!affinity)
 		affinity = irq_default_affinity;
@@ -91,6 +95,7 @@ static void desc_smp_init(struct irq_desc *desc, int node,
 #ifdef CONFIG_NUMA
 	desc->irq_common_data.node = node;
 #endif
+	desc->redirect.work = IRQ_WORK_INIT_HARD(irq_redirect_work);
 }
 
 static void free_masks(struct irq_desc *desc)
@@ -766,6 +771,48 @@ int generic_handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq)
 	WARN_ON_ONCE(!in_nmi());
 	return handle_irq_desc(irq_resolve_mapping(domain, hwirq));
 }
+
+static bool demux_redirect_remote(struct irq_desc *desc)
+{
+#ifdef CONFIG_SMP
+	const struct cpumask *m = irq_data_get_effective_affinity_mask(&desc->irq_data);
+	unsigned int target_cpu = READ_ONCE(desc->redirect.fallback_cpu);
+
+	if (!cpumask_test_cpu(smp_processor_id(), m)) {
+		/* Protect against shutdown */
+		if (desc->action)
+			irq_work_queue_on(&desc->redirect.work, target_cpu);
+		return true;
+	}
+#endif
+	return false;
+}
+
+/**
+ * generic_handle_demux_domain_irq - Invoke the handler for a hardware interrupt
+ *				     of a demultiplexing domain.
+ * @domain:	The domain where to perform the lookup
+ * @hwirq:	The hardware interrupt number to convert to a logical one
+ *
+ * Returns:	True on success, or false if lookup has failed
+ */
+bool generic_handle_demux_domain_irq(struct irq_domain *domain, unsigned int hwirq)
+{
+	struct irq_desc *desc = irq_resolve_mapping(domain, hwirq);
+
+	if (unlikely(!desc))
+		return false;
+
+	scoped_guard(raw_spinlock, &desc->lock) {
+		if (desc->irq_data.chip->irq_pre_redirect)
+			desc->irq_data.chip->irq_pre_redirect(&desc->irq_data);
+		if (demux_redirect_remote(desc))
+			return true;
+	}
+	return !handle_irq_desc(desc);
+}
+EXPORT_SYMBOL_GPL(generic_handle_demux_domain_irq);
+
 #endif
 
 /* Dynamic interrupt handling */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index c94837382037e..ed8f8b2667b0b 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -35,6 +35,16 @@ static int __init setup_forced_irqthreads(char *arg)
 early_param("threadirqs", setup_forced_irqthreads);
 #endif
 
+#ifdef CONFIG_SMP
+static inline void synchronize_irqwork(struct irq_desc *desc)
+{
+	/* Synchronize pending or on the fly redirect work */
+	irq_work_sync(&desc->redirect.work);
+}
+#else
+static inline void synchronize_irqwork(struct irq_desc *desc) { }
+#endif
+
 static int __irq_get_irqchip_state(struct irq_data *d, enum irqchip_irq_state which, bool *state);
 
 static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip)
@@ -43,6 +53,8 @@ static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip)
 	bool inprogress;
 
 	do {
+		synchronize_irqwork(desc);
+
 		/*
 		 * Wait until we're out of the critical section.  This might
 		 * give the wrong answer due to the lack of memory barriers.
@@ -108,6 +120,7 @@ EXPORT_SYMBOL(synchronize_hardirq);
 static void __synchronize_irq(struct irq_desc *desc)
 {
 	__synchronize_hardirq(desc, true);
+
 	/*
 	 * We made sure that no hardirq handler is running. Now verify that no
 	 * threaded handlers are active.
@@ -217,8 +230,7 @@ static inline void irq_validate_effective_affinity(struct irq_data *data) { }
 
 static DEFINE_PER_CPU(struct cpumask, __tmp_mask);
 
-int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
-			bool force)
+int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
 {
 	struct cpumask *tmp_mask = this_cpu_ptr(&__tmp_mask);
 	struct irq_desc *desc = irq_data_to_desc(data);
-- 
2.51.0
Re: [PATCH v2 1/3] genirq: Add interrupt redirection infrastructure
Posted by Thomas Gleixner 2 months, 1 week ago
On Mon, Oct 06 2025 at 18:38, Radu Rendec wrote:
> +
> +static bool demux_redirect_remote(struct irq_desc *desc)
> +{
> +#ifdef CONFIG_SMP

Please have a function and a stub for the !SMP case.
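
I.e. something like this (completely untested, body lifted from above):

#ifdef CONFIG_SMP
static bool demux_redirect_remote(struct irq_desc *desc)
{
	const struct cpumask *m = irq_data_get_effective_affinity_mask(&desc->irq_data);

	if (!cpumask_test_cpu(smp_processor_id(), m)) {
		/* Protect against shutdown */
		if (desc->action)
			irq_work_queue_on(&desc->redirect.work,
					  READ_ONCE(desc->redirect.fallback_cpu));
		return true;
	}
	return false;
}
#else
static inline bool demux_redirect_remote(struct irq_desc *desc)
{
	return false;
}
#endif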

> +	const struct cpumask *m = irq_data_get_effective_affinity_mask(&desc->irq_data);
> +	unsigned int target_cpu = READ_ONCE(desc->redirect.fallback_cpu);

What does fallback_cpu mean in this context? That's confusing at best.

> +	if (!cpumask_test_cpu(smp_processor_id(), m)) {

Why cpumask_test?

    if (target_cpu != smp_processor_id())

should be good enough and understandable :)

> +		/* Protect against shutdown */
> +		if (desc->action)
> +			irq_work_queue_on(&desc->redirect.work, target_cpu);

Can you please document why this is correct vs. CPU hotplug (especially unplug)?

I think it is, but I didn't look too carefully.

> +/**
> + * generic_handle_demux_domain_irq - Invoke the handler for a hardware interrupt
> + *				     of a demultiplexing domain.
> + * @domain:	The domain where to perform the lookup
> + * @hwirq:	The hardware interrupt number to convert to a logical one
> + *
> + * Returns:	True on success, or false if lookup has failed
> + */
> +bool generic_handle_demux_domain_irq(struct irq_domain *domain, unsigned int hwirq)
> +{
> +	struct irq_desc *desc = irq_resolve_mapping(domain, hwirq);
> +
> +	if (unlikely(!desc))
> +		return false;
> +
> +	scoped_guard(raw_spinlock, &desc->lock) {
> +		if (desc->irq_data.chip->irq_pre_redirect)
> +			desc->irq_data.chip->irq_pre_redirect(&desc->irq_data);

I'd rather see that in the redirect function above.

> +		if (demux_redirect_remote(desc))
> +			return true;
> +	}
> +	return !handle_irq_desc(desc);
> +}
> +EXPORT_SYMBOL_GPL(generic_handle_demux_domain_irq);
> +
>  #endif
>  
>  /* Dynamic interrupt handling */
> diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
> index c94837382037e..ed8f8b2667b0b 100644
> --- a/kernel/irq/manage.c
> +++ b/kernel/irq/manage.c
> @@ -35,6 +35,16 @@ static int __init setup_forced_irqthreads(char *arg)
>  early_param("threadirqs", setup_forced_irqthreads);
>  #endif
>  
> +#ifdef CONFIG_SMP
> +static inline void synchronize_irqwork(struct irq_desc *desc)
> +{
> +	/* Synchronize pending or on the fly redirect work */
> +	irq_work_sync(&desc->redirect.work);
> +}
> +#else
> +static inline void synchronize_irqwork(struct irq_desc *desc) { }
> +#endif
> +
>  static int __irq_get_irqchip_state(struct irq_data *d, enum irqchip_irq_state which, bool *state);
>  
>  static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip)
> @@ -43,6 +53,8 @@ static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip)
>  	bool inprogress;
>  
>  	do {
> +		synchronize_irqwork(desc);

That can't work. irq_work_sync() requires interrupts and preemption to
be enabled, but __synchronize_hardirq() can be invoked from interrupt or
preemption-disabled context.

And it's not required at all, because this is no different from a
hardware device interrupt: either it is already being handled
(IRQ_INPROGRESS) on some other CPU, or it is not. That code cannot
anticipate an interrupt that is still in flight somewhere in the system
and has not yet been raised and handled on a CPU.

Though you want to invoke it in __synchronize_irq() _before_ invoking
__synchronize_hardirq().
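
I.e. something like this (untested):

static void __synchronize_irq(struct irq_desc *desc)
{
	/* Flush pending or in-flight redirect work first */
	synchronize_irqwork(desc);
	__synchronize_hardirq(desc, true);
	...
}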

Thanks,

        tglx
Re: [PATCH v2 1/3] genirq: Add interrupt redirection infrastructure
Posted by Radu Rendec 2 months, 1 week ago
On Tue, 2025-10-07 at 22:04 +0200, Thomas Gleixner wrote:
> On Mon, Oct 06 2025 at 18:38, Radu Rendec wrote:
> > +
> > +static bool demux_redirect_remote(struct irq_desc *desc)
> > +{
> > +#ifdef CONFIG_SMP
> 
> Please have a function and a stub for the !SMP case.

Will do.

> > +	const struct cpumask *m = irq_data_get_effective_affinity_mask(&desc->irq_data);
> > +	unsigned int target_cpu = READ_ONCE(desc->redirect.fallback_cpu);
> 
> What does fallback_cpu mean in this context? That's confusing at best.

Please see below.

> > +	if (!cpumask_test_cpu(smp_processor_id(), m)) {
> 
> Why cpumask_test?
> 
>     if (target_cpu != smp_processor_id())
> 
> should be good enough and understandable :)

This is where I deviated from your initial implementation, and I tried
to explain it in the cover letter (text reproduced below).

  | Instead of keeping track of the parent interrupt's affinity setting (or
  | rather the first CPU in its affinity mask) and attempting to pick the
  | same CPU for the child (as the target CPU) if possible, I just check if
  | the child handler fires on a CPU that's part of its affinity mask (which
  | is already stored anyway). As an optimization for the case when the
  | current CPU is *not* part of the mask and the handler needs to be
  | redirected, I pre-calculate and store the first CPU in the mask, at the
  | time when the child affinity is set. In my opinion, this is simpler and
  | cleaner, at the expense of a cpumask_test_cpu() call on the fast path,
  | because:
  | - It no longer needs to keep track of the parent interrupt's affinity
  |   setting.
  | - If the parent interrupt can run on more than one CPU, the child can
  |   also run on any of those CPUs without being redirected (in case the
  |   child's affinity mask is the same as the parent's or a superset).

Let me provide an example.
- parent affinity is set to 0,1,2,3
- child affinity is set to 2,3
- parent (hardware) interrupt runs on cpu 3

In the original implementation, the child target would be pre-calculated 
as cpu 2, and therefore in the scenario above, the child would be
redirected to cpu 2. But in reality there's no need to redirect because
cpu 3 is part of the child's affinity mask.
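
In code terms (just an illustration, with the numbers from the example):

	/* this series: cpu 3 is in the child mask {2,3} -> handled locally */
	cpumask_test_cpu(3, child_mask);	/* true, no redirect */

	/* original approach: pre-calculated target is the first CPU, cpu 2 */
	/* 2 != 3 -> redirected to cpu 2, although cpu 3 would have been fine */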

Now, to answer your previous question, "fallback_cpu" means the cpu
that we redirect to in case the current cpu is not part of the child's
affinity mask.
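
Maybe a comment on the field would make that clearer, e.g.:

struct irq_redirect {
	struct irq_work	work;
	/*
	 * First CPU in the interrupt's affinity mask, pre-calculated when
	 * the affinity is set. Used as the redirect target when the handler
	 * fires on a CPU that is not in the mask.
	 */
	unsigned int	fallback_cpu;
};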

> > +		/* Protect against shutdown */
> > +		if (desc->action)
> > +			irq_work_queue_on(&desc->redirect.work, target_cpu);
> 
> Can you please document why this is correct vs. CPU hotplug (especially unplug)?
> 
> I think it is, but I didn't look too carefully.

Sure. To be honest, I left this unchanged from your original version and
didn't give it much thought. I'll look closer and document it in the
next version.

> > +/**
> > + * generic_handle_demux_domain_irq - Invoke the handler for a hardware interrupt
> > + *				     of a demultiplexing domain.
> > + * @domain:	The domain where to perform the lookup
> > + * @hwirq:	The hardware interrupt number to convert to a logical one
> > + *
> > + * Returns:	True on success, or false if lookup has failed
> > + */
> > +bool generic_handle_demux_domain_irq(struct irq_domain *domain, unsigned int hwirq)
> > +{
> > +	struct irq_desc *desc = irq_resolve_mapping(domain, hwirq);
> > +
> > +	if (unlikely(!desc))
> > +		return false;
> > +
> > +	scoped_guard(raw_spinlock, &desc->lock) {
> > +		if (desc->irq_data.chip->irq_pre_redirect)
> > +			desc->irq_data.chip->irq_pre_redirect(&desc->irq_data);
> 
> I'd rather see that in the redirect function above.

What exactly? The scoped_guard() and the call to irq_pre_redirect()? If
demux_redirect_remote() becomes a stub when CONFIG_SMP is not defined,
then irq_pre_redirect() will not be called either, even if it's set in
struct irq_chip.

Now, I don't see any reason why irq_pre_redirect would be set for the
non-SMP case, and in fact it isn't if you look at (currently) the only
implementation, which is dwc PCI (patch 3). Redirection just doesn't
make sense if you have only one cpu.

Perhaps (struct irq_chip).irq_pre_redirect should not even exist (as a
field in the structure) unless CONFIG_SMP is defined?
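
I.e. something like:

struct irq_chip {
	...
#ifdef CONFIG_SMP
	void		(*irq_pre_redirect)(struct irq_data *data);
#endif
	...
};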

> > +		if (demux_redirect_remote(desc))
> > +			return true;
> > +	}
> > +	return !handle_irq_desc(desc);
> > +}
> > +EXPORT_SYMBOL_GPL(generic_handle_demux_domain_irq);
> > +
> >  #endif
> >  
> >  /* Dynamic interrupt handling */
> > diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
> > index c94837382037e..ed8f8b2667b0b 100644
> > --- a/kernel/irq/manage.c
> > +++ b/kernel/irq/manage.c
> > @@ -35,6 +35,16 @@ static int __init setup_forced_irqthreads(char *arg)
> >  early_param("threadirqs", setup_forced_irqthreads);
> >  #endif
> >  
> > +#ifdef CONFIG_SMP
> > +static inline void synchronize_irqwork(struct irq_desc *desc)
> > +{
> > +	/* Synchronize pending or on the fly redirect work */
> > +	irq_work_sync(&desc->redirect.work);
> > +}
> > +#else
> > +static inline void synchronize_irqwork(struct irq_desc *desc) { }
> > +#endif
> > +
> >  static int __irq_get_irqchip_state(struct irq_data *d, enum irqchip_irq_state which, bool *state);
> >  
> >  static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip)
> > @@ -43,6 +53,8 @@ static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip)
> >  	bool inprogress;
> >  
> >  	do {
> > +		synchronize_irqwork(desc);
> 
> That can't work. irq_work_sync() requires interrupts and preemption to
> be enabled, but __synchronize_hardirq() can be invoked from interrupt or
> preemption-disabled context.
> 
> And it's not required at all, because this is no different from a
> hardware device interrupt: either it is already being handled
> (IRQ_INPROGRESS) on some other CPU, or it is not. That code cannot
> anticipate an interrupt that is still in flight somewhere in the system
> and has not yet been raised and handled on a CPU.
> 
> Though you want to invoke it in __synchronize_irq() _before_ invoking
> __synchronize_hardirq().

Same comment as before: I left this unchanged from your original
version and didn't give it much thought. This is definitely one of
those "back to the drawing board" cases. Thanks for the guidance, and I
will give it more thought and address it in the next version.

-- 

Thanks a lot for reviewing!

Best regards,
Radu