[v3] Enable MSI affinity support for dwc PCI

[PATCH v3 3/3] PCI: dwc: Enable MSI affinity support

Posted by Radu Rendec 2 months, 1 week ago

Leverage the interrupt redirection infrastructure to enable CPU affinity
support for MSI interrupts. Since the parent interrupt affinity cannot
be changed, affinity control for the child interrupt (MSI) is achieved
by redirecting the handler to run in IRQ work context on the target CPU.

This patch was originally prepared by Thomas Gleixner (see Link tag
below) in a patch series that was never submitted as is, and only
parts of that series have made it upstream so far.

Originally-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/linux-pci/878qpg4o4t.ffs@tglx/
Signed-off-by: Radu Rendec <rrendec@redhat.com>
---
 .../pci/controller/dwc/pcie-designware-host.c | 33 ++++++++++++++++---
 1 file changed, 28 insertions(+), 5 deletions(-)

diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c
index aa93acaa579a5..90d9cb45e7842 100644
--- a/drivers/pci/controller/dwc/pcie-designware-host.c
+++ b/drivers/pci/controller/dwc/pcie-designware-host.c
@@ -26,9 +26,27 @@ static struct pci_ops dw_pcie_ops;
 static struct pci_ops dw_pcie_ecam_ops;
 static struct pci_ops dw_child_pcie_ops;
 
+#ifdef CONFIG_SMP
+static void dw_irq_noop(struct irq_data *d) { }
+#endif
+
+static bool dw_pcie_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
+				      struct irq_domain *real_parent, struct msi_domain_info *info)
+{
+	if (!msi_lib_init_dev_msi_info(dev, domain, real_parent, info))
+		return false;
+
+#ifdef CONFIG_SMP
+	info->chip->irq_ack = dw_irq_noop;
+	info->chip->irq_pre_redirect = irq_chip_pre_redirect_parent;
+#else
+	info->chip->irq_ack = irq_chip_ack_parent;
+#endif
+	return true;
+}
+
 #define DW_PCIE_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS		| \
 				    MSI_FLAG_USE_DEF_CHIP_OPS		| \
-				    MSI_FLAG_NO_AFFINITY		| \
 				    MSI_FLAG_PCI_MSI_MASK_PARENT)
 #define DW_PCIE_MSI_FLAGS_SUPPORTED (MSI_FLAG_MULTI_PCI_MSI		| \
 				     MSI_FLAG_PCI_MSIX			| \
@@ -40,9 +58,8 @@ static const struct msi_parent_ops dw_pcie_msi_parent_ops = {
 	.required_flags		= DW_PCIE_MSI_FLAGS_REQUIRED,
 	.supported_flags	= DW_PCIE_MSI_FLAGS_SUPPORTED,
 	.bus_select_token	= DOMAIN_BUS_PCI_MSI,
-	.chip_flags		= MSI_CHIP_FLAG_SET_ACK,
 	.prefix			= "DW-",
-	.init_dev_msi_info	= msi_lib_init_dev_msi_info,
+	.init_dev_msi_info	= dw_pcie_init_dev_msi_info,
 };
 
 /* MSI int handler */
@@ -63,7 +80,7 @@ void dw_handle_msi_irq(struct dw_pcie_rp *pp)
 			continue;
 
 		for_each_set_bit(pos, &status, MAX_MSI_IRQS_PER_CTRL)
-			generic_handle_domain_irq(pp->irq_domain, irq_off + pos);
+			generic_handle_demux_domain_irq(pp->irq_domain, irq_off + pos);
 	}
 }
 
@@ -140,10 +157,16 @@ static void dw_pci_bottom_ack(struct irq_data *d)
 
 static struct irq_chip dw_pci_msi_bottom_irq_chip = {
 	.name			= "DWPCI-MSI",
-	.irq_ack		= dw_pci_bottom_ack,
 	.irq_compose_msi_msg	= dw_pci_setup_msi_msg,
 	.irq_mask		= dw_pci_bottom_mask,
 	.irq_unmask		= dw_pci_bottom_unmask,
+#ifdef CONFIG_SMP
+	.irq_ack		= dw_irq_noop,
+	.irq_pre_redirect	= dw_pci_bottom_ack,
+	.irq_set_affinity	= irq_chip_redirect_set_affinity,
+#else
+	.irq_ack		= dw_pci_bottom_ack,
+#endif
 };
 
 static int dw_pcie_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
-- 
2.51.1

Re: [PATCH v3 3/3] PCI: dwc: Enable MSI affinity support

Posted by Jon Hunter 2 weeks, 5 days ago

Hi Radu,

On 28/11/2025 21:20, Radu Rendec wrote:
> Leverage the interrupt redirection infrastructure to enable CPU affinity
> support for MSI interrupts. Since the parent interrupt affinity cannot
> be changed, affinity control for the child interrupt (MSI) is achieved
> by redirecting the handler to run in IRQ work context on the target CPU.
> 
> This patch was originally prepared by Thomas Gleixner (see Link tag
> below) in a patch series that was never submitted as is, and only
> parts of that series have made it upstream so far.
> 
> Originally-by: Thomas Gleixner <tglx@linutronix.de>
> Link: https://lore.kernel.org/linux-pci/878qpg4o4t.ffs@tglx/
> Signed-off-by: Radu Rendec <rrendec@redhat.com>
> ---
>   .../pci/controller/dwc/pcie-designware-host.c | 33 ++++++++++++++++---
>   1 file changed, 28 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c
> index aa93acaa579a5..90d9cb45e7842 100644
> --- a/drivers/pci/controller/dwc/pcie-designware-host.c
> +++ b/drivers/pci/controller/dwc/pcie-designware-host.c
> @@ -26,9 +26,27 @@ static struct pci_ops dw_pcie_ops;
>   static struct pci_ops dw_pcie_ecam_ops;
>   static struct pci_ops dw_child_pcie_ops;
>   
> +#ifdef CONFIG_SMP
> +static void dw_irq_noop(struct irq_data *d) { }
> +#endif
> +
> +static bool dw_pcie_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
> +				      struct irq_domain *real_parent, struct msi_domain_info *info)
> +{
> +	if (!msi_lib_init_dev_msi_info(dev, domain, real_parent, info))
> +		return false;
> +
> +#ifdef CONFIG_SMP
> +	info->chip->irq_ack = dw_irq_noop;
> +	info->chip->irq_pre_redirect = irq_chip_pre_redirect_parent;
> +#else
> +	info->chip->irq_ack = irq_chip_ack_parent;
> +#endif
> +	return true;
> +}
> +
>   #define DW_PCIE_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS		| \
>   				    MSI_FLAG_USE_DEF_CHIP_OPS		| \
> -				    MSI_FLAG_NO_AFFINITY		| \
>   				    MSI_FLAG_PCI_MSI_MASK_PARENT)
>   #define DW_PCIE_MSI_FLAGS_SUPPORTED (MSI_FLAG_MULTI_PCI_MSI		| \
>   				     MSI_FLAG_PCI_MSIX			| \
> @@ -40,9 +58,8 @@ static const struct msi_parent_ops dw_pcie_msi_parent_ops = {
>   	.required_flags		= DW_PCIE_MSI_FLAGS_REQUIRED,
>   	.supported_flags	= DW_PCIE_MSI_FLAGS_SUPPORTED,
>   	.bus_select_token	= DOMAIN_BUS_PCI_MSI,
> -	.chip_flags		= MSI_CHIP_FLAG_SET_ACK,
>   	.prefix			= "DW-",
> -	.init_dev_msi_info	= msi_lib_init_dev_msi_info,
> +	.init_dev_msi_info	= dw_pcie_init_dev_msi_info,
>   };
>   
>   /* MSI int handler */
> @@ -63,7 +80,7 @@ void dw_handle_msi_irq(struct dw_pcie_rp *pp)
>   			continue;
>   
>   		for_each_set_bit(pos, &status, MAX_MSI_IRQS_PER_CTRL)
> -			generic_handle_domain_irq(pp->irq_domain, irq_off + pos);
> +			generic_handle_demux_domain_irq(pp->irq_domain, irq_off + pos);
>   	}
>   }
>   
> @@ -140,10 +157,16 @@ static void dw_pci_bottom_ack(struct irq_data *d)
>   
>   static struct irq_chip dw_pci_msi_bottom_irq_chip = {
>   	.name			= "DWPCI-MSI",
> -	.irq_ack		= dw_pci_bottom_ack,
>   	.irq_compose_msi_msg	= dw_pci_setup_msi_msg,
>   	.irq_mask		= dw_pci_bottom_mask,
>   	.irq_unmask		= dw_pci_bottom_unmask,
> +#ifdef CONFIG_SMP
> +	.irq_ack		= dw_irq_noop,
> +	.irq_pre_redirect	= dw_pci_bottom_ack,
> +	.irq_set_affinity	= irq_chip_redirect_set_affinity,
> +#else
> +	.irq_ack		= dw_pci_bottom_ack,
> +#endif
>   };
>   
>   static int dw_pcie_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,

I am seeing another issue with this patch. On the Tegra194 AGX Xavier 
platform suspend is failing and reverting this patch fixes the problem.

Unfortunately the logs don't tell me much. In a bad case I see ...

  PM: suspend entry (deep)
  Filesystems sync: 0.000 seconds
  Freezing user space processes
  Freezing user space processes completed (elapsed 0.002 seconds)
  OOM killer disabled.
  Freezing remaining freezable tasks
  Freezing remaining freezable tasks completed (elapsed 0.001 seconds)
  tegra-xusb 3610000.usb: Firmware timestamp: 2020-09-11 16:55:03 UTC
  dwc-eth-dwmac 2490000.ethernet eth0: Link is Down
  tegra194-pcie 14100000.pcie: Link didn't transition to L2 state
  Disabling non-boot CPUs ...

It appears to hang here. In a good case I see ...

  PM: suspend entry (deep)
  Filesystems sync: 0.000 seconds
  Freezing user space processes
  Freezing user space processes completed (elapsed 0.002 seconds)
  OOM killer disabled.
  Freezing remaining freezable tasks
  Freezing remaining freezable tasks completed (elapsed 0.001 seconds)
  tegra-xusb 3610000.usb: Firmware timestamp: 2020-09-11 16:55:03 UTC
  dwc-eth-dwmac 2490000.ethernet eth0: Link is Down
  tegra194-pcie 14100000.pcie: Link didn't transition to L2 state
  Disabling non-boot CPUs ...
  psci: CPU7 killed (polled 0 ms)
  psci: CPU6 killed (polled 4 ms)
  psci: CPU5 killed (polled 0 ms)
  psci: CPU4 killed (polled 4 ms)
  psci: CPU3 killed (polled 4 ms)
  psci: CPU2 killed (polled 0 ms)
  psci: CPU1 killed (polled 0 ms)
  ...
  Enabling non-boot CPUs ... (resume starts)

So it looks like it is hanging when disabling the non-boot CPUs. So far 
it only appears to happen on Tegra194.

Let me know if you have any suggestions.

Thanks
Jon

-- 
nvpublic

Re: [PATCH v3 3/3] PCI: dwc: Enable MSI affinity support

Posted by Radu Rendec 2 weeks, 5 days ago

Hi Jon,

On Tue, 2026-01-20 at 18:01 +0000, Jon Hunter wrote:
> On 28/11/2025 21:20, Radu Rendec wrote:
> > Leverage the interrupt redirection infrastructure to enable CPU affinity
> > support for MSI interrupts. Since the parent interrupt affinity cannot
> > be changed, affinity control for the child interrupt (MSI) is achieved
> > by redirecting the handler to run in IRQ work context on the target CPU.
> > 
> > This patch was originally prepared by Thomas Gleixner (see Link tag
> > below) in a patch series that was never submitted as is, and only
> > parts of that series have made it upstream so far.
> > 
> > Originally-by: Thomas Gleixner <tglx@linutronix.de>
> > Link: https://lore.kernel.org/linux-pci/878qpg4o4t.ffs@tglx/
> > Signed-off-by: Radu Rendec <rrendec@redhat.com>
> > ---
> >   .../pci/controller/dwc/pcie-designware-host.c | 33 ++++++++++++++++---
> >   1 file changed, 28 insertions(+), 5 deletions(-)
> > 
> > diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c
> > index aa93acaa579a5..90d9cb45e7842 100644
> > --- a/drivers/pci/controller/dwc/pcie-designware-host.c
> > +++ b/drivers/pci/controller/dwc/pcie-designware-host.c
> > @@ -26,9 +26,27 @@ static struct pci_ops dw_pcie_ops;
> >   static struct pci_ops dw_pcie_ecam_ops;
> >   static struct pci_ops dw_child_pcie_ops;
> >   
> > +#ifdef CONFIG_SMP
> > +static void dw_irq_noop(struct irq_data *d) { }
> > +#endif
> > +
> > +static bool dw_pcie_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
> > +       struct irq_domain *real_parent, struct msi_domain_info *info)
> > +{
> > + if (!msi_lib_init_dev_msi_info(dev, domain, real_parent, info))
> > + return false;
> > +
> > +#ifdef CONFIG_SMP
> > + info->chip->irq_ack = dw_irq_noop;
> > + info->chip->irq_pre_redirect = irq_chip_pre_redirect_parent;
> > +#else
> > + info->chip->irq_ack = irq_chip_ack_parent;
> > +#endif
> > + return true;
> > +}
> > +
> >   #define DW_PCIE_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | \
> >        MSI_FLAG_USE_DEF_CHIP_OPS | \
> > -     MSI_FLAG_NO_AFFINITY | \
> >        MSI_FLAG_PCI_MSI_MASK_PARENT)
> >   #define DW_PCIE_MSI_FLAGS_SUPPORTED (MSI_FLAG_MULTI_PCI_MSI | \
> >         MSI_FLAG_PCI_MSIX | \
> > @@ -40,9 +58,8 @@ static const struct msi_parent_ops dw_pcie_msi_parent_ops = {
> >    .required_flags = DW_PCIE_MSI_FLAGS_REQUIRED,
> >    .supported_flags = DW_PCIE_MSI_FLAGS_SUPPORTED,
> >    .bus_select_token = DOMAIN_BUS_PCI_MSI,
> > - .chip_flags = MSI_CHIP_FLAG_SET_ACK,
> >    .prefix = "DW-",
> > - .init_dev_msi_info = msi_lib_init_dev_msi_info,
> > + .init_dev_msi_info = dw_pcie_init_dev_msi_info,
> >   };
> >   
> >   /* MSI int handler */
> > @@ -63,7 +80,7 @@ void dw_handle_msi_irq(struct dw_pcie_rp *pp)
> >    continue;
> >   
> >    for_each_set_bit(pos, &status, MAX_MSI_IRQS_PER_CTRL)
> > - generic_handle_domain_irq(pp->irq_domain, irq_off + pos);
> > + generic_handle_demux_domain_irq(pp->irq_domain, irq_off + pos);
> >    }
> >   }
> >   
> > @@ -140,10 +157,16 @@ static void dw_pci_bottom_ack(struct irq_data *d)
> >   
> >   static struct irq_chip dw_pci_msi_bottom_irq_chip = {
> >    .name = "DWPCI-MSI",
> > - .irq_ack = dw_pci_bottom_ack,
> >    .irq_compose_msi_msg = dw_pci_setup_msi_msg,
> >    .irq_mask = dw_pci_bottom_mask,
> >    .irq_unmask = dw_pci_bottom_unmask,
> > +#ifdef CONFIG_SMP
> > + .irq_ack = dw_irq_noop,
> > + .irq_pre_redirect = dw_pci_bottom_ack,
> > + .irq_set_affinity = irq_chip_redirect_set_affinity,
> > +#else
> > + .irq_ack = dw_pci_bottom_ack,
> > +#endif
> >   };
> >   
> >   static int dw_pcie_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
> 
> 
> I am seeing another issue with this patch. On the Tegra194 AGX Xavier
> platform suspend is failing and reverting this patch fixes the problem.
> 
> Unfortunately the logs don't tell me much. In a bad case I see ...
> 
>   PM: suspend entry (deep)
>   Filesystems sync: 0.000 seconds
>   Freezing user space processes
>   Freezing user space processes completed (elapsed 0.002 seconds)
>   OOM killer disabled.
>   Freezing remaining freezable tasks
>   Freezing remaining freezable tasks completed (elapsed 0.001 seconds)
>   tegra-xusb 3610000.usb: Firmware timestamp: 2020-09-11 16:55:03 UTC
>   dwc-eth-dwmac 2490000.ethernet eth0: Link is Down
>   tegra194-pcie 14100000.pcie: Link didn't transition to L2 state
>   Disabling non-boot CPUs ...
> 
> It appears to hang here. In a good case I see ...
> 
>   PM: suspend entry (deep)
>   Filesystems sync: 0.000 seconds
>   Freezing user space processes
>   Freezing user space processes completed (elapsed 0.002 seconds)
>   OOM killer disabled.
>   Freezing remaining freezable tasks
>   Freezing remaining freezable tasks completed (elapsed 0.001 seconds)
>   tegra-xusb 3610000.usb: Firmware timestamp: 2020-09-11 16:55:03 UTC
>   dwc-eth-dwmac 2490000.ethernet eth0: Link is Down
>   tegra194-pcie 14100000.pcie: Link didn't transition to L2 state
>   Disabling non-boot CPUs ...
>   psci: CPU7 killed (polled 0 ms)
>   psci: CPU6 killed (polled 4 ms)
>   psci: CPU5 killed (polled 0 ms)
>   psci: CPU4 killed (polled 4 ms)
>   psci: CPU3 killed (polled 4 ms)
>   psci: CPU2 killed (polled 0 ms)
>   psci: CPU1 killed (polled 0 ms)
>   ...
>   Enabling non-boot CPUs ... (resume starts)
> 
> So it looks like it is hanging when disabling the non-boot CPUs. So far 
> it only appears to happen on Tegra194.
> 
> Let me know if you have any suggestions.

Ouch. I'm afraid this is going to be much harder to figure out than the
previous one, especially since I can't get access easily to a board to
test on. I will try to reserve a board and reproduce the bug.

Meanwhile, if you (or someone else in your team) can spare a few cycles,
could you please try to reproduce the bug again with the debug patch
below applied, and a few other changes:
 * enable debug messages in kernel/irq/cpuhotplug.c;
 * save the contents of /proc/interrupts to a file before suspending;
 * add "no_console_suspend" to the kernel command line (although it
   looks like you already have it).

It will be much more verbose during suspend but hopefully we can at
least figure out how far along it goes and how it's related to the MSI
affinity configuration.

Thanks,
Radu

---

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 84cc4bea773c0..62ae76661f26d 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -1492,6 +1492,8 @@ int irq_chip_redirect_set_affinity(struct irq_data *data, const struct cpumask *
 {
 struct irq_redirect *redir = &irq_data_to_desc(data)->redirect;
 
+ pr_info("%s: irq %u mask 0x%*pb\n", __func__, data->irq, cpumask_pr_args(dest));
+
 WRITE_ONCE(redir->target_cpu, cpumask_first(dest));
 irq_data_update_effective_affinity(data, dest);
 
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index cd5689e383b00..d8c62547f9d06 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -59,6 +59,8 @@ static bool migrate_one_irq(struct irq_desc *desc)
 bool brokeaff = false;
 int err;
 
+ pr_info("%s: irq %u cpu %u\n", __func__, d->irq, smp_processor_id());
+
 /*
 * IRQ chip might be already torn down, but the irq descriptor is
 * still in the radix tree. Also if the chip has no affinity setter,
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 3fe6b0c99f3d8..94bd7ad64c9b7 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -227,6 +227,7 @@ static int multi_cpu_stop(void *data)
 stop_machine_yield(cpumask);
 newstate = READ_ONCE(msdata->state);
 if (newstate != curstate) {
+ pr_info("%s: cpu %d entering state %d\n", __func__, cpu, newstate);
 curstate = newstate;
 switch (curstate) {
 case MULTI_STOP_DISABLE_IRQ:

Re: [PATCH v3 3/3] PCI: dwc: Enable MSI affinity support

Posted by Jon Hunter 2 weeks, 4 days ago

On 20/01/2026 22:30, Radu Rendec wrote:

...

>> So it looks like it is hanging when disabling the non-boot CPUs. So far
>> it only appears to happen on Tegra194.
>>
>> Let me know if you have any suggestions.
> 
> Ouch. I'm afraid this is going to be much harder to figure out than the
> previous one, especially since I can't get access easily to a board to
> test on. I will try to reserve a board and reproduce the bug.
> 
> Meanwhile, if you (or someone else in your team) can spare a few cycles,
> could you please try to reproduce the bug again with the debug patch
> below applied, and a few other changes:
>   * enable debug messages in kernel/irq/cpuhotplug.c;
>   * save the contents of /proc/interrupts to a file before suspending;
>   * add "no_console_suspend" to the kernel command line (although it
>     looks like you already have it).
> 
> It will be much more verbose during suspend but hopefully we can at
> least figure out how far along it goes and how it's related to the MSI
> affinity configuration.


Thanks. I have dumped the boot log with the prints here:

https://pastebin.com/G8c2ssdt

And the dump of /proc/interrupts here:

https://pastebin.com/Wqzxw3r6

Looks like the last thing I see entering suspend is ...

  irq_chip_redirect_set_affinity: irq 162 mask 0x7f

That appears to be a PCIe interrupt. Let me know if there are more tests 
I can run.

Cheers
Jon

-- 
nvpublic

Re: [PATCH v3 3/3] PCI: dwc: Enable MSI affinity support

Posted by Radu Rendec 2 weeks, 2 days ago

Hi Jon,

On Wed, 2026-01-21 at 14:00 +0000, Jon Hunter wrote:
> 
> On 20/01/2026 22:30, Radu Rendec wrote:
> 
> ...
> 
> > > So it looks like it is hanging when disabling the non-boot CPUs. So far
> > > it only appears to happen on Tegra194.
> > > 
> > > Let me know if you have any suggestions.
> > 
> > Ouch. I'm afraid this is going to be much harder to figure out than the
> > previous one, especially since I can't get access easily to a board to
> > test on. I will try to reserve a board and reproduce the bug.
> > 
> > Meanwhile, if you (or someone else in your team) can spare a few cycles,
> > could you please try to reproduce the bug again with the debug patch
> > below applied, and a few other changes:
> >   * enable debug messages in kernel/irq/cpuhotplug.c;
> >   * save the contents of /proc/interrupts to a file before suspending;
> >   * add "no_console_suspend" to the kernel command line (although it
> >     looks like you already have it).
> > 
> > It will be much more verbose during suspend but hopefully we can at
> > least figure out how far along it goes and how it's related to the MSI
> > affinity configuration.
> 
> 
> Thanks. I have dumped the boot log with the prints here:
> 
> https://pastebin.com/G8c2ssdt
> 
> And the dump of /proc/interrupts here:
> 
> https://pastebin.com/Wqzxw3r6
> 
> Looks like the last thing I see entering suspend is ...
> 
>   irq_chip_redirect_set_affinity: irq 162 mask 0x7f
> 
> That appears to be a PCIe interrupt. Let me know if there are more tests 
> I can run.

Thanks very much for running the test and for the logs. The good news
is good ol' printk debugging seems to be working, and the last message
in the log is indeed related to dw-pci irq affinity control, which is
what the patch touches. So we're on to something. The bad news is I
can't yet figure out what's wrong.

The CPUs are taken offline one by one, starting with CPU 7. The code in
question runs on the dying CPU, and with hardware interrupts disabled
on all CPUs. The (simplified) call stack looks like this:

irq_migrate_all_off_this_cpu
  for_each_active_irq
    migrate_one_irq
      irq_do_set_affinity
        irq_chip_redirect_set_affinity (via chip->irq_set_affinity)

The debug patch I gave you adds:
 * a printk to irq_chip_redirect_set_affinity (which is very small)
 * a printk at the beginning of migrate_one_irq

Also, the call to irq_do_set_affinity is almost the last thing that
happens in migrate_one_irq, and that for_each_active_irq loop is quite
small too. So, there isn't much happening between the printk in
irq_chip_redirect_set_affinity for the msi irq (which we do see in the
log) and the printk in migrate_one_irq for the next irq (which we don't
see).

My first thought is to add more printk's between those two and narrow
down the spot where it gets stuck.

I think the fastest way to debug it is if I can test myself. I tried to
reproduce the issue on a Jetson AGX Orin, and I couldn't. By the way,
how often does it hang? e.g., out of say 10 suspend attempts, how many
fail?

I do have access to a Jetson Xavier NX (in theory) but it looks like
there's a lab issue with that board, which hopefully gets sorted out
tomorrow. If I can't get a hold of that board (or can't reproduce the
problem on it), I may ask you to try a few other things. In any case,
I'll update this thread again either tomorrow or (more likely) early
next week.

-- 
Thanks,
Radu

Re: [PATCH v3 3/3] PCI: dwc: Enable MSI affinity support

Posted by Thomas Gleixner 1 week, 6 days ago

On Thu, Jan 22 2026 at 18:31, Radu Rendec wrote:
> The CPUs are taken offline one by one, starting with CPU 7. The code in
> question runs on the dying CPU, and with hardware interrupts disabled
> on all CPUs. The (simplified) call stack looks like this:
>
> irq_migrate_all_off_this_cpu
>   for_each_active_irq
>     migrate_one_irq
>       irq_do_set_affinity
>         irq_chip_redirect_set_affinity (via chip->irq_set_affinity)
>
> The debug patch I gave you adds:
>  * a printk to irq_chip_redirect_set_affinity (which is very small)
>  * a printk at the beginning of migrate_one_irq
>
> Also, the call to irq_do_set_affinity is almost the last thing that
> happens in migrate_one_irq, and that for_each_active_irq loop is quite
> small too. So, there isn't much happening between the printk in
> irq_chip_redirect_set_affinity for the msi irq (which we do see in the
> log) and the printk in migrate_one_irq for the next irq (which we don't
> see).

This doesn't make any sense at all. irq_chip_redirect_set_affinity() is
only accessing interrupt descriptor associated memory and the new
redirection CPU is the same as the previous one as the mask changes from
0xff to 0x7f and therefore cpumask_first() yields 0 in both cases.

According to the provided dmesg, this happens on linux-next.

Jon, can you please validate that this happens as well on

     git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git irq/msi

Thanks

        tglx

Re: [PATCH v3 3/3] PCI: dwc: Enable MSI affinity support

Posted by Jon Hunter 1 week, 6 days ago

Hi Thomas,

On 26/01/2026 07:59, Thomas Gleixner wrote:
> On Thu, Jan 22 2026 at 18:31, Radu Rendec wrote:
>> The CPUs are taken offline one by one, starting with CPU 7. The code in
>> question runs on the dying CPU, and with hardware interrupts disabled
>> on all CPUs. The (simplified) call stack looks like this:
>>
>> irq_migrate_all_off_this_cpu
>>    for_each_active_irq
>>      migrate_one_irq
>>        irq_do_set_affinity
>>          irq_chip_redirect_set_affinity (via chip->irq_set_affinity)
>>
>> The debug patch I gave you adds:
>>   * a printk to irq_chip_redirect_set_affinity (which is very small)
>>   * a printk at the beginning of migrate_one_irq
>>
>> Also, the call to irq_do_set_affinity is almost the last thing that
>> happens in migrate_one_irq, and that for_each_active_irq loop is quite
>> small too. So, there isn't much happening between the printk in
>> irq_chip_redirect_set_affinity for the msi irq (which we do see in the
>> log) and the printk in migrate_one_irq for the next irq (which we don't
>> see).
> 
> This doesn't make any sense at all. irq_chip_redirect_set_affinity() is
> only accessing interrupt descriptor associated memory and the new
> redirection CPU is the same as the previous one as the mask changes from
> 0xff to 0x7f and therefore cpumask_first() yields 0 in both cases.
> 
> According to the provided dmesg, this happens on linux-next.
> 
> Jon, can you please validate that this happens as well on
> 
>       git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git irq/msi


I tried this branch and I see suspend failing with that branch too. If I 
revert this change on top of your branch or -next, I don't see any 
problems.

Thanks
Jon

-- 
nvpublic

Re: [PATCH v3 3/3] PCI: dwc: Enable MSI affinity support

Posted by Radu Rendec 1 week, 6 days ago

Hi Jon,

On Mon, 2026-01-26 at 22:07 +0000, Jon Hunter wrote:
> On 26/01/2026 07:59, Thomas Gleixner wrote:
> > On Thu, Jan 22 2026 at 18:31, Radu Rendec wrote:
> > > The CPUs are taken offline one by one, starting with CPU 7. The code in
> > > question runs on the dying CPU, and with hardware interrupts disabled
> > > on all CPUs. The (simplified) call stack looks like this:
> > > 
> > > irq_migrate_all_off_this_cpu
> > >    for_each_active_irq
> > >      migrate_one_irq
> > >        irq_do_set_affinity
> > >          irq_chip_redirect_set_affinity (via chip->irq_set_affinity)
> > > 
> > > The debug patch I gave you adds:
> > >   * a printk to irq_chip_redirect_set_affinity (which is very small)
> > >   * a printk at the beginning of migrate_one_irq
> > > 
> > > Also, the call to irq_do_set_affinity is almost the last thing that
> > > happens in migrate_one_irq, and that for_each_active_irq loop is quite
> > > small too. So, there isn't much happening between the printk in
> > > irq_chip_redirect_set_affinity for the msi irq (which we do see in the
> > > log) and the printk in migrate_one_irq for the next irq (which we don't
> > > see).
> > 
> > This doesn't make any sense at all. irq_chip_redirect_set_affinity() is
> > only accessing interrupt descriptor associated memory and the new
> > redirection CPU is the same as the previous one as the mask changes from
> > 0xff to 0x7f and therefore cpumask_first() yields 0 in both cases.
> > 
> > According to the provided dmesg, this happens on linux-next.
> > 
> > Jon, can you please validate that this happens as well on
> > 
> >       git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git irq/msi
> 
> 
> I tried this branch and I see suspend failing with that branch too. If I 
> revert this change on top of your branch or -next, I don't see any 
> problems.

The closest hardware I have access to is Jetson Xavier NX, and you
already mentioned you couldn't reproduce the issue there (and it looks
like I can't even get a hold of that board anyway). So I'm going to ask
you to test a few more things for me.

Can you please apply the patch below on top of the previous one I sent?
The suspect is the spinlock in irq_migrate_all_off_this_cpu(),
although I can't think of any reason why it shouldn't be free. But I
don't have any better idea, and I would like to narrow down the spot
where hotplug gets stuck.

diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index d8c62547f9d06..69c44da68e3a9 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -178,9 +178,11 @@ void irq_migrate_all_off_this_cpu(void)
 	for_each_active_irq(irq) {
 		bool affinity_broken;
 
+		pr_info("%s: irq %u\n", __func__, irq);
 		desc = irq_to_desc(irq);
 		scoped_guard(raw_spinlock, &desc->lock) {
 			affinity_broken = migrate_one_irq(desc);
+			pr_info("%s: migrate_one_irq -> %u\n", __func__, affinity_broken);
 			if (affinity_broken && desc->affinity_notify)
 				irq_affinity_schedule_notify_work(desc);
 		}

-- 
Thanks,
Radu

Re: [PATCH v3 3/3] PCI: dwc: Enable MSI affinity support

Posted by Thomas Gleixner 1 week, 5 days ago

On Mon, Jan 26 2026 at 17:26, Radu Rendec wrote:
> On Mon, 2026-01-26 at 22:07 +0000, Jon Hunter wrote:
>> > Jon, can you please validate that this happens as well on
>> > 
>> >       git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git irq/msi
>> 
>> 
>> I tried this branch and I see suspend failing with that branch too. If I 
>> revert this change on top of your branch or -next, I don't see any 
>> problems.
>
> The closest hardware I have access to is Jetson Xavier NX, and you
> already mentioned you couldn't reproduce the issue there (and it looks
> like I can't even get a hold of that board anyway). So I'm going to ask
> you to test a few more things for me.
>
> Can you please apply the patch below on top of the previous one I sent?
> The suspect is the spinlock in irq_migrate_all_off_this_cpu(),
> although I can't think of any reason why it shouldn't be free. But I
> don't have any better idea, and I would like to narrow down the spot
> where hotplug gets stuck.

Can we please take a step back and think about what is actually
different when this change is in effect instead of hallucinating about
completely unrelated spinlocks?

Without this change the interrupt is ignored in the hotplug migration
because it has MSI_FLAG_NO_AFFINITY set.

Now with this new magic in place the following happens:

migrate_one_irq()
   ...
   irq_do_set_affinity()
      chip->irq_set_affinity()                // --> msi_domain_set_affinity()
         parent->chip->irq_set_affinity()     // --> irq_chip_redirect_set_affinity()
            update target_cpu/effective mask; // Benign
         ...
         irq_chip_write_msi_msg()             // --> pci_msi_domain_write_msg()
  
I'm pretty sure that this write screws things up because the
devices/busses are already frozen. It simply hangs there.

Usually this is prevented by this check in pci_msi_domain_write_msg():

        if (dev->current_state != PCI_D0 || pci_dev_is_disconnected(dev))
        	do_nothing();
        else ...

As the boot log contains this:

[   44.101151] tegra194-pcie 14100000.pcie: Link didn't transition to L2 state
[   44.110764] Disabling non-boot CPUs ...

... I suspect that there is some weirdness going on with this PCIe
controller which subsequently screws up the check.

The below untested hack should confirm that theory.

Thanks,

        tglx
---
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -672,7 +672,11 @@ int msi_domain_set_affinity(struct irq_d
 	if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) {
 		BUG_ON(irq_chip_compose_msi_msg(irq_data, msg));
 		msi_check_level(irq_data->domain, msg);
-		irq_chip_write_msi_msg(irq_data, msg);
+		// Hack alert
+		struct irq_desc *desc = irq_data_to_desc(irq_data);
+
+		if (!(desc->istate & IRQS_SUSPENDED))
+			irq_chip_write_msi_msg(irq_data, msg);
 	}
 
 	return ret;

Re: [PATCH v3 3/3] PCI: dwc: Enable MSI affinity support

Posted by Thomas Gleixner 1 week, 5 days ago

On Tue, Jan 27 2026 at 11:30, Thomas Gleixner wrote:
> The below untested hack should confirm that theory.

Actually looking at it deeper the solution is trivial because in this
case writing the MSI message to the device is not required when the
affinity changes because the message does not change. It is set once via
msi_domain_activate() and stays the same for the life time of the
interrupt.

So the below prevents the invocation of irq_chip_write_msi_msg() in
msi_domain_set_affinity(), but I would recommend to investigate the
actual underlying problem nevertheless:

It is going to rear its ugly head at some other place sooner than later
as there are tons of other places which guard against
pci_dev::current_state != PCI_D0.

Thanks,

        tglx
---
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -1495,7 +1495,7 @@ int irq_chip_redirect_set_affinity(struc
 	WRITE_ONCE(redir->target_cpu, cpumask_first(dest));
 	irq_data_update_effective_affinity(data, dest);
 
-	return IRQ_SET_MASK_OK;
+	return IRQ_SET_MASK_OK_DONE;
 }
 EXPORT_SYMBOL_GPL(irq_chip_redirect_set_affinity);
 #endif

Re: [PATCH v3 3/3] PCI: dwc: Enable MSI affinity support

Posted by Jon Hunter 1 week, 5 days ago

Hi Thomas,

On 27/01/2026 13:34, Thomas Gleixner wrote:
> On Tue, Jan 27 2026 at 11:30, Thomas Gleixner wrote:
>> The below untested hack should confirm that theory.
> 
> Actually looking at it deeper the solution is trivial because in this
> case writing the MSI message to the device is not required when the
> affinity changes because the message does not change. It is set once via
> msi_domain_activate() and stays the same for the life time of the
> interrupt.
> 
> So the below prevents the invocation of irq_chip_write_msi_msg() in
> msi_domain_set_affinity(), but I would recommend to investigate the
> actual underlying problem nevertheless:
> 
> It is going to roar its ugly head at some other place sooner than later
> as there are tons of other places which guard against
> pci_dev::current_state != PCI_D0.
> 
> Thanks,
> 
>          tglx
> ---
> --- a/kernel/irq/chip.c
> +++ b/kernel/irq/chip.c
> @@ -1495,7 +1495,7 @@ int irq_chip_redirect_set_affinity(struc
>   	WRITE_ONCE(redir->target_cpu, cpumask_first(dest));
>   	irq_data_update_effective_affinity(data, dest);
>   
> -	return IRQ_SET_MASK_OK;
> +	return IRQ_SET_MASK_OK_DONE;
>   }
>   EXPORT_SYMBOL_GPL(irq_chip_redirect_set_affinity);
>   #endif
> 

Yes that does fix it!

Tested-by: Jon Hunter <jonathanh@nvidia.com>

Thanks!
Jon

-- 
nvpublic

[PATCH] genirq/redirect: Prevent writing MSI message on affinity change

Posted by Thomas Gleixner 1 week, 5 days ago

The interrupts which are handled by the redirection infrastructure provide
an irq_set_affinity() callback, which solely determines the target CPU for
redirection via irq_work and updates the effective affinity mask.

Contrary to regular MSI interrupts this affinity setting does not change
the underlying interrupt message as the message is only created at setup
time to deliver to the demultiplexing interrupt.

Therefore the message write in msi_domain_set_affinity() is a pointless
exercise. In principle the write is harmless, but a Tegra system exposes a
full system hang during suspend due to that write.

It's unclear why the check for the PCI device state PCI_D0 in
pci_msi_domain_write_msg(), which prevents the actual hardware access if
a device is in powered down state, fails on this particular system, but
that's a different problem which needs to be investigated by the Tegra
experts.

The irq_set_affinity() callback can advise msi_domain_set_affinity() not to
write the MSI message by returning IRQ_SET_MASK_OK_DONE instead of
IRQ_SET_MASK_OK. Do exactly that.

Just to make it clear again:

This is not a correctness issue of the redirection code as returning
IRQ_SET_MASK_OK in that context is completely correct. From the core
code point of view this is solely an optimization to avoid a redundant
hardware write.

As a byproduct it papers over the underlying problem on the Tegra platform,
which fails to put the PCIe device[s] out of PCI_D0 despite the fact that
the devices and busses have been shut down. The redirect infrastructure
just unearthed the underlying issue, which is prone to happen in quite some
other code paths which use the PCI_D0 check to prevent hardware access to
powered down devices.

This therefore has neither a 'Fixes:' nor a 'Closes:' tag associated as the
underlying problem, which is outside the scope of the interrupt code, is
still unresolved.

Reported-by: Jon Hunter <jonathanh@nvidia.com>
Signed-off-by: Thomas Gleixner <tglx@kernel.org>
Tested-by: Jon Hunter <jonathanh@nvidia.com>
Link: https://lore.kernel.org/all/4e5b349c-6599-4871-9e3b-e10352ae0ca0@nvidia.com
---
 kernel/irq/chip.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -1495,7 +1495,7 @@ int irq_chip_redirect_set_affinity(struc
 	WRITE_ONCE(redir->target_cpu, cpumask_first(dest));
 	irq_data_update_effective_affinity(data, dest);
 
-	return IRQ_SET_MASK_OK;
+	return IRQ_SET_MASK_OK_DONE;
 }
 EXPORT_SYMBOL_GPL(irq_chip_redirect_set_affinity);
 #endif

[tip: irq/msi] genirq/redirect: Prevent writing MSI message on affinity change

Posted by tip-bot2 for Thomas Gleixner 1 week, 3 days ago

The following commit has been merged into the irq/msi branch of tip:

Commit-ID:     37f9d5026cd78fbe80a124edbbadab382b26545f
Gitweb:        https://git.kernel.org/tip/37f9d5026cd78fbe80a124edbbadab382b26545f
Author:        Thomas Gleixner <tglx@kernel.org>
AuthorDate:    Tue, 27 Jan 2026 22:30:16 +01:00
Committer:     Thomas Gleixner <tglx@kernel.org>
CommitterDate: Thu, 29 Jan 2026 23:49:55 +01:00

genirq/redirect: Prevent writing MSI message on affinity change

The interrupts which are handled by the redirection infrastructure provide
an irq_set_affinity() callback, which solely determines the target CPU for
redirection via irq_work and updates the effective affinity mask.

Contrary to regular MSI interrupts this affinity setting does not change
the underlying interrupt message as the message is only created at setup
time to deliver to the demultiplexing interrupt.

Therefore the message write in msi_domain_set_affinity() is a pointless
exercise. In principle the write is harmless, but a Tegra system exposes a
full system hang during suspend due to that write.

It's unclear why the check for the PCI device state PCI_D0 in
pci_msi_domain_write_msg(), which prevents the actual hardware access if
a device is in powered down state, fails on this particular system, but
that's a different problem which needs to be investigated by the Tegra
experts.

The irq_set_affinity() callback can advise msi_domain_set_affinity() not to
write the MSI message by returning IRQ_SET_MASK_OK_DONE instead of
IRQ_SET_MASK_OK. Do exactly that.

Just to make it clear again:

This is not a correctness issue of the redirection code as returning
IRQ_SET_MASK_OK in that context is completely correct. From the core
code point of view this is solely an optimization to avoid a redundant
hardware write.

As a byproduct it papers over the underlying problem on the Tegra platform,
which fails to put the PCIe device[s] out of PCI_D0 despite the fact that
the devices and busses have been shut down. The redirect infrastructure
just unearthed the underlying issue, which is prone to happen in quite some
other code paths which use the PCI_D0 check to prevent hardware access to
powered down devices.

This therefore has neither a 'Fixes:' nor a 'Closes:' tag associated as the
underlying problem, which is outside the scope of the interrupt code, is
still unresolved.

Reported-by: Jon Hunter <jonathanh@nvidia.com>
Signed-off-by: Thomas Gleixner <tglx@kernel.org>
Tested-by: Jon Hunter <jonathanh@nvidia.com>
Link: https://lore.kernel.org/all/4e5b349c-6599-4871-9e3b-e10352ae0ca0@nvidia.com
Link: https://patch.msgid.link/87tsw6aglz.ffs@tglx
---
 kernel/irq/chip.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 35bc17b..ccdc47a 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -1495,7 +1495,7 @@ int irq_chip_redirect_set_affinity(struct irq_data *data, const struct cpumask *
 	WRITE_ONCE(redir->target_cpu, cpumask_first(dest));
 	irq_data_update_effective_affinity(data, dest);
 
-	return IRQ_SET_MASK_OK;
+	return IRQ_SET_MASK_OK_DONE;
 }
 EXPORT_SYMBOL_GPL(irq_chip_redirect_set_affinity);
 #endif

Re: [PATCH v3 3/3] PCI: dwc: Enable MSI affinity support

Posted by Jon Hunter 2 weeks, 2 days ago


On 22/01/2026 23:31, Radu Rendec wrote:

...

> Thanks very much for running the test and for the logs. The good news
> is good ol' printk debugging seems to be working, and the last message
> in the log is indeed related to dw-pci irq affinity control, which is
> what the patch touches. So we're on to something. The bad news is I
> can't yet figure out what's wrong.
> 
> The CPUs are taken offline one by one, starting with CPU 7. The code in
> question runs on the dying CPU, and with hardware interrupts disabled
> on all CPUs. The (simplified) call stack looks like this:
> 
> irq_migrate_all_off_this_cpu
>    for_each_active_irq
>      migrate_one_irq
>        irq_do_set_affinity
>          irq_chip_redirect_set_affinity (via chip->irq_set_affinity)
> 
> The debug patch I gave you adds:
>   * a printk to irq_chip_redirect_set_affinity (which is very small)
>   * a printk at the beginning of migrate_one_irq
> 
> Also, the call to irq_do_set_affinity is almost the last thing that
> happens in migrate_one_irq, and that for_each_active_irq loop is quite
> small too. So, there isn't much happening between the printk in
> irq_chip_redirect_set_affinity for the msi irq (which we do see in the
> log) and the printk in migrate_one_irq for the next irq (which we don't
> see).
> 
> My first thought is to add more printk's between those two and narrow
> down the spot where it gets stuck.
> 
> I think the fastest way to debug it is if I can test myself. I tried to
> reproduce the issue on a Jetson AGX Orin, and I couldn't. By the way,
> how often does it hang? e.g., out of say 10 suspend attempts, how many
> fail?

For Jetson AGX Xavier it fails on the first suspend attempt.

> I do have access to a Jetson Xavier NX (in theory) but it looks like
> there's a lab issue with that board, which hopefully gets sorted out
> tomorrow. If I can't get a hold of that board (or can't reproduce the
> problem on it), I may ask you to try a few other things. In any case,
> I'll update this thread again either tomorrow or (more likely) early
> next week.

Weirdly I don't see this with Jetson Xavier NX. However, could be worth 
trying but you may wish to revert this change [0] because it is causing 
other issues for Jetson Xavier NX.

Jon

[0] 
https://lore.kernel.org/linux-tegra/e32b0819-2c29-4c83-83d5-e28dc4b2b01f@nvidia.com/


-- 
nvpublic

[tip: irq/msi] PCI: dwc: Enable MSI affinity support

Posted by tip-bot2 for Radu Rendec 1 month, 3 weeks ago

The following commit has been merged into the irq/msi branch of tip:

Commit-ID:     eaf290c404f7c39f23292e9ce83b8b5b51ab598a
Gitweb:        https://git.kernel.org/tip/eaf290c404f7c39f23292e9ce83b8b5b51ab598a
Author:        Radu Rendec <rrendec@redhat.com>
AuthorDate:    Fri, 28 Nov 2025 16:20:55 -05:00
Committer:     Thomas Gleixner <tglx@linutronix.de>
CommitterDate: Mon, 15 Dec 2025 22:30:48 +01:00

PCI: dwc: Enable MSI affinity support

Leverage the interrupt redirection infrastructure to enable CPU affinity
support for MSI interrupts. Since the parent interrupt affinity cannot
be changed, affinity control for the child interrupt (MSI) is achieved
by redirecting the handler to run in IRQ work context on the target CPU.

This patch was originally prepared by Thomas Gleixner (see Link tag below)
in a patch series that was never submitted as is, and only parts of that
series have made it upstream so far.

Originally-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Radu Rendec <rrendec@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/linux-pci/878qpg4o4t.ffs@tglx/
Link: https://patch.msgid.link/20251128212055.1409093-4-rrendec@redhat.com
---
 drivers/pci/controller/dwc/pcie-designware-host.c | 33 +++++++++++---
 1 file changed, 28 insertions(+), 5 deletions(-)

diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c
index 25ad1ae..f116591 100644
--- a/drivers/pci/controller/dwc/pcie-designware-host.c
+++ b/drivers/pci/controller/dwc/pcie-designware-host.c
@@ -26,9 +26,27 @@ static struct pci_ops dw_pcie_ops;
 static struct pci_ops dw_pcie_ecam_ops;
 static struct pci_ops dw_child_pcie_ops;
 
+#ifdef CONFIG_SMP
+static void dw_irq_noop(struct irq_data *d) { }
+#endif
+
+static bool dw_pcie_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
+				      struct irq_domain *real_parent, struct msi_domain_info *info)
+{
+	if (!msi_lib_init_dev_msi_info(dev, domain, real_parent, info))
+		return false;
+
+#ifdef CONFIG_SMP
+	info->chip->irq_ack = dw_irq_noop;
+	info->chip->irq_pre_redirect = irq_chip_pre_redirect_parent;
+#else
+	info->chip->irq_ack = irq_chip_ack_parent;
+#endif
+	return true;
+}
+
 #define DW_PCIE_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS		| \
 				    MSI_FLAG_USE_DEF_CHIP_OPS		| \
-				    MSI_FLAG_NO_AFFINITY		| \
 				    MSI_FLAG_PCI_MSI_MASK_PARENT)
 #define DW_PCIE_MSI_FLAGS_SUPPORTED (MSI_FLAG_MULTI_PCI_MSI		| \
 				     MSI_FLAG_PCI_MSIX			| \
@@ -40,9 +58,8 @@ static const struct msi_parent_ops dw_pcie_msi_parent_ops = {
 	.required_flags		= DW_PCIE_MSI_FLAGS_REQUIRED,
 	.supported_flags	= DW_PCIE_MSI_FLAGS_SUPPORTED,
 	.bus_select_token	= DOMAIN_BUS_PCI_MSI,
-	.chip_flags		= MSI_CHIP_FLAG_SET_ACK,
 	.prefix			= "DW-",
-	.init_dev_msi_info	= msi_lib_init_dev_msi_info,
+	.init_dev_msi_info	= dw_pcie_init_dev_msi_info,
 };
 
 /* MSI int handler */
@@ -63,7 +80,7 @@ void dw_handle_msi_irq(struct dw_pcie_rp *pp)
 			continue;
 
 		for_each_set_bit(pos, &status, MAX_MSI_IRQS_PER_CTRL)
-			generic_handle_domain_irq(pp->irq_domain, irq_off + pos);
+			generic_handle_demux_domain_irq(pp->irq_domain, irq_off + pos);
 	}
 }
 
@@ -140,10 +157,16 @@ static void dw_pci_bottom_ack(struct irq_data *d)
 
 static struct irq_chip dw_pci_msi_bottom_irq_chip = {
 	.name			= "DWPCI-MSI",
-	.irq_ack		= dw_pci_bottom_ack,
 	.irq_compose_msi_msg	= dw_pci_setup_msi_msg,
 	.irq_mask		= dw_pci_bottom_mask,
 	.irq_unmask		= dw_pci_bottom_unmask,
+#ifdef CONFIG_SMP
+	.irq_ack		= dw_irq_noop,
+	.irq_pre_redirect	= dw_pci_bottom_ack,
+	.irq_set_affinity	= irq_chip_redirect_set_affinity,
+#else
+	.irq_ack		= dw_pci_bottom_ack,
+#endif
 };
 
 static int dw_pcie_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,

Re: [tip: irq/msi] PCI: dwc: Enable MSI affinity support

Posted by Jon Hunter 1 month ago

Hi Radu,

On 15/12/2025 21:34, tip-bot2 for Radu Rendec wrote:
> The following commit has been merged into the irq/msi branch of tip:
> 
> Commit-ID:     eaf290c404f7c39f23292e9ce83b8b5b51ab598a
> Gitweb:        https://git.kernel.org/tip/eaf290c404f7c39f23292e9ce83b8b5b51ab598a
> Author:        Radu Rendec <rrendec@redhat.com>
> AuthorDate:    Fri, 28 Nov 2025 16:20:55 -05:00
> Committer:     Thomas Gleixner <tglx@linutronix.de>
> CommitterDate: Mon, 15 Dec 2025 22:30:48 +01:00
> 
> PCI: dwc: Enable MSI affinity support
> 
> Leverage the interrupt redirection infrastructure to enable CPU affinity
> support for MSI interrupts. Since the parent interrupt affinity cannot
> be changed, affinity control for the child interrupt (MSI) is achieved
> by redirecting the handler to run in IRQ work context on the target CPU.
> 
> This patch was originally prepared by Thomas Gleixner (see Link tag below)
> in a patch series that was never submitted as is, and only parts of that
> series have made it upstream so far.
> 
> Originally-by: Thomas Gleixner <tglx@linutronix.de>
> Signed-off-by: Radu Rendec <rrendec@redhat.com>
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> Link: https://lore.kernel.org/linux-pci/878qpg4o4t.ffs@tglx/
> Link: https://patch.msgid.link/20251128212055.1409093-4-rrendec@redhat.com


With next-20260105 I am observing the following warning on the Tegra194 
Jetson AGX platform ...

  WARNING KERN genirq: irq_chip DW-PCI-MSI-0001:01:00.0 did not update
   eff. affinity mask of irq 171

Bisect is pointing to this commit. This platform is using the driver 
drivers/pci/controller/dwc/pcie-tegra194.c. Is there some default 
affinity that we should be setting to avoid this warning?

Thanks
Jon

-- 
nvpublic

Re: [tip: irq/msi] PCI: dwc: Enable MSI affinity support

Posted by Radu Rendec 1 month ago

Hi Jon,

On Tue, 2026-01-06 at 09:53 +0000, Jon Hunter wrote:
> On 15/12/2025 21:34, tip-bot2 for Radu Rendec wrote:
> > The following commit has been merged into the irq/msi branch of tip:
> > 
> > Commit-ID:     eaf290c404f7c39f23292e9ce83b8b5b51ab598a
> > Gitweb:        https://git.kernel.org/tip/eaf290c404f7c39f23292e9ce83b8b5b51ab598a
> > Author:        Radu Rendec <rrendec@redhat.com>
> > AuthorDate:    Fri, 28 Nov 2025 16:20:55 -05:00
> > Committer:     Thomas Gleixner <tglx@linutronix.de>
> > CommitterDate: Mon, 15 Dec 2025 22:30:48 +01:00
> > 
> > PCI: dwc: Enable MSI affinity support
> > 
> > Leverage the interrupt redirection infrastructure to enable CPU affinity
> > support for MSI interrupts. Since the parent interrupt affinity cannot
> > be changed, affinity control for the child interrupt (MSI) is achieved
> > by redirecting the handler to run in IRQ work context on the target CPU.
> > 
> > This patch was originally prepared by Thomas Gleixner (see Link tag below)
> > in a patch series that was never submitted as is, and only parts of that
> > series have made it upstream so far.
> > 
> > Originally-by: Thomas Gleixner <tglx@linutronix.de>
> > Signed-off-by: Radu Rendec <rrendec@redhat.com>
> > Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> > Link: https://lore.kernel.org/linux-pci/878qpg4o4t.ffs@tglx/
> > Link: https://patch.msgid.link/20251128212055.1409093-4-rrendec@redhat.com
> 
> 
> With next-20260105 I am observing the following warning on the Tegra194 
> Jetson AGX platform ...
> 
>   WARNING KERN genirq: irq_chip DW-PCI-MSI-0001:01:00.0 did not update
>    eff. affinity mask of irq 171
> 
> Bisect is point to this commit. This platform is using the driver 
> drivers/pci/controller/dwc/pcie-tegra194.c. Is there some default 
> affinity that we should be setting to avoid this warning?

Before that patch, affinity control wasn't even possible for PCI MSIs
exposed by the dw_pci drivers. Without having looked at the code yet,
I suspect it's just because now that affinity control is enabled,
something tries to use it.

I don't think you should set some default affinity. By default, the PCI
MSIs should be affine to all available CPUs, and that warning shouldn't
happen in the first place. Let me test on Jetson AGX and see what's
going on. I'll update the thread with my findings, hopefully later
today.

-- 
Thanks,
Radu

Re: [tip: irq/msi] PCI: dwc: Enable MSI affinity support

Posted by Radu Rendec 1 month ago

Hi Jon,

On Tue, 2026-01-06 at 10:07 -0500, Radu Rendec wrote:
> On Tue, 2026-01-06 at 09:53 +0000, Jon Hunter wrote:
> > On 15/12/2025 21:34, tip-bot2 for Radu Rendec wrote:
> > > The following commit has been merged into the irq/msi branch of tip:
> > > 
> > > Commit-ID:     eaf290c404f7c39f23292e9ce83b8b5b51ab598a
> > > Gitweb:        https://git.kernel.org/tip/eaf290c404f7c39f23292e9ce83b8b5b51ab598a
> > > Author:        Radu Rendec <rrendec@redhat.com>
> > > AuthorDate:    Fri, 28 Nov 2025 16:20:55 -05:00
> > > Committer:     Thomas Gleixner <tglx@linutronix.de>
> > > CommitterDate: Mon, 15 Dec 2025 22:30:48 +01:00
> > > 
> > > PCI: dwc: Enable MSI affinity support
> > > 
> > > Leverage the interrupt redirection infrastructure to enable CPU affinity
> > > support for MSI interrupts. Since the parent interrupt affinity cannot
> > > be changed, affinity control for the child interrupt (MSI) is achieved
> > > by redirecting the handler to run in IRQ work context on the target CPU.
> > > 
> > > This patch was originally prepared by Thomas Gleixner (see Link tag below)
> > > in a patch series that was never submitted as is, and only parts of that
> > > series have made it upstream so far.
> > > 
> > > Originally-by: Thomas Gleixner <tglx@linutronix.de>
> > > Signed-off-by: Radu Rendec <rrendec@redhat.com>
> > > Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> > > Link: https://lore.kernel.org/linux-pci/878qpg4o4t.ffs@tglx/
> > > Link: https://patch.msgid.link/20251128212055.1409093-4-rrendec@redhat.com
> > 
> > 
> > With next-20260105 I am observing the following warning on the Tegra194 
> > Jetson AGX platform ...
> > 
> >   WARNING KERN genirq: irq_chip DW-PCI-MSI-0001:01:00.0 did not update
> >    eff. affinity mask of irq 171
> > 
> > Bisect is point to this commit. This platform is using the driver 
> > drivers/pci/controller/dwc/pcie-tegra194.c. Is there some default 
> > affinity that we should be setting to avoid this warning?
> 
> Before that patch, affinity control wasn't even possible for PCI MSIs
> exposed by the dw_pci drivers. Without having looked at the code yet,
> I suspect it's just because now that affinity control is enabled,
> something tries to use it.
> 
> I don't think you should set some default affinity. By default, the PCI
> MSIs should be affine to all available CPUs, and that warning shouldn't
> happen in the first place. Let me test on Jetson AGX and see what's
> going on. I'll update the thread with my findings, hopefully later
> today.

I looked at the code and tested, and the problem is that the effective
affinity mask is not updated for interrupt redirection. The bug is not
in this patch, but the previous one in the series [1], which adds the
interrupt redirection framework.

The warning is actually triggered when the MSI is set up. This is the
top part of the relevant stack trace:
  irq_do_set_affinity+0x28c/0x300 (P)
  irq_setup_affinity+0x130/0x208
  irq_startup+0x118/0x170
  __setup_irq+0x5b0/0x6a0
  request_threaded_irq+0xb8/0x180
  devm_request_threaded_irq+0x88/0x150
  rtw_pci_probe+0x1e8/0x370 [rtw88_pci]

I don't immediately see an easy way to fix it for the generic case
because the affinity of the demultiplexing IRQ (the "parent" IRQ) can
change after the affinity of the demultiplexed IRQ (the "child" IRQ)
has been set up. But since dw_pcie is currently the only user of the
interrupt redirection infrastructure, and it sets up the demultiplexing
IRQ as a chained IRQ, there is no way its affinity can change other
than CPU hot(un)plug. And in this particular case, something as simple
as the following will work:

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index d5c3f6ee24cc2..036641f9534ae 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -1512,8 +1512,11 @@ EXPORT_SYMBOL_GPL(irq_chip_release_resources_parent);
 int irq_chip_redirect_set_affinity(struct irq_data *data, const struct cpumask *dest, bool force)
 {
 	struct irq_redirect *redir = &irq_data_to_desc(data)->redirect;
+	unsigned int target_cpu = cpumask_first(dest);
+
+	WRITE_ONCE(redir->target_cpu, target_cpu);
+	irq_data_update_effective_affinity(data, cpumask_of(target_cpu));
 
-	WRITE_ONCE(redir->target_cpu, cpumask_first(dest));
 	return IRQ_SET_MASK_OK;
 }
 EXPORT_SYMBOL_GPL(irq_chip_redirect_set_affinity);

I will send this as a proper patch tomorrow, and it will fix the
immediate problem and buy some time for a more elaborate fix for the
generic case. Meanwhile, thanks a lot for finding/reporting this!

[1] https://lore.kernel.org/all/20251128212055.1409093-2-rrendec@redhat.com/

-- 
Best regards,
Radu