Move IRQs from a dying CPU to the remaining online ones when a CPU is
being offlined. When a CPU is onlined, rebalance all IRQs in a
round-robin fashion.
Guest-bound IRQs are already handled by the scheduler in the process of
moving vCPUs to active pCPUs, so we only need to handle IRQs used by Xen
itself.
Signed-off-by: Mykyta Poturai <mykyta_poturai@epam.com>
---
v4->v5:
* handle CPU onlining as well
* more comments
* fix crash when ESPI is disabled
* don't assume CPU 0 is a boot CPU
* use unsigned int for irq number
* remove the assumption that all irqs are bound to CPU 0 by default
  from the commit message
v3->v4:
* patch introduced
---
 xen/arch/arm/include/asm/irq.h |  2 ++
 xen/arch/arm/irq.c             | 54 ++++++++++++++++++++++++++++++++++
 xen/arch/arm/smpboot.c         |  6 ++++
 3 files changed, 62 insertions(+)
diff --git a/xen/arch/arm/include/asm/irq.h b/xen/arch/arm/include/asm/irq.h
index 09788dbfeb..a0250bac85 100644
--- a/xen/arch/arm/include/asm/irq.h
+++ b/xen/arch/arm/include/asm/irq.h
@@ -126,6 +126,8 @@ bool irq_type_set_by_domain(const struct domain *d);
 void irq_end_none(struct irq_desc *irq);
 #define irq_end_none irq_end_none
 
+void rebalance_irqs(unsigned int from, bool up);
+
 #endif /* _ASM_HW_IRQ_H */
 /*
  * Local variables:
diff --git a/xen/arch/arm/irq.c b/xen/arch/arm/irq.c
index 7204bc2b68..a32dc729f8 100644
--- a/xen/arch/arm/irq.c
+++ b/xen/arch/arm/irq.c
@@ -158,6 +158,58 @@ static int init_local_irq_data(unsigned int cpu)
     return 0;
 }
 
+static int cpu_next;
+
+static void balance_irq(unsigned int irq, unsigned int from, bool up)
+{
+    struct irq_desc *desc = irq_to_desc(irq);
+    unsigned long flags;
+
+    ASSERT(!cpumask_empty(&cpu_online_map));
+
+    spin_lock_irqsave(&desc->lock, flags);
+    if ( likely(!desc->action) )
+        goto out;
+
+    if ( likely(test_bit(_IRQ_GUEST, &desc->status) ||
+                test_bit(_IRQ_MOVE_PENDING, &desc->status)) )
+        goto out;
+
+    /*
+     * Setting affinity to a mask of multiple CPUs causes the GIC drivers to
+     * select one CPU from that mask. If the dying CPU was included in the IRQ's
+     * affinity mask, we cannot determine exactly which CPU the interrupt is
+     * currently routed to, as GIC drivers lack a concrete get_affinity API. So
+     * to be safe we must reroute it to a new, definitely online, CPU. In the
+     * case of a CPU going down, we move only the interrupt that could reside on
+     * it. Otherwise, we rearrange all interrupts in a round-robin fashion.
+     */
+    if ( !up && !cpumask_test_cpu(from, desc->affinity) )
+        goto out;
+
+    cpu_next = cpumask_cycle(cpu_next, &cpu_online_map);
+    irq_set_affinity(desc, cpumask_of(cpu_next));
+
+out:
+    spin_unlock_irqrestore(&desc->lock, flags);
+}
+
+void rebalance_irqs(unsigned int from, bool up)
+{
+    unsigned int irq;
+
+    if ( cpumask_empty(&cpu_online_map) )
+        return;
+
+    for ( irq = NR_LOCAL_IRQS; irq < NR_IRQS; irq++ )
+        balance_irq(irq, from, up);
+
+#ifdef CONFIG_GICV3_ESPI
+    for ( irq = ESPI_BASE_INTID; irq < ESPI_MAX_INTID; irq++ )
+        balance_irq(irq, from, up);
+#endif
+}
+
 static int cpu_callback(struct notifier_block *nfb, unsigned long action,
                         void *hcpu)
 {
@@ -172,6 +224,8 @@ static int cpu_callback(struct notifier_block *nfb, unsigned long action,
             printk(XENLOG_ERR "Unable to allocate local IRQ for CPU%u\n",
                    cpu);
         break;
+    case CPU_ONLINE:
+        rebalance_irqs(cpu, true);
     }
 
     return notifier_from_errno(rc);
diff --git a/xen/arch/arm/smpboot.c b/xen/arch/arm/smpboot.c
index 7f3cfa812e..e1b9f94458 100644
--- a/xen/arch/arm/smpboot.c
+++ b/xen/arch/arm/smpboot.c
@@ -425,6 +425,12 @@ void __cpu_disable(void)
 
     smp_mb();
 
+    /*
+     * Now that the interrupts are cleared and the CPU is marked as offline,
+     * move interrupts away from it.
+     */
+    rebalance_irqs(cpu, false);
+
     /* Return to caller; eventually the IPI mechanism will unwind and the
      * scheduler will drop to the idle loop, which will call stop_cpu(). */
 }
--
2.51.2
Hi Mykyta.
> On 13 Jan 2026, at 09:45, Mykyta Poturai <Mykyta_Poturai@epam.com> wrote:
>
> Move IRQs from a dying CPU to the remaining online ones when a CPU is
> being offlined. When a CPU is onlined, rebalance all IRQs in a
> round-robin fashion.
> Guest-bound IRQs are already handled by the scheduler in the process of
> moving vCPUs to active pCPUs, so we only need to handle IRQs used by Xen
> itself.
>
> Signed-off-by: Mykyta Poturai <mykyta_poturai@epam.com>
> ---
> v4->v5:
> * handle CPU onlining as well
> * more comments
> * fix crash when ESPI is disabled
> * don't assume CPU 0 is a boot CPU
> * use unsigned int for irq number
> * remove the assumption that all irqs are bound to CPU 0 by default
>   from the commit message
>
> v3->v4:
> * patch introduced
> ---
> xen/arch/arm/include/asm/irq.h | 2 ++
> xen/arch/arm/irq.c | 54 ++++++++++++++++++++++++++++++++++
> xen/arch/arm/smpboot.c | 6 ++++
> 3 files changed, 62 insertions(+)
>
> diff --git a/xen/arch/arm/include/asm/irq.h b/xen/arch/arm/include/asm/irq.h
> index 09788dbfeb..a0250bac85 100644
> --- a/xen/arch/arm/include/asm/irq.h
> +++ b/xen/arch/arm/include/asm/irq.h
> @@ -126,6 +126,8 @@ bool irq_type_set_by_domain(const struct domain *d);
> void irq_end_none(struct irq_desc *irq);
> #define irq_end_none irq_end_none
>
> +void rebalance_irqs(unsigned int from, bool up);
> +
> #endif /* _ASM_HW_IRQ_H */
> /*
> * Local variables:
> diff --git a/xen/arch/arm/irq.c b/xen/arch/arm/irq.c
> index 7204bc2b68..a32dc729f8 100644
> --- a/xen/arch/arm/irq.c
> +++ b/xen/arch/arm/irq.c
> @@ -158,6 +158,58 @@ static int init_local_irq_data(unsigned int cpu)
> return 0;
> }
>
> +static int cpu_next;
> +
> +static void balance_irq(unsigned int irq, unsigned int from, bool up)
> +{
> + struct irq_desc *desc = irq_to_desc(irq);
> + unsigned long flags;
> +
> + ASSERT(!cpumask_empty(&cpu_online_map));
> +
> + spin_lock_irqsave(&desc->lock, flags);
> + if ( likely(!desc->action) )
> + goto out;
> +
> + if ( likely(test_bit(_IRQ_GUEST, &desc->status) ||
> + test_bit(_IRQ_MOVE_PENDING, &desc->status)) )
> + goto out;
> +
> + /*
> + * Setting affinity to a mask of multiple CPUs causes the GIC drivers to
> + * select one CPU from that mask. If the dying CPU was included in the IRQ's
> + * affinity mask, we cannot determine exactly which CPU the interrupt is
> + * currently routed to, as GIC drivers lack a concrete get_affinity API. So
> + * to be safe we must reroute it to a new, definitely online, CPU. In the
> + * case of a CPU going down, we move only the interrupt that could reside on
> + * it. Otherwise, we rearrange all interrupts in a round-robin fashion.
> + */
> + if ( !up && !cpumask_test_cpu(from, desc->affinity) )
> + goto out;
I am a bit lost on what you are trying to do in the case where a cpu
is coming up: it feels like you are trying to change the affinity of
all interrupts in this case to cycle everything.
Is that really what is expected?
If affinity was set by a VM on its interrupts, I would not expect
Xen to round-robin everything each time a cpu comes up.
> +
> + cpu_next = cpumask_cycle(cpu_next, &cpu_online_map);
> + irq_set_affinity(desc, cpumask_of(cpu_next));
> +
> +out:
> + spin_unlock_irqrestore(&desc->lock, flags);
> +}
> +
> +void rebalance_irqs(unsigned int from, bool up)
> +{
> + unsigned int irq;
> +
> + if ( cpumask_empty(&cpu_online_map) )
> + return;
> +
> + for ( irq = NR_LOCAL_IRQS; irq < NR_IRQS; irq++ )
> + balance_irq(irq, from, up);
> +
> +#ifdef CONFIG_GICV3_ESPI
> + for ( irq = ESPI_BASE_INTID; irq < ESPI_MAX_INTID; irq++ )
> + balance_irq(irq, from, up);
> +#endif
> +}
> +
> static int cpu_callback(struct notifier_block *nfb, unsigned long action,
> void *hcpu)
> {
> @@ -172,6 +224,8 @@ static int cpu_callback(struct notifier_block *nfb, unsigned long action,
> printk(XENLOG_ERR "Unable to allocate local IRQ for CPU%u\n",
> cpu);
> break;
> + case CPU_ONLINE:
> + rebalance_irqs(cpu, true);
> }
>
> return notifier_from_errno(rc);
> diff --git a/xen/arch/arm/smpboot.c b/xen/arch/arm/smpboot.c
> index 7f3cfa812e..e1b9f94458 100644
> --- a/xen/arch/arm/smpboot.c
> +++ b/xen/arch/arm/smpboot.c
> @@ -425,6 +425,12 @@ void __cpu_disable(void)
>
> smp_mb();
>
> + /*
> + * Now that the interrupts are cleared and the CPU is marked as offline,
> + * move interrupts away from it.
> + */
> + rebalance_irqs(cpu, false);
> +
I would expect this to only be useful when HOTPLUG is enabled; maybe
we could have a static inline that does nothing when HOTPLUG is not
on, so this only does something when HOTPLUG is enabled?
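Something along these lines, as a rough and untested sketch (I am
assuming the gate would be a HOTPLUG Kconfig option; the exact symbol
name would need to be checked):

/* asm/irq.h -- hypothetical sketch, not build-tested */
#ifdef CONFIG_HOTPLUG_CPU
void rebalance_irqs(unsigned int from, bool up);
#else
static inline void rebalance_irqs(unsigned int from, bool up) {}
#endif

That way the call site in __cpu_disable() can stay unconditional and
the compiler drops it when hotplug support is not built in.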
Cheers
Bertrand
> /* Return to caller; eventually the IPI mechanism will unwind and the
> * scheduler will drop to the idle loop, which will call stop_cpu(). */
> }
> --
> 2.51.2
On 04.02.26 16:20, Bertrand Marquis wrote:
> Hi Mykyta.
>
>> On 13 Jan 2026, at 09:45, Mykyta Poturai <Mykyta_Poturai@epam.com> wrote:
>>
>> Move IRQs from a dying CPU to the remaining online ones when a CPU is
>> being offlined. When a CPU is onlined, rebalance all IRQs in a
>> round-robin fashion.
>> Guest-bound IRQs are already handled by the scheduler in the process of
>> moving vCPUs to active pCPUs, so we only need to handle IRQs used by Xen
>> itself.
>>
>> Signed-off-by: Mykyta Poturai <mykyta_poturai@epam.com>
>> ---
>> v4->v5:
>> * handle CPU onlining as well
>> * more comments
>> * fix crash when ESPI is disabled
>> * don't assume CPU 0 is a boot CPU
>> * use unsigned int for irq number
>> * remove the assumption that all irqs are bound to CPU 0 by default
>>   from the commit message
>>
>> v3->v4:
>> * patch introduced
>> ---
>> xen/arch/arm/include/asm/irq.h | 2 ++
>> xen/arch/arm/irq.c | 54 ++++++++++++++++++++++++++++++++++
>> xen/arch/arm/smpboot.c | 6 ++++
>> 3 files changed, 62 insertions(+)
>>
>> diff --git a/xen/arch/arm/include/asm/irq.h b/xen/arch/arm/include/asm/irq.h
>> index 09788dbfeb..a0250bac85 100644
>> --- a/xen/arch/arm/include/asm/irq.h
>> +++ b/xen/arch/arm/include/asm/irq.h
>> @@ -126,6 +126,8 @@ bool irq_type_set_by_domain(const struct domain *d);
>> void irq_end_none(struct irq_desc *irq);
>> #define irq_end_none irq_end_none
>>
>> +void rebalance_irqs(unsigned int from, bool up);
>> +
>> #endif /* _ASM_HW_IRQ_H */
>> /*
>> * Local variables:
>> diff --git a/xen/arch/arm/irq.c b/xen/arch/arm/irq.c
>> index 7204bc2b68..a32dc729f8 100644
>> --- a/xen/arch/arm/irq.c
>> +++ b/xen/arch/arm/irq.c
>> @@ -158,6 +158,58 @@ static int init_local_irq_data(unsigned int cpu)
>> return 0;
>> }
>>
>> +static int cpu_next;
>> +
>> +static void balance_irq(unsigned int irq, unsigned int from, bool up)
>> +{
>> + struct irq_desc *desc = irq_to_desc(irq);
>> + unsigned long flags;
>> +
>> + ASSERT(!cpumask_empty(&cpu_online_map));
>> +
>> + spin_lock_irqsave(&desc->lock, flags);
>> + if ( likely(!desc->action) )
>> + goto out;
>> +
>> + if ( likely(test_bit(_IRQ_GUEST, &desc->status) ||
>> + test_bit(_IRQ_MOVE_PENDING, &desc->status)) )
>> + goto out;
>> +
>> + /*
>> + * Setting affinity to a mask of multiple CPUs causes the GIC drivers to
>> + * select one CPU from that mask. If the dying CPU was included in the IRQ's
>> + * affinity mask, we cannot determine exactly which CPU the interrupt is
>> + * currently routed to, as GIC drivers lack a concrete get_affinity API. So
>> + * to be safe we must reroute it to a new, definitely online, CPU. In the
>> + * case of a CPU going down, we move only the interrupt that could reside on
>> + * it. Otherwise, we rearrange all interrupts in a round-robin fashion.
>> + */
>> + if ( !up && !cpumask_test_cpu(from, desc->affinity) )
>> + goto out;
>
> I am a bit lost on what you are trying to do in the case where a cpu
> is coming up: it feels like you are trying to change the affinity of
> all interrupts in this case to cycle everything.
> Is that really what is expected?
> If affinity was set by a VM on its interrupts, I would not expect
> Xen to round-robin everything each time a cpu comes up.
>
The idea is to evenly spread interrupts between CPUs when the new ones
are being brought online. This is needed to prevent Xen-bound IRQs from
piling up on CPU 0 when other cores are offlined and later brought back
online. It shouldn’t mess with guest affinities, as the code skips
everything that is assigned to guests and leaves it to be handled by the
scheduler/vgic. Performance-wise, it should also be okay, as from what
I’ve seen, there are not many interrupts used by Xen, and I expect CPU
hotplug operations to be fairly infrequent.
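To illustrate the rotation (toy values, not taken from any real setup):
cpumask_cycle() returns the next CPU set in the given mask after the
given one, wrapping around to the first. So with cpu_online_map =
{0, 2, 3} and cpu_next starting at 0:

cpu_next = cpumask_cycle(cpu_next, &cpu_online_map); /* 0 -> 2 */
cpu_next = cpumask_cycle(cpu_next, &cpu_online_map); /* 2 -> 3 */
cpu_next = cpumask_cycle(cpu_next, &cpu_online_map); /* 3 -> 0 (wraps) */

Three consecutive Xen-bound IRQs thus land on three different online
CPUs instead of piling up on one.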
>> +
>> + cpu_next = cpumask_cycle(cpu_next, &cpu_online_map);
>> + irq_set_affinity(desc, cpumask_of(cpu_next));
>> +
>> +out:
>> + spin_unlock_irqrestore(&desc->lock, flags);
>> +}
>> +
>> +void rebalance_irqs(unsigned int from, bool up)
>> +{
>> + unsigned int irq;
>> +
>> + if ( cpumask_empty(&cpu_online_map) )
>> + return;
>> +
>> + for ( irq = NR_LOCAL_IRQS; irq < NR_IRQS; irq++ )
>> + balance_irq(irq, from, up);
>> +
>> +#ifdef CONFIG_GICV3_ESPI
>> + for ( irq = ESPI_BASE_INTID; irq < ESPI_MAX_INTID; irq++ )
>> + balance_irq(irq, from, up);
>> +#endif
>> +}
>> +
>> static int cpu_callback(struct notifier_block *nfb, unsigned long action,
>> void *hcpu)
>> {
>> @@ -172,6 +224,8 @@ static int cpu_callback(struct notifier_block *nfb, unsigned long action,
>> printk(XENLOG_ERR "Unable to allocate local IRQ for CPU%u\n",
>> cpu);
>> break;
>> + case CPU_ONLINE:
>> + rebalance_irqs(cpu, true);
>> }
>>
>> return notifier_from_errno(rc);
>> diff --git a/xen/arch/arm/smpboot.c b/xen/arch/arm/smpboot.c
>> index 7f3cfa812e..e1b9f94458 100644
>> --- a/xen/arch/arm/smpboot.c
>> +++ b/xen/arch/arm/smpboot.c
>> @@ -425,6 +425,12 @@ void __cpu_disable(void)
>>
>> smp_mb();
>>
>> + /*
>> + * Now that the interrupts are cleared and the CPU is marked as offline,
>> + * move interrupts away from it.
>> + */
>> + rebalance_irqs(cpu, false);
>> +
>
> I would expect this to only be useful when HOTPLUG is enabled; maybe
> we could have a static inline that does nothing when HOTPLUG is not
> on, so this only does something when HOTPLUG is enabled?
>
Yes, I will add this in the next version.
> Cheers
> Bertrand
>
>> /* Return to caller; eventually the IPI mechanism will unwind and the
>> * scheduler will drop to the idle loop, which will call stop_cpu(). */
>> }
>> --
>> 2.51.2
>
--
Mykyta
Hi Mykyta,
> On 5 Feb 2026, at 14:23, Mykyta Poturai <Mykyta_Poturai@epam.com> wrote:
>
> On 04.02.26 16:20, Bertrand Marquis wrote:
>> Hi Mykyta.
>>
>>> On 13 Jan 2026, at 09:45, Mykyta Poturai <Mykyta_Poturai@epam.com> wrote:
>>>
>>> Move IRQs from a dying CPU to the remaining online ones when a CPU is
>>> being offlined. When a CPU is onlined, rebalance all IRQs in a
>>> round-robin fashion.
>>> Guest-bound IRQs are already handled by the scheduler in the process of
>>> moving vCPUs to active pCPUs, so we only need to handle IRQs used by Xen
>>> itself.
>>>
>>> Signed-off-by: Mykyta Poturai <mykyta_poturai@epam.com>
>>> ---
>>> v4->v5:
>>> * handle CPU onlining as well
>>> * more comments
>>> * fix crash when ESPI is disabled
>>> * don't assume CPU 0 is a boot CPU
>>> * use unsigned int for irq number
>>> * remove the assumption that all irqs are bound to CPU 0 by default
>>>   from the commit message
>>>
>>> v3->v4:
>>> * patch introduced
>>> ---
>>> xen/arch/arm/include/asm/irq.h | 2 ++
>>> xen/arch/arm/irq.c | 54 ++++++++++++++++++++++++++++++++++
>>> xen/arch/arm/smpboot.c | 6 ++++
>>> 3 files changed, 62 insertions(+)
>>>
>>> diff --git a/xen/arch/arm/include/asm/irq.h b/xen/arch/arm/include/asm/irq.h
>>> index 09788dbfeb..a0250bac85 100644
>>> --- a/xen/arch/arm/include/asm/irq.h
>>> +++ b/xen/arch/arm/include/asm/irq.h
>>> @@ -126,6 +126,8 @@ bool irq_type_set_by_domain(const struct domain *d);
>>> void irq_end_none(struct irq_desc *irq);
>>> #define irq_end_none irq_end_none
>>>
>>> +void rebalance_irqs(unsigned int from, bool up);
>>> +
>>> #endif /* _ASM_HW_IRQ_H */
>>> /*
>>> * Local variables:
>>> diff --git a/xen/arch/arm/irq.c b/xen/arch/arm/irq.c
>>> index 7204bc2b68..a32dc729f8 100644
>>> --- a/xen/arch/arm/irq.c
>>> +++ b/xen/arch/arm/irq.c
>>> @@ -158,6 +158,58 @@ static int init_local_irq_data(unsigned int cpu)
>>> return 0;
>>> }
>>>
>>> +static int cpu_next;
>>> +
>>> +static void balance_irq(unsigned int irq, unsigned int from, bool up)
>>> +{
>>> + struct irq_desc *desc = irq_to_desc(irq);
>>> + unsigned long flags;
>>> +
>>> + ASSERT(!cpumask_empty(&cpu_online_map));
>>> +
>>> + spin_lock_irqsave(&desc->lock, flags);
>>> + if ( likely(!desc->action) )
>>> + goto out;
>>> +
>>> + if ( likely(test_bit(_IRQ_GUEST, &desc->status) ||
>>> + test_bit(_IRQ_MOVE_PENDING, &desc->status)) )
>>> + goto out;
>>> +
>>> + /*
>>> + * Setting affinity to a mask of multiple CPUs causes the GIC drivers to
>>> + * select one CPU from that mask. If the dying CPU was included in the IRQ's
>>> + * affinity mask, we cannot determine exactly which CPU the interrupt is
>>> + * currently routed to, as GIC drivers lack a concrete get_affinity API. So
>>> + * to be safe we must reroute it to a new, definitely online, CPU. In the
>>> + * case of a CPU going down, we move only the interrupt that could reside on
>>> + * it. Otherwise, we rearrange all interrupts in a round-robin fashion.
>>> + */
>>> + if ( !up && !cpumask_test_cpu(from, desc->affinity) )
>>> + goto out;
>>
>> I am a bit lost on what you are trying to do in the case where a cpu
>> is coming up: it feels like you are trying to change the affinity of
>> all interrupts in this case to cycle everything.
>> Is that really what is expected?
>> If affinity was set by a VM on its interrupts, I would not expect
>> Xen to round-robin everything each time a cpu comes up.
>>
>
> The idea is to evenly spread interrupts between CPUs when the new ones
> are being brought online. This is needed to prevent Xen-bound IRQs from
> piling up on CPU 0 when other cores are offlined and later brought back
> online. It shouldn't mess with guest affinities, as the code skips
> everything that is assigned to guests and leaves it to be handled by the
> scheduler/vgic. Performance-wise, it should also be okay, as from what
> I’ve seen, there are not many interrupts used by Xen, and I expect CPU
> hotplug operations to be fairly infrequent.
My fear here is that by removing and then re-adding a cpu we will
completely change irq affinities. I am not so sure that this kind of
random assignment is compatible with embedded or safety use cases, and
there is no way to configure this behaviour here.
@Julien, Stefano and Michal: What do you think here?
Cheers
Bertrand
>
>>> +
>>> + cpu_next = cpumask_cycle(cpu_next, &cpu_online_map);
>>> + irq_set_affinity(desc, cpumask_of(cpu_next));
>>> +
>>> +out:
>>> + spin_unlock_irqrestore(&desc->lock, flags);
>>> +}
>>> +
>>> +void rebalance_irqs(unsigned int from, bool up)
>>> +{
>>> + unsigned int irq;
>>> +
>>> + if ( cpumask_empty(&cpu_online_map) )
>>> + return;
>>> +
>>> + for ( irq = NR_LOCAL_IRQS; irq < NR_IRQS; irq++ )
>>> + balance_irq(irq, from, up);
>>> +
>>> +#ifdef CONFIG_GICV3_ESPI
>>> + for ( irq = ESPI_BASE_INTID; irq < ESPI_MAX_INTID; irq++ )
>>> + balance_irq(irq, from, up);
>>> +#endif
>>> +}
>>> +
>>> static int cpu_callback(struct notifier_block *nfb, unsigned long action,
>>> void *hcpu)
>>> {
>>> @@ -172,6 +224,8 @@ static int cpu_callback(struct notifier_block *nfb, unsigned long action,
>>> printk(XENLOG_ERR "Unable to allocate local IRQ for CPU%u\n",
>>> cpu);
>>> break;
>>> + case CPU_ONLINE:
>>> + rebalance_irqs(cpu, true);
>>> }
>>>
>>> return notifier_from_errno(rc);
>>> diff --git a/xen/arch/arm/smpboot.c b/xen/arch/arm/smpboot.c
>>> index 7f3cfa812e..e1b9f94458 100644
>>> --- a/xen/arch/arm/smpboot.c
>>> +++ b/xen/arch/arm/smpboot.c
>>> @@ -425,6 +425,12 @@ void __cpu_disable(void)
>>>
>>> smp_mb();
>>>
>>> + /*
>>> + * Now that the interrupts are cleared and the CPU is marked as offline,
>>> + * move interrupts away from it.
>>> + */
>>> + rebalance_irqs(cpu, false);
>>> +
>>
>> I would expect this to only be useful when HOTPLUG is enabled; maybe
>> we could have a static inline that does nothing when HOTPLUG is not
>> on, so this only does something when HOTPLUG is enabled?
>>
>
> Yes, I will add this in the next version.
>
>> Cheers
>> Bertrand
>>
>>> /* Return to caller; eventually the IPI mechanism will unwind and the
>>> * scheduler will drop to the idle loop, which will call stop_cpu(). */
>>> }
>>> --
>>> 2.51.2
>>
>
> --
> Mykyta