When running a PREEMPT_RT debug kernel on a 2-socket Grace arm64 system,
the following bug report was produced at bootup time.
BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48
in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 0, name: swapper/72
preempt_count: 1, expected: 0
RCU nest depth: 1, expected: 1
:
CPU: 72 UID: 0 PID: 0 Comm: swapper/72 Tainted: G W 6.19.0-rc4-test+ #4 PREEMPT_{RT,(full)}
Tainted: [W]=WARN
Call trace:
:
rt_spin_lock+0xe4/0x408
rmqueue_bulk+0x48/0x1de8
__rmqueue_pcplist+0x410/0x650
rmqueue.constprop.0+0x6a8/0x2b50
get_page_from_freelist+0x3c0/0xe68
__alloc_frozen_pages_noprof+0x1dc/0x348
alloc_pages_mpol+0xe4/0x2f8
alloc_frozen_pages_noprof+0x124/0x190
allocate_slab+0x2f0/0x438
new_slab+0x4c/0x80
___slab_alloc+0x410/0x798
__slab_alloc.constprop.0+0x88/0x1e0
__kmalloc_cache_noprof+0x2dc/0x4b0
allocate_vpe_l1_table+0x114/0x788
its_cpu_init_lpis+0x344/0x790
its_cpu_init+0x60/0x220
gic_starting_cpu+0x64/0xe8
cpuhp_invoke_callback+0x438/0x6d8
__cpuhp_invoke_callback_range+0xd8/0x1f8
notify_cpu_starting+0x11c/0x178
secondary_start_kernel+0xc8/0x188
__secondary_switched+0xc0/0xc8
This is due to the fact that allocate_vpe_l1_table() will call
kzalloc() to allocate a cpumask_t when the first CPU of the
second node of the 72-cpu Grace system is being called from the
CPUHP_AP_MIPS_GIC_TIMER_STARTING state inside the starting section of
the CPU hotplug bringup pipeline where interrupts are disabled. This is
an atomic context where sleeping is not allowed, and acquiring a sleeping
rt_spin_lock within kzalloc() may lead to a system hang in case there is
lock contention.
To work around this issue, a static buffer is used for cpumask
allocation when running a PREEMPT_RT kernel via the newly introduced
vpe_alloc_cpumask() helper. The static buffer is currently set to be
4 kbytes in size. As only one cpumask is needed per node, the current
size should be big enough as long as (cpumask_size() * nr_node_ids)
is not bigger than 4k.
Signed-off-by: Waiman Long <longman@redhat.com>
---
drivers/irqchip/irq-gic-v3-its.c | 26 +++++++++++++++++++++++++-
1 file changed, 25 insertions(+), 1 deletion(-)
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index ada585bfa451..9185785524dc 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -2896,6 +2896,30 @@ static bool allocate_vpe_l2_table(int cpu, u32 id)
return true;
}
+static void *vpe_alloc_cpumask(void)
+{
+ /*
+ * With PREEMPT_RT kernel, we can't call any k*alloc() APIs as they
+ * may acquire a sleeping rt_spin_lock in an atomic context. So use
+ * a pre-allocated buffer instead.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ static unsigned long mask_buf[512];
+ static atomic_t alloc_idx;
+ int idx, mask_size = cpumask_size();
+ int nr_cpumasks = sizeof(mask_buf)/mask_size;
+
+ /*
+ * Fetch an allocation index and if it points to a buffer within
+ * mask_buf[], return that. Fall back to kzalloc() otherwise.
+ */
+ idx = atomic_fetch_inc(&alloc_idx);
+ if (idx < nr_cpumasks)
+ return &mask_buf[idx * mask_size/sizeof(long)];
+ }
+ return kzalloc(sizeof(cpumask_t), GFP_ATOMIC);
+}
+
static int allocate_vpe_l1_table(void)
{
void __iomem *vlpi_base = gic_data_rdist_vlpi_base();
@@ -2927,7 +2951,7 @@ static int allocate_vpe_l1_table(void)
if (val & GICR_VPROPBASER_4_1_VALID)
goto out;
- gic_data_rdist()->vpe_table_mask = kzalloc(sizeof(cpumask_t), GFP_ATOMIC);
+ gic_data_rdist()->vpe_table_mask = vpe_alloc_cpumask();
if (!gic_data_rdist()->vpe_table_mask)
return -ENOMEM;
--
2.52.0
On Wed, 07 Jan 2026 21:53:53 +0000,
Waiman Long <longman@redhat.com> wrote:
>
> When running a PREEMPT_RT debug kernel on a 2-socket Grace arm64 system,
> the following bug report was produced at bootup time.
>
> BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48
> in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 0, name: swapper/72
> preempt_count: 1, expected: 0
> RCU nest depth: 1, expected: 1
> :
> CPU: 72 UID: 0 PID: 0 Comm: swapper/72 Tainted: G W 6.19.0-rc4-test+ #4 PREEMPT_{RT,(full)}
> Tainted: [W]=WARN
> Call trace:
> :
> rt_spin_lock+0xe4/0x408
> rmqueue_bulk+0x48/0x1de8
> __rmqueue_pcplist+0x410/0x650
> rmqueue.constprop.0+0x6a8/0x2b50
> get_page_from_freelist+0x3c0/0xe68
> __alloc_frozen_pages_noprof+0x1dc/0x348
> alloc_pages_mpol+0xe4/0x2f8
> alloc_frozen_pages_noprof+0x124/0x190
> allocate_slab+0x2f0/0x438
> new_slab+0x4c/0x80
> ___slab_alloc+0x410/0x798
> __slab_alloc.constprop.0+0x88/0x1e0
> __kmalloc_cache_noprof+0x2dc/0x4b0
> allocate_vpe_l1_table+0x114/0x788
> its_cpu_init_lpis+0x344/0x790
> its_cpu_init+0x60/0x220
> gic_starting_cpu+0x64/0xe8
> cpuhp_invoke_callback+0x438/0x6d8
> __cpuhp_invoke_callback_range+0xd8/0x1f8
> notify_cpu_starting+0x11c/0x178
> secondary_start_kernel+0xc8/0x188
> __secondary_switched+0xc0/0xc8
>
> This is due to the fact that allocate_vpe_l1_table() will call
> kzalloc() to allocate a cpumask_t when the first CPU of the
> second node of the 72-cpu Grace system is being called from the
> CPUHP_AP_MIPS_GIC_TIMER_STARTING state inside the starting section of
Surely *not* that particular state.
> the CPU hotplug bringup pipeline where interrupts are disabled. This is
> an atomic context where sleeping is not allowed, and acquiring a sleeping
> rt_spin_lock within kzalloc() may lead to a system hang in case there is
> lock contention.
>
> To work around this issue, a static buffer is used for cpumask
> allocation when running a PREEMPT_RT kernel via the newly introduced
> vpe_alloc_cpumask() helper. The static buffer is currently set to be
> 4 kbytes in size. As only one cpumask is needed per node, the current
> size should be big enough as long as (cpumask_size() * nr_node_ids)
> is not bigger than 4k.
What role does the node play here? The GIC topology has nothing to do
with NUMA. It may be true on your particular toy, but that's
definitely not true architecturally. You could, at worst, end up with
one such cpumask per *CPU*. That'd be a braindead system, but this
code is written to support the architecture, not any particular
implementation.
>
> Signed-off-by: Waiman Long <longman@redhat.com>
> ---
> drivers/irqchip/irq-gic-v3-its.c | 26 +++++++++++++++++++++++++-
> 1 file changed, 25 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
> index ada585bfa451..9185785524dc 100644
> --- a/drivers/irqchip/irq-gic-v3-its.c
> +++ b/drivers/irqchip/irq-gic-v3-its.c
> @@ -2896,6 +2896,30 @@ static bool allocate_vpe_l2_table(int cpu, u32 id)
> return true;
> }
>
> +static void *vpe_alloc_cpumask(void)
> +{
> + /*
> + * With PREEMPT_RT kernel, we can't call any k*alloc() APIs as they
> + * may acquire a sleeping rt_spin_lock in an atomic context. So use
> + * a pre-allocated buffer instead.
> + */
> + if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
> + static unsigned long mask_buf[512];
> + static atomic_t alloc_idx;
> + int idx, mask_size = cpumask_size();
> + int nr_cpumasks = sizeof(mask_buf)/mask_size;
> +
> + /*
> + * Fetch an allocation index and if it points to a buffer within
> + * mask_buf[], return that. Fall back to kzalloc() otherwise.
> + */
> + idx = atomic_fetch_inc(&alloc_idx);
> + if (idx < nr_cpumasks)
> + return &mask_buf[idx * mask_size/sizeof(long)];
> + }
Err, no. That's horrible. I can see three ways to address this in a
more appealing way:
- you give RT a generic allocator that works for (small) atomic
allocations. I appreciate that's not easy, and even probably
contrary to the RT goals. But I'm also pretty sure that the GIC code
is not the only pile of crap being caught doing that.
- you pre-compute upfront how many cpumasks you are going to require,
based on the actual GIC topology. You do that on CPU0, outside of
the hotplug constraints, and allocate what you need. This is
difficult as you need to ensure the RD<->CPU matching without the
CPUs having booted, which means wading through the DT/ACPI gunk to
try and guess what you have.
- you delay the allocation of L1 tables to a context where you can
perform allocations, and before we have a chance of running a guest
on this CPU. That's probably the simplest option (though dealing
with late onlining while guests are already running could be
interesting...).
But I'm always going to say no to something that is a poor hack and
ultimately falling back to the same broken behaviour.
Thanks,
M.
--
Without deviation from the norm, progress is not possible.
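One existing primitive that roughly fits the first option is genalloc:
its allocation fast path is lockless (cmpxchg on a bitmap), so a pool
created and filled on the boot CPU, where sleeping is allowed, can later
be drained from atomic context. The following is only a sketch under
that assumption; the pool name, the sizes, and the init hook are made up
for illustration and are not from the patch under review:

	#include <linux/genalloc.h>

	static struct gen_pool *vpe_mask_pool;

	/* Runs early, in a context that may sleep. */
	static int __init vpe_mask_pool_init(void)
	{
		void *buf;

		vpe_mask_pool = gen_pool_create(ilog2(cpumask_size()), -1);
		if (!vpe_mask_pool)
			return -ENOMEM;

		buf = kzalloc(PAGE_SIZE, GFP_KERNEL);	/* room for a few masks */
		if (!buf)
			return -ENOMEM;

		/* Hand the backing storage to the pool. */
		return gen_pool_add(vpe_mask_pool, (unsigned long)buf,
				    PAGE_SIZE, -1);
	}

	/* Callable from a STARTING callback (atomic, IRQs off). */
	static cpumask_t *vpe_mask_alloc_atomic(void)
	{
		/* gen_pool_alloc() returns 0 (NULL) when the pool is empty. */
		return (cpumask_t *)gen_pool_alloc(vpe_mask_pool,
						   cpumask_size());
	}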
On 1/8/26 3:26 AM, Marc Zyngier wrote:
> On Wed, 07 Jan 2026 21:53:53 +0000,
> Waiman Long <longman@redhat.com> wrote:
>> When running a PREEMPT_RT debug kernel on a 2-socket Grace arm64 system,
>> the following bug report was produced at bootup time.
>>
>> BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48
>> in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 0, name: swapper/72
>> preempt_count: 1, expected: 0
>> RCU nest depth: 1, expected: 1
>> :
>> CPU: 72 UID: 0 PID: 0 Comm: swapper/72 Tainted: G W 6.19.0-rc4-test+ #4 PREEMPT_{RT,(full)}
>> Tainted: [W]=WARN
>> Call trace:
>> :
>> rt_spin_lock+0xe4/0x408
>> rmqueue_bulk+0x48/0x1de8
>> __rmqueue_pcplist+0x410/0x650
>> rmqueue.constprop.0+0x6a8/0x2b50
>> get_page_from_freelist+0x3c0/0xe68
>> __alloc_frozen_pages_noprof+0x1dc/0x348
>> alloc_pages_mpol+0xe4/0x2f8
>> alloc_frozen_pages_noprof+0x124/0x190
>> allocate_slab+0x2f0/0x438
>> new_slab+0x4c/0x80
>> ___slab_alloc+0x410/0x798
>> __slab_alloc.constprop.0+0x88/0x1e0
>> __kmalloc_cache_noprof+0x2dc/0x4b0
>> allocate_vpe_l1_table+0x114/0x788
>> its_cpu_init_lpis+0x344/0x790
>> its_cpu_init+0x60/0x220
>> gic_starting_cpu+0x64/0xe8
>> cpuhp_invoke_callback+0x438/0x6d8
>> __cpuhp_invoke_callback_range+0xd8/0x1f8
>> notify_cpu_starting+0x11c/0x178
>> secondary_start_kernel+0xc8/0x188
>> __secondary_switched+0xc0/0xc8
>>
>> This is due to the fact that allocate_vpe_l1_table() will call
>> kzalloc() to allocate a cpumask_t when the first CPU of the
>> second node of the 72-cpu Grace system is being called from the
>> CPUHP_AP_MIPS_GIC_TIMER_STARTING state inside the starting section of
> Surely *not* that particular state.
My mistake, it should be CPUHP_AP_IRQ_GIC_STARTING. There are three
static gic_starting_cpu() functions that confuse me.
>> the CPU hotplug bringup pipeline where interrupts are disabled. This is
>> an atomic context where sleeping is not allowed, and acquiring a sleeping
>> rt_spin_lock within kzalloc() may lead to a system hang in case there is
>> lock contention.
>>
>> To work around this issue, a static buffer is used for cpumask
>> allocation when running a PREEMPT_RT kernel via the newly introduced
>> vpe_alloc_cpumask() helper. The static buffer is currently set to be
>> 4 kbytes in size. As only one cpumask is needed per node, the current
>> size should be big enough as long as (cpumask_size() * nr_node_ids)
>> is not bigger than 4k.
> What role does the node play here? The GIC topology has nothing to do
> with NUMA. It may be true on your particular toy, but that's
> definitely not true architecturally. You could, at worst, end up with
> one such cpumask per *CPU*. That'd be a braindead system, but this
> code is written to support the architecture, not any particular
> implementation.
>
It is just what I have observed on the hardware that I used for
reproducing the problem. I agree that it may be different on other
arm64 systems.
>> Signed-off-by: Waiman Long <longman@redhat.com>
>> ---
>> drivers/irqchip/irq-gic-v3-its.c | 26 +++++++++++++++++++++++++-
>> 1 file changed, 25 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
>> index ada585bfa451..9185785524dc 100644
>> --- a/drivers/irqchip/irq-gic-v3-its.c
>> +++ b/drivers/irqchip/irq-gic-v3-its.c
>> @@ -2896,6 +2896,30 @@ static bool allocate_vpe_l2_table(int cpu, u32 id)
>> return true;
>> }
>>
>> +static void *vpe_alloc_cpumask(void)
>> +{
>> + /*
>> + * With PREEMPT_RT kernel, we can't call any k*alloc() APIs as they
>> + * may acquire a sleeping rt_spin_lock in an atomic context. So use
>> + * a pre-allocated buffer instead.
>> + */
>> + if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
>> + static unsigned long mask_buf[512];
>> + static atomic_t alloc_idx;
>> + int idx, mask_size = cpumask_size();
>> + int nr_cpumasks = sizeof(mask_buf)/mask_size;
>> +
>> + /*
>> + * Fetch an allocation index and if it points to a buffer within
>> + * mask_buf[], return that. Fall back to kzalloc() otherwise.
>> + */
>> + idx = atomic_fetch_inc(&alloc_idx);
>> + if (idx < nr_cpumasks)
>> + return &mask_buf[idx * mask_size/sizeof(long)];
>> + }
> Err, no. That's horrible. I can see three ways to address this in a
> more appealing way:
>
> - you give RT a generic allocator that works for (small) atomic
> allocations. I appreciate that's not easy, and even probably
> contrary to the RT goals. But I'm also pretty sure that the GIC code
> is not the only pile of crap being caught doing that.
>
> - you pre-compute upfront how many cpumasks you are going to require,
> based on the actual GIC topology. You do that on CPU0, outside of
> the hotplug constraints, and allocate what you need. This is
> difficult as you need to ensure the RD<->CPU matching without the
> CPUs having booted, which means wading through the DT/ACPI gunk to
> try and guess what you have.
>
> - you delay the allocation of L1 tables to a context where you can
> perform allocations, and before we have a chance of running a guest
> on this CPU. That's probably the simplest option (though dealing
> with late onlining while guests are already running could be
> interesting...).
>
> But I'm always going to say no to something that is a poor hack and
> ultimately falling back to the same broken behaviour.
Thanks for the suggestion. I will try the first alternative of a more
generic memory allocator.
Cheers,
Longman
>
> Thanks,
>
> M.
>
On Thu, Jan 08 2026 at 08:26, Marc Zyngier wrote:
> Err, no. That's horrible. I can see three ways to address this in a
> more appealing way:
>
> - you give RT a generic allocator that works for (small) atomic
> allocations. I appreciate that's not easy, and even probably
> contrary to the RT goals. But I'm also pretty sure that the GIC code
> is not the only pile of crap being caught doing that.
>
> - you pre-compute upfront how many cpumasks you are going to require,
> based on the actual GIC topology. You do that on CPU0, outside of
> the hotplug constraints, and allocate what you need. This is
> difficult as you need to ensure the RD<->CPU matching without the
> CPUs having booted, which means wading through the DT/ACPI gunk to
> try and guess what you have.
>
> - you delay the allocation of L1 tables to a context where you can
> perform allocations, and before we have a chance of running a guest
> on this CPU. That's probably the simplest option (though dealing
> with late onlining while guests are already running could be
> interesting...).
At the point where a CPU is brought up, the topology should be known
already, which means this can be allocated on the control CPU _before_
the new CPU comes up, no?
Thanks,
tglx
On Thu, 08 Jan 2026 22:11:33 +0000,
Thomas Gleixner <tglx@kernel.org> wrote:
>
> On Thu, Jan 08 2026 at 08:26, Marc Zyngier wrote:
> > Err, no. That's horrible. I can see three ways to address this in a
> > more appealing way:
> >
> > - you give RT a generic allocator that works for (small) atomic
> >   allocations. I appreciate that's not easy, and even probably
> >   contrary to the RT goals. But I'm also pretty sure that the GIC code
> >   is not the only pile of crap being caught doing that.
> >
> > - you pre-compute upfront how many cpumasks you are going to require,
> >   based on the actual GIC topology. You do that on CPU0, outside of
> >   the hotplug constraints, and allocate what you need. This is
> >   difficult as you need to ensure the RD<->CPU matching without the
> >   CPUs having booted, which means wading through the DT/ACPI gunk to
> >   try and guess what you have.
> >
> > - you delay the allocation of L1 tables to a context where you can
> >   perform allocations, and before we have a chance of running a guest
> >   on this CPU. That's probably the simplest option (though dealing
> >   with late onlining while guests are already running could be
> >   interesting...).
>
> At the point where a CPU is brought up, the topology should be known
> already, which means this can be allocated on the control CPU _before_
> the new CPU comes up, no?

No. Each CPU finds *itself* in the forest of redistributors, and from
there tries to find whether it has some shared resource with a CPU
that has booted before it. That's because firmware is absolutely awful
and can't present a consistent view of the system.

Anyway, I expect it could be solved by moving this part of the init to
an ONLINE HP callback.

Thanks,

M.

--
Without deviation from the norm, progress is not possible.
On Fri, Jan 09 2026 at 16:13, Marc Zyngier wrote:
> On Thu, 08 Jan 2026 22:11:33 +0000,
> Thomas Gleixner <tglx@kernel.org> wrote:
>> At the point where a CPU is brought up, the topology should be known
>> already, which means this can be allocated on the control CPU _before_
>> the new CPU comes up, no?
>
> No. Each CPU finds *itself* in the forest of redistributors, and from
> there tries to find whether it has some shared resource with a CPU
> that has booted before it. That's because firmware is absolutely awful
> and can't present a consistent view of the system.
Groan....
> Anyway, I expect it could be solved by moving this part of the init to
> an ONLINE HP callback.
Which needs to be before CPUHP_AP_IRQ_AFFINITY_ONLINE, but even that
might be too late because there are callbacks in the STARTING section,
i.e. timer, perf, which might rely on interrupts being accessible.
Also that patch seems to be incomplete because there is another
allocation further down in allocate_vpe_l1_table()....
Thanks,
tglx
On Sun, 11 Jan 2026 09:39:07 +0000,
Thomas Gleixner <tglx@kernel.org> wrote:
>
> On Fri, Jan 09 2026 at 16:13, Marc Zyngier wrote:
> > On Thu, 08 Jan 2026 22:11:33 +0000,
> > Thomas Gleixner <tglx@kernel.org> wrote:
> >> At the point where a CPU is brought up, the topology should be known
> >> already, which means this can be allocated on the control CPU _before_
> >> the new CPU comes up, no?
> >
> > No. Each CPU finds *itself* in the forest of redistributors, and from
> > there tries to find whether it has some shared resource with a CPU
> > that has booted before it. That's because firmware is absolutely awful
> > and can't present a consistent view of the system.
>
> Groan....
>
> > Anyway, I expect it could be solved by moving this part of the init to
> > an ONLINE HP callback.
>
> Which needs to be before CPUHP_AP_IRQ_AFFINITY_ONLINE, but even that
> might be too late because there are callbacks in the STARTING section,
> i.e. timer, perf, which might rely on interrupts being accessible.

Nah. This stuff is only for direct injection of vLPIs into guests, so
as long as this is done before we can schedule a vcpu on this physical
CPU, we're good. No physical interrupt is concerned with this code.

> Also that patch seems to be incomplete because there is another
> allocation further down in allocate_vpe_l1_table()....

Yeah, I wondered why page allocation wasn't affected by this issue,
but didn't try to find out.

M.

--
Without deviation from the norm, progress is not possible.
On 1/11/26 5:38 AM, Marc Zyngier wrote:
>> Also that patch seems to be incomplete because there is another
>> allocation further down in allocate_vpe_l1_table()....
> Yeah, I wondered why page allocation wasn't affected by this issue,
> but didn't try to find out.

The use of the GFP_ATOMIC flag in the page allocation request may help it
to dip into the reserved area and avoid taking any spinlock. In my own
test, just removing the kzalloc() call is enough to avoid any invalid
context warning. In the page allocation code, there is a zone lock and a
per_cpu_pages lock. They were not acquired in my particular test case,
though further investigation may be needed to make sure it is really safe.

Cheers,
Longman
On Sun, Jan 11 2026 at 18:02, Waiman Long wrote:
> On 1/11/26 5:38 AM, Marc Zyngier wrote:
>>> Also that patch seems to be incomplete because there is another
>>> allocation further down in allocate_vpe_l1_table()....
>> Yeah, I wondered why page allocation wasn't affected by this issue,
>> but didn't try to find out.
>
> The use of the GFP_ATOMIC flag in the page allocation request may help it
> to dip into the reserved area and avoid taking any spinlock. In my own
> test, just removing the kzalloc() call is enough to avoid any invalid
> context warning. In the page allocation code, there is a zone lock and a
> per_cpu_pages lock. They were not acquired in my particular test case,
> though further investigation may be needed to make sure it is really safe.

They might be acquired though. Only alloc_pages_nolock() guarantees that
no lock is taken IIRC.
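For illustration, switching the second allocation in
allocate_vpe_l1_table() over to alloc_pages_nolock() might look roughly
like the sketch below. This is only a sketch: the three-argument
alloc_pages_nolock(gfp_flags, nid, order) form is assumed from recent
kernels (older trees took just a node id and an order), zeroing
behaviour should be double-checked against the in-tree helper, and
unlike a GFP_ATOMIC allocation it can fail at any time, so the caller
has to handle NULL.

	/*
	 * Hypothetical replacement for the GFP_ATOMIC page allocation;
	 * np is the page count already computed by allocate_vpe_l1_table().
	 */
	page = alloc_pages_nolock(0, NUMA_NO_NODE, get_order(np * PAGE_SIZE));
	if (!page)
		return -ENOMEM;	/* caller falls back to disabling direct injection */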
On 1/12/26 10:09 AM, Thomas Gleixner wrote:
> On Sun, Jan 11 2026 at 18:02, Waiman Long wrote:
>> On 1/11/26 5:38 AM, Marc Zyngier wrote:
>>>> Also that patch seems to be incomplete because there is another
>>>> allocation further down in allocate_vpe_l1_table()....
>>> Yeah, I wondered why page allocation wasn't affected by this issue,
>>> but didn't try to find out.
>> The use of the GFP_ATOMIC flag in the page allocation request may help it
>> to dip into the reserved area and avoid taking any spinlock. In my own
>> test, just removing the kzalloc() call is enough to avoid any invalid
>> context warning. In the page allocation code, there is a zone lock and a
>> per_cpu_pages lock. They were not acquired in my particular test case,
>> though further investigation may be needed to make sure it is really safe.
> They might be acquired though. Only alloc_pages_nolock() guarantees that
> no lock is taken IIRC.

Thanks for the suggestion. I will look into using that for page
allocation. I had actually attempted to use kmalloc_nolock() to replace
kzalloc() initially. Even though it removed the call to rmqueue(), there
were other spinlocks in the slub code that were still being acquired,
like the local_lock() or the spinlock in the get_random() code. So I gave
up using that. Anyway, kmalloc_nolock() doesn't seem to be fully working
yet.

Cheers,
Longman
On 2026-01-12 12:14:30 [-0500], Waiman Long wrote:
> On 1/12/26 10:09 AM, Thomas Gleixner wrote:
> > They might be acquired though. Only alloc_pages_nolock() guarantees that
> > no lock is taken IIRC.
>
> Thanks for the suggestion. I will look into using that for page
> allocation. I had actually attempted to use kmalloc_nolock() to replace
> kzalloc() initially. Even though it removed the call to rmqueue(), there
> were other spinlocks in the slub code that were still being acquired,
> like the local_lock() or the spinlock in the get_random() code. So I gave
> up using that. Anyway, kmalloc_nolock() doesn't seem to be fully working
> yet.

with kmalloc_nolock() you have to be able to deal with a NULL pointer.
Looking at kmalloc_nolock(), it has this (in_nmi() || in_hardirq())
check on PREEMPT_RT. The reasoning was unconditional raw_spinlock_t
locking and bad lock-owner recording for hardirq.
There was a trylock path for local_lock to make it work from atomic
context. But from what I can tell this goes
kmalloc_nolock_noprof() -> __slab_alloc_node() -> __slab_alloc() ->
___slab_alloc() -> local_lock_cpu_slab()

The last one does local_lock_irqsave() on PREEMPT_RT which does a
spin_lock(). That means atomic context is not possible. Where did I make
a wrong turn? Or did this change recently? I do remember that Alexei
reworked parts of the allocator to make the local_lock based trylock
allocation work.

> Cheers,
> Longman

Sebastian
On 1/13/26 12:55, Sebastian Andrzej Siewior wrote:
> On 2026-01-12 12:14:30 [-0500], Waiman Long wrote:
>> On 1/12/26 10:09 AM, Thomas Gleixner wrote:
>> > They might be acquired though. Only alloc_pages_nolock() guarantees that
>> > no lock is taken IIRC.
>>
>> Thanks for the suggestion. I will look into using that for page
>> allocation. I had actually attempted to use kmalloc_nolock() to replace
>> kzalloc() initially. Even though it removed the call to rmqueue(), there
>> were other spinlocks in the slub code that were still being acquired,
>> like the local_lock() or the spinlock in the get_random() code. So I gave
>> up using

Hmm if get_random() code takes a spinlock, we have an unsolved
incompatibility with kmalloc_nolock() and CONFIG_SLAB_FREELIST_RANDOM.

>> that. Anyway, kmalloc_nolock() doesn't seem to be fully working yet.
>
> with kmalloc_nolock() you have to be able to deal with a NULL pointer.

Yes. So even after we fix the current problems with incompatible context, I
think kmalloc_nolock() would still be a bad fit for hw bringup code that
should not really fail. Because the possibility of failure will always
exist. The BPF use case that motivated it is quite different.

> Looking at kmalloc_nolock(), it has this (in_nmi() || in_hardirq())
> check on PREEMPT_RT. The reasoning was unconditional raw_spinlock_t
> locking and bad lock-owner recording for hardirq.
> There was a trylock path for local_lock to make it work from atomic
> context. But from what I can tell this goes
> kmalloc_nolock_noprof() -> __slab_alloc_node() -> __slab_alloc() ->
> ___slab_alloc() -> local_lock_cpu_slab()
>
> The last one does local_lock_irqsave() on PREEMPT_RT which does a
> spin_lock(). That means atomic context is not possible. Where did I make
> a wrong turn? Or did this change recently? I do remember that Alexei
> reworked parts of the allocator to make the local_lock based trylock
> allocation work.
>
>> Cheers,
>> Longman
>
> Sebastian
On 1/14/26 12:59 PM, Vlastimil Babka wrote:
> On 1/13/26 12:55, Sebastian Andrzej Siewior wrote:
>> On 2026-01-12 12:14:30 [-0500], Waiman Long wrote:
>>> On 1/12/26 10:09 AM, Thomas Gleixner wrote:
>>>> They might be acquired though. Only alloc_pages_nolock() guarantees that
>>>> no lock is taken IIRC.
>>> Thanks for the suggestion. I will look into using that for page
>>> allocation. I had actually attempted to use kmalloc_nolock() to replace
>>> kzalloc() initially. Even though it removed the call to rmqueue(), there
>>> were other spinlocks in the slub code that were still being acquired,
>>> like the local_lock() or the spinlock in the get_random() code. So I gave
>>> up using
> Hmm if get_random() code takes a spinlock, we have an unsolved
> incompatibility with kmalloc_nolock() and CONFIG_SLAB_FREELIST_RANDOM.
>
>>> that. Anyway, kmalloc_nolock() doesn't seem to be fully working yet.
>> with kmalloc_nolock() you have to be able to deal with a NULL pointer.
> Yes. So even after we fix the current problems with incompatible context, I
> think kmalloc_nolock() would still be a bad fit for hw bringup code that
> should not really fail. Because the possibility of failure will always
> exist. The BPF use case that motivated it is quite different.

Yes, it is an issue too that kmalloc_nolock() may fail. If that happens,
we don't have another good alternative.

Cheers,
Longman
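To make that failure mode concrete, any _nolock-based variant would have
to reuse the last-resort fallback that its_cpu_init_lpis() already has
for a failed allocation. A sketch only, assuming the current in-tree
kmalloc_nolock(size, gfp_flags, node) signature; the flag and node
choices are illustrative, the has_rvpeid/has_vlpis fallback is taken
from the existing driver code:

	/* May return NULL at any time; there is no retry path here. */
	mask = kmalloc_nolock(sizeof(cpumask_t), __GFP_ZERO, NUMA_NO_NODE);
	if (!mask) {
		/* Disable direct injection, as the existing error path does. */
		gic_rdists->has_rvpeid = false;
		gic_rdists->has_vlpis = false;
	}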
On Tue, Jan 13, 2026 at 3:55 AM Sebastian Andrzej Siewior
<bigeasy@linutronix.de> wrote:
>
> On 2026-01-12 12:14:30 [-0500], Waiman Long wrote:
> > On 1/12/26 10:09 AM, Thomas Gleixner wrote:
> > > They might be acquired though. Only alloc_pages_nolock() guarantees that
> > > no lock is taken IIRC.
> >
> > Thanks for the suggestion. I will look into using that for page
> > allocation. I had actually attempted to use kmalloc_nolock() to replace
> > kzalloc() initially. Even though it removed the call to rmqueue(), there
> > were other spinlocks in the slub code that were still being acquired,
> > like the local_lock() or the spinlock in the get_random() code. So I gave
> > up using that. Anyway, kmalloc_nolock() doesn't seem to be fully working
> > yet.
>
> with kmalloc_nolock() you have to be able to deal with a NULL pointer.
> Looking at kmalloc_nolock(), it has this (in_nmi() || in_hardirq())
> check on PREEMPT_RT. The reasoning was unconditional raw_spinlock_t
> locking and bad lock-owner recording for hardirq.
> There was a trylock path for local_lock to make it work from atomic
> context. But from what I can tell this goes
> kmalloc_nolock_noprof() -> __slab_alloc_node() -> __slab_alloc() ->
> ___slab_alloc() -> local_lock_cpu_slab()
>
> The last one does local_lock_irqsave() on PREEMPT_RT which does a
> spin_lock(). That means atomic context is not possible. Where did I make
> a wrong turn? Or did this change recently? I do remember that Alexei
> reworked parts of the allocator to make the local_lock based trylock
> allocation work.

Are you forgetting about local_lock_is_locked() in __slab_alloc() ?

With sheaves the whole thing will be very different.
On 2026-01-13 15:25:26 [-0800], Alexei Starovoitov wrote:
> On Tue, Jan 13, 2026 at 3:55 AM Sebastian Andrzej Siewior
> <bigeasy@linutronix.de> wrote:
> > The last one does local_lock_irqsave() on PREEMPT_RT which does a
> > spin_lock(). That means atomic context is not possible. Where did I make
> > a wrong turn? Or did this change recently? I do remember that Alexei
> > reworked parts of the allocator to make the local_lock based trylock
> > allocation work.
>
> Are you forgetting about local_lock_is_locked() in __slab_alloc() ?

Yeah but this just checks it. Further down the road there is
local_lock_cpu_slab() for the allocation and there is no try-lock on RT.

> With sheaves the whole thing will be very different.

Yes.

Sebastian
On Sun, Jan 11 2026 at 10:38, Marc Zyngier wrote:
> On Sun, 11 Jan 2026 09:39:07 +0000,
> Thomas Gleixner <tglx@kernel.org> wrote:
>>
>> On Fri, Jan 09 2026 at 16:13, Marc Zyngier wrote:
>> > On Thu, 08 Jan 2026 22:11:33 +0000,
>> > Thomas Gleixner <tglx@kernel.org> wrote:
>> >> At the point where a CPU is brought up, the topology should be known
>> >> already, which means this can be allocated on the control CPU _before_
>> >> the new CPU comes up, no?
>> >
>> > No. Each CPU finds *itself* in the forest of redistributors, and from
>> > there tries to find whether it has some shared resource with a CPU
>> > that has booted before it. That's because firmware is absolutely awful
>> > and can't present a consistent view of the system.
>>
>> Groan....
>>
>> > Anyway, I expect it could be solved by moving this part of the init to
>> > an ONLINE HP callback.
>>
>> Which needs to be before CPUHP_AP_IRQ_AFFINITY_ONLINE, but even that
> >> might be too late because there are callbacks in the STARTING section,
>> i.e. timer, perf, which might rely on interrupts being accessible.
>
> Nah. This stuff is only for direct injection of vLPIs into guests, so
> as long as this is done before we can schedule a vcpu on this physical
> CPU, we're good. No physical interrupt is concerned with this code.
That's fine then. vCPUs are considered "user-space" tasks and can't be
scheduled before CPUHP_AP_ACTIVE sets the CPU active for the scheduler.
Thanks,
tglx
On Sun, 11 Jan 2026 16:20:45 +0000,
Thomas Gleixner <tglx@kernel.org> wrote:
>
> On Sun, Jan 11 2026 at 10:38, Marc Zyngier wrote:
> > On Sun, 11 Jan 2026 09:39:07 +0000,
> > Thomas Gleixner <tglx@kernel.org> wrote:
> >>
> >> On Fri, Jan 09 2026 at 16:13, Marc Zyngier wrote:
> >> > On Thu, 08 Jan 2026 22:11:33 +0000,
> >> > Thomas Gleixner <tglx@kernel.org> wrote:
> >> >> At the point where a CPU is brought up, the topology should be known
> >> >> already, which means this can be allocated on the control CPU _before_
> >> >> the new CPU comes up, no?
> >> >
> >> > No. Each CPU finds *itself* in the forest of redistributors, and from
> >> > there tries to find whether it has some shared resource with a CPU
> >> > that has booted before it. That's because firmware is absolutely awful
> >> > and can't present a consistent view of the system.
> >>
> >> Groan....
> >>
> >> > Anyway, I expect it could be solved by moving this part of the init to
> >> > an ONLINE HP callback.
> >>
> >> Which needs to be before CPUHP_AP_IRQ_AFFINITY_ONLINE, but even that
> >> might be to late because there are callbacks in the STARTING section,
> >> i.e. timer, perf, which might rely on interrupts being accessible.
> >
> > Nah. This stuff is only for direct injection of vLPIs into guests, so
> > as long as this is done before we can schedule a vcpu on this physical
> > CPU, we're good. No physical interrupt is concerned with this code.
>
> That's fine then. vCPUs are considered "user-space" tasks and can't be
> scheduled before CPUHP_AP_ACTIVE sets the CPU active for the scheduler.
Waiman, can you please give the following hack a go on your box? The
machines I have are thankfully limited to a single ITS group, so I
can't directly reproduce your issue.
Thanks,
M.
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index ada585bfa4517..20967000f2348 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -2896,7 +2896,7 @@ static bool allocate_vpe_l2_table(int cpu, u32 id)
return true;
}
-static int allocate_vpe_l1_table(void)
+static int allocate_vpe_l1_table(unsigned int cpu)
{
void __iomem *vlpi_base = gic_data_rdist_vlpi_base();
u64 val, gpsz, npg, pa;
@@ -3012,10 +3012,11 @@ static int allocate_vpe_l1_table(void)
out:
gicr_write_vpropbaser(val, vlpi_base + GICR_VPROPBASER);
- cpumask_set_cpu(smp_processor_id(), gic_data_rdist()->vpe_table_mask);
+ cpumask_set_cpu(cpu, gic_data_rdist()->vpe_table_mask);
+ dsb(sy);
pr_debug("CPU%d: VPROPBASER = %llx %*pbl\n",
- smp_processor_id(), val,
+ cpu, val,
cpumask_pr_args(gic_data_rdist()->vpe_table_mask));
return 0;
@@ -3264,15 +3265,9 @@ static void its_cpu_init_lpis(void)
val = its_clear_vpend_valid(vlpi_base, 0, 0);
}
- if (allocate_vpe_l1_table()) {
- /*
- * If the allocation has failed, we're in massive trouble.
- * Disable direct injection, and pray that no VM was
- * already running...
- */
- gic_rdists->has_rvpeid = false;
- gic_rdists->has_vlpis = false;
- }
+ if (smp_processor_id() == 0)
+ cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "irqchip/arm/gicv3:vpe",
+ allocate_vpe_l1_table, NULL);
/* Make sure the GIC has seen the above */
dsb(sy);
--
Without deviation from the norm, progress is not possible.
On Mon, 12 Jan 2026 11:20:07 +0000,
Marc Zyngier <maz@kernel.org> wrote:
>
> On Sun, 11 Jan 2026 16:20:45 +0000,
> Thomas Gleixner <tglx@kernel.org> wrote:
> >
> > On Sun, Jan 11 2026 at 10:38, Marc Zyngier wrote:
> > > On Sun, 11 Jan 2026 09:39:07 +0000,
> > > Thomas Gleixner <tglx@kernel.org> wrote:
> > >>
> > >> On Fri, Jan 09 2026 at 16:13, Marc Zyngier wrote:
> > >> > On Thu, 08 Jan 2026 22:11:33 +0000,
> > >> > Thomas Gleixner <tglx@kernel.org> wrote:
> > >> >> At the point where a CPU is brought up, the topology should be known
> > >> >> already, which means this can be allocated on the control CPU _before_
> > >> >> the new CPU comes up, no?
> > >> >
> > >> > No. Each CPU finds *itself* in the forest of redistributors, and from
> > >> > there tries to find whether it has some shared resource with a CPU
> > >> > that has booted before it. That's because firmware is absolutely awful
> > >> > and can't present a consistent view of the system.
> > >>
> > >> Groan....
> > >>
> > >> > Anyway, I expect it could be solved by moving this part of the init to
> > >> > an ONLINE HP callback.
> > >>
> > >> Which needs to be before CPUHP_AP_IRQ_AFFINITY_ONLINE, but even that
> > >> might be too late because there are callbacks in the STARTING section,
> > >> i.e. timer, perf, which might rely on interrupts being accessible.
> > >
> > > Nah. This stuff is only for direct injection of vLPIs into guests, so
> > > as long as this is done before we can schedule a vcpu on this physical
> > > CPU, we're good. No physical interrupt is concerned with this code.
> >
> > That's fine then. vCPUs are considered "user-space" tasks and can't be
> > scheduled before CPUHP_AP_ACTIVE sets the CPU active for the scheduler.
>
> Waiman, can you please give the following hack a go on your box? The
> machines I have are thankfully limited to a single ITS group, so I
> can't directly reproduce your issue.

Have you managed to try this hack? I may be able to spend some time
addressing the issue in the next cycle if I have an indication that
I'm on the right track.

Thanks,

M.

--
Without deviation from the norm, progress is not possible.
On 1/21/26 3:38 AM, Marc Zyngier wrote:
> On Mon, 12 Jan 2026 11:20:07 +0000,
> Marc Zyngier <maz@kernel.org> wrote:
>> On Sun, 11 Jan 2026 16:20:45 +0000,
>> Thomas Gleixner <tglx@kernel.org> wrote:
>>> On Sun, Jan 11 2026 at 10:38, Marc Zyngier wrote:
>>>> On Sun, 11 Jan 2026 09:39:07 +0000,
>>>> Thomas Gleixner <tglx@kernel.org> wrote:
>>>>> On Fri, Jan 09 2026 at 16:13, Marc Zyngier wrote:
>>>>>> On Thu, 08 Jan 2026 22:11:33 +0000,
>>>>>> Thomas Gleixner <tglx@kernel.org> wrote:
>>>>>>> At the point where a CPU is brought up, the topology should be known
>>>>>>> already, which means this can be allocated on the control CPU _before_
>>>>>>> the new CPU comes up, no?
>>>>>> No. Each CPU finds *itself* in the forest of redistributors, and from
>>>>>> there tries to find whether it has some shared resource with a CPU
>>>>>> that has booted before it. That's because firmware is absolutely awful
>>>>>> and can't present a consistent view of the system.
>>>>> Groan....
>>>>>
>>>>>> Anyway, I expect it could be solved by moving this part of the init to
>>>>>> an ONLINE HP callback.
>>>>> Which needs to be before CPUHP_AP_IRQ_AFFINITY_ONLINE, but even that
>>>>> might be too late because there are callbacks in the STARTING section,
>>>>> i.e. timer, perf, which might rely on interrupts being accessible.
>>>> Nah. This stuff is only for direct injection of vLPIs into guests, so
>>>> as long as this is done before we can schedule a vcpu on this physical
>>>> CPU, we're good. No physical interrupt is concerned with this code.
>>> That's fine then. vCPUs are considered "user-space" tasks and can't be
>>> scheduled before CPUHP_AP_ACTIVE sets the CPU active for the scheduler.
>> Waiman, can you please give the following hack a go on your box? The
>> machines I have are thankfully limited to a single ITS group, so I
>> can't directly reproduce your issue.
> Have you managed to try this hack? I may be able to spend some time
> addressing the issue in the next cycle if I have an indication that
> I'm on the right track.

Yes, I have tried out your hack patch and the 2-socket Grace test system
booted up without producing any bug report for a RT debug kernel. I will
try out your official patch once it comes out. So moving the memory
allocation to a later part of the hotplug bringup pipeline where sleeping
is allowed should work.

Cheers,
Longman
On 1/21/26 3:41 PM, Waiman Long wrote:
>
>>> Waiman, can you please give the following hack a go on your box? The
>>> machines I have are thankfully limited to a single ITS group, so I
>>> can't directly reproduce your issue.
>> Have you managed to try this hack? I may be able to spend some time
>> addressing the issue in the next cycle if I have an indication that
>> I'm on the right track.
>
> Yes, I have tried out your hack patch and the 2-socket Grace test
> system booted up without producing any bug report for a RT debug
> kernel. I will try out your official patch once it comes out. So moving
> the memory allocation to a later part of the hotplug bringup pipeline
> where sleeping is allowed should work.

Attaching the dmesg log for your further investigation.

Cheers,
Longman
On 1/21/26 3:38 AM, Marc Zyngier wrote:
> On Mon, 12 Jan 2026 11:20:07 +0000,
> Marc Zyngier <maz@kernel.org> wrote:
>> On Sun, 11 Jan 2026 16:20:45 +0000,
>> Thomas Gleixner <tglx@kernel.org> wrote:
>>> On Sun, Jan 11 2026 at 10:38, Marc Zyngier wrote:
>>>> On Sun, 11 Jan 2026 09:39:07 +0000,
>>>> Thomas Gleixner <tglx@kernel.org> wrote:
>>>>> On Fri, Jan 09 2026 at 16:13, Marc Zyngier wrote:
>>>>>> On Thu, 08 Jan 2026 22:11:33 +0000,
>>>>>> Thomas Gleixner <tglx@kernel.org> wrote:
>>>>>>> At the point where a CPU is brought up, the topology should be known
>>>>>>> already, which means this can be allocated on the control CPU _before_
>>>>>>> the new CPU comes up, no?
>>>>>> No. Each CPU finds *itself* in the forest of redistributors, and from
>>>>>> there tries to find whether it has some shared resource with a CPU
>>>>>> that has booted before it. That's because firmware is absolutely awful
>>>>>> and can't present a consistent view of the system.
>>>>> Groan....
>>>>>
>>>>>> Anyway, I expect it could be solved by moving this part of the init to
>>>>>> an ONLINE HP callback.
>>>>> Which needs to be before CPUHP_AP_IRQ_AFFINITY_ONLINE, but even that
>>>>> might be too late because there are callbacks in the STARTING section,
>>>>> i.e. timer, perf, which might rely on interrupts being accessible.
>>>> Nah. This stuff is only for direct injection of vLPIs into guests, so
>>>> as long as this is done before we can schedule a vcpu on this physical
>>>> CPU, we're good. No physical interrupt is concerned with this code.
>>> That's fine then. vCPUs are considered "user-space" tasks and can't be
>>> scheduled before CPUHP_AP_ACTIVE sets the CPU active for the scheduler.
>> Waiman, can you please give the following hack a go on your box? The
>> machines I have are thankfully limited to a single ITS group, so I
>> can't directly reproduce your issue.
> Have you managed to try this hack? I may be able to spend some time
> addressing the issue in the next cycle if I have an indication that
> I'm on the right track.

I am sorry that I was busy working on other stuff. Will try out the hack
today and report back ASAP.

Cheers,
Longman
On 2026-01-12 11:20:07 [+0000], Marc Zyngier wrote:
> On Sun, 11 Jan 2026 16:20:45 +0000,
> Thomas Gleixner <tglx@kernel.org> wrote:
> >
> > On Sun, Jan 11 2026 at 10:38, Marc Zyngier wrote:
> > > On Sun, 11 Jan 2026 09:39:07 +0000,
> > > Thomas Gleixner <tglx@kernel.org> wrote:
> > >>
> > >> On Fri, Jan 09 2026 at 16:13, Marc Zyngier wrote:
> > >> > On Thu, 08 Jan 2026 22:11:33 +0000,
> > >> > Thomas Gleixner <tglx@kernel.org> wrote:
> > >> >> At the point where a CPU is brought up, the topology should be known
> > >> >> already, which means this can be allocated on the control CPU _before_
> > >> >> the new CPU comes up, no?
> > >> >
> > >> > No. Each CPU finds *itself* in the forest of redistributors, and from
> > >> > there tries to find whether it has some shared resource with a CPU
> > >> > that has booted before it. That's because firmware is absolutely awful
> > >> > and can't present a consistent view of the system.
> > >>
> > >> Groan....
> > >>
> > >> > Anyway, I expect it could be solved by moving this part of the init to
> > >> > an ONLINE HP callback.
> > >>
> > >> Which needs to be before CPUHP_AP_IRQ_AFFINITY_ONLINE, but even that
> > >> might be too late because there are callbacks in the STARTING section,
> > >> i.e. timer, perf, which might rely on interrupts being accessible.
> > >
> > > Nah. This stuff is only for direct injection of vLPIs into guests, so
> > > as long as this is done before we can schedule a vcpu on this physical
> > > CPU, we're good. No physical interrupt is concerned with this code.
> >
> > That's fine then. vCPUs are considered "user-space" tasks and can't be
> > scheduled before CPUHP_AP_ACTIVE sets the CPU active for the scheduler.
>
> Waiman, can you please give the following hack a go on your box? The
> machines I have are thankfully limited to a single ITS group, so I
> can't directly reproduce your issue.
>
> Thanks,
>
> M.
>
> diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
> index ada585bfa4517..20967000f2348 100644
> --- a/drivers/irqchip/irq-gic-v3-its.c
> +++ b/drivers/irqchip/irq-gic-v3-its.c
> @@ -2896,7 +2896,7 @@ static bool allocate_vpe_l2_table(int cpu, u32 id)
> return true;
> }
>
> -static int allocate_vpe_l1_table(void)
> +static int allocate_vpe_l1_table(unsigned int cpu)
> {
> void __iomem *vlpi_base = gic_data_rdist_vlpi_base();
> u64 val, gpsz, npg, pa;
> @@ -3012,10 +3012,11 @@ static int allocate_vpe_l1_table(void)
>
> out:
> gicr_write_vpropbaser(val, vlpi_base + GICR_VPROPBASER);
> - cpumask_set_cpu(smp_processor_id(), gic_data_rdist()->vpe_table_mask);
> + cpumask_set_cpu(cpu, gic_data_rdist()->vpe_table_mask);
> + dsb(sy);
>
> pr_debug("CPU%d: VPROPBASER = %llx %*pbl\n",
> - smp_processor_id(), val,
> + cpu, val,
> cpumask_pr_args(gic_data_rdist()->vpe_table_mask));
>
> return 0;
> @@ -3264,15 +3265,9 @@ static void its_cpu_init_lpis(void)
> val = its_clear_vpend_valid(vlpi_base, 0, 0);
> }
>
> - if (allocate_vpe_l1_table()) {
> - /*
> - * If the allocation has failed, we're in massive trouble.
> - * Disable direct injection, and pray that no VM was
> - * already running...
> - */
> - gic_rdists->has_rvpeid = false;
> - gic_rdists->has_vlpis = false;
> - }
> + if (smp_processor_id() == 0)
> + cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "irqchip/arm/gicv3:vpe",
> + allocate_vpe_l1_table, NULL);
If you move it to the online state then you could also
s/GFP_ATOMIC/GFP_KERNEL/.

Also, previously you checked the error code and set has_rvpeid and
has_vlpis on failure. Now you should do the same in case of a failure
during the registration.

This also happens on CPU hotplug and I don't see how you avoid a second
allocation. But I also don't understand why this registration happens on
CPU0. It might be just a test patch…
>
> /* Make sure the GIC has seen the above */
> dsb(sy);
Sebastian
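Sebastian's second point could be addressed along these lines. A hedged
sketch only, not the eventual fix; it assumes the test hack's
registration stays as-is and relies on cpuhp_setup_state() returning a
negative errno on failure (for CPUHP_AP_ONLINE_DYN it returns the
allocated state number on success):

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "irqchip/arm/gicv3:vpe",
				allocate_vpe_l1_table, NULL);
	if (ret < 0) {
		/* Mirror the old inline error path: no direct vLPI injection. */
		gic_rdists->has_rvpeid = false;
		gic_rdists->has_vlpis = false;
	}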
On Mon, 12 Jan 2026 14:08:37 +0000,
Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:
>
> On 2026-01-12 11:20:07 [+0000], Marc Zyngier wrote:
> > On Sun, 11 Jan 2026 16:20:45 +0000,
> > Thomas Gleixner <tglx@kernel.org> wrote:
> > >
> > > On Sun, Jan 11 2026 at 10:38, Marc Zyngier wrote:
> > > > On Sun, 11 Jan 2026 09:39:07 +0000,
> > > > Thomas Gleixner <tglx@kernel.org> wrote:
> > > >>
> > > >> On Fri, Jan 09 2026 at 16:13, Marc Zyngier wrote:
> > > >> > On Thu, 08 Jan 2026 22:11:33 +0000,
> > > >> > Thomas Gleixner <tglx@kernel.org> wrote:
> > > >> >> At the point where a CPU is brought up, the topology should be known
> > > >> >> already, which means this can be allocated on the control CPU _before_
> > > >> >> the new CPU comes up, no?
> > > >> >
> > > >> > No. Each CPU finds *itself* in the forest of redistributors, and from
> > > >> > there tries to find whether it has some shared resource with a CPU
> > > >> > that has booted before it. That's because firmware is absolutely awful
> > > >> > and can't present a consistent view of the system.
> > > >>
> > > >> Groan....
> > > >>
> > > >> > Anyway, I expect it could be solved by moving this part of the init to
> > > >> > an ONLINE HP callback.
> > > >>
> > > >> Which needs to be before CPUHP_AP_IRQ_AFFINITY_ONLINE, but even that
> > > >> might be to late because there are callbacks in the STARTING section,
> > > >> i.e. timer, perf, which might rely on interrupts being accessible.
> > > >
> > > > Nah. This stuff is only for direct injection of vLPIs into guests, so
> > > > as long as this is done before we can schedule a vcpu on this physical
> > > > CPU, we're good. No physical interrupt is concerned with this code.
> > >
> > > That's fine then. vCPUs are considered "user-space" tasks and can't be
> > > scheduled before CPUHP_AP_ACTIVE sets the CPU active for the scheduler.
> >
> > Waiman, can you please give the following hack a go on your box? The
> > machines I have are thankfully limited to a single ITS group, so I
> > can't directly reproduce your issue.
> >
> > Thanks,
> >
> > M.
> >
> > diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
> > index ada585bfa4517..20967000f2348 100644
> > --- a/drivers/irqchip/irq-gic-v3-its.c
> > +++ b/drivers/irqchip/irq-gic-v3-its.c
> > @@ -2896,7 +2896,7 @@ static bool allocate_vpe_l2_table(int cpu, u32 id)
> > return true;
> > }
> >
> > -static int allocate_vpe_l1_table(void)
> > +static int allocate_vpe_l1_table(unsigned int cpu)
> > {
> > void __iomem *vlpi_base = gic_data_rdist_vlpi_base();
> > u64 val, gpsz, npg, pa;
> > @@ -3012,10 +3012,11 @@ static int allocate_vpe_l1_table(void)
> >
> > out:
> > gicr_write_vpropbaser(val, vlpi_base + GICR_VPROPBASER);
> > - cpumask_set_cpu(smp_processor_id(), gic_data_rdist()->vpe_table_mask);
> > + cpumask_set_cpu(cpu, gic_data_rdist()->vpe_table_mask);
> > + dsb(sy);
> >
> > pr_debug("CPU%d: VPROPBASER = %llx %*pbl\n",
> > - smp_processor_id(), val,
> > + cpu, val,
> > cpumask_pr_args(gic_data_rdist()->vpe_table_mask));
> >
> > return 0;
> > @@ -3264,15 +3265,9 @@ static void its_cpu_init_lpis(void)
> > val = its_clear_vpend_valid(vlpi_base, 0, 0);
> > }
> >
> > - if (allocate_vpe_l1_table()) {
> > - /*
> > - * If the allocation has failed, we're in massive trouble.
> > - * Disable direct injection, and pray that no VM was
> > - * already running...
> > - */
> > - gic_rdists->has_rvpeid = false;
> > - gic_rdists->has_vlpis = false;
> > - }
> > + if (smp_processor_id() == 0)
> > + cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "irqchip/arm/gicv3:vpe",
> > + allocate_vpe_l1_table, NULL);
>
> If you move it the online state then you could also
> s/GFP_ATOMIC/GFP_KERNEL.
>
> Also previously you checked the error code set has_rvpeid, has_vlpis on
> failure. Now you you should the same in case of a failure during
> registration.
> This also happens happens on CPU hotplug and I don't see how you avoid a
> second allocation. But I also don't understand why this registrations
> happens on CPU0. It might be just a test patch…
It's just a test hack. There are way more things that would need to
change in order to cope with moving this to CPUHP, but I want
confirmation that this indeed solves the original issue before I start
breaking more things.
Thanks,
M.
--
Without deviation from the norm, progress is not possible.