irqchip/gic-v3-its: Don't acquire rt_spin_lock in allocate_vpe_l1_table()

[PATCH] irqchip/gic-v3-its: Don't acquire rt_spin_lock in allocate_vpe_l1_table()

Posted by Waiman Long 1 month ago

When running a PREEMPT_RT debug kernel on a 2-socket Grace arm64 system,
the following bug report was produced at bootup time.

  BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48
  in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 0, name: swapper/72
  preempt_count: 1, expected: 0
  RCU nest depth: 1, expected: 1
   :
  CPU: 72 UID: 0 PID: 0 Comm: swapper/72 Tainted: G        W           6.19.0-rc4-test+ #4 PREEMPT_{RT,(full)}
  Tainted: [W]=WARN
  Call trace:
    :
   rt_spin_lock+0xe4/0x408
   rmqueue_bulk+0x48/0x1de8
   __rmqueue_pcplist+0x410/0x650
   rmqueue.constprop.0+0x6a8/0x2b50
   get_page_from_freelist+0x3c0/0xe68
   __alloc_frozen_pages_noprof+0x1dc/0x348
   alloc_pages_mpol+0xe4/0x2f8
   alloc_frozen_pages_noprof+0x124/0x190
   allocate_slab+0x2f0/0x438
   new_slab+0x4c/0x80
   ___slab_alloc+0x410/0x798
   __slab_alloc.constprop.0+0x88/0x1e0
   __kmalloc_cache_noprof+0x2dc/0x4b0
   allocate_vpe_l1_table+0x114/0x788
   its_cpu_init_lpis+0x344/0x790
   its_cpu_init+0x60/0x220
   gic_starting_cpu+0x64/0xe8
   cpuhp_invoke_callback+0x438/0x6d8
   __cpuhp_invoke_callback_range+0xd8/0x1f8
   notify_cpu_starting+0x11c/0x178
   secondary_start_kernel+0xc8/0x188
   __secondary_switched+0xc0/0xc8

This is due to the fact that allocate_vpe_l1_table() will call
kzalloc() to allocate a cpumask_t when the first CPU of the
second node of the 72-cpu Grace system is being called from the
CPUHP_AP_MIPS_GIC_TIMER_STARTING state inside the starting section of
the CPU hotplug bringup pipeline where interrupt is disabled. This is an
atomic context where sleeping is not allowed and acquiring a sleeping
rt_spin_lock within kzalloc() may lead to system hang in case there is
a lock contention.

To work around this issue, a static buffer is used for cpumask
allocation when running a PREEMPT_RT kernel via the newly introduced
vpe_alloc_cpumask() helper. The static buffer is currently set to be
4 kbytes in size. As only one cpumask is needed per node, the current
size should be big enough as long as (cpumask_size() * nr_node_ids)
is not bigger than 4k.

Signed-off-by: Waiman Long <longman@redhat.com>
---
 drivers/irqchip/irq-gic-v3-its.c | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index ada585bfa451..9185785524dc 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -2896,6 +2896,30 @@ static bool allocate_vpe_l2_table(int cpu, u32 id)
 	return true;
 }
 
+static void *vpe_alloc_cpumask(void)
+{
+	/*
+	 * With PREEMPT_RT kernel, we can't call any k*alloc() APIs as they
+	 * may acquire a sleeping rt_spin_lock in an atomic context. So use
+	 * a pre-allocated buffer instead.
+	 */
+	if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+		static unsigned long mask_buf[512];
+		static atomic_t	alloc_idx;
+		int idx, mask_size = cpumask_size();
+		int nr_cpumasks = sizeof(mask_buf)/mask_size;
+
+		/*
+		 * Fetch an allocation index and if it points to a buffer within
+		 * mask_buf[], return that. Fall back to kzalloc() otherwise.
+		 */
+		idx = atomic_fetch_inc(&alloc_idx);
+		if (idx < nr_cpumasks)
+			return &mask_buf[idx * mask_size/sizeof(long)];
+	}
+	return kzalloc(sizeof(cpumask_t), GFP_ATOMIC);
+}
+
 static int allocate_vpe_l1_table(void)
 {
 	void __iomem *vlpi_base = gic_data_rdist_vlpi_base();
@@ -2927,7 +2951,7 @@ static int allocate_vpe_l1_table(void)
 	if (val & GICR_VPROPBASER_4_1_VALID)
 		goto out;
 
-	gic_data_rdist()->vpe_table_mask = kzalloc(sizeof(cpumask_t), GFP_ATOMIC);
+	gic_data_rdist()->vpe_table_mask = vpe_alloc_cpumask();
 	if (!gic_data_rdist()->vpe_table_mask)
 		return -ENOMEM;
 
-- 
2.52.0

Re: [PATCH] irqchip/gic-v3-its: Don't acquire rt_spin_lock in allocate_vpe_l1_table()

Posted by Marc Zyngier 1 month ago

On Wed, 07 Jan 2026 21:53:53 +0000,
Waiman Long <longman@redhat.com> wrote:
> 
> When running a PREEMPT_RT debug kernel on a 2-socket Grace arm64 system,
> the following bug report was produced at bootup time.
> 
>   BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48
>   in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 0, name: swapper/72
>   preempt_count: 1, expected: 0
>   RCU nest depth: 1, expected: 1
>    :
>   CPU: 72 UID: 0 PID: 0 Comm: swapper/72 Tainted: G        W           6.19.0-rc4-test+ #4 PREEMPT_{RT,(full)}
>   Tainted: [W]=WARN
>   Call trace:
>     :
>    rt_spin_lock+0xe4/0x408
>    rmqueue_bulk+0x48/0x1de8
>    __rmqueue_pcplist+0x410/0x650
>    rmqueue.constprop.0+0x6a8/0x2b50
>    get_page_from_freelist+0x3c0/0xe68
>    __alloc_frozen_pages_noprof+0x1dc/0x348
>    alloc_pages_mpol+0xe4/0x2f8
>    alloc_frozen_pages_noprof+0x124/0x190
>    allocate_slab+0x2f0/0x438
>    new_slab+0x4c/0x80
>    ___slab_alloc+0x410/0x798
>    __slab_alloc.constprop.0+0x88/0x1e0
>    __kmalloc_cache_noprof+0x2dc/0x4b0
>    allocate_vpe_l1_table+0x114/0x788
>    its_cpu_init_lpis+0x344/0x790
>    its_cpu_init+0x60/0x220
>    gic_starting_cpu+0x64/0xe8
>    cpuhp_invoke_callback+0x438/0x6d8
>    __cpuhp_invoke_callback_range+0xd8/0x1f8
>    notify_cpu_starting+0x11c/0x178
>    secondary_start_kernel+0xc8/0x188
>    __secondary_switched+0xc0/0xc8
> 
> This is due to the fact that allocate_vpe_l1_table() will call
> kzalloc() to allocate a cpumask_t when the first CPU of the
> second node of the 72-cpu Grace system is being called from the
> CPUHP_AP_MIPS_GIC_TIMER_STARTING state inside the starting section of

Surely *not* that particular state.

> the CPU hotplug bringup pipeline where interrupt is disabled. This is an
> atomic context where sleeping is not allowed and acquiring a sleeping
> rt_spin_lock within kzalloc() may lead to system hang in case there is
> a lock contention.
> 
> To work around this issue, a static buffer is used for cpumask
> allocation when running a PREEMPT_RT kernel via the newly introduced
> vpe_alloc_cpumask() helper. The static buffer is currently set to be
> 4 kbytes in size. As only one cpumask is needed per node, the current
> size should be big enough as long as (cpumask_size() * nr_node_ids)
> is not bigger than 4k.

What role does the node play here? The GIC topology has nothing to do
with NUMA. It may be true on your particular toy, but that's
definitely not true architecturally. You could, at worse, end-up with
one such cpumask per *CPU*. That'd be a braindead system, but this
code is written to support the architecture, not any particular
implementation.

> 
> Signed-off-by: Waiman Long <longman@redhat.com>
> ---
>  drivers/irqchip/irq-gic-v3-its.c | 26 +++++++++++++++++++++++++-
>  1 file changed, 25 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
> index ada585bfa451..9185785524dc 100644
> --- a/drivers/irqchip/irq-gic-v3-its.c
> +++ b/drivers/irqchip/irq-gic-v3-its.c
> @@ -2896,6 +2896,30 @@ static bool allocate_vpe_l2_table(int cpu, u32 id)
>  	return true;
>  }
>  
> +static void *vpe_alloc_cpumask(void)
> +{
> +	/*
> +	 * With PREEMPT_RT kernel, we can't call any k*alloc() APIs as they
> +	 * may acquire a sleeping rt_spin_lock in an atomic context. So use
> +	 * a pre-allocated buffer instead.
> +	 */
> +	if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
> +		static unsigned long mask_buf[512];
> +		static atomic_t	alloc_idx;
> +		int idx, mask_size = cpumask_size();
> +		int nr_cpumasks = sizeof(mask_buf)/mask_size;
> +
> +		/*
> +		 * Fetch an allocation index and if it points to a buffer within
> +		 * mask_buf[], return that. Fall back to kzalloc() otherwise.
> +		 */
> +		idx = atomic_fetch_inc(&alloc_idx);
> +		if (idx < nr_cpumasks)
> +			return &mask_buf[idx * mask_size/sizeof(long)];
> +	}

Err, no. That's horrible. I can see three ways to address this in a
more appealing way:

- you give RT a generic allocator that works for (small) atomic
  allocations. I appreciate that's not easy, and even probably
  contrary to the RT goals. But I'm also pretty sure that the GIC code
  is not the only pile of crap being caught doing that.

- you pre-compute upfront how many cpumasks you are going to require,
  based on the actual GIC topology. You do that on CPU0, outside of
  the hotplug constraints, and allocate what you need. This is
  difficult as you need to ensure the RD<->CPU matching without the
  CPUs having booted, which means wading through the DT/ACPI gunk to
  try and guess what you have.

- you delay the allocation of L1 tables to a context where you can
  perform allocations, and before we have a chance of running a guest
  on this CPU. That's probably the simplest option (though dealing
  with late onlining while guests are already running could be
  interesting...).

But I'm always going to say no to something that is a poor hack and
ultimately falling back to the same broken behaviour.

Thanks,

	M.

-- 
Without deviation from the norm, progress is not possible.

Re: [PATCH] irqchip/gic-v3-its: Don't acquire rt_spin_lock in allocate_vpe_l1_table()

Posted by Waiman Long 4 weeks ago

On 1/8/26 3:26 AM, Marc Zyngier wrote:
> On Wed, 07 Jan 2026 21:53:53 +0000,
> Waiman Long <longman@redhat.com> wrote:
>> When running a PREEMPT_RT debug kernel on a 2-socket Grace arm64 system,
>> the following bug report was produced at bootup time.
>>
>>    BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48
>>    in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 0, name: swapper/72
>>    preempt_count: 1, expected: 0
>>    RCU nest depth: 1, expected: 1
>>     :
>>    CPU: 72 UID: 0 PID: 0 Comm: swapper/72 Tainted: G        W           6.19.0-rc4-test+ #4 PREEMPT_{RT,(full)}
>>    Tainted: [W]=WARN
>>    Call trace:
>>      :
>>     rt_spin_lock+0xe4/0x408
>>     rmqueue_bulk+0x48/0x1de8
>>     __rmqueue_pcplist+0x410/0x650
>>     rmqueue.constprop.0+0x6a8/0x2b50
>>     get_page_from_freelist+0x3c0/0xe68
>>     __alloc_frozen_pages_noprof+0x1dc/0x348
>>     alloc_pages_mpol+0xe4/0x2f8
>>     alloc_frozen_pages_noprof+0x124/0x190
>>     allocate_slab+0x2f0/0x438
>>     new_slab+0x4c/0x80
>>     ___slab_alloc+0x410/0x798
>>     __slab_alloc.constprop.0+0x88/0x1e0
>>     __kmalloc_cache_noprof+0x2dc/0x4b0
>>     allocate_vpe_l1_table+0x114/0x788
>>     its_cpu_init_lpis+0x344/0x790
>>     its_cpu_init+0x60/0x220
>>     gic_starting_cpu+0x64/0xe8
>>     cpuhp_invoke_callback+0x438/0x6d8
>>     __cpuhp_invoke_callback_range+0xd8/0x1f8
>>     notify_cpu_starting+0x11c/0x178
>>     secondary_start_kernel+0xc8/0x188
>>     __secondary_switched+0xc0/0xc8
>>
>> This is due to the fact that allocate_vpe_l1_table() will call
>> kzalloc() to allocate a cpumask_t when the first CPU of the
>> second node of the 72-cpu Grace system is being called from the
>> CPUHP_AP_MIPS_GIC_TIMER_STARTING state inside the starting section of
> Surely *not* that particular state.

My mistake, it should be CPUHP_AP_IRQ_GIC_STARTING. There are three 
static gic_starting_cpu() functions that confuse me.


>> the CPU hotplug bringup pipeline where interrupt is disabled. This is an
>> atomic context where sleeping is not allowed and acquiring a sleeping
>> rt_spin_lock within kzalloc() may lead to system hang in case there is
>> a lock contention.
>>
>> To work around this issue, a static buffer is used for cpumask
>> allocation when running a PREEMPT_RT kernel via the newly introduced
>> vpe_alloc_cpumask() helper. The static buffer is currently set to be
>> 4 kbytes in size. As only one cpumask is needed per node, the current
>> size should be big enough as long as (cpumask_size() * nr_node_ids)
>> is not bigger than 4k.
> What role does the node play here? The GIC topology has nothing to do
> with NUMA. It may be true on your particular toy, but that's
> definitely not true architecturally. You could, at worse, end-up with
> one such cpumask per *CPU*. That'd be a braindead system, but this
> code is written to support the architecture, not any particular
> implementation.
>
It is just what I have observed on the hardware that I used for 
reproducing the problem. I agree that it may be different in other arm64 
CPUs.
>> Signed-off-by: Waiman Long <longman@redhat.com>
>> ---
>>   drivers/irqchip/irq-gic-v3-its.c | 26 +++++++++++++++++++++++++-
>>   1 file changed, 25 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
>> index ada585bfa451..9185785524dc 100644
>> --- a/drivers/irqchip/irq-gic-v3-its.c
>> +++ b/drivers/irqchip/irq-gic-v3-its.c
>> @@ -2896,6 +2896,30 @@ static bool allocate_vpe_l2_table(int cpu, u32 id)
>>   	return true;
>>   }
>>   
>> +static void *vpe_alloc_cpumask(void)
>> +{
>> +	/*
>> +	 * With PREEMPT_RT kernel, we can't call any k*alloc() APIs as they
>> +	 * may acquire a sleeping rt_spin_lock in an atomic context. So use
>> +	 * a pre-allocated buffer instead.
>> +	 */
>> +	if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
>> +		static unsigned long mask_buf[512];
>> +		static atomic_t	alloc_idx;
>> +		int idx, mask_size = cpumask_size();
>> +		int nr_cpumasks = sizeof(mask_buf)/mask_size;
>> +
>> +		/*
>> +		 * Fetch an allocation index and if it points to a buffer within
>> +		 * mask_buf[], return that. Fall back to kzalloc() otherwise.
>> +		 */
>> +		idx = atomic_fetch_inc(&alloc_idx);
>> +		if (idx < nr_cpumasks)
>> +			return &mask_buf[idx * mask_size/sizeof(long)];
>> +	}
> Err, no. That's horrible. I can see three ways to address this in a
> more appealing way:
>
> - you give RT a generic allocator that works for (small) atomic
>    allocations. I appreciate that's not easy, and even probably
>    contrary to the RT goals. But I'm also pretty sure that the GIC code
>    is not the only pile of crap being caught doing that.
>
> - you pre-compute upfront how many cpumasks you are going to require,
>    based on the actual GIC topology. You do that on CPU0, outside of
>    the hotplug constraints, and allocate what you need. This is
>    difficult as you need to ensure the RD<->CPU matching without the
>    CPUs having booted, which means wading through the DT/ACPI gunk to
>    try and guess what you have.
>
> - you delay the allocation of L1 tables to a context where you can
>    perform allocations, and before we have a chance of running a guest
>    on this CPU. That's probably the simplest option (though dealing
>    with late onlining while guests are already running could be
>    interesting...).
>
> But I'm always going to say no to something that is a poor hack and
> ultimately falling back to the same broken behaviour.

Thanks for the suggestion. I will try  the first alternative of a more 
generic memory allocator.

Cheers,
Longman

>
> Thanks,
>
> 	M.
>

Re: [PATCH] irqchip/gic-v3-its: Don't acquire rt_spin_lock in allocate_vpe_l1_table()

Posted by Thomas Gleixner 1 month ago

On Thu, Jan 08 2026 at 08:26, Marc Zyngier wrote:
> Err, no. That's horrible. I can see three ways to address this in a
> more appealing way:
>
> - you give RT a generic allocator that works for (small) atomic
>   allocations. I appreciate that's not easy, and even probably
>   contrary to the RT goals. But I'm also pretty sure that the GIC code
>   is not the only pile of crap being caught doing that.
>
> - you pre-compute upfront how many cpumasks you are going to require,
>   based on the actual GIC topology. You do that on CPU0, outside of
>   the hotplug constraints, and allocate what you need. This is
>   difficult as you need to ensure the RD<->CPU matching without the
>   CPUs having booted, which means wading through the DT/ACPI gunk to
>   try and guess what you have.
>
> - you delay the allocation of L1 tables to a context where you can
>   perform allocations, and before we have a chance of running a guest
>   on this CPU. That's probably the simplest option (though dealing
>   with late onlining while guests are already running could be
>   interesting...).

At the point where a CPU is brought up, the topology should be known
already, which means this can be allocated on the control CPU _before_
the new CPU comes up, no?

Thanks,

        tglx

Re: [PATCH] irqchip/gic-v3-its: Don't acquire rt_spin_lock in allocate_vpe_l1_table()

Posted by Marc Zyngier 1 month ago

On Thu, 08 Jan 2026 22:11:33 +0000,
Thomas Gleixner <tglx@kernel.org> wrote:
> 
> On Thu, Jan 08 2026 at 08:26, Marc Zyngier wrote:
> > Err, no. That's horrible. I can see three ways to address this in a
> > more appealing way:
> >
> > - you give RT a generic allocator that works for (small) atomic
> >   allocations. I appreciate that's not easy, and even probably
> >   contrary to the RT goals. But I'm also pretty sure that the GIC code
> >   is not the only pile of crap being caught doing that.
> >
> > - you pre-compute upfront how many cpumasks you are going to require,
> >   based on the actual GIC topology. You do that on CPU0, outside of
> >   the hotplug constraints, and allocate what you need. This is
> >   difficult as you need to ensure the RD<->CPU matching without the
> >   CPUs having booted, which means wading through the DT/ACPI gunk to
> >   try and guess what you have.
> >
> > - you delay the allocation of L1 tables to a context where you can
> >   perform allocations, and before we have a chance of running a guest
> >   on this CPU. That's probably the simplest option (though dealing
> >   with late onlining while guests are already running could be
> >   interesting...).
> 
> At the point where a CPU is brought up, the topology should be known
> already, which means this can be allocated on the control CPU _before_
> the new CPU comes up, no?

No. Each CPU finds *itself* in the forest of redistributors, and from
there tries to find whether it has some shared resource with a CPU
that has booted before it. That's because firmware is absolutely awful
and can't present a consistent view of the system.

Anyway, I expect it could be solved by moving this part of the init to
an ONLINE HP callback.

Thanks,

	M.

-- 
Without deviation from the norm, progress is not possible.

Re: [PATCH] irqchip/gic-v3-its: Don't acquire rt_spin_lock in allocate_vpe_l1_table()

Posted by Thomas Gleixner 4 weeks ago

On Fri, Jan 09 2026 at 16:13, Marc Zyngier wrote:
> On Thu, 08 Jan 2026 22:11:33 +0000,
> Thomas Gleixner <tglx@kernel.org> wrote:
>> At the point where a CPU is brought up, the topology should be known
>> already, which means this can be allocated on the control CPU _before_
>> the new CPU comes up, no?
>
> No. Each CPU finds *itself* in the forest of redistributors, and from
> there tries to find whether it has some shared resource with a CPU
> that has booted before it. That's because firmware is absolutely awful
> and can't present a consistent view of the system.

Groan....

> Anyway, I expect it could be solved by moving this part of the init to
> an ONLINE HP callback.

Which needs to be before CPUHP_AP_IRQ_AFFINITY_ONLINE, but even that
might be to late because there are callbacks in the STARTING section,
i.e. timer, perf, which might rely on interrupts being accessible.

Also that patch seems to be incomplete because there is another
allocation further down in allocate_vpe_l1_table()....

Thanks,

        tglx

Re: [PATCH] irqchip/gic-v3-its: Don't acquire rt_spin_lock in allocate_vpe_l1_table()

Posted by Marc Zyngier 4 weeks ago

On Sun, 11 Jan 2026 09:39:07 +0000,
Thomas Gleixner <tglx@kernel.org> wrote:
> 
> On Fri, Jan 09 2026 at 16:13, Marc Zyngier wrote:
> > On Thu, 08 Jan 2026 22:11:33 +0000,
> > Thomas Gleixner <tglx@kernel.org> wrote:
> >> At the point where a CPU is brought up, the topology should be known
> >> already, which means this can be allocated on the control CPU _before_
> >> the new CPU comes up, no?
> >
> > No. Each CPU finds *itself* in the forest of redistributors, and from
> > there tries to find whether it has some shared resource with a CPU
> > that has booted before it. That's because firmware is absolutely awful
> > and can't present a consistent view of the system.
> 
> Groan....
>
> > Anyway, I expect it could be solved by moving this part of the init to
> > an ONLINE HP callback.
> 
> Which needs to be before CPUHP_AP_IRQ_AFFINITY_ONLINE, but even that
> might be to late because there are callbacks in the STARTING section,
> i.e. timer, perf, which might rely on interrupts being accessible.

Nah. This stuff is only for direct injection of vLPIs into guests, so
as long as this is done before we can schedule a vcpu on this physical
CPU, we're good. No physical interrupt is concerned with this code.

> Also that patch seems to be incomplete because there is another
> allocation further down in allocate_vpe_l1_table()....

Yeah, I wondered why page allocation wasn't affected by this issue,
but didn't try to find out.

	M.

-- 
Without deviation from the norm, progress is not possible.

Re: [PATCH] irqchip/gic-v3-its: Don't acquire rt_spin_lock in allocate_vpe_l1_table()

Posted by Waiman Long 3 weeks, 6 days ago

On 1/11/26 5:38 AM, Marc Zyngier wrote:
>> Also that patch seems to be incomplete because there is another
>> allocation further down in allocate_vpe_l1_table()....
> Yeah, I wondered why page allocation wasn't affected by this issue,
> but didn't try to find out.

The use of GFP_ATOMIC flag in the page allocation request may help it to 
dip into the reserved area and avoid taking any spinlock. In my own 
test, just removing the kzalloc() call is enough to avoid any invalid 
context warning. In the page allocation code, there is a zone lock and a 
per_cpu_pages lock. They were not acquired in my particular test case, 
though further investigation may be needed to make sure it is really safe.

Cheers,
Longman

Re: [PATCH] irqchip/gic-v3-its: Don't acquire rt_spin_lock in allocate_vpe_l1_table()

Posted by Thomas Gleixner 3 weeks, 6 days ago

On Sun, Jan 11 2026 at 18:02, Waiman Long wrote:
> On 1/11/26 5:38 AM, Marc Zyngier wrote:
>>> Also that patch seems to be incomplete because there is another
>>> allocation further down in allocate_vpe_l1_table()....
>> Yeah, I wondered why page allocation wasn't affected by this issue,
>> but didn't try to find out.
>
> The use of GFP_ATOMIC flag in the page allocation request may help it to 
> dip into the reserved area and avoid taking any spinlock. In my own 
> test, just removing the kzalloc() call is enough to avoid any invalid 
> context warning. In the page allocation code, there is a zone lock and a 
> per_cpu_pages lock. They were not acquired in my particular test case, 
> though further investigation may be needed to make sure it is really safe.

They might be acquired though. Only alloc_pages_nolock() guarantees that
no lock is taken IIRC.

Re: [PATCH] irqchip/gic-v3-its: Don't acquire rt_spin_lock in allocate_vpe_l1_table()

Posted by Waiman Long 3 weeks, 6 days ago

On 1/12/26 10:09 AM, Thomas Gleixner wrote:
> On Sun, Jan 11 2026 at 18:02, Waiman Long wrote:
>> On 1/11/26 5:38 AM, Marc Zyngier wrote:
>>>> Also that patch seems to be incomplete because there is another
>>>> allocation further down in allocate_vpe_l1_table()....
>>> Yeah, I wondered why page allocation wasn't affected by this issue,
>>> but didn't try to find out.
>> The use of GFP_ATOMIC flag in the page allocation request may help it to
>> dip into the reserved area and avoid taking any spinlock. In my own
>> test, just removing the kzalloc() call is enough to avoid any invalid
>> context warning. In the page allocation code, there is a zone lock and a
>> per_cpu_pages lock. They were not acquired in my particular test case,
>> though further investigation may be needed to make sure it is really safe.
> They might be acquired though. Only alloc_pages_nolock() guarantees that
> no lock is taken IIRC.

Thanks for the suggestion. I will look into using that for page 
allocation. I had actually attempt to use kmalloc_nolock() to replace 
kzalloc() initially. Even though it removed the call to rmqueue(), but 
there were other spinlocks in the slub code that were still being 
acquired like the local_lock() or the spinlock in the get_random() code. 
So I gave up using that. Anyway, kmalloc_nolock() doesn't seem to be 
fully working yet.

Cheers,
Longman

Re: [PATCH] irqchip/gic-v3-its: Don't acquire rt_spin_lock in allocate_vpe_l1_table()

Posted by Sebastian Andrzej Siewior 3 weeks, 5 days ago

On 2026-01-12 12:14:30 [-0500], Waiman Long wrote:
> On 1/12/26 10:09 AM, Thomas Gleixner wrote:
> > They might be acquired though. Only alloc_pages_nolock() guarantees that
> > no lock is taken IIRC.
> 
> Thanks for the suggestion. I will look into using that for page allocation.
> I had actually attempt to use kmalloc_nolock() to replace kzalloc()
> initially. Even though it removed the call to rmqueue(), but there were
> other spinlocks in the slub code that were still being acquired like the
> local_lock() or the spinlock in the get_random() code. So I gave up using
> that. Anyway, kmalloc_nolock() doesn't seem to be fully working yet.

with kmalloc_nolock() you have to be able to deal with a NULL pointer.
Looking at kmalloc_nolock(), it has this (in_nmi() || in_hardirq())
check on PREEMPT_RT. The reasoning was unconditional raw_spinlock_t
locking and bad lock-owner recording for hardirq.
There was a trylock path for local_lock to make it work from atomic
context. But from what I can tell this goes
  kmalloc_nolock_noprof() -> __slab_alloc_node() -> __slab_alloc() ->
  ___slab_alloc() -> local_lock_cpu_slab()

The last one does local_lock_irqsave() on PREEMPT_RT which does a
spin_lock(). That means atomic context is not possible. Where did I make
a wrong turn? Or did this change recently? I do remember that Alexei
reworked parts of the allocator to make the local_lock based trylock
allocation work.

> Cheers,
> Longman

Sebastian

Re: [PATCH] irqchip/gic-v3-its: Don't acquire rt_spin_lock in allocate_vpe_l1_table()

Posted by Vlastimil Babka 3 weeks, 4 days ago

On 1/13/26 12:55, Sebastian Andrzej Siewior wrote:
> On 2026-01-12 12:14:30 [-0500], Waiman Long wrote:
>> On 1/12/26 10:09 AM, Thomas Gleixner wrote:
>> > They might be acquired though. Only alloc_pages_nolock() guarantees that
>> > no lock is taken IIRC.
>> 
>> Thanks for the suggestion. I will look into using that for page allocation.
>> I had actually attempt to use kmalloc_nolock() to replace kzalloc()
>> initially. Even though it removed the call to rmqueue(), but there were
>> other spinlocks in the slub code that were still being acquired like the
>> local_lock() or the spinlock in the get_random() code. So I gave up using

Hmm if get_random() code takes a spinlock, we have an unsolved
incompatibility with kmalloc_nolock() and CONFIG_SLAB_FREELIST_RANDOM.

>> that. Anyway, kmalloc_nolock() doesn't seem to be fully working yet.
> 
> with kmalloc_nolock() you have to be able to deal with a NULL pointer.

Yes. So even after we fix the current problems with incompatible context, I
think kmalloc_nolock() would still be a bad fit for hw bringup code that
should not really fail. Because the possibility of failure will always
exist. The BPF use case that motivated it is quite different.

> Looking at kmalloc_nolock(), it has this (in_nmi() || in_hardirq())
> check on PREEMPT_RT. The reasoning was unconditional raw_spinlock_t
> locking and bad lock-owner recording for hardirq.
> There was a trylock path for local_lock to make it work from atomic
> context. But from what I can tell this goes
>   kmalloc_nolock_noprof() -> __slab_alloc_node() -> __slab_alloc() ->
>   ___slab_alloc() -> local_lock_cpu_slab()
> 
> The last one does local_lock_irqsave() on PREEMPT_RT which does a
> spin_lock(). That means atomic context is not possible. Where did I make
> a wrong turn? Or did this change recently? I do remember that Alexei
> reworked parts of the allocator to make the local_lock based trylock
> allocation work.
> 
>> Cheers,
>> Longman
> 
> Sebastian

Re: [PATCH] irqchip/gic-v3-its: Don't acquire rt_spin_lock in allocate_vpe_l1_table()

Posted by Waiman Long 2 weeks, 4 days ago

On 1/14/26 12:59 PM, Vlastimil Babka wrote:
> On 1/13/26 12:55, Sebastian Andrzej Siewior wrote:
>> On 2026-01-12 12:14:30 [-0500], Waiman Long wrote:
>>> On 1/12/26 10:09 AM, Thomas Gleixner wrote:
>>>> They might be acquired though. Only alloc_pages_nolock() guarantees that
>>>> no lock is taken IIRC.
>>> Thanks for the suggestion. I will look into using that for page allocation.
>>> I had actually attempt to use kmalloc_nolock() to replace kzalloc()
>>> initially. Even though it removed the call to rmqueue(), but there were
>>> other spinlocks in the slub code that were still being acquired like the
>>> local_lock() or the spinlock in the get_random() code. So I gave up using
> Hmm if get_random() code takes a spinlock, we have an unsolved
> incompatibility with kmalloc_nolock() and CONFIG_SLAB_FREELIST_RANDOM.
>
>>> that. Anyway, kmalloc_nolock() doesn't seem to be fully working yet.
>> with kmalloc_nolock() you have to be able to deal with a NULL pointer.
> Yes. So even after we fix the current problems with incompatible context, I
> think kmalloc_nolock() would still be a bad fit for hw bringup code that
> should not really fail. Because the possibility of failure will always
> exist. The BPF use case that motivated it is quite different.

Yes, it is an issue too that kmalloc_nolock() may fail. If that happens, 
we don't have another good alternative.

Cheers,
Longman

Re: [PATCH] irqchip/gic-v3-its: Don't acquire rt_spin_lock in allocate_vpe_l1_table()

Posted by Alexei Starovoitov 3 weeks, 4 days ago

On Tue, Jan 13, 2026 at 3:55 AM Sebastian Andrzej Siewior
<bigeasy@linutronix.de> wrote:
>
> On 2026-01-12 12:14:30 [-0500], Waiman Long wrote:
> > On 1/12/26 10:09 AM, Thomas Gleixner wrote:
> > > They might be acquired though. Only alloc_pages_nolock() guarantees that
> > > no lock is taken IIRC.
> >
> > Thanks for the suggestion. I will look into using that for page allocation.
> > I had actually attempt to use kmalloc_nolock() to replace kzalloc()
> > initially. Even though it removed the call to rmqueue(), but there were
> > other spinlocks in the slub code that were still being acquired like the
> > local_lock() or the spinlock in the get_random() code. So I gave up using
> > that. Anyway, kmalloc_nolock() doesn't seem to be fully working yet.
>
> with kmalloc_nolock() you have to be able to deal with a NULL pointer.
> Looking at kmalloc_nolock(), it has this (in_nmi() || in_hardirq())
> check on PREEMPT_RT. The reasoning was unconditional raw_spinlock_t
> locking and bad lock-owner recording for hardirq.
> There was a trylock path for local_lock to make it work from atomic
> context. But from what I can tell this goes
>   kmalloc_nolock_noprof() -> __slab_alloc_node() -> __slab_alloc() ->
>   ___slab_alloc() -> local_lock_cpu_slab()
>
> The last one does local_lock_irqsave() on PREEMPT_RT which does a
> spin_lock(). That means atomic context is not possible. Where did I make
> a wrong turn? Or did this change recently? I do remember that Alexei
> reworked parts of the allocator to make the local_lock based trylock
> allocation work.

Are you forgetting about local_lock_is_locked() in __slab_alloc() ?
With sheaves the whole thing will be very different.

Re: [PATCH] irqchip/gic-v3-its: Don't acquire rt_spin_lock in allocate_vpe_l1_table()

Posted by Sebastian Andrzej Siewior 3 weeks, 4 days ago

On 2026-01-13 15:25:26 [-0800], Alexei Starovoitov wrote:
> On Tue, Jan 13, 2026 at 3:55 AM Sebastian Andrzej Siewior
> <bigeasy@linutronix.de> wrote:
> > The last one does local_lock_irqsave() on PREEMPT_RT which does a
> > spin_lock(). That means atomic context is not possible. Where did I make
> > a wrong turn? Or did this change recently? I do remember that Alexei
> > reworked parts of the allocator to make the local_lock based trylock
> > allocation work.
> 
> Are you forgetting about local_lock_is_locked() in __slab_alloc() ?

Yeah but this just checks it. Further down the road there is
local_lock_cpu_slab() for the allocation and there is no try-lock on RT.

> With sheaves the whole thing will be very different.
Yes.

Sebastian

Re: [PATCH] irqchip/gic-v3-its: Don't acquire rt_spin_lock in allocate_vpe_l1_table()

Posted by Thomas Gleixner 4 weeks ago

On Sun, Jan 11 2026 at 10:38, Marc Zyngier wrote:
> On Sun, 11 Jan 2026 09:39:07 +0000,
> Thomas Gleixner <tglx@kernel.org> wrote:
>> 
>> On Fri, Jan 09 2026 at 16:13, Marc Zyngier wrote:
>> > On Thu, 08 Jan 2026 22:11:33 +0000,
>> > Thomas Gleixner <tglx@kernel.org> wrote:
>> >> At the point where a CPU is brought up, the topology should be known
>> >> already, which means this can be allocated on the control CPU _before_
>> >> the new CPU comes up, no?
>> >
>> > No. Each CPU finds *itself* in the forest of redistributors, and from
>> > there tries to find whether it has some shared resource with a CPU
>> > that has booted before it. That's because firmware is absolutely awful
>> > and can't present a consistent view of the system.
>> 
>> Groan....
>>
>> > Anyway, I expect it could be solved by moving this part of the init to
>> > an ONLINE HP callback.
>> 
>> Which needs to be before CPUHP_AP_IRQ_AFFINITY_ONLINE, but even that
>> might be to late because there are callbacks in the STARTING section,
>> i.e. timer, perf, which might rely on interrupts being accessible.
>
> Nah. This stuff is only for direct injection of vLPIs into guests, so
> as long as this is done before we can schedule a vcpu on this physical
> CPU, we're good. No physical interrupt is concerned with this code.

That's fine then. vCPUs are considered "user-space" tasks and can't be
scheduled before CPUHP_AP_ACTIVE sets the CPU active for the scheduler.

Thanks,

        tglx

Re: [PATCH] irqchip/gic-v3-its: Don't acquire rt_spin_lock in allocate_vpe_l1_table()

Posted by Marc Zyngier 3 weeks, 6 days ago

On Sun, 11 Jan 2026 16:20:45 +0000,
Thomas Gleixner <tglx@kernel.org> wrote:
> 
> On Sun, Jan 11 2026 at 10:38, Marc Zyngier wrote:
> > On Sun, 11 Jan 2026 09:39:07 +0000,
> > Thomas Gleixner <tglx@kernel.org> wrote:
> >> 
> >> On Fri, Jan 09 2026 at 16:13, Marc Zyngier wrote:
> >> > On Thu, 08 Jan 2026 22:11:33 +0000,
> >> > Thomas Gleixner <tglx@kernel.org> wrote:
> >> >> At the point where a CPU is brought up, the topology should be known
> >> >> already, which means this can be allocated on the control CPU _before_
> >> >> the new CPU comes up, no?
> >> >
> >> > No. Each CPU finds *itself* in the forest of redistributors, and from
> >> > there tries to find whether it has some shared resource with a CPU
> >> > that has booted before it. That's because firmware is absolutely awful
> >> > and can't present a consistent view of the system.
> >> 
> >> Groan....
> >>
> >> > Anyway, I expect it could be solved by moving this part of the init to
> >> > an ONLINE HP callback.
> >> 
> >> Which needs to be before CPUHP_AP_IRQ_AFFINITY_ONLINE, but even that
> >> might be to late because there are callbacks in the STARTING section,
> >> i.e. timer, perf, which might rely on interrupts being accessible.
> >
> > Nah. This stuff is only for direct injection of vLPIs into guests, so
> > as long as this is done before we can schedule a vcpu on this physical
> > CPU, we're good. No physical interrupt is concerned with this code.
> 
> That's fine then. vCPUs are considered "user-space" tasks and can't be
> scheduled before CPUHP_AP_ACTIVE sets the CPU active for the scheduler.

Waiman, can you please give the following hack a go on your box? The
machines I have are thankfully limited to a single ITS group, so I
can't directly reproduce your issue.

Thanks,

	M.

diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index ada585bfa4517..20967000f2348 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -2896,7 +2896,7 @@ static bool allocate_vpe_l2_table(int cpu, u32 id)
 	return true;
 }
 
-static int allocate_vpe_l1_table(void)
+static int allocate_vpe_l1_table(unsigned int cpu)
 {
 	void __iomem *vlpi_base = gic_data_rdist_vlpi_base();
 	u64 val, gpsz, npg, pa;
@@ -3012,10 +3012,11 @@ static int allocate_vpe_l1_table(void)
 
 out:
 	gicr_write_vpropbaser(val, vlpi_base + GICR_VPROPBASER);
-	cpumask_set_cpu(smp_processor_id(), gic_data_rdist()->vpe_table_mask);
+	cpumask_set_cpu(cpu, gic_data_rdist()->vpe_table_mask);
+	dsb(sy);
 
 	pr_debug("CPU%d: VPROPBASER = %llx %*pbl\n",
-		 smp_processor_id(), val,
+		 cpu, val,
 		 cpumask_pr_args(gic_data_rdist()->vpe_table_mask));
 
 	return 0;
@@ -3264,15 +3265,9 @@ static void its_cpu_init_lpis(void)
 		val = its_clear_vpend_valid(vlpi_base, 0, 0);
 	}
 
-	if (allocate_vpe_l1_table()) {
-		/*
-		 * If the allocation has failed, we're in massive trouble.
-		 * Disable direct injection, and pray that no VM was
-		 * already running...
-		 */
-		gic_rdists->has_rvpeid = false;
-		gic_rdists->has_vlpis = false;
-	}
+	if (smp_processor_id() == 0)
+		cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "irqchip/arm/gicv3:vpe",
+				  allocate_vpe_l1_table, NULL);
 
 	/* Make sure the GIC has seen the above */
 	dsb(sy);


-- 
Without deviation from the norm, progress is not possible.

Re: [PATCH] irqchip/gic-v3-its: Don't acquire rt_spin_lock in allocate_vpe_l1_table()

Posted by Marc Zyngier 2 weeks, 4 days ago

On Mon, 12 Jan 2026 11:20:07 +0000,
Marc Zyngier <maz@kernel.org> wrote:
> 
> On Sun, 11 Jan 2026 16:20:45 +0000,
> Thomas Gleixner <tglx@kernel.org> wrote:
> > 
> > On Sun, Jan 11 2026 at 10:38, Marc Zyngier wrote:
> > > On Sun, 11 Jan 2026 09:39:07 +0000,
> > > Thomas Gleixner <tglx@kernel.org> wrote:
> > >> 
> > >> On Fri, Jan 09 2026 at 16:13, Marc Zyngier wrote:
> > >> > On Thu, 08 Jan 2026 22:11:33 +0000,
> > >> > Thomas Gleixner <tglx@kernel.org> wrote:
> > >> >> At the point where a CPU is brought up, the topology should be known
> > >> >> already, which means this can be allocated on the control CPU _before_
> > >> >> the new CPU comes up, no?
> > >> >
> > >> > No. Each CPU finds *itself* in the forest of redistributors, and from
> > >> > there tries to find whether it has some shared resource with a CPU
> > >> > that has booted before it. That's because firmware is absolutely awful
> > >> > and can't present a consistent view of the system.
> > >> 
> > >> Groan....
> > >>
> > >> > Anyway, I expect it could be solved by moving this part of the init to
> > >> > an ONLINE HP callback.
> > >> 
> > >> Which needs to be before CPUHP_AP_IRQ_AFFINITY_ONLINE, but even that
> > >> might be to late because there are callbacks in the STARTING section,
> > >> i.e. timer, perf, which might rely on interrupts being accessible.
> > >
> > > Nah. This stuff is only for direct injection of vLPIs into guests, so
> > > as long as this is done before we can schedule a vcpu on this physical
> > > CPU, we're good. No physical interrupt is concerned with this code.
> > 
> > That's fine then. vCPUs are considered "user-space" tasks and can't be
> > scheduled before CPUHP_AP_ACTIVE sets the CPU active for the scheduler.
> 
> Waiman, can you please give the following hack a go on your box? The
> machines I have are thankfully limited to a single ITS group, so I
> can't directly reproduce your issue.

Have you managed to try this hack? I may be able to spend some time
addressing the issue in the next cycle if I have an indication that
I'm on the right track.

Thanks,

	M.

-- 
Without deviation from the norm, progress is not possible.

Re: [PATCH] irqchip/gic-v3-its: Don't acquire rt_spin_lock in allocate_vpe_l1_table()

Posted by Waiman Long 2 weeks, 4 days ago

On 1/21/26 3:38 AM, Marc Zyngier wrote:
> On Mon, 12 Jan 2026 11:20:07 +0000,
> Marc Zyngier <maz@kernel.org> wrote:
>> On Sun, 11 Jan 2026 16:20:45 +0000,
>> Thomas Gleixner <tglx@kernel.org> wrote:
>>> On Sun, Jan 11 2026 at 10:38, Marc Zyngier wrote:
>>>> On Sun, 11 Jan 2026 09:39:07 +0000,
>>>> Thomas Gleixner <tglx@kernel.org> wrote:
>>>>> On Fri, Jan 09 2026 at 16:13, Marc Zyngier wrote:
>>>>>> On Thu, 08 Jan 2026 22:11:33 +0000,
>>>>>> Thomas Gleixner <tglx@kernel.org> wrote:
>>>>>>> At the point where a CPU is brought up, the topology should be known
>>>>>>> already, which means this can be allocated on the control CPU _before_
>>>>>>> the new CPU comes up, no?
>>>>>> No. Each CPU finds *itself* in the forest of redistributors, and from
>>>>>> there tries to find whether it has some shared resource with a CPU
>>>>>> that has booted before it. That's because firmware is absolutely awful
>>>>>> and can't present a consistent view of the system.
>>>>> Groan....
>>>>>
>>>>>> Anyway, I expect it could be solved by moving this part of the init to
>>>>>> an ONLINE HP callback.
>>>>> Which needs to be before CPUHP_AP_IRQ_AFFINITY_ONLINE, but even that
>>>>> might be to late because there are callbacks in the STARTING section,
>>>>> i.e. timer, perf, which might rely on interrupts being accessible.
>>>> Nah. This stuff is only for direct injection of vLPIs into guests, so
>>>> as long as this is done before we can schedule a vcpu on this physical
>>>> CPU, we're good. No physical interrupt is concerned with this code.
>>> That's fine then. vCPUs are considered "user-space" tasks and can't be
>>> scheduled before CPUHP_AP_ACTIVE sets the CPU active for the scheduler.
>> Waiman, can you please give the following hack a go on your box? The
>> machines I have are thankfully limited to a single ITS group, so I
>> can't directly reproduce your issue.
> Have you managed to try this hack? I may be able to spend some time
> addressing the issue in the next cycle if I have an indication that
> I'm on the right track.

Yes, I have tried out your hack patch and the 2-socket Grace test system 
booted up without producing any bug report for a RT debug kernel. I will 
try out your official patch once it come out. So moving the memory 
allocation to a later part of the hotplug bringup pipeline where 
sleeping is allowed should work.

Cheers,
Longman

Re: [PATCH] irqchip/gic-v3-its: Don't acquire rt_spin_lock in allocate_vpe_l1_table()

Posted by Waiman Long 2 weeks, 3 days ago

On 1/21/26 3:41 PM, Waiman Long wrote:
>
>>> Waiman, can you please give the following hack a go on your box? The
>>> machines I have are thankfully limited to a single ITS group, so I
>>> can't directly reproduce your issue.
>> Have you managed to try this hack? I may be able to spend some time
>> addressing the issue in the next cycle if I have an indication that
>> I'm on the right track.
>
> Yes, I have tried out your hack patch and the 2-socket Grace test 
> system booted up without producing any bug report for a RT debug 
> kernel. I will try out your official patch once it come out. So moving 
> the memory allocation to a later part of the hotplug bringup pipeline 
> where sleeping is allowed should work. 

Attaching the dmesg log for your further investigation.

Cheers,
Longman

Re: [PATCH] irqchip/gic-v3-its: Don't acquire rt_spin_lock in allocate_vpe_l1_table()

Posted by Waiman Long 2 weeks, 4 days ago

On 1/21/26 3:38 AM, Marc Zyngier wrote:
> On Mon, 12 Jan 2026 11:20:07 +0000,
> Marc Zyngier <maz@kernel.org> wrote:
>> On Sun, 11 Jan 2026 16:20:45 +0000,
>> Thomas Gleixner <tglx@kernel.org> wrote:
>>> On Sun, Jan 11 2026 at 10:38, Marc Zyngier wrote:
>>>> On Sun, 11 Jan 2026 09:39:07 +0000,
>>>> Thomas Gleixner <tglx@kernel.org> wrote:
>>>>> On Fri, Jan 09 2026 at 16:13, Marc Zyngier wrote:
>>>>>> On Thu, 08 Jan 2026 22:11:33 +0000,
>>>>>> Thomas Gleixner <tglx@kernel.org> wrote:
>>>>>>> At the point where a CPU is brought up, the topology should be known
>>>>>>> already, which means this can be allocated on the control CPU _before_
>>>>>>> the new CPU comes up, no?
>>>>>> No. Each CPU finds *itself* in the forest of redistributors, and from
>>>>>> there tries to find whether it has some shared resource with a CPU
>>>>>> that has booted before it. That's because firmware is absolutely awful
>>>>>> and can't present a consistent view of the system.
>>>>> Groan....
>>>>>
>>>>>> Anyway, I expect it could be solved by moving this part of the init to
>>>>>> an ONLINE HP callback.
>>>>> Which needs to be before CPUHP_AP_IRQ_AFFINITY_ONLINE, but even that
>>>>> might be to late because there are callbacks in the STARTING section,
>>>>> i.e. timer, perf, which might rely on interrupts being accessible.
>>>> Nah. This stuff is only for direct injection of vLPIs into guests, so
>>>> as long as this is done before we can schedule a vcpu on this physical
>>>> CPU, we're good. No physical interrupt is concerned with this code.
>>> That's fine then. vCPUs are considered "user-space" tasks and can't be
>>> scheduled before CPUHP_AP_ACTIVE sets the CPU active for the scheduler.
>> Waiman, can you please give the following hack a go on your box? The
>> machines I have are thankfully limited to a single ITS group, so I
>> can't directly reproduce your issue.
> Have you managed to try this hack? I may be able to spend some time
> addressing the issue in the next cycle if I have an indication that
> I'm on the right track.

I am sorry that I was busy working on other stuff. Will try out the hack 
today and report back ASAP.

Cheers,
Longman

Re: [PATCH] irqchip/gic-v3-its: Don't acquire rt_spin_lock in allocate_vpe_l1_table()

Posted by Sebastian Andrzej Siewior 3 weeks, 6 days ago

On 2026-01-12 11:20:07 [+0000], Marc Zyngier wrote:
> On Sun, 11 Jan 2026 16:20:45 +0000,
> Thomas Gleixner <tglx@kernel.org> wrote:
> > 
> > On Sun, Jan 11 2026 at 10:38, Marc Zyngier wrote:
> > > On Sun, 11 Jan 2026 09:39:07 +0000,
> > > Thomas Gleixner <tglx@kernel.org> wrote:
> > >> 
> > >> On Fri, Jan 09 2026 at 16:13, Marc Zyngier wrote:
> > >> > On Thu, 08 Jan 2026 22:11:33 +0000,
> > >> > Thomas Gleixner <tglx@kernel.org> wrote:
> > >> >> At the point where a CPU is brought up, the topology should be known
> > >> >> already, which means this can be allocated on the control CPU _before_
> > >> >> the new CPU comes up, no?
> > >> >
> > >> > No. Each CPU finds *itself* in the forest of redistributors, and from
> > >> > there tries to find whether it has some shared resource with a CPU
> > >> > that has booted before it. That's because firmware is absolutely awful
> > >> > and can't present a consistent view of the system.
> > >> 
> > >> Groan....
> > >>
> > >> > Anyway, I expect it could be solved by moving this part of the init to
> > >> > an ONLINE HP callback.
> > >> 
> > >> Which needs to be before CPUHP_AP_IRQ_AFFINITY_ONLINE, but even that
> > >> might be to late because there are callbacks in the STARTING section,
> > >> i.e. timer, perf, which might rely on interrupts being accessible.
> > >
> > > Nah. This stuff is only for direct injection of vLPIs into guests, so
> > > as long as this is done before we can schedule a vcpu on this physical
> > > CPU, we're good. No physical interrupt is concerned with this code.
> > 
> > That's fine then. vCPUs are considered "user-space" tasks and can't be
> > scheduled before CPUHP_AP_ACTIVE sets the CPU active for the scheduler.
> 
> Waiman, can you please give the following hack a go on your box? The
> machines I have are thankfully limited to a single ITS group, so I
> can't directly reproduce your issue.
> 
> Thanks,
> 
> 	M.
> 
> diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
> index ada585bfa4517..20967000f2348 100644
> --- a/drivers/irqchip/irq-gic-v3-its.c
> +++ b/drivers/irqchip/irq-gic-v3-its.c
> @@ -2896,7 +2896,7 @@ static bool allocate_vpe_l2_table(int cpu, u32 id)
>  	return true;
>  }
>  
> -static int allocate_vpe_l1_table(void)
> +static int allocate_vpe_l1_table(unsigned int cpu)
>  {
>  	void __iomem *vlpi_base = gic_data_rdist_vlpi_base();
>  	u64 val, gpsz, npg, pa;
> @@ -3012,10 +3012,11 @@ static int allocate_vpe_l1_table(void)
>  
>  out:
>  	gicr_write_vpropbaser(val, vlpi_base + GICR_VPROPBASER);
> -	cpumask_set_cpu(smp_processor_id(), gic_data_rdist()->vpe_table_mask);
> +	cpumask_set_cpu(cpu, gic_data_rdist()->vpe_table_mask);
> +	dsb(sy);
>  
>  	pr_debug("CPU%d: VPROPBASER = %llx %*pbl\n",
> -		 smp_processor_id(), val,
> +		 cpu, val,
>  		 cpumask_pr_args(gic_data_rdist()->vpe_table_mask));
>  
>  	return 0;
> @@ -3264,15 +3265,9 @@ static void its_cpu_init_lpis(void)
>  		val = its_clear_vpend_valid(vlpi_base, 0, 0);
>  	}
>  
> -	if (allocate_vpe_l1_table()) {
> -		/*
> -		 * If the allocation has failed, we're in massive trouble.
> -		 * Disable direct injection, and pray that no VM was
> -		 * already running...
> -		 */
> -		gic_rdists->has_rvpeid = false;
> -		gic_rdists->has_vlpis = false;
> -	}
> +	if (smp_processor_id() == 0)
> +		cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "irqchip/arm/gicv3:vpe",
> +				  allocate_vpe_l1_table, NULL);

If you move it the online state then you could also
s/GFP_ATOMIC/GFP_KERNEL.

Also previously you checked the error code set has_rvpeid, has_vlpis on
failure. Now you you should the same in case of a failure during
registration.
This also happens happens on CPU hotplug and I don't see how you avoid a
second allocation. But I also don't understand why this registrations
happens on CPU0. It might be just a test patch…

>  
>  	/* Make sure the GIC has seen the above */
>  	dsb(sy);

Sebastian

Re: [PATCH] irqchip/gic-v3-its: Don't acquire rt_spin_lock in allocate_vpe_l1_table()

Posted by Marc Zyngier 3 weeks, 6 days ago

On Mon, 12 Jan 2026 14:08:37 +0000,
Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:
> 
> On 2026-01-12 11:20:07 [+0000], Marc Zyngier wrote:
> > On Sun, 11 Jan 2026 16:20:45 +0000,
> > Thomas Gleixner <tglx@kernel.org> wrote:
> > > 
> > > On Sun, Jan 11 2026 at 10:38, Marc Zyngier wrote:
> > > > On Sun, 11 Jan 2026 09:39:07 +0000,
> > > > Thomas Gleixner <tglx@kernel.org> wrote:
> > > >> 
> > > >> On Fri, Jan 09 2026 at 16:13, Marc Zyngier wrote:
> > > >> > On Thu, 08 Jan 2026 22:11:33 +0000,
> > > >> > Thomas Gleixner <tglx@kernel.org> wrote:
> > > >> >> At the point where a CPU is brought up, the topology should be known
> > > >> >> already, which means this can be allocated on the control CPU _before_
> > > >> >> the new CPU comes up, no?
> > > >> >
> > > >> > No. Each CPU finds *itself* in the forest of redistributors, and from
> > > >> > there tries to find whether it has some shared resource with a CPU
> > > >> > that has booted before it. That's because firmware is absolutely awful
> > > >> > and can't present a consistent view of the system.
> > > >> 
> > > >> Groan....
> > > >>
> > > >> > Anyway, I expect it could be solved by moving this part of the init to
> > > >> > an ONLINE HP callback.
> > > >> 
> > > >> Which needs to be before CPUHP_AP_IRQ_AFFINITY_ONLINE, but even that
> > > >> might be to late because there are callbacks in the STARTING section,
> > > >> i.e. timer, perf, which might rely on interrupts being accessible.
> > > >
> > > > Nah. This stuff is only for direct injection of vLPIs into guests, so
> > > > as long as this is done before we can schedule a vcpu on this physical
> > > > CPU, we're good. No physical interrupt is concerned with this code.
> > > 
> > > That's fine then. vCPUs are considered "user-space" tasks and can't be
> > > scheduled before CPUHP_AP_ACTIVE sets the CPU active for the scheduler.
> > 
> > Waiman, can you please give the following hack a go on your box? The
> > machines I have are thankfully limited to a single ITS group, so I
> > can't directly reproduce your issue.
> > 
> > Thanks,
> > 
> > 	M.
> > 
> > diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
> > index ada585bfa4517..20967000f2348 100644
> > --- a/drivers/irqchip/irq-gic-v3-its.c
> > +++ b/drivers/irqchip/irq-gic-v3-its.c
> > @@ -2896,7 +2896,7 @@ static bool allocate_vpe_l2_table(int cpu, u32 id)
> >  	return true;
> >  }
> >  
> > -static int allocate_vpe_l1_table(void)
> > +static int allocate_vpe_l1_table(unsigned int cpu)
> >  {
> >  	void __iomem *vlpi_base = gic_data_rdist_vlpi_base();
> >  	u64 val, gpsz, npg, pa;
> > @@ -3012,10 +3012,11 @@ static int allocate_vpe_l1_table(void)
> >  
> >  out:
> >  	gicr_write_vpropbaser(val, vlpi_base + GICR_VPROPBASER);
> > -	cpumask_set_cpu(smp_processor_id(), gic_data_rdist()->vpe_table_mask);
> > +	cpumask_set_cpu(cpu, gic_data_rdist()->vpe_table_mask);
> > +	dsb(sy);
> >  
> >  	pr_debug("CPU%d: VPROPBASER = %llx %*pbl\n",
> > -		 smp_processor_id(), val,
> > +		 cpu, val,
> >  		 cpumask_pr_args(gic_data_rdist()->vpe_table_mask));
> >  
> >  	return 0;
> > @@ -3264,15 +3265,9 @@ static void its_cpu_init_lpis(void)
> >  		val = its_clear_vpend_valid(vlpi_base, 0, 0);
> >  	}
> >  
> > -	if (allocate_vpe_l1_table()) {
> > -		/*
> > -		 * If the allocation has failed, we're in massive trouble.
> > -		 * Disable direct injection, and pray that no VM was
> > -		 * already running...
> > -		 */
> > -		gic_rdists->has_rvpeid = false;
> > -		gic_rdists->has_vlpis = false;
> > -	}
> > +	if (smp_processor_id() == 0)
> > +		cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "irqchip/arm/gicv3:vpe",
> > +				  allocate_vpe_l1_table, NULL);
> 
> If you move it the online state then you could also
> s/GFP_ATOMIC/GFP_KERNEL.
> 
> Also previously you checked the error code set has_rvpeid, has_vlpis on
> failure. Now you you should the same in case of a failure during
> registration.
> This also happens happens on CPU hotplug and I don't see how you avoid a
> second allocation. But I also don't understand why this registrations
> happens on CPU0. It might be just a test patch…

It's just a test hack. There is way more things that would need to
change in order to cope with moving this to CPUHP, but I want
confirmation that this indeed solves the original issue before I start
breaking more things.

Thanks,

	M.

-- 
Without deviation from the norm, progress is not possible.