[v1] irqchip/gic-v3-its: Fix invalid wait context lockdep report

[PATCH] irqchip/gic-v3-its: Fix invalid wait context lockdep report

Posted by Koichiro Den 1 month, 1 week ago

its_irq_set_vcpu_affinity() always runs under a raw_spin_lock wait
context, so calling kcalloc there is not permitted and RT-unsafe since
___slab_alloc() may acquire a local lock. The below is the actual
lockdep report observed:

  =============================
  [ BUG: Invalid wait context ]
  6.16.0-rc3-irqchip-next-7e28bba92c5c+ #1 Tainted: G S
  -----------------------------
  qemu-system-aar/2129 is trying to lock:
  ffff0085b74f2178 (batched_entropy_u32.lock){..-.}-{3:3}, at: get_random_u32+0x9c/0x708
  other info that might help us debug this:
  context-{5:5}
  6 locks held by qemu-system-aar/2129:
   #0: ffff0000b84a0738 (&vdev->igate){+.+.}-{4:4}, at: vfio_pci_core_ioctl+0x40c/0x748 [vfio_pci_core]
   #1: ffff8000883cef68 (lock#6){+.+.}-{4:4}, at: irq_bypass_register_producer+0x64/0x2f0
   #2: ffff0000ac0df960 (&its->its_lock){+.+.}-{4:4}, at: kvm_vgic_v4_set_forwarding+0x224/0x6f0
   #3: ffff000086dc4718 (&irq->irq_lock#3){....}-{2:2}, at: kvm_vgic_v4_set_forwarding+0x288/0x6f0
   #4: ffff0001356200c8 (&irq_desc_lock_class){-.-.}-{2:2}, at: __irq_get_desc_lock+0xc8/0x158
   #5: ffff00009eae4850 (&dev->event_map.vlpi_lock){....}-{2:2}, at: its_irq_set_vcpu_affinity+0x8c/0x528
  ...
  Call trace:
   show_stack+0x30/0x98 (C)
   dump_stack_lvl+0x9c/0xd0
   dump_stack+0x1c/0x34
   __lock_acquire+0x814/0xb40
   lock_acquire.part.0+0x16c/0x2a8
   lock_acquire+0x8c/0x178
   get_random_u32+0xd4/0x708
   __get_random_u32_below+0x20/0x80
   shuffle_freelist+0x5c/0x1b0
   allocate_slab+0x15c/0x348
   new_slab+0x48/0x80
   ___slab_alloc+0x590/0x8b8
   __slab_alloc.isra.0+0x3c/0x80
   __kmalloc_noprof+0x174/0x520
   its_vlpi_map+0x834/0xce0
   its_irq_set_vcpu_affinity+0x21c/0x528
   irq_set_vcpu_affinity+0x160/0x1b0
   its_map_vlpi+0x90/0x100
   kvm_vgic_v4_set_forwarding+0x3c4/0x6f0
   kvm_arch_irq_bypass_add_producer+0xac/0x108
   __connect+0x138/0x1b0
   irq_bypass_register_producer+0x16c/0x2f0
   vfio_msi_set_vector_signal+0x2c0/0x5a8 [vfio_pci_core]
   vfio_msi_set_block+0x8c/0x120 [vfio_pci_core]
   vfio_pci_set_msi_trigger+0x120/0x3d8 [vfio_pci_core]
  ...

To avoid this, simply pre-allocate vlpi_maps when creating an ITS v4
device with LPIs allcation. The trade-off is some wasted memory
depending on nr_lpis, if none of those LPIs are never upgraded to VLPIs.

An alternative would be to move the vlpi_maps allocation out of
its_map_vlpi() and introduce a two-stage prepare/commit flow, allowing a
caller (KVM in the lockdep splat shown above) to do the allocation
outside irq_set_vcpu_affinity(). However, this would unnecessarily add
complexity.

Fixes: d011e4e654d7 ("irqchip/gic-v3-its: Add VLPI map/unmap operations")
Signed-off-by: Koichiro Den <den@valinux.co.jp>
---
 drivers/irqchip/irq-gic-v3-its.c | 36 ++++++++++++++++++--------------
 1 file changed, 20 insertions(+), 16 deletions(-)

diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 467cb78435a9..b933be8ddc51 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -1923,19 +1923,10 @@ static int its_vlpi_map(struct irq_data *d, struct its_cmd_info *info)
 	if (!info->map)
 		return -EINVAL;
 
-	if (!its_dev->event_map.vm) {
-		struct its_vlpi_map *maps;
-
-		maps = kcalloc(its_dev->event_map.nr_lpis, sizeof(*maps),
-			       GFP_ATOMIC);
-		if (!maps)
-			return -ENOMEM;
-
+	if (!its_dev->event_map.vm)
 		its_dev->event_map.vm = info->map->vm;
-		its_dev->event_map.vlpi_maps = maps;
-	} else if (its_dev->event_map.vm != info->map->vm) {
+	else if (its_dev->event_map.vm != info->map->vm)
 		return -EINVAL;
-	}
 
 	/* Get our private copy of the mapping information */
 	its_dev->event_map.vlpi_maps[event] = *info->map;
@@ -2010,10 +2001,8 @@ static int its_vlpi_unmap(struct irq_data *d)
 	 * Drop the refcount and make the device available again if
 	 * this was the last VLPI.
 	 */
-	if (!--its_dev->event_map.nr_vlpis) {
+	if (!--its_dev->event_map.nr_vlpis)
 		its_dev->event_map.vm = NULL;
-		kfree(its_dev->event_map.vlpi_maps);
-	}
 
 	return 0;
 }
@@ -3469,6 +3458,7 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
 {
 	struct its_device *dev;
 	unsigned long *lpi_map = NULL;
+	struct its_vlpi_map *vlpi_maps;
 	unsigned long flags;
 	u16 *col_map = NULL;
 	void *itt;
@@ -3497,16 +3487,28 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
 
 	if (alloc_lpis) {
 		lpi_map = its_lpi_alloc(nvecs, &lpi_base, &nr_lpis);
-		if (lpi_map)
+		if (lpi_map) {
 			col_map = kcalloc(nr_lpis, sizeof(*col_map),
 					  GFP_KERNEL);
+
+			/*
+			 * Pre-allocate vlpi_maps to avoid slab allocation
+			 * under the strict raw spinlock wait context of
+			 * irq_set_vcpu_affinity. This could waste memory
+			 * if no vlpi map is ever created.
+			 */
+			if (is_v4(its) && nr_lpis > 0)
+				vlpi_maps = kcalloc(nr_lpis, sizeof(*vlpi_maps),
+						    GFP_KERNEL);
+		}
 	} else {
 		col_map = kcalloc(nr_ites, sizeof(*col_map), GFP_KERNEL);
 		nr_lpis = 0;
 		lpi_base = 0;
 	}
 
-	if (!dev || !itt || !col_map || (!lpi_map && alloc_lpis)) {
+	if (!dev || !itt || !col_map ||
+	    (alloc_lpis && (!lpi_map || (is_v4(its) && !vlpi_maps)))) {
 		kfree(dev);
 		itt_free_pool(itt, sz);
 		bitmap_free(lpi_map);
@@ -3524,6 +3526,7 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
 	dev->event_map.col_map = col_map;
 	dev->event_map.lpi_base = lpi_base;
 	dev->event_map.nr_lpis = nr_lpis;
+	dev->event_map.vlpi_maps = vlpi_maps;
 	raw_spin_lock_init(&dev->event_map.vlpi_lock);
 	dev->device_id = dev_id;
 	INIT_LIST_HEAD(&dev->entry);
@@ -3546,6 +3549,7 @@ static void its_free_device(struct its_device *its_dev)
 	list_del(&its_dev->entry);
 	raw_spin_unlock_irqrestore(&its_dev->its->lock, flags);
 	kfree(its_dev->event_map.col_map);
+	kfree(its_dev->event_map.vlpi_maps);
 	itt_free_pool(its_dev->itt, its_dev->itt_sz);
 	kfree(its_dev);
 }
-- 
2.48.1

Re: [PATCH] irqchip/gic-v3-its: Fix invalid wait context lockdep report

Posted by Marc Zyngier 1 month, 1 week ago

On Wed, 27 Aug 2025 08:38:48 +0100,
Koichiro Den <den@valinux.co.jp> wrote:
> 
> its_irq_set_vcpu_affinity() always runs under a raw_spin_lock wait
> context, so calling kcalloc there is not permitted and RT-unsafe since
> ___slab_alloc() may acquire a local lock. The below is the actual
> lockdep report observed:
> 
>   =============================
>   [ BUG: Invalid wait context ]
>   6.16.0-rc3-irqchip-next-7e28bba92c5c+ #1 Tainted: G S
>   -----------------------------
>   qemu-system-aar/2129 is trying to lock:
>   ffff0085b74f2178 (batched_entropy_u32.lock){..-.}-{3:3}, at: get_random_u32+0x9c/0x708
>   other info that might help us debug this:
>   context-{5:5}
>   6 locks held by qemu-system-aar/2129:
>    #0: ffff0000b84a0738 (&vdev->igate){+.+.}-{4:4}, at: vfio_pci_core_ioctl+0x40c/0x748 [vfio_pci_core]
>    #1: ffff8000883cef68 (lock#6){+.+.}-{4:4}, at: irq_bypass_register_producer+0x64/0x2f0
>    #2: ffff0000ac0df960 (&its->its_lock){+.+.}-{4:4}, at: kvm_vgic_v4_set_forwarding+0x224/0x6f0
>    #3: ffff000086dc4718 (&irq->irq_lock#3){....}-{2:2}, at: kvm_vgic_v4_set_forwarding+0x288/0x6f0
>    #4: ffff0001356200c8 (&irq_desc_lock_class){-.-.}-{2:2}, at: __irq_get_desc_lock+0xc8/0x158
>    #5: ffff00009eae4850 (&dev->event_map.vlpi_lock){....}-{2:2}, at: its_irq_set_vcpu_affinity+0x8c/0x528
>   ...
>   Call trace:
>    show_stack+0x30/0x98 (C)
>    dump_stack_lvl+0x9c/0xd0
>    dump_stack+0x1c/0x34
>    __lock_acquire+0x814/0xb40
>    lock_acquire.part.0+0x16c/0x2a8
>    lock_acquire+0x8c/0x178
>    get_random_u32+0xd4/0x708
>    __get_random_u32_below+0x20/0x80
>    shuffle_freelist+0x5c/0x1b0
>    allocate_slab+0x15c/0x348
>    new_slab+0x48/0x80
>    ___slab_alloc+0x590/0x8b8
>    __slab_alloc.isra.0+0x3c/0x80
>    __kmalloc_noprof+0x174/0x520
>    its_vlpi_map+0x834/0xce0
>    its_irq_set_vcpu_affinity+0x21c/0x528
>    irq_set_vcpu_affinity+0x160/0x1b0
>    its_map_vlpi+0x90/0x100
>    kvm_vgic_v4_set_forwarding+0x3c4/0x6f0
>    kvm_arch_irq_bypass_add_producer+0xac/0x108
>    __connect+0x138/0x1b0
>    irq_bypass_register_producer+0x16c/0x2f0
>    vfio_msi_set_vector_signal+0x2c0/0x5a8 [vfio_pci_core]
>    vfio_msi_set_block+0x8c/0x120 [vfio_pci_core]
>    vfio_pci_set_msi_trigger+0x120/0x3d8 [vfio_pci_core]

Huh. I guess this is due to RT not being completely compatible with
GFP_ATOMIC...  Why you'd want RT and KVM at the same time is beyond
me, but hey.

>   ...
> 
> To avoid this, simply pre-allocate vlpi_maps when creating an ITS v4
> device with LPIs allcation. The trade-off is some wasted memory
> depending on nr_lpis, if none of those LPIs are never upgraded to VLPIs.
>
> An alternative would be to move the vlpi_maps allocation out of
> its_map_vlpi() and introduce a two-stage prepare/commit flow, allowing a
> caller (KVM in the lockdep splat shown above) to do the allocation
> outside irq_set_vcpu_affinity(). However, this would unnecessarily add
> complexity.

That's debatable. It is probably fine for now, but if this was to
grow, we'd need to revisit this.

> Fixes: d011e4e654d7 ("irqchip/gic-v3-its: Add VLPI map/unmap operations")

No. This code predates RT being merged, and this problem cannot occur
before RT.

> Signed-off-by: Koichiro Den <den@valinux.co.jp>
> ---
>  drivers/irqchip/irq-gic-v3-its.c | 36 ++++++++++++++++++--------------
>  1 file changed, 20 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
> index 467cb78435a9..b933be8ddc51 100644
> --- a/drivers/irqchip/irq-gic-v3-its.c
> +++ b/drivers/irqchip/irq-gic-v3-its.c
> @@ -1923,19 +1923,10 @@ static int its_vlpi_map(struct irq_data *d, struct its_cmd_info *info)
>  	if (!info->map)
>  		return -EINVAL;
>  
> -	if (!its_dev->event_map.vm) {
> -		struct its_vlpi_map *maps;
> -
> -		maps = kcalloc(its_dev->event_map.nr_lpis, sizeof(*maps),
> -			       GFP_ATOMIC);
> -		if (!maps)
> -			return -ENOMEM;
> -
> +	if (!its_dev->event_map.vm)
>  		its_dev->event_map.vm = info->map->vm;
> -		its_dev->event_map.vlpi_maps = maps;
> -	} else if (its_dev->event_map.vm != info->map->vm) {
> +	else if (its_dev->event_map.vm != info->map->vm)
>  		return -EINVAL;
> -	}
>  
>  	/* Get our private copy of the mapping information */
>  	its_dev->event_map.vlpi_maps[event] = *info->map;
> @@ -2010,10 +2001,8 @@ static int its_vlpi_unmap(struct irq_data *d)
>  	 * Drop the refcount and make the device available again if
>  	 * this was the last VLPI.
>  	 */
> -	if (!--its_dev->event_map.nr_vlpis) {
> +	if (!--its_dev->event_map.nr_vlpis)
>  		its_dev->event_map.vm = NULL;
> -		kfree(its_dev->event_map.vlpi_maps);
> -	}
>  
>  	return 0;
>  }
> @@ -3469,6 +3458,7 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
>  {
>  	struct its_device *dev;
>  	unsigned long *lpi_map = NULL;
> +	struct its_vlpi_map *vlpi_maps;
>  	unsigned long flags;
>  	u16 *col_map = NULL;
>  	void *itt;
> @@ -3497,16 +3487,28 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
>  
>  	if (alloc_lpis) {
>  		lpi_map = its_lpi_alloc(nvecs, &lpi_base, &nr_lpis);
> -		if (lpi_map)
> +		if (lpi_map) {
>  			col_map = kcalloc(nr_lpis, sizeof(*col_map),
>  					  GFP_KERNEL);
> +
> +			/*
> +			 * Pre-allocate vlpi_maps to avoid slab allocation
> +			 * under the strict raw spinlock wait context of
> +			 * irq_set_vcpu_affinity. This could waste memory
> +			 * if no vlpi map is ever created.
> +			 */
> +			if (is_v4(its) && nr_lpis > 0)
> +				vlpi_maps = kcalloc(nr_lpis, sizeof(*vlpi_maps),
> +						    GFP_KERNEL);
> +		}
>  	} else {
>  		col_map = kcalloc(nr_ites, sizeof(*col_map), GFP_KERNEL);
>  		nr_lpis = 0;
>  		lpi_base = 0;
>  	}
>  
> -	if (!dev || !itt || !col_map || (!lpi_map && alloc_lpis)) {
> +	if (!dev || !itt || !col_map ||
> +	    (alloc_lpis && (!lpi_map || (is_v4(its) && !vlpi_maps)))) {

This needs to be collapsed into a single boolean evaluated with the
pointer being NULL.

>  		kfree(dev);
>  		itt_free_pool(itt, sz);
>  		bitmap_free(lpi_map);

Where are you freeing vlpi_maps if on the failure path??

Thanks,

	M.

-- 
Without deviation from the norm, progress is not possible.

Re: [PATCH] irqchip/gic-v3-its: Fix invalid wait context lockdep report

Posted by Koichiro Den 1 month ago

On Wed, Aug 27, 2025 at 01:48:33PM +0100, Marc Zyngier wrote:
> On Wed, 27 Aug 2025 08:38:48 +0100,
> Koichiro Den <den@valinux.co.jp> wrote:
> > 
> > its_irq_set_vcpu_affinity() always runs under a raw_spin_lock wait
> > context, so calling kcalloc there is not permitted and RT-unsafe since
> > ___slab_alloc() may acquire a local lock. The below is the actual
> > lockdep report observed:
> > 
> >   =============================
> >   [ BUG: Invalid wait context ]
> >   6.16.0-rc3-irqchip-next-7e28bba92c5c+ #1 Tainted: G S
> >   -----------------------------
> >   qemu-system-aar/2129 is trying to lock:
> >   ffff0085b74f2178 (batched_entropy_u32.lock){..-.}-{3:3}, at: get_random_u32+0x9c/0x708
> >   other info that might help us debug this:
> >   context-{5:5}
> >   6 locks held by qemu-system-aar/2129:
> >    #0: ffff0000b84a0738 (&vdev->igate){+.+.}-{4:4}, at: vfio_pci_core_ioctl+0x40c/0x748 [vfio_pci_core]
> >    #1: ffff8000883cef68 (lock#6){+.+.}-{4:4}, at: irq_bypass_register_producer+0x64/0x2f0
> >    #2: ffff0000ac0df960 (&its->its_lock){+.+.}-{4:4}, at: kvm_vgic_v4_set_forwarding+0x224/0x6f0
> >    #3: ffff000086dc4718 (&irq->irq_lock#3){....}-{2:2}, at: kvm_vgic_v4_set_forwarding+0x288/0x6f0
> >    #4: ffff0001356200c8 (&irq_desc_lock_class){-.-.}-{2:2}, at: __irq_get_desc_lock+0xc8/0x158
> >    #5: ffff00009eae4850 (&dev->event_map.vlpi_lock){....}-{2:2}, at: its_irq_set_vcpu_affinity+0x8c/0x528
> >   ...
> >   Call trace:
> >    show_stack+0x30/0x98 (C)
> >    dump_stack_lvl+0x9c/0xd0
> >    dump_stack+0x1c/0x34
> >    __lock_acquire+0x814/0xb40
> >    lock_acquire.part.0+0x16c/0x2a8
> >    lock_acquire+0x8c/0x178
> >    get_random_u32+0xd4/0x708
> >    __get_random_u32_below+0x20/0x80
> >    shuffle_freelist+0x5c/0x1b0
> >    allocate_slab+0x15c/0x348
> >    new_slab+0x48/0x80
> >    ___slab_alloc+0x590/0x8b8
> >    __slab_alloc.isra.0+0x3c/0x80
> >    __kmalloc_noprof+0x174/0x520
> >    its_vlpi_map+0x834/0xce0
> >    its_irq_set_vcpu_affinity+0x21c/0x528
> >    irq_set_vcpu_affinity+0x160/0x1b0
> >    its_map_vlpi+0x90/0x100
> >    kvm_vgic_v4_set_forwarding+0x3c4/0x6f0
> >    kvm_arch_irq_bypass_add_producer+0xac/0x108
> >    __connect+0x138/0x1b0
> >    irq_bypass_register_producer+0x16c/0x2f0
> >    vfio_msi_set_vector_signal+0x2c0/0x5a8 [vfio_pci_core]
> >    vfio_msi_set_block+0x8c/0x120 [vfio_pci_core]
> >    vfio_pci_set_msi_trigger+0x120/0x3d8 [vfio_pci_core]
> 
> Huh. I guess this is due to RT not being completely compatible with
> GFP_ATOMIC...  Why you'd want RT and KVM at the same time is beyond
> me, but hey.

For the record, I didn't run KVM on RT, though I still believe it's better
to conform to the wait context rule and avoid triggering the lockdep splat.

I don't know if there are any plans which make kmalloc with GFP_ATOMIC
workable under a stricter wait context (getting rid of the local lock
in some way?), but I think it would be nicer.

> 
> >   ...
> > 
> > To avoid this, simply pre-allocate vlpi_maps when creating an ITS v4
> > device with LPIs allcation. The trade-off is some wasted memory
> > depending on nr_lpis, if none of those LPIs are never upgraded to VLPIs.
> >
> > An alternative would be to move the vlpi_maps allocation out of
> > its_map_vlpi() and introduce a two-stage prepare/commit flow, allowing a
> > caller (KVM in the lockdep splat shown above) to do the allocation
> > outside irq_set_vcpu_affinity(). However, this would unnecessarily add
> > complexity.
> 
> That's debatable. It is probably fine for now, but if this was to
> grow, we'd need to revisit this.

Just curious but do you have any plans to replace the current
irq_set_vcpu_affinity() approach with something else?

> 
> > Fixes: d011e4e654d7 ("irqchip/gic-v3-its: Add VLPI map/unmap operations")
> 
> No. This code predates RT being merged, and this problem cannot occur
> before RT.

I'll drop this in v2.

> 
> > Signed-off-by: Koichiro Den <den@valinux.co.jp>
> > ---
> >  drivers/irqchip/irq-gic-v3-its.c | 36 ++++++++++++++++++--------------
> >  1 file changed, 20 insertions(+), 16 deletions(-)
> > 
> > diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
> > index 467cb78435a9..b933be8ddc51 100644
> > --- a/drivers/irqchip/irq-gic-v3-its.c
> > +++ b/drivers/irqchip/irq-gic-v3-its.c
> > @@ -1923,19 +1923,10 @@ static int its_vlpi_map(struct irq_data *d, struct its_cmd_info *info)
> >  	if (!info->map)
> >  		return -EINVAL;
> >  
> > -	if (!its_dev->event_map.vm) {
> > -		struct its_vlpi_map *maps;
> > -
> > -		maps = kcalloc(its_dev->event_map.nr_lpis, sizeof(*maps),
> > -			       GFP_ATOMIC);
> > -		if (!maps)
> > -			return -ENOMEM;
> > -
> > +	if (!its_dev->event_map.vm)
> >  		its_dev->event_map.vm = info->map->vm;
> > -		its_dev->event_map.vlpi_maps = maps;
> > -	} else if (its_dev->event_map.vm != info->map->vm) {
> > +	else if (its_dev->event_map.vm != info->map->vm)
> >  		return -EINVAL;
> > -	}
> >  
> >  	/* Get our private copy of the mapping information */
> >  	its_dev->event_map.vlpi_maps[event] = *info->map;
> > @@ -2010,10 +2001,8 @@ static int its_vlpi_unmap(struct irq_data *d)
> >  	 * Drop the refcount and make the device available again if
> >  	 * this was the last VLPI.
> >  	 */
> > -	if (!--its_dev->event_map.nr_vlpis) {
> > +	if (!--its_dev->event_map.nr_vlpis)
> >  		its_dev->event_map.vm = NULL;
> > -		kfree(its_dev->event_map.vlpi_maps);
> > -	}
> >  
> >  	return 0;
> >  }
> > @@ -3469,6 +3458,7 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
> >  {
> >  	struct its_device *dev;
> >  	unsigned long *lpi_map = NULL;
> > +	struct its_vlpi_map *vlpi_maps;
> >  	unsigned long flags;
> >  	u16 *col_map = NULL;
> >  	void *itt;
> > @@ -3497,16 +3487,28 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
> >  
> >  	if (alloc_lpis) {
> >  		lpi_map = its_lpi_alloc(nvecs, &lpi_base, &nr_lpis);
> > -		if (lpi_map)
> > +		if (lpi_map) {
> >  			col_map = kcalloc(nr_lpis, sizeof(*col_map),
> >  					  GFP_KERNEL);
> > +
> > +			/*
> > +			 * Pre-allocate vlpi_maps to avoid slab allocation
> > +			 * under the strict raw spinlock wait context of
> > +			 * irq_set_vcpu_affinity. This could waste memory
> > +			 * if no vlpi map is ever created.
> > +			 */
> > +			if (is_v4(its) && nr_lpis > 0)
> > +				vlpi_maps = kcalloc(nr_lpis, sizeof(*vlpi_maps),
> > +						    GFP_KERNEL);
> > +		}
> >  	} else {
> >  		col_map = kcalloc(nr_ites, sizeof(*col_map), GFP_KERNEL);
> >  		nr_lpis = 0;
> >  		lpi_base = 0;
> >  	}
> >  
> > -	if (!dev || !itt || !col_map || (!lpi_map && alloc_lpis)) {
> > +	if (!dev || !itt || !col_map ||
> > +	    (alloc_lpis && (!lpi_map || (is_v4(its) && !vlpi_maps)))) {
> 
> This needs to be collapsed into a single boolean evaluated with the
> pointer being NULL.

Right, I'll add and use something like:

  bool prealloc_vlpis_maps = alloc_lpis && is_v4(its);

If that's not the intended direction, please let me know.
BTW, I noticed I forgot to initialize vlpi_maps. I'll fix that as well.

> 
> >  		kfree(dev);
> >  		itt_free_pool(itt, sz);
> >  		bitmap_free(lpi_map);
> 
> Where are you freeing vlpi_maps if on the failure path??

Thanks for catching this, I'll fix this in v2.

Thanks for the review!

-Koichiro

> 
> Thanks,
> 
> 	M.
> 
> -- 
> Without deviation from the norm, progress is not possible.

Re: [PATCH] irqchip/gic-v3-its: Fix invalid wait context lockdep report

Posted by Marc Zyngier 1 month ago

On Thu, 28 Aug 2025 04:09:00 +0100,
Koichiro Den <den@valinux.co.jp> wrote:
> 
> On Wed, Aug 27, 2025 at 01:48:33PM +0100, Marc Zyngier wrote:
> > On Wed, 27 Aug 2025 08:38:48 +0100,
> > Koichiro Den <den@valinux.co.jp> wrote:
> > > 
> > > its_irq_set_vcpu_affinity() always runs under a raw_spin_lock wait
> > > context, so calling kcalloc there is not permitted and RT-unsafe since
> > > ___slab_alloc() may acquire a local lock. The below is the actual
> > > lockdep report observed:
> > > 
> > >   =============================
> > >   [ BUG: Invalid wait context ]
> > >   6.16.0-rc3-irqchip-next-7e28bba92c5c+ #1 Tainted: G S
> > >   -----------------------------
> > >   qemu-system-aar/2129 is trying to lock:
> > >   ffff0085b74f2178 (batched_entropy_u32.lock){..-.}-{3:3}, at: get_random_u32+0x9c/0x708
> > >   other info that might help us debug this:
> > >   context-{5:5}
> > >   6 locks held by qemu-system-aar/2129:
> > >    #0: ffff0000b84a0738 (&vdev->igate){+.+.}-{4:4}, at: vfio_pci_core_ioctl+0x40c/0x748 [vfio_pci_core]
> > >    #1: ffff8000883cef68 (lock#6){+.+.}-{4:4}, at: irq_bypass_register_producer+0x64/0x2f0
> > >    #2: ffff0000ac0df960 (&its->its_lock){+.+.}-{4:4}, at: kvm_vgic_v4_set_forwarding+0x224/0x6f0
> > >    #3: ffff000086dc4718 (&irq->irq_lock#3){....}-{2:2}, at: kvm_vgic_v4_set_forwarding+0x288/0x6f0
> > >    #4: ffff0001356200c8 (&irq_desc_lock_class){-.-.}-{2:2}, at: __irq_get_desc_lock+0xc8/0x158
> > >    #5: ffff00009eae4850 (&dev->event_map.vlpi_lock){....}-{2:2}, at: its_irq_set_vcpu_affinity+0x8c/0x528
> > >   ...
> > >   Call trace:
> > >    show_stack+0x30/0x98 (C)
> > >    dump_stack_lvl+0x9c/0xd0
> > >    dump_stack+0x1c/0x34
> > >    __lock_acquire+0x814/0xb40
> > >    lock_acquire.part.0+0x16c/0x2a8
> > >    lock_acquire+0x8c/0x178
> > >    get_random_u32+0xd4/0x708
> > >    __get_random_u32_below+0x20/0x80
> > >    shuffle_freelist+0x5c/0x1b0
> > >    allocate_slab+0x15c/0x348
> > >    new_slab+0x48/0x80
> > >    ___slab_alloc+0x590/0x8b8
> > >    __slab_alloc.isra.0+0x3c/0x80
> > >    __kmalloc_noprof+0x174/0x520
> > >    its_vlpi_map+0x834/0xce0
> > >    its_irq_set_vcpu_affinity+0x21c/0x528
> > >    irq_set_vcpu_affinity+0x160/0x1b0
> > >    its_map_vlpi+0x90/0x100
> > >    kvm_vgic_v4_set_forwarding+0x3c4/0x6f0
> > >    kvm_arch_irq_bypass_add_producer+0xac/0x108
> > >    __connect+0x138/0x1b0
> > >    irq_bypass_register_producer+0x16c/0x2f0
> > >    vfio_msi_set_vector_signal+0x2c0/0x5a8 [vfio_pci_core]
> > >    vfio_msi_set_block+0x8c/0x120 [vfio_pci_core]
> > >    vfio_pci_set_msi_trigger+0x120/0x3d8 [vfio_pci_core]
> > 
> > Huh. I guess this is due to RT not being completely compatible with
> > GFP_ATOMIC...  Why you'd want RT and KVM at the same time is beyond
> > me, but hey.
> 
> For the record, I didn't run KVM on RT, though I still believe it's better
> to conform to the wait context rule and avoid triggering the lockdep
> splat.

Then I don't understand how you get this, because I have not seen it
so far.

> 
> I don't know if there are any plans which make kmalloc with GFP_ATOMIC
> workable under a stricter wait context (getting rid of the local lock
> in some way?), but I think it would be nicer.

GFP_ATOMIC is documented as being compatible with raw spinlocks in the
absence of RT, making the above trace pretty odd.

> 
> > 
> > >   ...
> > > 
> > > To avoid this, simply pre-allocate vlpi_maps when creating an ITS v4
> > > device with LPIs allcation. The trade-off is some wasted memory
> > > depending on nr_lpis, if none of those LPIs are never upgraded to VLPIs.
> > >
> > > An alternative would be to move the vlpi_maps allocation out of
> > > its_map_vlpi() and introduce a two-stage prepare/commit flow, allowing a
> > > caller (KVM in the lockdep splat shown above) to do the allocation
> > > outside irq_set_vcpu_affinity(). However, this would unnecessarily add
> > > complexity.
> > 
> > That's debatable. It is probably fine for now, but if this was to
> > grow, we'd need to revisit this.
> 
> Just curious but do you have any plans to replace the current
> irq_set_vcpu_affinity() approach with something else?

Who knows. This is the Linux kernel, everything changes all the time
without the need for a good reason. More significantly, the amount of
*data* being associated with a VLPI could become much higher in the
future, and add more unnecessary allocation.

	M.

-- 
Without deviation from the norm, progress is not possible.

Re: [PATCH] irqchip/gic-v3-its: Fix invalid wait context lockdep report

Posted by Koichiro Den 1 month ago

On Thu, Aug 28, 2025 at 08:56:01AM +0100, Marc Zyngier wrote:
> On Thu, 28 Aug 2025 04:09:00 +0100,
> Koichiro Den <den@valinux.co.jp> wrote:
> > 
> > On Wed, Aug 27, 2025 at 01:48:33PM +0100, Marc Zyngier wrote:
> > > On Wed, 27 Aug 2025 08:38:48 +0100,
> > > Koichiro Den <den@valinux.co.jp> wrote:
> > > > 
> > > > its_irq_set_vcpu_affinity() always runs under a raw_spin_lock wait
> > > > context, so calling kcalloc there is not permitted and RT-unsafe since
> > > > ___slab_alloc() may acquire a local lock. The below is the actual
> > > > lockdep report observed:
> > > > 
> > > >   =============================
> > > >   [ BUG: Invalid wait context ]
> > > >   6.16.0-rc3-irqchip-next-7e28bba92c5c+ #1 Tainted: G S
> > > >   -----------------------------
> > > >   qemu-system-aar/2129 is trying to lock:
> > > >   ffff0085b74f2178 (batched_entropy_u32.lock){..-.}-{3:3}, at: get_random_u32+0x9c/0x708
> > > >   other info that might help us debug this:
> > > >   context-{5:5}
> > > >   6 locks held by qemu-system-aar/2129:
> > > >    #0: ffff0000b84a0738 (&vdev->igate){+.+.}-{4:4}, at: vfio_pci_core_ioctl+0x40c/0x748 [vfio_pci_core]
> > > >    #1: ffff8000883cef68 (lock#6){+.+.}-{4:4}, at: irq_bypass_register_producer+0x64/0x2f0
> > > >    #2: ffff0000ac0df960 (&its->its_lock){+.+.}-{4:4}, at: kvm_vgic_v4_set_forwarding+0x224/0x6f0
> > > >    #3: ffff000086dc4718 (&irq->irq_lock#3){....}-{2:2}, at: kvm_vgic_v4_set_forwarding+0x288/0x6f0
> > > >    #4: ffff0001356200c8 (&irq_desc_lock_class){-.-.}-{2:2}, at: __irq_get_desc_lock+0xc8/0x158
> > > >    #5: ffff00009eae4850 (&dev->event_map.vlpi_lock){....}-{2:2}, at: its_irq_set_vcpu_affinity+0x8c/0x528
> > > >   ...
> > > >   Call trace:
> > > >    show_stack+0x30/0x98 (C)
> > > >    dump_stack_lvl+0x9c/0xd0
> > > >    dump_stack+0x1c/0x34
> > > >    __lock_acquire+0x814/0xb40
> > > >    lock_acquire.part.0+0x16c/0x2a8
> > > >    lock_acquire+0x8c/0x178
> > > >    get_random_u32+0xd4/0x708
> > > >    __get_random_u32_below+0x20/0x80
> > > >    shuffle_freelist+0x5c/0x1b0
> > > >    allocate_slab+0x15c/0x348
> > > >    new_slab+0x48/0x80
> > > >    ___slab_alloc+0x590/0x8b8
> > > >    __slab_alloc.isra.0+0x3c/0x80
> > > >    __kmalloc_noprof+0x174/0x520
> > > >    its_vlpi_map+0x834/0xce0
> > > >    its_irq_set_vcpu_affinity+0x21c/0x528
> > > >    irq_set_vcpu_affinity+0x160/0x1b0
> > > >    its_map_vlpi+0x90/0x100
> > > >    kvm_vgic_v4_set_forwarding+0x3c4/0x6f0
> > > >    kvm_arch_irq_bypass_add_producer+0xac/0x108
> > > >    __connect+0x138/0x1b0
> > > >    irq_bypass_register_producer+0x16c/0x2f0
> > > >    vfio_msi_set_vector_signal+0x2c0/0x5a8 [vfio_pci_core]
> > > >    vfio_msi_set_block+0x8c/0x120 [vfio_pci_core]
> > > >    vfio_pci_set_msi_trigger+0x120/0x3d8 [vfio_pci_core]
> > > 
> > > Huh. I guess this is due to RT not being completely compatible with
> > > GFP_ATOMIC...  Why you'd want RT and KVM at the same time is beyond
> > > me, but hey.
> > 
> > For the record, I didn't run KVM on RT, though I still believe it's better
> > to conform to the wait context rule and avoid triggering the lockdep
> > splat.
> 
> Then I don't understand how you get this, because I have not seen it
> so far.
> 
> > 
> > I don't know if there are any plans which make kmalloc with GFP_ATOMIC
> > workable under a stricter wait context (getting rid of the local lock
> > in some way?), but I think it would be nicer.
> 
> GFP_ATOMIC is documented as being compatible with raw spinlocks in the
> absence of RT, making the above trace pretty odd.

I got the report on my ARM64 env with CONFIG_PROVE_LOCKING=y, which
leads to CONFIG_PROVE_RAW_LOCK_NESTING=y on ARM64. And this report says
that it was trying to acquire a local_lock_t (LD_WAIT_CONFIG) while any
raw_spinlock_t (LD_WAIT_SPIN) being held.
So I still believe we're on the same page; while I got the report on
non-RT, the report just pre-warns the danger for RT. There's no
immediate harm on non-RT.

> 
> > 
> > > 
> > > >   ...
> > > > 
> > > > To avoid this, simply pre-allocate vlpi_maps when creating an ITS v4
> > > > device with LPIs allcation. The trade-off is some wasted memory
> > > > depending on nr_lpis, if none of those LPIs are never upgraded to VLPIs.
> > > >
> > > > An alternative would be to move the vlpi_maps allocation out of
> > > > its_map_vlpi() and introduce a two-stage prepare/commit flow, allowing a
> > > > caller (KVM in the lockdep splat shown above) to do the allocation
> > > > outside irq_set_vcpu_affinity(). However, this would unnecessarily add
> > > > complexity.
> > > 
> > > That's debatable. It is probably fine for now, but if this was to
> > > grow, we'd need to revisit this.
> > 
> > Just curious but do you have any plans to replace the current
> > irq_set_vcpu_affinity() approach with something else?
> 
> Who knows. This is the Linux kernel, everything changes all the time
> without the need for a good reason. More significantly, the amount of
> *data* being associated with a VLPI could become much higher in the
> future, and add more unnecessary allocation.

Alright, thank you.

-Koichiro

> 
> 	M.
> 
> -- 
> Without deviation from the norm, progress is not possible.