drivers/irqchip/irq-gic-v3-its.c | 36 ++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 16 deletions(-)
its_irq_set_vcpu_affinity() always runs under a raw_spin_lock wait
context, so calling kcalloc there is not permitted and RT-unsafe since
___slab_alloc() may acquire a local lock. The below is the actual
lockdep report observed:
=============================
[ BUG: Invalid wait context ]
6.16.0-rc3-irqchip-next-7e28bba92c5c+ #1 Tainted: G S
-----------------------------
qemu-system-aar/2129 is trying to lock:
ffff0085b74f2178 (batched_entropy_u32.lock){..-.}-{3:3}, at: get_random_u32+0x9c/0x708
other info that might help us debug this:
context-{5:5}
6 locks held by qemu-system-aar/2129:
#0: ffff0000b84a0738 (&vdev->igate){+.+.}-{4:4}, at: vfio_pci_core_ioctl+0x40c/0x748 [vfio_pci_core]
#1: ffff8000883cef68 (lock#6){+.+.}-{4:4}, at: irq_bypass_register_producer+0x64/0x2f0
#2: ffff0000ac0df960 (&its->its_lock){+.+.}-{4:4}, at: kvm_vgic_v4_set_forwarding+0x224/0x6f0
#3: ffff000086dc4718 (&irq->irq_lock#3){....}-{2:2}, at: kvm_vgic_v4_set_forwarding+0x288/0x6f0
#4: ffff0001356200c8 (&irq_desc_lock_class){-.-.}-{2:2}, at: __irq_get_desc_lock+0xc8/0x158
#5: ffff00009eae4850 (&dev->event_map.vlpi_lock){....}-{2:2}, at: its_irq_set_vcpu_affinity+0x8c/0x528
...
Call trace:
show_stack+0x30/0x98 (C)
dump_stack_lvl+0x9c/0xd0
dump_stack+0x1c/0x34
__lock_acquire+0x814/0xb40
lock_acquire.part.0+0x16c/0x2a8
lock_acquire+0x8c/0x178
get_random_u32+0xd4/0x708
__get_random_u32_below+0x20/0x80
shuffle_freelist+0x5c/0x1b0
allocate_slab+0x15c/0x348
new_slab+0x48/0x80
___slab_alloc+0x590/0x8b8
__slab_alloc.isra.0+0x3c/0x80
__kmalloc_noprof+0x174/0x520
its_vlpi_map+0x834/0xce0
its_irq_set_vcpu_affinity+0x21c/0x528
irq_set_vcpu_affinity+0x160/0x1b0
its_map_vlpi+0x90/0x100
kvm_vgic_v4_set_forwarding+0x3c4/0x6f0
kvm_arch_irq_bypass_add_producer+0xac/0x108
__connect+0x138/0x1b0
irq_bypass_register_producer+0x16c/0x2f0
vfio_msi_set_vector_signal+0x2c0/0x5a8 [vfio_pci_core]
vfio_msi_set_block+0x8c/0x120 [vfio_pci_core]
vfio_pci_set_msi_trigger+0x120/0x3d8 [vfio_pci_core]
...
To avoid this, simply pre-allocate vlpi_maps when creating an ITS v4
device with LPIs allcation. The trade-off is some wasted memory
depending on nr_lpis, if none of those LPIs are never upgraded to VLPIs.
An alternative would be to move the vlpi_maps allocation out of
its_map_vlpi() and introduce a two-stage prepare/commit flow, allowing a
caller (KVM in the lockdep splat shown above) to do the allocation
outside irq_set_vcpu_affinity(). However, this would unnecessarily add
complexity.
Fixes: d011e4e654d7 ("irqchip/gic-v3-its: Add VLPI map/unmap operations")
Signed-off-by: Koichiro Den <den@valinux.co.jp>
---
drivers/irqchip/irq-gic-v3-its.c | 36 ++++++++++++++++++--------------
1 file changed, 20 insertions(+), 16 deletions(-)
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 467cb78435a9..b933be8ddc51 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -1923,19 +1923,10 @@ static int its_vlpi_map(struct irq_data *d, struct its_cmd_info *info)
if (!info->map)
return -EINVAL;
- if (!its_dev->event_map.vm) {
- struct its_vlpi_map *maps;
-
- maps = kcalloc(its_dev->event_map.nr_lpis, sizeof(*maps),
- GFP_ATOMIC);
- if (!maps)
- return -ENOMEM;
-
+ if (!its_dev->event_map.vm)
its_dev->event_map.vm = info->map->vm;
- its_dev->event_map.vlpi_maps = maps;
- } else if (its_dev->event_map.vm != info->map->vm) {
+ else if (its_dev->event_map.vm != info->map->vm)
return -EINVAL;
- }
/* Get our private copy of the mapping information */
its_dev->event_map.vlpi_maps[event] = *info->map;
@@ -2010,10 +2001,8 @@ static int its_vlpi_unmap(struct irq_data *d)
* Drop the refcount and make the device available again if
* this was the last VLPI.
*/
- if (!--its_dev->event_map.nr_vlpis) {
+ if (!--its_dev->event_map.nr_vlpis)
its_dev->event_map.vm = NULL;
- kfree(its_dev->event_map.vlpi_maps);
- }
return 0;
}
@@ -3469,6 +3458,7 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
{
struct its_device *dev;
unsigned long *lpi_map = NULL;
+ struct its_vlpi_map *vlpi_maps;
unsigned long flags;
u16 *col_map = NULL;
void *itt;
@@ -3497,16 +3487,28 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
if (alloc_lpis) {
lpi_map = its_lpi_alloc(nvecs, &lpi_base, &nr_lpis);
- if (lpi_map)
+ if (lpi_map) {
col_map = kcalloc(nr_lpis, sizeof(*col_map),
GFP_KERNEL);
+
+ /*
+ * Pre-allocate vlpi_maps to avoid slab allocation
+ * under the strict raw spinlock wait context of
+ * irq_set_vcpu_affinity. This could waste memory
+ * if no vlpi map is ever created.
+ */
+ if (is_v4(its) && nr_lpis > 0)
+ vlpi_maps = kcalloc(nr_lpis, sizeof(*vlpi_maps),
+ GFP_KERNEL);
+ }
} else {
col_map = kcalloc(nr_ites, sizeof(*col_map), GFP_KERNEL);
nr_lpis = 0;
lpi_base = 0;
}
- if (!dev || !itt || !col_map || (!lpi_map && alloc_lpis)) {
+ if (!dev || !itt || !col_map ||
+ (alloc_lpis && (!lpi_map || (is_v4(its) && !vlpi_maps)))) {
kfree(dev);
itt_free_pool(itt, sz);
bitmap_free(lpi_map);
@@ -3524,6 +3526,7 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
dev->event_map.col_map = col_map;
dev->event_map.lpi_base = lpi_base;
dev->event_map.nr_lpis = nr_lpis;
+ dev->event_map.vlpi_maps = vlpi_maps;
raw_spin_lock_init(&dev->event_map.vlpi_lock);
dev->device_id = dev_id;
INIT_LIST_HEAD(&dev->entry);
@@ -3546,6 +3549,7 @@ static void its_free_device(struct its_device *its_dev)
list_del(&its_dev->entry);
raw_spin_unlock_irqrestore(&its_dev->its->lock, flags);
kfree(its_dev->event_map.col_map);
+ kfree(its_dev->event_map.vlpi_maps);
itt_free_pool(its_dev->itt, its_dev->itt_sz);
kfree(its_dev);
}
--
2.48.1
On Wed, 27 Aug 2025 08:38:48 +0100, Koichiro Den <den@valinux.co.jp> wrote: > > its_irq_set_vcpu_affinity() always runs under a raw_spin_lock wait > context, so calling kcalloc there is not permitted and RT-unsafe since > ___slab_alloc() may acquire a local lock. The below is the actual > lockdep report observed: > > ============================= > [ BUG: Invalid wait context ] > 6.16.0-rc3-irqchip-next-7e28bba92c5c+ #1 Tainted: G S > ----------------------------- > qemu-system-aar/2129 is trying to lock: > ffff0085b74f2178 (batched_entropy_u32.lock){..-.}-{3:3}, at: get_random_u32+0x9c/0x708 > other info that might help us debug this: > context-{5:5} > 6 locks held by qemu-system-aar/2129: > #0: ffff0000b84a0738 (&vdev->igate){+.+.}-{4:4}, at: vfio_pci_core_ioctl+0x40c/0x748 [vfio_pci_core] > #1: ffff8000883cef68 (lock#6){+.+.}-{4:4}, at: irq_bypass_register_producer+0x64/0x2f0 > #2: ffff0000ac0df960 (&its->its_lock){+.+.}-{4:4}, at: kvm_vgic_v4_set_forwarding+0x224/0x6f0 > #3: ffff000086dc4718 (&irq->irq_lock#3){....}-{2:2}, at: kvm_vgic_v4_set_forwarding+0x288/0x6f0 > #4: ffff0001356200c8 (&irq_desc_lock_class){-.-.}-{2:2}, at: __irq_get_desc_lock+0xc8/0x158 > #5: ffff00009eae4850 (&dev->event_map.vlpi_lock){....}-{2:2}, at: its_irq_set_vcpu_affinity+0x8c/0x528 > ... > Call trace: > show_stack+0x30/0x98 (C) > dump_stack_lvl+0x9c/0xd0 > dump_stack+0x1c/0x34 > __lock_acquire+0x814/0xb40 > lock_acquire.part.0+0x16c/0x2a8 > lock_acquire+0x8c/0x178 > get_random_u32+0xd4/0x708 > __get_random_u32_below+0x20/0x80 > shuffle_freelist+0x5c/0x1b0 > allocate_slab+0x15c/0x348 > new_slab+0x48/0x80 > ___slab_alloc+0x590/0x8b8 > __slab_alloc.isra.0+0x3c/0x80 > __kmalloc_noprof+0x174/0x520 > its_vlpi_map+0x834/0xce0 > its_irq_set_vcpu_affinity+0x21c/0x528 > irq_set_vcpu_affinity+0x160/0x1b0 > its_map_vlpi+0x90/0x100 > kvm_vgic_v4_set_forwarding+0x3c4/0x6f0 > kvm_arch_irq_bypass_add_producer+0xac/0x108 > __connect+0x138/0x1b0 > irq_bypass_register_producer+0x16c/0x2f0 > vfio_msi_set_vector_signal+0x2c0/0x5a8 [vfio_pci_core] > vfio_msi_set_block+0x8c/0x120 [vfio_pci_core] > vfio_pci_set_msi_trigger+0x120/0x3d8 [vfio_pci_core] Huh. I guess this is due to RT not being completely compatible with GFP_ATOMIC... Why you'd want RT and KVM at the same time is beyond me, but hey. > ... > > To avoid this, simply pre-allocate vlpi_maps when creating an ITS v4 > device with LPIs allcation. The trade-off is some wasted memory > depending on nr_lpis, if none of those LPIs are never upgraded to VLPIs. > > An alternative would be to move the vlpi_maps allocation out of > its_map_vlpi() and introduce a two-stage prepare/commit flow, allowing a > caller (KVM in the lockdep splat shown above) to do the allocation > outside irq_set_vcpu_affinity(). However, this would unnecessarily add > complexity. That's debatable. It is probably fine for now, but if this was to grow, we'd need to revisit this. > Fixes: d011e4e654d7 ("irqchip/gic-v3-its: Add VLPI map/unmap operations") No. This code predates RT being merged, and this problem cannot occur before RT. > Signed-off-by: Koichiro Den <den@valinux.co.jp> > --- > drivers/irqchip/irq-gic-v3-its.c | 36 ++++++++++++++++++-------------- > 1 file changed, 20 insertions(+), 16 deletions(-) > > diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c > index 467cb78435a9..b933be8ddc51 100644 > --- a/drivers/irqchip/irq-gic-v3-its.c > +++ b/drivers/irqchip/irq-gic-v3-its.c > @@ -1923,19 +1923,10 @@ static int its_vlpi_map(struct irq_data *d, struct its_cmd_info *info) > if (!info->map) > return -EINVAL; > > - if (!its_dev->event_map.vm) { > - struct its_vlpi_map *maps; > - > - maps = kcalloc(its_dev->event_map.nr_lpis, sizeof(*maps), > - GFP_ATOMIC); > - if (!maps) > - return -ENOMEM; > - > + if (!its_dev->event_map.vm) > its_dev->event_map.vm = info->map->vm; > - its_dev->event_map.vlpi_maps = maps; > - } else if (its_dev->event_map.vm != info->map->vm) { > + else if (its_dev->event_map.vm != info->map->vm) > return -EINVAL; > - } > > /* Get our private copy of the mapping information */ > its_dev->event_map.vlpi_maps[event] = *info->map; > @@ -2010,10 +2001,8 @@ static int its_vlpi_unmap(struct irq_data *d) > * Drop the refcount and make the device available again if > * this was the last VLPI. > */ > - if (!--its_dev->event_map.nr_vlpis) { > + if (!--its_dev->event_map.nr_vlpis) > its_dev->event_map.vm = NULL; > - kfree(its_dev->event_map.vlpi_maps); > - } > > return 0; > } > @@ -3469,6 +3458,7 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id, > { > struct its_device *dev; > unsigned long *lpi_map = NULL; > + struct its_vlpi_map *vlpi_maps; > unsigned long flags; > u16 *col_map = NULL; > void *itt; > @@ -3497,16 +3487,28 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id, > > if (alloc_lpis) { > lpi_map = its_lpi_alloc(nvecs, &lpi_base, &nr_lpis); > - if (lpi_map) > + if (lpi_map) { > col_map = kcalloc(nr_lpis, sizeof(*col_map), > GFP_KERNEL); > + > + /* > + * Pre-allocate vlpi_maps to avoid slab allocation > + * under the strict raw spinlock wait context of > + * irq_set_vcpu_affinity. This could waste memory > + * if no vlpi map is ever created. > + */ > + if (is_v4(its) && nr_lpis > 0) > + vlpi_maps = kcalloc(nr_lpis, sizeof(*vlpi_maps), > + GFP_KERNEL); > + } > } else { > col_map = kcalloc(nr_ites, sizeof(*col_map), GFP_KERNEL); > nr_lpis = 0; > lpi_base = 0; > } > > - if (!dev || !itt || !col_map || (!lpi_map && alloc_lpis)) { > + if (!dev || !itt || !col_map || > + (alloc_lpis && (!lpi_map || (is_v4(its) && !vlpi_maps)))) { This needs to be collapsed into a single boolean evaluated with the pointer being NULL. > kfree(dev); > itt_free_pool(itt, sz); > bitmap_free(lpi_map); Where are you freeing vlpi_maps if on the failure path?? Thanks, M. -- Without deviation from the norm, progress is not possible.
On Wed, Aug 27, 2025 at 01:48:33PM +0100, Marc Zyngier wrote: > On Wed, 27 Aug 2025 08:38:48 +0100, > Koichiro Den <den@valinux.co.jp> wrote: > > > > its_irq_set_vcpu_affinity() always runs under a raw_spin_lock wait > > context, so calling kcalloc there is not permitted and RT-unsafe since > > ___slab_alloc() may acquire a local lock. The below is the actual > > lockdep report observed: > > > > ============================= > > [ BUG: Invalid wait context ] > > 6.16.0-rc3-irqchip-next-7e28bba92c5c+ #1 Tainted: G S > > ----------------------------- > > qemu-system-aar/2129 is trying to lock: > > ffff0085b74f2178 (batched_entropy_u32.lock){..-.}-{3:3}, at: get_random_u32+0x9c/0x708 > > other info that might help us debug this: > > context-{5:5} > > 6 locks held by qemu-system-aar/2129: > > #0: ffff0000b84a0738 (&vdev->igate){+.+.}-{4:4}, at: vfio_pci_core_ioctl+0x40c/0x748 [vfio_pci_core] > > #1: ffff8000883cef68 (lock#6){+.+.}-{4:4}, at: irq_bypass_register_producer+0x64/0x2f0 > > #2: ffff0000ac0df960 (&its->its_lock){+.+.}-{4:4}, at: kvm_vgic_v4_set_forwarding+0x224/0x6f0 > > #3: ffff000086dc4718 (&irq->irq_lock#3){....}-{2:2}, at: kvm_vgic_v4_set_forwarding+0x288/0x6f0 > > #4: ffff0001356200c8 (&irq_desc_lock_class){-.-.}-{2:2}, at: __irq_get_desc_lock+0xc8/0x158 > > #5: ffff00009eae4850 (&dev->event_map.vlpi_lock){....}-{2:2}, at: its_irq_set_vcpu_affinity+0x8c/0x528 > > ... > > Call trace: > > show_stack+0x30/0x98 (C) > > dump_stack_lvl+0x9c/0xd0 > > dump_stack+0x1c/0x34 > > __lock_acquire+0x814/0xb40 > > lock_acquire.part.0+0x16c/0x2a8 > > lock_acquire+0x8c/0x178 > > get_random_u32+0xd4/0x708 > > __get_random_u32_below+0x20/0x80 > > shuffle_freelist+0x5c/0x1b0 > > allocate_slab+0x15c/0x348 > > new_slab+0x48/0x80 > > ___slab_alloc+0x590/0x8b8 > > __slab_alloc.isra.0+0x3c/0x80 > > __kmalloc_noprof+0x174/0x520 > > its_vlpi_map+0x834/0xce0 > > its_irq_set_vcpu_affinity+0x21c/0x528 > > irq_set_vcpu_affinity+0x160/0x1b0 > > its_map_vlpi+0x90/0x100 > > kvm_vgic_v4_set_forwarding+0x3c4/0x6f0 > > kvm_arch_irq_bypass_add_producer+0xac/0x108 > > __connect+0x138/0x1b0 > > irq_bypass_register_producer+0x16c/0x2f0 > > vfio_msi_set_vector_signal+0x2c0/0x5a8 [vfio_pci_core] > > vfio_msi_set_block+0x8c/0x120 [vfio_pci_core] > > vfio_pci_set_msi_trigger+0x120/0x3d8 [vfio_pci_core] > > Huh. I guess this is due to RT not being completely compatible with > GFP_ATOMIC... Why you'd want RT and KVM at the same time is beyond > me, but hey. For the record, I didn't run KVM on RT, though I still believe it's better to conform to the wait context rule and avoid triggering the lockdep splat. I don't know if there are any plans which make kmalloc with GFP_ATOMIC workable under a stricter wait context (getting rid of the local lock in some way?), but I think it would be nicer. > > > ... > > > > To avoid this, simply pre-allocate vlpi_maps when creating an ITS v4 > > device with LPIs allcation. The trade-off is some wasted memory > > depending on nr_lpis, if none of those LPIs are never upgraded to VLPIs. > > > > An alternative would be to move the vlpi_maps allocation out of > > its_map_vlpi() and introduce a two-stage prepare/commit flow, allowing a > > caller (KVM in the lockdep splat shown above) to do the allocation > > outside irq_set_vcpu_affinity(). However, this would unnecessarily add > > complexity. > > That's debatable. It is probably fine for now, but if this was to > grow, we'd need to revisit this. Just curious but do you have any plans to replace the current irq_set_vcpu_affinity() approach with something else? > > > Fixes: d011e4e654d7 ("irqchip/gic-v3-its: Add VLPI map/unmap operations") > > No. This code predates RT being merged, and this problem cannot occur > before RT. I'll drop this in v2. > > > Signed-off-by: Koichiro Den <den@valinux.co.jp> > > --- > > drivers/irqchip/irq-gic-v3-its.c | 36 ++++++++++++++++++-------------- > > 1 file changed, 20 insertions(+), 16 deletions(-) > > > > diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c > > index 467cb78435a9..b933be8ddc51 100644 > > --- a/drivers/irqchip/irq-gic-v3-its.c > > +++ b/drivers/irqchip/irq-gic-v3-its.c > > @@ -1923,19 +1923,10 @@ static int its_vlpi_map(struct irq_data *d, struct its_cmd_info *info) > > if (!info->map) > > return -EINVAL; > > > > - if (!its_dev->event_map.vm) { > > - struct its_vlpi_map *maps; > > - > > - maps = kcalloc(its_dev->event_map.nr_lpis, sizeof(*maps), > > - GFP_ATOMIC); > > - if (!maps) > > - return -ENOMEM; > > - > > + if (!its_dev->event_map.vm) > > its_dev->event_map.vm = info->map->vm; > > - its_dev->event_map.vlpi_maps = maps; > > - } else if (its_dev->event_map.vm != info->map->vm) { > > + else if (its_dev->event_map.vm != info->map->vm) > > return -EINVAL; > > - } > > > > /* Get our private copy of the mapping information */ > > its_dev->event_map.vlpi_maps[event] = *info->map; > > @@ -2010,10 +2001,8 @@ static int its_vlpi_unmap(struct irq_data *d) > > * Drop the refcount and make the device available again if > > * this was the last VLPI. > > */ > > - if (!--its_dev->event_map.nr_vlpis) { > > + if (!--its_dev->event_map.nr_vlpis) > > its_dev->event_map.vm = NULL; > > - kfree(its_dev->event_map.vlpi_maps); > > - } > > > > return 0; > > } > > @@ -3469,6 +3458,7 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id, > > { > > struct its_device *dev; > > unsigned long *lpi_map = NULL; > > + struct its_vlpi_map *vlpi_maps; > > unsigned long flags; > > u16 *col_map = NULL; > > void *itt; > > @@ -3497,16 +3487,28 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id, > > > > if (alloc_lpis) { > > lpi_map = its_lpi_alloc(nvecs, &lpi_base, &nr_lpis); > > - if (lpi_map) > > + if (lpi_map) { > > col_map = kcalloc(nr_lpis, sizeof(*col_map), > > GFP_KERNEL); > > + > > + /* > > + * Pre-allocate vlpi_maps to avoid slab allocation > > + * under the strict raw spinlock wait context of > > + * irq_set_vcpu_affinity. This could waste memory > > + * if no vlpi map is ever created. > > + */ > > + if (is_v4(its) && nr_lpis > 0) > > + vlpi_maps = kcalloc(nr_lpis, sizeof(*vlpi_maps), > > + GFP_KERNEL); > > + } > > } else { > > col_map = kcalloc(nr_ites, sizeof(*col_map), GFP_KERNEL); > > nr_lpis = 0; > > lpi_base = 0; > > } > > > > - if (!dev || !itt || !col_map || (!lpi_map && alloc_lpis)) { > > + if (!dev || !itt || !col_map || > > + (alloc_lpis && (!lpi_map || (is_v4(its) && !vlpi_maps)))) { > > This needs to be collapsed into a single boolean evaluated with the > pointer being NULL. Right, I'll add and use something like: bool prealloc_vlpis_maps = alloc_lpis && is_v4(its); If that's not the intended direction, please let me know. BTW, I noticed I forgot to initialize vlpi_maps. I'll fix that as well. > > > kfree(dev); > > itt_free_pool(itt, sz); > > bitmap_free(lpi_map); > > Where are you freeing vlpi_maps if on the failure path?? Thanks for catching this, I'll fix this in v2. Thanks for the review! -Koichiro > > Thanks, > > M. > > -- > Without deviation from the norm, progress is not possible.
On Thu, 28 Aug 2025 04:09:00 +0100, Koichiro Den <den@valinux.co.jp> wrote: > > On Wed, Aug 27, 2025 at 01:48:33PM +0100, Marc Zyngier wrote: > > On Wed, 27 Aug 2025 08:38:48 +0100, > > Koichiro Den <den@valinux.co.jp> wrote: > > > > > > its_irq_set_vcpu_affinity() always runs under a raw_spin_lock wait > > > context, so calling kcalloc there is not permitted and RT-unsafe since > > > ___slab_alloc() may acquire a local lock. The below is the actual > > > lockdep report observed: > > > > > > ============================= > > > [ BUG: Invalid wait context ] > > > 6.16.0-rc3-irqchip-next-7e28bba92c5c+ #1 Tainted: G S > > > ----------------------------- > > > qemu-system-aar/2129 is trying to lock: > > > ffff0085b74f2178 (batched_entropy_u32.lock){..-.}-{3:3}, at: get_random_u32+0x9c/0x708 > > > other info that might help us debug this: > > > context-{5:5} > > > 6 locks held by qemu-system-aar/2129: > > > #0: ffff0000b84a0738 (&vdev->igate){+.+.}-{4:4}, at: vfio_pci_core_ioctl+0x40c/0x748 [vfio_pci_core] > > > #1: ffff8000883cef68 (lock#6){+.+.}-{4:4}, at: irq_bypass_register_producer+0x64/0x2f0 > > > #2: ffff0000ac0df960 (&its->its_lock){+.+.}-{4:4}, at: kvm_vgic_v4_set_forwarding+0x224/0x6f0 > > > #3: ffff000086dc4718 (&irq->irq_lock#3){....}-{2:2}, at: kvm_vgic_v4_set_forwarding+0x288/0x6f0 > > > #4: ffff0001356200c8 (&irq_desc_lock_class){-.-.}-{2:2}, at: __irq_get_desc_lock+0xc8/0x158 > > > #5: ffff00009eae4850 (&dev->event_map.vlpi_lock){....}-{2:2}, at: its_irq_set_vcpu_affinity+0x8c/0x528 > > > ... > > > Call trace: > > > show_stack+0x30/0x98 (C) > > > dump_stack_lvl+0x9c/0xd0 > > > dump_stack+0x1c/0x34 > > > __lock_acquire+0x814/0xb40 > > > lock_acquire.part.0+0x16c/0x2a8 > > > lock_acquire+0x8c/0x178 > > > get_random_u32+0xd4/0x708 > > > __get_random_u32_below+0x20/0x80 > > > shuffle_freelist+0x5c/0x1b0 > > > allocate_slab+0x15c/0x348 > > > new_slab+0x48/0x80 > > > ___slab_alloc+0x590/0x8b8 > > > __slab_alloc.isra.0+0x3c/0x80 > > > __kmalloc_noprof+0x174/0x520 > > > its_vlpi_map+0x834/0xce0 > > > its_irq_set_vcpu_affinity+0x21c/0x528 > > > irq_set_vcpu_affinity+0x160/0x1b0 > > > its_map_vlpi+0x90/0x100 > > > kvm_vgic_v4_set_forwarding+0x3c4/0x6f0 > > > kvm_arch_irq_bypass_add_producer+0xac/0x108 > > > __connect+0x138/0x1b0 > > > irq_bypass_register_producer+0x16c/0x2f0 > > > vfio_msi_set_vector_signal+0x2c0/0x5a8 [vfio_pci_core] > > > vfio_msi_set_block+0x8c/0x120 [vfio_pci_core] > > > vfio_pci_set_msi_trigger+0x120/0x3d8 [vfio_pci_core] > > > > Huh. I guess this is due to RT not being completely compatible with > > GFP_ATOMIC... Why you'd want RT and KVM at the same time is beyond > > me, but hey. > > For the record, I didn't run KVM on RT, though I still believe it's better > to conform to the wait context rule and avoid triggering the lockdep > splat. Then I don't understand how you get this, because I have not seen it so far. > > I don't know if there are any plans which make kmalloc with GFP_ATOMIC > workable under a stricter wait context (getting rid of the local lock > in some way?), but I think it would be nicer. GFP_ATOMIC is documented as being compatible with raw spinlocks in the absence of RT, making the above trace pretty odd. > > > > > > ... > > > > > > To avoid this, simply pre-allocate vlpi_maps when creating an ITS v4 > > > device with LPIs allcation. The trade-off is some wasted memory > > > depending on nr_lpis, if none of those LPIs are never upgraded to VLPIs. > > > > > > An alternative would be to move the vlpi_maps allocation out of > > > its_map_vlpi() and introduce a two-stage prepare/commit flow, allowing a > > > caller (KVM in the lockdep splat shown above) to do the allocation > > > outside irq_set_vcpu_affinity(). However, this would unnecessarily add > > > complexity. > > > > That's debatable. It is probably fine for now, but if this was to > > grow, we'd need to revisit this. > > Just curious but do you have any plans to replace the current > irq_set_vcpu_affinity() approach with something else? Who knows. This is the Linux kernel, everything changes all the time without the need for a good reason. More significantly, the amount of *data* being associated with a VLPI could become much higher in the future, and add more unnecessary allocation. M. -- Without deviation from the norm, progress is not possible.
On Thu, Aug 28, 2025 at 08:56:01AM +0100, Marc Zyngier wrote: > On Thu, 28 Aug 2025 04:09:00 +0100, > Koichiro Den <den@valinux.co.jp> wrote: > > > > On Wed, Aug 27, 2025 at 01:48:33PM +0100, Marc Zyngier wrote: > > > On Wed, 27 Aug 2025 08:38:48 +0100, > > > Koichiro Den <den@valinux.co.jp> wrote: > > > > > > > > its_irq_set_vcpu_affinity() always runs under a raw_spin_lock wait > > > > context, so calling kcalloc there is not permitted and RT-unsafe since > > > > ___slab_alloc() may acquire a local lock. The below is the actual > > > > lockdep report observed: > > > > > > > > ============================= > > > > [ BUG: Invalid wait context ] > > > > 6.16.0-rc3-irqchip-next-7e28bba92c5c+ #1 Tainted: G S > > > > ----------------------------- > > > > qemu-system-aar/2129 is trying to lock: > > > > ffff0085b74f2178 (batched_entropy_u32.lock){..-.}-{3:3}, at: get_random_u32+0x9c/0x708 > > > > other info that might help us debug this: > > > > context-{5:5} > > > > 6 locks held by qemu-system-aar/2129: > > > > #0: ffff0000b84a0738 (&vdev->igate){+.+.}-{4:4}, at: vfio_pci_core_ioctl+0x40c/0x748 [vfio_pci_core] > > > > #1: ffff8000883cef68 (lock#6){+.+.}-{4:4}, at: irq_bypass_register_producer+0x64/0x2f0 > > > > #2: ffff0000ac0df960 (&its->its_lock){+.+.}-{4:4}, at: kvm_vgic_v4_set_forwarding+0x224/0x6f0 > > > > #3: ffff000086dc4718 (&irq->irq_lock#3){....}-{2:2}, at: kvm_vgic_v4_set_forwarding+0x288/0x6f0 > > > > #4: ffff0001356200c8 (&irq_desc_lock_class){-.-.}-{2:2}, at: __irq_get_desc_lock+0xc8/0x158 > > > > #5: ffff00009eae4850 (&dev->event_map.vlpi_lock){....}-{2:2}, at: its_irq_set_vcpu_affinity+0x8c/0x528 > > > > ... > > > > Call trace: > > > > show_stack+0x30/0x98 (C) > > > > dump_stack_lvl+0x9c/0xd0 > > > > dump_stack+0x1c/0x34 > > > > __lock_acquire+0x814/0xb40 > > > > lock_acquire.part.0+0x16c/0x2a8 > > > > lock_acquire+0x8c/0x178 > > > > get_random_u32+0xd4/0x708 > > > > __get_random_u32_below+0x20/0x80 > > > > shuffle_freelist+0x5c/0x1b0 > > > > allocate_slab+0x15c/0x348 > > > > new_slab+0x48/0x80 > > > > ___slab_alloc+0x590/0x8b8 > > > > __slab_alloc.isra.0+0x3c/0x80 > > > > __kmalloc_noprof+0x174/0x520 > > > > its_vlpi_map+0x834/0xce0 > > > > its_irq_set_vcpu_affinity+0x21c/0x528 > > > > irq_set_vcpu_affinity+0x160/0x1b0 > > > > its_map_vlpi+0x90/0x100 > > > > kvm_vgic_v4_set_forwarding+0x3c4/0x6f0 > > > > kvm_arch_irq_bypass_add_producer+0xac/0x108 > > > > __connect+0x138/0x1b0 > > > > irq_bypass_register_producer+0x16c/0x2f0 > > > > vfio_msi_set_vector_signal+0x2c0/0x5a8 [vfio_pci_core] > > > > vfio_msi_set_block+0x8c/0x120 [vfio_pci_core] > > > > vfio_pci_set_msi_trigger+0x120/0x3d8 [vfio_pci_core] > > > > > > Huh. I guess this is due to RT not being completely compatible with > > > GFP_ATOMIC... Why you'd want RT and KVM at the same time is beyond > > > me, but hey. > > > > For the record, I didn't run KVM on RT, though I still believe it's better > > to conform to the wait context rule and avoid triggering the lockdep > > splat. > > Then I don't understand how you get this, because I have not seen it > so far. > > > > > I don't know if there are any plans which make kmalloc with GFP_ATOMIC > > workable under a stricter wait context (getting rid of the local lock > > in some way?), but I think it would be nicer. > > GFP_ATOMIC is documented as being compatible with raw spinlocks in the > absence of RT, making the above trace pretty odd. I got the report on my ARM64 env with CONFIG_PROVE_LOCKING=y, which leads to CONFIG_PROVE_RAW_LOCK_NESTING=y on ARM64. And this report says that it was trying to acquire a local_lock_t (LD_WAIT_CONFIG) while any raw_spinlock_t (LD_WAIT_SPIN) being held. So I still believe we're on the same page; while I got the report on non-RT, the report just pre-warns the danger for RT. There's no immediate harm on non-RT. > > > > > > > > > > ... > > > > > > > > To avoid this, simply pre-allocate vlpi_maps when creating an ITS v4 > > > > device with LPIs allcation. The trade-off is some wasted memory > > > > depending on nr_lpis, if none of those LPIs are never upgraded to VLPIs. > > > > > > > > An alternative would be to move the vlpi_maps allocation out of > > > > its_map_vlpi() and introduce a two-stage prepare/commit flow, allowing a > > > > caller (KVM in the lockdep splat shown above) to do the allocation > > > > outside irq_set_vcpu_affinity(). However, this would unnecessarily add > > > > complexity. > > > > > > That's debatable. It is probably fine for now, but if this was to > > > grow, we'd need to revisit this. > > > > Just curious but do you have any plans to replace the current > > irq_set_vcpu_affinity() approach with something else? > > Who knows. This is the Linux kernel, everything changes all the time > without the need for a good reason. More significantly, the amount of > *data* being associated with a VLPI could become much higher in the > future, and add more unnecessary allocation. Alright, thank you. -Koichiro > > M. > > -- > Without deviation from the norm, progress is not possible.
© 2016 - 2025 Red Hat, Inc.