block: prevent deadlock in del_gendisk()

[PATCH] block: prevent deadlock in del_gendisk()

Posted by Ujwal Kundur 6 months, 1 week ago

A potential unsafe locking scenario presents itself when
mutex_lock(&disk->open_mutex) is called with reader's lock held on
update_nr_hwq_lock:
       CPU0                    CPU1
       ----                    ----
rlock(&set->update_nr_hwq_lock)
                               lock(&nbd->config_lock);
                               lock(&set->update_nr_hwq_lock);
lock(&disk->open_mutex)

When the gendisk is added back concurrently, a writer's lock is
attempted to be held on update_nr_hwq_lock while holding other locks in
the call-path, becoming a potential source of deadlock(s).

Scope read-critical section to blk_unregister_queue, which is the only
function that interacts with switching elevator and requires
synchronization with update_nr_hwq_lock.

Reported-by: syzbot+2e9e529ac0b319316453@syzkaller.appspotmail.com
Signed-off-by: Ujwal Kundur <ujwal.kundur@gmail.com>
---
 block/genhd.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)
 mode change 100644 => 100755 block/genhd.c

diff --git a/block/genhd.c b/block/genhd.c
old mode 100644
new mode 100755
index c26733f6324b..b56f09f5699b
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -696,6 +696,7 @@ static void __del_gendisk(struct gendisk *disk)
 	struct block_device *part;
 	unsigned long idx;
 	bool start_drain;
+	struct blk_mq_tag_set *set = q->tag_set;
 
 	might_sleep();
 
@@ -740,7 +741,9 @@ static void __del_gendisk(struct gendisk *disk)
 		bdi_unregister(disk->bdi);
 	}
 
+	down_read(&set->update_nr_hwq_lock);
 	blk_unregister_queue(disk);
+	up_read(&set->update_nr_hwq_lock);
 
 	kobject_put(disk->part0->bd_holder_dir);
 	kobject_put(disk->slave_dir);
@@ -808,20 +811,15 @@ static void disable_elv_switch(struct request_queue *q)
  */
 void del_gendisk(struct gendisk *disk)
 {
-	struct blk_mq_tag_set *set;
 	unsigned int memflags;
 
 	if (!queue_is_mq(disk->queue)) {
 		__del_gendisk(disk);
 	} else {
-		set = disk->queue->tag_set;
-
 		disable_elv_switch(disk->queue);
 
 		memflags = memalloc_noio_save();
-		down_read(&set->update_nr_hwq_lock);
 		__del_gendisk(disk);
-		up_read(&set->update_nr_hwq_lock);
 		memalloc_noio_restore(memflags);
 	}
 }
-- 
2.30.2

Re: [PATCH] block: prevent deadlock in del_gendisk()

Posted by Yu Kuai 6 months, 1 week ago

Hi,

在 2025/08/03 21:41, Ujwal Kundur 写道:
> A potential unsafe locking scenario presents itself when
> mutex_lock(&disk->open_mutex) is called with reader's lock held on
> update_nr_hwq_lock:
>         CPU0                    CPU1
>         ----                    ----
> rlock(&set->update_nr_hwq_lock)
>                                 lock(&nbd->config_lock);
>                                 lock(&set->update_nr_hwq_lock);
> lock(&disk->open_mutex)
> 
This problem is already fixed inside nbd by:
8b428f42f3ed ("nbd: fix lockdep deadlock warning")

Thanks,
Kuai
> When the gendisk is added back concurrently, a writer's lock is
> attempted to be held on update_nr_hwq_lock while holding other locks in
> the call-path, becoming a potential source of deadlock(s).
> 
> Scope read-critical section to blk_unregister_queue, which is the only
> function that interacts with switching elevator and requires
> synchronization with update_nr_hwq_lock.
> 
> Reported-by: syzbot+2e9e529ac0b319316453@syzkaller.appspotmail.com
> Signed-off-by: Ujwal Kundur <ujwal.kundur@gmail.com>
> ---
>   block/genhd.c | 8 +++-----
>   1 file changed, 3 insertions(+), 5 deletions(-)
>   mode change 100644 => 100755 block/genhd.c
> 
> diff --git a/block/genhd.c b/block/genhd.c
> old mode 100644
> new mode 100755
> index c26733f6324b..b56f09f5699b
> --- a/block/genhd.c
> +++ b/block/genhd.c
> @@ -696,6 +696,7 @@ static void __del_gendisk(struct gendisk *disk)
>   	struct block_device *part;
>   	unsigned long idx;
>   	bool start_drain;
> +	struct blk_mq_tag_set *set = q->tag_set;
>   
>   	might_sleep();
>   
> @@ -740,7 +741,9 @@ static void __del_gendisk(struct gendisk *disk)
>   		bdi_unregister(disk->bdi);
>   	}
>   
> +	down_read(&set->update_nr_hwq_lock);
>   	blk_unregister_queue(disk);
> +	up_read(&set->update_nr_hwq_lock);
>   
>   	kobject_put(disk->part0->bd_holder_dir);
>   	kobject_put(disk->slave_dir);
> @@ -808,20 +811,15 @@ static void disable_elv_switch(struct request_queue *q)
>    */
>   void del_gendisk(struct gendisk *disk)
>   {
> -	struct blk_mq_tag_set *set;
>   	unsigned int memflags;
>   
>   	if (!queue_is_mq(disk->queue)) {
>   		__del_gendisk(disk);
>   	} else {
> -		set = disk->queue->tag_set;
> -
>   		disable_elv_switch(disk->queue);
>   
>   		memflags = memalloc_noio_save();
> -		down_read(&set->update_nr_hwq_lock);
>   		__del_gendisk(disk);
> -		up_read(&set->update_nr_hwq_lock);
>   		memalloc_noio_restore(memflags);
>   	}
>   }
>

Re: [PATCH] block: prevent deadlock in del_gendisk()

Posted by Hillf Danton 6 months, 1 week ago

On Mon, 4 Aug 2025 15:51:48 +0800 Yu Kuai wrote:
> �� 2025/08/03 21:41, Ujwal Kundur д��:
> > A potential unsafe locking scenario presents itself when
> > mutex_lock(&disk->open_mutex) is called with reader's lock held on
> > update_nr_hwq_lock:
> >         CPU0                    CPU1
> >         ----                    ----
> > rlock(&set->update_nr_hwq_lock)
> >                                 lock(&nbd->config_lock);
> >                                 lock(&set->update_nr_hwq_lock);
> > lock(&disk->open_mutex)
> > 
> This problem is already fixed inside nbd by:
> 8b428f42f3ed ("nbd: fix lockdep deadlock warning")
> 
Deadlock still exists [1].

[1] https://lore.kernel.org/lkml/6891742c.050a0220.7f033.001a.GAE@google.com/

Re: [PATCH] block: prevent deadlock in del_gendisk()

Posted by Yu Kuai 6 months, 1 week ago


在 2025/08/05 19:59, Hillf Danton 写道:
> On Mon, 4 Aug 2025 15:51:48 +0800 Yu Kuai wrote:
>> �� 2025/08/03 21:41, Ujwal Kundur д��:
>>> A potential unsafe locking scenario presents itself when
>>> mutex_lock(&disk->open_mutex) is called with reader's lock held on
>>> update_nr_hwq_lock:
>>>          CPU0                    CPU1
>>>          ----                    ----
>>> rlock(&set->update_nr_hwq_lock)
>>>                                  lock(&nbd->config_lock);
>>>                                  lock(&set->update_nr_hwq_lock);
>>> lock(&disk->open_mutex)
>>>
>> This problem is already fixed inside nbd by:
>> 8b428f42f3ed ("nbd: fix lockdep deadlock warning")
>>
> Deadlock still exists [1].
> 
This deadlock is a different problem, not what you claimed to fix in
this patch.

Thanks,
Kuai

> [1] https://lore.kernel.org/lkml/6891742c.050a0220.7f033.001a.GAE@google.com/
> .
>

Re: [PATCH] block: prevent deadlock in del_gendisk()

Posted by Ujwal Kundur 6 months, 1 week ago

> This problem is already fixed inside nbd by:
> 8b428f42f3ed ("nbd: fix lockdep deadlock warning")

Thanks for letting me know.

Would this patch still make sense as it narrows the scope of the
read-critical section? It reduces the chances of holding other locks
while trying to update the number of hw queues (and thus avoids more
lockdep warnings for this particular lock in the future).

Thanks,
Ujwal

Re: [PATCH] block: prevent deadlock in del_gendisk()

Posted by kernel test robot 6 months ago


Hello,

kernel test robot noticed "BUG:kernel_NULL_pointer_dereference,address" on:

commit: 00ff643264f9f5c300898a591fa4b83a62c43950 ("[PATCH] block: prevent deadlock in del_gendisk()")
url: https://github.com/intel-lab-lkp/linux/commits/Ujwal-Kundur/block-prevent-deadlock-in-del_gendisk/20250803-214237
base: https://git.kernel.org/cgit/linux/kernel/git/axboe/linux-block.git for-next
patch link: https://lore.kernel.org/all/20250803134114.2707-1-ujwal.kundur@gmail.com/
patch subject: [PATCH] block: prevent deadlock in del_gendisk()

in testcase: phoronix-test-suite
version: 
with following parameters:

	test: svt-av1-2.11.1
	option_a: 2
	option_b: Bosphorus 1080p
	cpufreq_governor: performance



config: x86_64-rhel-9.4
compiler: gcc-12
test machine: 96 threads 2 sockets Intel(R) Xeon(R) Gold 6252 CPU @ 2.10GHz (Cascade Lake) with 512G memory

(please refer to attached dmesg/kmsg for entire log/backtrace)



If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <oliver.sang@intel.com>
| Closes: https://lore.kernel.org/oe-lkp/202508081000.cb134e91-lkp@intel.com


[   24.331036][ T1884] BUG: kernel NULL pointer dereference, address: 00000000000000a8
[   24.339231][ T1884] #PF: supervisor write access in kernel mode
[   24.345493][ T1884] #PF: error_code(0x0002) - not-present page
[   24.351631][ T1884] PGD 0 P4D 0
[   24.355133][ T1884] Oops: Oops: 0002 [#1] SMP NOPTI
[   24.360264][ T1884] CPU: 26 UID: 0 PID: 1884 Comm: dockerd Tainted: G S                  6.16.0-04436-g00ff643264f9 #1 VOLUNTARY
[   24.372434][ T1884] Tainted: [S]=CPU_OUT_OF_SPEC
[   24.377290][ T1884] Hardware name: Intel Corporation S2600WFT/S2600WFT, BIOS SE5C620.86B.02.01.0008.031920191559 03/19/2019
[ 24.388664][ T1884] RIP: 0010:down_read (arch/x86/include/asm/atomic64_64.h:79 include/linux/atomic/atomic-arch-fallback.h:2723 include/linux/atomic/atomic-long.h:163 include/linux/atomic/atomic-instrumented.h:3298 kernel/locking/rwsem.c:250 kernel/locking/rwsem.c:1245 kernel/locking/rwsem.c:1259 kernel/locking/rwsem.c:1524) 
[ 24.393632][ T1884] Code: 00 00 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa 0f 1f 44 00 00 53 48 89 fb e8 ae c0 ff ff be 00 01 00 00 <f0> 48 0f c1 33 48 81 c6 00 01 00 00 78 3f 48 b8 07 00 00 00 00 00
All code
========
   0:	00 00                	add    %al,(%rax)
   2:	00 90 90 90 90 90    	add    %dl,-0x6f6f6f70(%rax)
   8:	90                   	nop
   9:	90                   	nop
   a:	90                   	nop
   b:	90                   	nop
   c:	90                   	nop
   d:	90                   	nop
   e:	90                   	nop
   f:	90                   	nop
  10:	90                   	nop
  11:	90                   	nop
  12:	90                   	nop
  13:	f3 0f 1e fa          	endbr64
  17:	0f 1f 44 00 00       	nopl   0x0(%rax,%rax,1)
  1c:	53                   	push   %rbx
  1d:	48 89 fb             	mov    %rdi,%rbx
  20:	e8 ae c0 ff ff       	call   0xffffffffffffc0d3
  25:	be 00 01 00 00       	mov    $0x100,%esi
  2a:*	f0 48 0f c1 33       	lock xadd %rsi,(%rbx)		<-- trapping instruction
  2f:	48 81 c6 00 01 00 00 	add    $0x100,%rsi
  36:	78 3f                	js     0x77
  38:	48                   	rex.W
  39:	b8 07 00 00 00       	mov    $0x7,%eax
	...

Code starting with the faulting instruction
===========================================
   0:	f0 48 0f c1 33       	lock xadd %rsi,(%rbx)
   5:	48 81 c6 00 01 00 00 	add    $0x100,%rsi
   c:	78 3f                	js     0x4d
   e:	48                   	rex.W
   f:	b8 07 00 00 00       	mov    $0x7,%eax
	...
[   24.413635][ T1884] RSP: 0018:ffffc900211f3918 EFLAGS: 00010246
[   24.419821][ T1884] RAX: 0000000000000000 RBX: 00000000000000a8 RCX: 00000000000001f0
[   24.427912][ T1884] RDX: 00000000000001ef RSI: 0000000000000100 RDI: 00000000000000a8
[   24.436003][ T1884] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000
[   24.444092][ T1884] R10: 0000000000000000 R11: 0000000000000000 R12: 00000000000000a8
[   24.452181][ T1884] R13: ffff88c1841e87a0 R14: ffff88c08b28ad68 R15: ffff88c0ce5b2200
[   24.460273][ T1884] FS:  00007ff8a3fff6c0(0000) GS:ffff88f027468000(0000) knlGS:0000000000000000
[   24.469318][ T1884] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   24.476016][ T1884] CR2: 00000000000000a8 CR3: 000000010e760004 CR4: 00000000007726f0
[   24.484104][ T1884] PKRU: 55555554
[   24.487751][ T1884] Call Trace:
[   24.491131][ T1884]  <TASK>
[ 24.494147][ T1884] __del_gendisk (block/genhd.c:745) 
[ 24.498810][ T1884] cleanup_mapped_device (drivers/md/dm.c:2243) dm_mod 
[ 24.504959][ T1884] __dm_destroy (include/linux/list.h:373 drivers/md/dm.c:2397 drivers/md/dm.c:2730) dm_mod 
[ 24.510321][ T1884] ? __pfx_dev_remove (drivers/md/dm-ioctl.c:978) dm_mod 
[ 24.516029][ T1884] dev_remove (drivers/md/dm-ioctl.c:1029) dm_mod 
[ 24.521215][ T1884] ctl_ioctl (drivers/md/dm-ioctl.c:2101) dm_mod 
[ 24.526310][ T1884] dm_ctl_ioctl (drivers/md/dm-ioctl.c:2123) dm_mod 
[ 24.531399][ T1884] __x64_sys_ioctl (fs/ioctl.c:51 fs/ioctl.c:598 fs/ioctl.c:584 fs/ioctl.c:584) 
[ 24.536040][ T1884] ? change_protection_range (mm/mprotect.c:498 mm/mprotect.c:526) 
[ 24.541726][ T1884] do_syscall_64 (arch/x86/entry/syscall_64.c:63 arch/x86/entry/syscall_64.c:94) 
[ 24.546279][ T1884] ? get_page_from_freelist (mm/page_alloc.c:1083 mm/page_alloc.c:1702 mm/page_alloc.c:1712 mm/page_alloc.c:3669) 
[ 24.551869][ T1884] ? __alloc_frozen_pages_noprof (mm/page_alloc.c:4959) 
[ 24.557894][ T1884] ? mod_memcg_lruvec_state (mm/memcontrol.c:576 mm/memcontrol.c:564 mm/memcontrol.c:754) 
[ 24.563495][ T1884] ? __lruvec_stat_mod_folio (mm/memcontrol.c:800) 
[ 24.569006][ T1884] ? __folio_mod_stat (mm/rmap.c:1414) 
[ 24.573905][ T1884] ? folio_add_new_anon_rmap (arch/x86/include/asm/bitops.h:206 arch/x86/include/asm/bitops.h:238 include/asm-generic/bitops/instrumented-non-atomic.h:142 include/linux/page-flags.h:867 include/linux/page-flags.h:888 include/linux/mm.h:992 mm/rmap.c:1609) 
[ 24.579493][ T1884] ? do_anonymous_page (mm/memory.c:5153) 
[ 24.584646][ T1884] ? __handle_mm_fault (mm/memory.c:6230) 
[ 24.589805][ T1884] ? do_syscall_64 (arch/x86/entry/syscall_64.c:63 arch/x86/entry/syscall_64.c:94) 
[ 24.594520][ T1884] ? count_memcg_events (mm/memcontrol.c:576 mm/memcontrol.c:564 mm/memcontrol.c:848) 
[ 24.599749][ T1884] ? handle_mm_fault (include/linux/memcontrol.h:978 include/linux/memcontrol.h:985 mm/memory.c:6264 mm/memory.c:6425) 
[ 24.604635][ T1884] ? do_user_addr_fault (arch/x86/mm/fault.c:1337) 
[ 24.609863][ T1884] ? clear_bhb_loop (arch/x86/entry/entry_64.S:1548) 
[ 24.614572][ T1884] ? clear_bhb_loop (arch/x86/entry/entry_64.S:1548) 
[ 24.619279][ T1884] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) 
[   24.625200][ T1884] RIP: 0033:0x7ff9476c3d1b
[ 24.629648][ T1884] Code: 00 48 89 44 24 18 31 c0 48 8d 44 24 60 c7 04 24 10 00 00 00 48 89 44 24 08 48 8d 44 24 20 48 89 44 24 10 b8 10 00 00 00 0f 05 <89> c2 3d 00 f0 ff ff 77 1c 48 8b 44 24 18 64 48 2b 04 25 28 00 00
All code
========
   0:	00 48 89             	add    %cl,-0x77(%rax)
   3:	44 24 18             	rex.R and $0x18,%al
   6:	31 c0                	xor    %eax,%eax
   8:	48 8d 44 24 60       	lea    0x60(%rsp),%rax
   d:	c7 04 24 10 00 00 00 	movl   $0x10,(%rsp)
  14:	48 89 44 24 08       	mov    %rax,0x8(%rsp)
  19:	48 8d 44 24 20       	lea    0x20(%rsp),%rax
  1e:	48 89 44 24 10       	mov    %rax,0x10(%rsp)
  23:	b8 10 00 00 00       	mov    $0x10,%eax
  28:	0f 05                	syscall
  2a:*	89 c2                	mov    %eax,%edx		<-- trapping instruction
  2c:	3d 00 f0 ff ff       	cmp    $0xfffff000,%eax
  31:	77 1c                	ja     0x4f
  33:	48 8b 44 24 18       	mov    0x18(%rsp),%rax
  38:	64                   	fs
  39:	48                   	rex.W
  3a:	2b                   	.byte 0x2b
  3b:	04 25                	add    $0x25,%al
  3d:	28 00                	sub    %al,(%rax)
	...

Code starting with the faulting instruction
===========================================
   0:	89 c2                	mov    %eax,%edx
   2:	3d 00 f0 ff ff       	cmp    $0xfffff000,%eax
   7:	77 1c                	ja     0x25
   9:	48 8b 44 24 18       	mov    0x18(%rsp),%rax
   e:	64                   	fs
   f:	48                   	rex.W
  10:	2b                   	.byte 0x2b
  11:	04 25                	add    $0x25,%al
  13:	28 00                	sub    %al,(%rax)


The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20250808/202508081000.cb134e91-lkp@intel.com



-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki