Recently we have received several deadlock reports[1][2][3] caused by
blk_mq_freeze_queue() and blk_enter_queue().

It turns out the two behave just like acquiring a read/write lock, so model
them as a read/write lock for lockdep support:

1) model q->q_usage_counter as two locks (io and queue lock)

- the queue lock covers synchronization with blk_enter_queue()

- the io lock covers synchronization with bio_enter_queue()

2) make the lockdep class/key per-queue:

- different subsystems have very different lock usage patterns, and a
shared lock class easily causes false positives

- freeze_queue degrades to no lock once the disk state becomes DEAD,
because bio_enter_queue() won't be blocked any more

- freeze_queue degrades to no lock once the request queue becomes dying,
because blk_enter_queue() won't be blocked any more

3) model blk_mq_freeze_queue() as acquire_exclusive & try_lock

- it is an exclusive lock, so the dependency with blk_enter_queue() is
covered

- it is a trylock because concurrent blk_mq_freeze_queue() calls are
allowed

4) model blk_enter_queue() & bio_enter_queue() as acquire_read()

- nested blk_enter_queue() calls are allowed

- the dependency with blk_mq_freeze_queue() is covered

- blk_queue_exit() is often called from other contexts (such as irq) and
can't be annotated as lock_release(), so simply do the release right in
blk_enter_queue(); this still covers as many cases as possible

With lockdep support, such problems are reported as soon as the offending
dependency is created, instead of waiting until the real deadlock is
triggered.

For example, a lockdep report can be triggered for case [3] with this
patch applied.
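
In compact form, the annotation pattern is roughly the following (a
condensed sketch of the hunks below, not the complete code):

        /* freeze: exclusive trylock on both lockdep maps, only at depth 0 -> 1 */
        void blk_freeze_queue_start(struct request_queue *q)
        {
                if (__blk_freeze_queue_start(q))
                        /* rwsem_acquire(&q->io_lockdep_map, 0, 1, _RET_IP_);
                         * rwsem_acquire(&q->q_lockdep_map, 0, 1, _RET_IP_); */
                        blk_freeze_acquire_lock(q, false, false);
        }

        /* enter: read lock, released immediately because blk_queue_exit()
         * may run from irq context and can't be annotated as lock_release() */
        int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
        {
                /* ... wait until the queue is unfrozen or marked dying ... */
                rwsem_acquire_read(&q->q_lockdep_map, 0, 0, _RET_IP_);
                rwsem_release(&q->q_lockdep_map, _RET_IP_);
                return 0;
        }
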
[1] occasional block layer hang when setting 'echo noop > /sys/block/sda/queue/scheduler'
https://bugzilla.kernel.org/show_bug.cgi?id=219166
[2] del_gendisk() vs blk_queue_enter() race condition
https://lore.kernel.org/linux-block/20241003085610.GK11458@google.com/
[3] queue_freeze & queue_enter deadlock in scsi
https://lore.kernel.org/linux-block/ZxG38G9BuFdBpBHZ@fedora/T/#u
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
block/blk-core.c | 18 ++++++++++++++++--
block/blk-mq.c | 26 ++++++++++++++++++++++----
block/blk.h | 29 ++++++++++++++++++++++++++---
block/genhd.c | 15 +++++++++++----
include/linux/blkdev.h | 6 ++++++
5 files changed, 81 insertions(+), 13 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index bc5e8c5eaac9..09d10bb95fda 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -261,6 +261,8 @@ static void blk_free_queue(struct request_queue *q)
blk_mq_release(q);
ida_free(&blk_queue_ida, q->id);
+ lockdep_unregister_key(&q->io_lock_cls_key);
+ lockdep_unregister_key(&q->q_lock_cls_key);
call_rcu(&q->rcu_head, blk_free_queue_rcu);
}
@@ -278,18 +280,20 @@ void blk_put_queue(struct request_queue *q)
}
EXPORT_SYMBOL(blk_put_queue);
-void blk_queue_start_drain(struct request_queue *q)
+bool blk_queue_start_drain(struct request_queue *q)
{
/*
* When queue DYING flag is set, we need to block new req
* entering queue, so we call blk_freeze_queue_start() to
* prevent I/O from crossing blk_queue_enter().
*/
- blk_freeze_queue_start(q);
+ bool freeze = __blk_freeze_queue_start(q);
if (queue_is_mq(q))
blk_mq_wake_waiters(q);
/* Make blk_queue_enter() reexamine the DYING flag. */
wake_up_all(&q->mq_freeze_wq);
+
+ return freeze;
}
/**
@@ -321,6 +325,8 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
return -ENODEV;
}
+ rwsem_acquire_read(&q->q_lockdep_map, 0, 0, _RET_IP_);
+ rwsem_release(&q->q_lockdep_map, _RET_IP_);
return 0;
}
@@ -352,6 +358,8 @@ int __bio_queue_enter(struct request_queue *q, struct bio *bio)
goto dead;
}
+ rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_);
+ rwsem_release(&q->io_lockdep_map, _RET_IP_);
return 0;
dead:
bio_io_error(bio);
@@ -441,6 +449,12 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id)
PERCPU_REF_INIT_ATOMIC, GFP_KERNEL);
if (error)
goto fail_stats;
+ lockdep_register_key(&q->io_lock_cls_key);
+ lockdep_register_key(&q->q_lock_cls_key);
+ lockdep_init_map(&q->io_lockdep_map, "&q->q_usage_counter(io)",
+ &q->io_lock_cls_key, 0);
+ lockdep_init_map(&q->q_lockdep_map, "&q->q_usage_counter(queue)",
+ &q->q_lock_cls_key, 0);
q->nr_requests = BLKDEV_DEFAULT_RQ;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 96858fb3b9ff..76f277a30c11 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -120,17 +120,29 @@ void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
inflight[1] = mi.inflight[1];
}
-void blk_freeze_queue_start(struct request_queue *q)
+bool __blk_freeze_queue_start(struct request_queue *q)
{
+ int freeze;
+
mutex_lock(&q->mq_freeze_lock);
if (++q->mq_freeze_depth == 1) {
percpu_ref_kill(&q->q_usage_counter);
mutex_unlock(&q->mq_freeze_lock);
if (queue_is_mq(q))
blk_mq_run_hw_queues(q, false);
+ freeze = true;
} else {
mutex_unlock(&q->mq_freeze_lock);
+ freeze = false;
}
+
+ return freeze;
+}
+
+void blk_freeze_queue_start(struct request_queue *q)
+{
+ if (__blk_freeze_queue_start(q))
+ blk_freeze_acquire_lock(q, false, false);
}
EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
@@ -176,8 +188,10 @@ void blk_mq_freeze_queue(struct request_queue *q)
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
-void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
+bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
{
+ int unfreeze = false;
+
mutex_lock(&q->mq_freeze_lock);
if (force_atomic)
q->q_usage_counter.data->force_atomic = true;
@@ -186,13 +200,17 @@ void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
if (!q->mq_freeze_depth) {
percpu_ref_resurrect(&q->q_usage_counter);
wake_up_all(&q->mq_freeze_wq);
+ unfreeze = true;
}
mutex_unlock(&q->mq_freeze_lock);
+
+ return unfreeze;
}
void blk_mq_unfreeze_queue(struct request_queue *q)
{
- __blk_mq_unfreeze_queue(q, false);
+ if (__blk_mq_unfreeze_queue(q, false))
+ blk_unfreeze_release_lock(q, false, false);
}
EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
@@ -205,7 +223,7 @@ EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
*/
void blk_freeze_queue_start_non_owner(struct request_queue *q)
{
- blk_freeze_queue_start(q);
+ __blk_freeze_queue_start(q);
}
EXPORT_SYMBOL_GPL(blk_freeze_queue_start_non_owner);
diff --git a/block/blk.h b/block/blk.h
index c718e4291db0..832e54c5a271 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -4,6 +4,7 @@
#include <linux/bio-integrity.h>
#include <linux/blk-crypto.h>
+#include <linux/lockdep.h>
#include <linux/memblock.h> /* for max_pfn/max_low_pfn */
#include <linux/sched/sysctl.h>
#include <linux/timekeeping.h>
@@ -35,8 +36,9 @@ struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
void blk_free_flush_queue(struct blk_flush_queue *q);
void blk_freeze_queue(struct request_queue *q);
-void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic);
-void blk_queue_start_drain(struct request_queue *q);
+bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic);
+bool blk_queue_start_drain(struct request_queue *q);
+bool __blk_freeze_queue_start(struct request_queue *q);
int __bio_queue_enter(struct request_queue *q, struct bio *bio);
void submit_bio_noacct_nocheck(struct bio *bio);
void bio_await_chain(struct bio *bio);
@@ -69,8 +71,11 @@ static inline int bio_queue_enter(struct bio *bio)
{
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- if (blk_try_enter_queue(q, false))
+ if (blk_try_enter_queue(q, false)) {
+ rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_);
+ rwsem_release(&q->io_lockdep_map, _RET_IP_);
return 0;
+ }
return __bio_queue_enter(q, bio);
}
@@ -734,4 +739,22 @@ void blk_integrity_verify(struct bio *bio);
void blk_integrity_prepare(struct request *rq);
void blk_integrity_complete(struct request *rq, unsigned int nr_bytes);
+static inline void blk_freeze_acquire_lock(struct request_queue *q, bool
+ disk_dead, bool queue_dying)
+{
+ if (!disk_dead)
+ rwsem_acquire(&q->io_lockdep_map, 0, 1, _RET_IP_);
+ if (!queue_dying)
+ rwsem_acquire(&q->q_lockdep_map, 0, 1, _RET_IP_);
+}
+
+static inline void blk_unfreeze_release_lock(struct request_queue *q, bool
+ disk_dead, bool queue_dying)
+{
+ if (!queue_dying)
+ rwsem_release(&q->q_lockdep_map, _RET_IP_);
+ if (!disk_dead)
+ rwsem_release(&q->io_lockdep_map, _RET_IP_);
+}
+
#endif /* BLK_INTERNAL_H */
diff --git a/block/genhd.c b/block/genhd.c
index 1c05dd4c6980..6ad3fcde0110 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -581,13 +581,13 @@ static void blk_report_disk_dead(struct gendisk *disk, bool surprise)
rcu_read_unlock();
}
-static void __blk_mark_disk_dead(struct gendisk *disk)
+static bool __blk_mark_disk_dead(struct gendisk *disk)
{
/*
* Fail any new I/O.
*/
if (test_and_set_bit(GD_DEAD, &disk->state))
- return;
+ return false;
if (test_bit(GD_OWNS_QUEUE, &disk->state))
blk_queue_flag_set(QUEUE_FLAG_DYING, disk->queue);
@@ -600,7 +600,7 @@ static void __blk_mark_disk_dead(struct gendisk *disk)
/*
* Prevent new I/O from crossing bio_queue_enter().
*/
- blk_queue_start_drain(disk->queue);
+ return blk_queue_start_drain(disk->queue);
}
/**
@@ -641,6 +641,7 @@ void del_gendisk(struct gendisk *disk)
struct request_queue *q = disk->queue;
struct block_device *part;
unsigned long idx;
+ bool start_drain, queue_dying;
might_sleep();
@@ -668,7 +669,10 @@ void del_gendisk(struct gendisk *disk)
* Drop all partitions now that the disk is marked dead.
*/
mutex_lock(&disk->open_mutex);
- __blk_mark_disk_dead(disk);
+ start_drain = __blk_mark_disk_dead(disk);
+ queue_dying = blk_queue_dying(q);
+ if (start_drain)
+ blk_freeze_acquire_lock(q, true, queue_dying);
xa_for_each_start(&disk->part_tbl, idx, part, 1)
drop_partition(part);
mutex_unlock(&disk->open_mutex);
@@ -725,6 +729,9 @@ void del_gendisk(struct gendisk *disk)
if (queue_is_mq(q))
blk_mq_exit_queue(q);
}
+
+ if (start_drain)
+ blk_unfreeze_release_lock(q, true, queue_dying);
}
EXPORT_SYMBOL(del_gendisk);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 50c3b959da28..57f1ee386b57 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -25,6 +25,7 @@
#include <linux/uuid.h>
#include <linux/xarray.h>
#include <linux/file.h>
+#include <linux/lockdep.h>
struct module;
struct request_queue;
@@ -471,6 +472,11 @@ struct request_queue {
struct xarray hctx_table;
struct percpu_ref q_usage_counter;
+ struct lock_class_key io_lock_cls_key;
+ struct lockdep_map io_lockdep_map;
+
+ struct lock_class_key q_lock_cls_key;
+ struct lockdep_map q_lockdep_map;
struct request *last_merge;
--
2.46.0
Hi Ming,

Greetings!

I used Syzkaller and found that there is possible deadlock in __submit_bio
in linux-next next-20241029.

After bisection and the first bad commit is:
"
f1be1788a32e block: model freeze & enter queue as lock for supporting lockdep
"

All detailed info can be found at:
https://github.com/laifryiee/syzkaller_logs/tree/main/241029_183511___submit_bio
Syzkaller repro code:
https://github.com/laifryiee/syzkaller_logs/tree/main/241029_183511___submit_bio/repro.c
Syzkaller repro syscall steps:
https://github.com/laifryiee/syzkaller_logs/tree/main/241029_183511___submit_bio/repro.prog
Syzkaller report:
https://github.com/laifryiee/syzkaller_logs/tree/main/241029_183511___submit_bio/repro.report
Kconfig(make olddefconfig):
https://github.com/laifryiee/syzkaller_logs/tree/main/241029_183511___submit_bio/kconfig_origin
Bisect info:
https://github.com/laifryiee/syzkaller_logs/tree/main/241029_183511___submit_bio/bisect_info.log
bzImage:
https://github.com/laifryiee/syzkaller_logs/raw/refs/heads/main/241029_183511___submit_bio/bzImage_6fb2fa9805c501d9ade047fc511961f3273cdcb5
Issue dmesg:
https://github.com/laifryiee/syzkaller_logs/blob/main/241029_183511___submit_bio/6fb2fa9805c501d9ade047fc511961f3273cdcb5_dmesg.log

"
[ 22.219103] 6.12.0-rc5-next-20241029-6fb2fa9805c5 #1 Not tainted
[ 22.219512] ------------------------------------------------------
[ 22.219827] repro/735 is trying to acquire lock:
[ 22.220066] ffff888010f1a768 (&q->q_usage_counter(io)#25){++++}-{0:0}, at: __submit_bio+0x39f/0x550
[ 22.220568]
[ 22.220568] but task is already holding lock:
[ 22.220884] ffffffff872322a0 (fs_reclaim){+.+.}-{0:0}, at: __alloc_pages_slowpath.constprop.0+0x76b/0x21e0
[ 22.221453]
[ 22.221453] which lock already depends on the new lock.
[ 22.221453]
[ 22.221862]
[ 22.221862] the existing dependency chain (in reverse order) is:
[ 22.222247]
[ 22.222247] -> #1 (fs_reclaim){+.+.}-{0:0}:
[ 22.222630]        lock_acquire+0x80/0xb0
[ 22.222920]        fs_reclaim_acquire+0x116/0x160
[ 22.223244]        __kmalloc_cache_node_noprof+0x59/0x470
[ 22.223528]        blk_mq_init_tags+0x79/0x1a0
[ 22.223771]        blk_mq_alloc_map_and_rqs+0x1f4/0xdd0
[ 22.224127]        blk_mq_init_sched+0x33d/0x6d0
[ 22.224376]        elevator_init_mq+0x2b2/0x400
[ 22.224619]        add_disk_fwnode+0x11c/0x1300
[ 22.224866]        device_add_disk+0x31/0x40
[ 22.225097]        sd_probe+0xa79/0x1080
[ 22.225324]        really_probe+0x27c/0xac0
[ 22.225556]        __driver_probe_device+0x1f3/0x460
[ 22.225819]        driver_probe_device+0x56/0x1b0
[ 22.226069]        __device_attach_driver+0x1e7/0x300
[ 22.226336]        bus_for_each_drv+0x159/0x1e0
[ 22.226576]        __device_attach_async_helper+0x1e4/0x2a0
[ 22.226871]        async_run_entry_fn+0xa3/0x450
[ 22.227127]        process_one_work+0x92e/0x1b50
[ 22.227380]        worker_thread+0x68d/0xe90
[ 22.227614]        kthread+0x35a/0x470
[ 22.227834]        ret_from_fork+0x56/0x90
[ 22.228119]        ret_from_fork_asm+0x1a/0x30
[ 22.228365]
[ 22.228365] -> #0 (&q->q_usage_counter(io)#25){++++}-{0:0}:
[ 22.228739]        __lock_acquire+0x2ff8/0x5d60
[ 22.228982]        lock_acquire.part.0+0x142/0x390
[ 22.229237]        lock_acquire+0x80/0xb0
[ 22.229452]        blk_mq_submit_bio+0x1cbe/0x2590
[ 22.229708]        __submit_bio+0x39f/0x550
[ 22.229931]        submit_bio_noacct_nocheck+0x647/0xcc0
[ 22.230212]        submit_bio_noacct+0x620/0x1e00
[ 22.230463]        submit_bio+0xce/0x480
[ 22.230677]        __swap_writepage+0x2f1/0xdf0
[ 22.230923]        swap_writepage+0x464/0xbc0
[ 22.231157]        shmem_writepage+0xdeb/0x1340
[ 22.231406]        pageout+0x3bc/0x9b0
[ 22.231617]        shrink_folio_list+0x16b9/0x3b60
[ 22.231884]        shrink_lruvec+0xd78/0x2790
[ 22.232220]        shrink_node+0xb29/0x2870
[ 22.232447]        do_try_to_free_pages+0x2e3/0x1790
[ 22.232789]        try_to_free_pages+0x2bc/0x750
[ 22.233065]        __alloc_pages_slowpath.constprop.0+0x812/0x21e0
[ 22.233528]        __alloc_pages_noprof+0x5d4/0x6f0
[ 22.233861]        alloc_pages_mpol_noprof+0x30a/0x580
[ 22.234233]        alloc_pages_noprof+0xa9/0x180
[ 22.234555]        kimage_alloc_pages+0x79/0x240
[ 22.234808]        kimage_alloc_control_pages+0x1cb/0xa60
[ 22.235119]        do_kexec_load+0x3a6/0x8e0
[ 22.235418]        __x64_sys_kexec_load+0x1cc/0x240
[ 22.235759]        x64_sys_call+0xf0f/0x20d0
[ 22.236058]        do_syscall_64+0x6d/0x140
[ 22.236343]        entry_SYSCALL_64_after_hwframe+0x76/0x7e
[ 22.236643]
[ 22.236643] other info that might help us debug this:
[ 22.236643]
[ 22.237042]  Possible unsafe locking scenario:
[ 22.237042]
[ 22.237359]        CPU0                    CPU1
[ 22.237662]        ----                    ----
[ 22.237926]   lock(fs_reclaim);
[ 22.238098]                                lock(&q->q_usage_counter(io)#25);
[ 22.238535]                                lock(fs_reclaim);
[ 22.238935]   rlock(&q->q_usage_counter(io)#25);
[ 22.239182]
[ 22.239182]  *** DEADLOCK ***
[ 22.239182]
[ 22.239521] 1 lock held by repro/735:
[ 22.239778]  #0: ffffffff872322a0 (fs_reclaim){+.+.}-{0:0}, at: __alloc_pages_slowpath.constprop.0+0x76b/0x21e0
[ 22.240320]
[ 22.240320] stack backtrace:
[ 22.240550] CPU: 1 UID: 0 PID: 735 Comm: repro Not tainted 6.12.0-rc5-next-20241029-6fb2fa9805c5 #1
[ 22.241079] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
[ 22.241780] Call Trace:
[ 22.241928]  <TASK>
[ 22.242095]  dump_stack_lvl+0xea/0x150
[ 22.242351]  dump_stack+0x19/0x20
[ 22.242534]  print_circular_bug+0x47f/0x750
[ 22.242761]  check_noncircular+0x2f4/0x3e0
[ 22.242992]  ? __pfx_check_noncircular+0x10/0x10
[ 22.243302]  ? __pfx_lock_release+0x10/0x10
[ 22.243586]  ? lockdep_lock+0xd0/0x1d0
[ 22.243865]  ? __pfx_lockdep_lock+0x10/0x10
[ 22.244103]  __lock_acquire+0x2ff8/0x5d60
[ 22.244382]  ? __kernel_text_address+0x16/0x50
[ 22.244633]  ? __pfx___lock_acquire+0x10/0x10
[ 22.244871]  ? stack_trace_save+0x97/0xd0
[ 22.245102]  lock_acquire.part.0+0x142/0x390
[ 22.245394]  ? __submit_bio+0x39f/0x550
[ 22.245646]  ? __pfx_lock_acquire.part.0+0x10/0x10
"

I hope you find it useful.

Regards,
Yi Lai

---

If you don't need the following environment to reproduce the problem or if you
already have one reproduced environment, please ignore the following information.

How to reproduce:
git clone https://gitlab.com/xupengfe/repro_vm_env.git
cd repro_vm_env
tar -xvf repro_vm_env.tar.gz
cd repro_vm_env; ./start3.sh  // it needs qemu-system-x86_64 and I used v7.1.0
  // start3.sh will load bzImage_2241ab53cbb5cdb08a6b2d4688feb13971058f65 v6.2-rc5 kernel
  // You could change the bzImage_xxx as you want
  // Maybe you need to remove line "-drive if=pflash,format=raw,readonly=on,file=./OVMF_CODE.fd \" for different qemu version
You could use below command to log in, there is no password for root.
ssh -p 10023 root@localhost

After login vm(virtual machine) successfully, you could transfer reproduced binary
to the vm by below way, and reproduce the problem in vm:
gcc -pthread -o repro repro.c
scp -P 10023 repro root@localhost:/root/

Get the bzImage for target kernel:
Please use target kconfig and copy it to kernel_src/.config
make olddefconfig
make -jx bzImage           //x should equal or less than cpu num your pc has

Fill the bzImage file into above start3.sh to load the target kernel in vm.

Tips:
If you already have qemu-system-x86_64, please ignore below info.
If you want to install qemu v7.1.0 version:
git clone https://github.com/qemu/qemu.git
cd qemu
git checkout -f v7.1.0
mkdir build
cd build
yum install -y ninja-build.x86_64
yum -y install libslirp-devel.x86_64
../configure --target-list=x86_64-softmmu --enable-kvm --enable-vnc --enable-gtk --enable-sdl --enable-usb-redir --enable-slirp
make
make install

On Fri, Oct 25, 2024 at 08:37:20AM +0800, Ming Lei wrote:
> Recently we got several deadlock report[1][2][3] caused by
> blk_mq_freeze_queue and blk_enter_queue().
...

On Wed, Oct 30, 2024 at 02:45:03PM +0800, Lai, Yi wrote:
> Hi Ming,
>
> Greetings!
>
> I used Syzkaller and found that there is possible deadlock in __submit_bio in linux-next next-20241029.
>
> After bisection and the first bad commit is:
> "
> f1be1788a32e block: model freeze & enter queue as lock for supporting lockdep
> "
...
> [ 22.222247] -> #1 (fs_reclaim){+.+.}-{0:0}:
> [ 22.222630]        lock_acquire+0x80/0xb0
> [ 22.222920]        fs_reclaim_acquire+0x116/0x160
> [ 22.223244]        __kmalloc_cache_node_noprof+0x59/0x470
> [ 22.223528]        blk_mq_init_tags+0x79/0x1a0
> [ 22.223771]        blk_mq_alloc_map_and_rqs+0x1f4/0xdd0
> [ 22.224127]        blk_mq_init_sched+0x33d/0x6d0
> [ 22.224376]        elevator_init_mq+0x2b2/0x400

It should be addressed by the following patch:

https://lore.kernel.org/linux-block/ZyEGLdg744U_xBjp@fedora/

Thanks,
Ming

On Wed, Oct 30, 2024 at 03:13:09PM +0800, Ming Lei wrote:
> On Wed, Oct 30, 2024 at 02:45:03PM +0800, Lai, Yi wrote:
> > Hi Ming,
> >
> > Greetings!
> >
> > I used Syzkaller and found that there is possible deadlock in __submit_bio in linux-next next-20241029.
> >
> > After bisection and the first bad commit is:
> > "
> > f1be1788a32e block: model freeze & enter queue as lock for supporting lockdep
> > "
...
> It should be addressed by the following patch:
>
> https://lore.kernel.org/linux-block/ZyEGLdg744U_xBjp@fedora/
>

I have applied proposed fix patch on top of next-20241029. Issue can
still be reproduced.

It seems the dependency chain is different from Marek's log and mine.

> Thanks,
> Ming
>

On Wed, Oct 30, 2024 at 4:51 PM Lai, Yi <yi1.lai@linux.intel.com> wrote:
>
> On Wed, Oct 30, 2024 at 03:13:09PM +0800, Ming Lei wrote:
> > On Wed, Oct 30, 2024 at 02:45:03PM +0800, Lai, Yi wrote:
...
> > It should be addressed by the following patch:
> >
> > https://lore.kernel.org/linux-block/ZyEGLdg744U_xBjp@fedora/
> >
>
> I have applied proposed fix patch on top of next-20241029. Issue can
> still be reproduced.
>
> It seems the dependency chain is different from Marek's log and mine.

Can you post the new log since q->q_usage_counter(io)->fs_reclaim from
blk_mq_init_sched is cut down by the patch?

Thanks,
Ming

On Wed, Oct 30, 2024 at 05:50:15PM +0800, Ming Lei wrote:
> On Wed, Oct 30, 2024 at 4:51 PM Lai, Yi <yi1.lai@linux.intel.com> wrote:
> >
> > On Wed, Oct 30, 2024 at 03:13:09PM +0800, Ming Lei wrote:
> > > On Wed, Oct 30, 2024 at 02:45:03PM +0800, Lai, Yi wrote:
> ...
> > I have applied proposed fix patch on top of next-20241029. Issue can
> > still be reproduced.
> >
> > It seems the dependency chain is different from Marek's log and mine.
>
> Can you post the new log since q->q_usage_counter(io)->fs_reclaim from
> blk_mq_init_sched is cut down by the patch?
>

New possible deadlock log after patch applied:

[ 52.485023] repro: page allocation failure: order:1, mode:0x10cc0(GFP_KERNEL|__GFP_NORETRY), nodemask=(null),cpuset=/,mems0
[ 52.486074] CPU: 1 UID: 0 PID: 635 Comm: repro Not tainted 6.12.0-rc5-next-20241029-kvm-dirty #6
[ 52.486752] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/014
[ 52.487616] Call Trace:
[ 52.487820]  <TASK>
[ 52.488001]  dump_stack_lvl+0x121/0x150
[ 52.488345]  dump_stack+0x19/0x20
[ 52.488616]  warn_alloc+0x218/0x350
[ 52.488913]  ? __pfx_warn_alloc+0x10/0x10
[ 52.489263]  ? __alloc_pages_direct_compact+0x130/0xa10
[ 52.489699]  ? __pfx___alloc_pages_direct_compact+0x10/0x10
[ 52.490151]  ? __drain_all_pages+0x27b/0x480
[ 52.490522]  __alloc_pages_slowpath.constprop.0+0x14d6/0x21e0
[ 52.491018]  ? __pfx___alloc_pages_slowpath.constprop.0+0x10/0x10
[ 52.491519]  ? lock_is_held_type+0xef/0x150
[ 52.491875]  ? __pfx_get_page_from_freelist+0x10/0x10
[ 52.492291]  ? lock_acquire+0x80/0xb0
[ 52.492619]  __alloc_pages_noprof+0x5d4/0x6f0
[ 52.492992]  ? __pfx___alloc_pages_noprof+0x10/0x10
[ 52.493405]  ? __sanitizer_cov_trace_switch+0x58/0xa0
[ 52.493830]  ? policy_nodemask+0xf9/0x450
[ 52.494169]  alloc_pages_mpol_noprof+0x30a/0x580
[ 52.494561]  ? __pfx_alloc_pages_mpol_noprof+0x10/0x10
[ 52.494982]  ? sysvec_apic_timer_interrupt+0x6a/0xd0
[ 52.495396]  ? asm_sysvec_apic_timer_interrupt+0x1f/0x30
[ 52.495845]  alloc_pages_noprof+0xa9/0x180
[ 52.496201]  kimage_alloc_pages+0x79/0x240
[ 52.496558]  kimage_alloc_control_pages+0x1cb/0xa60
[ 52.496982]  ? __pfx_kimage_alloc_control_pages+0x10/0x10
[ 52.497437]  ? __sanitizer_cov_trace_const_cmp1+0x1e/0x30
[ 52.497897]  do_kexec_load+0x3a6/0x8e0
[ 52.498228]  ? __pfx_do_kexec_load+0x10/0x10
[ 52.498593]  ? __sanitizer_cov_trace_const_cmp8+0x1c/0x30
[ 52.499035]  ? _copy_from_user+0xb6/0xf0
[ 52.499371]  __x64_sys_kexec_load+0x1cc/0x240
[ 52.499740]  x64_sys_call+0xf0f/0x20d0
[ 52.500055]  do_syscall_64+0x6d/0x140
[ 52.500367]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[ 52.500778] RIP: 0033:0x7f310423ee5d
[ 52.501077] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c88
[ 52.502494] RSP: 002b:00007fffcecca558 EFLAGS: 00000207 ORIG_RAX: 00000000000000f6
[ 52.503087] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f310423ee5d
[ 52.503644] RDX: 0000000020000040 RSI: 0000000000000009 RDI: 0000000000000000
[ 52.504198] RBP: 00007fffcecca560 R08: 00007fffcecc9fd0 R09: 00007fffcecca590
[ 52.504767] R10: 0000000000000000 R11: 0000000000000207 R12: 00007fffcecca6d8
[ 52.505345] R13: 0000000000401c72 R14: 0000000000403e08 R15: 00007f3104469000
[ 52.505949]  </TASK>
[ 52.506239] Mem-Info:
[ 52.506449] active_anon:119 inactive_anon:14010 isolated_anon:0
[ 52.506449]  active_file:17895 inactive_file:87 isolated_file:0
[ 52.506449]  unevictable:0 dirty:15 writeback:0
[ 52.506449]  slab_reclaimable:6957 slab_unreclaimable:20220
[ 52.506449]  mapped:11598 shmem:1150 pagetables:766
[ 52.506449]  sec_pagetables:0 bounce:0
[ 52.506449]  kernel_misc_reclaimable:0
[ 52.506449]  free:13776 free_pcp:99 free_cma:0
[ 52.509456] Node 0 active_anon:476kB inactive_anon:56040kB active_file:71580kB inactive_file:348kB unevictable:0kB isolateo
[ 52.511881] Node 0 DMA free:440kB boost:0kB min:440kB low:548kB high:656kB reserved_highatomic:0KB active_anon:0kB inactivB
[ 52.513883] lowmem_reserve[]: 0 1507 0 0 0
[ 52.514269] Node 0 DMA32 free:54664kB boost:0kB min:44612kB low:55764kB high:66916kB reserved_highatomic:0KB active_anon:4B
[ 52.516485] lowmem_reserve[]: 0 0 0 0 0
[ 52.516831] Node 0 DMA: 0*4kB 1*8kB (U) 1*16kB (U) 1*32kB (U) 0*64kB 1*128kB (U) 1*256kB (U) 0*512kB 0*1024kB 0*2048kB 0*4B
[ 52.517895] Node 0 DMA32: 2970*4kB (UME) 1123*8kB (UME) 532*16kB (UME) 280*32kB (UM) 126*64kB (UM) 27*128kB (UME) 9*256kB B
[ 52.519279] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=1048576kB
[ 52.519387]
[ 52.519971] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
[ 52.520138] ======================================================
[ 52.520805] 19113 total pagecache pages
[ 52.521702] WARNING: possible circular locking dependency detected
[ 52.522016] 0 pages in swap cache
[ 52.522022] Free swap  = 124984kB
[ 52.522027] Total swap = 124996kB
[ 52.523720] 6.12.0-rc5-next-20241029-kvm-dirty #6 Not tainted
[ 52.523741] ------------------------------------------------------
[ 52.524059] 524158 pages RAM
[ 52.525050] kswapd0/56 is trying to acquire lock:
[ 52.525452] 0 pages HighMem/MovableOnly
[ 52.525461] 129765 pages reserved
[ 52.525465] 0 pages cma reserved
[ 52.525469] 0 pages hwpoisoned
[ 52.527163] ffff8880104374e8 (&q->q_usage_counter(io)#25){++++}-{0:0}, at: __submit_bio+0x39f/0x550
[ 52.532396]
[ 52.532396] but task is already holding lock:
[ 52.533293] ffffffff872322a0 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0xb0f/0x1500
[ 52.534508]
[ 52.534508] which lock already depends on the new lock.
[ 52.534508]
[ 52.535723]
[ 52.535723] the existing dependency chain (in reverse order) is:
[ 52.536818]
[ 52.536818] -> #2 (fs_reclaim){+.+.}-{0:0}:
[ 52.537705]        lock_acquire+0x80/0xb0
[ 52.538337]        fs_reclaim_acquire+0x116/0x160
[ 52.539076]        blk_mq_alloc_and_init_hctx+0x4df/0x1200
[ 52.539906]        blk_mq_realloc_hw_ctxs+0x4cf/0x610
[ 52.540676]        blk_mq_init_allocated_queue+0x3da/0x11b0
[ 52.541547]        blk_mq_alloc_queue+0x22c/0x300
[ 52.542279]        __blk_mq_alloc_disk+0x34/0x100
[ 52.543011]        loop_add+0x4c9/0xbd0
[ 52.543622]        loop_init+0x133/0x1a0
[ 52.544248]        do_one_initcall+0x114/0x5d0
[ 52.544954]        kernel_init_freeable+0xab0/0xeb0
[ 52.545732]        kernel_init+0x28/0x2f0
[ 52.546366]        ret_from_fork+0x56/0x90
[ 52.547009]        ret_from_fork_asm+0x1a/0x30
[ 52.547698]
[ 52.547698] -> #1 (&q->sysfs_lock){+.+.}-{4:4}:
[ 52.548625]        lock_acquire+0x80/0xb0
[ 52.549276]        __mutex_lock+0x17c/0x1540
[ 52.549958]        mutex_lock_nested+0x1f/0x30
[ 52.550664]        queue_attr_store+0xea/0x180
[ 52.551360]        sysfs_kf_write+0x11f/0x180
[ 52.552036]        kernfs_fop_write_iter+0x40e/0x630
[ 52.552808]        vfs_write+0xc59/0x1140
[ 52.553446]        ksys_write+0x14f/0x290
[ 52.554068]        __x64_sys_write+0x7b/0xc0
[ 52.554728]        x64_sys_call+0x1685/0x20d0
[ 52.555397]        do_syscall_64+0x6d/0x140
[ 52.556029]        entry_SYSCALL_64_after_hwframe+0x76/0x7e
[ 52.556865]
[ 52.556865] -> #0 (&q->q_usage_counter(io)#25){++++}-{0:0}:
[ 52.557963]        __lock_acquire+0x2ff8/0x5d60
[ 52.558667]        lock_acquire.part.0+0x142/0x390
[ 52.559427]        lock_acquire+0x80/0xb0
[ 52.560057]        blk_mq_submit_bio+0x1cbe/0x2590
[ 52.560801]        __submit_bio+0x39f/0x550
[ 52.561473]        submit_bio_noacct_nocheck+0x647/0xcc0
[ 52.562285]        submit_bio_noacct+0x620/0x1e00
[ 52.563017]        submit_bio+0xce/0x480
[ 52.563637]        __swap_writepage+0x2f1/0xdf0
[ 52.564349]        swap_writepage+0x464/0xbc0
[ 52.565022]        shmem_writepage+0xdeb/0x1340
[ 52.565745]        pageout+0x3bc/0x9b0
[ 52.566353]        shrink_folio_list+0x16b9/0x3b60
[ 52.567104]        shrink_lruvec+0xd78/0x2790
[ 52.567794]        shrink_node+0xb29/0x2870
[ 52.568454]        balance_pgdat+0x9c2/0x1500
[ 52.569142]        kswapd+0x765/0xe00
[ 52.569741]        kthread+0x35a/0x470
[ 52.570340]        ret_from_fork+0x56/0x90
[ 52.570993]        ret_from_fork_asm+0x1a/0x30
[ 52.571696]
[ 52.571696] other info that might help us debug this:
[ 52.571696]
[ 52.572904] Chain exists of:
[ 52.572904]   &q->q_usage_counter(io)#25 --> &q->sysfs_lock --> fs_reclaim
[ 52.572904]
[ 52.574631]  Possible unsafe locking scenario:
[ 52.574631]
[ 52.575547]        CPU0                    CPU1
[ 52.576246]        ----                    ----
[ 52.576942]   lock(fs_reclaim);
[ 52.577467]                                lock(&q->sysfs_lock);
[ 52.578382]                                lock(fs_reclaim);
[ 52.579250]   rlock(&q->q_usage_counter(io)#25);
[ 52.579974]
[ 52.579974]  *** DEADLOCK ***
[ 52.579974]
[ 52.580866] 1 lock held by kswapd0/56:
[ 52.581459]  #0: ffffffff872322a0 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0xb0f/0x1500
[ 52.582731]
[ 52.582731] stack backtrace:
[ 52.583404] CPU: 0 UID: 0 PID: 56 Comm: kswapd0 Not tainted 6.12.0-rc5-next-20241029-kvm-dirty #6
[ 52.584735] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/014
[ 52.586439] Call Trace:
[ 52.586836]  <TASK>
[ 52.587190]  dump_stack_lvl+0xea/0x150
[ 52.587753]  dump_stack+0x19/0x20
[ 52.588253]  print_circular_bug+0x47f/0x750
[ 52.588872]  check_noncircular+0x2f4/0x3e0
[ 52.589492]  ? __pfx_check_noncircular+0x10/0x10
[ 52.590180]  ? lockdep_lock+0xd0/0x1d0
[ 52.590741]  ? __pfx_lockdep_lock+0x10/0x10
[ 52.590790] kexec: Could not allocate control_code_buffer
[ 52.591341]  __lock_acquire+0x2ff8/0x5d60
[ 52.592365]  ? __pfx___lock_acquire+0x10/0x10
[ 52.593087]  ? __pfx_mark_lock.part.0+0x10/0x10
[ 52.593753]  ? __kasan_check_read+0x15/0x20
[ 52.594366]  lock_acquire.part.0+0x142/0x390
[ 52.594989]  ? __submit_bio+0x39f/0x550
[ 52.595554]  ? __pfx_lock_acquire.part.0+0x10/0x10
[ 52.596246]  ? debug_smp_processor_id+0x20/0x30
[ 52.596900]  ? rcu_is_watching+0x19/0xc0
[ 52.597484]  ? trace_lock_acquire+0x139/0x1b0
[ 52.598118]  lock_acquire+0x80/0xb0
[ 52.598633]  ? __submit_bio+0x39f/0x550
[ 52.599191]  blk_mq_submit_bio+0x1cbe/0x2590
[ 52.599805]  ? __submit_bio+0x39f/0x550
[ 52.600361]  ? __kasan_check_read+0x15/0x20
[ 52.600966]  ? __pfx_blk_mq_submit_bio+0x10/0x10
[ 52.601632]  ? __pfx_mark_lock.part.0+0x10/0x10
[ 52.602285]  ? __this_cpu_preempt_check+0x21/0x30
[ 52.602968]  ? __this_cpu_preempt_check+0x21/0x30
[ 52.603646]  ? lock_release+0x441/0x870
[ 52.604207]  __submit_bio+0x39f/0x550
[ 52.604742]  ? __pfx___submit_bio+0x10/0x10
[ 52.605364]  ? __this_cpu_preempt_check+0x21/0x30
[ 52.606045]  ? seqcount_lockdep_reader_access.constprop.0+0xb4/0xd0
[ 52.606940]  ? __sanitizer_cov_trace_const_cmp1+0x1e/0x30
[ 52.607707]  ? kvm_clock_get_cycles+0x43/0x70
[ 52.608345]  submit_bio_noacct_nocheck+0x647/0xcc0
[ 52.609045]  ? __pfx_submit_bio_noacct_nocheck+0x10/0x10
[ 52.609820]  ? __sanitizer_cov_trace_switch+0x58/0xa0
[ 52.610552]  submit_bio_noacct+0x620/0x1e00
[ 52.611167]  submit_bio+0xce/0x480
[ 52.611677]  __swap_writepage+0x2f1/0xdf0
[ 52.612267]  swap_writepage+0x464/0xbc0
[ 52.612837]  shmem_writepage+0xdeb/0x1340
[ 52.613441]  ? __pfx_shmem_writepage+0x10/0x10
[ 52.614090]  ? __kasan_check_write+0x18/0x20
[ 52.614716]  ? folio_clear_dirty_for_io+0xc1/0x600
[ 52.615403]  pageout+0x3bc/0x9b0
[ 52.615894]  ? __pfx_pageout+0x10/0x10
[ 52.616471]  ? __pfx_folio_referenced_one+0x10/0x10
[ 52.617169]  ? __pfx_folio_lock_anon_vma_read+0x10/0x10
[ 52.617918]  ? __pfx_invalid_folio_referenced_vma+0x10/0x10
[ 52.618713]  shrink_folio_list+0x16b9/0x3b60
[ 52.619346]  ? __pfx_shrink_folio_list+0x10/0x10
[ 52.620021]  ? __this_cpu_preempt_check+0x21/0x30
[ 52.620713]  ? mark_lock.part.0+0xf3/0x17b0
[ 52.621339]  ? isolate_lru_folios+0xcb1/0x1250
[ 52.621991]  ? __pfx_mark_lock.part.0+0x10/0x10
[ 52.622655]  ? __this_cpu_preempt_check+0x21/0x30
[ 52.623335]  ? lock_release+0x441/0x870
[ 52.623900]  ? __this_cpu_preempt_check+0x21/0x30
[ 52.624573]  ? _raw_spin_unlock_irq+0x2c/0x60
[ 52.625204]  ? lockdep_hardirqs_on+0x89/0x110
[ 52.625848]  shrink_lruvec+0xd78/0x2790
[ 52.626422]  ? __pfx_shrink_lruvec+0x10/0x10
[ 52.627040]  ? __this_cpu_preempt_check+0x21/0x30
[ 52.627729]  ? __this_cpu_preempt_check+0x21/0x30
[ 52.628423]  ? trace_lock_acquire+0x139/0x1b0
[ 52.629061]  ? trace_lock_acquire+0x139/0x1b0
[ 52.629752]  shrink_node+0xb29/0x2870
[ 52.630305]  ? __pfx_shrink_node+0x10/0x10
[ 52.630899]  ? pgdat_balanced+0x1d4/0x230
[ 52.631490]  balance_pgdat+0x9c2/0x1500
[ 52.632055]  ? __pfx_balance_pgdat+0x10/0x10
[ 52.632669]  ? __this_cpu_preempt_check+0x21/0x30
[ 52.633380]  kswapd+0x765/0xe00
[ 52.633861]  ? __pfx_kswapd+0x10/0x10
[ 52.634393]  ? local_clock_noinstr+0xb0/0xd0
[ 52.635015]  ? __pfx_autoremove_wake_function+0x10/0x10
[ 52.635759]  ? __sanitizer_cov_trace_const_cmp1+0x1e/0x30
[ 52.636525]  ? __kthread_parkme+0x15d/0x230
[ 52.637134]  kthread+0x35a/0x470
[ 52.637616]  ? __pfx_kswapd+0x10/0x10
[ 52.638146]  ? __pfx_kthread+0x10/0x10
[ 52.638693]  ret_from_fork+0x56/0x90
[ 52.639227]  ? __pfx_kthread+0x10/0x10
[ 52.639778]  ret_from_fork_asm+0x1a/0x30
[ 52.640391]  </TASK>

> Thanks,
> Ming
>

On Wed, Oct 30, 2024 at 06:39:13PM +0800, Lai, Yi wrote:
> On Wed, Oct 30, 2024 at 05:50:15PM +0800, Ming Lei wrote:
> > On Wed, Oct 30, 2024 at 4:51 PM Lai, Yi <yi1.lai@linux.intel.com> wrote:
> > > On Wed, Oct 30, 2024 at 03:13:09PM +0800, Ming Lei wrote:
> > > > On Wed, Oct 30, 2024 at 02:45:03PM +0800, Lai, Yi wrote:
...
> > Can you post the new log since q->q_usage_counter(io)->fs_reclaim from
> > blk_mq_init_sched is cut down by the patch?
>
> New possible deadlock log after patch applied:

This one looks like a real deadlock: any memory allocation done with
q->sysfs_lock held carries such a risk, roughly the pattern sketched
below.

There is another similar report related to the queue sysfs store
operation:

https://lore.kernel.org/linux-scsi/ZxG38G9BuFdBpBHZ@fedora/

Thanks,
Ming
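
For illustration, the risky pattern behind the chain above looks roughly
like this (a sketch based on the stacks in the report, using a hypothetical
store helper rather than the actual queue_attr_store() code):

        /* sysfs store side: adds the q->sysfs_lock -> fs_reclaim edge */
        static ssize_t queue_attr_store_sketch(struct request_queue *q,
                                               const char *page, size_t count)
        {
                void *buf;

                mutex_lock(&q->sysfs_lock);
                buf = kmalloc(PAGE_SIZE, GFP_KERNEL);   /* may enter fs_reclaim */
                if (!buf) {
                        mutex_unlock(&q->sysfs_lock);
                        return -ENOMEM;
                }
                /* ... apply the attribute ... */
                kfree(buf);
                mutex_unlock(&q->sysfs_lock);
                return count;
        }

        /*
         * Reclaim side (kswapd in the log): fs_reclaim is held while
         * swap_writepage() -> submit_bio() -> bio_queue_enter() takes
         * q->q_usage_counter(io).  Combined with the existing
         * q_usage_counter(io) -> sysfs_lock edge from the chain summary,
         * lockdep sees the cycle reported above.
         */
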
On 25.10.2024 02:37, Ming Lei wrote: > Recently we got several deadlock report[1][2][3] caused by > blk_mq_freeze_queue and blk_enter_queue(). > > Turns out the two are just like acquiring read/write lock, so model them > as read/write lock for supporting lockdep: > > 1) model q->q_usage_counter as two locks(io and queue lock) > > - queue lock covers sync with blk_enter_queue() > > - io lock covers sync with bio_enter_queue() > > 2) make the lockdep class/key as per-queue: > > - different subsystem has very different lock use pattern, shared lock > class causes false positive easily > > - freeze_queue degrades to no lock in case that disk state becomes DEAD > because bio_enter_queue() won't be blocked any more > > - freeze_queue degrades to no lock in case that request queue becomes dying > because blk_enter_queue() won't be blocked any more > > 3) model blk_mq_freeze_queue() as acquire_exclusive & try_lock > - it is exclusive lock, so dependency with blk_enter_queue() is covered > > - it is trylock because blk_mq_freeze_queue() are allowed to run > concurrently > > 4) model blk_enter_queue() & bio_enter_queue() as acquire_read() > - nested blk_enter_queue() are allowed > > - dependency with blk_mq_freeze_queue() is covered > > - blk_queue_exit() is often called from other contexts(such as irq), and > it can't be annotated as lock_release(), so simply do it in > blk_enter_queue(), this way still covered cases as many as possible > > With lockdep support, such kind of reports may be reported asap and > needn't wait until the real deadlock is triggered. > > For example, lockdep report can be triggered in the report[3] with this > patch applied. > > [1] occasional block layer hang when setting 'echo noop > /sys/block/sda/queue/scheduler' > https://bugzilla.kernel.org/show_bug.cgi?id=219166 > > [2] del_gendisk() vs blk_queue_enter() race condition > https://lore.kernel.org/linux-block/20241003085610.GK11458@google.com/ > > [3] queue_freeze & queue_enter deadlock in scsi > https://lore.kernel.org/linux-block/ZxG38G9BuFdBpBHZ@fedora/T/#u > > Reviewed-by: Christoph Hellwig <hch@lst.de> > Signed-off-by: Ming Lei <ming.lei@redhat.com> This patch landed yesterday in linux-next as commit f1be1788a32e ("block: model freeze & enter queue as lock for supporting lockdep"). In my tests I found that it introduces the following 2 lockdep warnings: 1. On Samsung Exynos 4412-based Odroid U3 board (ARM 32bit), observed when booting it: ====================================================== WARNING: possible circular locking dependency detected 6.12.0-rc4-00037-gf1be1788a32e #9290 Not tainted ------------------------------------------------------ find/1284 is trying to acquire lock: cf3b8534 (&mm->mmap_lock){++++}-{3:3}, at: __might_fault+0x30/0x70 but task is already holding lock: c203a0c8 (&sb->s_type->i_mutex_key#2){++++}-{3:3}, at: iterate_dir+0x30/0x140 which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #4 (&sb->s_type->i_mutex_key#2){++++}-{3:3}: down_write+0x44/0xc4 start_creating+0x8c/0x170 debugfs_create_dir+0x1c/0x178 blk_register_queue+0xa0/0x1c0 add_disk_fwnode+0x210/0x434 brd_alloc+0x1cc/0x210 brd_init+0xac/0x104 do_one_initcall+0x64/0x30c kernel_init_freeable+0x1c4/0x228 kernel_init+0x1c/0x12c ret_from_fork+0x14/0x28 -> #3 (&q->debugfs_mutex){+.+.}-{3:3}: __mutex_lock+0x94/0x94c mutex_lock_nested+0x1c/0x24 blk_mq_init_sched+0x140/0x204 elevator_init_mq+0xb8/0x130 add_disk_fwnode+0x3c/0x434 mmc_blk_alloc_req+0x34c/0x464 mmc_blk_probe+0x1f4/0x6f8 really_probe+0xe0/0x3d8 __driver_probe_device+0x9c/0x1e4 driver_probe_device+0x30/0xc0 __device_attach_driver+0xa8/0x120 bus_for_each_drv+0x80/0xcc __device_attach+0xac/0x1fc bus_probe_device+0x8c/0x90 device_add+0x5a4/0x7cc mmc_add_card+0x118/0x2c8 mmc_attach_mmc+0xd8/0x174 mmc_rescan+0x2ec/0x3a8 process_one_work+0x240/0x6d0 worker_thread+0x1a0/0x398 kthread+0x104/0x138 ret_from_fork+0x14/0x28 -> #2 (&q->q_usage_counter(io)#17){++++}-{0:0}: blk_mq_submit_bio+0x8dc/0xb34 __submit_bio+0x78/0x148 submit_bio_noacct_nocheck+0x204/0x32c ext4_mpage_readpages+0x558/0x7b0 read_pages+0x64/0x28c page_cache_ra_unbounded+0x118/0x1bc filemap_get_pages+0x124/0x7ec filemap_read+0x174/0x5b0 __kernel_read+0x128/0x2c0 bprm_execve+0x230/0x7a4 kernel_execve+0xec/0x194 try_to_run_init_process+0xc/0x38 kernel_init+0xdc/0x12c ret_from_fork+0x14/0x28 -> #1 (mapping.invalidate_lock#2){++++}-{3:3}: down_read+0x44/0x224 filemap_fault+0x648/0x10f0 __do_fault+0x38/0xd4 handle_mm_fault+0xaf8/0x14d0 do_page_fault+0xe0/0x5c8 do_DataAbort+0x3c/0xb0 __dabt_svc+0x50/0x80 __clear_user_std+0x34/0x68 elf_load+0x1a8/0x208 load_elf_binary+0x3f4/0x13cc bprm_execve+0x28c/0x7a4 do_execveat_common+0x150/0x198 sys_execve+0x30/0x38 ret_fast_syscall+0x0/0x1c -> #0 (&mm->mmap_lock){++++}-{3:3}: __lock_acquire+0x158c/0x2970 lock_acquire+0x130/0x384 __might_fault+0x50/0x70 filldir64+0x94/0x28c dcache_readdir+0x174/0x260 iterate_dir+0x64/0x140 sys_getdents64+0x60/0x130 ret_fast_syscall+0x0/0x1c other info that might help us debug this: Chain exists of: &mm->mmap_lock --> &q->debugfs_mutex --> &sb->s_type->i_mutex_key#2 Possible unsafe locking scenario: CPU0 CPU1 ---- ---- rlock(&sb->s_type->i_mutex_key#2); lock(&q->debugfs_mutex); lock(&sb->s_type->i_mutex_key#2); rlock(&mm->mmap_lock); *** DEADLOCK *** 2 locks held by find/1284: #0: c3df1e88 (&f->f_pos_lock){+.+.}-{3:3}, at: fdget_pos+0x88/0xd0 #1: c203a0c8 (&sb->s_type->i_mutex_key#2){++++}-{3:3}, at: iterate_dir+0x30/0x140 stack backtrace: CPU: 1 UID: 0 PID: 1284 Comm: find Not tainted 6.12.0-rc4-00037-gf1be1788a32e #9290 Hardware name: Samsung Exynos (Flattened Device Tree) Call trace: unwind_backtrace from show_stack+0x10/0x14 show_stack from dump_stack_lvl+0x68/0x88 dump_stack_lvl from print_circular_bug+0x31c/0x394 print_circular_bug from check_noncircular+0x16c/0x184 check_noncircular from __lock_acquire+0x158c/0x2970 __lock_acquire from lock_acquire+0x130/0x384 lock_acquire from __might_fault+0x50/0x70 __might_fault from filldir64+0x94/0x28c filldir64 from dcache_readdir+0x174/0x260 dcache_readdir from iterate_dir+0x64/0x140 iterate_dir from sys_getdents64+0x60/0x130 sys_getdents64 from ret_fast_syscall+0x0/0x1c Exception stack(0xf22b5fa8 to 0xf22b5ff0) 5fa0: 004b4fa0 004b4f80 00000004 004b4fa0 00008000 00000000 5fc0: 004b4fa0 004b4f80 00000001 000000d9 00000000 004b4af0 00000000 000010ea 5fe0: 004b1eb4 bea05af0 b6da4b08 b6da4a28 --->8--- 2. 
On QEMU's ARM64 virt machine, observed during system suspend/resume cycle: # time rtcwake -s10 -mmem rtcwake: wakeup from "mem" using /dev/rtc0 at Tue Oct 29 11:54:30 2024 PM: suspend entry (s2idle) Filesystems sync: 0.004 seconds Freezing user space processes Freezing user space processes completed (elapsed 0.007 seconds) OOM killer disabled. Freezing remaining freezable tasks Freezing remaining freezable tasks completed (elapsed 0.004 seconds) ====================================================== WARNING: possible circular locking dependency detected 6.12.0-rc4+ #9291 Not tainted ------------------------------------------------------ rtcwake/1299 is trying to acquire lock: ffff80008358a7f8 (rtnl_mutex){+.+.}-{3:3}, at: rtnl_lock+0x1c/0x28 but task is already holding lock: ffff000006136d68 (&q->q_usage_counter(io)#5){++++}-{0:0}, at: virtblk_freeze+0x24/0x60 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #3 (&q->q_usage_counter(io)#5){++++}-{0:0}: blk_mq_submit_bio+0x7c0/0x9d8 __submit_bio+0x74/0x164 submit_bio_noacct_nocheck+0x2d4/0x3b4 submit_bio_noacct+0x148/0x3fc submit_bio+0x130/0x204 submit_bh_wbc+0x148/0x1bc submit_bh+0x18/0x24 ext4_read_bh_nowait+0x70/0x100 ext4_sb_breadahead_unmovable+0x50/0x80 __ext4_get_inode_loc+0x354/0x654 ext4_get_inode_loc+0x40/0xa8 ext4_reserve_inode_write+0x40/0xf0 __ext4_mark_inode_dirty+0x88/0x300 ext4_ext_tree_init+0x40/0x4c __ext4_new_inode+0x99c/0x1614 ext4_create+0xe4/0x1d4 lookup_open.isra.0+0x47c/0x540 path_openat+0x370/0x9e8 do_filp_open+0x80/0x130 do_sys_openat2+0xb4/0xe8 __arm64_compat_sys_openat+0x5c/0xa4 invoke_syscall+0x48/0x110 el0_svc_common.constprop.0+0x40/0xe8 do_el0_svc_compat+0x20/0x3c el0_svc_compat+0x44/0xe0 el0t_32_sync_handler+0x98/0x148 el0t_32_sync+0x194/0x198 -> #2 (jbd2_handle){++++}-{0:0}: start_this_handle+0x178/0x4e8 jbd2__journal_start+0x110/0x298 __ext4_journal_start_sb+0x9c/0x274 ext4_dirty_inode+0x38/0x88 __mark_inode_dirty+0x90/0x568 generic_update_time+0x50/0x64 touch_atime+0x2c0/0x324 ext4_file_mmap+0x68/0x88 mmap_region+0x448/0xa38 do_mmap+0x3dc/0x540 vm_mmap_pgoff+0xf8/0x1b4 ksys_mmap_pgoff+0x148/0x1f0 __arm64_compat_sys_aarch32_mmap2+0x20/0x2c invoke_syscall+0x48/0x110 el0_svc_common.constprop.0+0x40/0xe8 do_el0_svc_compat+0x20/0x3c el0_svc_compat+0x44/0xe0 el0t_32_sync_handler+0x98/0x148 el0t_32_sync+0x194/0x198 -> #1 (&mm->mmap_lock){++++}-{3:3}: __might_fault+0x5c/0x80 inet_gifconf+0xcc/0x278 dev_ifconf+0xc4/0x1f8 sock_ioctl+0x234/0x384 compat_sock_ioctl+0x180/0x35c __arm64_compat_sys_ioctl+0x14c/0x16c invoke_syscall+0x48/0x110 el0_svc_common.constprop.0+0x40/0xe8 do_el0_svc_compat+0x20/0x3c el0_svc_compat+0x44/0xe0 el0t_32_sync_handler+0x98/0x148 el0t_32_sync+0x194/0x198 -> #0 (rtnl_mutex){+.+.}-{3:3}: __lock_acquire+0x1374/0x2224 lock_acquire+0x200/0x340 __mutex_lock+0x98/0x428 mutex_lock_nested+0x24/0x30 rtnl_lock+0x1c/0x28 virtnet_freeze_down.isra.0+0x20/0x9c virtnet_freeze+0x44/0x60 virtio_device_freeze+0x68/0x94 virtio_mmio_freeze+0x14/0x20 platform_pm_suspend+0x2c/0x6c dpm_run_callback+0xa4/0x218 device_suspend+0x12c/0x52c dpm_suspend+0x10c/0x2e4 dpm_suspend_start+0x70/0x78 suspend_devices_and_enter+0x130/0x798 pm_suspend+0x1ac/0x2e8 state_store+0x8c/0x110 kobj_attr_store+0x18/0x2c sysfs_kf_write+0x4c/0x78 kernfs_fop_write_iter+0x120/0x1b4 vfs_write+0x2b0/0x35c ksys_write+0x68/0xf4 __arm64_sys_write+0x1c/0x28 invoke_syscall+0x48/0x110 el0_svc_common.constprop.0+0x40/0xe8 do_el0_svc_compat+0x20/0x3c el0_svc_compat+0x44/0xe0 
       el0t_32_sync_handler+0x98/0x148
       el0t_32_sync+0x194/0x198

other info that might help us debug this:

Chain exists of:
  rtnl_mutex --> jbd2_handle --> &q->q_usage_counter(io)#5

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(&q->q_usage_counter(io)#5);
                               lock(jbd2_handle);
                               lock(&q->q_usage_counter(io)#5);
  lock(rtnl_mutex);

 *** DEADLOCK ***

9 locks held by rtcwake/1299:
 #0: ffff0000069103f8 (sb_writers#7){.+.+}-{0:0}, at: vfs_write+0x1e4/0x35c
 #1: ffff000007c0ae88 (&of->mutex#2){+.+.}-{3:3}, at: kernfs_fop_write_iter+0xf0/0x1b4
 #2: ffff0000046906e8 (kn->active#24){.+.+}-{0:0}, at: kernfs_fop_write_iter+0xf8/0x1b4
 #3: ffff800082fd9908 (system_transition_mutex){+.+.}-{3:3}, at: pm_suspend+0x88/0x2e8
 #4: ffff000006137678 (&q->q_usage_counter(io)#6){++++}-{0:0}, at: virtblk_freeze+0x24/0x60
 #5: ffff0000061376b0 (&q->q_usage_counter(queue)#2){++++}-{0:0}, at: virtblk_freeze+0x24/0x60
 #6: ffff000006136d68 (&q->q_usage_counter(io)#5){++++}-{0:0}, at: virtblk_freeze+0x24/0x60
 #7: ffff000006136da0 (&q->q_usage_counter(queue)){++++}-{0:0}, at: virtblk_freeze+0x24/0x60
 #8: ffff0000056208f8 (&dev->mutex){....}-{3:3}, at: device_suspend+0xf8/0x52c

stack backtrace:
CPU: 0 UID: 0 PID: 1299 Comm: rtcwake Not tainted 6.12.0-rc4+ #9291
Hardware name: linux,dummy-virt (DT)
Call trace:
 dump_backtrace+0x94/0xec
 show_stack+0x18/0x24
 dump_stack_lvl+0x90/0xd0
 dump_stack+0x18/0x24
 print_circular_bug+0x298/0x37c
 check_noncircular+0x15c/0x170
 __lock_acquire+0x1374/0x2224
 lock_acquire+0x200/0x340
 __mutex_lock+0x98/0x428
 mutex_lock_nested+0x24/0x30
 rtnl_lock+0x1c/0x28
 virtnet_freeze_down.isra.0+0x20/0x9c
 virtnet_freeze+0x44/0x60
 virtio_device_freeze+0x68/0x94
 virtio_mmio_freeze+0x14/0x20
 platform_pm_suspend+0x2c/0x6c
 dpm_run_callback+0xa4/0x218
 device_suspend+0x12c/0x52c
 dpm_suspend+0x10c/0x2e4
 dpm_suspend_start+0x70/0x78
 suspend_devices_and_enter+0x130/0x798
 pm_suspend+0x1ac/0x2e8
 state_store+0x8c/0x110
 kobj_attr_store+0x18/0x2c
 sysfs_kf_write+0x4c/0x78
 kernfs_fop_write_iter+0x120/0x1b4
 vfs_write+0x2b0/0x35c
 ksys_write+0x68/0xf4
 __arm64_sys_write+0x1c/0x28
 invoke_syscall+0x48/0x110
 el0_svc_common.constprop.0+0x40/0xe8
 do_el0_svc_compat+0x20/0x3c
 el0_svc_compat+0x44/0xe0
 el0t_32_sync_handler+0x98/0x148
 el0t_32_sync+0x194/0x198
virtio_blk virtio2: 1/0/0 default/read/poll queues
virtio_blk virtio3: 1/0/0 default/read/poll queues
OOM killer enabled.
Restarting tasks ... done.
random: crng reseeded on system resumption
PM: suspend exit

--->8---

Let me know if you need more information about my test systems.

> ---
>   block/blk-core.c       | 18 ++++++++++++++++--
>   block/blk-mq.c         | 26 ++++++++++++++++++++++----
>   block/blk.h            | 29 ++++++++++++++++++++++++++---
>   block/genhd.c          | 15 +++++++++++----
>   include/linux/blkdev.h |  6 ++++++
>   5 files changed, 81 insertions(+), 13 deletions(-)
> ...

Best regards
-- 
Marek Szyprowski, PhD
Samsung R&D Institute Poland
On Tue, Oct 29, 2024 at 12:13:35PM +0100, Marek Szyprowski wrote: > On 25.10.2024 02:37, Ming Lei wrote: > > Recently we got several deadlock report[1][2][3] caused by > > blk_mq_freeze_queue and blk_enter_queue(). > > > > Turns out the two are just like acquiring read/write lock, so model them > > as read/write lock for supporting lockdep: > > > > 1) model q->q_usage_counter as two locks(io and queue lock) > > > > - queue lock covers sync with blk_enter_queue() > > > > - io lock covers sync with bio_enter_queue() > > > > 2) make the lockdep class/key as per-queue: > > > > - different subsystem has very different lock use pattern, shared lock > > class causes false positive easily > > > > - freeze_queue degrades to no lock in case that disk state becomes DEAD > > because bio_enter_queue() won't be blocked any more > > > > - freeze_queue degrades to no lock in case that request queue becomes dying > > because blk_enter_queue() won't be blocked any more > > > > 3) model blk_mq_freeze_queue() as acquire_exclusive & try_lock > > - it is exclusive lock, so dependency with blk_enter_queue() is covered > > > > - it is trylock because blk_mq_freeze_queue() are allowed to run > > concurrently > > > > 4) model blk_enter_queue() & bio_enter_queue() as acquire_read() > > - nested blk_enter_queue() are allowed > > > > - dependency with blk_mq_freeze_queue() is covered > > > > - blk_queue_exit() is often called from other contexts(such as irq), and > > it can't be annotated as lock_release(), so simply do it in > > blk_enter_queue(), this way still covered cases as many as possible > > > > With lockdep support, such kind of reports may be reported asap and > > needn't wait until the real deadlock is triggered. > > > > For example, lockdep report can be triggered in the report[3] with this > > patch applied. > > > > [1] occasional block layer hang when setting 'echo noop > /sys/block/sda/queue/scheduler' > > https://bugzilla.kernel.org/show_bug.cgi?id=219166 > > > > [2] del_gendisk() vs blk_queue_enter() race condition > > https://lore.kernel.org/linux-block/20241003085610.GK11458@google.com/ > > > > [3] queue_freeze & queue_enter deadlock in scsi > > https://lore.kernel.org/linux-block/ZxG38G9BuFdBpBHZ@fedora/T/#u > > > > Reviewed-by: Christoph Hellwig <hch@lst.de> > > Signed-off-by: Ming Lei <ming.lei@redhat.com> > > This patch landed yesterday in linux-next as commit f1be1788a32e > ("block: model freeze & enter queue as lock for supporting lockdep"). > In my tests I found that it introduces the following 2 lockdep warnings: > > 1. On Samsung Exynos 4412-based Odroid U3 board (ARM 32bit), observed > when booting it: > > ====================================================== > WARNING: possible circular locking dependency detected > 6.12.0-rc4-00037-gf1be1788a32e #9290 Not tainted > ------------------------------------------------------ > find/1284 is trying to acquire lock: > cf3b8534 (&mm->mmap_lock){++++}-{3:3}, at: __might_fault+0x30/0x70 > > but task is already holding lock: > c203a0c8 (&sb->s_type->i_mutex_key#2){++++}-{3:3}, at: > iterate_dir+0x30/0x140 > > which lock already depends on the new lock. 
> 
> 
> the existing dependency chain (in reverse order) is:
> 
> -> #4 (&sb->s_type->i_mutex_key#2){++++}-{3:3}:
>        down_write+0x44/0xc4
>        start_creating+0x8c/0x170
>        debugfs_create_dir+0x1c/0x178
>        blk_register_queue+0xa0/0x1c0
>        add_disk_fwnode+0x210/0x434
>        brd_alloc+0x1cc/0x210
>        brd_init+0xac/0x104
>        do_one_initcall+0x64/0x30c
>        kernel_init_freeable+0x1c4/0x228
>        kernel_init+0x1c/0x12c
>        ret_from_fork+0x14/0x28
> 
> -> #3 (&q->debugfs_mutex){+.+.}-{3:3}:
>        __mutex_lock+0x94/0x94c
>        mutex_lock_nested+0x1c/0x24
>        blk_mq_init_sched+0x140/0x204
>        elevator_init_mq+0xb8/0x130
>        add_disk_fwnode+0x3c/0x434

The above chain can be cut by the following patch, because the disk state
can be treated as DEAD in add_disk(). Can you test it?

diff --git a/block/elevator.c b/block/elevator.c
index 4122026b11f1..efa6ff941a25 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -600,12 +600,14 @@ void elevator_init_mq(struct request_queue *q)
 	 * requests, then no need to quiesce queue which may add long boot
 	 * latency, especially when lots of disks are involved.
 	 */
-	blk_mq_freeze_queue(q);
+	if (__blk_freeze_queue_start(q))
+		blk_freeze_acquire_lock(q, true, false);
 	blk_mq_cancel_work_sync(q);
 
 	err = blk_mq_init_sched(q, e);
 
-	blk_mq_unfreeze_queue(q);
+	if (__blk_mq_unfreeze_queue(q, false))
+		blk_unfreeze_release_lock(q, true, false);
 
 	if (err) {
 		pr_warn("\"%s\" elevator initialization failed, "

...

> 2 locks held by find/1284:
>   #0: c3df1e88 (&f->f_pos_lock){+.+.}-{3:3}, at: fdget_pos+0x88/0xd0
>   #1: c203a0c8 (&sb->s_type->i_mutex_key#2){++++}-{3:3}, at:
> iterate_dir+0x30/0x140
> 
> stack backtrace:
> CPU: 1 UID: 0 PID: 1284 Comm: find Not tainted
> 6.12.0-rc4-00037-gf1be1788a32e #9290
> Hardware name: Samsung Exynos (Flattened Device Tree)
> Call trace:
>   unwind_backtrace from show_stack+0x10/0x14
>   show_stack from dump_stack_lvl+0x68/0x88
>   dump_stack_lvl from print_circular_bug+0x31c/0x394
>   print_circular_bug from check_noncircular+0x16c/0x184
>   check_noncircular from __lock_acquire+0x158c/0x2970
>   __lock_acquire from lock_acquire+0x130/0x384
>   lock_acquire from __might_fault+0x50/0x70
>   __might_fault from filldir64+0x94/0x28c
>   filldir64 from dcache_readdir+0x174/0x260
>   dcache_readdir from iterate_dir+0x64/0x140
>   iterate_dir from sys_getdents64+0x60/0x130
>   sys_getdents64 from ret_fast_syscall+0x0/0x1c
> Exception stack(0xf22b5fa8 to 0xf22b5ff0)
> 5fa0: 004b4fa0 004b4f80 00000004 004b4fa0 00008000 00000000
> 5fc0: 004b4fa0 004b4f80 00000001 000000d9 00000000 004b4af0 00000000 000010ea
> 5fe0: 004b1eb4 bea05af0 b6da4b08 b6da4a28
> 
> --->8---
> 
> 
> 2. On QEMU's ARM64 virt machine, observed during system suspend/resume
> cycle:
> 
> # time rtcwake -s10 -mmem
> rtcwake: wakeup from "mem" using /dev/rtc0 at Tue Oct 29 11:54:30 2024
> PM: suspend entry (s2idle)
> Filesystems sync: 0.004 seconds
> Freezing user space processes
> Freezing user space processes completed (elapsed 0.007 seconds)
> OOM killer disabled.
> Freezing remaining freezable tasks
> Freezing remaining freezable tasks completed (elapsed 0.004 seconds)
> 
> ======================================================
> WARNING: possible circular locking dependency detected
> 6.12.0-rc4+ #9291 Not tainted
> ------------------------------------------------------
> rtcwake/1299 is trying to acquire lock:
> ffff80008358a7f8 (rtnl_mutex){+.+.}-{3:3}, at: rtnl_lock+0x1c/0x28
> 
> but task is already holding lock:
> ffff000006136d68 (&q->q_usage_counter(io)#5){++++}-{0:0}, at:
> virtblk_freeze+0x24/0x60
> 
> which lock already depends on the new lock.
> 
> 
> the existing dependency chain (in reverse order) is:

This one looks like a real thing: at least the added lockdep code works as
expected. Also, the blk_mq_freeze_queue() use in virtio-blk's ->suspend()
is questionable. I will take a further look.


Thanks,
Ming
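To make the report above easier to follow, here is a minimal sketch of the
pattern being flagged. The callback names are hypothetical and this is not
the actual virtio_blk code; it only illustrates a driver that freezes the
queue in its ->freeze() callback and unfreezes it in ->restore(), so the
q->q_usage_counter(io) "lock" is treated as held across the whole suspend
path, where other drivers' PM callbacks may take locks such as rtnl_mutex.

/*
 * Sketch only (hypothetical callbacks, not the actual virtio_blk code).
 */
static int example_blk_freeze(struct virtio_device *vdev)
{
	struct virtio_blk *vblk = vdev->priv;

	/* acquires the io "lock" in lockdep terms ... */
	blk_mq_freeze_queue(vblk->disk->queue);

	/* ... device reset and vq teardown run while it is held */
	return 0;
}

static int example_blk_restore(struct virtio_device *vdev)
{
	struct virtio_blk *vblk = vdev->priv;

	/* ... and it is only released here, after resume */
	blk_mq_unfreeze_queue(vblk->disk->queue);
	return 0;
}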
On 29.10.2024 16:58, Ming Lei wrote: > On Tue, Oct 29, 2024 at 12:13:35PM +0100, Marek Szyprowski wrote: >> On 25.10.2024 02:37, Ming Lei wrote: >>> Recently we got several deadlock report[1][2][3] caused by >>> blk_mq_freeze_queue and blk_enter_queue(). >>> >>> Turns out the two are just like acquiring read/write lock, so model them >>> as read/write lock for supporting lockdep: >>> >>> 1) model q->q_usage_counter as two locks(io and queue lock) >>> >>> - queue lock covers sync with blk_enter_queue() >>> >>> - io lock covers sync with bio_enter_queue() >>> >>> 2) make the lockdep class/key as per-queue: >>> >>> - different subsystem has very different lock use pattern, shared lock >>> class causes false positive easily >>> >>> - freeze_queue degrades to no lock in case that disk state becomes DEAD >>> because bio_enter_queue() won't be blocked any more >>> >>> - freeze_queue degrades to no lock in case that request queue becomes dying >>> because blk_enter_queue() won't be blocked any more >>> >>> 3) model blk_mq_freeze_queue() as acquire_exclusive & try_lock >>> - it is exclusive lock, so dependency with blk_enter_queue() is covered >>> >>> - it is trylock because blk_mq_freeze_queue() are allowed to run >>> concurrently >>> >>> 4) model blk_enter_queue() & bio_enter_queue() as acquire_read() >>> - nested blk_enter_queue() are allowed >>> >>> - dependency with blk_mq_freeze_queue() is covered >>> >>> - blk_queue_exit() is often called from other contexts(such as irq), and >>> it can't be annotated as lock_release(), so simply do it in >>> blk_enter_queue(), this way still covered cases as many as possible >>> >>> With lockdep support, such kind of reports may be reported asap and >>> needn't wait until the real deadlock is triggered. >>> >>> For example, lockdep report can be triggered in the report[3] with this >>> patch applied. >>> >>> [1] occasional block layer hang when setting 'echo noop > /sys/block/sda/queue/scheduler' >>> https://bugzilla.kernel.org/show_bug.cgi?id=219166 >>> >>> [2] del_gendisk() vs blk_queue_enter() race condition >>> https://lore.kernel.org/linux-block/20241003085610.GK11458@google.com/ >>> >>> [3] queue_freeze & queue_enter deadlock in scsi >>> https://lore.kernel.org/linux-block/ZxG38G9BuFdBpBHZ@fedora/T/#u >>> >>> Reviewed-by: Christoph Hellwig <hch@lst.de> >>> Signed-off-by: Ming Lei <ming.lei@redhat.com> >> This patch landed yesterday in linux-next as commit f1be1788a32e >> ("block: model freeze & enter queue as lock for supporting lockdep"). >> In my tests I found that it introduces the following 2 lockdep warnings: >> >> > ... >> >> >> 2. On QEMU's ARM64 virt machine, observed during system suspend/resume >> cycle: >> >> # time rtcwake -s10 -mmem >> rtcwake: wakeup from "mem" using /dev/rtc0 at Tue Oct 29 11:54:30 2024 >> PM: suspend entry (s2idle) >> Filesystems sync: 0.004 seconds >> Freezing user space processes >> Freezing user space processes completed (elapsed 0.007 seconds) >> OOM killer disabled. 
>> Freezing remaining freezable tasks >> Freezing remaining freezable tasks completed (elapsed 0.004 seconds) >> >> ====================================================== >> WARNING: possible circular locking dependency detected >> 6.12.0-rc4+ #9291 Not tainted >> ------------------------------------------------------ >> rtcwake/1299 is trying to acquire lock: >> ffff80008358a7f8 (rtnl_mutex){+.+.}-{3:3}, at: rtnl_lock+0x1c/0x28 >> >> but task is already holding lock: >> ffff000006136d68 (&q->q_usage_counter(io)#5){++++}-{0:0}, at: >> virtblk_freeze+0x24/0x60 >> >> which lock already depends on the new lock. >> >> >> the existing dependency chain (in reverse order) is: > This one looks a real thing, at least the added lockdep code works as > expected, also the blk_mq_freeze_queue() use in virtio-blk's ->suspend() > is questionable. I will take a further look. Did you find a way to fix this one? I still observe such warnings in my tests, even though your lockdep fixes are already merged to -next: https://lore.kernel.org/all/20241031133723.303835-1-ming.lei@redhat.com/ Best regards -- Marek Szyprowski, PhD Samsung R&D Institute Poland
On Tue, Nov 12, 2024 at 09:36:40AM +0100, Marek Szyprowski wrote: > On 29.10.2024 16:58, Ming Lei wrote: > > On Tue, Oct 29, 2024 at 12:13:35PM +0100, Marek Szyprowski wrote: > >> On 25.10.2024 02:37, Ming Lei wrote: > >>> Recently we got several deadlock report[1][2][3] caused by > >>> blk_mq_freeze_queue and blk_enter_queue(). > >>> > >>> Turns out the two are just like acquiring read/write lock, so model them > >>> as read/write lock for supporting lockdep: > >>> > >>> 1) model q->q_usage_counter as two locks(io and queue lock) > >>> > >>> - queue lock covers sync with blk_enter_queue() > >>> > >>> - io lock covers sync with bio_enter_queue() > >>> > >>> 2) make the lockdep class/key as per-queue: > >>> > >>> - different subsystem has very different lock use pattern, shared lock > >>> class causes false positive easily > >>> > >>> - freeze_queue degrades to no lock in case that disk state becomes DEAD > >>> because bio_enter_queue() won't be blocked any more > >>> > >>> - freeze_queue degrades to no lock in case that request queue becomes dying > >>> because blk_enter_queue() won't be blocked any more > >>> > >>> 3) model blk_mq_freeze_queue() as acquire_exclusive & try_lock > >>> - it is exclusive lock, so dependency with blk_enter_queue() is covered > >>> > >>> - it is trylock because blk_mq_freeze_queue() are allowed to run > >>> concurrently > >>> > >>> 4) model blk_enter_queue() & bio_enter_queue() as acquire_read() > >>> - nested blk_enter_queue() are allowed > >>> > >>> - dependency with blk_mq_freeze_queue() is covered > >>> > >>> - blk_queue_exit() is often called from other contexts(such as irq), and > >>> it can't be annotated as lock_release(), so simply do it in > >>> blk_enter_queue(), this way still covered cases as many as possible > >>> > >>> With lockdep support, such kind of reports may be reported asap and > >>> needn't wait until the real deadlock is triggered. > >>> > >>> For example, lockdep report can be triggered in the report[3] with this > >>> patch applied. > >>> > >>> [1] occasional block layer hang when setting 'echo noop > /sys/block/sda/queue/scheduler' > >>> https://bugzilla.kernel.org/show_bug.cgi?id=219166 > >>> > >>> [2] del_gendisk() vs blk_queue_enter() race condition > >>> https://lore.kernel.org/linux-block/20241003085610.GK11458@google.com/ > >>> > >>> [3] queue_freeze & queue_enter deadlock in scsi > >>> https://lore.kernel.org/linux-block/ZxG38G9BuFdBpBHZ@fedora/T/#u > >>> > >>> Reviewed-by: Christoph Hellwig <hch@lst.de> > >>> Signed-off-by: Ming Lei <ming.lei@redhat.com> > >> This patch landed yesterday in linux-next as commit f1be1788a32e > >> ("block: model freeze & enter queue as lock for supporting lockdep"). > >> In my tests I found that it introduces the following 2 lockdep warnings: > >> > >> > ... > >> > >> > >> 2. On QEMU's ARM64 virt machine, observed during system suspend/resume > >> cycle: > >> > >> # time rtcwake -s10 -mmem > >> rtcwake: wakeup from "mem" using /dev/rtc0 at Tue Oct 29 11:54:30 2024 > >> PM: suspend entry (s2idle) > >> Filesystems sync: 0.004 seconds > >> Freezing user space processes > >> Freezing user space processes completed (elapsed 0.007 seconds) > >> OOM killer disabled. 
> >> Freezing remaining freezable tasks
> >> Freezing remaining freezable tasks completed (elapsed 0.004 seconds)
> >>
> >> ======================================================
> >> WARNING: possible circular locking dependency detected
> >> 6.12.0-rc4+ #9291 Not tainted
> >> ------------------------------------------------------
> >> rtcwake/1299 is trying to acquire lock:
> >> ffff80008358a7f8 (rtnl_mutex){+.+.}-{3:3}, at: rtnl_lock+0x1c/0x28
> >>
> >> but task is already holding lock:
> >> ffff000006136d68 (&q->q_usage_counter(io)#5){++++}-{0:0}, at:
> >> virtblk_freeze+0x24/0x60
> >>
> >> which lock already depends on the new lock.
> >>
> >>
> >> the existing dependency chain (in reverse order) is:
> > This one looks a real thing, at least the added lockdep code works as
> > expected, also the blk_mq_freeze_queue() use in virtio-blk's ->suspend()
> > is questionable. I will take a further look.
>
> Did you find a way to fix this one? I still observe such warnings in my
> tests, even though your lockdep fixes are already merged to -next:
> https://lore.kernel.org/all/20241031133723.303835-1-ming.lei@redhat.com/

The lockdep fixes in -next are just for making the added lockdep annotation
work correctly; virtio-blk is another story.

It might be fine to annotate it with blk_mq_freeze_queue_no_owner(), but it
looks very fragile to freeze the queue in ->suspend(), since the lock is
then held across the whole suspend code path.

Can you try the following patch?

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 194417abc105..21488740eb15 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -1594,6 +1594,7 @@ static int virtblk_freeze(struct virtio_device *vdev)
 
 	/* Ensure no requests in virtqueues before deleting vqs. */
 	blk_mq_freeze_queue(vblk->disk->queue);
+	blk_mq_unfreeze_queue(vblk->disk->queue);
 
 	/* Ensure we don't receive any more interrupts */
 	virtio_reset_device(vdev);
@@ -1617,8 +1618,6 @@ static int virtblk_restore(struct virtio_device *vdev)
 		return ret;
 
 	virtio_device_ready(vdev);
-
-	blk_mq_unfreeze_queue(vblk->disk->queue);
 	return 0;
 }
 #endif


Thanks,
Ming
On 12.11.2024 11:15, Ming Lei wrote: > On Tue, Nov 12, 2024 at 09:36:40AM +0100, Marek Szyprowski wrote: >> On 29.10.2024 16:58, Ming Lei wrote: >>> On Tue, Oct 29, 2024 at 12:13:35PM +0100, Marek Szyprowski wrote: >>>> On 25.10.2024 02:37, Ming Lei wrote: >>>>> Recently we got several deadlock report[1][2][3] caused by >>>>> blk_mq_freeze_queue and blk_enter_queue(). >>>>> >>>>> Turns out the two are just like acquiring read/write lock, so model them >>>>> as read/write lock for supporting lockdep: >>>>> >>>>> 1) model q->q_usage_counter as two locks(io and queue lock) >>>>> >>>>> - queue lock covers sync with blk_enter_queue() >>>>> >>>>> - io lock covers sync with bio_enter_queue() >>>>> >>>>> 2) make the lockdep class/key as per-queue: >>>>> >>>>> - different subsystem has very different lock use pattern, shared lock >>>>> class causes false positive easily >>>>> >>>>> - freeze_queue degrades to no lock in case that disk state becomes DEAD >>>>> because bio_enter_queue() won't be blocked any more >>>>> >>>>> - freeze_queue degrades to no lock in case that request queue becomes dying >>>>> because blk_enter_queue() won't be blocked any more >>>>> >>>>> 3) model blk_mq_freeze_queue() as acquire_exclusive & try_lock >>>>> - it is exclusive lock, so dependency with blk_enter_queue() is covered >>>>> >>>>> - it is trylock because blk_mq_freeze_queue() are allowed to run >>>>> concurrently >>>>> >>>>> 4) model blk_enter_queue() & bio_enter_queue() as acquire_read() >>>>> - nested blk_enter_queue() are allowed >>>>> >>>>> - dependency with blk_mq_freeze_queue() is covered >>>>> >>>>> - blk_queue_exit() is often called from other contexts(such as irq), and >>>>> it can't be annotated as lock_release(), so simply do it in >>>>> blk_enter_queue(), this way still covered cases as many as possible >>>>> >>>>> With lockdep support, such kind of reports may be reported asap and >>>>> needn't wait until the real deadlock is triggered. >>>>> >>>>> For example, lockdep report can be triggered in the report[3] with this >>>>> patch applied. >>>>> >>>>> [1] occasional block layer hang when setting 'echo noop > /sys/block/sda/queue/scheduler' >>>>> https://bugzilla.kernel.org/show_bug.cgi?id=219166 >>>>> >>>>> [2] del_gendisk() vs blk_queue_enter() race condition >>>>> https://lore.kernel.org/linux-block/20241003085610.GK11458@google.com/ >>>>> >>>>> [3] queue_freeze & queue_enter deadlock in scsi >>>>> https://lore.kernel.org/linux-block/ZxG38G9BuFdBpBHZ@fedora/T/#u >>>>> >>>>> Reviewed-by: Christoph Hellwig <hch@lst.de> >>>>> Signed-off-by: Ming Lei <ming.lei@redhat.com> >>>> This patch landed yesterday in linux-next as commit f1be1788a32e >>>> ("block: model freeze & enter queue as lock for supporting lockdep"). >>>> In my tests I found that it introduces the following 2 lockdep warnings: >>>> >>>>> ... >>>> >>>> 2. On QEMU's ARM64 virt machine, observed during system suspend/resume >>>> cycle: >>>> >>>> # time rtcwake -s10 -mmem >>>> rtcwake: wakeup from "mem" using /dev/rtc0 at Tue Oct 29 11:54:30 2024 >>>> PM: suspend entry (s2idle) >>>> Filesystems sync: 0.004 seconds >>>> Freezing user space processes >>>> Freezing user space processes completed (elapsed 0.007 seconds) >>>> OOM killer disabled. 
>>>> Freezing remaining freezable tasks >>>> Freezing remaining freezable tasks completed (elapsed 0.004 seconds) >>>> >>>> ====================================================== >>>> WARNING: possible circular locking dependency detected >>>> 6.12.0-rc4+ #9291 Not tainted >>>> ------------------------------------------------------ >>>> rtcwake/1299 is trying to acquire lock: >>>> ffff80008358a7f8 (rtnl_mutex){+.+.}-{3:3}, at: rtnl_lock+0x1c/0x28 >>>> >>>> but task is already holding lock: >>>> ffff000006136d68 (&q->q_usage_counter(io)#5){++++}-{0:0}, at: >>>> virtblk_freeze+0x24/0x60 >>>> >>>> which lock already depends on the new lock. >>>> >>>> >>>> the existing dependency chain (in reverse order) is: >>> This one looks a real thing, at least the added lockdep code works as >>> expected, also the blk_mq_freeze_queue() use in virtio-blk's ->suspend() >>> is questionable. I will take a further look. >> Did you find a way to fix this one? I still observe such warnings in my >> tests, even though your lockdep fixes are already merged to -next: >> https://lore.kernel.org/all/20241031133723.303835-1-ming.lei@redhat.com/ > The lockdep fixes in ->next is just for making the added lockdep work > correctly, and virtio-blk is another story. > > It might be fine to annotate it with blk_mq_freeze_queue_no_owner(), > but it looks very fragile to call freeze queue in ->suspend(), and the lock > is just kept as being grabbed in the whole suspend code path. > > Can you try the following patch? Yes, this hides this lockdep warning, but imho it looks like a workaround, not a final fix. Reported-by: Marek Szyprowski <m.szyprowski@samsung.com> Tested-by: Marek Szyprowski <m.szyprowski@samsung.com> > diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c > index 194417abc105..21488740eb15 100644 > --- a/drivers/block/virtio_blk.c > +++ b/drivers/block/virtio_blk.c > @@ -1594,6 +1594,7 @@ static int virtblk_freeze(struct virtio_device *vdev) > > /* Ensure no requests in virtqueues before deleting vqs. */ > blk_mq_freeze_queue(vblk->disk->queue); > + blk_mq_unfreeze_queue(vblk->disk->queue); > > /* Ensure we don't receive any more interrupts */ > virtio_reset_device(vdev); > @@ -1617,8 +1618,6 @@ static int virtblk_restore(struct virtio_device *vdev) > return ret; > > virtio_device_ready(vdev); > - > - blk_mq_unfreeze_queue(vblk->disk->queue); > return 0; > } > #endif > > > > Thanks, > Ming > > Best regards -- Marek Szyprowski, PhD Samsung R&D Institute Poland
On Tue, Nov 12, 2024 at 12:32:29PM +0100, Marek Szyprowski wrote: > On 12.11.2024 11:15, Ming Lei wrote: > > On Tue, Nov 12, 2024 at 09:36:40AM +0100, Marek Szyprowski wrote: > >> On 29.10.2024 16:58, Ming Lei wrote: > >>> On Tue, Oct 29, 2024 at 12:13:35PM +0100, Marek Szyprowski wrote: > >>>> On 25.10.2024 02:37, Ming Lei wrote: > >>>>> Recently we got several deadlock report[1][2][3] caused by > >>>>> blk_mq_freeze_queue and blk_enter_queue(). > >>>>> > >>>>> Turns out the two are just like acquiring read/write lock, so model them > >>>>> as read/write lock for supporting lockdep: > >>>>> > >>>>> 1) model q->q_usage_counter as two locks(io and queue lock) > >>>>> > >>>>> - queue lock covers sync with blk_enter_queue() > >>>>> > >>>>> - io lock covers sync with bio_enter_queue() > >>>>> > >>>>> 2) make the lockdep class/key as per-queue: > >>>>> > >>>>> - different subsystem has very different lock use pattern, shared lock > >>>>> class causes false positive easily > >>>>> > >>>>> - freeze_queue degrades to no lock in case that disk state becomes DEAD > >>>>> because bio_enter_queue() won't be blocked any more > >>>>> > >>>>> - freeze_queue degrades to no lock in case that request queue becomes dying > >>>>> because blk_enter_queue() won't be blocked any more > >>>>> > >>>>> 3) model blk_mq_freeze_queue() as acquire_exclusive & try_lock > >>>>> - it is exclusive lock, so dependency with blk_enter_queue() is covered > >>>>> > >>>>> - it is trylock because blk_mq_freeze_queue() are allowed to run > >>>>> concurrently > >>>>> > >>>>> 4) model blk_enter_queue() & bio_enter_queue() as acquire_read() > >>>>> - nested blk_enter_queue() are allowed > >>>>> > >>>>> - dependency with blk_mq_freeze_queue() is covered > >>>>> > >>>>> - blk_queue_exit() is often called from other contexts(such as irq), and > >>>>> it can't be annotated as lock_release(), so simply do it in > >>>>> blk_enter_queue(), this way still covered cases as many as possible > >>>>> > >>>>> With lockdep support, such kind of reports may be reported asap and > >>>>> needn't wait until the real deadlock is triggered. > >>>>> > >>>>> For example, lockdep report can be triggered in the report[3] with this > >>>>> patch applied. > >>>>> > >>>>> [1] occasional block layer hang when setting 'echo noop > /sys/block/sda/queue/scheduler' > >>>>> https://bugzilla.kernel.org/show_bug.cgi?id=219166 > >>>>> > >>>>> [2] del_gendisk() vs blk_queue_enter() race condition > >>>>> https://lore.kernel.org/linux-block/20241003085610.GK11458@google.com/ > >>>>> > >>>>> [3] queue_freeze & queue_enter deadlock in scsi > >>>>> https://lore.kernel.org/linux-block/ZxG38G9BuFdBpBHZ@fedora/T/#u > >>>>> > >>>>> Reviewed-by: Christoph Hellwig <hch@lst.de> > >>>>> Signed-off-by: Ming Lei <ming.lei@redhat.com> > >>>> This patch landed yesterday in linux-next as commit f1be1788a32e > >>>> ("block: model freeze & enter queue as lock for supporting lockdep"). > >>>> In my tests I found that it introduces the following 2 lockdep warnings: > >>>> > >>>>> ... > >>>> > >>>> 2. On QEMU's ARM64 virt machine, observed during system suspend/resume > >>>> cycle: > >>>> > >>>> # time rtcwake -s10 -mmem > >>>> rtcwake: wakeup from "mem" using /dev/rtc0 at Tue Oct 29 11:54:30 2024 > >>>> PM: suspend entry (s2idle) > >>>> Filesystems sync: 0.004 seconds > >>>> Freezing user space processes > >>>> Freezing user space processes completed (elapsed 0.007 seconds) > >>>> OOM killer disabled. 
> >>>> Freezing remaining freezable tasks
> >>>> Freezing remaining freezable tasks completed (elapsed 0.004 seconds)
> >>>>
> >>>> ======================================================
> >>>> WARNING: possible circular locking dependency detected
> >>>> 6.12.0-rc4+ #9291 Not tainted
> >>>> ------------------------------------------------------
> >>>> rtcwake/1299 is trying to acquire lock:
> >>>> ffff80008358a7f8 (rtnl_mutex){+.+.}-{3:3}, at: rtnl_lock+0x1c/0x28
> >>>>
> >>>> but task is already holding lock:
> >>>> ffff000006136d68 (&q->q_usage_counter(io)#5){++++}-{0:0}, at:
> >>>> virtblk_freeze+0x24/0x60
> >>>>
> >>>> which lock already depends on the new lock.
> >>>>
> >>>>
> >>>> the existing dependency chain (in reverse order) is:
> >>> This one looks a real thing, at least the added lockdep code works as
> >>> expected, also the blk_mq_freeze_queue() use in virtio-blk's ->suspend()
> >>> is questionable. I will take a further look.
> >> Did you find a way to fix this one? I still observe such warnings in my
> >> tests, even though your lockdep fixes are already merged to -next:
> >> https://lore.kernel.org/all/20241031133723.303835-1-ming.lei@redhat.com/
> > The lockdep fixes in ->next is just for making the added lockdep work
> > correctly, and virtio-blk is another story.
> >
> > It might be fine to annotate it with blk_mq_freeze_queue_no_owner(),
> > but it looks very fragile to call freeze queue in ->suspend(), and the lock
> > is just kept as being grabbed in the whole suspend code path.
> >
> > Can you try the following patch?
> 
> Yes, this hides this lockdep warning, but imho it looks like a
> workaround, not a final fix.
> 
> Reported-by: Marek Szyprowski <m.szyprowski@samsung.com>
> Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>

Thanks for the test!

It is actually not a workaround, because all virtblk_freeze() needs is to
drain the in-flight IOs. One thing that is missed is marking the queue as
quiesced; I will post a formal patch that covers queue quiesce.


Thanks,
Ming
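A hedged sketch of that direction, assuming (not quoting) the formal patch
mentioned above: drain in-flight IO with a freeze/unfreeze pair so the io
"lock" is released before the rest of the suspend path runs, and keep the
queue quiesced until restore so no new requests are dispatched while the
device is down. The _sketch suffixes mark these as illustrative functions.

/*
 * Sketch only, not the posted patch.
 */
static int virtblk_freeze_sketch(struct virtio_device *vdev)
{
	struct virtio_blk *vblk = vdev->priv;

	/* Drain all in-flight requests, then drop the freeze again. */
	blk_mq_freeze_queue(vblk->disk->queue);
	blk_mq_quiesce_queue_nowait(vblk->disk->queue);
	blk_mq_unfreeze_queue(vblk->disk->queue);

	/* device reset and vq teardown would follow here */
	return 0;
}

static int virtblk_restore_sketch(struct virtio_device *vdev)
{
	struct virtio_blk *vblk = vdev->priv;

	/* vq re-init and virtio_device_ready() would go here */
	blk_mq_unquiesce_queue(vblk->disk->queue);
	return 0;
}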
On 29.10.2024 16:58, Ming Lei wrote: > On Tue, Oct 29, 2024 at 12:13:35PM +0100, Marek Szyprowski wrote: >> On 25.10.2024 02:37, Ming Lei wrote: >>> Recently we got several deadlock report[1][2][3] caused by >>> blk_mq_freeze_queue and blk_enter_queue(). >>> >>> Turns out the two are just like acquiring read/write lock, so model them >>> as read/write lock for supporting lockdep: >>> >>> 1) model q->q_usage_counter as two locks(io and queue lock) >>> >>> - queue lock covers sync with blk_enter_queue() >>> >>> - io lock covers sync with bio_enter_queue() >>> >>> 2) make the lockdep class/key as per-queue: >>> >>> - different subsystem has very different lock use pattern, shared lock >>> class causes false positive easily >>> >>> - freeze_queue degrades to no lock in case that disk state becomes DEAD >>> because bio_enter_queue() won't be blocked any more >>> >>> - freeze_queue degrades to no lock in case that request queue becomes dying >>> because blk_enter_queue() won't be blocked any more >>> >>> 3) model blk_mq_freeze_queue() as acquire_exclusive & try_lock >>> - it is exclusive lock, so dependency with blk_enter_queue() is covered >>> >>> - it is trylock because blk_mq_freeze_queue() are allowed to run >>> concurrently >>> >>> 4) model blk_enter_queue() & bio_enter_queue() as acquire_read() >>> - nested blk_enter_queue() are allowed >>> >>> - dependency with blk_mq_freeze_queue() is covered >>> >>> - blk_queue_exit() is often called from other contexts(such as irq), and >>> it can't be annotated as lock_release(), so simply do it in >>> blk_enter_queue(), this way still covered cases as many as possible >>> >>> With lockdep support, such kind of reports may be reported asap and >>> needn't wait until the real deadlock is triggered. >>> >>> For example, lockdep report can be triggered in the report[3] with this >>> patch applied. >>> >>> [1] occasional block layer hang when setting 'echo noop > /sys/block/sda/queue/scheduler' >>> https://bugzilla.kernel.org/show_bug.cgi?id=219166 >>> >>> [2] del_gendisk() vs blk_queue_enter() race condition >>> https://lore.kernel.org/linux-block/20241003085610.GK11458@google.com/ >>> >>> [3] queue_freeze & queue_enter deadlock in scsi >>> https://lore.kernel.org/linux-block/ZxG38G9BuFdBpBHZ@fedora/T/#u >>> >>> Reviewed-by: Christoph Hellwig <hch@lst.de> >>> Signed-off-by: Ming Lei <ming.lei@redhat.com> >> This patch landed yesterday in linux-next as commit f1be1788a32e >> ("block: model freeze & enter queue as lock for supporting lockdep"). >> In my tests I found that it introduces the following 2 lockdep warnings: >> >> 1. On Samsung Exynos 4412-based Odroid U3 board (ARM 32bit), observed >> when booting it: >> >> ====================================================== >> WARNING: possible circular locking dependency detected >> 6.12.0-rc4-00037-gf1be1788a32e #9290 Not tainted >> ------------------------------------------------------ >> find/1284 is trying to acquire lock: >> cf3b8534 (&mm->mmap_lock){++++}-{3:3}, at: __might_fault+0x30/0x70 >> >> but task is already holding lock: >> c203a0c8 (&sb->s_type->i_mutex_key#2){++++}-{3:3}, at: >> iterate_dir+0x30/0x140 >> >> which lock already depends on the new lock. 
>> >> >> the existing dependency chain (in reverse order) is: >> >> -> #4 (&sb->s_type->i_mutex_key#2){++++}-{3:3}: >> down_write+0x44/0xc4 >> start_creating+0x8c/0x170 >> debugfs_create_dir+0x1c/0x178 >> blk_register_queue+0xa0/0x1c0 >> add_disk_fwnode+0x210/0x434 >> brd_alloc+0x1cc/0x210 >> brd_init+0xac/0x104 >> do_one_initcall+0x64/0x30c >> kernel_init_freeable+0x1c4/0x228 >> kernel_init+0x1c/0x12c >> ret_from_fork+0x14/0x28 >> >> -> #3 (&q->debugfs_mutex){+.+.}-{3:3}: >> __mutex_lock+0x94/0x94c >> mutex_lock_nested+0x1c/0x24 >> blk_mq_init_sched+0x140/0x204 >> elevator_init_mq+0xb8/0x130 >> add_disk_fwnode+0x3c/0x434 > The above chain can be cut by the following patch because disk state > can be thought as DEAD in add_disk(), can you test it? Seems to be fixing this issue. Feel free to add: Reported-by: Marek Szyprowski <m.szyprowski@samsung.com> Tested-by: Marek Szyprowski <m.szyprowski@samsung.com> > diff --git a/block/elevator.c b/block/elevator.c > index 4122026b11f1..efa6ff941a25 100644 > --- a/block/elevator.c > +++ b/block/elevator.c > @@ -600,12 +600,14 @@ void elevator_init_mq(struct request_queue *q) > * requests, then no need to quiesce queue which may add long boot > * latency, especially when lots of disks are involved. > */ > - blk_mq_freeze_queue(q); > + if (__blk_freeze_queue_start(q)) > + blk_freeze_acquire_lock(q, true, false); > blk_mq_cancel_work_sync(q); > > err = blk_mq_init_sched(q, e); > > - blk_mq_unfreeze_queue(q); > + if (__blk_mq_unfreeze_queue(q, false)) > + blk_unfreeze_release_lock(q, true, false); > > if (err) { > pr_warn("\"%s\" elevator initialization failed, " > > ... > >> 2 locks held by find/1284: >> #0: c3df1e88 (&f->f_pos_lock){+.+.}-{3:3}, at: fdget_pos+0x88/0xd0 >> #1: c203a0c8 (&sb->s_type->i_mutex_key#2){++++}-{3:3}, at: >> iterate_dir+0x30/0x140 >> >> stack backtrace: >> CPU: 1 UID: 0 PID: 1284 Comm: find Not tainted >> 6.12.0-rc4-00037-gf1be1788a32e #9290 >> Hardware name: Samsung Exynos (Flattened Device Tree) >> Call trace: >> unwind_backtrace from show_stack+0x10/0x14 >> show_stack from dump_stack_lvl+0x68/0x88 >> dump_stack_lvl from print_circular_bug+0x31c/0x394 >> print_circular_bug from check_noncircular+0x16c/0x184 >> check_noncircular from __lock_acquire+0x158c/0x2970 >> __lock_acquire from lock_acquire+0x130/0x384 >> lock_acquire from __might_fault+0x50/0x70 >> __might_fault from filldir64+0x94/0x28c >> filldir64 from dcache_readdir+0x174/0x260 >> dcache_readdir from iterate_dir+0x64/0x140 >> iterate_dir from sys_getdents64+0x60/0x130 >> sys_getdents64 from ret_fast_syscall+0x0/0x1c >> Exception stack(0xf22b5fa8 to 0xf22b5ff0) >> 5fa0: 004b4fa0 004b4f80 00000004 004b4fa0 00008000 >> 00000000 >> 5fc0: 004b4fa0 004b4f80 00000001 000000d9 00000000 004b4af0 00000000 >> 000010ea >> 5fe0: 004b1eb4 bea05af0 b6da4b08 b6da4a28 >> >> --->8--- >> >> >> 2. On QEMU's ARM64 virt machine, observed during system suspend/resume >> cycle: >> >> # time rtcwake -s10 -mmem >> rtcwake: wakeup from "mem" using /dev/rtc0 at Tue Oct 29 11:54:30 2024 >> PM: suspend entry (s2idle) >> Filesystems sync: 0.004 seconds >> Freezing user space processes >> Freezing user space processes completed (elapsed 0.007 seconds) >> OOM killer disabled. 
>> Freezing remaining freezable tasks >> Freezing remaining freezable tasks completed (elapsed 0.004 seconds) >> >> ====================================================== >> WARNING: possible circular locking dependency detected >> 6.12.0-rc4+ #9291 Not tainted >> ------------------------------------------------------ >> rtcwake/1299 is trying to acquire lock: >> ffff80008358a7f8 (rtnl_mutex){+.+.}-{3:3}, at: rtnl_lock+0x1c/0x28 >> >> but task is already holding lock: >> ffff000006136d68 (&q->q_usage_counter(io)#5){++++}-{0:0}, at: >> virtblk_freeze+0x24/0x60 >> >> which lock already depends on the new lock. >> >> >> the existing dependency chain (in reverse order) is: > This one looks a real thing, at least the added lockdep code works as > expected, also the blk_mq_freeze_queue() use in virtio-blk's ->suspend() > is questionable. I will take a further look. > > > Thanks, > Ming > > Best regards -- Marek Szyprowski, PhD Samsung R&D Institute Poland