[PATCH v9 8/8] blk-mq-debugfs: warn about possible deadlock

Posted by Yu Kuai 1 week ago
Creating new debugfs entries can trigger fs reclaim, and reclaim may
need to issue I/O to the very queue being updated, hence entries must
not be created while the queue is frozen. For the same reason, locks
that can be held while the queue is frozen (elevator_lock and
rq_qos_mutex) must not be held when creating debugfs entries. Warn
about both cases in debugfs_create_files().
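
For illustration, a minimal sketch of the problematic ordering the new
WARN_ON_ONCE() is meant to catch (hypothetical caller; freeze/unfreeze
signatures simplified):

	/* Hypothetical caller, for illustration only. */
	static void broken_debugfs_update(struct request_queue *q)
	{
		blk_mq_freeze_queue(q);		/* all I/O to q is now blocked */

		mutex_lock(&q->debugfs_mutex);
		/*
		 * Creating debugfs entries allocates with GFP_KERNEL; if the
		 * allocation enters direct reclaim, reclaim may try to write
		 * dirty pages back through the frozen queue and block forever.
		 */
		blk_mq_debugfs_register(q);
		mutex_unlock(&q->debugfs_mutex);

		blk_mq_unfreeze_queue(q);	/* never reached on deadlock */
	}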

Signed-off-by: Yu Kuai <yukuai@fnnas.com>
Reviewed-by: Nilay Shroff <nilay@linux.ibm.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
---
 block/blk-mq-debugfs.c | 31 ++++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 5c7cadf51a88..faeaa1fc86a7 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -608,9 +608,23 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = {
 	{},
 };
 
-static void debugfs_create_files(struct dentry *parent, void *data,
+static void debugfs_create_files(struct request_queue *q, struct dentry *parent,
+				 void *data,
 				 const struct blk_mq_debugfs_attr *attr)
 {
+	lockdep_assert_held(&q->debugfs_mutex);
+	/*
+	 * Creating new debugfs entries while the queue is frozen risks
+	 * deadlock.
+	 */
+	WARN_ON_ONCE(q->mq_freeze_depth != 0);
+	/*
+	 * debugfs_mutex must not be nested under other locks that can be
+	 * grabbed while the queue is frozen.
+	 */
+	lockdep_assert_not_held(&q->elevator_lock);
+	lockdep_assert_not_held(&q->rq_qos_mutex);
+
 	if (IS_ERR_OR_NULL(parent))
 		return;
 
@@ -624,7 +638,7 @@ void blk_mq_debugfs_register(struct request_queue *q)
 	struct blk_mq_hw_ctx *hctx;
 	unsigned long i;
 
-	debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs);
+	debugfs_create_files(q, q->debugfs_dir, q, blk_mq_debugfs_queue_attrs);
 
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (!hctx->debugfs_dir)
@@ -643,7 +657,8 @@ static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx,
 	snprintf(name, sizeof(name), "cpu%u", ctx->cpu);
 	ctx_dir = debugfs_create_dir(name, hctx->debugfs_dir);
 
-	debugfs_create_files(ctx_dir, ctx, blk_mq_debugfs_ctx_attrs);
+	debugfs_create_files(hctx->queue, ctx_dir, ctx,
+			     blk_mq_debugfs_ctx_attrs);
 }
 
 void blk_mq_debugfs_register_hctx(struct request_queue *q,
@@ -659,7 +674,8 @@ void blk_mq_debugfs_register_hctx(struct request_queue *q,
 	snprintf(name, sizeof(name), "hctx%u", hctx->queue_num);
 	hctx->debugfs_dir = debugfs_create_dir(name, q->debugfs_dir);
 
-	debugfs_create_files(hctx->debugfs_dir, hctx, blk_mq_debugfs_hctx_attrs);
+	debugfs_create_files(q, hctx->debugfs_dir, hctx,
+			     blk_mq_debugfs_hctx_attrs);
 
 	hctx_for_each_ctx(hctx, ctx, i)
 		blk_mq_debugfs_register_ctx(hctx, ctx);
@@ -712,7 +728,7 @@ void blk_mq_debugfs_register_sched(struct request_queue *q)
 
 	q->sched_debugfs_dir = debugfs_create_dir("sched", q->debugfs_dir);
 
-	debugfs_create_files(q->sched_debugfs_dir, q, e->queue_debugfs_attrs);
+	debugfs_create_files(q, q->sched_debugfs_dir, q, e->queue_debugfs_attrs);
 }
 
 void blk_mq_debugfs_unregister_sched(struct request_queue *q)
@@ -751,7 +767,8 @@ static void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
 							 q->debugfs_dir);
 
 	rqos->debugfs_dir = debugfs_create_dir(dir_name, q->rqos_debugfs_dir);
-	debugfs_create_files(rqos->debugfs_dir, rqos, rqos->ops->debugfs_attrs);
+	debugfs_create_files(q, rqos->debugfs_dir, rqos,
+			     rqos->ops->debugfs_attrs);
 }
 
 void blk_mq_debugfs_register_rq_qos(struct request_queue *q)
@@ -788,7 +805,7 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
 
 	hctx->sched_debugfs_dir = debugfs_create_dir("sched",
 						     hctx->debugfs_dir);
-	debugfs_create_files(hctx->sched_debugfs_dir, hctx,
+	debugfs_create_files(q, hctx->sched_debugfs_dir, hctx,
 			     e->hctx_debugfs_attrs);
 }
 
-- 
2.51.0
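
For context, a sketch of the safe ordering implied by the new
lockdep_assert_not_held() calls (hypothetical caller; simplified
freeze API):

	/* Hypothetical caller showing the ordering the assertions enforce. */
	static void safe_sched_update(struct request_queue *q)
	{
		/* Create debugfs entries first, without freeze-related locks. */
		mutex_lock(&q->debugfs_mutex);
		blk_mq_debugfs_register_sched(q);
		mutex_unlock(&q->debugfs_mutex);

		/* Only then freeze the queue and take elevator_lock. */
		blk_mq_freeze_queue(q);
		mutex_lock(&q->elevator_lock);
		/* ... switch elevator / update rq_qos settings ... */
		mutex_unlock(&q->elevator_lock);
		blk_mq_unfreeze_queue(q);
	}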
Re: [PATCH v9 8/8] blk-mq-debugfs: warn about possible deadlock
Posted by kernel test robot 3 days, 1 hour ago

Hello,

kernel test robot noticed "blktests.nvme/040.fail" on:

commit: a228828b6a29e3787c2d4f30b966b4e723436491 ("[PATCH v9 8/8] blk-mq-debugfs: warn about possible deadlock")
url: https://github.com/intel-lab-lkp/linux/commits/Yu-Kuai/blk-wbt-factor-out-a-helper-wbt_set_lat/20260202-161435
base: https://git.kernel.org/cgit/linux/kernel/git/axboe/linux.git for-next
patch link: https://lore.kernel.org/all/20260202080523.3947504-9-yukuai@fnnas.com/
patch subject: [PATCH v9 8/8] blk-mq-debugfs: warn about possible deadlock

in testcase: blktests
version: blktests-x86_64-5885dee-1_20260203
with following parameters:

	disk: 1SSD
	test: nvme-040
	nvme_trtype: rdma
	use_siw: true


config: x86_64-rhel-9.4-func
compiler: gcc-14
test machine: 224 threads 2 sockets Intel(R) Xeon(R) Platinum 8480+ (Sapphire Rapids) with 256G memory

(please refer to attached dmesg/kmsg for entire log/backtrace)


If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add the following tags
| Reported-by: kernel test robot <oliver.sang@intel.com>
| Closes: https://lore.kernel.org/oe-lkp/202602061756.96736e8f-lkp@intel.com


2026-02-04 16:08:35 cd /lkp/benchmarks/blktests
2026-02-04 16:08:35 mkdir -p /mnt/nvme-040
2026-02-04 16:08:35 mount /dev/nvme0n1p1 /mnt/nvme-040
2026-02-04 16:08:35 echo nvme/040
2026-02-04 16:08:35 ./check -o /mnt/nvme-040 nvme/040
nvme/040 (tr=rdma) (test nvme fabrics controller reset/disconnect operation during I/O)
    runtime  8.568s  ...
nvme/040 (tr=rdma) (test nvme fabrics controller reset/disconnect operation during I/O) [failed]
    runtime  8.568s  ...  8.988s
    something found in dmesg:
    [  161.504240] [   T3651] run blktests nvme/040 at 2026-02-04 16:08:36
    [  161.567237] [   T4057] loop0: detected capacity change from 0 to 2097152
    [  161.581589] [   T4062] nvmet: adding nsid 1 to subsystem blktests-subsystem-1
    [  161.598986] [   T4067] iwpm_register_pid: Unable to send a nlmsg (client = 2)
    [  161.608814] [   T4067] nvmet_rdma: enabling port 0 (192.168.3.121:4420)
    [  161.658405] [   T1863] nvmet: Created nvm controller 1 for subsystem blktests-subsystem-1 for NQN nqn.2014-08.org.nvmexpress:uuid:0f01fb42-9f7f-4856-b0b3-51e60b8de349.
    [  161.686065] [   T4074] nvme nvme2: creating 128 I/O queues.
    [  162.191928] [   T4074] nvme nvme2: mapped 128/0/0 default/read/poll queues.
    [  162.263779] [   T4074] nvme nvme2: new ctrl: NQN "blktests-subsystem-1", addr 192.168.3.121:4420, hostnqn: nqn.2014-08.org.nvmexpress:uuid:0f01fb42-9f7f-4856-b0b3-51e60b8de349
    [  162.358452] [   T4108] block nvme0n1: No UUID available providing old NGUID
    ...
    (See '/mnt/nvme-040/nodev_tr_rdma/nvme/040.dmesg' for the entire message)



The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20260206/202602061756.96736e8f-lkp@intel.com



-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki