From: Yu Kuai <yukuai3@huawei.com>
Allocating and freeing sched_tags while the queue is frozen can deadlock[1].
This is a long-standing problem; hence allocate the memory before freezing
the queue and free the memory after the queue is unfrozen.
[1] https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/
Fixes: e3a2b3f931f5 ("blk-mq: allow changing of queue depth through sysfs")
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
block/blk-mq.c | 22 ++++++++++------------
block/blk-mq.h | 5 ++++-
block/blk-sysfs.c | 25 +++++++++++++++++--------
3 files changed, 31 insertions(+), 21 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 82fa81036115..d85afbb9f031 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4917,11 +4917,13 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
}
EXPORT_SYMBOL(blk_mq_free_tag_set);
-int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
+struct elevator_tags *blk_mq_update_nr_requests(struct request_queue *q,
+ struct elevator_tags *et,
+ unsigned int nr)
{
struct blk_mq_tag_set *set = q->tag_set;
+ struct elevator_tags *old_et = NULL;
struct blk_mq_hw_ctx *hctx;
- int ret = 0;
unsigned long i;
blk_mq_quiesce_queue(q);
@@ -4946,23 +4948,19 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
nr - hctx->sched_tags->nr_reserved_tags);
}
} else {
- queue_for_each_hw_ctx(q, hctx, i) {
- if (!hctx->sched_tags)
- continue;
- blk_mq_tag_update_depth(hctx, &hctx->sched_tags, nr);
- if (ret)
- goto out;
- }
+ queue_for_each_hw_ctx(q, hctx, i)
+ hctx->sched_tags = et->tags[i];
+
+ old_et = q->elevator->et;
+ q->elevator->et = et;
}
q->nr_requests = nr;
if (q->elevator && q->elevator->type->ops.depth_updated)
q->elevator->type->ops.depth_updated(q);
-out:
blk_mq_unquiesce_queue(q);
-
- return ret;
+ return old_et;
}
/*
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 731f4578d9a8..6c9d03625ba1 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -6,6 +6,7 @@
#include "blk-stat.h"
struct blk_mq_tag_set;
+struct elevator_tags;
struct blk_mq_ctxs {
struct kobject kobj;
@@ -45,7 +46,9 @@ void blk_mq_submit_bio(struct bio *bio);
int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
unsigned int flags);
void blk_mq_exit_queue(struct request_queue *q);
-int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
+struct elevator_tags *blk_mq_update_nr_requests(struct request_queue *q,
+ struct elevator_tags *tags,
+ unsigned int nr);
void blk_mq_wake_waiters(struct request_queue *q);
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *,
bool);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 7ea15bf68b4b..a0a7ebad378f 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -64,11 +64,12 @@ static ssize_t queue_requests_show(struct gendisk *disk, char *page)
static ssize_t
queue_requests_store(struct gendisk *disk, const char *page, size_t count)
{
- unsigned long nr;
- int ret, err;
- unsigned int memflags;
struct request_queue *q = disk->queue;
struct blk_mq_tag_set *set = q->tag_set;
+ struct elevator_tags *et = NULL;
+ unsigned int memflags;
+ unsigned long nr;
+ int ret;
ret = queue_var_store(&nr, page, count);
if (ret < 0)
@@ -90,16 +91,24 @@ queue_requests_store(struct gendisk *disk, const char *page, size_t count)
goto unlock;
}
+ if (q->elevator && nr > q->elevator->et->nr_requests) {
+ /* allocate memory before freezing queue to prevent deadlock */
+ et = blk_mq_alloc_sched_tags(set, q->nr_hw_queues, nr);
+ if (!et) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+ }
+
memflags = blk_mq_freeze_queue(q);
mutex_lock(&q->elevator_lock);
-
- err = blk_mq_update_nr_requests(disk->queue, nr);
- if (err)
- ret = err;
-
+ et = blk_mq_update_nr_requests(q, et, nr);
mutex_unlock(&q->elevator_lock);
blk_mq_unfreeze_queue(q, memflags);
+ if (et)
+ blk_mq_free_sched_tags(et, set);
+
unlock:
up_write(&set->update_nr_hwq_lock);
return ret;
--
2.39.2
Hi, 在 2025/09/08 14:15, Yu Kuai 写道: > @@ -90,16 +91,24 @@ queue_requests_store(struct gendisk *disk, const char *page, size_t count) > goto unlock; > } > > + if (q->elevator && nr > q->elevator->et->nr_requests) { While rebasing v2, I found that I should also add a check for the non-shared case here, because in blk_mq_alloc_sched_tags(), et->nr_requests is not set to MAX_SCHED_RQ, and we don't want to allocate memory for the shared case. I'll fix this as well in v2. Thanks, Kuai
On 9/8/25 11:45 AM, Yu Kuai wrote: > From: Yu Kuai <yukuai3@huawei.com> > > Allocate and free sched_tags while queue is freezed can deadlock[1], > this is a long term problem, hence allocate memory before freezing > queue and free memory after queue is unfreezed. > > [1] https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/ > Fixes: e3a2b3f931f5 ("blk-mq: allow changing of queue depth through sysfs") > > Signed-off-by: Yu Kuai <yukuai3@huawei.com> Looks good to me: Reviewed-by: Nilay Shroff <nilay@linux.ibm.com>
On 9/8/25 11:45 AM, Yu Kuai wrote: > From: Yu Kuai <yukuai3@huawei.com> > > Allocate and free sched_tags while queue is freezed can deadlock[1], > this is a long term problem, hence allocate memory before freezing > queue and free memory after queue is unfreezed. > > [1] https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/ > Fixes: e3a2b3f931f5 ("blk-mq: allow changing of queue depth through sysfs") > > Signed-off-by: Yu Kuai <yukuai3@huawei.com> [...] [...] > diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c > index 7ea15bf68b4b..a0a7ebad378f 100644 > --- a/block/blk-sysfs.c > +++ b/block/blk-sysfs.c > @@ -64,11 +64,12 @@ static ssize_t queue_requests_show(struct gendisk *disk, char *page) > static ssize_t > queue_requests_store(struct gendisk *disk, const char *page, size_t count) > { > - unsigned long nr; > - int ret, err; > - unsigned int memflags; > struct request_queue *q = disk->queue; > struct blk_mq_tag_set *set = q->tag_set; > + struct elevator_tags *et = NULL; > + unsigned int memflags; > + unsigned long nr; > + int ret; > > ret = queue_var_store(&nr, page, count); > if (ret < 0) > @@ -90,16 +91,24 @@ queue_requests_store(struct gendisk *disk, const char *page, size_t count) > goto unlock; > } > > + if (q->elevator && nr > q->elevator->et->nr_requests) { > + /* allocate memory before freezing queue to prevent deadlock */ > + et = blk_mq_alloc_sched_tags(set, q->nr_hw_queues, nr); > + if (!et) { > + ret = -ENOMEM; > + goto unlock; > + } > + } > + I think we should add a comment above explaining why is it safe to access q->elevator without holding ->elevator_lock. Thanks, --Nilay
Hi, 在 2025/09/09 14:39, Nilay Shroff 写道: > > > On 9/8/25 11:45 AM, Yu Kuai wrote: >> From: Yu Kuai <yukuai3@huawei.com> >> >> Allocate and free sched_tags while queue is freezed can deadlock[1], >> this is a long term problem, hence allocate memory before freezing >> queue and free memory after queue is unfreezed. >> >> [1] https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/ >> Fixes: e3a2b3f931f5 ("blk-mq: allow changing of queue depth through sysfs") >> >> Signed-off-by: Yu Kuai <yukuai3@huawei.com> > [...] > [...] > >> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c >> index 7ea15bf68b4b..a0a7ebad378f 100644 >> --- a/block/blk-sysfs.c >> +++ b/block/blk-sysfs.c >> @@ -64,11 +64,12 @@ static ssize_t queue_requests_show(struct gendisk *disk, char *page) >> static ssize_t >> queue_requests_store(struct gendisk *disk, const char *page, size_t count) >> { >> - unsigned long nr; >> - int ret, err; >> - unsigned int memflags; >> struct request_queue *q = disk->queue; >> struct blk_mq_tag_set *set = q->tag_set; >> + struct elevator_tags *et = NULL; >> + unsigned int memflags; >> + unsigned long nr; >> + int ret; >> >> ret = queue_var_store(&nr, page, count); >> if (ret < 0) >> @@ -90,16 +91,24 @@ queue_requests_store(struct gendisk *disk, const char *page, size_t count) >> goto unlock; >> } >> >> + if (q->elevator && nr > q->elevator->et->nr_requests) { >> + /* allocate memory before freezing queue to prevent deadlock */ >> + et = blk_mq_alloc_sched_tags(set, q->nr_hw_queues, nr); >> + if (!et) { >> + ret = -ENOMEM; >> + goto unlock; >> + } >> + } >> + > I think we should add a comment above explaining why is it safe > to access q->elevator without holding ->elevator_lock. > I already access q->elevator to check input nr from patch 4, and that's why I add comments to explain switching elevator is serialized, is this enough? Thanks, Kuai > Thanks, > --Nilay > . >
On 9/9/25 1:07 PM, Yu Kuai wrote: > Hi, > > 在 2025/09/09 14:39, Nilay Shroff 写道: >> >> >> On 9/8/25 11:45 AM, Yu Kuai wrote: >>> From: Yu Kuai <yukuai3@huawei.com> >>> >>> Allocate and free sched_tags while queue is freezed can deadlock[1], >>> this is a long term problem, hence allocate memory before freezing >>> queue and free memory after queue is unfreezed. >>> >>> [1] https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/ >>> Fixes: e3a2b3f931f5 ("blk-mq: allow changing of queue depth through sysfs") >>> >>> Signed-off-by: Yu Kuai <yukuai3@huawei.com> >> [...] >> [...] >> >>> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c >>> index 7ea15bf68b4b..a0a7ebad378f 100644 >>> --- a/block/blk-sysfs.c >>> +++ b/block/blk-sysfs.c >>> @@ -64,11 +64,12 @@ static ssize_t queue_requests_show(struct gendisk *disk, char *page) >>> static ssize_t >>> queue_requests_store(struct gendisk *disk, const char *page, size_t count) >>> { >>> - unsigned long nr; >>> - int ret, err; >>> - unsigned int memflags; >>> struct request_queue *q = disk->queue; >>> struct blk_mq_tag_set *set = q->tag_set; >>> + struct elevator_tags *et = NULL; >>> + unsigned int memflags; >>> + unsigned long nr; >>> + int ret; >>> ret = queue_var_store(&nr, page, count); >>> if (ret < 0) >>> @@ -90,16 +91,24 @@ queue_requests_store(struct gendisk *disk, const char *page, size_t count) >>> goto unlock; >>> } >>> + if (q->elevator && nr > q->elevator->et->nr_requests) { >>> + /* allocate memory before freezing queue to prevent deadlock */ >>> + et = blk_mq_alloc_sched_tags(set, q->nr_hw_queues, nr); >>> + if (!et) { >>> + ret = -ENOMEM; >>> + goto unlock; >>> + } >>> + } >>> + >> I think we should add a comment above explaining why is it safe >> to access q->elevator without holding ->elevator_lock. >> > > I already access q->elevator to check input nr from patch 4, and that's > why I add comments to explain switching elevator is serialized, is this > enough? 
> Yes, in patch 04/10 you moved the ->elevator_lock after the usual sanity checks. However, when we run those sanity checks, or the code in this patch where we have to access q->elevator, it's good to add a comment here in the code (not just in the commit message). For reference, you may check blk_mq_alloc_sched_tags_batch(). I think a similar comment may be added here as well. Thanks, --Nilay
© 2016 - 2025 Red Hat, Inc.