From: Yu Kuai <yukuai3@huawei.com>
Allocating and freeing sched_tags while the queue is frozen can deadlock[1].
This is a long-standing problem, so allocate memory before freezing the
queue and free memory after the queue is unfrozen.
[1] https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/
Fixes: e3a2b3f931f5 ("blk-mq: allow changing of queue depth through sysfs")
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
block/blk-mq.c | 22 ++++++++++------------
block/blk-mq.h | 5 ++++-
block/blk-sysfs.c | 25 +++++++++++++++++--------
3 files changed, 31 insertions(+), 21 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 82fa81036115..d85afbb9f031 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4917,11 +4917,13 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
}
EXPORT_SYMBOL(blk_mq_free_tag_set);
-int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
+struct elevator_tags *blk_mq_update_nr_requests(struct request_queue *q,
+ struct elevator_tags *et,
+ unsigned int nr)
{
struct blk_mq_tag_set *set = q->tag_set;
+ struct elevator_tags *old_et = NULL;
struct blk_mq_hw_ctx *hctx;
- int ret = 0;
unsigned long i;
blk_mq_quiesce_queue(q);
@@ -4946,23 +4948,19 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
nr - hctx->sched_tags->nr_reserved_tags);
}
} else {
- queue_for_each_hw_ctx(q, hctx, i) {
- if (!hctx->sched_tags)
- continue;
- blk_mq_tag_update_depth(hctx, &hctx->sched_tags, nr);
- if (ret)
- goto out;
- }
+ queue_for_each_hw_ctx(q, hctx, i)
+ hctx->sched_tags = et->tags[i];
+
+ old_et = q->elevator->et;
+ q->elevator->et = et;
}
q->nr_requests = nr;
if (q->elevator && q->elevator->type->ops.depth_updated)
q->elevator->type->ops.depth_updated(q);
-out:
blk_mq_unquiesce_queue(q);
-
- return ret;
+ return old_et;
}
/*
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 731f4578d9a8..6c9d03625ba1 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -6,6 +6,7 @@
#include "blk-stat.h"
struct blk_mq_tag_set;
+struct elevator_tags;
struct blk_mq_ctxs {
struct kobject kobj;
@@ -45,7 +46,9 @@ void blk_mq_submit_bio(struct bio *bio);
int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
unsigned int flags);
void blk_mq_exit_queue(struct request_queue *q);
-int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
+struct elevator_tags *blk_mq_update_nr_requests(struct request_queue *q,
+ struct elevator_tags *tags,
+ unsigned int nr);
void blk_mq_wake_waiters(struct request_queue *q);
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *,
bool);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 7ea15bf68b4b..a0a7ebad378f 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -64,11 +64,12 @@ static ssize_t queue_requests_show(struct gendisk *disk, char *page)
static ssize_t
queue_requests_store(struct gendisk *disk, const char *page, size_t count)
{
- unsigned long nr;
- int ret, err;
- unsigned int memflags;
struct request_queue *q = disk->queue;
struct blk_mq_tag_set *set = q->tag_set;
+ struct elevator_tags *et = NULL;
+ unsigned int memflags;
+ unsigned long nr;
+ int ret;
ret = queue_var_store(&nr, page, count);
if (ret < 0)
@@ -90,16 +91,24 @@ queue_requests_store(struct gendisk *disk, const char *page, size_t count)
goto unlock;
}
+ if (q->elevator && nr > q->elevator->et->nr_requests) {
+ /* allocate memory before freezing queue to prevent deadlock */
+ et = blk_mq_alloc_sched_tags(set, q->nr_hw_queues, nr);
+ if (!et) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+ }
+
memflags = blk_mq_freeze_queue(q);
mutex_lock(&q->elevator_lock);
-
- err = blk_mq_update_nr_requests(disk->queue, nr);
- if (err)
- ret = err;
-
+ et = blk_mq_update_nr_requests(q, et, nr);
mutex_unlock(&q->elevator_lock);
blk_mq_unfreeze_queue(q, memflags);
+ if (et)
+ blk_mq_free_sched_tags(et, set);
+
unlock:
up_write(&set->update_nr_hwq_lock);
return ret;
--
2.39.2
Hi,
在 2025/09/08 14:15, Yu Kuai 写道:
> @@ -90,16 +91,24 @@ queue_requests_store(struct gendisk *disk, const char *page, size_t count)
> goto unlock;
> }
>
> + if (q->elevator && nr > q->elevator->et->nr_requests) {
While rebasing v2, I found that I should also add a non-shared check
here, because in blk_mq_alloc_sched_tags(), et->nr_requests is not set
to MAX_SCHED_RQ, and we don't want to allocate memory for the shared case.
I'll fix this as well in v2.
Thanks,
Kuai
On 9/8/25 11:45 AM, Yu Kuai wrote:
> From: Yu Kuai <yukuai3@huawei.com>
>
> Allocate and free sched_tags while queue is freezed can deadlock[1],
> this is a long term problem, hence allocate memory before freezing
> queue and free memory after queue is unfreezed.
>
> [1] https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/
> Fixes: e3a2b3f931f5 ("blk-mq: allow changing of queue depth through sysfs")
>
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Looks good to me:
Reviewed-by: Nilay Shroff <nilay@linux.ibm.com>
On 9/8/25 11:45 AM, Yu Kuai wrote:
> From: Yu Kuai <yukuai3@huawei.com>
>
> Allocate and free sched_tags while queue is freezed can deadlock[1],
> this is a long term problem, hence allocate memory before freezing
> queue and free memory after queue is unfreezed.
>
> [1] https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/
> Fixes: e3a2b3f931f5 ("blk-mq: allow changing of queue depth through sysfs")
>
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
[...]
[...]
> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> index 7ea15bf68b4b..a0a7ebad378f 100644
> --- a/block/blk-sysfs.c
> +++ b/block/blk-sysfs.c
> @@ -64,11 +64,12 @@ static ssize_t queue_requests_show(struct gendisk *disk, char *page)
> static ssize_t
> queue_requests_store(struct gendisk *disk, const char *page, size_t count)
> {
> - unsigned long nr;
> - int ret, err;
> - unsigned int memflags;
> struct request_queue *q = disk->queue;
> struct blk_mq_tag_set *set = q->tag_set;
> + struct elevator_tags *et = NULL;
> + unsigned int memflags;
> + unsigned long nr;
> + int ret;
>
> ret = queue_var_store(&nr, page, count);
> if (ret < 0)
> @@ -90,16 +91,24 @@ queue_requests_store(struct gendisk *disk, const char *page, size_t count)
> goto unlock;
> }
>
> + if (q->elevator && nr > q->elevator->et->nr_requests) {
> + /* allocate memory before freezing queue to prevent deadlock */
> + et = blk_mq_alloc_sched_tags(set, q->nr_hw_queues, nr);
> + if (!et) {
> + ret = -ENOMEM;
> + goto unlock;
> + }
> + }
> +
I think we should add a comment above explaining why it is safe
to access q->elevator without holding ->elevator_lock.
Thanks,
--Nilay
Hi,
在 2025/09/09 14:39, Nilay Shroff 写道:
>
>
> On 9/8/25 11:45 AM, Yu Kuai wrote:
>> From: Yu Kuai <yukuai3@huawei.com>
>>
>> Allocate and free sched_tags while queue is freezed can deadlock[1],
>> this is a long term problem, hence allocate memory before freezing
>> queue and free memory after queue is unfreezed.
>>
>> [1] https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/
>> Fixes: e3a2b3f931f5 ("blk-mq: allow changing of queue depth through sysfs")
>>
>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> [...]
> [...]
>
>> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
>> index 7ea15bf68b4b..a0a7ebad378f 100644
>> --- a/block/blk-sysfs.c
>> +++ b/block/blk-sysfs.c
>> @@ -64,11 +64,12 @@ static ssize_t queue_requests_show(struct gendisk *disk, char *page)
>> static ssize_t
>> queue_requests_store(struct gendisk *disk, const char *page, size_t count)
>> {
>> - unsigned long nr;
>> - int ret, err;
>> - unsigned int memflags;
>> struct request_queue *q = disk->queue;
>> struct blk_mq_tag_set *set = q->tag_set;
>> + struct elevator_tags *et = NULL;
>> + unsigned int memflags;
>> + unsigned long nr;
>> + int ret;
>>
>> ret = queue_var_store(&nr, page, count);
>> if (ret < 0)
>> @@ -90,16 +91,24 @@ queue_requests_store(struct gendisk *disk, const char *page, size_t count)
>> goto unlock;
>> }
>>
>> + if (q->elevator && nr > q->elevator->et->nr_requests) {
>> + /* allocate memory before freezing queue to prevent deadlock */
>> + et = blk_mq_alloc_sched_tags(set, q->nr_hw_queues, nr);
>> + if (!et) {
>> + ret = -ENOMEM;
>> + goto unlock;
>> + }
>> + }
>> +
> I think we should add a comment above explaining why is it safe
> to access q->elevator without holding ->elevator_lock.
>
I already access q->elevator to check the input nr in patch 4, and that's
why I added comments explaining that switching the elevator is serialized.
Is this enough?
Thanks,
Kuai
> Thanks,
> --Nilay
> .
>
On 9/9/25 1:07 PM, Yu Kuai wrote:
> Hi,
>
> 在 2025/09/09 14:39, Nilay Shroff 写道:
>>
>>
>> On 9/8/25 11:45 AM, Yu Kuai wrote:
>>> From: Yu Kuai <yukuai3@huawei.com>
>>>
>>> Allocate and free sched_tags while queue is freezed can deadlock[1],
>>> this is a long term problem, hence allocate memory before freezing
>>> queue and free memory after queue is unfreezed.
>>>
>>> [1] https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/
>>> Fixes: e3a2b3f931f5 ("blk-mq: allow changing of queue depth through sysfs")
>>>
>>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
>> [...]
>> [...]
>>
>>> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
>>> index 7ea15bf68b4b..a0a7ebad378f 100644
>>> --- a/block/blk-sysfs.c
>>> +++ b/block/blk-sysfs.c
>>> @@ -64,11 +64,12 @@ static ssize_t queue_requests_show(struct gendisk *disk, char *page)
>>> static ssize_t
>>> queue_requests_store(struct gendisk *disk, const char *page, size_t count)
>>> {
>>> - unsigned long nr;
>>> - int ret, err;
>>> - unsigned int memflags;
>>> struct request_queue *q = disk->queue;
>>> struct blk_mq_tag_set *set = q->tag_set;
>>> + struct elevator_tags *et = NULL;
>>> + unsigned int memflags;
>>> + unsigned long nr;
>>> + int ret;
>>> ret = queue_var_store(&nr, page, count);
>>> if (ret < 0)
>>> @@ -90,16 +91,24 @@ queue_requests_store(struct gendisk *disk, const char *page, size_t count)
>>> goto unlock;
>>> }
>>> + if (q->elevator && nr > q->elevator->et->nr_requests) {
>>> + /* allocate memory before freezing queue to prevent deadlock */
>>> + et = blk_mq_alloc_sched_tags(set, q->nr_hw_queues, nr);
>>> + if (!et) {
>>> + ret = -ENOMEM;
>>> + goto unlock;
>>> + }
>>> + }
>>> +
>> I think we should add a comment above explaining why is it safe
>> to access q->elevator without holding ->elevator_lock.
>>
>
> I already access q->elevator to check input nr from patch 4, and that's
> why I add comments to explain switching elevator is serialized, is this
> enough?
>
Yes, in patch 04/10 you moved the ->elevator_lock after the
usual sanity checks. However, when we run those sanity checks
or the code in this patch where we have to access q->elevator,
it's good to add a comment here in the code (not in the commit message).
For reference, you may check blk_mq_alloc_sched_tags_batch().
I think a similar comment may be added here as well.
Thanks,
--Nilay
© 2016 - 2026 Red Hat, Inc.