From: Yu Kuai <yukuai3@huawei.com>
Currently, both mq-deadline and bfq have a global spin lock that is
grabbed inside elevator methods like dispatch_request, insert_requests,
and bio_merge. This global lock is the main reason mq-deadline and
bfq can't scale very well.

While dispatching requests, blk_mq_get_dispatch_budget() and
blk_mq_get_driver_tag() must be called, and they are not ready to be called
inside elevator methods, hence introducing a new method like
dispatch_requests is not possible.

Hence introduce a new high-level elevator lock; currently it protects
dispatch_request only. Following patches will convert mq-deadline and bfq
to use this lock and finally support request batch dispatching by calling
the method multiple times while holding the lock.
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
block/blk-mq-sched.c | 9 ++++++++-
block/elevator.c | 1 +
block/elevator.h | 14 ++++++++++++--
3 files changed, 21 insertions(+), 3 deletions(-)
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 55a0fd105147..1a2da5edbe13 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -113,7 +113,14 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
if (budget_token < 0)
break;
- rq = e->type->ops.dispatch_request(hctx);
+ if (blk_queue_sq_sched(q)) {
+ elevator_lock(e);
+ rq = e->type->ops.dispatch_request(hctx);
+ elevator_unlock(e);
+ } else {
+ rq = e->type->ops.dispatch_request(hctx);
+ }
+
if (!rq) {
blk_mq_put_dispatch_budget(q, budget_token);
/*
diff --git a/block/elevator.c b/block/elevator.c
index 88f8f36bed98..45303af0ca73 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -144,6 +144,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
eq->type = e;
kobject_init(&eq->kobj, &elv_ktype);
mutex_init(&eq->sysfs_lock);
+ spin_lock_init(&eq->lock);
hash_init(eq->hash);
return eq;
diff --git a/block/elevator.h b/block/elevator.h
index a07ce773a38f..81f7700b0339 100644
--- a/block/elevator.h
+++ b/block/elevator.h
@@ -110,12 +110,12 @@ struct request *elv_rqhash_find(struct request_queue *q, sector_t offset);
/*
* each queue has an elevator_queue associated with it
*/
-struct elevator_queue
-{
+struct elevator_queue {
struct elevator_type *type;
void *elevator_data;
struct kobject kobj;
struct mutex sysfs_lock;
+ spinlock_t lock;
unsigned long flags;
DECLARE_HASHTABLE(hash, ELV_HASH_BITS);
};
@@ -124,6 +124,16 @@ struct elevator_queue
#define ELEVATOR_FLAG_DYING 1
#define ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT 2
+#define elevator_lock(e) spin_lock(&(e)->lock)
+#define elevator_unlock(e) spin_unlock(&(e)->lock)
+#define elevator_lock_irq(e) spin_lock_irq(&(e)->lock)
+#define elevator_unlock_irq(e) spin_unlock_irq(&(e)->lock)
+#define elevator_lock_irqsave(e, flags) \
+ spin_lock_irqsave(&(e)->lock, flags)
+#define elevator_unlock_irqrestore(e, flags) \
+ spin_unlock_irqrestore(&(e)->lock, flags)
+#define elevator_lock_assert_held(e) lockdep_assert_held(&(e)->lock)
+
/*
* block elevator interface
*/
--
2.39.2
On 8/6/25 17:57, Yu Kuai wrote:
> From: Yu Kuai <yukuai3@huawei.com>
>
> Currently, both mq-deadline and bfq have a global spin lock that is
> grabbed inside elevator methods like dispatch_request, insert_requests,
> and bio_merge. This global lock is the main reason mq-deadline and
> bfq can't scale very well.
>
> While dispatching requests, blk_mq_get_dispatch_budget() and
> blk_mq_get_driver_tag() must be called, and they are not ready to be called
> inside elevator methods, hence introducing a new method like
> dispatch_requests is not possible.
>
> Hence introduce a new high-level elevator lock; currently it protects
> dispatch_request only. Following patches will convert mq-deadline and bfq
> to use this lock and finally support request batch dispatching by calling
> the method multiple times while holding the lock.
>
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> ---
> block/blk-mq-sched.c | 9 ++++++++-
> block/elevator.c | 1 +
> block/elevator.h | 14 ++++++++++++--
> 3 files changed, 21 insertions(+), 3 deletions(-)
>
> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> index 55a0fd105147..1a2da5edbe13 100644
> --- a/block/blk-mq-sched.c
> +++ b/block/blk-mq-sched.c
> @@ -113,7 +113,14 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> if (budget_token < 0)
> break;
>
> - rq = e->type->ops.dispatch_request(hctx);
> + if (blk_queue_sq_sched(q)) {
> + elevator_lock(e);
> + rq = e->type->ops.dispatch_request(hctx);
> + elevator_unlock(e);
I do not think this is safe for bfq since bfq uses the irqsave/irqrestore spin
lock variant. If it is safe, this needs a big comment block explaining why
and/or the rules regarding the scheduler use of this lock.
> + } else {
> + rq = e->type->ops.dispatch_request(hctx);
> + }
> +
> if (!rq) {
> blk_mq_put_dispatch_budget(q, budget_token);
> /*
> diff --git a/block/elevator.c b/block/elevator.c
> index 88f8f36bed98..45303af0ca73 100644
> --- a/block/elevator.c
> +++ b/block/elevator.c
> @@ -144,6 +144,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
> eq->type = e;
> kobject_init(&eq->kobj, &elv_ktype);
> mutex_init(&eq->sysfs_lock);
> + spin_lock_init(&eq->lock);
> hash_init(eq->hash);
>
> return eq;
> diff --git a/block/elevator.h b/block/elevator.h
> index a07ce773a38f..81f7700b0339 100644
> --- a/block/elevator.h
> +++ b/block/elevator.h
> @@ -110,12 +110,12 @@ struct request *elv_rqhash_find(struct request_queue *q, sector_t offset);
> /*
> * each queue has an elevator_queue associated with it
> */
> -struct elevator_queue
> -{
> +struct elevator_queue {
> struct elevator_type *type;
> void *elevator_data;
> struct kobject kobj;
> struct mutex sysfs_lock;
> + spinlock_t lock;
> unsigned long flags;
> DECLARE_HASHTABLE(hash, ELV_HASH_BITS);
> };
> @@ -124,6 +124,16 @@ struct elevator_queue
> #define ELEVATOR_FLAG_DYING 1
> #define ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT 2
>
> +#define elevator_lock(e) spin_lock(&(e)->lock)
> +#define elevator_unlock(e) spin_unlock(&(e)->lock)
> +#define elevator_lock_irq(e) spin_lock_irq(&(e)->lock)
> +#define elevator_unlock_irq(e) spin_unlock_irq(&(e)->lock)
> +#define elevator_lock_irqsave(e, flags) \
> + spin_lock_irqsave(&(e)->lock, flags)
> +#define elevator_unlock_irqrestore(e, flags) \
> + spin_unlock_irqrestore(&(e)->lock, flags)
> +#define elevator_lock_assert_held(e) lockdep_assert_held(&(e)->lock)
> +
> /*
> * block elevator interface
> */
--
Damien Le Moal
Western Digital Research
Hi,
On 2025/08/11 8:44, Damien Le Moal wrote:
> On 8/6/25 17:57, Yu Kuai wrote:
>> From: Yu Kuai <yukuai3@huawei.com>
>>
>> Currently, both mq-deadline and bfq have a global spin lock that is
>> grabbed inside elevator methods like dispatch_request, insert_requests,
>> and bio_merge. This global lock is the main reason mq-deadline and
>> bfq can't scale very well.
>>
>> While dispatching requests, blk_mq_get_dispatch_budget() and
>> blk_mq_get_driver_tag() must be called, and they are not ready to be called
>> inside elevator methods, hence introducing a new method like
>> dispatch_requests is not possible.
>>
>> Hence introduce a new high-level elevator lock; currently it protects
>> dispatch_request only. Following patches will convert mq-deadline and bfq
>> to use this lock and finally support request batch dispatching by calling
>> the method multiple times while holding the lock.
>>
>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
>> ---
>> block/blk-mq-sched.c | 9 ++++++++-
>> block/elevator.c | 1 +
>> block/elevator.h | 14 ++++++++++++--
>> 3 files changed, 21 insertions(+), 3 deletions(-)
>>
>> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
>> index 55a0fd105147..1a2da5edbe13 100644
>> --- a/block/blk-mq-sched.c
>> +++ b/block/blk-mq-sched.c
>> @@ -113,7 +113,14 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
>> if (budget_token < 0)
>> break;
>>
>> - rq = e->type->ops.dispatch_request(hctx);
>> + if (blk_queue_sq_sched(q)) {
>> + elevator_lock(e);
>> + rq = e->type->ops.dispatch_request(hctx);
>> + elevator_unlock(e);
>
> I do not think this is safe for bfq since bfq uses the irqsave/irqrestore spin
> lock variant. If it is safe, this needs a big comment block explaining why
> and/or the rules regarding the scheduler use of this lock.
It's correct; however, this patch doesn't change bfq yet, so the locking looks like:

elevator_lock
  spin_lock_irq(&bfqd->lock)
  spin_unlock_irq(&bfqd->lock)
elevator_unlock

Patch 3 removes bfqd->lock and converts this to:

elevator_lock_irq
elevator_unlock_irq
Thanks,
Kuai
>
>> + } else {
>> + rq = e->type->ops.dispatch_request(hctx);
>> + }
>> +
>> if (!rq) {
>> blk_mq_put_dispatch_budget(q, budget_token);
>> /*
>> diff --git a/block/elevator.c b/block/elevator.c
>> index 88f8f36bed98..45303af0ca73 100644
>> --- a/block/elevator.c
>> +++ b/block/elevator.c
>> @@ -144,6 +144,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
>> eq->type = e;
>> kobject_init(&eq->kobj, &elv_ktype);
>> mutex_init(&eq->sysfs_lock);
>> + spin_lock_init(&eq->lock);
>> hash_init(eq->hash);
>>
>> return eq;
>> diff --git a/block/elevator.h b/block/elevator.h
>> index a07ce773a38f..81f7700b0339 100644
>> --- a/block/elevator.h
>> +++ b/block/elevator.h
>> @@ -110,12 +110,12 @@ struct request *elv_rqhash_find(struct request_queue *q, sector_t offset);
>> /*
>> * each queue has an elevator_queue associated with it
>> */
>> -struct elevator_queue
>> -{
>> +struct elevator_queue {
>> struct elevator_type *type;
>> void *elevator_data;
>> struct kobject kobj;
>> struct mutex sysfs_lock;
>> + spinlock_t lock;
>> unsigned long flags;
>> DECLARE_HASHTABLE(hash, ELV_HASH_BITS);
>> };
>> @@ -124,6 +124,16 @@ struct elevator_queue
>> #define ELEVATOR_FLAG_DYING 1
>> #define ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT 2
>>
>> +#define elevator_lock(e) spin_lock(&(e)->lock)
>> +#define elevator_unlock(e) spin_unlock(&(e)->lock)
>> +#define elevator_lock_irq(e) spin_lock_irq(&(e)->lock)
>> +#define elevator_unlock_irq(e) spin_unlock_irq(&(e)->lock)
>> +#define elevator_lock_irqsave(e, flags) \
>> + spin_lock_irqsave(&(e)->lock, flags)
>> +#define elevator_unlock_irqrestore(e, flags) \
>> + spin_unlock_irqrestore(&(e)->lock, flags)
>> +#define elevator_lock_assert_held(e) lockdep_assert_held(&(e)->lock)
>> +
>> /*
>> * block elevator interface
>> */
>
>
On 8/11/25 10:01, Yu Kuai wrote:
>>> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
>>> index 55a0fd105147..1a2da5edbe13 100644
>>> --- a/block/blk-mq-sched.c
>>> +++ b/block/blk-mq-sched.c
>>> @@ -113,7 +113,14 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
>>> if (budget_token < 0)
>>> break;
>>>
>>> - rq = e->type->ops.dispatch_request(hctx);
>>> + if (blk_queue_sq_sched(q)) {
>>> + elevator_lock(e);
>>> + rq = e->type->ops.dispatch_request(hctx);
>>> + elevator_unlock(e);
>>
>> I do not think this is safe for bfq since bfq uses the irqsave/irqrestore spin
>> lock variant. If it is safe, this needs a big comment block explaining why
>> and/or the rules regarding the scheduler use of this lock.
>
> It's correct, however, this patch doesn't change bfq yet, and it's like:
>
> elevator_lock
> spin_lock_irq(&bfqd->lock)
> spin_unlock_irq(&bfqd->lock)
> elevator_unlock
>
> Patch 3 remove bfqd->lock and convert this to:
>
> elevator_lock_irq
> elevator_unlock_irq.
I do not understand. Since q->elevator->lock is already taken here, without IRQs
disabled, how can the bfq_dispatch_request method take this same lock again with
IRQs disabled? That cannot possibly work.
The other side of this is that if you use elevator_lock(e) to call
e->type->ops.dispatch_request(hctx), it means that the scheduler *can NOT* use
that same lock in its completion path, since that can be called from IRQ
context. This may not be needed/a problem right now, but I think this needs a
comment in this patch to mention this.
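
To illustrate the hazard with a hypothetical call chain (not code from this
series): if a scheduler took this lock in its completion handler while the
dispatch path held it with IRQs still enabled, the completion IRQ could
interrupt the lock holder on the same CPU and self-deadlock:

  /* CPU X, dispatch path, IRQs still enabled */
  elevator_lock(e);                        /* e->lock acquired */
    e->type->ops.dispatch_request(hctx);
      <hard IRQ: request completion on CPU X>
        scheduler completion handler
          elevator_lock(e);                /* same lock -> self-deadlock */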
--
Damien Le Moal
Western Digital Research
Hi,
On 2025/08/11 11:53, Damien Le Moal wrote:
> On 8/11/25 10:01, Yu Kuai wrote:
>>>> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
>>>> index 55a0fd105147..1a2da5edbe13 100644
>>>> --- a/block/blk-mq-sched.c
>>>> +++ b/block/blk-mq-sched.c
>>>> @@ -113,7 +113,14 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
>>>> if (budget_token < 0)
>>>> break;
>>>>
>>>> - rq = e->type->ops.dispatch_request(hctx);
>>>> + if (blk_queue_sq_sched(q)) {
>>>> + elevator_lock(e);
>>>> + rq = e->type->ops.dispatch_request(hctx);
>>>> + elevator_unlock(e);
>>>
>>> I do not think this is safe for bfq since bfq uses the irqsave/irqrestore spin
>>> lock variant. If it is safe, this needs a big comment block explaining why
>>> and/or the rules regarding the scheduler use of this lock.
>>
>> It's correct, however, this patch doesn't change bfq yet, and it's like:
>>
>> elevator_lock
>> spin_lock_irq(&bfqd->lock)
>> spin_unlock_irq(&bfqd->lock)
>> elevator_unlock
>>
>> Patch 3 remove bfqd->lock and convert this to:
>>
>> elevator_lock_irq
>> elevator_unlock_irq.
>
> I do not understand. Since q->elevator->lock is already taken here, without IRQ
> disabled, how can bfq_dispatch_request method again take this same lock with IRQ
> disabled ? That cannot possibly work.
Looks like there is still some misunderstanding somehow :( After patch 3,
bfq_dispatch_request doesn't grab any lock itself; the elevator lock is held
before calling into the dispatch method.

Before:

elevator_lock
  bfq_dispatch_request
    spin_lock_irq(&bfqd->lock)
    spin_unlock_irq(&bfqd->lock)
elevator_unlock

After:

elevator_lock_irq
  bfq_dispatch_request
elevator_unlock_irq
>
> The other side of this is that if you use elevator_lock(e) to call
> e->type->ops.dispatch_request(hctx), it means that the scheduler *can NOT* use
> that same lock in its completion path, since that can be called from IRQ
> context. This may not be needed/a problem right now, but I think this needs a
> comment in this patch to mention this.
>
So, the first patch just grabs the elevator lock around the dispatch method;
bfq dispatch still uses spin_lock_irq(&bfqd->lock), and completion still uses
bfqd->lock, so this is safe.

Later, patch 3 removes bfqd->lock and uses the elevator lock instead, and
because the elevator lock will now be used in the completion path,
elevator_lock in the dispatch path is converted to elevator_lock_irq.
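
For illustration, a rough sketch of what the bfq dispatch method could look
like after that conversion (a simplification for this discussion, not the
actual patch 3 code; stats handling and the rest of the function are omitted):

static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	/*
	 * The caller (__blk_mq_do_dispatch_sched) already holds
	 * q->elevator->lock with IRQs disabled, so bfqd->lock is gone.
	 */
	elevator_lock_assert_held(hctx->queue->elevator);

	return __bfq_dispatch_request(hctx);
}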
Thanks,
Kuai
On 8/11/25 13:25, Yu Kuai wrote:
> Hi,
>
> On 2025/08/11 11:53, Damien Le Moal wrote:
>> On 8/11/25 10:01, Yu Kuai wrote:
>>>>> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
>>>>> index 55a0fd105147..1a2da5edbe13 100644
>>>>> --- a/block/blk-mq-sched.c
>>>>> +++ b/block/blk-mq-sched.c
>>>>> @@ -113,7 +113,14 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
>>>>> if (budget_token < 0)
>>>>> break;
>>>>>
>>>>> - rq = e->type->ops.dispatch_request(hctx);
>>>>> + if (blk_queue_sq_sched(q)) {
>>>>> + elevator_lock(e);
>>>>> + rq = e->type->ops.dispatch_request(hctx);
>>>>> + elevator_unlock(e);
>>>>
>>>> I do not think this is safe for bfq since bfq uses the irqsave/irqrestore spin
>>>> lock variant. If it is safe, this needs a big comment block explaining why
>>>> and/or the rules regarding the scheduler use of this lock.
>>>
>>> It's correct, however, this patch doesn't change bfq yet, and it's like:
>>>
>>> elevator_lock
>>> spin_lock_irq(&bfqd->lock)
>>> spin_unlock_irq(&bfqd->lock)
>>> elevator_unlock
>>>
>>> Patch 3 remove bfqd->lock and convert this to:
>>>
>>> elevator_lock_irq
>>> elevator_unlock_irq.
>>
>> I do not understand. Since q->elevator->lock is already taken here, without IRQ
>> disabled, how can bfq_dispatch_request method again take this same lock with IRQ
>> disabled ? That cannot possibly work.
>
> Looks like there is still misunderstanding somehow :( After patch 3,
> bfq_dispatch_work doesn't grab any lock, elevator lock is held before
> calling into dispatch method.
>
> Before:
>
> elevator_lock
> bfq_dispatch_request
> spin_lock_irq(&bfqd->lock)
> spin_unlock_irq(&bfqd->lock)
> elevator_unlock
>
> After:
> elevator_lock_irq
> bfq_dispatch_request
> elevator_unlock_irq
Ah, yes, I see it now.
But that is a nasty change that affects *all* schedulers, even those that do not
need to disable IRQs because they are not using the lock in their completion
path, e.g. mq-deadline. So I do not think that is acceptable.
--
Damien Le Moal
Western Digital Research
Hi,
On 2025/08/11 12:34, Damien Le Moal wrote:
> On 8/11/25 13:25, Yu Kuai wrote:
>> Hi,
>>
>> On 2025/08/11 11:53, Damien Le Moal wrote:
>>> On 8/11/25 10:01, Yu Kuai wrote:
>>>>>> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
>>>>>> index 55a0fd105147..1a2da5edbe13 100644
>>>>>> --- a/block/blk-mq-sched.c
>>>>>> +++ b/block/blk-mq-sched.c
>>>>>> @@ -113,7 +113,14 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
>>>>>> if (budget_token < 0)
>>>>>> break;
>>>>>>
>>>>>> - rq = e->type->ops.dispatch_request(hctx);
>>>>>> + if (blk_queue_sq_sched(q)) {
>>>>>> + elevator_lock(e);
>>>>>> + rq = e->type->ops.dispatch_request(hctx);
>>>>>> + elevator_unlock(e);
>>>>>
>>>>> I do not think this is safe for bfq since bfq uses the irqsave/irqrestore spin
>>>>> lock variant. If it is safe, this needs a big comment block explaining why
>>>>> and/or the rules regarding the scheduler use of this lock.
>>>>
>>>> It's correct, however, this patch doesn't change bfq yet, and it's like:
>>>>
>>>> elevator_lock
>>>> spin_lock_irq(&bfqd->lock)
>>>> spin_unlock_irq(&bfqd->lock)
>>>> elevator_unlock
>>>>
>>>> Patch 3 remove bfqd->lock and convert this to:
>>>>
>>>> elevator_lock_irq
>>>> elevator_unlock_irq.
>>>
>>> I do not understand. Since q->elevator->lock is already taken here, without IRQ
>>> disabled, how can bfq_dispatch_request method again take this same lock with IRQ
>>> disabled ? That cannot possibly work.
>>
>> Looks like there is still misunderstanding somehow :( After patch 3,
>> bfq_dispatch_work doesn't grab any lock, elevator lock is held before
>> calling into dispatch method.
>>
>> Before:
>>
>> elevator_lock
>> bfq_dispatch_request
>> spin_lock_irq(&bfqd->lock)
>> spin_unlock_irq(&bfqd->lock)
>> elevator_unlock
>>
>> After:
>> elevator_lock_irq
>> bfq_dispatch_request
>> elevator_unlock_irq
>
> Ah, yes, I see it now.
>
> But that is a nasty change that affects *all* schedulers, even those that do not
> need to disable IRQs because they are not using the lock in their completion
> path, e.g. mq-deadline. So I do not think that is acceptable.
>
OK, I did some benchmarking and didn't find any performance regression, so
I decided to use the irq version for deadline. Perhaps I just didn't run a
test that would show it.

I can add a flag to elevator_queue->flags, like the following, to fix this:
diff --git a/block/elevator.h b/block/elevator.h
index 81f7700b0339..f04b9b2ae344 100644
--- a/block/elevator.h
+++ b/block/elevator.h
@@ -123,6 +123,7 @@ struct elevator_queue {
#define ELEVATOR_FLAG_REGISTERED 0
#define ELEVATOR_FLAG_DYING 1
#define ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT 2
+#define ELEVATOR_FLAG_LOCK_AT_COMP 3
#define elevator_lock(e) spin_lock(&(e)->lock)
#define elevator_unlock(e) spin_unlock(&(e)->lock)
@@ -134,6 +135,22 @@ struct elevator_queue {
spin_unlock_irqrestore(&(e)->lock, flags)
#define elevator_lock_assert_held(e) lockdep_assert_held(&(e)->lock)
+static inline void elevator_dispatch_lock(struct elevator_queue *e)
+{
+ if (test_bit(ELEVATOR_FLAG_LOCK_AT_COMP, &e->flags))
+ elevator_lock_irq(e);
+ else
+ elevator_lock(e);
+}
+
+static inline void elevator_dispatch_unlock(struct elevator_queue *e)
+{
+ if (test_bit(ELEVATOR_FLAG_LOCK_AT_COMP, &e->flags))
+ elevator_unlock_irq(e);
+ else
+ elevator_unlock(e);
+}
+
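
For illustration, the dispatch path could then use these helpers (just a
sketch, not part of the diff above), with bfq opting in at init time, e.g.
in bfq_init_queue(); mq-deadline would simply not set the flag and keep the
plain spin_lock variant:

	/* in __blk_mq_do_dispatch_sched() */
	if (blk_queue_sq_sched(q)) {
		elevator_dispatch_lock(e);
		rq = e->type->ops.dispatch_request(hctx);
		elevator_dispatch_unlock(e);
	} else {
		rq = e->type->ops.dispatch_request(hctx);
	}

	/* in bfq_init_queue(), since bfq will take the lock at completion */
	set_bit(ELEVATOR_FLAG_LOCK_AT_COMP, &q->elevator->flags);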
Thanks,
Kuai