From: Yu Kuai <yukuai3@huawei.com>
Currently, both mq-deadline and bfq have a global spin lock that is
grabbed inside elevator methods like dispatch_request, insert_requests,
and bio_merge. This global lock is the main reason mq-deadline and
bfq can't scale well.

While dispatching a request, blk_mq_get_dispatch_budget() and
blk_mq_get_driver_tag() must be called, and they are not ready to be called
inside elevator methods, hence introducing a new method like
dispatch_requests is not possible.

Hence introduce a new high-level elevator lock; currently it protects
dispatch_request only. Following patches will convert mq-deadline and bfq
to use this lock and finally support request batch dispatching by calling
the method multiple times while holding the lock.
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
block/blk-mq-sched.c | 9 ++++++++-
block/elevator.c | 1 +
block/elevator.h | 14 ++++++++++++--
3 files changed, 21 insertions(+), 3 deletions(-)
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 55a0fd105147..1a2da5edbe13 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -113,7 +113,14 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
if (budget_token < 0)
break;
- rq = e->type->ops.dispatch_request(hctx);
+ if (blk_queue_sq_sched(q)) {
+ elevator_lock(e);
+ rq = e->type->ops.dispatch_request(hctx);
+ elevator_unlock(e);
+ } else {
+ rq = e->type->ops.dispatch_request(hctx);
+ }
+
if (!rq) {
blk_mq_put_dispatch_budget(q, budget_token);
/*
diff --git a/block/elevator.c b/block/elevator.c
index 88f8f36bed98..45303af0ca73 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -144,6 +144,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
eq->type = e;
kobject_init(&eq->kobj, &elv_ktype);
mutex_init(&eq->sysfs_lock);
+ spin_lock_init(&eq->lock);
hash_init(eq->hash);
return eq;
diff --git a/block/elevator.h b/block/elevator.h
index a07ce773a38f..81f7700b0339 100644
--- a/block/elevator.h
+++ b/block/elevator.h
@@ -110,12 +110,12 @@ struct request *elv_rqhash_find(struct request_queue *q, sector_t offset);
/*
* each queue has an elevator_queue associated with it
*/
-struct elevator_queue
-{
+struct elevator_queue {
struct elevator_type *type;
void *elevator_data;
struct kobject kobj;
struct mutex sysfs_lock;
+ spinlock_t lock;
unsigned long flags;
DECLARE_HASHTABLE(hash, ELV_HASH_BITS);
};
@@ -124,6 +124,16 @@ struct elevator_queue
#define ELEVATOR_FLAG_DYING 1
#define ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT 2
+#define elevator_lock(e) spin_lock(&(e)->lock)
+#define elevator_unlock(e) spin_unlock(&(e)->lock)
+#define elevator_lock_irq(e) spin_lock_irq(&(e)->lock)
+#define elevator_unlock_irq(e) spin_unlock_irq(&(e)->lock)
+#define elevator_lock_irqsave(e, flags) \
+ spin_lock_irqsave(&(e)->lock, flags)
+#define elevator_unlock_irqrestore(e, flags) \
+ spin_unlock_irqrestore(&(e)->lock, flags)
+#define elevator_lock_assert_held(e) lockdep_assert_held(&(e)->lock)
+
/*
* block elevator interface
*/
--
2.39.2
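[Editor's note: the hunk above only wraps a single ->dispatch_request() call. As a rough
sketch of the batch dispatching the commit message says later patches will build on this
lock, something like the helper below could collect several requests under one lock
acquisition. This is illustrative only, not code from this series; the function name,
the 'list'/'max_batch' parameters are made up, and budget/driver-tag handling is omitted.]

```c
/*
 * Illustrative sketch only (not from this patch series): pull several
 * requests from a single-queue scheduler while holding the elevator lock
 * once, instead of locking around every ->dispatch_request() call.
 * Budget and driver-tag acquisition are omitted for brevity.
 */
static int elevator_dispatch_batch(struct blk_mq_hw_ctx *hctx,
				   struct list_head *list, int max_batch)
{
	struct request_queue *q = hctx->queue;
	struct elevator_queue *e = q->elevator;
	int count = 0;

	elevator_lock(e);
	while (count < max_batch) {
		struct request *rq = e->type->ops.dispatch_request(hctx);

		if (!rq)
			break;
		list_add_tail(&rq->queuelist, list);
		count++;
	}
	elevator_unlock(e);

	return count;
}
```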
On 8/6/25 17:57, Yu Kuai wrote:
> From: Yu Kuai <yukuai3@huawei.com>
>
> [...]
>
> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> index 55a0fd105147..1a2da5edbe13 100644
> --- a/block/blk-mq-sched.c
> +++ b/block/blk-mq-sched.c
> @@ -113,7 +113,14 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> if (budget_token < 0)
> break;
>
> - rq = e->type->ops.dispatch_request(hctx);
> + if (blk_queue_sq_sched(q)) {
> + elevator_lock(e);
> + rq = e->type->ops.dispatch_request(hctx);
> + elevator_unlock(e);

I do not think this is safe for bfq since bfq uses the irqsave/irqrestore spin
lock variant. If it is safe, this needs a big comment block explaining why
and/or the rules regarding the scheduler use of this lock.

> + } else {
> + rq = e->type->ops.dispatch_request(hctx);
> + }
> +
> if (!rq) {
> blk_mq_put_dispatch_budget(q, budget_token);
> /*

[...]

--
Damien Le Moal
Western Digital Research
Hi,

On 2025/08/11 8:44, Damien Le Moal wrote:
> On 8/6/25 17:57, Yu Kuai wrote:
>> [...]
>>
>> - rq = e->type->ops.dispatch_request(hctx);
>> + if (blk_queue_sq_sched(q)) {
>> + elevator_lock(e);
>> + rq = e->type->ops.dispatch_request(hctx);
>> + elevator_unlock(e);
>
> I do not think this is safe for bfq since bfq uses the irqsave/irqrestore spin
> lock variant. If it is safe, this needs a big comment block explaining why
> and/or the rules regarding the scheduler use of this lock.

It's correct; however, this patch doesn't change bfq yet, so the nesting is
like:

elevator_lock
    spin_lock_irq(&bfqd->lock)
    spin_unlock_irq(&bfqd->lock)
elevator_unlock

Patch 3 removes bfqd->lock and converts this to:

elevator_lock_irq
elevator_unlock_irq

Thanks,
Kuai
On 8/11/25 10:01, Yu Kuai wrote:
>>> [...]
>>> - rq = e->type->ops.dispatch_request(hctx);
>>> + if (blk_queue_sq_sched(q)) {
>>> + elevator_lock(e);
>>> + rq = e->type->ops.dispatch_request(hctx);
>>> + elevator_unlock(e);
>>
>> I do not think this is safe for bfq since bfq uses the irqsave/irqrestore spin
>> lock variant. If it is safe, this needs a big comment block explaining why
>> and/or the rules regarding the scheduler use of this lock.
>
> It's correct; however, this patch doesn't change bfq yet, so the nesting is
> like:
>
> elevator_lock
>     spin_lock_irq(&bfqd->lock)
>     spin_unlock_irq(&bfqd->lock)
> elevator_unlock
>
> Patch 3 removes bfqd->lock and converts this to:
>
> elevator_lock_irq
> elevator_unlock_irq

I do not understand. Since q->elevator->lock is already taken here, without IRQ
disabled, how can the bfq_dispatch_request method again take this same lock with
IRQ disabled? That cannot possibly work.

The other side of this is that if you use elevator_lock(e) to call
e->type->ops.dispatch_request(hctx), it means that the scheduler *can NOT* use
that same lock in its completion path, since that can be called from IRQ
context. This may not be needed/a problem right now, but I think this needs a
comment in this patch to mention this.

--
Damien Le Moal
Western Digital Research
Hi,

On 2025/08/11 11:53, Damien Le Moal wrote:
> On 8/11/25 10:01, Yu Kuai wrote:
>> [...]
>>
>> Patch 3 removes bfqd->lock and converts this to:
>>
>> elevator_lock_irq
>> elevator_unlock_irq
>
> I do not understand. Since q->elevator->lock is already taken here, without IRQ
> disabled, how can the bfq_dispatch_request method again take this same lock with
> IRQ disabled? That cannot possibly work.

Looks like there is still a misunderstanding somehow :( After patch 3,
bfq_dispatch_request doesn't grab any lock; the elevator lock is held before
calling into the dispatch method.

Before:

elevator_lock
    bfq_dispatch_request
        spin_lock_irq(&bfqd->lock)
        spin_unlock_irq(&bfqd->lock)
elevator_unlock

After:

elevator_lock_irq
    bfq_dispatch_request
elevator_unlock_irq

> The other side of this is that if you use elevator_lock(e) to call
> e->type->ops.dispatch_request(hctx), it means that the scheduler *can NOT* use
> that same lock in its completion path, since that can be called from IRQ
> context. This may not be needed/a problem right now, but I think this needs a
> comment in this patch to mention this.

So, the first patch just grabs the elevator lock around the dispatch method;
bfq dispatch is still using spin_lock_irq(&bfqd->lock), and completion is still
using bfqd->lock, so this is safe. Later, patch 3 removes bfqd->lock and uses
the elevator lock instead, and because the elevator lock will then be used in
the completion path, elevator_lock in the dispatch path is converted to
elevator_lock_irq.

Thanks,
Kuai
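[Editor's note: to make the "After" shape above concrete, a minimal sketch of what the
bfq dispatch method could look like once bfqd->lock is gone follows. This is not the
actual patch 3 code; it simply shows the method relying on the caller-held elevator lock
and asserting it with the macro added in this patch, assuming __bfq_dispatch_request as
the internal helper.]

```c
/*
 * Sketch only, assuming patch 3 removes bfqd->lock: the elevator lock is
 * taken by __blk_mq_do_dispatch_sched() before this method is called, so
 * the method asserts the lock instead of acquiring it.
 */
static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;

	/* Lock is held by the caller (__blk_mq_do_dispatch_sched()). */
	elevator_lock_assert_held(q->elevator);

	return __bfq_dispatch_request(hctx);
}
```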
On 8/11/25 13:25, Yu Kuai wrote:
> [...]
>
> Before:
>
> elevator_lock
>     bfq_dispatch_request
>         spin_lock_irq(&bfqd->lock)
>         spin_unlock_irq(&bfqd->lock)
> elevator_unlock
>
> After:
>
> elevator_lock_irq
>     bfq_dispatch_request
> elevator_unlock_irq

Ah, yes, I see it now.

But that is a nasty change that affects *all* schedulers, even those that do not
need to disable IRQs because they are not using the lock in their completion
path, e.g. mq-deadline. So I do not think that is acceptable.

--
Damien Le Moal
Western Digital Research
Hi,

On 2025/08/11 12:34, Damien Le Moal wrote:
> [...]
>
> Ah, yes, I see it now.
>
> But that is a nasty change that affects *all* schedulers, even those that do not
> need to disable IRQs because they are not using the lock in their completion
> path, e.g. mq-deadline. So I do not think that is acceptable.

OK. I did some benchmarks and didn't find any performance regression, so I
decided to use the irq version for deadline as well; perhaps I just didn't run
a test that shows it. I can add a flag in elevator_queue->flags like the
following to fix this:

diff --git a/block/elevator.h b/block/elevator.h
index 81f7700b0339..f04b9b2ae344 100644
--- a/block/elevator.h
+++ b/block/elevator.h
@@ -123,6 +123,7 @@ struct elevator_queue {
 #define ELEVATOR_FLAG_REGISTERED	0
 #define ELEVATOR_FLAG_DYING		1
 #define ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT	2
+#define ELEVATOR_FLAG_LOCK_AT_COMP	3

 #define elevator_lock(e)	spin_lock(&(e)->lock)
 #define elevator_unlock(e)	spin_unlock(&(e)->lock)
@@ -134,6 +135,22 @@ struct elevator_queue {
 	spin_unlock_irqrestore(&(e)->lock, flags)
 #define elevator_lock_assert_held(e)	lockdep_assert_held(&(e)->lock)

+static inline void elevator_dispatch_lock(struct elevator_queue *e)
+{
+	if (test_bit(ELEVATOR_FLAG_LOCK_AT_COMP, &e->flags))
+		elevator_lock_irq(e);
+	else
+		elevator_lock(e);
+}
+
+static inline void elevator_dispatch_unlock(struct elevator_queue *e)
+{
+	if (test_bit(ELEVATOR_FLAG_LOCK_AT_COMP, &e->flags))
+		elevator_unlock_irq(e);
+	else
+		elevator_unlock(e);
+}
+

Thanks,
Kuai
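[Editor's note: a sketch of how the flag-based helpers proposed above could be used at the
dispatch call site. This is illustrative only and not a posted patch; the wrapper name
blk_mq_sched_dispatch_one is hypothetical. The idea is that only schedulers which also take
the lock in their completion path set ELEVATOR_FLAG_LOCK_AT_COMP and thus pay the cost of
disabling IRQs.]

```c
/*
 * Illustrative only: dispatch one request, taking the elevator lock only
 * for single-queue schedulers and disabling IRQs only when the scheduler
 * has declared (via ELEVATOR_FLAG_LOCK_AT_COMP) that it also takes the
 * lock from its completion path.
 */
static struct request *blk_mq_sched_dispatch_one(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct elevator_queue *e = q->elevator;
	struct request *rq;

	if (!blk_queue_sq_sched(q))
		return e->type->ops.dispatch_request(hctx);

	elevator_dispatch_lock(e);
	rq = e->type->ops.dispatch_request(hctx);
	elevator_dispatch_unlock(e);

	return rq;
}
```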