[PATCH] blk-throttle: support io merge over iops_limit
Posted by Yu Kuai 11 months, 1 week ago
From: Yu Kuai <yukuai3@huawei.com>

Commit 9f5ede3c01f9 ("block: throttle split bio in case of iops limit")
added support for accounting split IO against the iops limit, because
the block layer provides io accounting for split bios.

However, io merge is still not handled, and the block layer does not
account merged io for iops. Fix this problem by decreasing io_disp when
a bio is merged, so that following IO can use the extra budget. If an io
merge races with iops throttling, the case where one bio more or one bio
less is dispatched is not handled; this is fine because, as long as a
new slice has not started, blk-throttle already preserves one extra
slice for deviation, and it is not worth handling the case where the
iops_limit rate is less than one per slice.
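
As a rough illustration of the accounting change (a simplified userspace
model with made-up numbers, not the kernel code): every bio is still
charged against the per-slice iops budget, and a bio that ends up merged
hands its charge back, so later bios can use the extra budget.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define SLICE_IOS	10	/* ios allowed in the current slice */

static atomic_int io_disp;	/* ios charged in the current slice */

/* roughly what throtl_charge_bio() does: charge every submitted bio */
static bool charge_bio(void)
{
	if (atomic_load(&io_disp) + 1 > SLICE_IOS)
		return false;	/* over budget, the bio would be throttled */
	atomic_fetch_add(&io_disp, 1);
	return true;
}

/* roughly what the new blk_throtl_bio_merge() does: a merged bio never
 * reaches the disk as a separate request, so hand its charge back */
static void merge_bio(void)
{
	atomic_fetch_sub(&io_disp, 1);
}

int main(void)
{
	int dispatched = 0, merged = 0;

	/* 15 bios, every second one merges into the previous request */
	for (int i = 0; i < 15; i++) {
		if (!charge_bio())
			break;
		dispatched++;
		if (i & 1) {
			merge_bio();
			merged++;
		}
	}
	printf("dispatched=%d merged=%d io_disp=%d (budget %d per slice)\n",
	       dispatched, merged, atomic_load(&io_disp), SLICE_IOS);
	return 0;
}

With the refund, all 15 bios fit within the 10-io budget even though the
disk only sees 8 requests; without it, the last 5 bios would be
throttled.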

A regression test will be added for this case [1]; before this patch,
the test fails:

+++ /root/blktests-mainline/results/nodev/throtl/007.out.bad
@@ -1,4 +1,4 @@
 Running throtl/007
 1
-1
+11
 Test complete

[1] https://lore.kernel.org/all/20250307080318.3860858-2-yukuai1@huaweicloud.com/

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-merge.c    |  3 +++
 block/blk-throttle.c | 20 ++++++++++----------
 block/blk-throttle.h | 19 +++++++++++++++++--
 3 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 15cd231d560c..8dc7add7c31e 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -988,6 +988,7 @@ enum bio_merge_status bio_attempt_back_merge(struct request *req,
 
 	trace_block_bio_backmerge(bio);
 	rq_qos_merge(req->q, req, bio);
+	blk_throtl_bio_merge(req->q, bio);
 
 	if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
 		blk_rq_set_mixed_merge(req);
@@ -1025,6 +1026,7 @@ static enum bio_merge_status bio_attempt_front_merge(struct request *req,
 
 	trace_block_bio_frontmerge(bio);
 	rq_qos_merge(req->q, req, bio);
+	blk_throtl_bio_merge(req->q, bio);
 
 	if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
 		blk_rq_set_mixed_merge(req);
@@ -1055,6 +1057,7 @@ static enum bio_merge_status bio_attempt_discard_merge(struct request_queue *q,
 		goto no_merge;
 
 	rq_qos_merge(q, req, bio);
+	blk_throtl_bio_merge(q, bio);
 
 	req->biotail->bi_next = bio;
 	req->biotail = bio;
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 91dab43c65ab..9fd613c79caa 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -477,7 +477,7 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
 		bool rw, unsigned long start)
 {
 	tg->bytes_disp[rw] = 0;
-	tg->io_disp[rw] = 0;
+	atomic_set(&tg->io_disp[rw], 0);
 
 	/*
 	 * Previous slice has expired. We must have trimmed it after last
@@ -500,7 +500,7 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw,
 {
 	if (clear) {
 		tg->bytes_disp[rw] = 0;
-		tg->io_disp[rw] = 0;
+		atomic_set(&tg->io_disp[rw], 0);
 	}
 	tg->slice_start[rw] = jiffies;
 	tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
@@ -623,10 +623,10 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
 	else
 		tg->bytes_disp[rw] = 0;
 
-	if ((int)tg->io_disp[rw] >= io_trim)
-		tg->io_disp[rw] -= io_trim;
+	if (atomic_read(&tg->io_disp[rw]) >= io_trim)
+		atomic_sub(io_trim, &tg->io_disp[rw]);
 	else
-		tg->io_disp[rw] = 0;
+		atomic_set(&tg->io_disp[rw], 0);
 
 	tg->slice_start[rw] += time_elapsed;
 
@@ -655,9 +655,9 @@ static void __tg_update_carryover(struct throtl_grp *tg, bool rw,
 			tg->bytes_disp[rw];
 	if (iops_limit != UINT_MAX)
 		*ios = calculate_io_allowed(iops_limit, jiffy_elapsed) -
-			tg->io_disp[rw];
+			atomic_read(&tg->io_disp[rw]);
 	tg->bytes_disp[rw] -= *bytes;
-	tg->io_disp[rw] -= *ios;
+	atomic_sub(*ios, &tg->io_disp[rw]);
 }
 
 static void tg_update_carryover(struct throtl_grp *tg)
@@ -691,7 +691,7 @@ static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio
 	/* Round up to the next throttle slice, wait time must be nonzero */
 	jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice);
 	io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd);
-	if (io_allowed > 0 && tg->io_disp[rw] + 1 <= io_allowed)
+	if (io_allowed > 0 && atomic_read(&tg->io_disp[rw]) + 1 <= io_allowed)
 		return 0;
 
 	/* Calc approx time to dispatch */
@@ -815,7 +815,7 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 	if (!bio_flagged(bio, BIO_BPS_THROTTLED))
 		tg->bytes_disp[rw] += bio_size;
 
-	tg->io_disp[rw]++;
+	atomic_inc(&tg->io_disp[rw]);
 }
 
 /**
@@ -1679,7 +1679,7 @@ bool __blk_throtl_bio(struct bio *bio)
 		   rw == READ ? 'R' : 'W',
 		   tg->bytes_disp[rw], bio->bi_iter.bi_size,
 		   tg_bps_limit(tg, rw),
-		   tg->io_disp[rw], tg_iops_limit(tg, rw),
+		   atomic_read(&tg->io_disp[rw]), tg_iops_limit(tg, rw),
 		   sq->nr_queued[READ], sq->nr_queued[WRITE]);
 
 	td->nr_queued[rw]++;
diff --git a/block/blk-throttle.h b/block/blk-throttle.h
index 7964cc041e06..7e5f50c6bb19 100644
--- a/block/blk-throttle.h
+++ b/block/blk-throttle.h
@@ -104,7 +104,7 @@ struct throtl_grp {
 	/* Number of bytes dispatched in current slice */
 	int64_t bytes_disp[2];
 	/* Number of bio's dispatched in current slice */
-	int io_disp[2];
+	atomic_t io_disp[2];
 
 	/*
 	 * The following two fields are updated when new configuration is
@@ -144,6 +144,8 @@ static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
 static inline void blk_throtl_exit(struct gendisk *disk) { }
 static inline bool blk_throtl_bio(struct bio *bio) { return false; }
 static inline void blk_throtl_cancel_bios(struct gendisk *disk) { }
+static inline void blk_throtl_bio_merge(struct request_queue *q,
+					struct bio *bio) { }
 #else /* CONFIG_BLK_DEV_THROTTLING */
 void blk_throtl_exit(struct gendisk *disk);
 bool __blk_throtl_bio(struct bio *bio);
@@ -189,12 +191,25 @@ static inline bool blk_should_throtl(struct bio *bio)
 
 static inline bool blk_throtl_bio(struct bio *bio)
 {
-
 	if (!blk_should_throtl(bio))
 		return false;
 
 	return __blk_throtl_bio(bio);
 }
+
+static inline void blk_throtl_bio_merge(struct request_queue *q,
+					struct bio *bio)
+{
+	struct throtl_grp *tg;
+	int rw = bio_data_dir(bio);
+
+	if (!blk_throtl_activated(bio->bi_bdev->bd_queue))
+		return;
+
+	tg = blkg_to_tg(bio->bi_blkg);
+	if (tg->has_rules_iops[rw])
+		atomic_dec(&tg->io_disp[rw]);
+}
 #endif /* CONFIG_BLK_DEV_THROTTLING */
 
 #endif
-- 
2.39.2
Re: [PATCH] blk-throttle: support io merge over iops_limit
Posted by Tejun Heo 11 months, 1 week ago
Hello,

On Fri, Mar 07, 2025 at 05:01:52PM +0800, Yu Kuai wrote:
> From: Yu Kuai <yukuai3@huawei.com>
> 
> Commit 9f5ede3c01f9 ("block: throttle split bio in case of iops limit")
> added support for accounting split IO against the iops limit, because
> the block layer provides io accounting for split bios.
> 
> However, io merge is still not handled, and the block layer does not
> account merged io for iops. Fix this problem by decreasing io_disp when
> a bio is merged, so that following IO can use the extra budget. If an io
> merge races with iops throttling, the case where one bio more or one bio
> less is dispatched is not handled; this is fine because, as long as a
> new slice has not started, blk-throttle already preserves one extra
> slice for deviation, and it is not worth handling the case where the
> iops_limit rate is less than one per slice.
> 
> A regression test will be added for this case [1]; before this patch,
> the test fails:
> 
> +++ /root/blktests-mainline/results/nodev/throtl/007.out.bad
> @@ -1,4 +1,4 @@
>  Running throtl/007
>  1
> -1
> +11
>  Test complete
> 
> [1] https://lore.kernel.org/all/20250307080318.3860858-2-yukuai1@huaweicloud.com/

For blk-throtl, iops limit has meant the number of bios issued. I'm not
necessarily against this change but this is significantly changing what a
given configuration means. Also, if we're now doing hardware request based
throttling, maybe we should just move this under rq-qos. That has the
problem of not supporting bio-based drivers but maybe we can leave
blk-throtl in deprecation mode and slowly phase it out.

Also, can you please make atomic_t conversion a separate patch and describe
why that's being done?

Thanks.

-- 
tejun
Re: [PATCH] blk-throttle: support io merge over iops_limit
Posted by Yu Kuai 11 months ago
Hi,

On 2025/03/08 0:07, Tejun Heo wrote:
> Hello,
> 
> On Fri, Mar 07, 2025 at 05:01:52PM +0800, Yu Kuai wrote:
>> From: Yu Kuai <yukuai3@huawei.com>
>>
>> Commit 9f5ede3c01f9 ("block: throttle split bio in case of iops limit")
>> added support for accounting split IO against the iops limit, because
>> the block layer provides io accounting for split bios.
>>
>> However, io merge is still not handled, and the block layer does not
>> account merged io for iops. Fix this problem by decreasing io_disp when
>> a bio is merged, so that following IO can use the extra budget. If an io
>> merge races with iops throttling, the case where one bio more or one bio
>> less is dispatched is not handled; this is fine because, as long as a
>> new slice has not started, blk-throttle already preserves one extra
>> slice for deviation, and it is not worth handling the case where the
>> iops_limit rate is less than one per slice.
>>
>> A regression test will be added for this case [1]; before this patch,
>> the test fails:
>>
>> +++ /root/blktests-mainline/results/nodev/throtl/007.out.bad
>> @@ -1,4 +1,4 @@
>>   Running throtl/007
>>   1
>> -1
>> +11
>>   Test complete
>>
>> [1] https://lore.kernel.org/all/20250307080318.3860858-2-yukuai1@huaweicloud.com/
> 
> For blk-throtl, iops limit has meant the number of bios issued. I'm not

Yes, but it's a little hard to explain to users the difference between
IO split and IO merge; they just think of IO split as the number of IOs
issued to the disk, and of IO merge as the number of IOs issued from the
user.

> necessarily against this change but this is significantly changing what a
> given configuration means. Also, if we're now doing hardware request based
> throttling, maybe we should just move this under rq-qos. That has the
> problem of not supporting bio-based drivers but maybe we can leave
> blk-throtl in deprecation mode and slowly phase it out.

Yes, we discussed this before.

And there is another angle that might convince you of the patch: if the
user workload triggers a lot of IO merging, and the iops limit is above
the actual iops seen on the disk, then before this patch blk-throttle
makes IO merging impossible, resulting in performance degradation.

> 
> Also, can you please make atomic_t conversion a separate patch and describe
> why that's being done?

Of course. The reason is that the new helper will decrease the counter
outside the lock.

Thanks,
Kuai

> 
> Thanks.
> 

Re: [PATCH] blk-throttle: support io merge over iops_limit
Posted by Ming Lei 11 months ago
On Mon, Mar 10, 2025 at 09:58:57AM +0800, Yu Kuai wrote:
> Hi,
> 
> On 2025/03/08 0:07, Tejun Heo wrote:
> > Hello,
> > 
> > On Fri, Mar 07, 2025 at 05:01:52PM +0800, Yu Kuai wrote:
> > > From: Yu Kuai <yukuai3@huawei.com>
> > > 
> > > Commit 9f5ede3c01f9 ("block: throttle split bio in case of iops limit")
> > > added support for accounting split IO against the iops limit, because
> > > the block layer provides io accounting for split bios.
> > >
> > > However, io merge is still not handled, and the block layer does not
> > > account merged io for iops. Fix this problem by decreasing io_disp when
> > > a bio is merged, so that following IO can use the extra budget. If an io
> > > merge races with iops throttling, the case where one bio more or one bio
> > > less is dispatched is not handled; this is fine because, as long as a
> > > new slice has not started, blk-throttle already preserves one extra
> > > slice for deviation, and it is not worth handling the case where the
> > > iops_limit rate is less than one per slice.
> > > 
> > > A regression test will be added for this case [1]; before this patch,
> > > the test fails:
> > > 
> > > +++ /root/blktests-mainline/results/nodev/throtl/007.out.bad
> > > @@ -1,4 +1,4 @@
> > >   Running throtl/007
> > >   1
> > > -1
> > > +11
> > >   Test complete
> > > 
> > > [1] https://lore.kernel.org/all/20250307080318.3860858-2-yukuai1@huaweicloud.com/
> > 
> > For blk-throtl, iops limit has meant the number of bios issued. I'm not
> 
> Yes, but it's a little hard to explain to users the difference between
> IO split and IO merge; they just think of IO split as the number of IOs
> issued to the disk, and of IO merge as the number of IOs issued from the
> user.

This is really where the trouble lies.

a) Sometimes people think the IOs in IOPS mean the read/write IOs
submitted from the application; one typical example is `fio`.

b) Sometimes people think it is the data observed from `iostat`.

In a), io merge/split isn't taken into account, but b) does cover io
merge and split.

So the question is: what is the correct way for users to observe IOPS
wrt. the iops throttle?
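
Both views can be read directly from /sys/block/<dev>/stat, where
completed I/Os and merges are separate fields, so b) is the completed
count and a) can be approximated as completed + merged (ignoring
splits). A minimal sketch, assuming the device name is passed as argv[1]
(with "sda" only as a placeholder default):

#include <stdio.h>

int main(int argc, char **argv)
{
	unsigned long long r_ios, r_merges, r_sectors, r_ticks;
	unsigned long long w_ios, w_merges, w_sectors, w_ticks;
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/block/%s/stat",
		 argc > 1 ? argv[1] : "sda");
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	/* first eight fields: read ios, read merges, read sectors, read
	 * ticks, then the same four for writes */
	if (fscanf(f, "%llu %llu %llu %llu %llu %llu %llu %llu",
		   &r_ios, &r_merges, &r_sectors, &r_ticks,
		   &w_ios, &w_merges, &w_sectors, &w_ticks) != 8) {
		fprintf(stderr, "unexpected format in %s\n", path);
		fclose(f);
		return 1;
	}
	fclose(f);

	printf("reads : completed=%llu merged=%llu\n", r_ios, r_merges);
	printf("writes: completed=%llu merged=%llu\n", w_ios, w_merges);
	return 0;
}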


Thanks,
Ming

Re: [PATCH] blk-throttle: support io merge over iops_limit
Posted by Tejun Heo 11 months ago
Hello,

On Mon, Mar 10, 2025 at 10:16:46AM +0800, Ming Lei wrote:
...
> > Yes, but it's a little hard to explain to users the difference between
> > IO split and IO merge; they just think of IO split as the number of IOs
> > issued to the disk, and of IO merge as the number of IOs issued from the
> > user.
> 
> This is really where the trouble lies.
> 
> a) Sometimes people think the IOs in IOPS mean the read/write IOs
> submitted from the application; one typical example is `fio`.
> 
> b) Sometimes people think it is the data observed from `iostat`.
> 
> In a), io merge/split isn't taken into account, but b) does cover io
> merge and split.
> 
> So the question is: what is the correct way for users to observe IOPS
> wrt. the iops throttle?

If we could go back in time, I think the right metric to use is
hardware-seen IOPS as that's better correlated with the actual capacity
available (the correlation is very flawed but still better) and the number
of issued bio's can easily change depending on kernel implementation
details.

That said, I'm not sure about changing the behavior now. blk-throtl has
mostly used the number of bios as long as it has existed and given that
there can be a significant difference between the two metrics, I'm not sure
the change is justified at this point.

Thanks.

-- 
tejun
Re: [PATCH] blk-throttle: support io merge over iops_limit
Posted by Yu Kuai 11 months ago
Hi,

On 2025/03/10 23:53, Tejun Heo wrote:
> Hello,
> 
> On Mon, Mar 10, 2025 at 10:16:46AM +0800, Ming Lei wrote:
> ...
>>> Yes, but it's a little hard to explain to users the difference between
>>> IO split and IO merge; they just think of IO split as the number of IOs
>>> issued to the disk, and of IO merge as the number of IOs issued from the
>>> user.
>>
>> This is really where the trouble lies.
>>
>> a) Sometimes people think the IOs in IOPS mean the read/write IOs
>> submitted from the application; one typical example is `fio`.
>>
>> b) Sometimes people think it is the data observed from `iostat`.
>>
>> In a), io merge/split isn't taken into account, but b) does cover io
>> merge and split.
>>
>> So the question is: what is the correct way for users to observe IOPS
>> wrt. the iops throttle?
> 
> If we could go back in time, I think the right metric to use is
> hardware-seen IOPS as that's better correlated with the actual capacity
> available (the correlation is very flawed but still better) and the number
> of issued bio's can easily change depending on kernel implementation
> details.

Yes, I agree the right metric is b), which covers IO merge and split
(and also filesystem metadata and log IO).

> 
> That said, I'm not sure about changing the behavior now. blk-throtl has
> mostly used the number of bios as long as it has existed and given that
> there can be a significant difference between the two metrics, I'm not sure
> the change is justified at this point.

If we are really concerned about the behavior change, can we consider a
new flag that switches to the old behavior? We'll see whether any user
complains.

Thanks,
Kuai

> 
> Thanks.
> 

Re: [PATCH] blk-throttle: support io merge over iops_limit
Posted by Tejun Heo 11 months ago
Hello,

On Tue, Mar 11, 2025 at 11:08:00AM +0800, Yu Kuai wrote:
...
> > That said, I'm not sure about changing the behavior now. blk-throtl has
> > mostly used the number of bios as long as it has existed and given that
> > there can be a significant difference between the two metrics, I'm not sure
> > the change is justified at this point.
> 
> If we are really concerned about the behavior change, can we consider a
> new flag that switches to the old behavior? We'll see whether any user
> complains.

Yeah, that may be the right way to go about this, but let me turn this
around and ask you why adding a new behavior would be a good idea. What
problems are you trying to solve?

Thanks.

-- 
tejun
Re: [PATCH] blk-throttle: support io merge over iops_limit
Posted by Yu Kuai 11 months ago
Hi,

On 2025/03/12 3:34, Tejun Heo wrote:
> Hello,
> 
> On Tue, Mar 11, 2025 at 11:08:00AM +0800, Yu Kuai wrote:
> ...
>>> That said, I'm not sure about changing the behavior now. blk-throtl has
>>> mostly used the number of bios as long as it has existed and given that
>>> there can be a significant difference between the two metrics, I'm not sure
>>> the change is justified at this point.
>>
>> If we are really concerned about the behavior change, can we consider a
>> new flag that switches to the old behavior? We'll see whether any user
>> complains.
> 
> Yeah, that may be the right way to go about this, but let me turn this
> around and ask you why adding a new behavior would be a good idea. What
> problems are you trying to solve?

In the case of dirty page writeback, a BIO is 4k, while an RQ can be up
to hw_sectors_kb. Our users are limiting iops based on the real disk
capacity, and they found that BIO merging gets broken.

The ideal way really is an rq-qos based iops limit, which is applied
after BIO merging, so merging is guaranteed not to be broken. For now, I
have to suggest they set a high iops limit or just remove the iops
limit.
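
To put rough numbers on this (all values assumed for illustration: 4k
writeback bios, a 512k request size limit, and an iops limit of 2000
chosen to match the disk's request-level capability):

#include <stdio.h>

int main(void)
{
	unsigned int bio_kb = 4;	/* writeback bio size */
	unsigned int rq_kb = 512;	/* assumed request size limit */
	unsigned int iops_limit = 2000;	/* assumed limit = disk capability */

	/* bio-based accounting: merging is defeated (as reported above),
	 * so every 4k bio is paced individually */
	printf("bio-based limit:   ~%u KiB/s (%u x %uk bios)\n",
	       iops_limit * bio_kb, iops_limit, bio_kb);
	/* merge-aware accounting: merged bios are refunded, so the limit
	 * effectively applies to requests reaching the disk */
	printf("merge-aware limit: up to ~%u KiB/s (%u x %uk requests)\n",
	       iops_limit * rq_kb, iops_limit, rq_kb);
	printf("merge factor:      up to %u bios per request\n",
	       rq_kb / bio_kb);
	return 0;
}

The two accountings can differ by up to the merge factor (128x with these
assumed values), which is the gap those users run into.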

Thanks,
Kuai

> 
> Thanks.
> 

Re: [PATCH] blk-throttle: support io merge over iops_limit
Posted by Tejun Heo 11 months ago
Hello,

On Wed, Mar 12, 2025 at 09:51:30AM +0800, Yu Kuai wrote:
...
> In the case of dirty page writeback, a BIO is 4k, while an RQ can be up
> to hw_sectors_kb. Our users are limiting iops based on the real disk
> capacity, and they found that BIO merging gets broken.
> 
> The ideal way really is an rq-qos based iops limit, which is applied
> after BIO merging, so merging is guaranteed not to be broken. For now, I
> have to suggest they set a high iops limit or just remove the iops
> limit.

I get that that particular situation may be worked around with what you're
suggesting but you should be able to see that this would create the exact
opposite problem for people who are limiting by the IOs they issue, which
would be the majority of the existing users, so I don't think we can flip
the meaning of the existing knobs.

re. introducing new knobs or a switch, one thing to consider is that
independent iops limits are not that useful to begin with. A device's iops
capacity can vary drastically depending on e.g. IO sizes and there usually
is no one good iops limit value that both doesn't get in the way and
isolates the impact on other users, so it does feel like trying to polish
something which is fundamentally flawed.

Whether bio or rq based, can you actually achieve meaningful isolation with
blk-throtl's iops/bw limits? If switching to rq based (or something
approximating that) substantially improves the situation, adding new sets of
knobs would make sense, but I'm skeptical this will be all that useful. If
this is just going to be a coarse safety mechanism to guard against things
going completely out of hand or to throttle already known IO patterns,
whether the limits are based on bio or rq doesn't make a whole lot of
difference.

Thanks.

-- 
tejun
Re: [PATCH] blk-throttle: support io merge over iops_limit
Posted by Yu Kuai 10 months, 3 weeks ago
Hi, Tejun!

On 2025/03/12 23:41, Tejun Heo wrote:
> Hello,
> 
> On Wed, Mar 12, 2025 at 09:51:30AM +0800, Yu Kuai wrote:
> ...
>> In the case of dirty page writeback, a BIO is 4k, while an RQ can be up
>> to hw_sectors_kb. Our users are limiting iops based on the real disk
>> capacity, and they found that BIO merging gets broken.
>>
>> The ideal way really is an rq-qos based iops limit, which is applied
>> after BIO merging, so merging is guaranteed not to be broken. For now, I
>> have to suggest they set a high iops limit or just remove the iops
>> limit.
> 

My apologies for the late reply.

> I get that that particular situation may be worked around with what you're
> suggesting but you should be able to see that this would create the exact
> opposite problem for people who are limiting by the IOs they issue, which
> would be the majority of the existing users, so I don't think we can flip
> the meaning of the existing knobs.

Yes, I understand the current situation. I just feel blk-throttle does
not really have the ability to limit IOs as they are issued from the
user: there can also be a filesystem in between, and data blocks can be
fragmented.
> 
> re. introducing new knobs or a switch, one thing to consider is that
> independent iops limits are not that useful to begin with. A device's iops
> capacity can vary drastically depending on e.g. IO sizes and there usually
> is no one good iops limit value that both doesn't get in the way and
> isolates the impact on other users, so it does feel like trying to polish
> something which is fundamentally flawed.

It's not only fundamentally flawed; the implementation is also too
one-sided. So what is the next step?

- remove the iops limit since it's not that useful;
- switch the iops limit to counting against the disk;
- do nothing?
> 
> Whether bio or rq based, can you actually achieve meaningful isolation with
> blk-throtl's iops/bw limits? If switching to rq based (or something
> approximating that) substantially improves the situation, adding new sets of
> knobs would make sense, but I'm skeptical this will be all that useful. If
> this is just going to be a coarse safety mechanism to guard against things
> going completely out of hand or to throttle already known IO patterns,
> whether the limits are based on bio or rq doesn't make a whole lot of
> difference.

Most of our users will just set a meaningful bps limit; however, since
the iops limit is supported, they will set it as well, without much
knowledge of how it really works, causing some unexpected behavior. For
now, we'll suggest not setting an iops limit at all, not even a high
one.

Thanks,
Kuai

> 
> Thanks.
>