From: Yu Kuai <yukuai3@huawei.com>
- rename part_in_{flight, flight_rw} and to bdev_count_{inflight,
inflight_rw}.
- export bdev_count_inflight_rw, to fix a problem in mdraid that foreground
IO can be starved by background sync IO in later patches.
- reuse bdev_count_inflight_rw for bdev_count_inflight, also add warning if
inflight is negative, means related driver is buggy.
- remove unused blk_mq_in_flight
- rename blk_mq_in_flight_rw to blk_mq_count_in_driver_rw, to distinguish
from bdev_count_inflight_rw.
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
block/blk-core.c | 2 +-
block/blk-mq.c | 15 +++---------
block/blk-mq.h | 7 +++---
block/blk.h | 1 -
block/genhd.c | 48 +++++++++++++++++++--------------------
include/linux/part_stat.h | 10 ++++++++
6 files changed, 40 insertions(+), 43 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index 4623de79effa..ead2072eb3bd 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1018,7 +1018,7 @@ void update_io_ticks(struct block_device *part, unsigned long now, bool end)
stamp = READ_ONCE(part->bd_stamp);
if (unlikely(time_after(now, stamp)) &&
likely(try_cmpxchg(&part->bd_stamp, &stamp, now)) &&
- (end || part_in_flight(part)))
+ (end || bdev_count_inflight(part)))
__part_stat_add(part, io_ticks, now - stamp);
if (bdev_is_partition(part)) {
diff --git a/block/blk-mq.c b/block/blk-mq.c
index c2697db59109..a2bd77601623 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -101,18 +101,9 @@ static bool blk_mq_check_inflight(struct request *rq, void *priv)
return true;
}
-unsigned int blk_mq_in_flight(struct request_queue *q,
- struct block_device *part)
-{
- struct mq_inflight mi = { .part = part };
-
- blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
-
- return mi.inflight[0] + mi.inflight[1];
-}
-
-void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
- unsigned int inflight[2])
+void blk_mq_count_in_driver_rw(struct request_queue *q,
+ struct block_device *part,
+ unsigned int inflight[2])
{
struct mq_inflight mi = { .part = part };
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 3011a78cf16a..3897522f014f 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -246,10 +246,9 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
return hctx->nr_ctx && hctx->tags;
}
-unsigned int blk_mq_in_flight(struct request_queue *q,
- struct block_device *part);
-void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
- unsigned int inflight[2]);
+void blk_mq_count_in_driver_rw(struct request_queue *q,
+ struct block_device *part,
+ unsigned int inflight[2]);
static inline void blk_mq_put_dispatch_budget(struct request_queue *q,
int budget_token)
diff --git a/block/blk.h b/block/blk.h
index 006e3be433d2..f476f233f195 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -418,7 +418,6 @@ void blk_apply_bdi_limits(struct backing_dev_info *bdi,
int blk_dev_init(void);
void update_io_ticks(struct block_device *part, unsigned long now, bool end);
-unsigned int part_in_flight(struct block_device *part);
static inline void req_set_nomerge(struct request_queue *q, struct request *req)
{
diff --git a/block/genhd.c b/block/genhd.c
index c2bd86cd09de..8329bd539be3 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -125,37 +125,35 @@ static void part_stat_read_all(struct block_device *part,
}
}
-unsigned int part_in_flight(struct block_device *part)
-{
- unsigned int inflight = 0;
- int cpu;
-
- for_each_possible_cpu(cpu) {
- inflight += part_stat_local_read_cpu(part, in_flight[0], cpu) +
- part_stat_local_read_cpu(part, in_flight[1], cpu);
- }
- if ((int)inflight < 0)
- inflight = 0;
-
- return inflight;
-}
-
-static void part_in_flight_rw(struct block_device *part,
- unsigned int inflight[2])
+/**
+ * bdev_count_inflight_rw - get the number of inflight IOs for a block device.
+ *
+ * @bdev: the block device.
+ * @inflight: return number of inflight IOs, @inflight[0] for read and
+ * @inflight[1] for write.
+ *
+ * For bio-based block device, IO is inflight from bdev_start_io_acct(), and for
+ * rq-based block device, IO is inflight from blk_account_io_start().
+ *
+ * Noted, for rq-based block device, use blk_mq_count_in_driver_rw() to get the
+ * number of requests issued to driver.
+ */
+void bdev_count_inflight_rw(struct block_device *bdev, unsigned int inflight[2])
{
int cpu;
inflight[0] = 0;
inflight[1] = 0;
for_each_possible_cpu(cpu) {
- inflight[0] += part_stat_local_read_cpu(part, in_flight[0], cpu);
- inflight[1] += part_stat_local_read_cpu(part, in_flight[1], cpu);
+ inflight[0] += part_stat_local_read_cpu(bdev, in_flight[0], cpu);
+ inflight[1] += part_stat_local_read_cpu(bdev, in_flight[1], cpu);
}
- if ((int)inflight[0] < 0)
+ if (WARN_ON_ONCE((int)inflight[0] < 0))
inflight[0] = 0;
- if ((int)inflight[1] < 0)
+ if (WARN_ON_ONCE((int)inflight[1] < 0))
inflight[1] = 0;
}
+EXPORT_SYMBOL_GPL(bdev_count_inflight_rw);
/*
* Can be deleted altogether. Later.
@@ -1005,7 +1003,7 @@ ssize_t part_stat_show(struct device *dev,
struct disk_stats stat;
unsigned int inflight;
- inflight = part_in_flight(bdev);
+ inflight = bdev_count_inflight(bdev);
if (inflight) {
part_stat_lock();
update_io_ticks(bdev, jiffies, true);
@@ -1050,9 +1048,9 @@ ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
unsigned int inflight[2];
if (queue_is_mq(q))
- blk_mq_in_flight_rw(q, bdev, inflight);
+ blk_mq_count_in_driver_rw(q, bdev, inflight);
else
- part_in_flight_rw(bdev, inflight);
+ bdev_count_inflight_rw(bdev, inflight);
return sysfs_emit(buf, "%8u %8u\n", inflight[0], inflight[1]);
}
@@ -1307,7 +1305,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
if (bdev_is_partition(hd) && !bdev_nr_sectors(hd))
continue;
- inflight = part_in_flight(hd);
+ inflight = bdev_count_inflight(hd);
if (inflight) {
part_stat_lock();
update_io_ticks(hd, jiffies, true);
diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h
index c5e9cac0575e..6248e59dc891 100644
--- a/include/linux/part_stat.h
+++ b/include/linux/part_stat.h
@@ -79,4 +79,14 @@ static inline void part_stat_set_all(struct block_device *part, int value)
#define part_stat_local_read_cpu(part, field, cpu) \
local_read(&(part_stat_get_cpu(part, field, cpu)))
+void bdev_count_inflight_rw(struct block_device *bdev, unsigned int inflight[2]);
+
+static inline unsigned int bdev_count_inflight(struct block_device *bdev)
+{
+ unsigned int inflight[2];
+
+ bdev_count_inflight_rw(bdev, inflight);
+
+ return inflight[0] + inflight[1];
+}
#endif /* _LINUX_PART_STAT_H */
--
2.39.2
On Fri, Apr 18, 2025 at 09:09:37AM +0800, Yu Kuai wrote:
> - remove unused blk_mq_in_flight
That should probably be a separate patch.
> - rename blk_mq_in_flight_rw to blk_mq_count_in_driver_rw, to distinguish
> from bdev_count_inflight_rw.
I'm not sure why this is needed or related, or even what additional
distinction is added here.
> -
> -void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
> - unsigned int inflight[2])
> +void blk_mq_count_in_driver_rw(struct request_queue *q,
> + struct block_device *part,
> + unsigned int inflight[2])
Any reason to move away from two tab indents for the prototype
continuations in various places in this patch?
> + * Noted, for rq-based block device, use blk_mq_count_in_driver_rw() to get the
> + * number of requests issued to driver.
I'd just change this helper to call blk_mq_count_in_driver_rw for
blk-mq devices and remove the conditional from the sysfs code instead.
That gives us a much more robust and easier to understand API.
> +void bdev_count_inflight_rw(struct block_device *bdev, unsigned int inflight[2]);
Overly long line.
> +static inline unsigned int bdev_count_inflight(struct block_device *bdev)
> +{
> + unsigned int inflight[2];
> +
> + bdev_count_inflight_rw(bdev, inflight);
> +
> + return inflight[0] + inflight[1];
> +}
> #endif /* _LINUX_PART_STAT_H */
Maybe keep this inside of block as it should not not be used by
drivers? Also the reimplementation should probably be a separate
patch from the public API change and exporting.
Hi,
在 2025/04/21 19:59, Christoph Hellwig 写道:
> On Fri, Apr 18, 2025 at 09:09:37AM +0800, Yu Kuai wrote:
>> - remove unused blk_mq_in_flight
>
> That should probably be a separate patch.
ok
>
>> - rename blk_mq_in_flight_rw to blk_mq_count_in_driver_rw, to distinguish
>> from bdev_count_inflight_rw.
>
> I'm not sure why this is needed or related, or even what additional
> distinction is added here.
Because for rq-based device, there are two different stage,
blk_account_io_start() while allocating new rq, and
blk_mq_start_request() while issuing the rq to driver.
When will we think the reqeust is inflight? For iostat, my anser is the
former one, because rq->start_time_ns is set here as well. And noted in
iostats api diskstats_show(/proc/diskstats) and part_stat_show
(/sys/block/sda/stat), inflight is get by part_in_flight, which is
different from disk sysfs api(/sys/block/sda/inflight).
>
>> -
>> -void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
>> - unsigned int inflight[2])
>> +void blk_mq_count_in_driver_rw(struct request_queue *q,
>> + struct block_device *part,
>> + unsigned int inflight[2])
>
> Any reason to move away from two tab indents for the prototype
> continuations in various places in this patch?
>
>> + * Noted, for rq-based block device, use blk_mq_count_in_driver_rw() to get the
>> + * number of requests issued to driver.
>
> I'd just change this helper to call blk_mq_count_in_driver_rw for
> blk-mq devices and remove the conditional from the sysfs code instead.
> That gives us a much more robust and easier to understand API.
Ok, and another separate patch, right?
>
>> +void bdev_count_inflight_rw(struct block_device *bdev, unsigned int inflight[2]);
>
> Overly long line.
>
>> +static inline unsigned int bdev_count_inflight(struct block_device *bdev)
>> +{
>> + unsigned int inflight[2];
>> +
>> + bdev_count_inflight_rw(bdev, inflight);
>> +
>> + return inflight[0] + inflight[1];
>> +}
>> #endif /* _LINUX_PART_STAT_H */
>
> Maybe keep this inside of block as it should not not be used by
> drivers? Also the reimplementation should probably be a separate
> patch from the public API change and exporting.
ok, and I should probably send the first set just related to this patch
first.
Thanks,
Kuai
>
> .
>
On Mon, Apr 21, 2025 at 09:13:57PM +0800, Yu Kuai wrote: > > I'm not sure why this is needed or related, or even what additional > > distinction is added here. > > Because for rq-based device, there are two different stage, > blk_account_io_start() while allocating new rq, and > blk_mq_start_request() while issuing the rq to driver. > > When will we think the reqeust is inflight? For iostat, my anser is the > former one, because rq->start_time_ns is set here as well. And noted in > iostats api diskstats_show(/proc/diskstats) and part_stat_show > (/sys/block/sda/stat), inflight is get by part_in_flight, which is > different from disk sysfs api(/sys/block/sda/inflight). Trying to express this in a not very obvious function name isn't going to work very well. Documenting your findings in comments is much better. > > > > I'd just change this helper to call blk_mq_count_in_driver_rw for > > blk-mq devices and remove the conditional from the sysfs code instead. > > That gives us a much more robust and easier to understand API. > > Ok, and another separate patch, right? Yes.
© 2016 - 2025 Red Hat, Inc.