[PATCH v2 1/5] block: cleanup and export bdev IO inflight APIs

Yu Kuai posted 5 patches 8 months ago
There is a newer version of this series
[PATCH v2 1/5] block: cleanup and export bdev IO inflight APIs
Posted by Yu Kuai 8 months ago
From: Yu Kuai <yukuai3@huawei.com>

- rename part_in_{flight, flight_rw} and to bdev_count_{inflight,
  inflight_rw}.
- export bdev_count_inflight_rw, to fix a problem in mdraid that foreground
  IO can be starved by background sync IO in later patches.
- reuse bdev_count_inflight_rw for bdev_count_inflight, also add warning if
  inflight is negative, means related driver is buggy.
- remove unused blk_mq_in_flight
- rename blk_mq_in_flight_rw to blk_mq_count_in_driver_rw, to distinguish
  from bdev_count_inflight_rw.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-core.c          |  2 +-
 block/blk-mq.c            | 15 +++---------
 block/blk-mq.h            |  7 +++---
 block/blk.h               |  1 -
 block/genhd.c             | 48 +++++++++++++++++++--------------------
 include/linux/part_stat.h | 10 ++++++++
 6 files changed, 40 insertions(+), 43 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 4623de79effa..ead2072eb3bd 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1018,7 +1018,7 @@ void update_io_ticks(struct block_device *part, unsigned long now, bool end)
 	stamp = READ_ONCE(part->bd_stamp);
 	if (unlikely(time_after(now, stamp)) &&
 	    likely(try_cmpxchg(&part->bd_stamp, &stamp, now)) &&
-	    (end || part_in_flight(part)))
+	    (end || bdev_count_inflight(part)))
 		__part_stat_add(part, io_ticks, now - stamp);
 
 	if (bdev_is_partition(part)) {
diff --git a/block/blk-mq.c b/block/blk-mq.c
index c2697db59109..a2bd77601623 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -101,18 +101,9 @@ static bool blk_mq_check_inflight(struct request *rq, void *priv)
 	return true;
 }
 
-unsigned int blk_mq_in_flight(struct request_queue *q,
-		struct block_device *part)
-{
-	struct mq_inflight mi = { .part = part };
-
-	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
-
-	return mi.inflight[0] + mi.inflight[1];
-}
-
-void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
-		unsigned int inflight[2])
+void blk_mq_count_in_driver_rw(struct request_queue *q,
+			       struct block_device *part,
+			       unsigned int inflight[2])
 {
 	struct mq_inflight mi = { .part = part };
 
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 3011a78cf16a..3897522f014f 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -246,10 +246,9 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
 	return hctx->nr_ctx && hctx->tags;
 }
 
-unsigned int blk_mq_in_flight(struct request_queue *q,
-		struct block_device *part);
-void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
-		unsigned int inflight[2]);
+void blk_mq_count_in_driver_rw(struct request_queue *q,
+			       struct block_device *part,
+			       unsigned int inflight[2]);
 
 static inline void blk_mq_put_dispatch_budget(struct request_queue *q,
 					      int budget_token)
diff --git a/block/blk.h b/block/blk.h
index 006e3be433d2..f476f233f195 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -418,7 +418,6 @@ void blk_apply_bdi_limits(struct backing_dev_info *bdi,
 int blk_dev_init(void);
 
 void update_io_ticks(struct block_device *part, unsigned long now, bool end);
-unsigned int part_in_flight(struct block_device *part);
 
 static inline void req_set_nomerge(struct request_queue *q, struct request *req)
 {
diff --git a/block/genhd.c b/block/genhd.c
index c2bd86cd09de..8329bd539be3 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -125,37 +125,35 @@ static void part_stat_read_all(struct block_device *part,
 	}
 }
 
-unsigned int part_in_flight(struct block_device *part)
-{
-	unsigned int inflight = 0;
-	int cpu;
-
-	for_each_possible_cpu(cpu) {
-		inflight += part_stat_local_read_cpu(part, in_flight[0], cpu) +
-			    part_stat_local_read_cpu(part, in_flight[1], cpu);
-	}
-	if ((int)inflight < 0)
-		inflight = 0;
-
-	return inflight;
-}
-
-static void part_in_flight_rw(struct block_device *part,
-		unsigned int inflight[2])
+/**
+ * bdev_count_inflight_rw - get the number of inflight IOs for a block device.
+ *
+ * @bdev:	the block device.
+ * @inflight:	return number of inflight IOs, @inflight[0] for read and
+ *		@inflight[1] for write.
+ *
+ * For bio-based block device, IO is inflight from bdev_start_io_acct(), and for
+ * rq-based block device, IO is inflight from blk_account_io_start().
+ *
+ * Noted, for rq-based block device, use blk_mq_count_in_driver_rw() to get the
+ * number of requests issued to driver.
+ */
+void bdev_count_inflight_rw(struct block_device *bdev, unsigned int inflight[2])
 {
 	int cpu;
 
 	inflight[0] = 0;
 	inflight[1] = 0;
 	for_each_possible_cpu(cpu) {
-		inflight[0] += part_stat_local_read_cpu(part, in_flight[0], cpu);
-		inflight[1] += part_stat_local_read_cpu(part, in_flight[1], cpu);
+		inflight[0] += part_stat_local_read_cpu(bdev, in_flight[0], cpu);
+		inflight[1] += part_stat_local_read_cpu(bdev, in_flight[1], cpu);
 	}
-	if ((int)inflight[0] < 0)
+	if (WARN_ON_ONCE((int)inflight[0] < 0))
 		inflight[0] = 0;
-	if ((int)inflight[1] < 0)
+	if (WARN_ON_ONCE((int)inflight[1] < 0))
 		inflight[1] = 0;
 }
+EXPORT_SYMBOL_GPL(bdev_count_inflight_rw);
 
 /*
  * Can be deleted altogether. Later.
@@ -1005,7 +1003,7 @@ ssize_t part_stat_show(struct device *dev,
 	struct disk_stats stat;
 	unsigned int inflight;
 
-	inflight = part_in_flight(bdev);
+	inflight = bdev_count_inflight(bdev);
 	if (inflight) {
 		part_stat_lock();
 		update_io_ticks(bdev, jiffies, true);
@@ -1050,9 +1048,9 @@ ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
 	unsigned int inflight[2];
 
 	if (queue_is_mq(q))
-		blk_mq_in_flight_rw(q, bdev, inflight);
+		blk_mq_count_in_driver_rw(q, bdev, inflight);
 	else
-		part_in_flight_rw(bdev, inflight);
+		bdev_count_inflight_rw(bdev, inflight);
 
 	return sysfs_emit(buf, "%8u %8u\n", inflight[0], inflight[1]);
 }
@@ -1307,7 +1305,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 		if (bdev_is_partition(hd) && !bdev_nr_sectors(hd))
 			continue;
 
-		inflight = part_in_flight(hd);
+		inflight = bdev_count_inflight(hd);
 		if (inflight) {
 			part_stat_lock();
 			update_io_ticks(hd, jiffies, true);
diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h
index c5e9cac0575e..6248e59dc891 100644
--- a/include/linux/part_stat.h
+++ b/include/linux/part_stat.h
@@ -79,4 +79,14 @@ static inline void part_stat_set_all(struct block_device *part, int value)
 #define part_stat_local_read_cpu(part, field, cpu)			\
 	local_read(&(part_stat_get_cpu(part, field, cpu)))
 
+void bdev_count_inflight_rw(struct block_device *bdev, unsigned int inflight[2]);
+
+static inline unsigned int bdev_count_inflight(struct block_device *bdev)
+{
+	unsigned int inflight[2];
+
+	bdev_count_inflight_rw(bdev, inflight);
+
+	return inflight[0] + inflight[1];
+}
 #endif /* _LINUX_PART_STAT_H */
-- 
2.39.2
Re: [PATCH v2 1/5] block: cleanup and export bdev IO inflight APIs
Posted by Christoph Hellwig 7 months, 3 weeks ago
On Fri, Apr 18, 2025 at 09:09:37AM +0800, Yu Kuai wrote:
> - remove unused blk_mq_in_flight

That should probably be a separate patch.

> - rename blk_mq_in_flight_rw to blk_mq_count_in_driver_rw, to distinguish
>   from bdev_count_inflight_rw.

I'm not sure why this is needed or related, or even what additional
distinction is added here.

> -
> -void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
> -		unsigned int inflight[2])
> +void blk_mq_count_in_driver_rw(struct request_queue *q,
> +			       struct block_device *part,
> +			       unsigned int inflight[2])

Any reason to move away from two tab indents for the prototype
continuations in various places in this patch?

> + * Noted, for rq-based block device, use blk_mq_count_in_driver_rw() to get the
> + * number of requests issued to driver.

I'd just change this helper to call blk_mq_count_in_driver_rw for
blk-mq devices and remove the conditional from the sysfs code instead.
That gives us a much more robust and easier to understand API.

> +void bdev_count_inflight_rw(struct block_device *bdev, unsigned int inflight[2]);

Overly long line.

> +static inline unsigned int bdev_count_inflight(struct block_device *bdev)
> +{
> +	unsigned int inflight[2];
> +
> +	bdev_count_inflight_rw(bdev, inflight);
> +
> +	return inflight[0] + inflight[1];
> +}
>  #endif /* _LINUX_PART_STAT_H */

Maybe keep this inside of block as it should not not be used by
drivers?  Also the reimplementation should probably be a separate
patch from the public API change and exporting.
Re: [PATCH v2 1/5] block: cleanup and export bdev IO inflight APIs
Posted by Yu Kuai 7 months, 3 weeks ago
Hi,

在 2025/04/21 19:59, Christoph Hellwig 写道:
> On Fri, Apr 18, 2025 at 09:09:37AM +0800, Yu Kuai wrote:
>> - remove unused blk_mq_in_flight
> 
> That should probably be a separate patch.
ok

> 
>> - rename blk_mq_in_flight_rw to blk_mq_count_in_driver_rw, to distinguish
>>    from bdev_count_inflight_rw.
> 
> I'm not sure why this is needed or related, or even what additional
> distinction is added here.

Because for rq-based device, there are two different stage,
blk_account_io_start() while allocating new rq, and
blk_mq_start_request() while issuing the rq to driver.

When will we think the reqeust is inflight? For iostat, my anser is the
former one, because rq->start_time_ns is set here as well. And noted in
iostats api diskstats_show(/proc/diskstats) and part_stat_show
(/sys/block/sda/stat), inflight is get by part_in_flight, which is
different from disk sysfs api(/sys/block/sda/inflight).

> 
>> -
>> -void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
>> -		unsigned int inflight[2])
>> +void blk_mq_count_in_driver_rw(struct request_queue *q,
>> +			       struct block_device *part,
>> +			       unsigned int inflight[2])
> 
> Any reason to move away from two tab indents for the prototype
> continuations in various places in this patch?
> 
>> + * Noted, for rq-based block device, use blk_mq_count_in_driver_rw() to get the
>> + * number of requests issued to driver.
> 
> I'd just change this helper to call blk_mq_count_in_driver_rw for
> blk-mq devices and remove the conditional from the sysfs code instead.
> That gives us a much more robust and easier to understand API.

Ok, and another separate patch, right?
> 
>> +void bdev_count_inflight_rw(struct block_device *bdev, unsigned int inflight[2]);
> 
> Overly long line.
> 
>> +static inline unsigned int bdev_count_inflight(struct block_device *bdev)
>> +{
>> +	unsigned int inflight[2];
>> +
>> +	bdev_count_inflight_rw(bdev, inflight);
>> +
>> +	return inflight[0] + inflight[1];
>> +}
>>   #endif /* _LINUX_PART_STAT_H */
> 
> Maybe keep this inside of block as it should not not be used by
> drivers?  Also the reimplementation should probably be a separate
> patch from the public API change and exporting.

ok, and I should probably send the first set just related to this patch
first.

Thanks,
Kuai

> 
> .
> 

Re: [PATCH v2 1/5] block: cleanup and export bdev IO inflight APIs
Posted by Christoph Hellwig 7 months, 3 weeks ago
On Mon, Apr 21, 2025 at 09:13:57PM +0800, Yu Kuai wrote:
> > I'm not sure why this is needed or related, or even what additional
> > distinction is added here.
> 
> Because for rq-based device, there are two different stage,
> blk_account_io_start() while allocating new rq, and
> blk_mq_start_request() while issuing the rq to driver.
> 
> When will we think the reqeust is inflight? For iostat, my anser is the
> former one, because rq->start_time_ns is set here as well. And noted in
> iostats api diskstats_show(/proc/diskstats) and part_stat_show
> (/sys/block/sda/stat), inflight is get by part_in_flight, which is
> different from disk sysfs api(/sys/block/sda/inflight).

Trying to express this in a not very obvious function name isn't
going to work very well.  Documenting your findings in comments is
much better.

> > 
> > I'd just change this helper to call blk_mq_count_in_driver_rw for
> > blk-mq devices and remove the conditional from the sysfs code instead.
> > That gives us a much more robust and easier to understand API.
> 
> Ok, and another separate patch, right?

Yes.