[PATCH v3] block: plug attempts to batch allocate tags multiple times

Posted by Xue He 2 days, 12 hours ago

In the existing plug mechanism, tags are allocated in batches based on
the number of requests. However, testing has shown that the plug only
attempts batch allocation of tags once, at the beginning of a batch of
I/O operations. Since the tag_mask does not always have enough available
tags to satisfy the requested number, a full batch allocation is not
guaranteed to succeed each time. The remaining tags are then allocated
individually (which happens frequently), leading to multiple single-tag
allocation overheads.

This patch retries batch allocation of tags when the initial batch
allocation fails to reach the requested number, thereby reducing the
overhead of repeated single-tag allocations.

--------------------------------------------------------------------
perf:
base code: __blk_mq_alloc_requests() 1.35%
patch:     __blk_mq_alloc_requests() 0.73%
--------------------------------------------------------------------

---
changes since v1:
- Fold the repeated batch allocation attempts into a single loop that
  retries until the requested batch quantity is reached

changes since v2:
- Move the call location of the remainder reference handling
- Rework when the sbitmap deferred clearing is performed

Signed-off-by: hexue <xue01.he@samsung.com>
---
 block/blk-mq.c | 44 +++++++++++++++++++++++++-------------------
 lib/sbitmap.c  | 44 ++++++++++++++++++++++++++------------------
 2 files changed, 51 insertions(+), 37 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index ba3a4b77f578..bf9d288e3411 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -456,28 +456,34 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data)
 	struct blk_mq_tags *tags;
 	struct request *rq;
 	unsigned long tag_mask;
-	int i, nr = 0;
+	int nr_tags = data->nr_tags;
 
-	tag_mask = blk_mq_get_tags(data, data->nr_tags, &tag_offset);
-	if (unlikely(!tag_mask))
-		return NULL;
+	do {
+		int i, nr = 0;
+
+		tag_mask = blk_mq_get_tags(data, data->nr_tags, &tag_offset);
+		if (unlikely(!tag_mask))
+			return NULL;
+
+		tags = blk_mq_tags_from_data(data);
+		for (i = 0; tag_mask; i++) {
+			if (!(tag_mask & (1UL << i)))
+				continue;
+			tag = tag_offset + i;
+			prefetch(tags->static_rqs[tag]);
+			tag_mask &= ~(1UL << i);
+			rq = blk_mq_rq_ctx_init(data, tags, tag);
+			rq_list_add_head(data->cached_rqs, rq);
+			nr++;
+		}
+		if (!(data->rq_flags & RQF_SCHED_TAGS))
+			blk_mq_add_active_requests(data->hctx, nr);
+
+		data->nr_tags -= nr;
+	} while (data->nr_tags);
 
-	tags = blk_mq_tags_from_data(data);
-	for (i = 0; tag_mask; i++) {
-		if (!(tag_mask & (1UL << i)))
-			continue;
-		tag = tag_offset + i;
-		prefetch(tags->static_rqs[tag]);
-		tag_mask &= ~(1UL << i);
-		rq = blk_mq_rq_ctx_init(data, tags, tag);
-		rq_list_add_head(data->cached_rqs, rq);
-		nr++;
-	}
-	if (!(data->rq_flags & RQF_SCHED_TAGS))
-		blk_mq_add_active_requests(data->hctx, nr);
 	/* caller already holds a reference, add for remainder */
-	percpu_ref_get_many(&data->q->q_usage_counter, nr - 1);
-	data->nr_tags -= nr;
+	percpu_ref_get_many(&data->q->q_usage_counter, nr_tags - 1);
 
 	return rq_list_pop(data->cached_rqs);
 }
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 4d188d05db15..457d18650950 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -534,26 +534,34 @@ unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags,
 		unsigned int map_depth = __map_depth(sb, index);
 		unsigned long val;
 
-		sbitmap_deferred_clear(map, 0, 0, 0);
 		val = READ_ONCE(map->word);
-		if (val == (1UL << (map_depth - 1)) - 1)
-			goto next;
-
-		nr = find_first_zero_bit(&val, map_depth);
-		if (nr + nr_tags <= map_depth) {
-			atomic_long_t *ptr = (atomic_long_t *) &map->word;
-
-			get_mask = ((1UL << nr_tags) - 1) << nr;
-			while (!atomic_long_try_cmpxchg(ptr, &val,
-							  get_mask | val))
-				;
-			get_mask = (get_mask & ~val) >> nr;
-			if (get_mask) {
-				*offset = nr + (index << sb->shift);
-				update_alloc_hint_after_get(sb, depth, hint,
-							*offset + nr_tags - 1);
-				return get_mask;
+		while (1) {
+			if (val == (1UL << (map_depth - 1)) - 1) {
+				if (!sbitmap_deferred_clear(map, 0, 0, 0))
+					goto next;
+				val = READ_ONCE(map->word);
 			}
+			nr = find_first_zero_bit(&val, map_depth);
+			if (nr + nr_tags <= map_depth)
+				break;
+
+			if (!sbitmap_deferred_clear(map, 0, 0, 0))
+				goto next;
+
+			val = READ_ONCE(map->word);
+		}
+		atomic_long_t *ptr = (atomic_long_t *) &map->word;
+
+		get_mask = ((1UL << nr_tags) - 1) << nr;
+		while (!atomic_long_try_cmpxchg(ptr, &val,
+						  get_mask | val))
+			;
+		get_mask = (get_mask & ~val) >> nr;
+		if (get_mask) {
+			*offset = nr + (index << sb->shift);
+			update_alloc_hint_after_get(sb, depth, hint,
+						*offset + nr_tags - 1);
+			return get_mask;
 		}
 next:
 		/* Jump to next index. */
-- 
2.34.1
Re: [PATCH v3] block: plug attempts to batch allocate tags multiple times
Posted by Yu Kuai 1 day, 20 hours ago
Hi,

On 2025/09/29 17:06, Xue He wrote:
> In the existing plug mechanism, tags are allocated in batches based on
> the number of requests. However, testing has shown that the plug only
> attempts batch allocation of tags once at the beginning of a batch of
> I/O operations. Since the tag_mask does not always have enough available
> tags to satisfy the requested number, a full batch allocation is not
> guaranteed to succeed each time. The remaining tags are then allocated
> individually (occurs frequently), leading to multiple single-tag
> allocation overheads.
> 
> This patch aims to retry batch allocation of tags when the initial batch
> allocation fails to reach the requested number, thereby reducing the
> overhead of individual allocation attempts.
> 
> --------------------------------------------------------------------
> perf:
> base code: __blk_mq_alloc_requests() 1.35%
> patch:__blk_mq_alloc_requests() 0.73%
> -------------------------------------------------------------------
> 
> ---
> changes since v1:
> - Modify multiple batch registrations into a single loop to achieve
>    the batch quantity
> 
> changes since v2:
> - Modify the call location of remainder handling
> - Refactoring sbitmap cleanup time
> 
> Signed-off-by: hexue <xue01.he@samsung.com>
> ---
>   block/blk-mq.c | 44 +++++++++++++++++++++++++-------------------
>   lib/sbitmap.c  | 44 ++++++++++++++++++++++++++------------------
>   2 files changed, 51 insertions(+), 37 deletions(-)
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index ba3a4b77f578..bf9d288e3411 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -456,28 +456,34 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data)
>   	struct blk_mq_tags *tags;
>   	struct request *rq;
>   	unsigned long tag_mask;
> -	int i, nr = 0;
> +	int nr_tags = data->nr_tags;
>   
> -	tag_mask = blk_mq_get_tags(data, data->nr_tags, &tag_offset);
> -	if (unlikely(!tag_mask))
> -		return NULL;
> +	do {
> +		int i, nr = 0;
> +
> +		tag_mask = blk_mq_get_tags(data, data->nr_tags, &tag_offset);
> +		if (unlikely(!tag_mask))
> +			return NULL;

You should break out of the loop here and handle the tags that were
already allocated in previous iterations; otherwise they are leaked.
This leak looks quite likely to trigger if you run some tests.
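
Something along these lines, perhaps (untested sketch):

		tag_mask = blk_mq_get_tags(data, data->nr_tags, &tag_offset);
		if (unlikely(!tag_mask)) {
			/* keep the requests allocated by earlier iterations */
			if (data->nr_tags != nr_tags)
				break;
			return NULL;
		}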

> +
> +		tags = blk_mq_tags_from_data(data);
> +		for (i = 0; tag_mask; i++) {
> +			if (!(tag_mask & (1UL << i)))
> +				continue;
> +			tag = tag_offset + i;
> +			prefetch(tags->static_rqs[tag]);
> +			tag_mask &= ~(1UL << i);
> +			rq = blk_mq_rq_ctx_init(data, tags, tag);
> +			rq_list_add_head(data->cached_rqs, rq);
> +			nr++;
> +		}
> +		if (!(data->rq_flags & RQF_SCHED_TAGS))
> +			blk_mq_add_active_requests(data->hctx, nr);
> +
> +		data->nr_tags -= nr;
> +	} while (data->nr_tags);

And perhaps you should update the local variable nr_tags to the number
of tags actually allocated.
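
I.e., with the early break above, the final reference grab would become
something like (nr_tags - data->nr_tags is the count actually
allocated, untested):

	/* caller already holds a reference, add for remainder */
	percpu_ref_get_many(&data->q->q_usage_counter,
			    nr_tags - data->nr_tags - 1);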

>   
> -	tags = blk_mq_tags_from_data(data);
> -	for (i = 0; tag_mask; i++) {
> -		if (!(tag_mask & (1UL << i)))
> -			continue;
> -		tag = tag_offset + i;
> -		prefetch(tags->static_rqs[tag]);
> -		tag_mask &= ~(1UL << i);
> -		rq = blk_mq_rq_ctx_init(data, tags, tag);
> -		rq_list_add_head(data->cached_rqs, rq);
> -		nr++;
> -	}
> -	if (!(data->rq_flags & RQF_SCHED_TAGS))
> -		blk_mq_add_active_requests(data->hctx, nr);
>   	/* caller already holds a reference, add for remainder */
> -	percpu_ref_get_many(&data->q->q_usage_counter, nr - 1);
> -	data->nr_tags -= nr;
> +	percpu_ref_get_many(&data->q->q_usage_counter, nr_tags - 1);
>   
>   	return rq_list_pop(data->cached_rqs);
>   }
> diff --git a/lib/sbitmap.c b/lib/sbitmap.c
> index 4d188d05db15..457d18650950 100644
> --- a/lib/sbitmap.c
> +++ b/lib/sbitmap.c
> @@ -534,26 +534,34 @@ unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags,
>   		unsigned int map_depth = __map_depth(sb, index);
>   		unsigned long val;
>   
> -		sbitmap_deferred_clear(map, 0, 0, 0);
>   		val = READ_ONCE(map->word);
> -		if (val == (1UL << (map_depth - 1)) - 1)
> -			goto next;
> -
> -		nr = find_first_zero_bit(&val, map_depth);
> -		if (nr + nr_tags <= map_depth) {
> -			atomic_long_t *ptr = (atomic_long_t *) &map->word;
> -
> -			get_mask = ((1UL << nr_tags) - 1) << nr;
> -			while (!atomic_long_try_cmpxchg(ptr, &val,
> -							  get_mask | val))
> -				;
> -			get_mask = (get_mask & ~val) >> nr;
> -			if (get_mask) {
> -				*offset = nr + (index << sb->shift);
> -				update_alloc_hint_after_get(sb, depth, hint,
> -							*offset + nr_tags - 1);
> -				return get_mask;
> +		while (1) {
> +			if (val == (1UL << (map_depth - 1)) - 1) {
> +				if (!sbitmap_deferred_clear(map, 0, 0, 0))
> +					goto next;
> +				val = READ_ONCE(map->word);
>   			}
> +			nr = find_first_zero_bit(&val, map_depth);
> +			if (nr + nr_tags <= map_depth)
> +				break;
> +
> +			if (!sbitmap_deferred_clear(map, 0, 0, 0))
> +				goto next;
> +
> +			val = READ_ONCE(map->word);
> +		}

Can you also add a helper like sbitmap_find_bits_in_word()?
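
For example (rough sketch, untested):

/*
 * Find room for nr_tags free bits in one word, issuing deferred
 * clears as needed. Returns the starting bit on success, -1 on
 * failure; *val holds the word value to feed into the subsequent
 * cmpxchg.
 */
static int sbitmap_find_bits_in_word(struct sbitmap_word *map,
				     unsigned int map_depth, int nr_tags,
				     unsigned long *val)
{
	int nr;

	while (1) {
		*val = READ_ONCE(map->word);
		if (*val == (1UL << (map_depth - 1)) - 1) {
			if (!sbitmap_deferred_clear(map, 0, 0, 0))
				return -1;
			continue;
		}

		nr = find_first_zero_bit(val, map_depth);
		if (nr + nr_tags <= map_depth)
			return nr;

		if (!sbitmap_deferred_clear(map, 0, 0, 0))
			return -1;
	}
}

Then the loop body in __sbitmap_queue_get_batch() shrinks to one call
plus the cmpxchg.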

Thanks,
Kuai

> +		atomic_long_t *ptr = (atomic_long_t *) &map->word;
> +
> +		get_mask = ((1UL << nr_tags) - 1) << nr;
> +		while (!atomic_long_try_cmpxchg(ptr, &val,
> +						  get_mask | val))
> +			;
> +		get_mask = (get_mask & ~val) >> nr;
> +		if (get_mask) {
> +			*offset = nr + (index << sb->shift);
> +			update_alloc_hint_after_get(sb, depth, hint,
> +						*offset + nr_tags - 1);
> +			return get_mask;
>   		}
>   next:
>   		/* Jump to next index. */
>