Some policies like iocost and iolatency perform percpu allocation in
pd_alloc_fn(). Percpu allocation with queue frozen can cause deadlock
because percpu memory reclaim may issue IO.

Now that q->blkg_list is protected by blkcg_mutex, restructure
blkcg_activate_policy() to allocate all pds before freezing the queue:

1. Allocate all pds with GFP_KERNEL before freezing the queue
2. Freeze the queue
3. Initialize and online all pds

Note: Future work is to remove all queue freezing before
blkcg_activate_policy() to fix the deadlocks thoroughly.
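In outline, the restructured blkcg_activate_policy() ends up as follows
(condensed sketch, error handling and pd setup details omitted; the diff
below is authoritative):

  mutex_lock(&q->blkcg_mutex);

  /* 1) allocate all pds with GFP_KERNEL, queue not frozen yet */
  list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
          if (hlist_unhashed(&blkg->blkcg_node))
                  continue;               /* skip dying blkg */
          blkg->pd[pol->plid] = pol->pd_alloc_fn(disk, blkg->blkcg,
                                                 GFP_KERNEL);
  }

  /* 2) freeze the queue */
  memflags = blk_mq_freeze_queue(q);

  /* 3) initialize and online all pds */
  spin_lock_irq(&q->queue_lock);
  list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
          /* pd_init_fn()/pd_online_fn() under blkg->blkcg->lock */
  }
  __set_bit(pol->plid, q->blkcg_pols);
  spin_unlock_irq(&q->queue_lock);

  blk_mq_unfreeze_queue(q, memflags);
  mutex_unlock(&q->blkcg_mutex);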
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
block/blk-cgroup.c | 90 +++++++++++++++++-----------------------------
1 file changed, 32 insertions(+), 58 deletions(-)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 0206050f81ea..7fcb216917d0 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1606,8 +1606,7 @@ static void blkcg_policy_teardown_pds(struct request_queue *q,
int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
{
struct request_queue *q = disk->queue;
- struct blkg_policy_data *pd_prealloc = NULL;
- struct blkcg_gq *blkg, *pinned_blkg = NULL;
+ struct blkcg_gq *blkg;
unsigned int memflags;
int ret;
@@ -1622,90 +1621,65 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
if (WARN_ON_ONCE(!pol->pd_alloc_fn || !pol->pd_free_fn))
return -EINVAL;
- if (queue_is_mq(q))
- memflags = blk_mq_freeze_queue(q);
-
+ /*
+ * Allocate all pds before freezing queue. Some policies like iocost
+ * and iolatency do percpu allocation in pd_alloc_fn(), which can
+ * deadlock with queue frozen because percpu memory reclaim may issue
+ * IO. blkcg_mutex protects q->blkg_list iteration.
+ */
mutex_lock(&q->blkcg_mutex);
-retry:
- spin_lock_irq(&q->queue_lock);
-
- /* blkg_list is pushed at the head, reverse walk to initialize parents first */
list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
struct blkg_policy_data *pd;
- if (blkg->pd[pol->plid])
- continue;
+ /* Skip dying blkg */
if (hlist_unhashed(&blkg->blkcg_node))
continue;
- /* If prealloc matches, use it; otherwise try GFP_NOWAIT */
- if (blkg == pinned_blkg) {
- pd = pd_prealloc;
- pd_prealloc = NULL;
- } else {
- pd = pol->pd_alloc_fn(disk, blkg->blkcg,
- GFP_NOWAIT);
- }
-
+ pd = pol->pd_alloc_fn(disk, blkg->blkcg, GFP_KERNEL);
if (!pd) {
- /*
- * GFP_NOWAIT failed. Free the existing one and
- * prealloc for @blkg w/ GFP_KERNEL.
- */
- if (pinned_blkg)
- blkg_put(pinned_blkg);
- blkg_get(blkg);
- pinned_blkg = blkg;
-
- spin_unlock_irq(&q->queue_lock);
-
- if (pd_prealloc)
- pol->pd_free_fn(pd_prealloc);
- pd_prealloc = pol->pd_alloc_fn(disk, blkg->blkcg,
- GFP_KERNEL);
- if (pd_prealloc)
- goto retry;
- else
- goto enomem;
+ ret = -ENOMEM;
+ goto err_teardown;
}
- spin_lock(&blkg->blkcg->lock);
-
pd->blkg = blkg;
pd->plid = pol->plid;
+ pd->online = false;
blkg->pd[pol->plid] = pd;
+ }
+ /* Now freeze queue and initialize/online all pds */
+ if (queue_is_mq(q))
+ memflags = blk_mq_freeze_queue(q);
+
+ spin_lock_irq(&q->queue_lock);
+ list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
+ struct blkg_policy_data *pd = blkg->pd[pol->plid];
+
+ /* Skip dying blkg */
+ if (hlist_unhashed(&blkg->blkcg_node))
+ continue;
+
+ spin_lock(&blkg->blkcg->lock);
if (pol->pd_init_fn)
pol->pd_init_fn(pd);
-
if (pol->pd_online_fn)
pol->pd_online_fn(pd);
pd->online = true;
-
spin_unlock(&blkg->blkcg->lock);
}
__set_bit(pol->plid, q->blkcg_pols);
- ret = 0;
-
spin_unlock_irq(&q->queue_lock);
-out:
- mutex_unlock(&q->blkcg_mutex);
+
if (queue_is_mq(q))
blk_mq_unfreeze_queue(q, memflags);
- if (pinned_blkg)
- blkg_put(pinned_blkg);
- if (pd_prealloc)
- pol->pd_free_fn(pd_prealloc);
- return ret;
+ mutex_unlock(&q->blkcg_mutex);
+ return 0;
-enomem:
- /* alloc failed, take down everything */
- spin_lock_irq(&q->queue_lock);
+err_teardown:
blkcg_policy_teardown_pds(q, pol);
- spin_unlock_irq(&q->queue_lock);
- ret = -ENOMEM;
- goto out;
+ mutex_unlock(&q->blkcg_mutex);
+ return ret;
}
EXPORT_SYMBOL_GPL(blkcg_activate_policy);
--
2.51.0
On Tue, Feb 03, 2026 at 04:06:01PM +0800, Yu Kuai <yukuai@fnnas.com> wrote:
> Some policies like iocost and iolatency perform percpu allocation in
> pd_alloc_fn(). Percpu allocation with queue frozen can cause deadlock
> because percpu memory reclaim may issue IO.
>
> Now that q->blkg_list is protected by blkcg_mutex,
With this ^^^
...
> restructure
> blkcg_activate_policy() to allocate all pds before freezing the queue:
> 1. Allocate all pds with GFP_KERNEL before freezing the queue
> 2. Freeze the queue
> 3. Initialize and online all pds
>
> Note: Future work is to remove all queue freezing before
> blkcg_activate_policy() to fix the deadlocks thoroughly.
>
> Signed-off-by: Yu Kuai <yukuai@fnnas.com>
> ---
> block/blk-cgroup.c | 90 +++++++++++++++++-----------------------------
> 1 file changed, 32 insertions(+), 58 deletions(-)
>
> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> index 0206050f81ea..7fcb216917d0 100644
> --- a/block/blk-cgroup.c
> +++ b/block/blk-cgroup.c
> @@ -1606,8 +1606,7 @@ static void blkcg_policy_teardown_pds(struct request_queue *q,
> int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
> {
> struct request_queue *q = disk->queue;
> - struct blkg_policy_data *pd_prealloc = NULL;
> - struct blkcg_gq *blkg, *pinned_blkg = NULL;
> + struct blkcg_gq *blkg;
> unsigned int memflags;
> int ret;
>
> @@ -1622,90 +1621,65 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
...
> + /* Now freeze queue and initialize/online all pds */
> + if (queue_is_mq(q))
> + memflags = blk_mq_freeze_queue(q);
> +
> + spin_lock_irq(&q->queue_lock);
> + list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
> + struct blkg_policy_data *pd = blkg->pd[pol->plid];
> +
> + /* Skip dying blkg */
> + if (hlist_unhashed(&blkg->blkcg_node))
> + continue;
> +
> + spin_lock(&blkg->blkcg->lock);
> if (pol->pd_init_fn)
> pol->pd_init_fn(pd);
> -
> if (pol->pd_online_fn)
> pol->pd_online_fn(pd);
> pd->online = true;
> -
> spin_unlock(&blkg->blkcg->lock);
> }
>
> __set_bit(pol->plid, q->blkcg_pols);
> - ret = 0;
> -
> spin_unlock_irq(&q->queue_lock);
> -out:
> - mutex_unlock(&q->blkcg_mutex);
> +
> if (queue_is_mq(q))
> blk_mq_unfreeze_queue(q, memflags);
> - if (pinned_blkg)
> - blkg_put(pinned_blkg);
> - if (pd_prealloc)
> - pol->pd_free_fn(pd_prealloc);
> - return ret;
> + mutex_unlock(&q->blkcg_mutex);
> + return 0;
Why is q->queue_lock still needed here?
Thanks,
Michal
Hi,
On 2026/2/3 17:06, Michal Koutný wrote:
> On Tue, Feb 03, 2026 at 04:06:01PM +0800, Yu Kuai <yukuai@fnnas.com> wrote:
>> Some policies like iocost and iolatency perform percpu allocation in
>> pd_alloc_fn(). Percpu allocation with queue frozen can cause deadlock
>> because percpu memory reclaim may issue IO.
>>
>> Now that q->blkg_list is protected by blkcg_mutex,
> With this ^^^
>
> ...
>> restructure
>> blkcg_activate_policy() to allocate all pds before freezing the queue:
>> 1. Allocate all pds with GFP_KERNEL before freezing the queue
>> 2. Freeze the queue
>> 3. Initialize and online all pds
>>
>> Note: Future work is to remove all queue freezing before
>> blkcg_activate_policy() to fix the deadlocks thoroughly.
>>
>> Signed-off-by: Yu Kuai <yukuai@fnnas.com>
>> ---
>> block/blk-cgroup.c | 90 +++++++++++++++++-----------------------------
>> 1 file changed, 32 insertions(+), 58 deletions(-)
>>
>> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
>> index 0206050f81ea..7fcb216917d0 100644
>> --- a/block/blk-cgroup.c
>> +++ b/block/blk-cgroup.c
>> @@ -1606,8 +1606,7 @@ static void blkcg_policy_teardown_pds(struct request_queue *q,
>> int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
>> {
>> struct request_queue *q = disk->queue;
>> - struct blkg_policy_data *pd_prealloc = NULL;
>> - struct blkcg_gq *blkg, *pinned_blkg = NULL;
>> + struct blkcg_gq *blkg;
>> unsigned int memflags;
>> int ret;
>>
>> @@ -1622,90 +1621,65 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
> ...
>
>> + /* Now freeze queue and initialize/online all pds */
>> + if (queue_is_mq(q))
>> + memflags = blk_mq_freeze_queue(q);
>> +
>> + spin_lock_irq(&q->queue_lock);
>> + list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
>> + struct blkg_policy_data *pd = blkg->pd[pol->plid];
>> +
>> + /* Skip dying blkg */
>> + if (hlist_unhashed(&blkg->blkcg_node))
>> + continue;
>> +
>> + spin_lock(&blkg->blkcg->lock);
>> if (pol->pd_init_fn)
>> pol->pd_init_fn(pd);
>> -
>> if (pol->pd_online_fn)
>> pol->pd_online_fn(pd);
>> pd->online = true;
>> -
>> spin_unlock(&blkg->blkcg->lock);
>> }
>>
>> __set_bit(pol->plid, q->blkcg_pols);
>> - ret = 0;
>> -
>> spin_unlock_irq(&q->queue_lock);
>> -out:
>> - mutex_unlock(&q->blkcg_mutex);
>> +
>> if (queue_is_mq(q))
>> blk_mq_unfreeze_queue(q, memflags);
>> - if (pinned_blkg)
>> - blkg_put(pinned_blkg);
>> - if (pd_prealloc)
>> - pol->pd_free_fn(pd_prealloc);
>> - return ret;
>> + mutex_unlock(&q->blkcg_mutex);
>> + return 0;
> Why is q->queue_lock still needed here?
I do want to remove queue_lock for accessing blkgs. However, this set only
protects q->blkg_list with blkcg_mutex; I'll remove queue_lock after
everything else is converted to blkcg_mutex.
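For example, once the remaining readers are converted, the init/online loop
in this patch could drop queue_lock and rely on blkcg_mutex alone, roughly
(rough sketch of a later step, not part of this series):

  mutex_lock(&q->blkcg_mutex);
  memflags = blk_mq_freeze_queue(q);

  list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
          struct blkg_policy_data *pd = blkg->pd[pol->plid];

          if (hlist_unhashed(&blkg->blkcg_node))
                  continue;

          spin_lock_irq(&blkg->blkcg->lock);
          if (pol->pd_init_fn)
                  pol->pd_init_fn(pd);
          if (pol->pd_online_fn)
                  pol->pd_online_fn(pd);
          pd->online = true;
          spin_unlock_irq(&blkg->blkcg->lock);
  }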
>
> Thanks,
> Michal
--
Thanks,
Kuai