Some policies like iocost and iolatency perform percpu allocation in
pd_alloc_fn(). Percpu allocation with queue frozen can cause deadlock
because percpu memory reclaim may issue IO.

Now that q->blkg_list is protected by blkcg_mutex, restructure
blkcg_activate_policy() to allocate all pds before freezing the queue:

1. Allocate all pds with GFP_KERNEL before freezing the queue
2. Freeze the queue
3. Initialize and online all pds

Note: Future work is to remove all queue freezing before
blkcg_activate_policy() to fix the deadlocks thoroughly.
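In outline, the restructured blkcg_activate_policy() ends up as follows
(condensed sketch, error handling and pd setup details omitted; the diff
below is authoritative):

  mutex_lock(&q->blkcg_mutex);

  /* 1) allocate all pds with GFP_KERNEL, queue not frozen yet */
  list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
          if (hlist_unhashed(&blkg->blkcg_node))
                  continue;               /* skip dying blkg */
          blkg->pd[pol->plid] = pol->pd_alloc_fn(disk, blkg->blkcg,
                                                 GFP_KERNEL);
  }

  /* 2) freeze the queue */
  memflags = blk_mq_freeze_queue(q);

  /* 3) initialize and online all pds */
  spin_lock_irq(&q->queue_lock);
  list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
          /* pd_init_fn()/pd_online_fn() under blkg->blkcg->lock */
  }
  __set_bit(pol->plid, q->blkcg_pols);
  spin_unlock_irq(&q->queue_lock);

  blk_mq_unfreeze_queue(q, memflags);
  mutex_unlock(&q->blkcg_mutex);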
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
block/blk-cgroup.c | 90 +++++++++++++++++-----------------------------
1 file changed, 32 insertions(+), 58 deletions(-)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 0206050f81ea..7fcb216917d0 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1606,8 +1606,7 @@ static void blkcg_policy_teardown_pds(struct request_queue *q,
int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
{
struct request_queue *q = disk->queue;
- struct blkg_policy_data *pd_prealloc = NULL;
- struct blkcg_gq *blkg, *pinned_blkg = NULL;
+ struct blkcg_gq *blkg;
unsigned int memflags;
int ret;
@@ -1622,90 +1621,65 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
if (WARN_ON_ONCE(!pol->pd_alloc_fn || !pol->pd_free_fn))
return -EINVAL;
- if (queue_is_mq(q))
- memflags = blk_mq_freeze_queue(q);
-
+ /*
+ * Allocate all pds before freezing queue. Some policies like iocost
+ * and iolatency do percpu allocation in pd_alloc_fn(), which can
+ * deadlock with queue frozen because percpu memory reclaim may issue
+ * IO. blkcg_mutex protects q->blkg_list iteration.
+ */
mutex_lock(&q->blkcg_mutex);
-retry:
- spin_lock_irq(&q->queue_lock);
-
- /* blkg_list is pushed at the head, reverse walk to initialize parents first */
list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
struct blkg_policy_data *pd;
- if (blkg->pd[pol->plid])
- continue;
+ /* Skip dying blkg */
if (hlist_unhashed(&blkg->blkcg_node))
continue;
- /* If prealloc matches, use it; otherwise try GFP_NOWAIT */
- if (blkg == pinned_blkg) {
- pd = pd_prealloc;
- pd_prealloc = NULL;
- } else {
- pd = pol->pd_alloc_fn(disk, blkg->blkcg,
- GFP_NOWAIT);
- }
-
+ pd = pol->pd_alloc_fn(disk, blkg->blkcg, GFP_KERNEL);
if (!pd) {
- /*
- * GFP_NOWAIT failed. Free the existing one and
- * prealloc for @blkg w/ GFP_KERNEL.
- */
- if (pinned_blkg)
- blkg_put(pinned_blkg);
- blkg_get(blkg);
- pinned_blkg = blkg;
-
- spin_unlock_irq(&q->queue_lock);
-
- if (pd_prealloc)
- pol->pd_free_fn(pd_prealloc);
- pd_prealloc = pol->pd_alloc_fn(disk, blkg->blkcg,
- GFP_KERNEL);
- if (pd_prealloc)
- goto retry;
- else
- goto enomem;
+ ret = -ENOMEM;
+ goto err_teardown;
}
- spin_lock(&blkg->blkcg->lock);
-
pd->blkg = blkg;
pd->plid = pol->plid;
+ pd->online = false;
blkg->pd[pol->plid] = pd;
+ }
+ /* Now freeze queue and initialize/online all pds */
+ if (queue_is_mq(q))
+ memflags = blk_mq_freeze_queue(q);
+
+ spin_lock_irq(&q->queue_lock);
+ list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
+ struct blkg_policy_data *pd = blkg->pd[pol->plid];
+
+ /* Skip dying blkg */
+ if (hlist_unhashed(&blkg->blkcg_node))
+ continue;
+
+ spin_lock(&blkg->blkcg->lock);
if (pol->pd_init_fn)
pol->pd_init_fn(pd);
-
if (pol->pd_online_fn)
pol->pd_online_fn(pd);
pd->online = true;
-
spin_unlock(&blkg->blkcg->lock);
}
__set_bit(pol->plid, q->blkcg_pols);
- ret = 0;
-
spin_unlock_irq(&q->queue_lock);
-out:
- mutex_unlock(&q->blkcg_mutex);
+
if (queue_is_mq(q))
blk_mq_unfreeze_queue(q, memflags);
- if (pinned_blkg)
- blkg_put(pinned_blkg);
- if (pd_prealloc)
- pol->pd_free_fn(pd_prealloc);
- return ret;
+ mutex_unlock(&q->blkcg_mutex);
+ return 0;
-enomem:
- /* alloc failed, take down everything */
- spin_lock_irq(&q->queue_lock);
+err_teardown:
blkcg_policy_teardown_pds(q, pol);
- spin_unlock_irq(&q->queue_lock);
- ret = -ENOMEM;
- goto out;
+ mutex_unlock(&q->blkcg_mutex);
+ return ret;
}
EXPORT_SYMBOL_GPL(blkcg_activate_policy);
--
2.51.0
On Tue, Feb 03, 2026 at 04:06:01PM +0800, Yu Kuai <yukuai@fnnas.com> wrote:
> Some policies like iocost and iolatency perform percpu allocation in
> pd_alloc_fn(). Percpu allocation with queue frozen can cause deadlock
> because percpu memory reclaim may issue IO.
>
> Now that q->blkg_list is protected by blkcg_mutex,
With this ^^^
...
> restructure
> blkcg_activate_policy() to allocate all pds before freezing the queue:
> 1. Allocate all pds with GFP_KERNEL before freezing the queue
> 2. Freeze the queue
> 3. Initialize and online all pds
>
> Note: Future work is to remove all queue freezing before
> blkcg_activate_policy() to fix the deadlocks thoroughly.
>
> Signed-off-by: Yu Kuai <yukuai@fnnas.com>
> ---
> block/blk-cgroup.c | 90 +++++++++++++++++-----------------------------
> 1 file changed, 32 insertions(+), 58 deletions(-)
>
> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> index 0206050f81ea..7fcb216917d0 100644
> --- a/block/blk-cgroup.c
> +++ b/block/blk-cgroup.c
> @@ -1606,8 +1606,7 @@ static void blkcg_policy_teardown_pds(struct request_queue *q,
> int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
> {
> struct request_queue *q = disk->queue;
> - struct blkg_policy_data *pd_prealloc = NULL;
> - struct blkcg_gq *blkg, *pinned_blkg = NULL;
> + struct blkcg_gq *blkg;
> unsigned int memflags;
> int ret;
>
> @@ -1622,90 +1621,65 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
...
> + /* Now freeze queue and initialize/online all pds */
> + if (queue_is_mq(q))
> + memflags = blk_mq_freeze_queue(q);
> +
> + spin_lock_irq(&q->queue_lock);
> + list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
> + struct blkg_policy_data *pd = blkg->pd[pol->plid];
> +
> + /* Skip dying blkg */
> + if (hlist_unhashed(&blkg->blkcg_node))
> + continue;
> +
> + spin_lock(&blkg->blkcg->lock);
> if (pol->pd_init_fn)
> pol->pd_init_fn(pd);
> -
> if (pol->pd_online_fn)
> pol->pd_online_fn(pd);
> pd->online = true;
> -
> spin_unlock(&blkg->blkcg->lock);
> }
>
> __set_bit(pol->plid, q->blkcg_pols);
> - ret = 0;
> -
> spin_unlock_irq(&q->queue_lock);
> -out:
> - mutex_unlock(&q->blkcg_mutex);
> +
> if (queue_is_mq(q))
> blk_mq_unfreeze_queue(q, memflags);
> - if (pinned_blkg)
> - blkg_put(pinned_blkg);
> - if (pd_prealloc)
> - pol->pd_free_fn(pd_prealloc);
> - return ret;
> + mutex_unlock(&q->blkcg_mutex);
> + return 0;
Why is q->queue_lock still needed here?
Thanks,
Michal
Hi,
On 2026/2/3 17:06, Michal Koutný wrote:
> On Tue, Feb 03, 2026 at 04:06:01PM +0800, Yu Kuai <yukuai@fnnas.com> wrote:
>> Some policies like iocost and iolatency perform percpu allocation in
>> pd_alloc_fn(). Percpu allocation with queue frozen can cause deadlock
>> because percpu memory reclaim may issue IO.
>>
>> Now that q->blkg_list is protected by blkcg_mutex,
> With this ^^^
>
> ...
>> restructure
>> blkcg_activate_policy() to allocate all pds before freezing the queue:
>> 1. Allocate all pds with GFP_KERNEL before freezing the queue
>> 2. Freeze the queue
>> 3. Initialize and online all pds
>>
>> Note: Future work is to remove all queue freezing before
>> blkcg_activate_policy() to fix the deadlocks thoroughly.
>>
>> Signed-off-by: Yu Kuai <yukuai@fnnas.com>
>> ---
>> block/blk-cgroup.c | 90 +++++++++++++++++-----------------------------
>> 1 file changed, 32 insertions(+), 58 deletions(-)
>>
>> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
>> index 0206050f81ea..7fcb216917d0 100644
>> --- a/block/blk-cgroup.c
>> +++ b/block/blk-cgroup.c
>> @@ -1606,8 +1606,7 @@ static void blkcg_policy_teardown_pds(struct request_queue *q,
>> int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
>> {
>> struct request_queue *q = disk->queue;
>> - struct blkg_policy_data *pd_prealloc = NULL;
>> - struct blkcg_gq *blkg, *pinned_blkg = NULL;
>> + struct blkcg_gq *blkg;
>> unsigned int memflags;
>> int ret;
>>
>> @@ -1622,90 +1621,65 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
> ...
>
>> + /* Now freeze queue and initialize/online all pds */
>> + if (queue_is_mq(q))
>> + memflags = blk_mq_freeze_queue(q);
>> +
>> + spin_lock_irq(&q->queue_lock);
>> + list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
>> + struct blkg_policy_data *pd = blkg->pd[pol->plid];
>> +
>> + /* Skip dying blkg */
>> + if (hlist_unhashed(&blkg->blkcg_node))
>> + continue;
>> +
>> + spin_lock(&blkg->blkcg->lock);
>> if (pol->pd_init_fn)
>> pol->pd_init_fn(pd);
>> -
>> if (pol->pd_online_fn)
>> pol->pd_online_fn(pd);
>> pd->online = true;
>> -
>> spin_unlock(&blkg->blkcg->lock);
>> }
>>
>> __set_bit(pol->plid, q->blkcg_pols);
>> - ret = 0;
>> -
>> spin_unlock_irq(&q->queue_lock);
>> -out:
>> - mutex_unlock(&q->blkcg_mutex);
>> +
>> if (queue_is_mq(q))
>> blk_mq_unfreeze_queue(q, memflags);
>> - if (pinned_blkg)
>> - blkg_put(pinned_blkg);
>> - if (pd_prealloc)
>> - pol->pd_free_fn(pd_prealloc);
>> - return ret;
>> + mutex_unlock(&q->blkcg_mutex);
>> + return 0;
> Why is q->queue_lock still needed here?
I do want to remove queue_lock for accessing blkgs. However, this set only
protects q->blkg_list with blkcg_mutex; I'll remove queue_lock after
everything else is converted to blkcg_mutex.
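For example, once the remaining readers are converted, the init/online loop
in this patch could drop queue_lock and rely on blkcg_mutex alone, roughly
(rough sketch of a later step, not part of this series):

  mutex_lock(&q->blkcg_mutex);
  memflags = blk_mq_freeze_queue(q);

  list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
          struct blkg_policy_data *pd = blkg->pd[pol->plid];

          if (hlist_unhashed(&blkg->blkcg_node))
                  continue;

          spin_lock_irq(&blkg->blkcg->lock);
          if (pol->pd_init_fn)
                  pol->pd_init_fn(pd);
          if (pol->pd_online_fn)
                  pol->pd_online_fn(pd);
          pd->online = true;
          spin_unlock_irq(&blkg->blkcg->lock);
  }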
>
> Thanks,
> Michal
--
Thanks,
Kuai