[PATCH] block, bfq: release cgroup stats with bfq_group

Yu Kuai posted 1 patch 6 days, 23 hours ago
block/bfq-cgroup.c | 43 ++++++++++++++++++++++---------------------
1 file changed, 22 insertions(+), 21 deletions(-)
[PATCH] block, bfq: release cgroup stats with bfq_group
Posted by Yu Kuai 6 days, 23 hours ago
BFQ cgroup stats contain percpu counters embedded in struct bfq_group,
but the old free path destroys them from bfq_pd_free(), which is tied
to blkg policy-data teardown.

That is not the same lifetime as struct bfq_group. BFQ pins bfq_group
while bfq_queue entities refer to it, so bfq_pd_free() can drop the
policy-data reference while other bfq_group references still exist. The
following blkcg change also defers policy-data release through RCU and
leaves BFQ to run the final bfqg_put() from an RCU callback. For that
conversion, stats teardown must belong to the last bfq_group put, not to
policy-data teardown.

Move stats teardown to bfqg_put() so the embedded counters are destroyed
exactly when the last bfq_group reference is released, before kfree(bfqg).

Without this preparatory change, the RCU-delayed policy-data free
conversion reproduced the following KASAN report:

  BUG: KASAN: slab-use-after-free in percpu_counter_destroy_many+0xf1/0x2e0
  Write of size 8 at addr ffff88811d9409e0 by task test_blkcg/535

  CPU: 0 UID: 0 PID: 535 Comm: test_blkcg Not tainted 7.1.0-rc2-g1e14adca0199 #1 PREEMPT  ea13f83d4b74a12510d20db4a7d9a0fe8275f05c
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-5.fc42 04/01/2014
  Call Trace:
   <TASK>
   dump_stack_lvl+0x54/0x70
   print_address_description+0x77/0x200
   ? percpu_counter_destroy_many+0xf1/0x2e0
   print_report+0x64/0x70
   kasan_report+0x118/0x150
   ? percpu_counter_destroy_many+0xf1/0x2e0
   percpu_counter_destroy_many+0xf1/0x2e0
   __mmdrop+0x1d8/0x350
   finish_task_switch+0x3f5/0x570
   __schedule+0xe8e/0x18a0
   schedule+0xfe/0x1c0
   schedule_timeout+0x7f/0x1d0
   __wait_for_common+0x26c/0x3f0
   wait_for_completion_state+0x21/0x40
   call_usermodehelper_exec+0x271/0x2c0
   __request_module+0x296/0x410
   elv_iosched_store+0x1bc/0x2c0
   queue_attr_store+0x152/0x1c0
   kernfs_fop_write_iter+0x1d7/0x280
   vfs_write+0x580/0x630
   ksys_write+0xec/0x190
   do_syscall_64+0x156/0x490
   entry_SYSCALL_64_after_hwframe+0x77/0x7f

  Allocated by task 535:
   kasan_save_track+0x3e/0x80
   __kasan_kmalloc+0x72/0x90
   bfq_pd_alloc+0x60/0x100 [bfq]
   blkg_create+0x3bb/0xbe0
   blkg_lookup_create+0x3a2/0x460
   blkg_conf_start+0x24a/0x2d0
   bfq_io_set_weight+0x17f/0x430 [bfq]
   cgroup_file_write+0x1c5/0x4b0
   kernfs_fop_write_iter+0x1d7/0x280
   vfs_write+0x580/0x630
   ksys_write+0xec/0x190
   do_syscall_64+0x156/0x490
   entry_SYSCALL_64_after_hwframe+0x77/0x7f

  Freed by task 0:
   kasan_save_track+0x3e/0x80
   kasan_save_free_info+0x46/0x50
   __kasan_slab_free+0x3a/0x60
   kfree+0x14e/0x4f0
   rcu_core+0x6f3/0xcd0
   handle_softirqs+0x1a0/0x550
   __irq_exit_rcu+0x8c/0x150
   irq_exit_rcu+0xe/0x20
   sysvec_apic_timer_interrupt+0x6e/0x80
   asm_sysvec_apic_timer_interrupt+0x1a/0x20

  Last potentially related work creation:
   kasan_save_stack+0x3e/0x60
   kasan_record_aux_stack+0x99/0xb0
   call_rcu+0x55/0x5c0
   blkg_free_workfn+0x130/0x220
   process_scheduled_works+0x655/0xb60
   worker_thread+0x446/0x600
   kthread+0x1f4/0x230
   ret_from_fork+0x259/0x420
   ret_from_fork_asm+0x1a/0x30

Signed-off-by: Yu Kuai <yukuai@fygo.io>
---
 block/bfq-cgroup.c | 43 ++++++++++++++++++++++---------------------
 1 file changed, 22 insertions(+), 21 deletions(-)

diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index ac83b0668764..37ab70930c8d 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -300,6 +300,25 @@ static struct bfq_group *bfqg_parent(struct bfq_group *bfqg)
 	return pblkg ? blkg_to_bfqg(pblkg) : NULL;
 }
 
+static void bfqg_stats_exit(struct bfqg_stats *stats)
+{
+	blkg_rwstat_exit(&stats->bytes);
+	blkg_rwstat_exit(&stats->ios);
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
+	blkg_rwstat_exit(&stats->merged);
+	blkg_rwstat_exit(&stats->service_time);
+	blkg_rwstat_exit(&stats->wait_time);
+	blkg_rwstat_exit(&stats->queued);
+	bfq_stat_exit(&stats->time);
+	bfq_stat_exit(&stats->avg_queue_size_sum);
+	bfq_stat_exit(&stats->avg_queue_size_samples);
+	bfq_stat_exit(&stats->dequeue);
+	bfq_stat_exit(&stats->group_wait_time);
+	bfq_stat_exit(&stats->idle_time);
+	bfq_stat_exit(&stats->empty_time);
+#endif
+}
+
 struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
 {
 	struct bfq_entity *group_entity = bfqq->entity.parent;
@@ -321,8 +340,10 @@ static void bfqg_get(struct bfq_group *bfqg)
 
 static void bfqg_put(struct bfq_group *bfqg)
 {
-	if (refcount_dec_and_test(&bfqg->ref))
+	if (refcount_dec_and_test(&bfqg->ref)) {
+		bfqg_stats_exit(&bfqg->stats);
 		kfree(bfqg);
+	}
 }
 
 static void bfqg_and_blkg_get(struct bfq_group *bfqg)
@@ -433,25 +454,6 @@ void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg)
 	entity->sched_data = &bfqg->sched_data;
 }
 
-static void bfqg_stats_exit(struct bfqg_stats *stats)
-{
-	blkg_rwstat_exit(&stats->bytes);
-	blkg_rwstat_exit(&stats->ios);
-#ifdef CONFIG_BFQ_CGROUP_DEBUG
-	blkg_rwstat_exit(&stats->merged);
-	blkg_rwstat_exit(&stats->service_time);
-	blkg_rwstat_exit(&stats->wait_time);
-	blkg_rwstat_exit(&stats->queued);
-	bfq_stat_exit(&stats->time);
-	bfq_stat_exit(&stats->avg_queue_size_sum);
-	bfq_stat_exit(&stats->avg_queue_size_samples);
-	bfq_stat_exit(&stats->dequeue);
-	bfq_stat_exit(&stats->group_wait_time);
-	bfq_stat_exit(&stats->idle_time);
-	bfq_stat_exit(&stats->empty_time);
-#endif
-}
-
 static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
 {
 	if (blkg_rwstat_init(&stats->bytes, gfp) ||
@@ -552,7 +554,6 @@ static void bfq_pd_free(struct blkg_policy_data *pd)
 {
 	struct bfq_group *bfqg = pd_to_bfqg(pd);
 
-	bfqg_stats_exit(&bfqg->stats);
 	bfqg_put(bfqg);
 }
 
-- 
2.51.0
Re: [PATCH] block, bfq: release cgroup stats with bfq_group
Posted by Jens Axboe 6 days, 10 hours ago
On Mon, 01 Jun 2026 14:15:02 +0800, Yu Kuai wrote:
> BFQ cgroup stats contain percpu counters embedded in struct bfq_group,
> but the old free path destroys them from bfq_pd_free(), which is tied
> to blkg policy-data teardown.
> 
> That is not the same lifetime as struct bfq_group. BFQ pins bfq_group
> while bfq_queue entities refer to it, so bfq_pd_free() can drop the
> policy-data reference while other bfq_group references still exist. The
> following blkcg change also defers policy-data release through RCU and
> leaves BFQ to run the final bfqg_put() from an RCU callback. For that
> conversion, stats teardown must belong to the last bfq_group put, not to
> policy-data teardown.
> 
> [...]

Applied, thanks!

[1/1] block, bfq: release cgroup stats with bfq_group
      commit: 9e1183a465e4a912cce1e4b389d0d7394c127eb5

Best regards,
-- 
Jens Axboe
Re: [PATCH] block, bfq: release cgroup stats with bfq_group
Posted by Jan Kara 6 days, 13 hours ago
On Mon 01-06-26 14:15:02, Yu Kuai wrote:
> BFQ cgroup stats contain percpu counters embedded in struct bfq_group,
> but the old free path destroys them from bfq_pd_free(), which is tied
> to blkg policy-data teardown.
> 
> That is not the same lifetime as struct bfq_group. BFQ pins bfq_group
> while bfq_queue entities refer to it, so bfq_pd_free() can drop the
> policy-data reference while other bfq_group references still exist. The
> following blkcg change also defers policy-data release through RCU and
> leaves BFQ to run the final bfqg_put() from an RCU callback. For that
> conversion, stats teardown must belong to the last bfq_group put, not to
> policy-data teardown.
> 
> Move stats teardown to bfqg_put() so the embedded counters are destroyed
> exactly when the last bfq_group reference is released, before kfree(bfqg).
> 
> Without this preparatory change, the RCU-delayed policy-data free
> conversion reproduced the following KASAN report:
> 
>   BUG: KASAN: slab-use-after-free in percpu_counter_destroy_many+0xf1/0x2e0
>   Write of size 8 at addr ffff88811d9409e0 by task test_blkcg/535
> 
>   CPU: 0 UID: 0 PID: 535 Comm: test_blkcg Not tainted 7.1.0-rc2-g1e14adca0199 #1 PREEMPT  ea13f83d4b74a12510d20db4a7d9a0fe8275f05c
>   Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-5.fc42 04/01/2014
>   Call Trace:
>    <TASK>
>    dump_stack_lvl+0x54/0x70
>    print_address_description+0x77/0x200
>    ? percpu_counter_destroy_many+0xf1/0x2e0
>    print_report+0x64/0x70
>    kasan_report+0x118/0x150
>    ? percpu_counter_destroy_many+0xf1/0x2e0
>    percpu_counter_destroy_many+0xf1/0x2e0
>    __mmdrop+0x1d8/0x350
>    finish_task_switch+0x3f5/0x570
>    __schedule+0xe8e/0x18a0
>    schedule+0xfe/0x1c0
>    schedule_timeout+0x7f/0x1d0
>    __wait_for_common+0x26c/0x3f0
>    wait_for_completion_state+0x21/0x40
>    call_usermodehelper_exec+0x271/0x2c0
>    __request_module+0x296/0x410
>    elv_iosched_store+0x1bc/0x2c0
>    queue_attr_store+0x152/0x1c0
>    kernfs_fop_write_iter+0x1d7/0x280
>    vfs_write+0x580/0x630
>    ksys_write+0xec/0x190
>    do_syscall_64+0x156/0x490
>    entry_SYSCALL_64_after_hwframe+0x77/0x7f
> 
>   Allocated by task 535:
>    kasan_save_track+0x3e/0x80
>    __kasan_kmalloc+0x72/0x90
>    bfq_pd_alloc+0x60/0x100 [bfq]
>    blkg_create+0x3bb/0xbe0
>    blkg_lookup_create+0x3a2/0x460
>    blkg_conf_start+0x24a/0x2d0
>    bfq_io_set_weight+0x17f/0x430 [bfq]
>    cgroup_file_write+0x1c5/0x4b0
>    kernfs_fop_write_iter+0x1d7/0x280
>    vfs_write+0x580/0x630
>    ksys_write+0xec/0x190
>    do_syscall_64+0x156/0x490
>    entry_SYSCALL_64_after_hwframe+0x77/0x7f
> 
>   Freed by task 0:
>    kasan_save_track+0x3e/0x80
>    kasan_save_free_info+0x46/0x50
>    __kasan_slab_free+0x3a/0x60
>    kfree+0x14e/0x4f0
>    rcu_core+0x6f3/0xcd0
>    handle_softirqs+0x1a0/0x550
>    __irq_exit_rcu+0x8c/0x150
>    irq_exit_rcu+0xe/0x20
>    sysvec_apic_timer_interrupt+0x6e/0x80
>    asm_sysvec_apic_timer_interrupt+0x1a/0x20
> 
>   Last potentially related work creation:
>    kasan_save_stack+0x3e/0x60
>    kasan_record_aux_stack+0x99/0xb0
>    call_rcu+0x55/0x5c0
>    blkg_free_workfn+0x130/0x220
>    process_scheduled_works+0x655/0xb60
>    worker_thread+0x446/0x600
>    kthread+0x1f4/0x230
>    ret_from_fork+0x259/0x420
>    ret_from_fork_asm+0x1a/0x30
> 
> Signed-off-by: Yu Kuai <yukuai@fygo.io>

Makes sense. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  block/bfq-cgroup.c | 43 ++++++++++++++++++++++---------------------
>  1 file changed, 22 insertions(+), 21 deletions(-)
> 
> diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
> index ac83b0668764..37ab70930c8d 100644
> --- a/block/bfq-cgroup.c
> +++ b/block/bfq-cgroup.c
> @@ -300,6 +300,25 @@ static struct bfq_group *bfqg_parent(struct bfq_group *bfqg)
>  	return pblkg ? blkg_to_bfqg(pblkg) : NULL;
>  }
>  
> +static void bfqg_stats_exit(struct bfqg_stats *stats)
> +{
> +	blkg_rwstat_exit(&stats->bytes);
> +	blkg_rwstat_exit(&stats->ios);
> +#ifdef CONFIG_BFQ_CGROUP_DEBUG
> +	blkg_rwstat_exit(&stats->merged);
> +	blkg_rwstat_exit(&stats->service_time);
> +	blkg_rwstat_exit(&stats->wait_time);
> +	blkg_rwstat_exit(&stats->queued);
> +	bfq_stat_exit(&stats->time);
> +	bfq_stat_exit(&stats->avg_queue_size_sum);
> +	bfq_stat_exit(&stats->avg_queue_size_samples);
> +	bfq_stat_exit(&stats->dequeue);
> +	bfq_stat_exit(&stats->group_wait_time);
> +	bfq_stat_exit(&stats->idle_time);
> +	bfq_stat_exit(&stats->empty_time);
> +#endif
> +}
> +
>  struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
>  {
>  	struct bfq_entity *group_entity = bfqq->entity.parent;
> @@ -321,8 +340,10 @@ static void bfqg_get(struct bfq_group *bfqg)
>  
>  static void bfqg_put(struct bfq_group *bfqg)
>  {
> -	if (refcount_dec_and_test(&bfqg->ref))
> +	if (refcount_dec_and_test(&bfqg->ref)) {
> +		bfqg_stats_exit(&bfqg->stats);
>  		kfree(bfqg);
> +	}
>  }
>  
>  static void bfqg_and_blkg_get(struct bfq_group *bfqg)
> @@ -433,25 +454,6 @@ void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg)
>  	entity->sched_data = &bfqg->sched_data;
>  }
>  
> -static void bfqg_stats_exit(struct bfqg_stats *stats)
> -{
> -	blkg_rwstat_exit(&stats->bytes);
> -	blkg_rwstat_exit(&stats->ios);
> -#ifdef CONFIG_BFQ_CGROUP_DEBUG
> -	blkg_rwstat_exit(&stats->merged);
> -	blkg_rwstat_exit(&stats->service_time);
> -	blkg_rwstat_exit(&stats->wait_time);
> -	blkg_rwstat_exit(&stats->queued);
> -	bfq_stat_exit(&stats->time);
> -	bfq_stat_exit(&stats->avg_queue_size_sum);
> -	bfq_stat_exit(&stats->avg_queue_size_samples);
> -	bfq_stat_exit(&stats->dequeue);
> -	bfq_stat_exit(&stats->group_wait_time);
> -	bfq_stat_exit(&stats->idle_time);
> -	bfq_stat_exit(&stats->empty_time);
> -#endif
> -}
> -
>  static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
>  {
>  	if (blkg_rwstat_init(&stats->bytes, gfp) ||
> @@ -552,7 +554,6 @@ static void bfq_pd_free(struct blkg_policy_data *pd)
>  {
>  	struct bfq_group *bfqg = pd_to_bfqg(pd);
>  
> -	bfqg_stats_exit(&bfqg->stats);
>  	bfqg_put(bfqg);
>  }
>  
> -- 
> 2.51.0
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR