[PATCH v3] blk-cgroup: Replace u64 sync with spinlock for iostat update
Posted by boy.wu 2 months ago
From: Boy Wu <boy.wu@mediatek.com>

On 32-bit SMP systems, if multiple CPUs call blkcg_print_stat,
blkcg_fill_root_iostats can run concurrently and race on the seqcount
inside u64_stats_update_begin/end: two unserialized writers can leave
the seqcount odd, which then deadlocks the u64_stats_fetch_begin loop
in blkcg_print_one_stat.
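
For illustration, the failure mode can be demonstrated with a minimal
user-space sketch (not kernel code; a simplified model of the 32-bit
seqcount). The writer side bumps the seqcount with a plain, unserialized
increment, so two concurrent writers can lose an update and leave the
count odd forever, after which every reader spins:

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static atomic_uint seq;		/* models the seqcount */
static uint64_t counter;	/* models a 64-bit stat */

static void *writer(void *unused)
{
	for (;;) {
		/* like the seqcount bump: load + store, not an atomic RMW */
		atomic_store_explicit(&seq,
			atomic_load_explicit(&seq, memory_order_relaxed) + 1,
			memory_order_release);
		counter++;	/* racy 64-bit update, may tear on 32-bit */
		atomic_store_explicit(&seq,
			atomic_load_explicit(&seq, memory_order_relaxed) + 1,
			memory_order_release);
	}
	return NULL;
}

int main(void)
{
	pthread_t w0, w1;

	pthread_create(&w0, NULL, writer, NULL);
	pthread_create(&w1, NULL, writer, NULL);	/* second writer is the bug */

	for (;;) {
		unsigned int s;
		uint64_t v;

		/* the u64_stats_fetch_begin()/_retry() reader loop */
		do {
			s = atomic_load_explicit(&seq, memory_order_acquire);
			v = counter;
		} while ((s & 1) ||
			 s != atomic_load_explicit(&seq, memory_order_acquire));
		printf("%llu\n", (unsigned long long)v);	/* may never print */
	}
}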

Thus, replace u64 sync with a spinlock to protect the iostat update.

Fixes: ef45fe470e1e ("blk-cgroup: show global disk stats in root cgroup io.stat")
Signed-off-by: Boy Wu <boy.wu@mediatek.com>
---
Changes in v2:
 - Update commit message
 - Remove u64_sync
 - Replace spin_lock_irq with guard statement
 - Replace blkg->q->queue_lock with blkg_stat_lock
Changes in v3:
 - Update commit message
 - Add spinlock to blkg_iostat_set structure
 - Replace all u64_sync with spinlock for iostat
 - Replace blkg_stat_lock with iostat.spinlock
---
 block/blk-cgroup.c | 62 +++++++++++++++++++---------------------------
 block/blk-cgroup.h |  1 +
 2 files changed, 26 insertions(+), 37 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 37e6cc91d576..4b66f37c45a0 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -329,7 +329,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
 	INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
 #endif
 
-	u64_stats_init(&blkg->iostat.sync);
+	spin_lock_init(&blkg->iostat.spinlock);
 	for_each_possible_cpu(cpu) {
 		u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync);
 		per_cpu_ptr(blkg->iostat_cpu, cpu)->blkg = blkg;
@@ -995,15 +995,13 @@ static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
 				struct blkg_iostat *last)
 {
 	struct blkg_iostat delta;
-	unsigned long flags;
 
 	/* propagate percpu delta to global */
-	flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
+	guard(spinlock_irqsave)(&blkg->iostat.spinlock);
 	blkg_iostat_set(&delta, cur);
 	blkg_iostat_sub(&delta, last);
 	blkg_iostat_add(&blkg->iostat.cur, &delta);
 	blkg_iostat_add(last, &delta);
-	u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
 }
 
 static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
@@ -1034,7 +1032,6 @@ static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
 		struct blkcg_gq *blkg = bisc->blkg;
 		struct blkcg_gq *parent = blkg->parent;
 		struct blkg_iostat cur;
-		unsigned int seq;
 
 		/*
 		 * Order assignment of `next_bisc` from `bisc->lnode.next` in
@@ -1051,10 +1048,8 @@ static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
 			goto propagate_up; /* propagate up to parent only */
 
 		/* fetch the current per-cpu values */
-		do {
-			seq = u64_stats_fetch_begin(&bisc->sync);
+		scoped_guard(spinlock_irqsave, &bisc->spinlock)
 			blkg_iostat_set(&cur, &bisc->cur);
-		} while (u64_stats_fetch_retry(&bisc->sync, seq));
 
 		blkcg_iostat_update(blkg, &cur, &bisc->last);
 
@@ -1112,7 +1107,6 @@ static void blkcg_fill_root_iostats(void)
 		struct blkcg_gq *blkg = bdev->bd_disk->queue->root_blkg;
 		struct blkg_iostat tmp;
 		int cpu;
-		unsigned long flags;
 
 		memset(&tmp, 0, sizeof(tmp));
 		for_each_possible_cpu(cpu) {
@@ -1134,9 +1128,8 @@ static void blkcg_fill_root_iostats(void)
 				cpu_dkstats->sectors[STAT_DISCARD] << 9;
 		}
 
-		flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
+		guard(spinlock_irqsave)(&blkg->iostat.spinlock);
 		blkg_iostat_set(&blkg->iostat.cur, &tmp);
-		u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
 	}
 }
 
@@ -1145,7 +1138,6 @@ static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s)
 	struct blkg_iostat_set *bis = &blkg->iostat;
 	u64 rbytes, wbytes, rios, wios, dbytes, dios;
 	const char *dname;
-	unsigned seq;
 	int i;
 
 	if (!blkg->online)
@@ -1157,16 +1149,14 @@ static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s)
 
 	seq_printf(s, "%s ", dname);
 
-	do {
-		seq = u64_stats_fetch_begin(&bis->sync);
-
+	scoped_guard(spinlock_irqsave, &bis->spinlock) {
 		rbytes = bis->cur.bytes[BLKG_IOSTAT_READ];
 		wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE];
 		dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD];
 		rios = bis->cur.ios[BLKG_IOSTAT_READ];
 		wios = bis->cur.ios[BLKG_IOSTAT_WRITE];
 		dios = bis->cur.ios[BLKG_IOSTAT_DISCARD];
-	} while (u64_stats_fetch_retry(&bis->sync, seq));
+	}
 
 	if (rbytes || wbytes || rios || wios) {
 		seq_printf(s, "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
@@ -2141,7 +2131,6 @@ void blk_cgroup_bio_start(struct bio *bio)
 	struct blkcg *blkcg = bio->bi_blkg->blkcg;
 	int rwd = blk_cgroup_io_type(bio), cpu;
 	struct blkg_iostat_set *bis;
-	unsigned long flags;
 
 	if (!cgroup_subsys_on_dfl(io_cgrp_subsys))
 		return;
@@ -2152,30 +2141,29 @@ void blk_cgroup_bio_start(struct bio *bio)
 
 	cpu = get_cpu();
 	bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu);
-	flags = u64_stats_update_begin_irqsave(&bis->sync);
-
-	/*
-	 * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split
-	 * bio and we would have already accounted for the size of the bio.
-	 */
-	if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
-		bio_set_flag(bio, BIO_CGROUP_ACCT);
-		bis->cur.bytes[rwd] += bio->bi_iter.bi_size;
-	}
-	bis->cur.ios[rwd]++;
+	scoped_guard(spinlock_irqsave, &bis->spinlock) {
+		/*
+		 * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split
+		 * bio and we would have already accounted for the size of the bio.
+		 */
+		if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
+			bio_set_flag(bio, BIO_CGROUP_ACCT);
+			bis->cur.bytes[rwd] += bio->bi_iter.bi_size;
+		}
+		bis->cur.ios[rwd]++;
 
-	/*
-	 * If the iostat_cpu isn't in a lockless list, put it into the
-	 * list to indicate that a stat update is pending.
-	 */
-	if (!READ_ONCE(bis->lqueued)) {
-		struct llist_head *lhead = this_cpu_ptr(blkcg->lhead);
+		/*
+		 * If the iostat_cpu isn't in a lockless list, put it into the
+		 * list to indicate that a stat update is pending.
+		 */
+		if (!READ_ONCE(bis->lqueued)) {
+			struct llist_head *lhead = this_cpu_ptr(blkcg->lhead);
 
-		llist_add(&bis->lnode, lhead);
-		WRITE_ONCE(bis->lqueued, true);
+			llist_add(&bis->lnode, lhead);
+			WRITE_ONCE(bis->lqueued, true);
+		}
 	}
 
-	u64_stats_update_end_irqrestore(&bis->sync, flags);
 	cgroup_rstat_updated(blkcg->css.cgroup, cpu);
 	put_cpu();
 }
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index bd472a30bc61..b9544969a131 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -44,6 +44,7 @@ struct blkg_iostat {
 };
 
 struct blkg_iostat_set {
+	spinlock_t			spinlock;
 	struct u64_stats_sync		sync;
 	struct blkcg_gq		       *blkg;
 	struct llist_node		lnode;
-- 
2.18.0
Re: [PATCH v3] blk-cgroup: Replace u64 sync with spinlock for iostat update
Posted by Tejun Heo 2 months ago
Hello, Boy.

So, looking at the patch, I'm not sure a per-blkg lock makes sense.

On Tue, Jul 16, 2024 at 03:52:06PM +0800, boy.wu wrote:
> @@ -995,15 +995,13 @@ static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
>  				struct blkg_iostat *last)
>  {
>  	struct blkg_iostat delta;
> -	unsigned long flags;
>  
>  	/* propagate percpu delta to global */
> -	flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
> +	guard(spinlock_irqsave)(&blkg->iostat.spinlock);
>  	blkg_iostat_set(&delta, cur);
>  	blkg_iostat_sub(&delta, last);
>  	blkg_iostat_add(&blkg->iostat.cur, &delta);
>  	blkg_iostat_add(last, &delta);
> -	u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
>  }

This is already called with blkg_stat_lock held.

> @@ -1051,10 +1048,8 @@ static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
>  			goto propagate_up; /* propagate up to parent only */
>  
>  		/* fetch the current per-cpu values */
> -		do {
> -			seq = u64_stats_fetch_begin(&bisc->sync);
> +		scoped_guard(spinlock_irqsave, &bisc->spinlock)
>  			blkg_iostat_set(&cur, &bisc->cur);
> -		} while (u64_stats_fetch_retry(&bisc->sync, seq));

These are per-cpu stats and we should keep using u64_sync for them.

> @@ -1134,9 +1128,8 @@ static void blkcg_fill_root_iostats(void)
>  				cpu_dkstats->sectors[STAT_DISCARD] << 9;
>  		}
>  
> -		flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
> +		guard(spinlock_irqsave)(&blkg->iostat.spinlock);
>  		blkg_iostat_set(&blkg->iostat.cur, &tmp);
> -		u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
>  	}
>  }
...
> @@ -1157,16 +1149,14 @@ static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s)
>  
>  	seq_printf(s, "%s ", dname);
>  
> -	do {
> -		seq = u64_stats_fetch_begin(&bis->sync);
> -
> +	scoped_guard(spinlock_irqsave, &bis->spinlock) {
>  		rbytes = bis->cur.bytes[BLKG_IOSTAT_READ];
>  		wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE];
>  		dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD];
>  		rios = bis->cur.ios[BLKG_IOSTAT_READ];
>  		wios = bis->cur.ios[BLKG_IOSTAT_WRITE];
>  		dios = bis->cur.ios[BLKG_IOSTAT_DISCARD];
> -	} while (u64_stats_fetch_retry(&bis->sync, seq));
> +	}

The above two are the only places which can potentially benefit from a
per-blkg lock, but these aren't hot paths. I'd just use blkg_stat_lock for
the above.

> @@ -2152,30 +2141,29 @@ void blk_cgroup_bio_start(struct bio *bio)
>  
>  	cpu = get_cpu();
>  	bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu);
> -	flags = u64_stats_update_begin_irqsave(&bis->sync);
> -
> -	/*
> -	 * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split
> -	 * bio and we would have already accounted for the size of the bio.
> -	 */
> -	if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
> -		bio_set_flag(bio, BIO_CGROUP_ACCT);
> -		bis->cur.bytes[rwd] += bio->bi_iter.bi_size;
> -	}
> -	bis->cur.ios[rwd]++;
> +	scoped_guard(spinlock_irqsave, &bis->spinlock) {
> +		/*
> +		 * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split
> +		 * bio and we would have already accounted for the size of the bio.
> +		 */
> +		if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
> +			bio_set_flag(bio, BIO_CGROUP_ACCT);
> +			bis->cur.bytes[rwd] += bio->bi_iter.bi_size;
> +		}
> +		bis->cur.ios[rwd]++;
>  
> -	/*
> -	 * If the iostat_cpu isn't in a lockless list, put it into the
> -	 * list to indicate that a stat update is pending.
> -	 */
> -	if (!READ_ONCE(bis->lqueued)) {
> -		struct llist_head *lhead = this_cpu_ptr(blkcg->lhead);
> +		/*
> +		 * If the iostat_cpu isn't in a lockless list, put it into the
> +		 * list to indicate that a stat update is pending.
> +		 */
> +		if (!READ_ONCE(bis->lqueued)) {
> +			struct llist_head *lhead = this_cpu_ptr(blkcg->lhead);
>  
> -		llist_add(&bis->lnode, lhead);
> -		WRITE_ONCE(bis->lqueued, true);
> +			llist_add(&bis->lnode, lhead);
> +			WRITE_ONCE(bis->lqueued, true);
> +		}

These are per-cpu stat updates which should keep using u64_sync. We don't
want to incur locking overhead for stat updates in the hot issue path.

Thanks.

-- 
tejun
Re: [PATCH v3] blk-cgroup: Replace u64 sync with spinlock for iostat update
Posted by Boy Wu (吳勃誼) 2 months ago
On Tue, 2024-07-16 at 11:13 -1000, Tejun Heo wrote:
> Hello, Boy.
> 
> So, looking at the patch, I'm not sure a per-blkg lock makes sense.
> 
> On Tue, Jul 16, 2024 at 03:52:06PM +0800, boy.wu wrote:
> > @@ -995,15 +995,13 @@ static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
> >  				struct blkg_iostat *last)
> >  {
> >  	struct blkg_iostat delta;
> > -	unsigned long flags;
> >  
> >  	/* propagate percpu delta to global */
> > -	flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
> > +	guard(spinlock_irqsave)(&blkg->iostat.spinlock);
> >  	blkg_iostat_set(&delta, cur);
> >  	blkg_iostat_sub(&delta, last);
> >  	blkg_iostat_add(&blkg->iostat.cur, &delta);
> >  	blkg_iostat_add(last, &delta);
> > -	u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
> >  }
> 
> This is already called with blkg_stat_lock held.

In blkcg_iostat_update, it is protected by both blkg_stat_lock and u64
sync. I agree that no changes are needed here.

> 
> > @@ -1051,10 +1048,8 @@ static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
> >  			goto propagate_up; /* propagate up to parent only */
> >  
> >  		/* fetch the current per-cpu values */
> > -		do {
> > -			seq = u64_stats_fetch_begin(&bisc->sync);
> > +		scoped_guard(spinlock_irqsave, &bisc->spinlock)
> >  			blkg_iostat_set(&cur, &bisc->cur);
> > -		} while (u64_stats_fetch_retry(&bisc->sync, seq));
> 
> These are per-cpu stats and we should keep using u64_sync for them.
> 
> > @@ -1134,9 +1128,8 @@ static void blkcg_fill_root_iostats(void)
> >  				cpu_dkstats->sectors[STAT_DISCARD] << 9;
> >  		}
> >  
> > -		flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
> > +		guard(spinlock_irqsave)(&blkg->iostat.spinlock);
> >  		blkg_iostat_set(&blkg->iostat.cur, &tmp);
> > -		u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
> >  	}
> >  }
> ...
> > @@ -1157,16 +1149,14 @@ static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s)
> >  
> >  	seq_printf(s, "%s ", dname);
> >  
> > -	do {
> > -		seq = u64_stats_fetch_begin(&bis->sync);
> > -
> > +	scoped_guard(spinlock_irqsave, &bis->spinlock) {
> >  		rbytes = bis->cur.bytes[BLKG_IOSTAT_READ];
> >  		wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE];
> >  		dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD];
> >  		rios = bis->cur.ios[BLKG_IOSTAT_READ];
> >  		wios = bis->cur.ios[BLKG_IOSTAT_WRITE];
> >  		dios = bis->cur.ios[BLKG_IOSTAT_DISCARD];
> > -	} while (u64_stats_fetch_retry(&bis->sync, seq));
> > +	}
> 
> The above two are the only places which can potentially benefit from
> a per-blkg lock, but these aren't hot paths. I'd just use blkg_stat_lock
> for the above.

In __blkcg_rstat_flush, u64 sync is used for fetching data; changing to
a spinlock would increase overhead on both 64-bit and 32-bit systems.
64-bit systems do nothing in u64 sync, and 32-bit systems can read data
in parallel under u64 sync as long as no one is updating it. However,
this path is already protected by blkg_stat_lock, so there is no
parallelism to lose, and there is no issue here. I think that no changes
are needed here.
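
(For reference, on 64-bit the u64_stats helpers compile away entirely.
Roughly, paraphrasing include/linux/u64_stats_sync.h; details vary by
kernel version:

struct u64_stats_sync { };	/* no state at all on 64-bit */

static inline void u64_stats_update_begin(struct u64_stats_sync *syncp) { }
static inline void u64_stats_update_end(struct u64_stats_sync *syncp) { }

static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
{
	return 0;
}

static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
					 unsigned int start)
{
	return false;	/* a native 64-bit load cannot tear */
}

so the locking overhead argument applies only to 32-bit.)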

In blkcg_fill_root_iostats, this is where the issue happens on 32-bit
SMP systems, so a spinlock needs to be added to protect it.

In blkcg_print_one_stat, u64 sync is used for fetching data; changing
to a spinlock would increase overhead, as in __blkcg_rstat_flush.
However, it is already protected by spin_lock_irq(&blkg->q->queue_lock)
in blkcg_print_stat, so there is no parallelism now, and there is no
issue here. I think that no changes are needed here.

> 
> > @@ -2152,30 +2141,29 @@ void blk_cgroup_bio_start(struct bio *bio)
> >  
> >  	cpu = get_cpu();
> >  	bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu);
> > -	flags = u64_stats_update_begin_irqsave(&bis->sync);
> > -
> > -	/*
> > -	 * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split
> > -	 * bio and we would have already accounted for the size of the bio.
> > -	 */
> > -	if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
> > -		bio_set_flag(bio, BIO_CGROUP_ACCT);
> > -		bis->cur.bytes[rwd] += bio->bi_iter.bi_size;
> > -	}
> > -	bis->cur.ios[rwd]++;
> > +	scoped_guard(spinlock_irqsave, &bis->spinlock) {
> > +		/*
> > +		 * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split
> > +		 * bio and we would have already accounted for the size of the bio.
> > +		 */
> > +		if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
> > +			bio_set_flag(bio, BIO_CGROUP_ACCT);
> > +			bis->cur.bytes[rwd] += bio->bi_iter.bi_size;
> > +		}
> > +		bis->cur.ios[rwd]++;
> >  
> > -	/*
> > -	 * If the iostat_cpu isn't in a lockless list, put it into the
> > -	 * list to indicate that a stat update is pending.
> > -	 */
> > -	if (!READ_ONCE(bis->lqueued)) {
> > -		struct llist_head *lhead = this_cpu_ptr(blkcg->lhead);
> > +		/*
> > +		 * If the iostat_cpu isn't in a lockless list, put it into the
> > +		 * list to indicate that a stat update is pending.
> > +		 */
> > +		if (!READ_ONCE(bis->lqueued)) {
> > +			struct llist_head *lhead = this_cpu_ptr(blkcg->lhead);
> >  
> > -		llist_add(&bis->lnode, lhead);
> > -		WRITE_ONCE(bis->lqueued, true);
> > +			llist_add(&bis->lnode, lhead);
> > +			WRITE_ONCE(bis->lqueued, true);
> > +		}
> 
> These are per-cpu stat updates which should keep using u64_sync. We
> don't want to incur locking overhead for stat updates in the hot issue
> path.
> 

I agree that no changes are needed in blk_cgroup_bio_start.

> Thanks.
> 
> -- 
> tejun

I think we can look back at the original issue, which is that 32-bit
SMP systems hit a concurrency problem between
u64_stats_update_begin_irqsave and u64_stats_update_end_irqrestore in
blkcg_fill_root_iostats. So, adding a lock in blkcg_fill_root_iostats
only on 32-bit systems resolves the concurrency issue without affecting
64-bit systems, which do not have the issue in the first place, because
u64 sync does nothing on 64-bit systems and they don't need it.

I think we can just fix it with the following change.

@@ -1134,9 +1128,8 @@ static void blkcg_fill_root_iostats(void)
                                cpu_dkstats->sectors[STAT_DISCARD] << 9;
                }

+#if BITS_PER_LONG == 32
+               guard(spinlock_irqsave)(&blkg_stat_lock);
+#endif
                flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
                blkg_iostat_set(&blkg->iostat.cur, &tmp);
                u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
        }
 }
Re: [PATCH v3] blk-cgroup: Replace u64 sync with spinlock for iostat update
Posted by Boy Wu (吳勃誼) 2 months ago
On Wed, 2024-07-17 at 10:25 +0800, Boy.Wu wrote:
...

I think there is another way to fix it. Maybe we should address the
root cause, which is that u64_stats_update_begin_irqsave and
u64_stats_update_end_irqrestore do not protect the seqcount on 32-bit
SMP systems. This can be fixed by the following change.

diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h
index 46040d66334a..94dd74b4fb2c 100644
--- a/include/linux/u64_stats_sync.h
+++ b/include/linux/u64_stats_sync.h
@@ -64,6 +64,7 @@
 struct u64_stats_sync {
 #if BITS_PER_LONG == 32
        seqcount_t      seq;
+       spinlock_t      spinlock;
 #endif
 };

@@ -138,6 +139,7 @@ static inline void u64_stats_inc(u64_stats_t *p)
 static inline void u64_stats_init(struct u64_stats_sync *syncp)
 {
        seqcount_init(&syncp->seq);
+       spin_lock_init(&syncp->spinlock);
 }

 static inline void __u64_stats_update_begin(struct u64_stats_sync *syncp)
@@ -191,6 +193,7 @@ static inline unsigned long u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp)
 {
        unsigned long flags = __u64_stats_irqsave();

+       spin_lock(&syncp->spinlock);
        __u64_stats_update_begin(syncp);
        return flags;
 }
@@ -199,6 +202,7 @@ static inline void u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp,
                                                   unsigned long flags)
 {
        __u64_stats_update_end(syncp);
+       spin_unlock(&syncp->spinlock);
        __u64_stats_irqrestore(flags);
 }

--
boy.wu

Re: [PATCH v3] blk-cgroup: Replace u64 sync with spinlock for iostat update
Posted by Boy Wu (吳勃誼) 2 months ago
On Wed, 2024-07-17 at 11:44 +0800, Boy.Wu wrote:
> I think there is another way to fix it. Maybe we should address the
> root cause, which is that u64_stats_update_begin_irqsave and
> u64_stats_update_end_irqrestore do not protect the seqcount on 32-bit
> SMP systems. This can be fixed by the following change.
...

Never mind, there is a usage note in u64_stats_sync.h:

 * Usage :
 *
 * Stats producer (writer) should use following template granted it already got
 * an exclusive access to counters (a lock is already taken, or per cpu
 * data is used [in a non preemptable context])
 *
 *   spin_lock_bh(...) or other synchronization to get exclusive access
 *   ...
 *   u64_stats_update_begin(&stats->syncp);
 *   u64_stats_add(&stats->bytes64, len); // non atomic operation
 *   u64_stats_inc(&stats->packets64);    // non atomic operation
 *   u64_stats_update_end(&stats->syncp);

--
boy.wu

Re: [PATCH v3] blk-cgroup: Replace u64 sync with spinlock for iostat update
Posted by tj@kernel.org 2 months ago
Hello,

Does something like the following work for you?

Thanks.

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 37e6cc91d576..ec1d191f5c83 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -329,7 +329,6 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
 	INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
 #endif
 
-	u64_stats_init(&blkg->iostat.sync);
 	for_each_possible_cpu(cpu) {
 		u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync);
 		per_cpu_ptr(blkg->iostat_cpu, cpu)->blkg = blkg;
@@ -632,24 +631,26 @@ static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
 static void __blkg_clear_stat(struct blkg_iostat_set *bis)
 {
 	struct blkg_iostat cur = {0};
-	unsigned long flags;
 
-	flags = u64_stats_update_begin_irqsave(&bis->sync);
 	blkg_iostat_set(&bis->cur, &cur);
 	blkg_iostat_set(&bis->last, &cur);
-	u64_stats_update_end_irqrestore(&bis->sync, flags);
 }
 
 static void blkg_clear_stat(struct blkcg_gq *blkg)
 {
+	unsigned long flags;
 	int cpu;
 
+	raw_spin_lock_irqsave(&blkg_stat_lock, flags);
+
 	for_each_possible_cpu(cpu) {
 		struct blkg_iostat_set *s = per_cpu_ptr(blkg->iostat_cpu, cpu);
 
 		__blkg_clear_stat(s);
 	}
 	__blkg_clear_stat(&blkg->iostat);
+
+	raw_spin_unlock_irqrestore(&blkg_stat_lock, flags);
 }
 
 static int blkcg_reset_stats(struct cgroup_subsys_state *css,
@@ -998,12 +999,10 @@ static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
 	unsigned long flags;
 
 	/* propagate percpu delta to global */
-	flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
 	blkg_iostat_set(&delta, cur);
 	blkg_iostat_sub(&delta, last);
 	blkg_iostat_add(&blkg->iostat.cur, &delta);
 	blkg_iostat_add(last, &delta);
-	u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
 }
 
 static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
@@ -1134,9 +1133,9 @@ static void blkcg_fill_root_iostats(void)
 				cpu_dkstats->sectors[STAT_DISCARD] << 9;
 		}
 
-		flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
+		raw_spin_lock_irqsave(&blkg_stat_lock, flags);
 		blkg_iostat_set(&blkg->iostat.cur, &tmp);
-		u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
+		raw_spin_unlock_irqrestore(&blkg_stat_lock, flags);
 	}
 }
 
@@ -1145,7 +1144,6 @@ static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s)
 	struct blkg_iostat_set *bis = &blkg->iostat;
 	u64 rbytes, wbytes, rios, wios, dbytes, dios;
 	const char *dname;
-	unsigned seq;
 	int i;
 
 	if (!blkg->online)
@@ -1157,16 +1155,14 @@ static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s)
 
 	seq_printf(s, "%s ", dname);
 
-	do {
-		seq = u64_stats_fetch_begin(&bis->sync);
-
-		rbytes = bis->cur.bytes[BLKG_IOSTAT_READ];
-		wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE];
-		dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD];
-		rios = bis->cur.ios[BLKG_IOSTAT_READ];
-		wios = bis->cur.ios[BLKG_IOSTAT_WRITE];
-		dios = bis->cur.ios[BLKG_IOSTAT_DISCARD];
-	} while (u64_stats_fetch_retry(&bis->sync, seq));
+	raw_spin_lock_irq(&blkg_stat_lock);
+	rbytes = bis->cur.bytes[BLKG_IOSTAT_READ];
+	wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE];
+	dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD];
+	rios = bis->cur.ios[BLKG_IOSTAT_READ];
+	wios = bis->cur.ios[BLKG_IOSTAT_WRITE];
+	dios = bis->cur.ios[BLKG_IOSTAT_DISCARD];
+	raw_spin_unlock_irq(&blkg_stat_lock);
 
 	if (rbytes || wbytes || rios || wios) {
 		seq_printf(s, "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
Re: [PATCH v3] blk-cgroup: Replace u64 sync with spinlock for iostat update
Posted by Boy Wu (吳勃誼) 2 months ago
On Wed, 2024-07-17 at 06:55 -1000, tj@kernel.org wrote:
> Hello,
> 
> Does something like the following work for you?
> 
> Thanks.
...

I think this will work, but as I mentioned before, this issue only
exists on 32-bit SMP systems. Replacing u64 sync with a spinlock will
increase overhead on 64-bit systems, because u64 sync does nothing
there. However, if that is acceptable, we can proceed with this
solution.

--
boy.wu

Re: [PATCH v3] blk-cgroup: Replace u64 sync with spinlock for iostat update
Posted by tj@kernel.org 2 months ago
Hello,

On Thu, Jul 18, 2024 at 01:21:32AM +0000, Boy Wu (吳勃誼) wrote:
...
> I think this will work, but as I mentioned before, this issue only
> exists on 32-bit SMP systems. Replacing u64 sync with a spinlock will
> increase overhead on 64-bit systems, because u64 sync does nothing
> there. However, if that is acceptable, we can proceed with this
> solution.

We can encapsulate the spinlock in some helpers and make them conditional on
32bit. However, the u64_sync -> spinlock conversions in the suggested patch
are all on cold paths, so I doubt it'd be all that noticeable especially
given that the hottest path of them all is already grabbing blkg_stat_lock,
but either way is fine by me.
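
For concreteness, such helpers might look something like this (an
untested sketch with illustrative names, taking the lock only where
32-bit needs it):

static inline unsigned long blkg_iostat_lock_irqsave(void)
{
	unsigned long flags = 0;

#if BITS_PER_LONG == 32
	/* 64-bit halves can tear; serialize updaters of blkg->iostat */
	raw_spin_lock_irqsave(&blkg_stat_lock, flags);
#else
	/* native 64-bit stores are atomic; only mask interrupts */
	local_irq_save(flags);
#endif
	return flags;
}

static inline void blkg_iostat_unlock_irqrestore(unsigned long flags)
{
#if BITS_PER_LONG == 32
	raw_spin_unlock_irqrestore(&blkg_stat_lock, flags);
#else
	local_irq_restore(flags);
#endif
}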

Thanks.

-- 
tejun
Re: [PATCH v3] blk-cgroup: Replace u64 sync with spinlock for iostat update
Posted by Waiman Long 2 months ago
On 7/17/24 12:55, tj@kernel.org wrote:
> Hello,
>
> Does something like the following work for you?
>
> Thanks.
...

bis->sync is still being used in blk_cgroup_bio_start(). Replacing it 
with a global lock may kill performance. We may have to use a per-cpu 
lock if we want to go this route of eliminating bis->sync.

Cheers,
Longman
Re: [PATCH v3] blk-cgroup: Replace u64 sync with spinlock for iostat update
Posted by tj@kernel.org 2 months ago
Hello, Waiman.

On Wed, Jul 17, 2024 at 01:37:56PM -0400, Waiman Long wrote:
> bis->sync is still being used in blk_cgroup_bio_start(). Replacing it with a
> global lock may kill performance. We may have to use a per-cpu lock if we
> want to go this route of eliminating bis->sync.

So, the idea is to keep using u64_sync for blkg->iostat_cpu and use
blkg_stat_lock for blkg->iostat. The former is the only one which is updated
in the hot path, right?

Thanks.

-- 
tejun
Re: [PATCH v3] blk-cgroup: Replace u64 sync with spinlock for iostat update
Posted by Waiman Long 2 months ago
On 7/17/24 13:40, tj@kernel.org wrote:
> Hello, Waiman.
>
> On Wed, Jul 17, 2024 at 01:37:56PM -0400, Waiman Long wrote:
>> bis->sync is still being used in blk_cgroup_bio_start(). Replacing it with a
>> global lock may kill performance. We may have to use a per-cpu lock if we
>> want to go this route of eliminating bis->sync.
> So, the idea is to keep using u64_sync for blkg->iostat_cpu and use
> blkg_stat_lock for blkg->iostat. The former is the only one which is updated
> in the hot path, right?

Well, it can be confusing whether we are dealing with blkg->iostat or 
blkg->iostat_cpu. In many cases, we are dealing with iostat_cpu instead 
of iostat like __blkcg_rstat_flush() and blkg_clear_stat(). So we can't 
eliminate the use of u64_stats_update_begin_irqsave() in those cases.

Cheers,
Longman
Re: [PATCH v3] blk-cgroup: Replace u64 sync with spinlock for iostat update
Posted by tj@kernel.org 2 months ago
Hello,

On Wed, Jul 17, 2024 at 02:18:39PM -0400, Waiman Long wrote:
> Well, it can be confusing whether we are dealing with blkg->iostat or
> blkg->iostat_cpu. In many cases, we are dealing with iostat_cpu instead of
> iostat like __blkcg_rstat_flush() and blkg_clear_stat(). So we can't
> eliminate the use of u64_stats_update_begin_irqsave() in those cases.

I mean, we need to distinguish them. For 32bits, blkg->iostat has multiple
updaters, so we can't use u64_sync; however, blkg->iostat_cpu has only one
updater (except blkg_clear_stat() which I don't think we need to worry too
much about), so u64_sync is fine.

Thanks.

-- 
tejun
Re: [PATCH v3] blk-cgroup: Replace u64 sync with spinlock for iostat update
Posted by Waiman Long 2 months ago
On 7/17/24 14:24, tj@kernel.org wrote:
> Hello,
>
> On Wed, Jul 17, 2024 at 02:18:39PM -0400, Waiman Long wrote:
>> Well, it can be confusing whether we are dealing with blkg->iostat or
>> blkg->iostat_cpu. In many cases, we are dealing with iostat_cpu instead of
>> iostat like __blkcg_rstat_flush() and blkg_clear_stat(). So we can't
>> eliminate the use of u64_stats_update_begin_irqsave() in those cases.
> I mean, we need to distinguish them. For 32bits, blkg->iostat has multiple
> updaters, so we can't use u64_sync; however, blkg->iostat_cpu has only one
> updater (except blkg_clear_stat() which I don't think we need to worry too
> much about), so u64_sync is fine.

I was wrong about __blkcg_rstat_flush(). Right, the main updater of
iostat_cpu is blk_cgroup_bio_start(). We do need to put in some comments
on what is protected by u64_sync and what is protected by blkg_stat_lock,
though. It can be confusing.
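
Something along these lines, say (wording is only a sketch):

/*
 * Locking for blkg stats:
 *
 *  - blkg->iostat_cpu is written only by blk_cgroup_bio_start() on the
 *    local CPU, so the per-cpu u64_stats_sync (bis->sync) keeps its
 *    single-writer rule and costs nothing on 64-bit.
 *
 *  - blkg->iostat has multiple updaters (__blkcg_rstat_flush(),
 *    blkcg_fill_root_iostats(), blkg_clear_stat()), so u64_sync's
 *    single-writer rule does not hold; updates and reads are
 *    serialized by blkg_stat_lock instead.
 */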

Cheers,
Longman

Re: [PATCH v3] blk-cgroup: Replace u64 sync with spinlock for iostat update
Posted by tj@kernel.org 2 months ago
On Wed, Jul 17, 2024 at 02:55:37PM -0400, Waiman Long wrote:
> I was wrong about __blkcg_rstat_flush(). Right, the main updater of
> iostat_cpu is blk_cgroup_bio_start(). We do need to put in some comments
> on what is protected by u64_sync and what is protected by blkg_stat_lock,
> though. It can be confusing.

Oh yeah, definitely. It is confusing.

-- 
tejun