[PATCH 1/4] memcg: use mod_node_page_state to update stats

Shakeel Butt posted 4 patches 2 months, 4 weeks ago
Posted by Shakeel Butt 2 months, 4 weeks ago
The memcg stats are safe against irq (and nmi) context and thus do not
require disabling irqs. However, some code paths for memcg stats also
update the node level stats through an irq-unsafe interface and thus
require the users to disable irqs. Node level stats, on architectures
with HAVE_CMPXCHG_LOCAL (all major ones), do have an interface which
does not require irq disabling. Let's move the memcg stats code to
start using that interface for node level stats.

Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
---
 include/linux/memcontrol.h | 2 +-
 include/linux/vmstat.h     | 4 ++--
 mm/memcontrol.c            | 6 +++---
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 8c0f15e5978f..f82fac2fd988 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1408,7 +1408,7 @@ static inline void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
 {
 	struct page *page = virt_to_head_page(p);
 
-	__mod_node_page_state(page_pgdat(page), idx, val);
+	mod_node_page_state(page_pgdat(page), idx, val);
 }
 
 static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index c287998908bf..11a37aaa4dd9 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -557,7 +557,7 @@ static inline void mod_lruvec_page_state(struct page *page,
 static inline void __mod_lruvec_state(struct lruvec *lruvec,
 				      enum node_stat_item idx, int val)
 {
-	__mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
+	mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
 }
 
 static inline void mod_lruvec_state(struct lruvec *lruvec,
@@ -569,7 +569,7 @@ static inline void mod_lruvec_state(struct lruvec *lruvec,
 static inline void __lruvec_stat_mod_folio(struct folio *folio,
 					 enum node_stat_item idx, int val)
 {
-	__mod_node_page_state(folio_pgdat(folio), idx, val);
+	mod_node_page_state(folio_pgdat(folio), idx, val);
 }
 
 static inline void lruvec_stat_mod_folio(struct folio *folio,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 025da46d9959..f4b8a6414ed3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -770,7 +770,7 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 			int val)
 {
 	/* Update node */
-	__mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
+	mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
 
 	/* Update memcg and lruvec */
 	if (!mem_cgroup_disabled())
@@ -789,7 +789,7 @@ void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx,
 	/* Untracked pages have no memcg, no lruvec. Update only the node */
 	if (!memcg) {
 		rcu_read_unlock();
-		__mod_node_page_state(pgdat, idx, val);
+		mod_node_page_state(pgdat, idx, val);
 		return;
 	}
 
@@ -815,7 +815,7 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
 	 * vmstats to keep it correct for the root memcg.
 	 */
 	if (!memcg) {
-		__mod_node_page_state(pgdat, idx, val);
+		mod_node_page_state(pgdat, idx, val);
 	} else {
 		lruvec = mem_cgroup_lruvec(memcg, pgdat);
 		__mod_lruvec_state(lruvec, idx, val);
-- 
2.47.3
Re: [PATCH 1/4] memcg: use mod_node_page_state to update stats
Posted by Dev Jain 1 week, 2 days ago
On 11/11/25 4:50 am, Shakeel Butt wrote:
> The memcg stats are safe against irq (and nmi) context and thus does not
> require disabling irqs. However some code paths for memcg stats also
> update the node level stats and use irq unsafe interface and thus
> require the users to disable irqs. However node level stats, on
> architectures with HAVE_CMPXCHG_LOCAL (all major ones), has interface
> which does not require irq disabling. Let's move memcg stats code to
> start using that interface for node level stats.
>
> Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
> ---

Hello Shakeel,

We are seeing a regression in the micromm/munmap benchmark with this patch, on arm64 -
the benchmark mmaps a lot of memory, memsets it, and measures the time taken
to munmap. Please check below whether my understanding of this patch is correct.
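
For reference, the core of the benchmark is roughly the following
(simplified sketch, not the actual micromm.c; SZ is a made-up placeholder):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <time.h>

#define SZ	(1UL << 30)	/* placeholder size */

int main(void)
{
	struct timespec t0, t1;
	char *p = mmap(NULL, SZ, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	memset(p, 1, SZ);			/* fault everything in */
	clock_gettime(CLOCK_MONOTONIC, &t0);
	munmap(p, SZ);				/* this is the measured part */
	clock_gettime(CLOCK_MONOTONIC, &t1);
	printf("munmap: %ld ns\n",
	       (t1.tv_sec - t0.tv_sec) * 1000000000L +
	       (t1.tv_nsec - t0.tv_nsec));
	return 0;
}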

>  include/linux/memcontrol.h | 2 +-
>  include/linux/vmstat.h     | 4 ++--
>  mm/memcontrol.c            | 6 +++---
>  3 files changed, 6 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 8c0f15e5978f..f82fac2fd988 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -1408,7 +1408,7 @@ static inline void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
>  {
>  	struct page *page = virt_to_head_page(p);
>  
> -	__mod_node_page_state(page_pgdat(page), idx, val);
> +	mod_node_page_state(page_pgdat(page), idx, val);
>  }
>  
>  static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
> diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
> index c287998908bf..11a37aaa4dd9 100644
> --- a/include/linux/vmstat.h
> +++ b/include/linux/vmstat.h
> @@ -557,7 +557,7 @@ static inline void mod_lruvec_page_state(struct page *page,
>  static inline void __mod_lruvec_state(struct lruvec *lruvec,
>  				      enum node_stat_item idx, int val)
>  {
> -	__mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
> +	mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
>  }
>  
>  static inline void mod_lruvec_state(struct lruvec *lruvec,
> @@ -569,7 +569,7 @@ static inline void mod_lruvec_state(struct lruvec *lruvec,
>  static inline void __lruvec_stat_mod_folio(struct folio *folio,
>  					 enum node_stat_item idx, int val)
>  {
> -	__mod_node_page_state(folio_pgdat(folio), idx, val);
> +	mod_node_page_state(folio_pgdat(folio), idx, val);
>  }

See folio_remove_rmap_ptes -> __folio_mod_stat -> __lruvec_stat_mod_folio. This path now
has the unconditional overhead of doing this_cpu_try_cmpxchg(). AFAIU the purpose of
this patch was to remove local_irq_save and optimize by using a cmpxchg atomic
(coupled with the fact that the caller will have ensured preempt_disable), but
there are code paths which were not doing local_irq_save in the first place, so
those paths regress.
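
For context, the two update paths look roughly like this (heavily
simplified from mm/vmstat.c, threshold handling trimmed):

	/* __mod_node_page_state(): caller has irqs (or preemption) off */
	x = delta + __this_cpu_read(*p);	/* plain percpu load */
	if (unlikely(abs(x) > t)) {
		node_page_state_add(x, pgdat, item);
		x = 0;
	}
	__this_cpu_write(*p, x);		/* plain percpu store */

	/* mod_node_page_state() -> mod_node_state(): no irq-off required */
	o = this_cpu_read(*p);
	do {
		n = delta + o;
		/* threshold/overstep handling elided */
	} while (!this_cpu_try_cmpxchg(*p, &o, n));	/* atomic RMW per call */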

>  
>  static inline void lruvec_stat_mod_folio(struct folio *folio,
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 025da46d9959..f4b8a6414ed3 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -770,7 +770,7 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
>  			int val)
>  {
>  	/* Update node */
> -	__mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
> +	mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
>  
>  	/* Update memcg and lruvec */
>  	if (!mem_cgroup_disabled())
> @@ -789,7 +789,7 @@ void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx,
>  	/* Untracked pages have no memcg, no lruvec. Update only the node */
>  	if (!memcg) {
>  		rcu_read_unlock();
> -		__mod_node_page_state(pgdat, idx, val);
> +		mod_node_page_state(pgdat, idx, val);
>  		return;
>  	}
>  
> @@ -815,7 +815,7 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
>  	 * vmstats to keep it correct for the root memcg.
>  	 */
>  	if (!memcg) {
> -		__mod_node_page_state(pgdat, idx, val);
> +		mod_node_page_state(pgdat, idx, val);
>  	} else {
>  		lruvec = mem_cgroup_lruvec(memcg, pgdat);
>  		__mod_lruvec_state(lruvec, idx, val);
Re: [PATCH 1/4] memcg: use mod_node_page_state to update stats
Posted by Shakeel Butt 5 days, 21 hours ago
On Thu, Jan 29, 2026 at 06:35:21PM +0530, Dev Jain wrote:
> 
> On 11/11/25 4:50 am, Shakeel Butt wrote:
> > The memcg stats are safe against irq (and nmi) context and thus does not
> > require disabling irqs. However some code paths for memcg stats also
> > update the node level stats and use irq unsafe interface and thus
> > require the users to disable irqs. However node level stats, on
> > architectures with HAVE_CMPXCHG_LOCAL (all major ones), has interface
> > which does not require irq disabling. Let's move memcg stats code to
> > start using that interface for node level stats.
> >
> > Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
> > ---
> 
> Hello Shakeel,
> 
> We are seeing a regression in micromm/munmap benchmark with this patch, on arm64 -
> the benchmark mmmaps a lot of memory, memsets it, and measures the time taken
> to munmap. Please see below if my understanding of this patch is correct.

Thanks for the report. Are you seeing the regression in just the benchmark
or in some real workload as well? Also, how much regression are you seeing?
I have a kernel test robot regression report [1] for this patch as well which
says 2.6% regression, so it was on the back-burner for now. I will
take a look at this again soon.

[1] https://lore.kernel.org/all/202512101408.af3876df-lkp@intel.com/
Re: [PATCH 1/4] memcg: use mod_node_page_state to update stats
Posted by Dev Jain 5 days, 20 hours ago
On 02/02/26 9:56 am, Shakeel Butt wrote:
> On Thu, Jan 29, 2026 at 06:35:21PM +0530, Dev Jain wrote:
>> On 11/11/25 4:50 am, Shakeel Butt wrote:
>>> The memcg stats are safe against irq (and nmi) context and thus does not
>>> require disabling irqs. However some code paths for memcg stats also
>>> update the node level stats and use irq unsafe interface and thus
>>> require the users to disable irqs. However node level stats, on
>>> architectures with HAVE_CMPXCHG_LOCAL (all major ones), has interface
>>> which does not require irq disabling. Let's move memcg stats code to
>>> start using that interface for node level stats.
>>>
>>> Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
>>> ---
>> Hello Shakeel,
>>
>> We are seeing a regression in micromm/munmap benchmark with this patch, on arm64 -
>> the benchmark mmmaps a lot of memory, memsets it, and measures the time taken
>> to munmap. Please see below if my understanding of this patch is correct.
> Thanks for the report. Are you seeing regression in just the benchmark
> or some real workload as well? Also how much regression are you seeing?
> I have a kernel rebot regression report [1] for this patch as well which
> says 2.6% regression and thus it was on the back-burner for now. I will
> take look at this again soon.

The munmap regression is ~24%. Haven't observed a regression in any other
benchmark yet.

>
> [1] https://lore.kernel.org/all/202512101408.af3876df-lkp@intel.com/
>
Re: [PATCH 1/4] memcg: use mod_node_page_state to update stats
Posted by Shakeel Butt 5 days, 20 hours ago
> > 
> > > 
> > > Hello Shakeel,
> > > 
> > >  We are seeing a regression in micromm/munmap benchmark with this patch, on arm64 -
> > >  the benchmark mmmaps a lot of memory, memsets it, and measures the time taken
> > >  to munmap. Please see below if my understanding of this patch is correct.
> > > 
> >  Thanks for the report. Are you seeing regression in just the benchmark
> >  or some real workload as well? Also how much regression are you seeing?
> >  I have a kernel rebot regression report [1] for this patch as well which
> >  says 2.6% regression and thus it was on the back-burner for now. I will
> >  take look at this again soon.
> > 
> The munmap regression is ~24%. Haven't observed a regression in any other
> benchmark yet.

Please share the code/benchmark which shows such a regression; also, if you can
share the perf profile, that would be awesome.
Re: [PATCH 1/4] memcg: use mod_node_page_state to update stats
Posted by Dev Jain 5 days, 16 hours ago
On 02/02/26 10:24 am, Shakeel Butt wrote:
>>>> Hello Shakeel,
>>>>
>>>>  We are seeing a regression in micromm/munmap benchmark with this patch, on arm64 -
>>>>  the benchmark mmmaps a lot of memory, memsets it, and measures the time taken
>>>>  to munmap. Please see below if my understanding of this patch is correct.
>>>>
>>>  Thanks for the report. Are you seeing regression in just the benchmark
>>>  or some real workload as well? Also how much regression are you seeing?
>>>  I have a kernel rebot regression report [1] for this patch as well which
>>>  says 2.6% regression and thus it was on the back-burner for now. I will
>>>  take look at this again soon.
>>>
>> The munmap regression is ~24%. Haven't observed a regression in any other
>> benchmark yet.
> Please share the code/benchmark which shows such regression, also if you can
> share the perf profile, that would be awesome.

https://gitlab.arm.com/tooling/fastpath/-/blob/main/containers/microbench/micromm.c
You can run this with
./micromm 0 munmap 10

I don't have a perf profile; I measured the time taken by the above command, with and
without the patch.
Re: [PATCH 1/4] memcg: use mod_node_page_state to update stats
Posted by Shakeel Butt 3 days, 4 hours ago
On Mon, Feb 02, 2026 at 02:23:54PM +0530, Dev Jain wrote:
> 
> On 02/02/26 10:24 am, Shakeel Butt wrote:
> >>>> Hello Shakeel,
> >>>>
> >>>>  We are seeing a regression in micromm/munmap benchmark with this patch, on arm64 -
> >>>>  the benchmark mmmaps a lot of memory, memsets it, and measures the time taken
> >>>>  to munmap. Please see below if my understanding of this patch is correct.
> >>>>
> >>>  Thanks for the report. Are you seeing regression in just the benchmark
> >>>  or some real workload as well? Also how much regression are you seeing?
> >>>  I have a kernel rebot regression report [1] for this patch as well which
> >>>  says 2.6% regression and thus it was on the back-burner for now. I will
> >>>  take look at this again soon.
> >>>
> >> The munmap regression is ~24%. Haven't observed a regression in any other
> >> benchmark yet.
> > Please share the code/benchmark which shows such regression, also if you can
> > share the perf profile, that would be awesome.
> 
> https://gitlab.arm.com/tooling/fastpath/-/blob/main/containers/microbench/micromm.c
> You can run this with
> ./micromm 0 munmap 10
> 
> Don't have a perf profile, I measured the time taken by above command, with and
> without the patch.
> 

Hi Dev, can you please try the following patch?


From 40155feca7e7bc846800ab8449735bdb03164d6d Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeel.butt@linux.dev>
Date: Wed, 4 Feb 2026 08:46:08 -0800
Subject: [PATCH] vmstat: use preempt disable instead of try_cmpxchg

Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
---
 include/linux/mmzone.h |  2 +-
 mm/vmstat.c            | 58 ++++++++++++++++++------------------------
 2 files changed, 26 insertions(+), 34 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 3e51190a55e4..499cd53efdd6 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -776,7 +776,7 @@ struct per_cpu_zonestat {
 
 struct per_cpu_nodestat {
 	s8 stat_threshold;
-	s8 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS];
+	long vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS];
 };
 
 #endif /* !__GENERATING_BOUNDS.H */
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 86b14b0f77b5..0930695597bb 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -377,7 +377,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
 				long delta)
 {
 	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
-	s8 __percpu *p = pcp->vm_node_stat_diff + item;
+	long __percpu *p = pcp->vm_node_stat_diff + item;
 	long x;
 	long t;
 
@@ -456,8 +456,8 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
 void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
 {
 	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
-	s8 __percpu *p = pcp->vm_node_stat_diff + item;
-	s8 v, t;
+	long __percpu *p = pcp->vm_node_stat_diff + item;
+	long v, t;
 
 	VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
 
@@ -467,7 +467,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
 	v = __this_cpu_inc_return(*p);
 	t = __this_cpu_read(pcp->stat_threshold);
 	if (unlikely(v > t)) {
-		s8 overstep = t >> 1;
+		long overstep = t >> 1;
 
 		node_page_state_add(v + overstep, pgdat, item);
 		__this_cpu_write(*p, -overstep);
@@ -512,8 +512,8 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
 void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
 {
 	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
-	s8 __percpu *p = pcp->vm_node_stat_diff + item;
-	s8 v, t;
+	long __percpu *p = pcp->vm_node_stat_diff + item;
+	long v, t;
 
 	VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
 
@@ -523,7 +523,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
 	v = __this_cpu_dec_return(*p);
 	t = __this_cpu_read(pcp->stat_threshold);
 	if (unlikely(v < - t)) {
-		s8 overstep = t >> 1;
+		long overstep = t >> 1;
 
 		node_page_state_add(v - overstep, pgdat, item);
 		__this_cpu_write(*p, overstep);
@@ -619,9 +619,8 @@ static inline void mod_node_state(struct pglist_data *pgdat,
        enum node_stat_item item, int delta, int overstep_mode)
 {
 	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
-	s8 __percpu *p = pcp->vm_node_stat_diff + item;
-	long n, t, z;
-	s8 o;
+	long __percpu *p = pcp->vm_node_stat_diff + item;
+	long o, n, t, z;
 
 	if (vmstat_item_in_bytes(item)) {
 		/*
@@ -634,32 +633,25 @@ static inline void mod_node_state(struct pglist_data *pgdat,
 		delta >>= PAGE_SHIFT;
 	}
 
+	preempt_disable();
+
 	o = this_cpu_read(*p);
-	do {
-		z = 0;  /* overflow to node counters */
+	n = o + delta;
 
-		/*
-		 * The fetching of the stat_threshold is racy. We may apply
-		 * a counter threshold to the wrong the cpu if we get
-		 * rescheduled while executing here. However, the next
-		 * counter update will apply the threshold again and
-		 * therefore bring the counter under the threshold again.
-		 *
-		 * Most of the time the thresholds are the same anyways
-		 * for all cpus in a node.
-		 */
-		t = this_cpu_read(pcp->stat_threshold);
+	t = this_cpu_read(pcp->stat_threshold);
+	z = 0;
 
-		n = delta + (long)o;
+	if (abs(n) > t) {
+		int os = overstep_mode * (t >> 1);
 
-		if (abs(n) > t) {
-			int os = overstep_mode * (t >> 1) ;
+		/* Overflow must be added to node counters */
+		z = n + os;
+		n = -os;
+	}
 
-			/* Overflow must be added to node counters */
-			z = n + os;
-			n = -os;
-		}
-	} while (!this_cpu_try_cmpxchg(*p, &o, n));
+	this_cpu_add(*p, n - o);
+
+	preempt_enable();
 
 	if (z)
 		node_page_state_add(z, pgdat, item);
@@ -866,7 +858,7 @@ static bool refresh_cpu_vm_stats(bool do_pagesets)
 		struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats;
 
 		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
-			int v;
+			long v;
 
 			v = this_cpu_xchg(p->vm_node_stat_diff[i], 0);
 			if (v) {
@@ -929,7 +921,7 @@ void cpu_vm_stats_fold(int cpu)
 
 		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
 			if (p->vm_node_stat_diff[i]) {
-				int v;
+				long v;
 
 				v = p->vm_node_stat_diff[i];
 				p->vm_node_stat_diff[i] = 0;
-- 
2.47.3
Re: [PATCH 1/4] memcg: use mod_node_page_state to update stats
Posted by Dev Jain 2 days, 20 hours ago
On 05/02/26 2:08 am, Shakeel Butt wrote:
> On Mon, Feb 02, 2026 at 02:23:54PM +0530, Dev Jain wrote:
>> On 02/02/26 10:24 am, Shakeel Butt wrote:
>>>>>> Hello Shakeel,
>>>>>>
>>>>>>  We are seeing a regression in micromm/munmap benchmark with this patch, on arm64 -
>>>>>>  the benchmark mmmaps a lot of memory, memsets it, and measures the time taken
>>>>>>  to munmap. Please see below if my understanding of this patch is correct.
>>>>>>
>>>>>  Thanks for the report. Are you seeing regression in just the benchmark
>>>>>  or some real workload as well? Also how much regression are you seeing?
>>>>>  I have a kernel rebot regression report [1] for this patch as well which
>>>>>  says 2.6% regression and thus it was on the back-burner for now. I will
>>>>>  take look at this again soon.
>>>>>
>>>> The munmap regression is ~24%. Haven't observed a regression in any other
>>>> benchmark yet.
>>> Please share the code/benchmark which shows such regression, also if you can
>>> share the perf profile, that would be awesome.
>> https://gitlab.arm.com/tooling/fastpath/-/blob/main/containers/microbench/micromm.c
>> You can run this with
>> ./micromm 0 munmap 10
>>
>> Don't have a perf profile, I measured the time taken by above command, with and
>> without the patch.
>>
> Hi Dev, can you please try the following patch?
>
>
> From 40155feca7e7bc846800ab8449735bdb03164d6d Mon Sep 17 00:00:00 2001
> From: Shakeel Butt <shakeel.butt@linux.dev>
> Date: Wed, 4 Feb 2026 08:46:08 -0800
> Subject: [PATCH] vmstat: use preempt disable instead of try_cmpxchg
>
> Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
> ---
>  include/linux/mmzone.h |  2 +-
>  mm/vmstat.c            | 58 ++++++++++++++++++------------------------
>  2 files changed, 26 insertions(+), 34 deletions(-)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 3e51190a55e4..499cd53efdd6 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -776,7 +776,7 @@ struct per_cpu_zonestat {
>  
>  struct per_cpu_nodestat {
>  	s8 stat_threshold;
> -	s8 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS];
> +	long vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS];
>  };
>  
>  #endif /* !__GENERATING_BOUNDS.H */
> diff --git a/mm/vmstat.c b/mm/vmstat.c
> index 86b14b0f77b5..0930695597bb 100644
> --- a/mm/vmstat.c
> +++ b/mm/vmstat.c
> @@ -377,7 +377,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
>  				long delta)
>  {
>  	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
> -	s8 __percpu *p = pcp->vm_node_stat_diff + item;
> +	long __percpu *p = pcp->vm_node_stat_diff + item;
>  	long x;
>  	long t;
>  
> @@ -456,8 +456,8 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
>  void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
>  {
>  	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
> -	s8 __percpu *p = pcp->vm_node_stat_diff + item;
> -	s8 v, t;
> +	long __percpu *p = pcp->vm_node_stat_diff + item;
> +	long v, t;
>  
>  	VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
>  
> @@ -467,7 +467,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
>  	v = __this_cpu_inc_return(*p);
>  	t = __this_cpu_read(pcp->stat_threshold);
>  	if (unlikely(v > t)) {
> -		s8 overstep = t >> 1;
> +		long overstep = t >> 1;
>  
>  		node_page_state_add(v + overstep, pgdat, item);
>  		__this_cpu_write(*p, -overstep);
> @@ -512,8 +512,8 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
>  void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
>  {
>  	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
> -	s8 __percpu *p = pcp->vm_node_stat_diff + item;
> -	s8 v, t;
> +	long __percpu *p = pcp->vm_node_stat_diff + item;
> +	long v, t;
>  
>  	VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
>  
> @@ -523,7 +523,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
>  	v = __this_cpu_dec_return(*p);
>  	t = __this_cpu_read(pcp->stat_threshold);
>  	if (unlikely(v < - t)) {
> -		s8 overstep = t >> 1;
> +		long overstep = t >> 1;
>  
>  		node_page_state_add(v - overstep, pgdat, item);
>  		__this_cpu_write(*p, overstep);
> @@ -619,9 +619,8 @@ static inline void mod_node_state(struct pglist_data *pgdat,
>         enum node_stat_item item, int delta, int overstep_mode)
>  {
>  	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
> -	s8 __percpu *p = pcp->vm_node_stat_diff + item;
> -	long n, t, z;
> -	s8 o;
> +	long __percpu *p = pcp->vm_node_stat_diff + item;
> +	long o, n, t, z;
>  
>  	if (vmstat_item_in_bytes(item)) {
>  		/*
> @@ -634,32 +633,25 @@ static inline void mod_node_state(struct pglist_data *pgdat,
>  		delta >>= PAGE_SHIFT;
>  	}
>  
> +	preempt_disable();
> +
>  	o = this_cpu_read(*p);
> -	do {
> -		z = 0;  /* overflow to node counters */
> +	n = o + delta;
>  
> -		/*
> -		 * The fetching of the stat_threshold is racy. We may apply
> -		 * a counter threshold to the wrong the cpu if we get
> -		 * rescheduled while executing here. However, the next
> -		 * counter update will apply the threshold again and
> -		 * therefore bring the counter under the threshold again.
> -		 *
> -		 * Most of the time the thresholds are the same anyways
> -		 * for all cpus in a node.
> -		 */
> -		t = this_cpu_read(pcp->stat_threshold);
> +	t = this_cpu_read(pcp->stat_threshold);
> +	z = 0;
>  
> -		n = delta + (long)o;
> +	if (abs(n) > t) {
> +		int os = overstep_mode * (t >> 1);
>  
> -		if (abs(n) > t) {
> -			int os = overstep_mode * (t >> 1) ;
> +		/* Overflow must be added to node counters */
> +		z = n + os;
> +		n = -os;
> +	}
>  
> -			/* Overflow must be added to node counters */
> -			z = n + os;
> -			n = -os;
> -		}
> -	} while (!this_cpu_try_cmpxchg(*p, &o, n));
> +	this_cpu_add(*p, n - o);
> +
> +	preempt_enable();
>  
>  	if (z)
>  		node_page_state_add(z, pgdat, item);
> @@ -866,7 +858,7 @@ static bool refresh_cpu_vm_stats(bool do_pagesets)
>  		struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats;
>  
>  		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
> -			int v;
> +			long v;
>  
>  			v = this_cpu_xchg(p->vm_node_stat_diff[i], 0);
>  			if (v) {
> @@ -929,7 +921,7 @@ void cpu_vm_stats_fold(int cpu)
>  
>  		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
>  			if (p->vm_node_stat_diff[i]) {
> -				int v;
> +				long v;
>  
>  				v = p->vm_node_stat_diff[i];
>  				p->vm_node_stat_diff[i] = 0;

Thanks for looking into this.

But this doesn't solve it :( preempt_disable() contains a compiler barrier;
probably that's why.

Also, can you confirm whether my analysis of the regression was correct?
Because if it was, then this diff looks wrong - AFAIU preempt_disable()
won't stop an irq handler from interrupting the execution, so this
will introduce a bug for code paths running in irq context.
Re: [PATCH 1/4] memcg: use mod_node_page_state to update stats
Posted by Harry Yoo 2 days, 19 hours ago
On Thu, Feb 05, 2026 at 10:50:06AM +0530, Dev Jain wrote:
> 
> On 05/02/26 2:08 am, Shakeel Butt wrote:
> > On Mon, Feb 02, 2026 at 02:23:54PM +0530, Dev Jain wrote:
> >> On 02/02/26 10:24 am, Shakeel Butt wrote:
> >>>>>> Hello Shakeel,
> >>>>>>
> >>>>>>  We are seeing a regression in micromm/munmap benchmark with this patch, on arm64 -
> >>>>>>  the benchmark mmmaps a lot of memory, memsets it, and measures the time taken
> >>>>>>  to munmap. Please see below if my understanding of this patch is correct.
> >>>>>>
> >>>>>  Thanks for the report. Are you seeing regression in just the benchmark
> >>>>>  or some real workload as well? Also how much regression are you seeing?
> >>>>>  I have a kernel rebot regression report [1] for this patch as well which
> >>>>>  says 2.6% regression and thus it was on the back-burner for now. I will
> >>>>>  take look at this again soon.
> >>>>>
> >>>> The munmap regression is ~24%. Haven't observed a regression in any other
> >>>> benchmark yet.
> >>> Please share the code/benchmark which shows such regression, also if you can
> >>> share the perf profile, that would be awesome.
> >> https://gitlab.arm.com/tooling/fastpath/-/blob/main/containers/microbench/micromm.c
> >> You can run this with
> >> ./micromm 0 munmap 10
> >>
> >> Don't have a perf profile, I measured the time taken by above command, with and
> >> without the patch.
> >>
> > Hi Dev, can you please try the following patch?
> >
> >
> > From 40155feca7e7bc846800ab8449735bdb03164d6d Mon Sep 17 00:00:00 2001
> > From: Shakeel Butt <shakeel.butt@linux.dev>
> > Date: Wed, 4 Feb 2026 08:46:08 -0800
> > Subject: [PATCH] vmstat: use preempt disable instead of try_cmpxchg
> >
> > Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
> > ---

[...snip...]

> 
> Thanks for looking into this.
> 
> But this doesn't solve it :( preempt_disable() contains a compiler barrier,
> probably that's why.

I think the reason it doesn't solve the regression is how arm64 implements
this_cpu_add_8() and this_cpu_try_cmpxchg_8().

On arm64, IIUC both this_cpu_try_cmpxchg_8() and this_cpu_add_8() are
implemented using LL/SC instructions or LSE atomics (if supported), so
either way we pay for an atomic RMW (rough sketch below the call chains).

See:
- this_cpu_add_8()
  -> __percpu_add_case_64
     (which is generated from PERCPU_OP)

- this_cpu_try_cmpxchg_8()
  -> __cpu_fallback_try_cmpxchg(..., this_cpu_cmpxchg_8)
  -> this_cpu_cmpxchg_8()
  -> cmpxchg_relaxed()
  -> raw_cmpxchg_relaxed()
  -> arch_cmpxchg_relaxed()
  -> __cmpxchg_wrapper()
  -> __cmpxchg_case_64()
  -> __lse_ll_sc_body(_cmpxchg_case_64, ...)
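
Very roughly, the per-cpu add that x86-64 gets as a single instruction
becomes an atomic RMW on arm64 (illustrative only, not the real arch code):

	/*
	 * x86-64: this_cpu_add() is one gs-relative instruction, which is
	 * irq-safe simply because it cannot be split:
	 *
	 *	addq	%[val], %%gs:(%[ptr])
	 *
	 * arm64 (LL/SC flavour of PERCPU_OP): the same update needs an
	 * exclusive load/store pair, retried on failure:
	 *
	 *	1:	ldxr	x0, [ptr]
	 *		add	x0, x0, val
	 *		stxr	w1, x0, [ptr]
	 *		cbnz	w1, 1b
	 *
	 * so spinning on this_cpu_try_cmpxchg() vs this_cpu_add() under
	 * preempt_disable() ends up costing about the same here.
	 */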

> Also can you confirm whether my analysis of the regression was correct?
> Because if it was, then this diff looks wrong - AFAIU preempt_disable()
> won't stop an irq handler from interrupting the execution, so this
> will introduce a bug for code paths running in irq context.

I was worried about the correctness too, but this_cpu_add() is safe
against IRQs and so the stat will be _eventually_ consistent?

Ofc it's so confusing! Maybe I'm the one confused.

-- 
Cheers,
Harry / Hyeonggon
Re: [PATCH 1/4] memcg: use mod_node_page_state to update stats
Posted by Shakeel Butt 2 days, 19 hours ago
> 
> On Thu, Feb 05, 2026 at 10:50:06AM +0530, Dev Jain wrote:
> 
> > 
> > On 05/02/26 2:08 am, Shakeel Butt wrote:
> >  On Mon, Feb 02, 2026 at 02:23:54PM +0530, Dev Jain wrote:
> >  On 02/02/26 10:24 am, Shakeel Butt wrote:
> >  Hello Shakeel,
> > 
> >  We are seeing a regression in micromm/munmap benchmark with this patch, on arm64 -
> >  the benchmark mmmaps a lot of memory, memsets it, and measures the time taken
> >  to munmap. Please see below if my understanding of this patch is correct.
> > 
> >  Thanks for the report. Are you seeing regression in just the benchmark
> >  or some real workload as well? Also how much regression are you seeing?
> >  I have a kernel rebot regression report [1] for this patch as well which
> >  says 2.6% regression and thus it was on the back-burner for now. I will
> >  take look at this again soon.
> > 
> >  The munmap regression is ~24%. Haven't observed a regression in any other
> >  benchmark yet.
> >  Please share the code/benchmark which shows such regression, also if you can
> >  share the perf profile, that would be awesome.
> >  https://gitlab.arm.com/tooling/fastpath/-/blob/main/containers/microbench/micromm.c
> >  You can run this with
> >  ./micromm 0 munmap 10
> > 
> >  Don't have a perf profile, I measured the time taken by above command, with and
> >  without the patch.
> > 
> >  Hi Dev, can you please try the following patch?
> > 
> >  From 40155feca7e7bc846800ab8449735bdb03164d6d Mon Sep 17 00:00:00 2001
> >  From: Shakeel Butt <shakeel.butt@linux.dev>
> >  Date: Wed, 4 Feb 2026 08:46:08 -0800
> >  Subject: [PATCH] vmstat: use preempt disable instead of try_cmpxchg
> > 
> >  Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
> >  ---
> > 
> [...snip...]
> 
> > 
> > Thanks for looking into this.
> >  
> >  But this doesn't solve it :( preempt_disable() contains a compiler barrier,
> >  probably that's why.
> > 
> I think the reason why it doesn't solve the regression is because of how
> arm64 implements this_cpu_add_8() and this_cpu_try_cmpxchg_8().
> 
> On arm64, IIUC both this_cpu_try_cmpxchg_8() and this_cpu_add_8() are
> implemented using LL/SC instructions or LSE atomics (if supported).
> 
> See:
> - this_cpu_add_8()
>  -> __percpu_add_case_64
>  (which is generated from PERCPU_OP)
> 
> - this_cpu_try_cmpxchg_8()
>  -> __cpu_fallback_try_cmpxchg(..., this_cpu_cmpxchg_8)
>  -> this_cpu_cmpxchg_8()
>  -> cmpxchg_relaxed()
>  -> raw_cmpxchg_relaxed()
>  -> arch_cmpxchg_relaxed()
>  -> __cmpxchg_wrapper()
>  -> __cmpxchg_case_64()
>  -> __lse_ll_sc_body(_cmpxchg_case_64, ...)
> 

Oh, so it is an arm64-specific issue. I tested on an x86-64 machine and it solves
the little regression it had before. So, on arm64 all this_cpu ops, i.e. the ones
without the double underscore, use LL/SC instructions.

Need more thought on this. 

> > 
> > Also can you confirm whether my analysis of the regression was correct?
> >  Because if it was, then this diff looks wrong - AFAIU preempt_disable()
> >  won't stop an irq handler from interrupting the execution, so this
> >  will introduce a bug for code paths running in irq context.
> > 
> I was worried about the correctness too, but this_cpu_add() is safe
> against IRQs and so the stat will be _eventually_ consistent?
> 
> Ofc it's so confusing! Maybe I'm the one confused.

Yeah, there is no issue with the proposed patch as it makes the function
re-entrant safe.
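
To spell out the re-entrancy argument, here is a rough trace of the
proposed mod_node_state() (my reading of it, values made up):

	/*
	 *	o = this_cpu_read(*p);		// say o == 5
	 *	n = o + delta;			// delta == 3 -> n == 8
	 *	<irq fires, handler does mod_node_state(..., 2, ...),
	 *	 the percpu counter becomes 7>
	 *	this_cpu_add(*p, n - o);	// atomically adds 3 -> 10
	 *
	 * The threshold check may have used a stale 'o', so one update can
	 * overstep the threshold, but no delta is ever lost and the next
	 * update (or the vmstat fold) brings the counter back in line, i.e.
	 * the stats stay eventually consistent.
	 */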
Re: [PATCH 1/4] memcg: use mod_node_page_state to update stats
Posted by Roman Gushchin 2 months, 4 weeks ago
Shakeel Butt <shakeel.butt@linux.dev> writes:

> The memcg stats are safe against irq (and nmi) context and thus does not
> require disabling irqs. However some code paths for memcg stats also
> update the node level stats and use irq unsafe interface and thus
> require the users to disable irqs. However node level stats, on
> architectures with HAVE_CMPXCHG_LOCAL (all major ones), has interface
> which does not require irq disabling. Let's move memcg stats code to
> start using that interface for node level stats.
>
> Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>

Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Re: [PATCH 1/4] memcg: use mod_node_page_state to update stats
Posted by Harry Yoo 2 months, 4 weeks ago
On Mon, Nov 10, 2025 at 03:20:05PM -0800, Shakeel Butt wrote:
> The memcg stats are safe against irq (and nmi) context and thus does not
> require disabling irqs. However some code paths for memcg stats also
> update the node level stats and use irq unsafe interface and thus
> require the users to disable irqs. However node level stats, on
> architectures with HAVE_CMPXCHG_LOCAL (all major ones), has interface
> which does not require irq disabling. Let's move memcg stats code to
> start using that interface for node level stats.
> 
> Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
> ---

Looks good to me,
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>

-- 
Cheers,
Harry / Hyeonggon