[PATCH v3 28/30] mm: memcontrol: prepare for reparenting state_local

Qi Zheng posted 30 patches 3 weeks, 5 days ago
There is a newer version of this series
[PATCH v3 28/30] mm: memcontrol: prepare for reparenting state_local
Posted by Qi Zheng 3 weeks, 5 days ago
From: Qi Zheng <zhengqi.arch@bytedance.com>

To resolve the dying memcg issue, we need to reparent LRU folios of child
memcg to its parent memcg. The following counts are all non-hierarchical
and need to be reparented to prevent the counts of parent memcg overflow.

1. memcg->vmstats->state_local[i]
2. pn->lruvec_stats->state_local[i]

This commit implements the specific function, which will be used during
the reparenting process.

Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
 include/linux/memcontrol.h |  4 +++
 mm/memcontrol-v1.c         | 16 +++++++++++
 mm/memcontrol-v1.h         |  3 ++
 mm/memcontrol.c            | 56 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 79 insertions(+)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 26c3c0e375f58..f84a23f13ffb4 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -963,12 +963,16 @@ static inline void mod_memcg_page_state(struct page *page,
 
 unsigned long memcg_events(struct mem_cgroup *memcg, int event);
 unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx);
+void reparent_memcg_state_local(struct mem_cgroup *memcg,
+				struct mem_cgroup *parent, int idx);
 unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item);
 bool memcg_stat_item_valid(int idx);
 bool memcg_vm_event_item_valid(enum vm_event_item idx);
 unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx);
 unsigned long lruvec_page_state_local(struct lruvec *lruvec,
 				      enum node_stat_item idx);
+void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
+				       struct mem_cgroup *parent, int idx);
 
 void mem_cgroup_flush_stats(struct mem_cgroup *memcg);
 void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg);
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index f0ef650d2317b..800606135e7ba 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -1898,6 +1898,22 @@ static const unsigned int memcg1_events[] = {
 	PGMAJFAULT,
 };
 
+void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++)
+		reparent_memcg_state_local(memcg, parent, memcg1_stats[i]);
+}
+
+void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++)
+		reparent_memcg_lruvec_state_local(memcg, parent, memcg1_stats[i]);
+}
+
 void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
 {
 	unsigned long memory, memsw;
diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
index eb3c3c1056574..45528195d3578 100644
--- a/mm/memcontrol-v1.h
+++ b/mm/memcontrol-v1.h
@@ -41,6 +41,7 @@ static inline bool do_memsw_account(void)
 
 unsigned long memcg_events_local(struct mem_cgroup *memcg, int event);
 unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx);
+void mod_memcg_page_state_local(struct mem_cgroup *memcg, int idx, unsigned long val);
 unsigned long memcg_page_state_local_output(struct mem_cgroup *memcg, int item);
 bool memcg1_alloc_events(struct mem_cgroup *memcg);
 void memcg1_free_events(struct mem_cgroup *memcg);
@@ -73,6 +74,8 @@ void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
 			   unsigned long nr_memory, int nid);
 
 void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s);
+void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
+void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
 
 void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages);
 static inline bool memcg1_tcpmem_active(struct mem_cgroup *memcg)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 70583394f421f..7aa32b97c9f17 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -225,6 +225,28 @@ static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memc
 	return objcg;
 }
 
+#ifdef CONFIG_MEMCG_V1
+static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force);
+
+static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
+{
+	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+		return;
+
+	synchronize_rcu();
+
+	__mem_cgroup_flush_stats(memcg, true);
+
+	/* The following counts are all non-hierarchical and need to be reparented. */
+	reparent_memcg1_state_local(memcg, parent);
+	reparent_memcg1_lruvec_state_local(memcg, parent);
+}
+#else
+static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
+{
+}
+#endif
+
 static inline void reparent_locks(struct mem_cgroup *memcg, struct mem_cgroup *parent)
 {
 	spin_lock_irq(&objcg_lock);
@@ -458,6 +480,28 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
 	return x;
 }
 
+void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
+				       struct mem_cgroup *parent, int idx)
+{
+	int i = memcg_stats_index(idx);
+	int nid;
+
+	if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
+		return;
+
+	for_each_node(nid) {
+		struct lruvec *child_lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+		struct lruvec *parent_lruvec = mem_cgroup_lruvec(parent, NODE_DATA(nid));
+		struct mem_cgroup_per_node *parent_pn;
+		unsigned long value = lruvec_page_state_local(child_lruvec, idx);
+
+		parent_pn = container_of(parent_lruvec, struct mem_cgroup_per_node, lruvec);
+
+		WRITE_ONCE(parent_pn->lruvec_stats->state_local[i],
+			   parent_pn->lruvec_stats->state_local[i] + value);
+	}
+}
+
 /* Subset of vm_event_item to report for memcg event stats */
 static const unsigned int memcg_vm_event_stat[] = {
 #ifdef CONFIG_MEMCG_V1
@@ -757,6 +801,18 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
 #endif
 	return x;
 }
+
+void reparent_memcg_state_local(struct mem_cgroup *memcg,
+				struct mem_cgroup *parent, int idx)
+{
+	int i = memcg_stats_index(idx);
+	unsigned long value = memcg_page_state_local(memcg, idx);
+
+	if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
+		return;
+
+	WRITE_ONCE(parent->vmstats->state_local[i], parent->vmstats->state_local[i] + value);
+}
 #endif
 
 static void mod_memcg_lruvec_state(struct lruvec *lruvec,
-- 
2.20.1
Re: [PATCH v3 28/30] mm: memcontrol: prepare for reparenting state_local
Posted by Shakeel Butt 3 weeks, 1 day ago
On Wed, Jan 14, 2026 at 07:32:55PM +0800, Qi Zheng wrote:
> From: Qi Zheng <zhengqi.arch@bytedance.com>
> 
> To resolve the dying memcg issue, we need to reparent LRU folios of child
> memcg to its parent memcg. The following counts are all non-hierarchical
> and need to be reparented to prevent the counts of parent memcg overflow.
> 
> 1. memcg->vmstats->state_local[i]
> 2. pn->lruvec_stats->state_local[i]
> 
> This commit implements the specific function, which will be used during
> the reparenting process.

Please add more explanation which was discussed in the email chain at
https://lore.kernel.org/all/5dsb6q2r4xsi24kk5gcnckljuvgvvp6nwifwvc4wuho5hsifeg@5ukg2dq6ini5/

Also move the upward traversal code in mod_memcg_state() and
mod_memcg_lruvec_state() you added in later patch to this patch and make
it under CONFIG_MEMCG_V1.

Something like:

#ifdef CONFIG_MEMCG_V1
	while (memcg_is_dying(memcg))
		memcg = parent_mem_cgroup(memcg);
#endif
		

> 
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> ---
>  include/linux/memcontrol.h |  4 +++
>  mm/memcontrol-v1.c         | 16 +++++++++++
>  mm/memcontrol-v1.h         |  3 ++
>  mm/memcontrol.c            | 56 ++++++++++++++++++++++++++++++++++++++
>  4 files changed, 79 insertions(+)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 26c3c0e375f58..f84a23f13ffb4 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -963,12 +963,16 @@ static inline void mod_memcg_page_state(struct page *page,
>  
>  unsigned long memcg_events(struct mem_cgroup *memcg, int event);
>  unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx);
> +void reparent_memcg_state_local(struct mem_cgroup *memcg,
> +				struct mem_cgroup *parent, int idx);
>  unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item);
>  bool memcg_stat_item_valid(int idx);
>  bool memcg_vm_event_item_valid(enum vm_event_item idx);
>  unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx);
>  unsigned long lruvec_page_state_local(struct lruvec *lruvec,
>  				      enum node_stat_item idx);
> +void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
> +				       struct mem_cgroup *parent, int idx);
>  
>  void mem_cgroup_flush_stats(struct mem_cgroup *memcg);
>  void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg);
> diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
> index f0ef650d2317b..800606135e7ba 100644
> --- a/mm/memcontrol-v1.c
> +++ b/mm/memcontrol-v1.c
> @@ -1898,6 +1898,22 @@ static const unsigned int memcg1_events[] = {
>  	PGMAJFAULT,
>  };
>  
> +void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
> +{
> +	int i;
> +
> +	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++)
> +		reparent_memcg_state_local(memcg, parent, memcg1_stats[i]);
> +}
> +
> +void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
> +{
> +	int i;
> +
> +	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++)
> +		reparent_memcg_lruvec_state_local(memcg, parent, memcg1_stats[i]);
> +}
> +
>  void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
>  {
>  	unsigned long memory, memsw;
> diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
> index eb3c3c1056574..45528195d3578 100644
> --- a/mm/memcontrol-v1.h
> +++ b/mm/memcontrol-v1.h
> @@ -41,6 +41,7 @@ static inline bool do_memsw_account(void)
>  
>  unsigned long memcg_events_local(struct mem_cgroup *memcg, int event);
>  unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx);
> +void mod_memcg_page_state_local(struct mem_cgroup *memcg, int idx, unsigned long val);
>  unsigned long memcg_page_state_local_output(struct mem_cgroup *memcg, int item);
>  bool memcg1_alloc_events(struct mem_cgroup *memcg);
>  void memcg1_free_events(struct mem_cgroup *memcg);
> @@ -73,6 +74,8 @@ void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
>  			   unsigned long nr_memory, int nid);
>  
>  void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s);
> +void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
> +void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
>  
>  void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages);
>  static inline bool memcg1_tcpmem_active(struct mem_cgroup *memcg)
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 70583394f421f..7aa32b97c9f17 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -225,6 +225,28 @@ static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memc
>  	return objcg;
>  }
>  
> +#ifdef CONFIG_MEMCG_V1
> +static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force);
> +
> +static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
> +{
> +	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
> +		return;
> +
> +	synchronize_rcu();

Hmm synchrinuze_rcu() is a heavy hammer here. Also you would need rcu
read lock in mod_memcg_state() & mod_memcg_lruvec_state() for this
synchronize_rcu().

Hmm instead of synchronize_rcu() here, we can use queue_rcu_work() in
css_killed_ref_fn(). It would be as simple as the following:

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index e717208cfb18..549a8e026194 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -6046,8 +6046,8 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
  */
 static void css_killed_work_fn(struct work_struct *work)
 {
-	struct cgroup_subsys_state *css =
-		container_of(work, struct cgroup_subsys_state, destroy_work);
+	struct cgroup_subsys_state *css = container_of(to_rcu_work(work),
+				 struct cgroup_subsys_state, destroy_rwork);
 
 	cgroup_lock();
 
@@ -6068,8 +6068,8 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
 		container_of(ref, struct cgroup_subsys_state, refcnt);
 
 	if (atomic_dec_and_test(&css->online_cnt)) {
-		INIT_WORK(&css->destroy_work, css_killed_work_fn);
-		queue_work(cgroup_offline_wq, &css->destroy_work);
+		INIT_RCU_WORK(&css->destroy_rwork, css_killed_work_fn);
+		queue_rcu_work(cgroup_offline_wq, &css->destroy_rwork);
 	}
 }
 
> +
> +	__mem_cgroup_flush_stats(memcg, true);
> +
> +	/* The following counts are all non-hierarchical and need to be reparented. */
> +	reparent_memcg1_state_local(memcg, parent);
> +	reparent_memcg1_lruvec_state_local(memcg, parent);
> +}
> +#else
> +static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
> +{
> +}
> +#endif
> +
Re: [PATCH v3 28/30] mm: memcontrol: prepare for reparenting state_local
Posted by Qi Zheng 3 weeks ago

On 1/18/26 11:20 AM, Shakeel Butt wrote:
> On Wed, Jan 14, 2026 at 07:32:55PM +0800, Qi Zheng wrote:
>> From: Qi Zheng <zhengqi.arch@bytedance.com>
>>
>> To resolve the dying memcg issue, we need to reparent LRU folios of child
>> memcg to its parent memcg. The following counts are all non-hierarchical
>> and need to be reparented to prevent the counts of parent memcg overflow.
>>
>> 1. memcg->vmstats->state_local[i]
>> 2. pn->lruvec_stats->state_local[i]
>>
>> This commit implements the specific function, which will be used during
>> the reparenting process.
> 
> Please add more explanation which was discussed in the email chain at
> https://lore.kernel.org/all/5dsb6q2r4xsi24kk5gcnckljuvgvvp6nwifwvc4wuho5hsifeg@5ukg2dq6ini5/

OK, will do.

> 
> Also move the upward traversal code in mod_memcg_state() and
> mod_memcg_lruvec_state() you added in later patch to this patch and make
> it under CONFIG_MEMCG_V1.
> 
> Something like:
> 
> #ifdef CONFIG_MEMCG_V1
> 	while (memcg_is_dying(memcg))
> 		memcg = parent_mem_cgroup(memcg);
> #endif

OK, will do.

> 		
> 
>>
>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>> ---
>>   include/linux/memcontrol.h |  4 +++
>>   mm/memcontrol-v1.c         | 16 +++++++++++
>>   mm/memcontrol-v1.h         |  3 ++
>>   mm/memcontrol.c            | 56 ++++++++++++++++++++++++++++++++++++++
>>   4 files changed, 79 insertions(+)
>>
>> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
>> index 26c3c0e375f58..f84a23f13ffb4 100644
>> --- a/include/linux/memcontrol.h
>> +++ b/include/linux/memcontrol.h
>> @@ -963,12 +963,16 @@ static inline void mod_memcg_page_state(struct page *page,
>>   
>>   unsigned long memcg_events(struct mem_cgroup *memcg, int event);
>>   unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx);
>> +void reparent_memcg_state_local(struct mem_cgroup *memcg,
>> +				struct mem_cgroup *parent, int idx);
>>   unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item);
>>   bool memcg_stat_item_valid(int idx);
>>   bool memcg_vm_event_item_valid(enum vm_event_item idx);
>>   unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx);
>>   unsigned long lruvec_page_state_local(struct lruvec *lruvec,
>>   				      enum node_stat_item idx);
>> +void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
>> +				       struct mem_cgroup *parent, int idx);
>>   
>>   void mem_cgroup_flush_stats(struct mem_cgroup *memcg);
>>   void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg);
>> diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
>> index f0ef650d2317b..800606135e7ba 100644
>> --- a/mm/memcontrol-v1.c
>> +++ b/mm/memcontrol-v1.c
>> @@ -1898,6 +1898,22 @@ static const unsigned int memcg1_events[] = {
>>   	PGMAJFAULT,
>>   };
>>   
>> +void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
>> +{
>> +	int i;
>> +
>> +	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++)
>> +		reparent_memcg_state_local(memcg, parent, memcg1_stats[i]);
>> +}
>> +
>> +void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
>> +{
>> +	int i;
>> +
>> +	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++)
>> +		reparent_memcg_lruvec_state_local(memcg, parent, memcg1_stats[i]);
>> +}
>> +
>>   void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
>>   {
>>   	unsigned long memory, memsw;
>> diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
>> index eb3c3c1056574..45528195d3578 100644
>> --- a/mm/memcontrol-v1.h
>> +++ b/mm/memcontrol-v1.h
>> @@ -41,6 +41,7 @@ static inline bool do_memsw_account(void)
>>   
>>   unsigned long memcg_events_local(struct mem_cgroup *memcg, int event);
>>   unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx);
>> +void mod_memcg_page_state_local(struct mem_cgroup *memcg, int idx, unsigned long val);
>>   unsigned long memcg_page_state_local_output(struct mem_cgroup *memcg, int item);
>>   bool memcg1_alloc_events(struct mem_cgroup *memcg);
>>   void memcg1_free_events(struct mem_cgroup *memcg);
>> @@ -73,6 +74,8 @@ void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
>>   			   unsigned long nr_memory, int nid);
>>   
>>   void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s);
>> +void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
>> +void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
>>   
>>   void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages);
>>   static inline bool memcg1_tcpmem_active(struct mem_cgroup *memcg)
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index 70583394f421f..7aa32b97c9f17 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -225,6 +225,28 @@ static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memc
>>   	return objcg;
>>   }
>>   
>> +#ifdef CONFIG_MEMCG_V1
>> +static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force);
>> +
>> +static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
>> +{
>> +	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
>> +		return;
>> +
>> +	synchronize_rcu();
> 
> Hmm synchrinuze_rcu() is a heavy hammer here. Also you would need rcu
> read lock in mod_memcg_state() & mod_memcg_lruvec_state() for this
> synchronize_rcu().

Since these two functions require memcg or lruvec, they are already
within the critical section of the RCU lock.

> 
> Hmm instead of synchronize_rcu() here, we can use queue_rcu_work() in
> css_killed_ref_fn(). It would be as simple as the following:

It does look much simpler, will do.

Thanks,
Qi

> 
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index e717208cfb18..549a8e026194 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -6046,8 +6046,8 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
>    */
>   static void css_killed_work_fn(struct work_struct *work)
>   {
> -	struct cgroup_subsys_state *css =
> -		container_of(work, struct cgroup_subsys_state, destroy_work);
> +	struct cgroup_subsys_state *css = container_of(to_rcu_work(work),
> +				 struct cgroup_subsys_state, destroy_rwork);
>   
>   	cgroup_lock();
>   
> @@ -6068,8 +6068,8 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
>   		container_of(ref, struct cgroup_subsys_state, refcnt);
>   
>   	if (atomic_dec_and_test(&css->online_cnt)) {
> -		INIT_WORK(&css->destroy_work, css_killed_work_fn);
> -		queue_work(cgroup_offline_wq, &css->destroy_work);
> +		INIT_RCU_WORK(&css->destroy_rwork, css_killed_work_fn);
> +		queue_rcu_work(cgroup_offline_wq, &css->destroy_rwork);
>   	}
>   }
>   
>> +
>> +	__mem_cgroup_flush_stats(memcg, true);
>> +
>> +	/* The following counts are all non-hierarchical and need to be reparented. */
>> +	reparent_memcg1_state_local(memcg, parent);
>> +	reparent_memcg1_lruvec_state_local(memcg, parent);
>> +}
>> +#else
>> +static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
>> +{
>> +}
>> +#endif
>> +
Re: [PATCH v3 28/30] mm: memcontrol: prepare for reparenting state_local
Posted by Harry Yoo 1 week, 4 days ago
On Mon, Jan 19, 2026 at 11:34:53AM +0800, Qi Zheng wrote:
> 
> 
> On 1/18/26 11:20 AM, Shakeel Butt wrote:
> > On Wed, Jan 14, 2026 at 07:32:55PM +0800, Qi Zheng wrote:
> > > From: Qi Zheng <zhengqi.arch@bytedance.com>
> > > 
> > > To resolve the dying memcg issue, we need to reparent LRU folios of child
> > > memcg to its parent memcg. The following counts are all non-hierarchical
> > > and need to be reparented to prevent the counts of parent memcg overflow.
> > > 
> > > 1. memcg->vmstats->state_local[i]
> > > 2. pn->lruvec_stats->state_local[i]
> > > 
> > > This commit implements the specific function, which will be used during
> > > the reparenting process.
> > 
> > Please add more explanation which was discussed in the email chain at
> > https://lore.kernel.org/all/5dsb6q2r4xsi24kk5gcnckljuvgvvp6nwifwvc4wuho5hsifeg@5ukg2dq6ini5/
> 
> OK, will do.
>
> > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > index 70583394f421f..7aa32b97c9f17 100644
> > > --- a/mm/memcontrol.c
> > > +++ b/mm/memcontrol.c
> > > @@ -225,6 +225,28 @@ static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memc
> > >   	return objcg;
> > >   }
> > > +#ifdef CONFIG_MEMCG_V1
> > > +static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force);
> > > +
> > > +static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
> > > +{
> > > +	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
> > > +		return;
> > > +
> > > +	synchronize_rcu();
> > 
> > Hmm synchrinuze_rcu() is a heavy hammer here. Also you would need rcu
> > read lock in mod_memcg_state() & mod_memcg_lruvec_state() for this
> > synchronize_rcu().
> 
> Since these two functions require memcg or lruvec, they are already
> within the critical section of the RCU lock.

What happens if someone grabbed a refcount and then release the rcu read
lock before percpu refkill and then call mod_memcg[_lruvec]_state()?

In this case, can we end up reparenting in the middle of non-hierarchical
stat update because they don't have RCU grace period?

Something like

T1				T2

				- rcu_read_lock()
				- get memcg refcnt
				- rcu_read_unlock()

				- call mod_memcg_state()
				- CSS_IS_DYING is not set
- Set CSS_IS_DYING
- Trigger percpu refkill
				
- Trigger offline_css()
  -> reparent non-hierarchical	- update non-hierarchical stats
     stats
				- put memcg refcount

> > Hmm instead of synchronize_rcu() here, we can use queue_rcu_work() in
> > css_killed_ref_fn(). It would be as simple as the following:
> 
> It does look much simpler, will do.
> 
> Thanks,
> Qi

-- 
Cheers,
Harry / Hyeonggon
Re: [PATCH v3 28/30] mm: memcontrol: prepare for reparenting state_local
Posted by Qi Zheng 1 week, 4 days ago

On 1/29/26 10:10 AM, Harry Yoo wrote:
> On Mon, Jan 19, 2026 at 11:34:53AM +0800, Qi Zheng wrote:
>>
>>
>> On 1/18/26 11:20 AM, Shakeel Butt wrote:
>>> On Wed, Jan 14, 2026 at 07:32:55PM +0800, Qi Zheng wrote:
>>>> From: Qi Zheng <zhengqi.arch@bytedance.com>
>>>>
>>>> To resolve the dying memcg issue, we need to reparent LRU folios of child
>>>> memcg to its parent memcg. The following counts are all non-hierarchical
>>>> and need to be reparented to prevent the counts of parent memcg overflow.
>>>>
>>>> 1. memcg->vmstats->state_local[i]
>>>> 2. pn->lruvec_stats->state_local[i]
>>>>
>>>> This commit implements the specific function, which will be used during
>>>> the reparenting process.
>>>
>>> Please add more explanation which was discussed in the email chain at
>>> https://lore.kernel.org/all/5dsb6q2r4xsi24kk5gcnckljuvgvvp6nwifwvc4wuho5hsifeg@5ukg2dq6ini5/
>>
>> OK, will do.
>>
>>>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>>>> index 70583394f421f..7aa32b97c9f17 100644
>>>> --- a/mm/memcontrol.c
>>>> +++ b/mm/memcontrol.c
>>>> @@ -225,6 +225,28 @@ static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memc
>>>>    	return objcg;
>>>>    }
>>>> +#ifdef CONFIG_MEMCG_V1
>>>> +static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force);
>>>> +
>>>> +static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
>>>> +{
>>>> +	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
>>>> +		return;
>>>> +
>>>> +	synchronize_rcu();
>>>
>>> Hmm synchrinuze_rcu() is a heavy hammer here. Also you would need rcu
>>> read lock in mod_memcg_state() & mod_memcg_lruvec_state() for this
>>> synchronize_rcu().
>>
>> Since these two functions require memcg or lruvec, they are already
>> within the critical section of the RCU lock.
> 
> What happens if someone grabbed a refcount and then release the rcu read
> lock before percpu refkill and then call mod_memcg[_lruvec]_state()?
> 
> In this case, can we end up reparenting in the middle of non-hierarchical
> stat update because they don't have RCU grace period?
> 
> Something like
> 
> T1				T2
> 
> 				- rcu_read_lock()
> 				- get memcg refcnt
> 				- rcu_read_unlock()
> 
> 				- call mod_memcg_state()
> 				- CSS_IS_DYING is not set
> - Set CSS_IS_DYING
> - Trigger percpu refkill
> 				
> - Trigger offline_css()
>    -> reparent non-hierarchical	- update non-hierarchical stats
>       stats
> 				- put memcg refcount

Good catch, I think you are right.

The rcu lock should be added to mod_memcg_state() and
mod_memcg_lruvec_state().

I will update to v4 as soon as possible.

Thanks,
Qi

> 
>>> Hmm instead of synchronize_rcu() here, we can use queue_rcu_work() in
>>> css_killed_ref_fn(). It would be as simple as the following:
>>
>> It does look much simpler, will do.
>>
>> Thanks,
>> Qi
>
Re: [PATCH v3 28/30] mm: memcontrol: prepare for reparenting state_local
Posted by Harry Yoo 1 week, 4 days ago
On Thu, Jan 29, 2026 at 04:50:39PM +0800, Qi Zheng wrote:
> 
> 
> On 1/29/26 10:10 AM, Harry Yoo wrote:
> > On Mon, Jan 19, 2026 at 11:34:53AM +0800, Qi Zheng wrote:
> > > 
> > > 
> > > On 1/18/26 11:20 AM, Shakeel Butt wrote:
> > > > On Wed, Jan 14, 2026 at 07:32:55PM +0800, Qi Zheng wrote:
> > > > > From: Qi Zheng <zhengqi.arch@bytedance.com>
> > > > > 
> > > > > To resolve the dying memcg issue, we need to reparent LRU folios of child
> > > > > memcg to its parent memcg. The following counts are all non-hierarchical
> > > > > and need to be reparented to prevent the counts of parent memcg overflow.
> > > > > 
> > > > > 1. memcg->vmstats->state_local[i]
> > > > > 2. pn->lruvec_stats->state_local[i]
> > > > > 
> > > > > This commit implements the specific function, which will be used during
> > > > > the reparenting process.
> > > > 
> > > > Please add more explanation which was discussed in the email chain at
> > > > https://lore.kernel.org/all/5dsb6q2r4xsi24kk5gcnckljuvgvvp6nwifwvc4wuho5hsifeg@5ukg2dq6ini5/
> > > 
> > > OK, will do.
> > > 
> > > > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > > > index 70583394f421f..7aa32b97c9f17 100644
> > > > > --- a/mm/memcontrol.c
> > > > > +++ b/mm/memcontrol.c
> > > > > @@ -225,6 +225,28 @@ static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memc
> > > > >    	return objcg;
> > > > >    }
> > > > > +#ifdef CONFIG_MEMCG_V1
> > > > > +static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force);
> > > > > +
> > > > > +static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
> > > > > +{
> > > > > +	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
> > > > > +		return;
> > > > > +
> > > > > +	synchronize_rcu();
> > > > 
> > > > Hmm synchrinuze_rcu() is a heavy hammer here. Also you would need rcu
> > > > read lock in mod_memcg_state() & mod_memcg_lruvec_state() for this
> > > > synchronize_rcu().
> > > 
> > > Since these two functions require memcg or lruvec, they are already
> > > within the critical section of the RCU lock.
> > 
> > What happens if someone grabbed a refcount and then release the rcu read
> > lock before percpu refkill and then call mod_memcg[_lruvec]_state()?
> > 
> > In this case, can we end up reparenting in the middle of non-hierarchical
> > stat update because they don't have RCU grace period?
> > 
> > Something like
> > 
> > T1				T2
> > 
> > 				- rcu_read_lock()
> > 				- get memcg refcnt
> > 				- rcu_read_unlock()
> > 
> > 				- call mod_memcg_state()
> > 				- CSS_IS_DYING is not set
> > - Set CSS_IS_DYING
> > - Trigger percpu refkill
> > 				
> > - Trigger offline_css()
> >    -> reparent non-hierarchical	- update non-hierarchical stats
> >       stats
> > 				- put memcg refcount
> 
> Good catch, I think you are right.
> 
> The rcu lock should be added to mod_memcg_state() and
> mod_memcg_lruvec_state().

Thanks for confirming!

Because it's quite confusing, let me ask few more questions...

Q1. Yosry mentioned [1] [2] that stat updates should be done in the same
RCU section that is used to grab a refcount of the cgroup.

But I don't think your work is relying on that. Instead, I guess, it's
relying on the CSS_DYING check from reader side to determine whether it
should update stats of the child or parent memcg, right?

-> That being said, when rcu_read_lock() is called _after_ stats are
   reparented, the reader must observe that the CSS_DYING flag is set.

[1] https://lore.kernel.org/all/utl6esq7jz5e4f7kwgrpwdjc2rm3yi33ljb6xkm5nxzufa4o7s@hblq2piu3vnz 
[2] https://lore.kernel.org/all/ebdhvcwygvnfejai5azhg3sjudsjorwmlcvmzadpkhexoeq3tb@5gj5y2exdhpn

Q2. When a reader checks CSS_DYING flag, how is the flag change
guaranteed to be visible to the reader without any lock, memory barrier,
or atomic ops involved?

As Shakeel mentioned elsewhere, I hope some explanations for correctness
to be included in the commit message :)

> I will update to v4 as soon as possible.

Thanks a lot!

I'll wait for that and will review carefully to make sure it's correct ;)

> Thanks,
> Qi
> 
> > > > Hmm instead of synchronize_rcu() here, we can use queue_rcu_work() in
> > > > css_killed_ref_fn(). It would be as simple as the following:
> > > 
> > > It does look much simpler, will do.
> > > 
> > > Thanks,
> > > Qi

-- 
Cheers,
Harry / Hyeonggon
Re: [PATCH v3 28/30] mm: memcontrol: prepare for reparenting state_local
Posted by Qi Zheng 1 week, 3 days ago

On 1/29/26 8:23 PM, Harry Yoo wrote:
> On Thu, Jan 29, 2026 at 04:50:39PM +0800, Qi Zheng wrote:
>>
>>
>> On 1/29/26 10:10 AM, Harry Yoo wrote:
>>> On Mon, Jan 19, 2026 at 11:34:53AM +0800, Qi Zheng wrote:
>>>>
>>>>
>>>> On 1/18/26 11:20 AM, Shakeel Butt wrote:
>>>>> On Wed, Jan 14, 2026 at 07:32:55PM +0800, Qi Zheng wrote:
>>>>>> From: Qi Zheng <zhengqi.arch@bytedance.com>
>>>>>>
>>>>>> To resolve the dying memcg issue, we need to reparent LRU folios of child
>>>>>> memcg to its parent memcg. The following counts are all non-hierarchical
>>>>>> and need to be reparented to prevent the counts of parent memcg overflow.
>>>>>>
>>>>>> 1. memcg->vmstats->state_local[i]
>>>>>> 2. pn->lruvec_stats->state_local[i]
>>>>>>
>>>>>> This commit implements the specific function, which will be used during
>>>>>> the reparenting process.
>>>>>
>>>>> Please add more explanation which was discussed in the email chain at
>>>>> https://lore.kernel.org/all/5dsb6q2r4xsi24kk5gcnckljuvgvvp6nwifwvc4wuho5hsifeg@5ukg2dq6ini5/
>>>>
>>>> OK, will do.
>>>>
>>>>>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>>>>>> index 70583394f421f..7aa32b97c9f17 100644
>>>>>> --- a/mm/memcontrol.c
>>>>>> +++ b/mm/memcontrol.c
>>>>>> @@ -225,6 +225,28 @@ static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memc
>>>>>>     	return objcg;
>>>>>>     }
>>>>>> +#ifdef CONFIG_MEMCG_V1
>>>>>> +static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force);
>>>>>> +
>>>>>> +static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
>>>>>> +{
>>>>>> +	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
>>>>>> +		return;
>>>>>> +
>>>>>> +	synchronize_rcu();
>>>>>
>>>>> Hmm synchrinuze_rcu() is a heavy hammer here. Also you would need rcu
>>>>> read lock in mod_memcg_state() & mod_memcg_lruvec_state() for this
>>>>> synchronize_rcu().
>>>>
>>>> Since these two functions require memcg or lruvec, they are already
>>>> within the critical section of the RCU lock.
>>>
>>> What happens if someone grabbed a refcount and then release the rcu read
>>> lock before percpu refkill and then call mod_memcg[_lruvec]_state()?
>>>
>>> In this case, can we end up reparenting in the middle of non-hierarchical
>>> stat update because they don't have RCU grace period?
>>>
>>> Something like
>>>
>>> T1				T2
>>>
>>> 				- rcu_read_lock()
>>> 				- get memcg refcnt
>>> 				- rcu_read_unlock()
>>>
>>> 				- call mod_memcg_state()
>>> 				- CSS_IS_DYING is not set
>>> - Set CSS_IS_DYING
>>> - Trigger percpu refkill
>>> 				
>>> - Trigger offline_css()
>>>     -> reparent non-hierarchical	- update non-hierarchical stats
>>>        stats
>>> 				- put memcg refcount
>>
>> Good catch, I think you are right.
>>
>> The rcu lock should be added to mod_memcg_state() and
>> mod_memcg_lruvec_state().
> 
> Thanks for confirming!
> 
> Because it's quite confusing, let me ask few more questions...
> 
> Q1. Yosry mentioned [1] [2] that stat updates should be done in the same
> RCU section that is used to grab a refcount of the cgroup.
> 
> But I don't think your work is relying on that. Instead, I guess, it's
> relying on the CSS_DYING check from reader side to determine whether it

Only relying the CSS_DYING check is insufficient. Otherwise, the
following race may occur:

T1				T2

				memcg_is_dying is false
Set CSS_IS_DYING
reparent non-hierarchical	update non-hierarchical stats for child

So IIUC we should add rcu lock, then:

T1				T2

				rcu_read_lock
				memcg_is_dying is false

Set CSS_IS_DYING
				update non-hierarchical stats for child
				rcu_read_unlock

synchronize_rcu or rcu work
--> reparent non-hierarchical


> should update stats of the child or parent memcg, right?
> 
> -> That being said, when rcu_read_lock() is called _after_ stats are
>     reparented, the reader must observe that the CSS_DYING flag is set.
> 
> [1] https://lore.kernel.org/all/utl6esq7jz5e4f7kwgrpwdjc2rm3yi33ljb6xkm5nxzufa4o7s@hblq2piu3vnz
> [2] https://lore.kernel.org/all/ebdhvcwygvnfejai5azhg3sjudsjorwmlcvmzadpkhexoeq3tb@5gj5y2exdhpn
> 
> Q2. When a reader checks CSS_DYING flag, how is the flag change
> guaranteed to be visible to the reader without any lock, memory barrier,
> or atomic ops involved?

The main situation requiring CSS_DYING check is as follow:

T1				T2

Set CSS_IS_DYING

synchronize_rcu or rcu work
--> reparent non-hierarchical
				rcu_read_lock()
				memcg_is_dying is true
				update non-hierarchical stats for parent

Referring to the "Memory-Barrier Guarantees" section in [1], 
synchronize_rcu() can guarantee that T2 can see CSS_IS_DYING. Right?


[1]. 
https://www.kernel.org/doc/Documentation/RCU/Design/Requirements/Requirements.rst

Thanks,
Qi

> 
> As Shakeel mentioned elsewhere, I hope some explanations for correctness
> to be included in the commit message :)
> 
>> I will update to v4 as soon as possible.
> 
> Thanks a lot!
> 
> I'll wait for that and will review carefully to make sure it's correct ;)
> 
>> Thanks,
>> Qi
>>
>>>>> Hmm instead of synchronize_rcu() here, we can use queue_rcu_work() in
>>>>> css_killed_ref_fn(). It would be as simple as the following:
>>>>
>>>> It does look much simpler, will do.
>>>>
>>>> Thanks,
>>>> Qi
>
Re: [PATCH v3 28/30] mm: memcontrol: prepare for reparenting state_local
Posted by Harry Yoo 1 week ago
On Fri, Jan 30, 2026 at 03:22:20PM +0800, Qi Zheng wrote:
> On 1/29/26 8:23 PM, Harry Yoo wrote:
> > On Thu, Jan 29, 2026 at 04:50:39PM +0800, Qi Zheng wrote:
> > > On 1/29/26 10:10 AM, Harry Yoo wrote:
> > > > On Mon, Jan 19, 2026 at 11:34:53AM +0800, Qi Zheng wrote:
> > > > > On 1/18/26 11:20 AM, Shakeel Butt wrote:
> > > > > > On Wed, Jan 14, 2026 at 07:32:55PM +0800, Qi Zheng wrote:
> > > > > Since these two functions require memcg or lruvec, they are already
> > > > > within the critical section of the RCU lock.
> > > > 
> > > > What happens if someone grabbed a refcount and then release the rcu read
> > > > lock before percpu refkill and then call mod_memcg[_lruvec]_state()?
> > > > 
> > > > In this case, can we end up reparenting in the middle of non-hierarchical
> > > > stat update because they don't have RCU grace period?
> > > > 
> > > > Something like
> > > > 
> > > > T1				T2
> > > > 
> > > > 				- rcu_read_lock()
> > > > 				- get memcg refcnt
> > > > 				- rcu_read_unlock()
> > > > 
> > > > 				- call mod_memcg_state()
> > > > 				- CSS_IS_DYING is not set
> > > > - Set CSS_IS_DYING
> > > > - Trigger percpu refkill
> > > > 				
> > > > - Trigger offline_css()
> > > >     -> reparent non-hierarchical	- update non-hierarchical stats
> > > >        stats
> > > > 				- put memcg refcount
> > > 
> > > Good catch, I think you are right.
> > > 
> > > The rcu lock should be added to mod_memcg_state() and
> > > mod_memcg_lruvec_state().
> > 
> > Thanks for confirming!
> > 
> > Because it's quite confusing, let me ask few more questions...
> > 
> > Q1. Yosry mentioned [1] [2] that stat updates should be done in the same
> > RCU section that is used to grab a refcount of the cgroup.
> > 
> > But I don't think your work is relying on that. Instead, I guess, it's
> > relying on the CSS_DYING check from reader side to determine whether it
> 
> Only relying the CSS_DYING check is insufficient. Otherwise, the
> following race may occur:
> 
> T1				T2
> 
> 				memcg_is_dying is false
> Set CSS_IS_DYING
> reparent non-hierarchical	update non-hierarchical stats for child
> 
> So IIUC we should add rcu lock, then:

Right, RCU here is used to enforce ordering between reparenting
non-hierarchical stats and stat updates for child.

> T1				T2
> 
> 				rcu_read_lock
> 				memcg_is_dying is false
> 
> Set CSS_IS_DYING
> 				update non-hierarchical stats for child
> 				rcu_read_unlock
> 
> synchronize_rcu or rcu work
> --> reparent non-hierarchical
> 
> > should update stats of the child or parent memcg, right?
> > 
> > -> That being said, when rcu_read_lock() is called _after_ stats are
> >     reparented, the reader must observe that the CSS_DYING flag is set.
> > 
> > [1] https://lore.kernel.org/all/utl6esq7jz5e4f7kwgrpwdjc2rm3yi33ljb6xkm5nxzufa4o7s@hblq2piu3vnz
> > [2] https://lore.kernel.org/all/ebdhvcwygvnfejai5azhg3sjudsjorwmlcvmzadpkhexoeq3tb@5gj5y2exdhpn
> > 
> > Q2. When a reader checks CSS_DYING flag, how is the flag change
> > guaranteed to be visible to the reader without any lock, memory barrier,
> > or atomic ops involved?
> 
> The main situation requiring CSS_DYING check is as follow:
> 
> T1				T2
> 
> Set CSS_IS_DYING
> 
> synchronize_rcu or rcu work
> --> reparent non-hierarchical
> 				rcu_read_lock()
> 				memcg_is_dying is true
> 				update non-hierarchical stats for parent
> 
> Referring to the "Memory-Barrier Guarantees" section in [1],

Today I learned a little bit about RCU's requirements :)

> synchronize_rcu() can guarantee that T2 can see CSS_IS_DYING. Right?

Right. It is fair to assume that a read-side critical section either

1) ends before synchronize_rcu(), in this case, the reader might see
   CSS_DYING flag or not, but it doesn't matter because this reader
   ends before reparenting the stats.

2) starts after synchronize_rcu() starts, in this case, all stores
   before the grace period starts must be propagated to all CPUs before
   the critical section starts. Thus, such a reader is guaranteed
   to see the CSS_DYING flag set.

Linux Kernel Memory Model [2] explicitly explains 1) and 2):
> RCU (Read-Copy-Update) is a powerful synchronization mechanism.  It
> rests on two concepts: grace periods and read-side critical sections.
> 
> A grace period is the span of time occupied by a call to
> synchronize_rcu().  A read-side critical section (or just critical
> section, for short) is a region of code delimited by rcu_read_lock()
> at the start and rcu_read_unlock() at the end.  Critical sections can
> be nested, although we won't make use of this fact.
> 
> As far as memory models are concerned, RCU's main feature is its
> Grace-Period Guarantee, which states that a critical section can never
> span a full grace period.  In more detail, the Guarantee says:
> 
>         For any critical section C and any grace period G, at least
>         one of the following statements must hold:
> 
> (1)     C ends before G does, and in addition, every store that
>         propagates to C's CPU before the end of C must propagate to
>         every CPU before G ends.
> 
> (2)     G starts before C does, and in addition, every store that
>         propagates to G's CPU before the start of G must propagate
>         to every CPU before C starts.

	  ^ this

[2] https://docs.kernel.org/dev-tools/lkmm/docs/explanation.html

> [1]. https://www.kernel.org/doc/Documentation/RCU/Design/Requirements/Requirements.rst
> Qi

-- 
Cheers,
Harry / Hyeonggon
[PATCH v3 28/30 fix 1/2] mm: memcontrol: fix lruvec_stats->state_local reparenting
Posted by Qi Zheng 3 weeks, 4 days ago
From: Qi Zheng <zhengqi.arch@bytedance.com>

Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
 include/linux/memcontrol.h |  2 --
 mm/memcontrol-v1.c         |  8 --------
 mm/memcontrol-v1.h         |  5 ++++-
 mm/memcontrol.c            | 19 ++++++++-----------
 4 files changed, 12 insertions(+), 22 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 1fe554eec1e25..e0b84b109b7ac 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -944,8 +944,6 @@ bool memcg_vm_event_item_valid(enum vm_event_item idx);
 unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx);
 unsigned long lruvec_page_state_local(struct lruvec *lruvec,
 				      enum node_stat_item idx);
-void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
-				       struct mem_cgroup *parent, int idx);
 
 void mem_cgroup_flush_stats(struct mem_cgroup *memcg);
 void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg);
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index 03b924920d6a5..daf9bad8c45ea 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -1909,14 +1909,6 @@ void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *pa
 		reparent_memcg_state_local(memcg, parent, memcg1_stats[i]);
 }
 
-void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
-{
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++)
-		reparent_memcg_lruvec_state_local(memcg, parent, memcg1_stats[i]);
-}
-
 void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
 {
 	unsigned long memory, memsw;
diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
index 45528195d3578..5b1188f3d4173 100644
--- a/mm/memcontrol-v1.h
+++ b/mm/memcontrol-v1.h
@@ -75,7 +75,6 @@ void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
 
 void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s);
 void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
-void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
 
 void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages);
 static inline bool memcg1_tcpmem_active(struct mem_cgroup *memcg)
@@ -116,6 +115,10 @@ static inline void memcg1_uncharge_batch(struct mem_cgroup *memcg,
 
 static inline void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) {}
 
+static inline void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
+{
+}
+
 static inline void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages) {}
 static inline bool memcg1_tcpmem_active(struct mem_cgroup *memcg) { return false; }
 static inline bool memcg1_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7333a37830051..b7b35143d4d2d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -225,13 +225,13 @@ static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memc
 	return objcg;
 }
 
-#ifdef CONFIG_MEMCG_V1
+static void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
+					      struct mem_cgroup *parent, int idx);
 static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force);
 
 static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
 {
-	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
-		return;
+	int i;
 
 	synchronize_rcu();
 
@@ -239,13 +239,10 @@ static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgr
 
 	/* The following counts are all non-hierarchical and need to be reparented. */
 	reparent_memcg1_state_local(memcg, parent);
-	reparent_memcg1_lruvec_state_local(memcg, parent);
-}
-#else
-static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
-{
+
+	for (i = 0; i < NR_LRU_LISTS; i++)
+		reparent_memcg_lruvec_state_local(memcg, parent, i);
 }
-#endif
 
 static inline void reparent_locks(struct mem_cgroup *memcg, struct mem_cgroup *parent)
 {
@@ -510,8 +507,8 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
 	return x;
 }
 
-void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
-				       struct mem_cgroup *parent, int idx)
+static void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
+					      struct mem_cgroup *parent, int idx)
 {
 	int i = memcg_stats_index(idx);
 	int nid;
-- 
2.20.1
Re: [PATCH v3 28/30 fix 1/2] mm: memcontrol: fix lruvec_stats->state_local reparenting
Posted by Shakeel Butt 3 weeks, 1 day ago
On Thu, Jan 15, 2026 at 06:41:38PM +0800, Qi Zheng wrote:
> From: Qi Zheng <zhengqi.arch@bytedance.com>
> 
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> ---
>  include/linux/memcontrol.h |  2 --
>  mm/memcontrol-v1.c         |  8 --------
>  mm/memcontrol-v1.h         |  5 ++++-
>  mm/memcontrol.c            | 19 ++++++++-----------
>  4 files changed, 12 insertions(+), 22 deletions(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 1fe554eec1e25..e0b84b109b7ac 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -944,8 +944,6 @@ bool memcg_vm_event_item_valid(enum vm_event_item idx);
>  unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx);
>  unsigned long lruvec_page_state_local(struct lruvec *lruvec,
>  				      enum node_stat_item idx);
> -void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
> -				       struct mem_cgroup *parent, int idx);
>  
>  void mem_cgroup_flush_stats(struct mem_cgroup *memcg);
>  void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg);
> diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
> index 03b924920d6a5..daf9bad8c45ea 100644
> --- a/mm/memcontrol-v1.c
> +++ b/mm/memcontrol-v1.c
> @@ -1909,14 +1909,6 @@ void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *pa
>  		reparent_memcg_state_local(memcg, parent, memcg1_stats[i]);
>  }
>  
> -void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
> -{
> -	int i;
> -
> -	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++)
> -		reparent_memcg_lruvec_state_local(memcg, parent, memcg1_stats[i]);
> -}
> -
>  void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
>  {
>  	unsigned long memory, memsw;
> diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
> index 45528195d3578..5b1188f3d4173 100644
> --- a/mm/memcontrol-v1.h
> +++ b/mm/memcontrol-v1.h
> @@ -75,7 +75,6 @@ void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
>  
>  void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s);
>  void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
> -void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
>  
>  void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages);
>  static inline bool memcg1_tcpmem_active(struct mem_cgroup *memcg)
> @@ -116,6 +115,10 @@ static inline void memcg1_uncharge_batch(struct mem_cgroup *memcg,
>  
>  static inline void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) {}
>  
> +static inline void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
> +{
> +}
> +
>  static inline void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages) {}
>  static inline bool memcg1_tcpmem_active(struct mem_cgroup *memcg) { return false; }
>  static inline bool memcg1_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 7333a37830051..b7b35143d4d2d 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -225,13 +225,13 @@ static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memc
>  	return objcg;
>  }
>  
> -#ifdef CONFIG_MEMCG_V1
> +static void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
> +					      struct mem_cgroup *parent, int idx);
>  static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force);
>  
>  static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
>  {

No reparenting local stats for v2.

> -	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
> -		return;
> +	int i;
>  
>  	synchronize_rcu();
>  
> @@ -239,13 +239,10 @@ static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgr
>  
>  	/* The following counts are all non-hierarchical and need to be reparented. */
>  	reparent_memcg1_state_local(memcg, parent);
> -	reparent_memcg1_lruvec_state_local(memcg, parent);
> -}
> -#else
> -static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
> -{
> +
> +	for (i = 0; i < NR_LRU_LISTS; i++)
> +		reparent_memcg_lruvec_state_local(memcg, parent, i);
>  }
> -#endif
>  
>  static inline void reparent_locks(struct mem_cgroup *memcg, struct mem_cgroup *parent)
>  {
> @@ -510,8 +507,8 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
>  	return x;
>  }
>  
> -void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
> -				       struct mem_cgroup *parent, int idx)
> +static void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
> +					      struct mem_cgroup *parent, int idx)
>  {
>  	int i = memcg_stats_index(idx);
>  	int nid;
> -- 
> 2.20.1
>
Re: [PATCH v3 28/30 fix 1/2] mm: memcontrol: fix lruvec_stats->state_local reparenting
Posted by Qi Zheng 3 weeks ago

On 1/18/26 11:22 AM, Shakeel Butt wrote:
> On Thu, Jan 15, 2026 at 06:41:38PM +0800, Qi Zheng wrote:
>> From: Qi Zheng <zhengqi.arch@bytedance.com>
>>
>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>> ---
>>   include/linux/memcontrol.h |  2 --
>>   mm/memcontrol-v1.c         |  8 --------
>>   mm/memcontrol-v1.h         |  5 ++++-
>>   mm/memcontrol.c            | 19 ++++++++-----------
>>   4 files changed, 12 insertions(+), 22 deletions(-)
>>
>> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
>> index 1fe554eec1e25..e0b84b109b7ac 100644
>> --- a/include/linux/memcontrol.h
>> +++ b/include/linux/memcontrol.h
>> @@ -944,8 +944,6 @@ bool memcg_vm_event_item_valid(enum vm_event_item idx);
>>   unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx);
>>   unsigned long lruvec_page_state_local(struct lruvec *lruvec,
>>   				      enum node_stat_item idx);
>> -void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
>> -				       struct mem_cgroup *parent, int idx);
>>   
>>   void mem_cgroup_flush_stats(struct mem_cgroup *memcg);
>>   void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg);
>> diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
>> index 03b924920d6a5..daf9bad8c45ea 100644
>> --- a/mm/memcontrol-v1.c
>> +++ b/mm/memcontrol-v1.c
>> @@ -1909,14 +1909,6 @@ void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *pa
>>   		reparent_memcg_state_local(memcg, parent, memcg1_stats[i]);
>>   }
>>   
>> -void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
>> -{
>> -	int i;
>> -
>> -	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++)
>> -		reparent_memcg_lruvec_state_local(memcg, parent, memcg1_stats[i]);
>> -}
>> -
>>   void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
>>   {
>>   	unsigned long memory, memsw;
>> diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
>> index 45528195d3578..5b1188f3d4173 100644
>> --- a/mm/memcontrol-v1.h
>> +++ b/mm/memcontrol-v1.h
>> @@ -75,7 +75,6 @@ void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
>>   
>>   void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s);
>>   void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
>> -void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
>>   
>>   void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages);
>>   static inline bool memcg1_tcpmem_active(struct mem_cgroup *memcg)
>> @@ -116,6 +115,10 @@ static inline void memcg1_uncharge_batch(struct mem_cgroup *memcg,
>>   
>>   static inline void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) {}
>>   
>> +static inline void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
>> +{
>> +}
>> +
>>   static inline void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages) {}
>>   static inline bool memcg1_tcpmem_active(struct mem_cgroup *memcg) { return false; }
>>   static inline bool memcg1_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index 7333a37830051..b7b35143d4d2d 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -225,13 +225,13 @@ static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memc
>>   	return objcg;
>>   }
>>   
>> -#ifdef CONFIG_MEMCG_V1
>> +static void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
>> +					      struct mem_cgroup *parent, int idx);
>>   static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force);
>>   
>>   static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
>>   {
> 
> No reparenting local stats for v2.

It seems that lruvec_stats->state_local (non-hierarchical) needs to be
relocated in both v1 and v2.

> 
>> -	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
>> -		return;
>> +	int i;
>>   
>>   	synchronize_rcu();
>>   
>> @@ -239,13 +239,10 @@ static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgr
>>   
>>   	/* The following counts are all non-hierarchical and need to be reparented. */
>>   	reparent_memcg1_state_local(memcg, parent);
>> -	reparent_memcg1_lruvec_state_local(memcg, parent);
>> -}
>> -#else
>> -static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
>> -{
>> +
>> +	for (i = 0; i < NR_LRU_LISTS; i++)
>> +		reparent_memcg_lruvec_state_local(memcg, parent, i);
>>   }
>> -#endif
>>   
>>   static inline void reparent_locks(struct mem_cgroup *memcg, struct mem_cgroup *parent)
>>   {
>> @@ -510,8 +507,8 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
>>   	return x;
>>   }
>>   
>> -void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
>> -				       struct mem_cgroup *parent, int idx)
>> +static void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
>> +					      struct mem_cgroup *parent, int idx)
>>   {
>>   	int i = memcg_stats_index(idx);
>>   	int nid;
>> -- 
>> 2.20.1
>>
Re: [PATCH v3 28/30 fix 1/2] mm: memcontrol: fix lruvec_stats->state_local reparenting
Posted by Muchun Song 2 weeks, 6 days ago

> On Jan 19, 2026, at 11:36, Qi Zheng <qi.zheng@linux.dev> wrote:
> 
> 
> 
> On 1/18/26 11:22 AM, Shakeel Butt wrote:
>> On Thu, Jan 15, 2026 at 06:41:38PM +0800, Qi Zheng wrote:
>>> From: Qi Zheng <zhengqi.arch@bytedance.com>
>>> 
>>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>>> ---
>>>  include/linux/memcontrol.h |  2 --
>>>  mm/memcontrol-v1.c         |  8 --------
>>>  mm/memcontrol-v1.h         |  5 ++++-
>>>  mm/memcontrol.c            | 19 ++++++++-----------
>>>  4 files changed, 12 insertions(+), 22 deletions(-)
>>> 
>>> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
>>> index 1fe554eec1e25..e0b84b109b7ac 100644
>>> --- a/include/linux/memcontrol.h
>>> +++ b/include/linux/memcontrol.h
>>> @@ -944,8 +944,6 @@ bool memcg_vm_event_item_valid(enum vm_event_item idx);
>>>  unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx);
>>>  unsigned long lruvec_page_state_local(struct lruvec *lruvec,
>>>         enum node_stat_item idx);
>>> -void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
>>> -        struct mem_cgroup *parent, int idx);
>>>    void mem_cgroup_flush_stats(struct mem_cgroup *memcg);
>>>  void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg);
>>> diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
>>> index 03b924920d6a5..daf9bad8c45ea 100644
>>> --- a/mm/memcontrol-v1.c
>>> +++ b/mm/memcontrol-v1.c
>>> @@ -1909,14 +1909,6 @@ void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *pa
>>>   reparent_memcg_state_local(memcg, parent, memcg1_stats[i]);
>>>  }
>>>  -void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
>>> -{
>>> - int i;
>>> -
>>> - for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++)
>>> - reparent_memcg_lruvec_state_local(memcg, parent, memcg1_stats[i]);
>>> -}
>>> -
>>>  void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
>>>  {
>>>   unsigned long memory, memsw;
>>> diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
>>> index 45528195d3578..5b1188f3d4173 100644
>>> --- a/mm/memcontrol-v1.h
>>> +++ b/mm/memcontrol-v1.h
>>> @@ -75,7 +75,6 @@ void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
>>>    void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s);
>>>  void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
>>> -void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
>>>    void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages);
>>>  static inline bool memcg1_tcpmem_active(struct mem_cgroup *memcg)
>>> @@ -116,6 +115,10 @@ static inline void memcg1_uncharge_batch(struct mem_cgroup *memcg,
>>>    static inline void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) {}
>>>  +static inline void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
>>> +{
>>> +}
>>> +
>>>  static inline void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages) {}
>>>  static inline bool memcg1_tcpmem_active(struct mem_cgroup *memcg) { return false; }
>>>  static inline bool memcg1_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
>>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>>> index 7333a37830051..b7b35143d4d2d 100644
>>> --- a/mm/memcontrol.c
>>> +++ b/mm/memcontrol.c
>>> @@ -225,13 +225,13 @@ static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memc
>>>   return objcg;
>>>  }
>>>  -#ifdef CONFIG_MEMCG_V1
>>> +static void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
>>> +       struct mem_cgroup *parent, int idx);
>>>  static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force);
>>>    static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
>>>  {
>> No reparenting local stats for v2.
> 
> It seems that lruvec_stats->state_local (non-hierarchical) needs to be
> relocated in both v1 and v2.

Here we might need to elaborate a bit. Specifically, in the function
`count_shadow_nodes`, the use of `lruvec_page_state_local` to obtain
LRU and SLAB pages seems to also require these logics to work correctly.
For SLAB, it appears that the statistics here have already been
problematic for a while since SLAB pages have been reparented, right?

> 
>>> - if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
>>> - return;
>>> + int i;
>>>     synchronize_rcu();
>>>  @@ -239,13 +239,10 @@ static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgr
>>>     /* The following counts are all non-hierarchical and need to be reparented. */
>>>   reparent_memcg1_state_local(memcg, parent);
>>> - reparent_memcg1_lruvec_state_local(memcg, parent);
>>> -}
>>> -#else
>>> -static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
>>> -{
>>> +
>>> + for (i = 0; i < NR_LRU_LISTS; i++)
>>> + reparent_memcg_lruvec_state_local(memcg, parent, i);
>>>  }
>>> -#endif
>>>    static inline void reparent_locks(struct mem_cgroup *memcg, struct mem_cgroup *parent)
>>>  {
>>> @@ -510,8 +507,8 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
>>>   return x;
>>>  }
>>>  -void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
>>> -        struct mem_cgroup *parent, int idx)
>>> +static void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
>>> +       struct mem_cgroup *parent, int idx)
>>>  {
>>>   int i = memcg_stats_index(idx);
>>>   int nid;
>>> -- 
>>> 2.20.1
Re: [PATCH v3 28/30 fix 1/2] mm: memcontrol: fix lruvec_stats->state_local reparenting
Posted by Shakeel Butt 2 weeks, 5 days ago
On Tue, Jan 20, 2026 at 03:19:00PM +0800, Muchun Song wrote:
> 
> 
> >> No reparenting local stats for v2.
> > 
> > It seems that lruvec_stats->state_local (non-hierarchical) needs to be
> > relocated in both v1 and v2.
> 
> Here we might need to elaborate a bit. Specifically, in the function
> `count_shadow_nodes`, the use of `lruvec_page_state_local` to obtain
> LRU and SLAB pages seems to also require these logics to work correctly.
> For SLAB, it appears that the statistics here have already been
> problematic for a while since SLAB pages have been reparented, right?
> 

Thanks a lot, now it is clear and yes it seems like SLAB is problematic
but now I am wondering if it is really worth fixing. For LRU pages, how
about using lruvec_lru_size() defined in vmscan.c. That would at least
keep count_shadow_nodes() working irrespective of LRU reparenting.
Re: [PATCH v3 28/30 fix 1/2] mm: memcontrol: fix lruvec_stats->state_local reparenting
Posted by Qi Zheng 2 weeks, 5 days ago

On 1/21/26 2:47 AM, Shakeel Butt wrote:
> On Tue, Jan 20, 2026 at 03:19:00PM +0800, Muchun Song wrote:
>>
>>
>>>> No reparenting local stats for v2.
>>>
>>> It seems that lruvec_stats->state_local (non-hierarchical) needs to be
>>> relocated in both v1 and v2.
>>
>> Here we might need to elaborate a bit. Specifically, in the function
>> `count_shadow_nodes`, the use of `lruvec_page_state_local` to obtain
>> LRU and SLAB pages seems to also require these logics to work correctly.
>> For SLAB, it appears that the statistics here have already been
>> problematic for a while since SLAB pages have been reparented, right?
>>
> 
> Thanks a lot, now it is clear and yes it seems like SLAB is problematic
> but now I am wondering if it is really worth fixing. For LRU pages, how
> about using lruvec_lru_size() defined in vmscan.c. That would at least
> keep count_shadow_nodes() working irrespective of LRU reparenting.

Do you mean calling lruvec_lru_size() in count_shadow_nodes()? But
numa_stat interface also reads lruvec_stats->state and make it visible
to the user.

>
Re: [PATCH v3 28/30 fix 1/2] mm: memcontrol: fix lruvec_stats->state_local reparenting
Posted by Shakeel Butt 2 weeks, 5 days ago
On Wed, Jan 21, 2026 at 11:43:50AM +0800, Qi Zheng wrote:
> 
> 
> On 1/21/26 2:47 AM, Shakeel Butt wrote:
> > On Tue, Jan 20, 2026 at 03:19:00PM +0800, Muchun Song wrote:
> > > 
> > > 
> > > > > No reparenting local stats for v2.
> > > > 
> > > > It seems that lruvec_stats->state_local (non-hierarchical) needs to be
> > > > relocated in both v1 and v2.
> > > 
> > > Here we might need to elaborate a bit. Specifically, in the function
> > > `count_shadow_nodes`, the use of `lruvec_page_state_local` to obtain
> > > LRU and SLAB pages seems to also require these logics to work correctly.
> > > For SLAB, it appears that the statistics here have already been
> > > problematic for a while since SLAB pages have been reparented, right?
> > > 
> > 
> > Thanks a lot, now it is clear and yes it seems like SLAB is problematic
> > but now I am wondering if it is really worth fixing. For LRU pages, how
> > about using lruvec_lru_size() defined in vmscan.c. That would at least
> > keep count_shadow_nodes() working irrespective of LRU reparenting.
> 
> Do you mean calling lruvec_lru_size() in count_shadow_nodes()?

Yes but I am mainly brainstorming. We can keep the reparenting local
stats for both v1 and v2 for now as it is not a performance critical
path. I am more worried about the stats update path where upward
traversal of memcg for CSS_DYING can be costly and I don't want that in
v2.

> But
> numa_stat interface also reads lruvec_stats->state and make it visible
> to the user.
> 

Not sure how this is relevant.
Re: [PATCH v3 28/30 fix 1/2] mm: memcontrol: fix lruvec_stats->state_local reparenting
Posted by Qi Zheng 2 weeks, 5 days ago

On 1/21/26 4:20 PM, Shakeel Butt wrote:
> On Wed, Jan 21, 2026 at 11:43:50AM +0800, Qi Zheng wrote:
>>
>>
>> On 1/21/26 2:47 AM, Shakeel Butt wrote:
>>> On Tue, Jan 20, 2026 at 03:19:00PM +0800, Muchun Song wrote:
>>>>
>>>>
>>>>>> No reparenting local stats for v2.
>>>>>
>>>>> It seems that lruvec_stats->state_local (non-hierarchical) needs to be
>>>>> relocated in both v1 and v2.
>>>>
>>>> Here we might need to elaborate a bit. Specifically, in the function
>>>> `count_shadow_nodes`, the use of `lruvec_page_state_local` to obtain
>>>> LRU and SLAB pages seems to also require these logics to work correctly.
>>>> For SLAB, it appears that the statistics here have already been
>>>> problematic for a while since SLAB pages have been reparented, right?
>>>>
>>>
>>> Thanks a lot, now it is clear and yes it seems like SLAB is problematic
>>> but now I am wondering if it is really worth fixing. For LRU pages, how
>>> about using lruvec_lru_size() defined in vmscan.c. That would at least
>>> keep count_shadow_nodes() working irrespective of LRU reparenting.
>>
>> Do you mean calling lruvec_lru_size() in count_shadow_nodes()?
> 
> Yes but I am mainly brainstorming. We can keep the reparenting local

OK, I will take a closer look.

> stats for both v1 and v2 for now as it is not a performance critical
> path. I am more worried about the stats update path where upward
> traversal of memcg for CSS_DYING can be costly and I don't want that in
> v2.
> 
>> But
>> numa_stat interface also reads lruvec_stats->state and make it visible
>> to the user.
>>
> 
> Not sure how this is relevant.

My mistake, please ignore it.

Thanks!
Re: [PATCH v3 28/30 fix 1/2] mm: memcontrol: fix lruvec_stats->state_local reparenting
Posted by Andrew Morton 3 weeks, 3 days ago
On Thu, 15 Jan 2026 18:41:38 +0800 Qi Zheng <qi.zheng@linux.dev> wrote:

> From: Qi Zheng <zhengqi.arch@bytedance.com>
> 
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>

What was "fixed"?

I queued this and [2/2] as squashable fixes against "mm: memcontrol:
prepare for reparenting state_local".
Re: [PATCH v3 28/30 fix 1/2] mm: memcontrol: fix lruvec_stats->state_local reparenting
Posted by Qi Zheng 3 weeks, 3 days ago

On 1/16/26 1:47 AM, Andrew Morton wrote:
> On Thu, 15 Jan 2026 18:41:38 +0800 Qi Zheng <qi.zheng@linux.dev> wrote:
> 
>> From: Qi Zheng <zhengqi.arch@bytedance.com>
>>
>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> 
> What was "fixed"?

[1/2] fixed the issue reported by 
https://lore.kernel.org/all/6967cd38.050a0220.58bed.0001.GAE@google.com/.

Previuosly, state_local was protected by ss_rstat_lock. Since
[PATCH v3 28/30] added a concurrent path, [2/2] changed state_local
to the atomic_long_t type for protection.

> 
> I queued this and [2/2] as squashable fixes against "mm: memcontrol:
> prepare for reparenting state_local".

Thanks!

>
[PATCH v3 28/30 fix 2/2] mm: memcontrol: change state_locals to atomic_long_t type
Posted by Qi Zheng 3 weeks, 4 days ago
From: Qi Zheng <zhengqi.arch@bytedance.com>

Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
 mm/memcontrol.c | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b7b35143d4d2d..c303c483f55a4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -456,7 +456,7 @@ struct lruvec_stats {
 	long state[NR_MEMCG_NODE_STAT_ITEMS];
 
 	/* Non-hierarchical (CPU aggregated) state */
-	long state_local[NR_MEMCG_NODE_STAT_ITEMS];
+	atomic_long_t state_local[NR_MEMCG_NODE_STAT_ITEMS];
 
 	/* Pending child counts during tree propagation */
 	long state_pending[NR_MEMCG_NODE_STAT_ITEMS];
@@ -499,7 +499,7 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
 		return 0;
 
 	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
-	x = READ_ONCE(pn->lruvec_stats->state_local[i]);
+	x = atomic_long_read(&(pn->lruvec_stats->state_local[i]));
 #ifdef CONFIG_SMP
 	if (x < 0)
 		x = 0;
@@ -524,8 +524,7 @@ static void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
 
 		parent_pn = container_of(parent_lruvec, struct mem_cgroup_per_node, lruvec);
 
-		WRITE_ONCE(parent_pn->lruvec_stats->state_local[i],
-			   parent_pn->lruvec_stats->state_local[i] + value);
+		atomic_long_add(value, &(parent_pn->lruvec_stats->state_local[i]));
 	}
 }
 
@@ -620,8 +619,8 @@ struct memcg_vmstats {
 	unsigned long		events[NR_MEMCG_EVENTS];
 
 	/* Non-hierarchical (CPU aggregated) page state & events */
-	long			state_local[MEMCG_VMSTAT_SIZE];
-	unsigned long		events_local[NR_MEMCG_EVENTS];
+	atomic_long_t		state_local[MEMCG_VMSTAT_SIZE];
+	atomic_long_t		events_local[NR_MEMCG_EVENTS];
 
 	/* Pending child counts during tree propagation */
 	long			state_pending[MEMCG_VMSTAT_SIZE];
@@ -824,7 +823,7 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
 	if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
 		return 0;
 
-	x = READ_ONCE(memcg->vmstats->state_local[i]);
+	x = atomic_long_read(&(memcg->vmstats->state_local[i]));
 #ifdef CONFIG_SMP
 	if (x < 0)
 		x = 0;
@@ -841,7 +840,7 @@ void reparent_memcg_state_local(struct mem_cgroup *memcg,
 	if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
 		return;
 
-	WRITE_ONCE(parent->vmstats->state_local[i], parent->vmstats->state_local[i] + value);
+	atomic_long_add(value, &(parent->vmstats->state_local[i]));
 }
 #endif
 
@@ -1001,7 +1000,7 @@ unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
 	if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, event))
 		return 0;
 
-	return READ_ONCE(memcg->vmstats->events_local[i]);
+	return atomic_long_read(&(memcg->vmstats->events_local[i]));
 }
 #endif
 
@@ -4126,7 +4125,7 @@ struct aggregate_control {
 	/* pointer to the aggregated (CPU and subtree aggregated) counters */
 	long *aggregate;
 	/* pointer to the non-hierarchichal (CPU aggregated) counters */
-	long *local;
+	atomic_long_t *local;
 	/* pointer to the pending child counters during tree propagation */
 	long *pending;
 	/* pointer to the parent's pending counters, could be NULL */
@@ -4165,7 +4164,7 @@ static void mem_cgroup_stat_aggregate(struct aggregate_control *ac)
 
 		/* Aggregate counts on this level and propagate upwards */
 		if (delta_cpu)
-			ac->local[i] += delta_cpu;
+			atomic_long_add(delta_cpu, &(ac->local[i]));
 
 		if (delta) {
 			ac->aggregate[i] += delta;
-- 
2.20.1