[PATCH v6 30/33] mm: memcontrol: prepare for reparenting non-hierarchical stats

Qi Zheng posted 33 patches 3 weeks, 6 days ago
[PATCH v6 30/33] mm: memcontrol: prepare for reparenting non-hierarchical stats
Posted by Qi Zheng 3 weeks, 6 days ago
From: Qi Zheng <zhengqi.arch@bytedance.com>

To resolve the dying memcg issue, we need to reparent LRU folios of child
memcg to its parent memcg. This could cause problems for non-hierarchical
stats.

As Yosry Ahmed pointed out:

```
In short, if memory is charged to a dying cgroup at the time of
reparenting, when the memory gets uncharged the stats updates will occur
at the parent. This will update both hierarchical and non-hierarchical
stats of the parent, which would corrupt the parent's non-hierarchical
stats (because those counters were never incremented when the memory was
charged).
```

Now we have the following two types of non-hierarchical stats, and they
are only used in CONFIG_MEMCG_V1:

a. memcg->vmstats->state_local[i]
b. pn->lruvec_stats->state_local[i]

To ensure that these non-hierarchical stats work properly, we need to
reparent these non-hierarchical stats after reparenting LRU folios. To
this end, this commit makes the following preparations:

1. implement reparent_state_local() to reparent non-hierarchical stats
2. make css_killed_work_fn() to be called in rcu work, and implement
   get_non_dying_memcg_start() and get_non_dying_memcg_end() to avoid race
   between mod_memcg_state()/mod_memcg_lruvec_state()
   and reparent_state_local()

Co-developed-by: Yosry Ahmed <yosry@kernel.org>
Signed-off-by: Yosry Ahmed <yosry@kernel.org>
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
---
 kernel/cgroup/cgroup.c |  9 ++--
 mm/memcontrol-v1.c     | 16 +++++++
 mm/memcontrol-v1.h     |  7 +++
 mm/memcontrol.c        | 97 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 125 insertions(+), 4 deletions(-)

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index be1d71dda3179..a9007f8aa029e 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -6044,8 +6044,9 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
  */
 static void css_killed_work_fn(struct work_struct *work)
 {
-	struct cgroup_subsys_state *css =
-		container_of(work, struct cgroup_subsys_state, destroy_work);
+	struct cgroup_subsys_state *css;
+
+	css = container_of(to_rcu_work(work), struct cgroup_subsys_state, destroy_rwork);
 
 	cgroup_lock();
 
@@ -6066,8 +6067,8 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
 		container_of(ref, struct cgroup_subsys_state, refcnt);
 
 	if (atomic_dec_and_test(&css->online_cnt)) {
-		INIT_WORK(&css->destroy_work, css_killed_work_fn);
-		queue_work(cgroup_offline_wq, &css->destroy_work);
+		INIT_RCU_WORK(&css->destroy_rwork, css_killed_work_fn);
+		queue_rcu_work(cgroup_offline_wq, &css->destroy_rwork);
 	}
 }
 
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index fe42ef664f1e1..51fb4406f45cf 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -1897,6 +1897,22 @@ static const unsigned int memcg1_events[] = {
 	PGMAJFAULT,
 };
 
+void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++)
+		reparent_memcg_state_local(memcg, parent, memcg1_stats[i]);
+}
+
+void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
+{
+	int i;
+
+	for (i = 0; i < NR_LRU_LISTS; i++)
+		reparent_memcg_lruvec_state_local(memcg, parent, i);
+}
+
 void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
 {
 	unsigned long memory, memsw;
diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
index 4041b5027a94b..05e6ff40f7556 100644
--- a/mm/memcontrol-v1.h
+++ b/mm/memcontrol-v1.h
@@ -77,6 +77,13 @@ void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
 			   unsigned long nr_memory, int nid);
 
 void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s);
+void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
+void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
+
+void reparent_memcg_state_local(struct mem_cgroup *memcg,
+				struct mem_cgroup *parent, int idx);
+void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
+				       struct mem_cgroup *parent, int idx);
 
 void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages);
 static inline bool memcg1_tcpmem_active(struct mem_cgroup *memcg)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 23b70bd80ddc9..b0519a16f5684 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -226,6 +226,34 @@ static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memc
 	return objcg;
 }
 
+#ifdef CONFIG_MEMCG_V1
+static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force);
+
+static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
+{
+	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+		return;
+
+	/*
+	 * Reparent stats exposed non-hierarchically. Flush @memcg's stats first
+	 * to read its stats accurately , and conservatively flush @parent's
+	 * stats after reparenting to avoid hiding a potentially large stat
+	 * update (e.g. from callers of mem_cgroup_flush_stats_ratelimited()).
+	 */
+	__mem_cgroup_flush_stats(memcg, true);
+
+	/* The following counts are all non-hierarchical and need to be reparented. */
+	reparent_memcg1_state_local(memcg, parent);
+	reparent_memcg1_lruvec_state_local(memcg, parent);
+
+	__mem_cgroup_flush_stats(parent, true);
+}
+#else
+static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
+{
+}
+#endif
+
 static inline void reparent_locks(struct mem_cgroup *memcg, struct mem_cgroup *parent)
 {
 	spin_lock_irq(&objcg_lock);
@@ -473,6 +501,30 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
 	return x;
 }
 
+#ifdef CONFIG_MEMCG_V1
+static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,
+				     enum node_stat_item idx, int val);
+
+void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
+				       struct mem_cgroup *parent, int idx)
+{
+	int nid;
+
+	for_each_node(nid) {
+		struct lruvec *child_lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+		struct lruvec *parent_lruvec = mem_cgroup_lruvec(parent, NODE_DATA(nid));
+		unsigned long value = lruvec_page_state_local(child_lruvec, idx);
+		struct mem_cgroup_per_node *child_pn, *parent_pn;
+
+		child_pn = container_of(child_lruvec, struct mem_cgroup_per_node, lruvec);
+		parent_pn = container_of(parent_lruvec, struct mem_cgroup_per_node, lruvec);
+
+		__mod_memcg_lruvec_state(child_pn, idx, -value);
+		__mod_memcg_lruvec_state(parent_pn, idx, value);
+	}
+}
+#endif
+
 /* Subset of vm_event_item to report for memcg event stats */
 static const unsigned int memcg_vm_event_stat[] = {
 #ifdef CONFIG_MEMCG_V1
@@ -718,6 +770,42 @@ static int memcg_state_val_in_pages(int idx, int val)
 		return max(val * unit / PAGE_SIZE, 1UL);
 }
 
+#ifdef CONFIG_MEMCG_V1
+/*
+ * Used in mod_memcg_state() and mod_memcg_lruvec_state() to avoid race with
+ * reparenting of non-hierarchical state_locals.
+ */
+static inline struct mem_cgroup *get_non_dying_memcg_start(struct mem_cgroup *memcg)
+{
+	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+		return memcg;
+
+	rcu_read_lock();
+
+	while (memcg_is_dying(memcg))
+		memcg = parent_mem_cgroup(memcg);
+
+	return memcg;
+}
+
+static inline void get_non_dying_memcg_end(void)
+{
+	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+		return;
+
+	rcu_read_unlock();
+}
+#else
+static inline struct mem_cgroup *get_non_dying_memcg_start(struct mem_cgroup *memcg)
+{
+	return memcg;
+}
+
+static inline void get_non_dying_memcg_end(void)
+{
+}
+#endif
+
 static void __mod_memcg_state(struct mem_cgroup *memcg,
 			      enum memcg_stat_item idx, int val)
 {
@@ -769,6 +857,15 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
 #endif
 	return x;
 }
+
+void reparent_memcg_state_local(struct mem_cgroup *memcg,
+				struct mem_cgroup *parent, int idx)
+{
+	unsigned long value = memcg_page_state_local(memcg, idx);
+
+	__mod_memcg_state(memcg, idx, -value);
+	__mod_memcg_state(parent, idx, value);
+}
 #endif
 
 static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,
-- 
2.20.1
Re: [PATCH v6 30/33] mm: memcontrol: prepare for reparenting non-hierarchical stats
Posted by Harry Yoo (Oracle) 1 week, 2 days ago
On Thu, Mar 05, 2026 at 07:52:48PM +0800, Qi Zheng wrote:
> From: Qi Zheng <zhengqi.arch@bytedance.com>
> 
> To resolve the dying memcg issue, we need to reparent LRU folios of child
> memcg to its parent memcg. This could cause problems for non-hierarchical
> stats.
> 
> As Yosry Ahmed pointed out:
> 
> ```
> In short, if memory is charged to a dying cgroup at the time of
> reparenting, when the memory gets uncharged the stats updates will occur
> at the parent. This will update both hierarchical and non-hierarchical
> stats of the parent, which would corrupt the parent's non-hierarchical
> stats (because those counters were never incremented when the memory was
> charged).
> ```
> 
> Now we have the following two types of non-hierarchical stats, and they
> are only used in CONFIG_MEMCG_V1:
> 
> a. memcg->vmstats->state_local[i]
> b. pn->lruvec_stats->state_local[i]
> 
> To ensure that these non-hierarchical stats work properly, we need to
> reparent these non-hierarchical stats after reparenting LRU folios. To
> this end, this commit makes the following preparations:
> 
> 1. implement reparent_state_local() to reparent non-hierarchical stats
> 2. make css_killed_work_fn() to be called in rcu work, and implement
>    get_non_dying_memcg_start() and get_non_dying_memcg_end() to avoid race
>    between mod_memcg_state()/mod_memcg_lruvec_state()
>    and reparent_state_local()
> 
> Co-developed-by: Yosry Ahmed <yosry@kernel.org>
> Signed-off-by: Yosry Ahmed <yosry@kernel.org>
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
> ---
>  kernel/cgroup/cgroup.c |  9 ++--
>  mm/memcontrol-v1.c     | 16 +++++++
>  mm/memcontrol-v1.h     |  7 +++
>  mm/memcontrol.c        | 97 ++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 125 insertions(+), 4 deletions(-)
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 23b70bd80ddc9..b0519a16f5684 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -473,6 +501,30 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
>  	return x;
>  }
>  
> +#ifdef CONFIG_MEMCG_V1
> +static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,
> +				     enum node_stat_item idx, int val);
> +
> +void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
> +				       struct mem_cgroup *parent, int idx)
> +{
> +	int nid;
> +
> +	for_each_node(nid) {
> +		struct lruvec *child_lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
> +		struct lruvec *parent_lruvec = mem_cgroup_lruvec(parent, NODE_DATA(nid));
> +		unsigned long value = lruvec_page_state_local(child_lruvec, idx);
> +		struct mem_cgroup_per_node *child_pn, *parent_pn;
> +
> +		child_pn = container_of(child_lruvec, struct mem_cgroup_per_node, lruvec);
> +		parent_pn = container_of(parent_lruvec, struct mem_cgroup_per_node, lruvec);
> +
> +		__mod_memcg_lruvec_state(child_pn, idx, -value);
> +		__mod_memcg_lruvec_state(parent_pn, idx, value);

We should probably change the type of `@val` from int to val to avoid
losing non hierarchical stats during reparenting?

>  #ifdef CONFIG_MEMCG_V1
>  static void __mod_memcg_state(struct mem_cgroup *memcg,
>  			      enum memcg_stat_item idx, int val)
>
> @@ -769,6 +857,15 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
>  #endif
>  	return x;
>  }
> +
> +void reparent_memcg_state_local(struct mem_cgroup *memcg,
> +				struct mem_cgroup *parent, int idx)
> +{
> +	unsigned long value = memcg_page_state_local(memcg, idx);
> +
> +	__mod_memcg_state(memcg, idx, -value);
> +	__mod_memcg_state(parent, idx, value);
> +}

Same here.

Otherwise LGTM.

>  #endif
>  
>  static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,

-- 
Cheers,
Harry / Hyeonggon
Re: [PATCH v6 30/33] mm: memcontrol: prepare for reparenting non-hierarchical stats
Posted by Qi Zheng 1 week, 2 days ago
Hi Harry,

On 3/23/26 3:53 PM, Harry Yoo (Oracle) wrote:
> On Thu, Mar 05, 2026 at 07:52:48PM +0800, Qi Zheng wrote:
>> From: Qi Zheng <zhengqi.arch@bytedance.com>
>>
>> To resolve the dying memcg issue, we need to reparent LRU folios of child
>> memcg to its parent memcg. This could cause problems for non-hierarchical
>> stats.
>>
>> As Yosry Ahmed pointed out:
>>
>> ```
>> In short, if memory is charged to a dying cgroup at the time of
>> reparenting, when the memory gets uncharged the stats updates will occur
>> at the parent. This will update both hierarchical and non-hierarchical
>> stats of the parent, which would corrupt the parent's non-hierarchical
>> stats (because those counters were never incremented when the memory was
>> charged).
>> ```
>>
>> Now we have the following two types of non-hierarchical stats, and they
>> are only used in CONFIG_MEMCG_V1:
>>
>> a. memcg->vmstats->state_local[i]
>> b. pn->lruvec_stats->state_local[i]
>>
>> To ensure that these non-hierarchical stats work properly, we need to
>> reparent these non-hierarchical stats after reparenting LRU folios. To
>> this end, this commit makes the following preparations:
>>
>> 1. implement reparent_state_local() to reparent non-hierarchical stats
>> 2. make css_killed_work_fn() to be called in rcu work, and implement
>>     get_non_dying_memcg_start() and get_non_dying_memcg_end() to avoid race
>>     between mod_memcg_state()/mod_memcg_lruvec_state()
>>     and reparent_state_local()
>>
>> Co-developed-by: Yosry Ahmed <yosry@kernel.org>
>> Signed-off-by: Yosry Ahmed <yosry@kernel.org>
>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>> Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
>> ---
>>   kernel/cgroup/cgroup.c |  9 ++--
>>   mm/memcontrol-v1.c     | 16 +++++++
>>   mm/memcontrol-v1.h     |  7 +++
>>   mm/memcontrol.c        | 97 ++++++++++++++++++++++++++++++++++++++++++
>>   4 files changed, 125 insertions(+), 4 deletions(-)
>>
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index 23b70bd80ddc9..b0519a16f5684 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -473,6 +501,30 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
>>   	return x;
>>   }
>>   
>> +#ifdef CONFIG_MEMCG_V1
>> +static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,
>> +				     enum node_stat_item idx, int val);
>> +
>> +void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
>> +				       struct mem_cgroup *parent, int idx)
>> +{
>> +	int nid;
>> +
>> +	for_each_node(nid) {
>> +		struct lruvec *child_lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
>> +		struct lruvec *parent_lruvec = mem_cgroup_lruvec(parent, NODE_DATA(nid));
>> +		unsigned long value = lruvec_page_state_local(child_lruvec, idx);
>> +		struct mem_cgroup_per_node *child_pn, *parent_pn;
>> +
>> +		child_pn = container_of(child_lruvec, struct mem_cgroup_per_node, lruvec);
>> +		parent_pn = container_of(parent_lruvec, struct mem_cgroup_per_node, lruvec);
>> +
>> +		__mod_memcg_lruvec_state(child_pn, idx, -value);
>> +		__mod_memcg_lruvec_state(parent_pn, idx, value);
> 
> We should probably change the type of `@val` from int to val to avoid
> losing non hierarchical stats during reparenting?

The parameter and return value of memcg_state_val_in_pages() are both
of type int, so perhaps we need a cleanup patch to do this?

I will send a cleanup patchset to do this, which includes the following:

https://lore.kernel.org/all/5e178b4e-a9e0-44dc-a18d-8c014365ee2f@linux.dev/

Thanks,
Qi


>
Re: [PATCH v6 30/33] mm: memcontrol: prepare for reparenting non-hierarchical stats
Posted by Harry Yoo (Oracle) 1 week, 2 days ago
On Mon, Mar 23, 2026 at 05:47:27PM +0800, Qi Zheng wrote:
> Hi Harry,
> 
> On 3/23/26 3:53 PM, Harry Yoo (Oracle) wrote:
> > On Thu, Mar 05, 2026 at 07:52:48PM +0800, Qi Zheng wrote:
> > > From: Qi Zheng <zhengqi.arch@bytedance.com>
> > > 
> > > To resolve the dying memcg issue, we need to reparent LRU folios of child
> > > memcg to its parent memcg. This could cause problems for non-hierarchical
> > > stats.
> > > 
> > > As Yosry Ahmed pointed out:
> > > 
> > > ```
> > > In short, if memory is charged to a dying cgroup at the time of
> > > reparenting, when the memory gets uncharged the stats updates will occur
> > > at the parent. This will update both hierarchical and non-hierarchical
> > > stats of the parent, which would corrupt the parent's non-hierarchical
> > > stats (because those counters were never incremented when the memory was
> > > charged).
> > > ```
> > > 
> > > Now we have the following two types of non-hierarchical stats, and they
> > > are only used in CONFIG_MEMCG_V1:
> > > 
> > > a. memcg->vmstats->state_local[i]
> > > b. pn->lruvec_stats->state_local[i]
> > > 
> > > To ensure that these non-hierarchical stats work properly, we need to
> > > reparent these non-hierarchical stats after reparenting LRU folios. To
> > > this end, this commit makes the following preparations:
> > > 
> > > 1. implement reparent_state_local() to reparent non-hierarchical stats
> > > 2. make css_killed_work_fn() to be called in rcu work, and implement
> > >     get_non_dying_memcg_start() and get_non_dying_memcg_end() to avoid race
> > >     between mod_memcg_state()/mod_memcg_lruvec_state()
> > >     and reparent_state_local()
> > > 
> > > Co-developed-by: Yosry Ahmed <yosry@kernel.org>
> > > Signed-off-by: Yosry Ahmed <yosry@kernel.org>
> > > Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> > > Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
> > > ---
> > >   kernel/cgroup/cgroup.c |  9 ++--
> > >   mm/memcontrol-v1.c     | 16 +++++++
> > >   mm/memcontrol-v1.h     |  7 +++
> > >   mm/memcontrol.c        | 97 ++++++++++++++++++++++++++++++++++++++++++
> > >   4 files changed, 125 insertions(+), 4 deletions(-)
> > > 
> > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > index 23b70bd80ddc9..b0519a16f5684 100644
> > > --- a/mm/memcontrol.c
> > > +++ b/mm/memcontrol.c
> > > @@ -473,6 +501,30 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
> > >   	return x;
> > >   }
> > > +#ifdef CONFIG_MEMCG_V1
> > > +static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,
> > > +				     enum node_stat_item idx, int val);
> > > +
> > > +void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
> > > +				       struct mem_cgroup *parent, int idx)
> > > +{
> > > +	int nid;
> > > +
> > > +	for_each_node(nid) {
> > > +		struct lruvec *child_lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
> > > +		struct lruvec *parent_lruvec = mem_cgroup_lruvec(parent, NODE_DATA(nid));
> > > +		unsigned long value = lruvec_page_state_local(child_lruvec, idx);
> > > +		struct mem_cgroup_per_node *child_pn, *parent_pn;
> > > +
> > > +		child_pn = container_of(child_lruvec, struct mem_cgroup_per_node, lruvec);
> > > +		parent_pn = container_of(parent_lruvec, struct mem_cgroup_per_node, lruvec);
> > > +
> > > +		__mod_memcg_lruvec_state(child_pn, idx, -value);
> > > +		__mod_memcg_lruvec_state(parent_pn, idx, value);
> > 
> > We should probably change the type of `@val` from int to val to avoid
> > losing non hierarchical stats during reparenting?
> 
> The parameter and return value of memcg_state_val_in_pages() are both
> of type int, so perhaps we need a cleanup patch to do this?

Yes!

and @val in memcg_rstat_updated() too, I think.

> I will send a cleanup patchset to do this, which includes the following:
> 
> https://lore.kernel.org/all/5e178b4e-a9e0-44dc-a18d-8c014365ee2f@linux.dev/

Thanks!

Should that ideally be applied before this patchset?

-- 
Cheers,
Harry / Hyeonggon
Re: [PATCH v6 30/33] mm: memcontrol: prepare for reparenting non-hierarchical stats
Posted by Qi Zheng 1 week, 1 day ago

On 3/23/26 8:25 PM, Harry Yoo (Oracle) wrote:
> On Mon, Mar 23, 2026 at 05:47:27PM +0800, Qi Zheng wrote:
>> Hi Harry,
>>
>> On 3/23/26 3:53 PM, Harry Yoo (Oracle) wrote:
>>> On Thu, Mar 05, 2026 at 07:52:48PM +0800, Qi Zheng wrote:
>>>> From: Qi Zheng <zhengqi.arch@bytedance.com>
>>>>
>>>> To resolve the dying memcg issue, we need to reparent LRU folios of child
>>>> memcg to its parent memcg. This could cause problems for non-hierarchical
>>>> stats.
>>>>
>>>> As Yosry Ahmed pointed out:
>>>>
>>>> ```
>>>> In short, if memory is charged to a dying cgroup at the time of
>>>> reparenting, when the memory gets uncharged the stats updates will occur
>>>> at the parent. This will update both hierarchical and non-hierarchical
>>>> stats of the parent, which would corrupt the parent's non-hierarchical
>>>> stats (because those counters were never incremented when the memory was
>>>> charged).
>>>> ```
>>>>
>>>> Now we have the following two types of non-hierarchical stats, and they
>>>> are only used in CONFIG_MEMCG_V1:
>>>>
>>>> a. memcg->vmstats->state_local[i]
>>>> b. pn->lruvec_stats->state_local[i]
>>>>
>>>> To ensure that these non-hierarchical stats work properly, we need to
>>>> reparent these non-hierarchical stats after reparenting LRU folios. To
>>>> this end, this commit makes the following preparations:
>>>>
>>>> 1. implement reparent_state_local() to reparent non-hierarchical stats
>>>> 2. make css_killed_work_fn() to be called in rcu work, and implement
>>>>      get_non_dying_memcg_start() and get_non_dying_memcg_end() to avoid race
>>>>      between mod_memcg_state()/mod_memcg_lruvec_state()
>>>>      and reparent_state_local()
>>>>
>>>> Co-developed-by: Yosry Ahmed <yosry@kernel.org>
>>>> Signed-off-by: Yosry Ahmed <yosry@kernel.org>
>>>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>>>> Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
>>>> ---
>>>>    kernel/cgroup/cgroup.c |  9 ++--
>>>>    mm/memcontrol-v1.c     | 16 +++++++
>>>>    mm/memcontrol-v1.h     |  7 +++
>>>>    mm/memcontrol.c        | 97 ++++++++++++++++++++++++++++++++++++++++++
>>>>    4 files changed, 125 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>>>> index 23b70bd80ddc9..b0519a16f5684 100644
>>>> --- a/mm/memcontrol.c
>>>> +++ b/mm/memcontrol.c
>>>> @@ -473,6 +501,30 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
>>>>    	return x;
>>>>    }
>>>> +#ifdef CONFIG_MEMCG_V1
>>>> +static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,
>>>> +				     enum node_stat_item idx, int val);
>>>> +
>>>> +void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
>>>> +				       struct mem_cgroup *parent, int idx)
>>>> +{
>>>> +	int nid;
>>>> +
>>>> +	for_each_node(nid) {
>>>> +		struct lruvec *child_lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
>>>> +		struct lruvec *parent_lruvec = mem_cgroup_lruvec(parent, NODE_DATA(nid));
>>>> +		unsigned long value = lruvec_page_state_local(child_lruvec, idx);
>>>> +		struct mem_cgroup_per_node *child_pn, *parent_pn;
>>>> +
>>>> +		child_pn = container_of(child_lruvec, struct mem_cgroup_per_node, lruvec);
>>>> +		parent_pn = container_of(parent_lruvec, struct mem_cgroup_per_node, lruvec);
>>>> +
>>>> +		__mod_memcg_lruvec_state(child_pn, idx, -value);
>>>> +		__mod_memcg_lruvec_state(parent_pn, idx, value);
>>>
>>> We should probably change the type of `@val` from int to val to avoid
>>> losing non hierarchical stats during reparenting?
>>
>> The parameter and return value of memcg_state_val_in_pages() are both
>> of type int, so perhaps we need a cleanup patch to do this?
> 
> Yes!
> 
> and @val in memcg_rstat_updated() too, I think.

Right.

> 
>> I will send a cleanup patchset to do this, which includes the following:
>>
>> https://lore.kernel.org/all/5e178b4e-a9e0-44dc-a18d-8c014365ee2f@linux.dev/
> 
> Thanks!
> 
> Should that ideally be applied before this patchset?

This would conflict with the current patchset. The v6 has been in
mm-unstable for some time, so I prefer to add a cleanup patchset to
avoid interfering with the merge of this patchset.

Otherwise, sending v7 might be more appropriate.

Thanks,
Qi

>
Re: [PATCH v6 30/33] mm: memcontrol: prepare for reparenting non-hierarchical stats
Posted by Harry Yoo (Oracle) 1 week, 1 day ago
On Tue, Mar 24, 2026 at 10:54:48AM +0800, Qi Zheng wrote:
> On 3/23/26 8:25 PM, Harry Yoo (Oracle) wrote:
> > On Mon, Mar 23, 2026 at 05:47:27PM +0800, Qi Zheng wrote:
> > > On 3/23/26 3:53 PM, Harry Yoo (Oracle) wrote:
> > > > On Thu, Mar 05, 2026 at 07:52:48PM +0800, Qi Zheng wrote:
> > > > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > > > index 23b70bd80ddc9..b0519a16f5684 100644
> > > > > --- a/mm/memcontrol.c
> > > > > +++ b/mm/memcontrol.c
> > > > > @@ -473,6 +501,30 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
> > > > >    	return x;
> > > > >    }
> > > > > +#ifdef CONFIG_MEMCG_V1
> > > > > +static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,
> > > > > +				     enum node_stat_item idx, int val);
> > > > > +
> > > > > +void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
> > > > > +				       struct mem_cgroup *parent, int idx)
> > > > > +{
> > > > > +	int nid;
> > > > > +
> > > > > +	for_each_node(nid) {
> > > > > +		struct lruvec *child_lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
> > > > > +		struct lruvec *parent_lruvec = mem_cgroup_lruvec(parent, NODE_DATA(nid));
> > > > > +		unsigned long value = lruvec_page_state_local(child_lruvec, idx);
> > > > > +		struct mem_cgroup_per_node *child_pn, *parent_pn;
> > > > > +
> > > > > +		child_pn = container_of(child_lruvec, struct mem_cgroup_per_node, lruvec);
> > > > > +		parent_pn = container_of(parent_lruvec, struct mem_cgroup_per_node, lruvec);
> > > > > +
> > > > > +		__mod_memcg_lruvec_state(child_pn, idx, -value);
> > > > > +		__mod_memcg_lruvec_state(parent_pn, idx, value);
> > > > 
> > > > We should probably change the type of `@val` from int to val to avoid
> > > > losing non hierarchical stats during reparenting?
> > > 
> > > The parameter and return value of memcg_state_val_in_pages() are both
> > > of type int, so perhaps we need a cleanup patch to do this?
> > 
> > Yes!
> > 
> > and @val in memcg_rstat_updated() too, I think.
> 
> Right.
> 
> > 
> > > I will send a cleanup patchset to do this, which includes the following:
> > > 
> > > https://lore.kernel.org/all/5e178b4e-a9e0-44dc-a18d-8c014365ee2f@linux.dev/
> > 
> > Thanks!
> > 
> > Should that ideally be applied before this patchset?
> 
> This would conflict with the current patchset.

Right.

> The v6 has been in mm-unstable for some time

Right.

> so I prefer to add a cleanup patchset to
> avoid interfering with the merge of this patchset.

but it's a little bit more than a cleanup, no?

I'd say without the followup, this patchset introduces a very subtle
(likely nobody would notice) functional regression visible on memcgs
with billions of pages.

> Otherwise, sending v7 might be more appropriate.
 
I think it should be sent either as v7 or as part of v7.1-rcX.
(Whichever way Andrew prefers)

-- 
Cheers,
Harry / Hyeonggon
Re: [PATCH v6 30/33] mm: memcontrol: prepare for reparenting non-hierarchical stats
Posted by Qi Zheng 1 week, 1 day ago

On 3/24/26 12:05 PM, Harry Yoo (Oracle) wrote:
> On Tue, Mar 24, 2026 at 10:54:48AM +0800, Qi Zheng wrote:
>> On 3/23/26 8:25 PM, Harry Yoo (Oracle) wrote:
>>> On Mon, Mar 23, 2026 at 05:47:27PM +0800, Qi Zheng wrote:
>>>> On 3/23/26 3:53 PM, Harry Yoo (Oracle) wrote:
>>>>> On Thu, Mar 05, 2026 at 07:52:48PM +0800, Qi Zheng wrote:
>>>>>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>>>>>> index 23b70bd80ddc9..b0519a16f5684 100644
>>>>>> --- a/mm/memcontrol.c
>>>>>> +++ b/mm/memcontrol.c
>>>>>> @@ -473,6 +501,30 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
>>>>>>     	return x;
>>>>>>     }
>>>>>> +#ifdef CONFIG_MEMCG_V1
>>>>>> +static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,
>>>>>> +				     enum node_stat_item idx, int val);
>>>>>> +
>>>>>> +void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
>>>>>> +				       struct mem_cgroup *parent, int idx)
>>>>>> +{
>>>>>> +	int nid;
>>>>>> +
>>>>>> +	for_each_node(nid) {
>>>>>> +		struct lruvec *child_lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
>>>>>> +		struct lruvec *parent_lruvec = mem_cgroup_lruvec(parent, NODE_DATA(nid));
>>>>>> +		unsigned long value = lruvec_page_state_local(child_lruvec, idx);
>>>>>> +		struct mem_cgroup_per_node *child_pn, *parent_pn;
>>>>>> +
>>>>>> +		child_pn = container_of(child_lruvec, struct mem_cgroup_per_node, lruvec);
>>>>>> +		parent_pn = container_of(parent_lruvec, struct mem_cgroup_per_node, lruvec);
>>>>>> +
>>>>>> +		__mod_memcg_lruvec_state(child_pn, idx, -value);
>>>>>> +		__mod_memcg_lruvec_state(parent_pn, idx, value);
>>>>>
>>>>> We should probably change the type of `@val` from int to val to avoid
>>>>> losing non hierarchical stats during reparenting?
>>>>
>>>> The parameter and return value of memcg_state_val_in_pages() are both
>>>> of type int, so perhaps we need a cleanup patch to do this?
>>>
>>> Yes!
>>>
>>> and @val in memcg_rstat_updated() too, I think.
>>
>> Right.
>>
>>>
>>>> I will send a cleanup patchset to do this, which includes the following:
>>>>
>>>> https://lore.kernel.org/all/5e178b4e-a9e0-44dc-a18d-8c014365ee2f@linux.dev/
>>>
>>> Thanks!
>>>
>>> Should that ideally be applied before this patchset?
>>
>> This would conflict with the current patchset.
> 
> Right.
> 
>> The v6 has been in mm-unstable for some time
> 
> Right.
> 
>> so I prefer to add a cleanup patchset to
>> avoid interfering with the merge of this patchset.
> 
> but it's a little bit more than a cleanup, no?
> 
> I'd say without the followup, this patchset introduces a very subtle
> (likely nobody would notice) functional regression visible on memcgs
> with billions of pages.

Right.

> 
>> Otherwise, sending v7 might be more appropriate.
>   
> I think it should be sent either as v7 or as part of v7.1-rcX.
> (Whichever way Andrew prefers)

OK, In any case I will first send out the cleanup/fix patchset for
everyone to review.

Thanks,
Qi

>
Re: [PATCH v6 30/33] mm: memcontrol: prepare for reparenting non-hierarchical stats
Posted by Harry Yoo (Oracle) 1 week, 1 day ago
On Tue, Mar 24, 2026 at 12:25:16PM +0800, Qi Zheng wrote:
> On 3/24/26 12:05 PM, Harry Yoo (Oracle) wrote:
> > On Tue, Mar 24, 2026 at 10:54:48AM +0800, Qi Zheng wrote:
> > > On 3/23/26 8:25 PM, Harry Yoo (Oracle) wrote:
> > > > On Mon, Mar 23, 2026 at 05:47:27PM +0800, Qi Zheng wrote:
> > >
> > > so I prefer to add a cleanup patchset to
> > > avoid interfering with the merge of this patchset.
> > 
> > but it's a little bit more than a cleanup, no?
> > 
> > I'd say without the followup, this patchset introduces a very subtle
> > (likely nobody would notice) functional regression visible on memcgs
> > with billions of pages.
> 
> Right.
> 
> > > Otherwise, sending v7 might be more appropriate.
> > I think it should be sent either as v7 or as part of v7.1-rcX.
> > (Whichever way Andrew prefers)
> 
> OK, In any case I will first send out the cleanup/fix patchset for
> everyone to review.

Sounds great, thanks!

-- 
Cheers,
Harry / Hyeonggon
Re: [PATCH v6 30/33] mm: memcontrol: prepare for reparenting non-hierarchical stats
Posted by Michal Koutný 2 weeks, 5 days ago
Hello Qi.

On Thu, Mar 05, 2026 at 07:52:48PM +0800, Qi Zheng <qi.zheng@linux.dev> wrote:
> To ensure that these non-hierarchical stats work properly, we need to
> reparent these non-hierarchical stats after reparenting LRU folios. To
> this end, this commit makes the following preparations:
> 
> 1. implement reparent_state_local() to reparent non-hierarchical stats
> 2. make css_killed_work_fn() to be called in rcu work, and implement
>    get_non_dying_memcg_start() and get_non_dying_memcg_end() to avoid race
>    between mod_memcg_state()/mod_memcg_lruvec_state()
>    and reparent_state_local()


css_free_rwork_fn has() already RCU deferal but we discussed some
reasons why stats reparenting cannot be done from there. IIUC something
like:

| reparent_state_local() must be already at css_killed_work_fn() because
| waiting until css_free_rwork_fn() would mean the non-hierarchical
| stats of the surrogate ancestor are outdated, e.g. underflown.
| And the waiting until css_free_rwork_fn() may still be indefinite due
| to non-folio references to the offlined memcg.

Could this be captured in the commit (if it's correct)?


> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -6044,8 +6044,9 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
>   */
>  static void css_killed_work_fn(struct work_struct *work)
>  {
> -	struct cgroup_subsys_state *css =
> -		container_of(work, struct cgroup_subsys_state, destroy_work);
> +	struct cgroup_subsys_state *css;
> +
> +	css = container_of(to_rcu_work(work), struct cgroup_subsys_state, destroy_rwork);
>  
>  	cgroup_lock();
>  
> @@ -6066,8 +6067,8 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
>  		container_of(ref, struct cgroup_subsys_state, refcnt);
>  
>  	if (atomic_dec_and_test(&css->online_cnt)) {
> -		INIT_WORK(&css->destroy_work, css_killed_work_fn);
> -		queue_work(cgroup_offline_wq, &css->destroy_work);
> +		INIT_RCU_WORK(&css->destroy_rwork, css_killed_work_fn);
> +		queue_rcu_work(cgroup_offline_wq, &css->destroy_rwork);
>  	}
>  }
>

Could this be

#ifdef CONFIG_MEMCG_V1
		/* See get_non_dying_memcg_start, get_non_dying_memcg_end
		INIT_RCU_WORK(&css->destroy_rwork, css_killed_work_fn);
		queue_rcu_work(cgroup_offline_wq, &css->destroy_rwork);
#else
		INIT_WORK(&css->destroy_work, css_killed_work_fn);
		queue_work(cgroup_offline_wq, &css->destroy_work);
#endif

?

IOW there's no need to make the cgroup release path even more
asynchronous without CONFIG_MEMCG_V1 (all this seems CONFIG_MEMCG_V1
specific).

(+not so beautiful css_killed_work_fn ifdefing but given there are the
variants of _start, _end)

> +#ifdef CONFIG_MEMCG_V1
> +/*
> + * Used in mod_memcg_state() and mod_memcg_lruvec_state() to avoid race with
> + * reparenting of non-hierarchical state_locals.
> + */
> +static inline struct mem_cgroup *get_non_dying_memcg_start(struct mem_cgroup *memcg)

Nit: I think the idiomatic names are begin + end (in the meaning of
paired parenthesis, don't look at css_task_iter_start ;-).
Re: [PATCH v6 30/33] mm: memcontrol: prepare for reparenting non-hierarchical stats
Posted by Qi Zheng 2 weeks, 2 days ago
Hi Michal,

On 3/14/26 12:22 AM, Michal Koutný wrote:
> Hello Qi.
> 
> On Thu, Mar 05, 2026 at 07:52:48PM +0800, Qi Zheng <qi.zheng@linux.dev> wrote:
>> To ensure that these non-hierarchical stats work properly, we need to
>> reparent these non-hierarchical stats after reparenting LRU folios. To
>> this end, this commit makes the following preparations:
>>
>> 1. implement reparent_state_local() to reparent non-hierarchical stats
>> 2. make css_killed_work_fn() to be called in rcu work, and implement
>>     get_non_dying_memcg_start() and get_non_dying_memcg_end() to avoid race
>>     between mod_memcg_state()/mod_memcg_lruvec_state()
>>     and reparent_state_local()
> 
> 
> css_free_rwork_fn has() already RCU deferal but we discussed some
> reasons why stats reparenting cannot be done from there. IIUC something
> like:
> 
> | reparent_state_local() must be already at css_killed_work_fn() because
> | waiting until css_free_rwork_fn() would mean the non-hierarchical
> | stats of the surrogate ancestor are outdated, e.g. underflown.
> | And the waiting until css_free_rwork_fn() may still be indefinite due
> | to non-folio references to the offlined memcg.

We need to ensure that when reparent_state_local() is called, no writer
modifies the non-hierarchical stats of child memcg, otherwise these
stats modifications may be missed.

In the v3, we used synchronize_rcu() in reparent_state_local() to ensure
this, but this could cause blocking, so in this version we changed to
this asynchronous approach.

> 
> Could this be captured in the commit (if it's correct)?
> 
> 
>> --- a/kernel/cgroup/cgroup.c
>> +++ b/kernel/cgroup/cgroup.c
>> @@ -6044,8 +6044,9 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
>>    */
>>   static void css_killed_work_fn(struct work_struct *work)
>>   {
>> -	struct cgroup_subsys_state *css =
>> -		container_of(work, struct cgroup_subsys_state, destroy_work);
>> +	struct cgroup_subsys_state *css;
>> +
>> +	css = container_of(to_rcu_work(work), struct cgroup_subsys_state, destroy_rwork);
>>   
>>   	cgroup_lock();
>>   
>> @@ -6066,8 +6067,8 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
>>   		container_of(ref, struct cgroup_subsys_state, refcnt);
>>   
>>   	if (atomic_dec_and_test(&css->online_cnt)) {
>> -		INIT_WORK(&css->destroy_work, css_killed_work_fn);
>> -		queue_work(cgroup_offline_wq, &css->destroy_work);
>> +		INIT_RCU_WORK(&css->destroy_rwork, css_killed_work_fn);
>> +		queue_rcu_work(cgroup_offline_wq, &css->destroy_rwork);
>>   	}
>>   }
>>
> 
> Could this be
> 
> #ifdef CONFIG_MEMCG_V1
> 		/* See get_non_dying_memcg_start, get_non_dying_memcg_end
> 		INIT_RCU_WORK(&css->destroy_rwork, css_killed_work_fn);
> 		queue_rcu_work(cgroup_offline_wq, &css->destroy_rwork);
> #else
> 		INIT_WORK(&css->destroy_work, css_killed_work_fn);
> 		queue_work(cgroup_offline_wq, &css->destroy_work);
> #endif
> 
> ?
> 
> IOW there's no need to make the cgroup release path even more
> asynchronous without CONFIG_MEMCG_V1 (all this seems CONFIG_MEMCG_V1
> specific).

Right. But I'm wondering if it's necessary to use ifdefing, since
asynchronous methods shouldn't cause any significant drawbacks?

> 
> (+not so beautiful css_killed_work_fn ifdefing but given there are the
> variants of _start, _end)
> 
>> +#ifdef CONFIG_MEMCG_V1
>> +/*
>> + * Used in mod_memcg_state() and mod_memcg_lruvec_state() to avoid race with
>> + * reparenting of non-hierarchical state_locals.
>> + */
>> +static inline struct mem_cgroup *get_non_dying_memcg_start(struct mem_cgroup *memcg)
> 
> Nit: I think the idiomatic names are begin + end (in the meaning of
> paired parenthesis, don't look at css_task_iter_start ;-).

Both are fine for me, but changing the name requires changing
[PACTH v6 30/33] and [PACTH v6 32/33], if we need to update to v7,
I can include them.

Thanks,
Qi