From: Qi Zheng <zhengqi.arch@bytedance.com>
To resolve the dying memcg issue, we need to reparent LRU folios of child
memcg to its parent memcg. This could cause problems for non-hierarchical
stats.
As Yosry Ahmed pointed out:
```
In short, if memory is charged to a dying cgroup at the time of
reparenting, when the memory gets uncharged the stats updates will occur
at the parent. This will update both hierarchical and non-hierarchical
stats of the parent, which would corrupt the parent's non-hierarchical
stats (because those counters were never incremented when the memory was
charged).
```
Now we have the following two types of non-hierarchical stats, and they
are only used in CONFIG_MEMCG_V1:
a. memcg->vmstats->state_local[i]
b. pn->lruvec_stats->state_local[i]
To ensure that these non-hierarchical stats work properly, we need to
reparent these non-hierarchical stats after reparenting LRU folios. To
this end, this commit makes the following preparations:
1. implement reparent_state_local() to reparent non-hierarchical stats
2. make css_killed_work_fn() to be called in rcu work, and implement
get_non_dying_memcg_start() and get_non_dying_memcg_end() to avoid race
between mod_memcg_state()/mod_memcg_lruvec_state()
and reparent_state_local()
Co-developed-by: Yosry Ahmed <yosry@kernel.org>
Signed-off-by: Yosry Ahmed <yosry@kernel.org>
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
---
kernel/cgroup/cgroup.c | 9 ++--
mm/memcontrol-v1.c | 16 +++++++
mm/memcontrol-v1.h | 7 +++
mm/memcontrol.c | 97 ++++++++++++++++++++++++++++++++++++++++++
4 files changed, 125 insertions(+), 4 deletions(-)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index be1d71dda3179..a9007f8aa029e 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -6044,8 +6044,9 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
*/
static void css_killed_work_fn(struct work_struct *work)
{
- struct cgroup_subsys_state *css =
- container_of(work, struct cgroup_subsys_state, destroy_work);
+ struct cgroup_subsys_state *css;
+
+ css = container_of(to_rcu_work(work), struct cgroup_subsys_state, destroy_rwork);
cgroup_lock();
@@ -6066,8 +6067,8 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
container_of(ref, struct cgroup_subsys_state, refcnt);
if (atomic_dec_and_test(&css->online_cnt)) {
- INIT_WORK(&css->destroy_work, css_killed_work_fn);
- queue_work(cgroup_offline_wq, &css->destroy_work);
+ INIT_RCU_WORK(&css->destroy_rwork, css_killed_work_fn);
+ queue_rcu_work(cgroup_offline_wq, &css->destroy_rwork);
}
}
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index fe42ef664f1e1..51fb4406f45cf 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -1897,6 +1897,22 @@ static const unsigned int memcg1_events[] = {
PGMAJFAULT,
};
+void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++)
+ reparent_memcg_state_local(memcg, parent, memcg1_stats[i]);
+}
+
+void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
+{
+ int i;
+
+ for (i = 0; i < NR_LRU_LISTS; i++)
+ reparent_memcg_lruvec_state_local(memcg, parent, i);
+}
+
void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
{
unsigned long memory, memsw;
diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
index 4041b5027a94b..05e6ff40f7556 100644
--- a/mm/memcontrol-v1.h
+++ b/mm/memcontrol-v1.h
@@ -77,6 +77,13 @@ void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
unsigned long nr_memory, int nid);
void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s);
+void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
+void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
+
+void reparent_memcg_state_local(struct mem_cgroup *memcg,
+ struct mem_cgroup *parent, int idx);
+void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
+ struct mem_cgroup *parent, int idx);
void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages);
static inline bool memcg1_tcpmem_active(struct mem_cgroup *memcg)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 23b70bd80ddc9..b0519a16f5684 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -226,6 +226,34 @@ static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memc
return objcg;
}
+#ifdef CONFIG_MEMCG_V1
+static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force);
+
+static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
+{
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return;
+
+ /*
+ * Reparent stats exposed non-hierarchically. Flush @memcg's stats first
+ * to read its stats accurately, and conservatively flush @parent's
+ * stats after reparenting to avoid hiding a potentially large stat
+ * update (e.g. from callers of mem_cgroup_flush_stats_ratelimited()).
+ */
+ __mem_cgroup_flush_stats(memcg, true);
+
+ /* The following counts are all non-hierarchical and need to be reparented. */
+ reparent_memcg1_state_local(memcg, parent);
+ reparent_memcg1_lruvec_state_local(memcg, parent);
+
+ __mem_cgroup_flush_stats(parent, true);
+}
+#else
+static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
+{
+}
+#endif
+
static inline void reparent_locks(struct mem_cgroup *memcg, struct mem_cgroup *parent)
{
spin_lock_irq(&objcg_lock);
@@ -473,6 +501,30 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
return x;
}
+#ifdef CONFIG_MEMCG_V1
+static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,
+ enum node_stat_item idx, int val);
+
+void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
+ struct mem_cgroup *parent, int idx)
+{
+ int nid;
+
+ for_each_node(nid) {
+ struct lruvec *child_lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+ struct lruvec *parent_lruvec = mem_cgroup_lruvec(parent, NODE_DATA(nid));
+ unsigned long value = lruvec_page_state_local(child_lruvec, idx);
+ struct mem_cgroup_per_node *child_pn, *parent_pn;
+
+ child_pn = container_of(child_lruvec, struct mem_cgroup_per_node, lruvec);
+ parent_pn = container_of(parent_lruvec, struct mem_cgroup_per_node, lruvec);
+
+ __mod_memcg_lruvec_state(child_pn, idx, -value);
+ __mod_memcg_lruvec_state(parent_pn, idx, value);
+ }
+}
+#endif
+
/* Subset of vm_event_item to report for memcg event stats */
static const unsigned int memcg_vm_event_stat[] = {
#ifdef CONFIG_MEMCG_V1
@@ -718,6 +770,42 @@ static int memcg_state_val_in_pages(int idx, int val)
return max(val * unit / PAGE_SIZE, 1UL);
}
+#ifdef CONFIG_MEMCG_V1
+/*
+ * Used in mod_memcg_state() and mod_memcg_lruvec_state() to avoid race with
+ * reparenting of non-hierarchical state_locals.
+ */
+static inline struct mem_cgroup *get_non_dying_memcg_start(struct mem_cgroup *memcg)
+{
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return memcg;
+
+ rcu_read_lock();
+
+ while (memcg_is_dying(memcg))
+ memcg = parent_mem_cgroup(memcg);
+
+ return memcg;
+}
+
+static inline void get_non_dying_memcg_end(void)
+{
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return;
+
+ rcu_read_unlock();
+}
+#else
+static inline struct mem_cgroup *get_non_dying_memcg_start(struct mem_cgroup *memcg)
+{
+ return memcg;
+}
+
+static inline void get_non_dying_memcg_end(void)
+{
+}
+#endif
+
static void __mod_memcg_state(struct mem_cgroup *memcg,
enum memcg_stat_item idx, int val)
{
@@ -769,6 +857,15 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
#endif
return x;
}
+
+void reparent_memcg_state_local(struct mem_cgroup *memcg,
+ struct mem_cgroup *parent, int idx)
+{
+ unsigned long value = memcg_page_state_local(memcg, idx);
+
+ __mod_memcg_state(memcg, idx, -value);
+ __mod_memcg_state(parent, idx, value);
+}
#endif
static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,
--
2.20.1
On Thu, Mar 05, 2026 at 07:52:48PM +0800, Qi Zheng wrote:
> From: Qi Zheng <zhengqi.arch@bytedance.com>
>
> To resolve the dying memcg issue, we need to reparent LRU folios of child
> memcg to its parent memcg. This could cause problems for non-hierarchical
> stats.
>
> As Yosry Ahmed pointed out:
>
> ```
> In short, if memory is charged to a dying cgroup at the time of
> reparenting, when the memory gets uncharged the stats updates will occur
> at the parent. This will update both hierarchical and non-hierarchical
> stats of the parent, which would corrupt the parent's non-hierarchical
> stats (because those counters were never incremented when the memory was
> charged).
> ```
>
> Now we have the following two types of non-hierarchical stats, and they
> are only used in CONFIG_MEMCG_V1:
>
> a. memcg->vmstats->state_local[i]
> b. pn->lruvec_stats->state_local[i]
>
> To ensure that these non-hierarchical stats work properly, we need to
> reparent these non-hierarchical stats after reparenting LRU folios. To
> this end, this commit makes the following preparations:
>
> 1. implement reparent_state_local() to reparent non-hierarchical stats
> 2. make css_killed_work_fn() to be called in rcu work, and implement
> get_non_dying_memcg_start() and get_non_dying_memcg_end() to avoid race
> between mod_memcg_state()/mod_memcg_lruvec_state()
> and reparent_state_local()
>
> Co-developed-by: Yosry Ahmed <yosry@kernel.org>
> Signed-off-by: Yosry Ahmed <yosry@kernel.org>
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
> ---
> kernel/cgroup/cgroup.c | 9 ++--
> mm/memcontrol-v1.c | 16 +++++++
> mm/memcontrol-v1.h | 7 +++
> mm/memcontrol.c | 97 ++++++++++++++++++++++++++++++++++++++++++
> 4 files changed, 125 insertions(+), 4 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 23b70bd80ddc9..b0519a16f5684 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -473,6 +501,30 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
> return x;
> }
>
> +#ifdef CONFIG_MEMCG_V1
> +static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,
> + enum node_stat_item idx, int val);
> +
> +void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
> + struct mem_cgroup *parent, int idx)
> +{
> + int nid;
> +
> + for_each_node(nid) {
> + struct lruvec *child_lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
> + struct lruvec *parent_lruvec = mem_cgroup_lruvec(parent, NODE_DATA(nid));
> + unsigned long value = lruvec_page_state_local(child_lruvec, idx);
> + struct mem_cgroup_per_node *child_pn, *parent_pn;
> +
> + child_pn = container_of(child_lruvec, struct mem_cgroup_per_node, lruvec);
> + parent_pn = container_of(parent_lruvec, struct mem_cgroup_per_node, lruvec);
> +
> + __mod_memcg_lruvec_state(child_pn, idx, -value);
> + __mod_memcg_lruvec_state(parent_pn, idx, value);
We should probably change the type of `@val` from int to long to avoid
losing non-hierarchical stats during reparenting?
> #ifdef CONFIG_MEMCG_V1
> static void __mod_memcg_state(struct mem_cgroup *memcg,
> enum memcg_stat_item idx, int val)
>
> @@ -769,6 +857,15 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
> #endif
> return x;
> }
> +
> +void reparent_memcg_state_local(struct mem_cgroup *memcg,
> + struct mem_cgroup *parent, int idx)
> +{
> + unsigned long value = memcg_page_state_local(memcg, idx);
> +
> + __mod_memcg_state(memcg, idx, -value);
> + __mod_memcg_state(parent, idx, value);
> +}
Same here.
Otherwise LGTM.
> #endif
>
> static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,
--
Cheers,
Harry / Hyeonggon
Hi Harry,
On 3/23/26 3:53 PM, Harry Yoo (Oracle) wrote:
> On Thu, Mar 05, 2026 at 07:52:48PM +0800, Qi Zheng wrote:
>> From: Qi Zheng <zhengqi.arch@bytedance.com>
>>
>> To resolve the dying memcg issue, we need to reparent LRU folios of child
>> memcg to its parent memcg. This could cause problems for non-hierarchical
>> stats.
>>
>> As Yosry Ahmed pointed out:
>>
>> ```
>> In short, if memory is charged to a dying cgroup at the time of
>> reparenting, when the memory gets uncharged the stats updates will occur
>> at the parent. This will update both hierarchical and non-hierarchical
>> stats of the parent, which would corrupt the parent's non-hierarchical
>> stats (because those counters were never incremented when the memory was
>> charged).
>> ```
>>
>> Now we have the following two types of non-hierarchical stats, and they
>> are only used in CONFIG_MEMCG_V1:
>>
>> a. memcg->vmstats->state_local[i]
>> b. pn->lruvec_stats->state_local[i]
>>
>> To ensure that these non-hierarchical stats work properly, we need to
>> reparent these non-hierarchical stats after reparenting LRU folios. To
>> this end, this commit makes the following preparations:
>>
>> 1. implement reparent_state_local() to reparent non-hierarchical stats
>> 2. make css_killed_work_fn() to be called in rcu work, and implement
>> get_non_dying_memcg_start() and get_non_dying_memcg_end() to avoid race
>> between mod_memcg_state()/mod_memcg_lruvec_state()
>> and reparent_state_local()
>>
>> Co-developed-by: Yosry Ahmed <yosry@kernel.org>
>> Signed-off-by: Yosry Ahmed <yosry@kernel.org>
>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>> Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
>> ---
>> kernel/cgroup/cgroup.c | 9 ++--
>> mm/memcontrol-v1.c | 16 +++++++
>> mm/memcontrol-v1.h | 7 +++
>> mm/memcontrol.c | 97 ++++++++++++++++++++++++++++++++++++++++++
>> 4 files changed, 125 insertions(+), 4 deletions(-)
>>
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index 23b70bd80ddc9..b0519a16f5684 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -473,6 +501,30 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
>> return x;
>> }
>>
>> +#ifdef CONFIG_MEMCG_V1
>> +static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,
>> + enum node_stat_item idx, int val);
>> +
>> +void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
>> + struct mem_cgroup *parent, int idx)
>> +{
>> + int nid;
>> +
>> + for_each_node(nid) {
>> + struct lruvec *child_lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
>> + struct lruvec *parent_lruvec = mem_cgroup_lruvec(parent, NODE_DATA(nid));
>> + unsigned long value = lruvec_page_state_local(child_lruvec, idx);
>> + struct mem_cgroup_per_node *child_pn, *parent_pn;
>> +
>> + child_pn = container_of(child_lruvec, struct mem_cgroup_per_node, lruvec);
>> + parent_pn = container_of(parent_lruvec, struct mem_cgroup_per_node, lruvec);
>> +
>> + __mod_memcg_lruvec_state(child_pn, idx, -value);
>> + __mod_memcg_lruvec_state(parent_pn, idx, value);
>
> We should probably change the type of `@val` from int to val to avoid
> losing non hierarchical stats during reparenting?
The parameter and return value of memcg_state_val_in_pages() are both
of type int, so perhaps we need a cleanup patch to do this?
I will send a cleanup patchset to do this, which includes the following:
https://lore.kernel.org/all/5e178b4e-a9e0-44dc-a18d-8c014365ee2f@linux.dev/
Thanks,
Qi
>
On Mon, Mar 23, 2026 at 05:47:27PM +0800, Qi Zheng wrote:
> Hi Harry,
>
> On 3/23/26 3:53 PM, Harry Yoo (Oracle) wrote:
> > On Thu, Mar 05, 2026 at 07:52:48PM +0800, Qi Zheng wrote:
> > > From: Qi Zheng <zhengqi.arch@bytedance.com>
> > >
> > > To resolve the dying memcg issue, we need to reparent LRU folios of child
> > > memcg to its parent memcg. This could cause problems for non-hierarchical
> > > stats.
> > >
> > > As Yosry Ahmed pointed out:
> > >
> > > ```
> > > In short, if memory is charged to a dying cgroup at the time of
> > > reparenting, when the memory gets uncharged the stats updates will occur
> > > at the parent. This will update both hierarchical and non-hierarchical
> > > stats of the parent, which would corrupt the parent's non-hierarchical
> > > stats (because those counters were never incremented when the memory was
> > > charged).
> > > ```
> > >
> > > Now we have the following two types of non-hierarchical stats, and they
> > > are only used in CONFIG_MEMCG_V1:
> > >
> > > a. memcg->vmstats->state_local[i]
> > > b. pn->lruvec_stats->state_local[i]
> > >
> > > To ensure that these non-hierarchical stats work properly, we need to
> > > reparent these non-hierarchical stats after reparenting LRU folios. To
> > > this end, this commit makes the following preparations:
> > >
> > > 1. implement reparent_state_local() to reparent non-hierarchical stats
> > > 2. make css_killed_work_fn() to be called in rcu work, and implement
> > > get_non_dying_memcg_start() and get_non_dying_memcg_end() to avoid race
> > > between mod_memcg_state()/mod_memcg_lruvec_state()
> > > and reparent_state_local()
> > >
> > > Co-developed-by: Yosry Ahmed <yosry@kernel.org>
> > > Signed-off-by: Yosry Ahmed <yosry@kernel.org>
> > > Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> > > Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
> > > ---
> > > kernel/cgroup/cgroup.c | 9 ++--
> > > mm/memcontrol-v1.c | 16 +++++++
> > > mm/memcontrol-v1.h | 7 +++
> > > mm/memcontrol.c | 97 ++++++++++++++++++++++++++++++++++++++++++
> > > 4 files changed, 125 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > index 23b70bd80ddc9..b0519a16f5684 100644
> > > --- a/mm/memcontrol.c
> > > +++ b/mm/memcontrol.c
> > > @@ -473,6 +501,30 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
> > > return x;
> > > }
> > > +#ifdef CONFIG_MEMCG_V1
> > > +static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,
> > > + enum node_stat_item idx, int val);
> > > +
> > > +void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
> > > + struct mem_cgroup *parent, int idx)
> > > +{
> > > + int nid;
> > > +
> > > + for_each_node(nid) {
> > > + struct lruvec *child_lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
> > > + struct lruvec *parent_lruvec = mem_cgroup_lruvec(parent, NODE_DATA(nid));
> > > + unsigned long value = lruvec_page_state_local(child_lruvec, idx);
> > > + struct mem_cgroup_per_node *child_pn, *parent_pn;
> > > +
> > > + child_pn = container_of(child_lruvec, struct mem_cgroup_per_node, lruvec);
> > > + parent_pn = container_of(parent_lruvec, struct mem_cgroup_per_node, lruvec);
> > > +
> > > + __mod_memcg_lruvec_state(child_pn, idx, -value);
> > > + __mod_memcg_lruvec_state(parent_pn, idx, value);
> >
> > We should probably change the type of `@val` from int to val to avoid
> > losing non hierarchical stats during reparenting?
>
> The parameter and return value of memcg_state_val_in_pages() are both
> of type int, so perhaps we need a cleanup patch to do this?
Yes!
and @val in memcg_rstat_updated() too, I think.
> I will send a cleanup patchset to do this, which includes the following:
>
> https://lore.kernel.org/all/5e178b4e-a9e0-44dc-a18d-8c014365ee2f@linux.dev/
Thanks!
Should that ideally be applied before this patchset?
--
Cheers,
Harry / Hyeonggon
On 3/23/26 8:25 PM, Harry Yoo (Oracle) wrote:
> On Mon, Mar 23, 2026 at 05:47:27PM +0800, Qi Zheng wrote:
>> Hi Harry,
>>
>> On 3/23/26 3:53 PM, Harry Yoo (Oracle) wrote:
>>> On Thu, Mar 05, 2026 at 07:52:48PM +0800, Qi Zheng wrote:
>>>> From: Qi Zheng <zhengqi.arch@bytedance.com>
>>>>
>>>> To resolve the dying memcg issue, we need to reparent LRU folios of child
>>>> memcg to its parent memcg. This could cause problems for non-hierarchical
>>>> stats.
>>>>
>>>> As Yosry Ahmed pointed out:
>>>>
>>>> ```
>>>> In short, if memory is charged to a dying cgroup at the time of
>>>> reparenting, when the memory gets uncharged the stats updates will occur
>>>> at the parent. This will update both hierarchical and non-hierarchical
>>>> stats of the parent, which would corrupt the parent's non-hierarchical
>>>> stats (because those counters were never incremented when the memory was
>>>> charged).
>>>> ```
>>>>
>>>> Now we have the following two types of non-hierarchical stats, and they
>>>> are only used in CONFIG_MEMCG_V1:
>>>>
>>>> a. memcg->vmstats->state_local[i]
>>>> b. pn->lruvec_stats->state_local[i]
>>>>
>>>> To ensure that these non-hierarchical stats work properly, we need to
>>>> reparent these non-hierarchical stats after reparenting LRU folios. To
>>>> this end, this commit makes the following preparations:
>>>>
>>>> 1. implement reparent_state_local() to reparent non-hierarchical stats
>>>> 2. make css_killed_work_fn() to be called in rcu work, and implement
>>>> get_non_dying_memcg_start() and get_non_dying_memcg_end() to avoid race
>>>> between mod_memcg_state()/mod_memcg_lruvec_state()
>>>> and reparent_state_local()
>>>>
>>>> Co-developed-by: Yosry Ahmed <yosry@kernel.org>
>>>> Signed-off-by: Yosry Ahmed <yosry@kernel.org>
>>>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>>>> Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
>>>> ---
>>>> kernel/cgroup/cgroup.c | 9 ++--
>>>> mm/memcontrol-v1.c | 16 +++++++
>>>> mm/memcontrol-v1.h | 7 +++
>>>> mm/memcontrol.c | 97 ++++++++++++++++++++++++++++++++++++++++++
>>>> 4 files changed, 125 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>>>> index 23b70bd80ddc9..b0519a16f5684 100644
>>>> --- a/mm/memcontrol.c
>>>> +++ b/mm/memcontrol.c
>>>> @@ -473,6 +501,30 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
>>>> return x;
>>>> }
>>>> +#ifdef CONFIG_MEMCG_V1
>>>> +static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,
>>>> + enum node_stat_item idx, int val);
>>>> +
>>>> +void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
>>>> + struct mem_cgroup *parent, int idx)
>>>> +{
>>>> + int nid;
>>>> +
>>>> + for_each_node(nid) {
>>>> + struct lruvec *child_lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
>>>> + struct lruvec *parent_lruvec = mem_cgroup_lruvec(parent, NODE_DATA(nid));
>>>> + unsigned long value = lruvec_page_state_local(child_lruvec, idx);
>>>> + struct mem_cgroup_per_node *child_pn, *parent_pn;
>>>> +
>>>> + child_pn = container_of(child_lruvec, struct mem_cgroup_per_node, lruvec);
>>>> + parent_pn = container_of(parent_lruvec, struct mem_cgroup_per_node, lruvec);
>>>> +
>>>> + __mod_memcg_lruvec_state(child_pn, idx, -value);
>>>> + __mod_memcg_lruvec_state(parent_pn, idx, value);
>>>
>>> We should probably change the type of `@val` from int to val to avoid
>>> losing non hierarchical stats during reparenting?
>>
>> The parameter and return value of memcg_state_val_in_pages() are both
>> of type int, so perhaps we need a cleanup patch to do this?
>
> Yes!
>
> and @val in memcg_rstat_updated() too, I think.
Right.
>
>> I will send a cleanup patchset to do this, which includes the following:
>>
>> https://lore.kernel.org/all/5e178b4e-a9e0-44dc-a18d-8c014365ee2f@linux.dev/
>
> Thanks!
>
> Should that ideally be applied before this patchset?
This would conflict with the current patchset. The v6 has been in
mm-unstable for some time, so I prefer to add a cleanup patchset to
avoid interfering with the merge of this patchset.
Otherwise, sending v7 might be more appropriate.
Thanks,
Qi
>
On Tue, Mar 24, 2026 at 10:54:48AM +0800, Qi Zheng wrote:
> On 3/23/26 8:25 PM, Harry Yoo (Oracle) wrote:
> > On Mon, Mar 23, 2026 at 05:47:27PM +0800, Qi Zheng wrote:
> > > On 3/23/26 3:53 PM, Harry Yoo (Oracle) wrote:
> > > > On Thu, Mar 05, 2026 at 07:52:48PM +0800, Qi Zheng wrote:
> > > > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > > > index 23b70bd80ddc9..b0519a16f5684 100644
> > > > > --- a/mm/memcontrol.c
> > > > > +++ b/mm/memcontrol.c
> > > > > @@ -473,6 +501,30 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
> > > > > return x;
> > > > > }
> > > > > +#ifdef CONFIG_MEMCG_V1
> > > > > +static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,
> > > > > + enum node_stat_item idx, int val);
> > > > > +
> > > > > +void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
> > > > > + struct mem_cgroup *parent, int idx)
> > > > > +{
> > > > > + int nid;
> > > > > +
> > > > > + for_each_node(nid) {
> > > > > + struct lruvec *child_lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
> > > > > + struct lruvec *parent_lruvec = mem_cgroup_lruvec(parent, NODE_DATA(nid));
> > > > > + unsigned long value = lruvec_page_state_local(child_lruvec, idx);
> > > > > + struct mem_cgroup_per_node *child_pn, *parent_pn;
> > > > > +
> > > > > + child_pn = container_of(child_lruvec, struct mem_cgroup_per_node, lruvec);
> > > > > + parent_pn = container_of(parent_lruvec, struct mem_cgroup_per_node, lruvec);
> > > > > +
> > > > > + __mod_memcg_lruvec_state(child_pn, idx, -value);
> > > > > + __mod_memcg_lruvec_state(parent_pn, idx, value);
> > > >
> > > > We should probably change the type of `@val` from int to val to avoid
> > > > losing non hierarchical stats during reparenting?
> > >
> > > The parameter and return value of memcg_state_val_in_pages() are both
> > > of type int, so perhaps we need a cleanup patch to do this?
> >
> > Yes!
> >
> > and @val in memcg_rstat_updated() too, I think.
>
> Right.
>
> >
> > > I will send a cleanup patchset to do this, which includes the following:
> > >
> > > https://lore.kernel.org/all/5e178b4e-a9e0-44dc-a18d-8c014365ee2f@linux.dev/
> >
> > Thanks!
> >
> > Should that ideally be applied before this patchset?
>
> This would conflict with the current patchset.
Right.
> The v6 has been in mm-unstable for some time
Right.
> so I prefer to add a cleanup patchset to
> avoid interfering with the merge of this patchset.
but it's a little bit more than a cleanup, no?
I'd say without the followup, this patchset introduces a very subtle
(likely nobody would notice) functional regression visible on memcgs
with billions of pages.
> Otherwise, sending v7 might be more appropriate.
I think it should be sent either as v7 or as part of v7.1-rcX.
(Whichever way Andrew prefers)
--
Cheers,
Harry / Hyeonggon
On 3/24/26 12:05 PM, Harry Yoo (Oracle) wrote:
> On Tue, Mar 24, 2026 at 10:54:48AM +0800, Qi Zheng wrote:
>> On 3/23/26 8:25 PM, Harry Yoo (Oracle) wrote:
>>> On Mon, Mar 23, 2026 at 05:47:27PM +0800, Qi Zheng wrote:
>>>> On 3/23/26 3:53 PM, Harry Yoo (Oracle) wrote:
>>>>> On Thu, Mar 05, 2026 at 07:52:48PM +0800, Qi Zheng wrote:
>>>>>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>>>>>> index 23b70bd80ddc9..b0519a16f5684 100644
>>>>>> --- a/mm/memcontrol.c
>>>>>> +++ b/mm/memcontrol.c
>>>>>> @@ -473,6 +501,30 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
>>>>>> return x;
>>>>>> }
>>>>>> +#ifdef CONFIG_MEMCG_V1
>>>>>> +static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,
>>>>>> + enum node_stat_item idx, int val);
>>>>>> +
>>>>>> +void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
>>>>>> + struct mem_cgroup *parent, int idx)
>>>>>> +{
>>>>>> + int nid;
>>>>>> +
>>>>>> + for_each_node(nid) {
>>>>>> + struct lruvec *child_lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
>>>>>> + struct lruvec *parent_lruvec = mem_cgroup_lruvec(parent, NODE_DATA(nid));
>>>>>> + unsigned long value = lruvec_page_state_local(child_lruvec, idx);
>>>>>> + struct mem_cgroup_per_node *child_pn, *parent_pn;
>>>>>> +
>>>>>> + child_pn = container_of(child_lruvec, struct mem_cgroup_per_node, lruvec);
>>>>>> + parent_pn = container_of(parent_lruvec, struct mem_cgroup_per_node, lruvec);
>>>>>> +
>>>>>> + __mod_memcg_lruvec_state(child_pn, idx, -value);
>>>>>> + __mod_memcg_lruvec_state(parent_pn, idx, value);
>>>>>
>>>>> We should probably change the type of `@val` from int to val to avoid
>>>>> losing non hierarchical stats during reparenting?
>>>>
>>>> The parameter and return value of memcg_state_val_in_pages() are both
>>>> of type int, so perhaps we need a cleanup patch to do this?
>>>
>>> Yes!
>>>
>>> and @val in memcg_rstat_updated() too, I think.
>>
>> Right.
>>
>>>
>>>> I will send a cleanup patchset to do this, which includes the following:
>>>>
>>>> https://lore.kernel.org/all/5e178b4e-a9e0-44dc-a18d-8c014365ee2f@linux.dev/
>>>
>>> Thanks!
>>>
>>> Should that ideally be applied before this patchset?
>>
>> This would conflict with the current patchset.
>
> Right.
>
>> The v6 has been in mm-unstable for some time
>
> Right.
>
>> so I prefer to add a cleanup patchset to
>> avoid interfering with the merge of this patchset.
>
> but it's a little bit more than a cleanup, no?
>
> I'd say without the followup, this patchset introduces a very subtle
> (likely nobody would notice) functional regression visible on memcgs
> with billions of pages.
Right.
>
>> Otherwise, sending v7 might be more appropriate.
>
> I think it should be sent either as v7 or as part of v7.1-rcX.
> (Whichever way Andrew prefers)
OK, In any case I will first send out the cleanup/fix patchset for
everyone to review.
Thanks,
Qi
>
On Tue, Mar 24, 2026 at 12:25:16PM +0800, Qi Zheng wrote:
> On 3/24/26 12:05 PM, Harry Yoo (Oracle) wrote:
> > On Tue, Mar 24, 2026 at 10:54:48AM +0800, Qi Zheng wrote:
> > > On 3/23/26 8:25 PM, Harry Yoo (Oracle) wrote:
> > > > On Mon, Mar 23, 2026 at 05:47:27PM +0800, Qi Zheng wrote:
> > >
> > > so I prefer to add a cleanup patchset to
> > > avoid interfering with the merge of this patchset.
> >
> > but it's a little bit more than a cleanup, no?
> >
> > I'd say without the followup, this patchset introduces a very subtle
> > (likely nobody would notice) functional regression visible on memcgs
> > with billions of pages.
>
> Right.
>
> > > Otherwise, sending v7 might be more appropriate.
> >
> > I think it should be sent either as v7 or as part of v7.1-rcX.
> > (Whichever way Andrew prefers)
>
> OK, In any case I will first send out the cleanup/fix patchset for
> everyone to review.

Sounds great, thanks!

--
Cheers,
Harry / Hyeonggon
Hello Qi.
On Thu, Mar 05, 2026 at 07:52:48PM +0800, Qi Zheng <qi.zheng@linux.dev> wrote:
> To ensure that these non-hierarchical stats work properly, we need to
> reparent these non-hierarchical stats after reparenting LRU folios. To
> this end, this commit makes the following preparations:
>
> 1. implement reparent_state_local() to reparent non-hierarchical stats
> 2. make css_killed_work_fn() to be called in rcu work, and implement
> get_non_dying_memcg_start() and get_non_dying_memcg_end() to avoid race
> between mod_memcg_state()/mod_memcg_lruvec_state()
> and reparent_state_local()
css_free_rwork_fn() already has RCU deferral but we discussed some
reasons why stats reparenting cannot be done from there. IIUC something
like:
| reparent_state_local() must be already at css_killed_work_fn() because
| waiting until css_free_rwork_fn() would mean the non-hierarchical
| stats of the surrogate ancestor are outdated, e.g. underflown.
| And the waiting until css_free_rwork_fn() may still be indefinite due
| to non-folio references to the offlined memcg.
Could this be captured in the commit (if it's correct)?
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -6044,8 +6044,9 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
> */
> static void css_killed_work_fn(struct work_struct *work)
> {
> - struct cgroup_subsys_state *css =
> - container_of(work, struct cgroup_subsys_state, destroy_work);
> + struct cgroup_subsys_state *css;
> +
> + css = container_of(to_rcu_work(work), struct cgroup_subsys_state, destroy_rwork);
>
> cgroup_lock();
>
> @@ -6066,8 +6067,8 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
> container_of(ref, struct cgroup_subsys_state, refcnt);
>
> if (atomic_dec_and_test(&css->online_cnt)) {
> - INIT_WORK(&css->destroy_work, css_killed_work_fn);
> - queue_work(cgroup_offline_wq, &css->destroy_work);
> + INIT_RCU_WORK(&css->destroy_rwork, css_killed_work_fn);
> + queue_rcu_work(cgroup_offline_wq, &css->destroy_rwork);
> }
> }
>
Could this be
#ifdef CONFIG_MEMCG_V1
/* See get_non_dying_memcg_start, get_non_dying_memcg_end */
INIT_RCU_WORK(&css->destroy_rwork, css_killed_work_fn);
queue_rcu_work(cgroup_offline_wq, &css->destroy_rwork);
#else
INIT_WORK(&css->destroy_work, css_killed_work_fn);
queue_work(cgroup_offline_wq, &css->destroy_work);
#endif
?
IOW there's no need to make the cgroup release path even more
asynchronous without CONFIG_MEMCG_V1 (all this seems CONFIG_MEMCG_V1
specific).
(+not so beautiful css_killed_work_fn ifdefing but given there are the
variants of _start, _end)
> +#ifdef CONFIG_MEMCG_V1
> +/*
> + * Used in mod_memcg_state() and mod_memcg_lruvec_state() to avoid race with
> + * reparenting of non-hierarchical state_locals.
> + */
> +static inline struct mem_cgroup *get_non_dying_memcg_start(struct mem_cgroup *memcg)
Nit: I think the idiomatic names are begin + end (in the meaning of
paired parenthesis, don't look at css_task_iter_start ;-).
Hi Michal,
On 3/14/26 12:22 AM, Michal Koutný wrote:
> Hello Qi.
>
> On Thu, Mar 05, 2026 at 07:52:48PM +0800, Qi Zheng <qi.zheng@linux.dev> wrote:
>> To ensure that these non-hierarchical stats work properly, we need to
>> reparent these non-hierarchical stats after reparenting LRU folios. To
>> this end, this commit makes the following preparations:
>>
>> 1. implement reparent_state_local() to reparent non-hierarchical stats
>> 2. make css_killed_work_fn() to be called in rcu work, and implement
>> get_non_dying_memcg_start() and get_non_dying_memcg_end() to avoid race
>> between mod_memcg_state()/mod_memcg_lruvec_state()
>> and reparent_state_local()
>
>
> css_free_rwork_fn() already has RCU deferral but we discussed some
> reasons why stats reparenting cannot be done from there. IIUC something
> like:
>
> | reparent_state_local() must already be done in css_killed_work_fn() because
> | waiting until css_free_rwork_fn() would mean the non-hierarchical
> | stats of the surrogate ancestor are outdated, e.g. underflown.
> | And waiting until css_free_rwork_fn() may still be indefinite due
> | to non-folio references to the offlined memcg.
We need to ensure that when reparent_state_local() is called, no writer
modifies the non-hierarchical stats of child memcg, otherwise these
stats modifications may be missed.
In v3, we used synchronize_rcu() in reparent_state_local() to ensure
this, but this could cause blocking, so in this version we changed to
this asynchronous approach.
>
> Could this be captured in the commit (if it's correct)?
>
>
>> --- a/kernel/cgroup/cgroup.c
>> +++ b/kernel/cgroup/cgroup.c
>> @@ -6044,8 +6044,9 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
>> */
>> static void css_killed_work_fn(struct work_struct *work)
>> {
>> - struct cgroup_subsys_state *css =
>> - container_of(work, struct cgroup_subsys_state, destroy_work);
>> + struct cgroup_subsys_state *css;
>> +
>> + css = container_of(to_rcu_work(work), struct cgroup_subsys_state, destroy_rwork);
>>
>> cgroup_lock();
>>
>> @@ -6066,8 +6067,8 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
>> container_of(ref, struct cgroup_subsys_state, refcnt);
>>
>> if (atomic_dec_and_test(&css->online_cnt)) {
>> - INIT_WORK(&css->destroy_work, css_killed_work_fn);
>> - queue_work(cgroup_offline_wq, &css->destroy_work);
>> + INIT_RCU_WORK(&css->destroy_rwork, css_killed_work_fn);
>> + queue_rcu_work(cgroup_offline_wq, &css->destroy_rwork);
>> }
>> }
>>
>
> Could this be
>
> #ifdef CONFIG_MEMCG_V1
> /* See get_non_dying_memcg_start, get_non_dying_memcg_end */
> INIT_RCU_WORK(&css->destroy_rwork, css_killed_work_fn);
> queue_rcu_work(cgroup_offline_wq, &css->destroy_rwork);
> #else
> INIT_WORK(&css->destroy_work, css_killed_work_fn);
> queue_work(cgroup_offline_wq, &css->destroy_work);
> #endif
>
> ?
>
> IOW there's no need to make the cgroup release path even more
> asynchronous without CONFIG_MEMCG_V1 (all this seems CONFIG_MEMCG_V1
> specific).
Right. But I'm wondering whether the ifdefs are really necessary, since
the asynchronous approach shouldn't have any significant drawbacks?
>
> (+not so beautiful css_killed_work_fn ifdefing but given there are the
> variants of _start, _end)
>
>> +#ifdef CONFIG_MEMCG_V1
>> +/*
>> + * Used in mod_memcg_state() and mod_memcg_lruvec_state() to avoid race with
>> + * reparenting of non-hierarchical state_locals.
>> + */
>> +static inline struct mem_cgroup *get_non_dying_memcg_start(struct mem_cgroup *memcg)
>
> Nit: I think the idiomatic names are begin + end (in the meaning of
> paired parenthesis, don't look at css_task_iter_start ;-).
Both are fine for me, but changing the name requires changing
[PATCH v6 30/33] and [PATCH v6 32/33]; if we need to update to v7,
I can include them.
Thanks,
Qi
© 2016 - 2026 Red Hat, Inc.