From: Qi Zheng <zhengqi.arch@bytedance.com>
To resolve the dying memcg issue, we need to reparent the LRU folios of a child
memcg to its parent memcg. The following counts are all non-hierarchical
and need to be reparented to keep the counts of the parent memcg accurate.
1. memcg->vmstats->state_local[i]
2. pn->lruvec_stats->state_local[i]
This commit implements the specific function, which will be used during
the reparenting process.
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
include/linux/memcontrol.h | 4 +++
mm/memcontrol-v1.c | 16 +++++++++++
mm/memcontrol-v1.h | 3 ++
mm/memcontrol.c | 56 ++++++++++++++++++++++++++++++++++++++
4 files changed, 79 insertions(+)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 26c3c0e375f58..f84a23f13ffb4 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -963,12 +963,16 @@ static inline void mod_memcg_page_state(struct page *page,
unsigned long memcg_events(struct mem_cgroup *memcg, int event);
unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx);
+void reparent_memcg_state_local(struct mem_cgroup *memcg,
+ struct mem_cgroup *parent, int idx);
unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item);
bool memcg_stat_item_valid(int idx);
bool memcg_vm_event_item_valid(enum vm_event_item idx);
unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx);
unsigned long lruvec_page_state_local(struct lruvec *lruvec,
enum node_stat_item idx);
+void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
+ struct mem_cgroup *parent, int idx);
void mem_cgroup_flush_stats(struct mem_cgroup *memcg);
void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg);
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index f0ef650d2317b..800606135e7ba 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -1898,6 +1898,22 @@ static const unsigned int memcg1_events[] = {
PGMAJFAULT,
};
+void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++)
+ reparent_memcg_state_local(memcg, parent, memcg1_stats[i]);
+}
+
+void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++)
+ reparent_memcg_lruvec_state_local(memcg, parent, memcg1_stats[i]);
+}
+
void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
{
unsigned long memory, memsw;
diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
index eb3c3c1056574..45528195d3578 100644
--- a/mm/memcontrol-v1.h
+++ b/mm/memcontrol-v1.h
@@ -41,6 +41,7 @@ static inline bool do_memsw_account(void)
unsigned long memcg_events_local(struct mem_cgroup *memcg, int event);
unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx);
+void mod_memcg_page_state_local(struct mem_cgroup *memcg, int idx, unsigned long val);
unsigned long memcg_page_state_local_output(struct mem_cgroup *memcg, int item);
bool memcg1_alloc_events(struct mem_cgroup *memcg);
void memcg1_free_events(struct mem_cgroup *memcg);
@@ -73,6 +74,8 @@ void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
unsigned long nr_memory, int nid);
void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s);
+void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
+void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages);
static inline bool memcg1_tcpmem_active(struct mem_cgroup *memcg)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 70583394f421f..7aa32b97c9f17 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -225,6 +225,28 @@ static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memc
return objcg;
}
+#ifdef CONFIG_MEMCG_V1
+static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force);
+
+static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
+{
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return;
+
+ synchronize_rcu();
+
+ __mem_cgroup_flush_stats(memcg, true);
+
+ /* The following counts are all non-hierarchical and need to be reparented. */
+ reparent_memcg1_state_local(memcg, parent);
+ reparent_memcg1_lruvec_state_local(memcg, parent);
+}
+#else
+static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
+{
+}
+#endif
+
static inline void reparent_locks(struct mem_cgroup *memcg, struct mem_cgroup *parent)
{
spin_lock_irq(&objcg_lock);
@@ -458,6 +480,28 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
return x;
}
+void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
+ struct mem_cgroup *parent, int idx)
+{
+ int i = memcg_stats_index(idx);
+ int nid;
+
+ if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
+ return;
+
+ for_each_node(nid) {
+ struct lruvec *child_lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+ struct lruvec *parent_lruvec = mem_cgroup_lruvec(parent, NODE_DATA(nid));
+ struct mem_cgroup_per_node *parent_pn;
+ unsigned long value = lruvec_page_state_local(child_lruvec, idx);
+
+ parent_pn = container_of(parent_lruvec, struct mem_cgroup_per_node, lruvec);
+
+ WRITE_ONCE(parent_pn->lruvec_stats->state_local[i],
+ parent_pn->lruvec_stats->state_local[i] + value);
+ }
+}
+
/* Subset of vm_event_item to report for memcg event stats */
static const unsigned int memcg_vm_event_stat[] = {
#ifdef CONFIG_MEMCG_V1
@@ -757,6 +801,18 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
#endif
return x;
}
+
+void reparent_memcg_state_local(struct mem_cgroup *memcg,
+ struct mem_cgroup *parent, int idx)
+{
+ int i = memcg_stats_index(idx);
+ unsigned long value = memcg_page_state_local(memcg, idx);
+
+ if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
+ return;
+
+ WRITE_ONCE(parent->vmstats->state_local[i], parent->vmstats->state_local[i] + value);
+}
#endif
static void mod_memcg_lruvec_state(struct lruvec *lruvec,
--
2.20.1
On Wed, Jan 14, 2026 at 07:32:55PM +0800, Qi Zheng wrote:
> From: Qi Zheng <zhengqi.arch@bytedance.com>
>
> To resolve the dying memcg issue, we need to reparent LRU folios of child
> memcg to its parent memcg. The following counts are all non-hierarchical
> and need to be reparented to prevent the counts of parent memcg overflow.
>
> 1. memcg->vmstats->state_local[i]
> 2. pn->lruvec_stats->state_local[i]
>
> This commit implements the specific function, which will be used during
> the reparenting process.
Please add more explanation which was discussed in the email chain at
https://lore.kernel.org/all/5dsb6q2r4xsi24kk5gcnckljuvgvvp6nwifwvc4wuho5hsifeg@5ukg2dq6ini5/
Also move the upward traversal code in mod_memcg_state() and
mod_memcg_lruvec_state() you added in later patch to this patch and make
it under CONFIG_MEMCG_V1.
Something like:
#ifdef CONFIG_MEMCG_V1
while (memcg_is_dying(memcg))
memcg = parent_mem_cgroup(memcg);
#endif
>
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> ---
> include/linux/memcontrol.h | 4 +++
> mm/memcontrol-v1.c | 16 +++++++++++
> mm/memcontrol-v1.h | 3 ++
> mm/memcontrol.c | 56 ++++++++++++++++++++++++++++++++++++++
> 4 files changed, 79 insertions(+)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 26c3c0e375f58..f84a23f13ffb4 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -963,12 +963,16 @@ static inline void mod_memcg_page_state(struct page *page,
>
> unsigned long memcg_events(struct mem_cgroup *memcg, int event);
> unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx);
> +void reparent_memcg_state_local(struct mem_cgroup *memcg,
> + struct mem_cgroup *parent, int idx);
> unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item);
> bool memcg_stat_item_valid(int idx);
> bool memcg_vm_event_item_valid(enum vm_event_item idx);
> unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx);
> unsigned long lruvec_page_state_local(struct lruvec *lruvec,
> enum node_stat_item idx);
> +void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
> + struct mem_cgroup *parent, int idx);
>
> void mem_cgroup_flush_stats(struct mem_cgroup *memcg);
> void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg);
> diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
> index f0ef650d2317b..800606135e7ba 100644
> --- a/mm/memcontrol-v1.c
> +++ b/mm/memcontrol-v1.c
> @@ -1898,6 +1898,22 @@ static const unsigned int memcg1_events[] = {
> PGMAJFAULT,
> };
>
> +void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
> +{
> + int i;
> +
> + for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++)
> + reparent_memcg_state_local(memcg, parent, memcg1_stats[i]);
> +}
> +
> +void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
> +{
> + int i;
> +
> + for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++)
> + reparent_memcg_lruvec_state_local(memcg, parent, memcg1_stats[i]);
> +}
> +
> void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
> {
> unsigned long memory, memsw;
> diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
> index eb3c3c1056574..45528195d3578 100644
> --- a/mm/memcontrol-v1.h
> +++ b/mm/memcontrol-v1.h
> @@ -41,6 +41,7 @@ static inline bool do_memsw_account(void)
>
> unsigned long memcg_events_local(struct mem_cgroup *memcg, int event);
> unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx);
> +void mod_memcg_page_state_local(struct mem_cgroup *memcg, int idx, unsigned long val);
> unsigned long memcg_page_state_local_output(struct mem_cgroup *memcg, int item);
> bool memcg1_alloc_events(struct mem_cgroup *memcg);
> void memcg1_free_events(struct mem_cgroup *memcg);
> @@ -73,6 +74,8 @@ void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
> unsigned long nr_memory, int nid);
>
> void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s);
> +void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
> +void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
>
> void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages);
> static inline bool memcg1_tcpmem_active(struct mem_cgroup *memcg)
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 70583394f421f..7aa32b97c9f17 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -225,6 +225,28 @@ static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memc
> return objcg;
> }
>
> +#ifdef CONFIG_MEMCG_V1
> +static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force);
> +
> +static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
> +{
> + if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
> + return;
> +
> + synchronize_rcu();
Hmm synchronize_rcu() is a heavy hammer here. Also you would need an RCU
read lock in mod_memcg_state() & mod_memcg_lruvec_state() for this
synchronize_rcu() to be meaningful.
Hmm instead of synchronize_rcu() here, we can use queue_rcu_work() in
css_killed_ref_fn(). It would be as simple as the following:
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index e717208cfb18..549a8e026194 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -6046,8 +6046,8 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
*/
static void css_killed_work_fn(struct work_struct *work)
{
- struct cgroup_subsys_state *css =
- container_of(work, struct cgroup_subsys_state, destroy_work);
+ struct cgroup_subsys_state *css = container_of(to_rcu_work(work),
+ struct cgroup_subsys_state, destroy_rwork);
cgroup_lock();
@@ -6068,8 +6068,8 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
container_of(ref, struct cgroup_subsys_state, refcnt);
if (atomic_dec_and_test(&css->online_cnt)) {
- INIT_WORK(&css->destroy_work, css_killed_work_fn);
- queue_work(cgroup_offline_wq, &css->destroy_work);
+ INIT_RCU_WORK(&css->destroy_rwork, css_killed_work_fn);
+ queue_rcu_work(cgroup_offline_wq, &css->destroy_rwork);
}
}
> +
> + __mem_cgroup_flush_stats(memcg, true);
> +
> + /* The following counts are all non-hierarchical and need to be reparented. */
> + reparent_memcg1_state_local(memcg, parent);
> + reparent_memcg1_lruvec_state_local(memcg, parent);
> +}
> +#else
> +static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
> +{
> +}
> +#endif
> +
On 1/18/26 11:20 AM, Shakeel Butt wrote:
> On Wed, Jan 14, 2026 at 07:32:55PM +0800, Qi Zheng wrote:
>> From: Qi Zheng <zhengqi.arch@bytedance.com>
>>
>> To resolve the dying memcg issue, we need to reparent LRU folios of child
>> memcg to its parent memcg. The following counts are all non-hierarchical
>> and need to be reparented to prevent the counts of parent memcg overflow.
>>
>> 1. memcg->vmstats->state_local[i]
>> 2. pn->lruvec_stats->state_local[i]
>>
>> This commit implements the specific function, which will be used during
>> the reparenting process.
>
> Please add more explanation which was discussed in the email chain at
> https://lore.kernel.org/all/5dsb6q2r4xsi24kk5gcnckljuvgvvp6nwifwvc4wuho5hsifeg@5ukg2dq6ini5/
OK, will do.
>
> Also move the upward traversal code in mod_memcg_state() and
> mod_memcg_lruvec_state() you added in later patch to this patch and make
> it under CONFIG_MEMCG_V1.
>
> Something like:
>
> #ifdef CONFIG_MEMCG_V1
> while (memcg_is_dying(memcg))
> memcg = parent_mem_cgroup(memcg);
> #endif
OK, will do.
>
>
>>
>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>> ---
>> include/linux/memcontrol.h | 4 +++
>> mm/memcontrol-v1.c | 16 +++++++++++
>> mm/memcontrol-v1.h | 3 ++
>> mm/memcontrol.c | 56 ++++++++++++++++++++++++++++++++++++++
>> 4 files changed, 79 insertions(+)
>>
>> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
>> index 26c3c0e375f58..f84a23f13ffb4 100644
>> --- a/include/linux/memcontrol.h
>> +++ b/include/linux/memcontrol.h
>> @@ -963,12 +963,16 @@ static inline void mod_memcg_page_state(struct page *page,
>>
>> unsigned long memcg_events(struct mem_cgroup *memcg, int event);
>> unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx);
>> +void reparent_memcg_state_local(struct mem_cgroup *memcg,
>> + struct mem_cgroup *parent, int idx);
>> unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item);
>> bool memcg_stat_item_valid(int idx);
>> bool memcg_vm_event_item_valid(enum vm_event_item idx);
>> unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx);
>> unsigned long lruvec_page_state_local(struct lruvec *lruvec,
>> enum node_stat_item idx);
>> +void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
>> + struct mem_cgroup *parent, int idx);
>>
>> void mem_cgroup_flush_stats(struct mem_cgroup *memcg);
>> void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg);
>> diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
>> index f0ef650d2317b..800606135e7ba 100644
>> --- a/mm/memcontrol-v1.c
>> +++ b/mm/memcontrol-v1.c
>> @@ -1898,6 +1898,22 @@ static const unsigned int memcg1_events[] = {
>> PGMAJFAULT,
>> };
>>
>> +void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
>> +{
>> + int i;
>> +
>> + for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++)
>> + reparent_memcg_state_local(memcg, parent, memcg1_stats[i]);
>> +}
>> +
>> +void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
>> +{
>> + int i;
>> +
>> + for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++)
>> + reparent_memcg_lruvec_state_local(memcg, parent, memcg1_stats[i]);
>> +}
>> +
>> void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
>> {
>> unsigned long memory, memsw;
>> diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
>> index eb3c3c1056574..45528195d3578 100644
>> --- a/mm/memcontrol-v1.h
>> +++ b/mm/memcontrol-v1.h
>> @@ -41,6 +41,7 @@ static inline bool do_memsw_account(void)
>>
>> unsigned long memcg_events_local(struct mem_cgroup *memcg, int event);
>> unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx);
>> +void mod_memcg_page_state_local(struct mem_cgroup *memcg, int idx, unsigned long val);
>> unsigned long memcg_page_state_local_output(struct mem_cgroup *memcg, int item);
>> bool memcg1_alloc_events(struct mem_cgroup *memcg);
>> void memcg1_free_events(struct mem_cgroup *memcg);
>> @@ -73,6 +74,8 @@ void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
>> unsigned long nr_memory, int nid);
>>
>> void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s);
>> +void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
>> +void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
>>
>> void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages);
>> static inline bool memcg1_tcpmem_active(struct mem_cgroup *memcg)
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index 70583394f421f..7aa32b97c9f17 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -225,6 +225,28 @@ static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memc
>> return objcg;
>> }
>>
>> +#ifdef CONFIG_MEMCG_V1
>> +static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force);
>> +
>> +static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
>> +{
>> + if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
>> + return;
>> +
>> + synchronize_rcu();
>
> Hmm synchrinuze_rcu() is a heavy hammer here. Also you would need rcu
> read lock in mod_memcg_state() & mod_memcg_lruvec_state() for this
> synchronize_rcu().
Since these two functions require memcg or lruvec, they are already
within the critical section of the RCU lock.
>
> Hmm instead of synchronize_rcu() here, we can use queue_rcu_work() in
> css_killed_ref_fn(). It would be as simple as the following:
It does look much simpler, will do.
Thanks,
Qi
>
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index e717208cfb18..549a8e026194 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -6046,8 +6046,8 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
> */
> static void css_killed_work_fn(struct work_struct *work)
> {
> - struct cgroup_subsys_state *css =
> - container_of(work, struct cgroup_subsys_state, destroy_work);
> + struct cgroup_subsys_state *css = container_of(to_rcu_work(work),
> + struct cgroup_subsys_state, destroy_rwork);
>
> cgroup_lock();
>
> @@ -6068,8 +6068,8 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
> container_of(ref, struct cgroup_subsys_state, refcnt);
>
> if (atomic_dec_and_test(&css->online_cnt)) {
> - INIT_WORK(&css->destroy_work, css_killed_work_fn);
> - queue_work(cgroup_offline_wq, &css->destroy_work);
> + INIT_RCU_WORK(&css->destroy_rwork, css_killed_work_fn);
> + queue_rcu_work(cgroup_offline_wq, &css->destroy_rwork);
> }
> }
>
>> +
>> + __mem_cgroup_flush_stats(memcg, true);
>> +
>> + /* The following counts are all non-hierarchical and need to be reparented. */
>> + reparent_memcg1_state_local(memcg, parent);
>> + reparent_memcg1_lruvec_state_local(memcg, parent);
>> +}
>> +#else
>> +static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
>> +{
>> +}
>> +#endif
>> +
On Mon, Jan 19, 2026 at 11:34:53AM +0800, Qi Zheng wrote:
>
>
> On 1/18/26 11:20 AM, Shakeel Butt wrote:
> > On Wed, Jan 14, 2026 at 07:32:55PM +0800, Qi Zheng wrote:
> > > From: Qi Zheng <zhengqi.arch@bytedance.com>
> > >
> > > To resolve the dying memcg issue, we need to reparent LRU folios of child
> > > memcg to its parent memcg. The following counts are all non-hierarchical
> > > and need to be reparented to prevent the counts of parent memcg overflow.
> > >
> > > 1. memcg->vmstats->state_local[i]
> > > 2. pn->lruvec_stats->state_local[i]
> > >
> > > This commit implements the specific function, which will be used during
> > > the reparenting process.
> >
> > Please add more explanation which was discussed in the email chain at
> > https://lore.kernel.org/all/5dsb6q2r4xsi24kk5gcnckljuvgvvp6nwifwvc4wuho5hsifeg@5ukg2dq6ini5/
>
> OK, will do.
>
> > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > index 70583394f421f..7aa32b97c9f17 100644
> > > --- a/mm/memcontrol.c
> > > +++ b/mm/memcontrol.c
> > > @@ -225,6 +225,28 @@ static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memc
> > > return objcg;
> > > }
> > > +#ifdef CONFIG_MEMCG_V1
> > > +static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force);
> > > +
> > > +static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
> > > +{
> > > + if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
> > > + return;
> > > +
> > > + synchronize_rcu();
> >
> > Hmm synchrinuze_rcu() is a heavy hammer here. Also you would need rcu
> > read lock in mod_memcg_state() & mod_memcg_lruvec_state() for this
> > synchronize_rcu().
>
> Since these two functions require memcg or lruvec, they are already
> within the critical section of the RCU lock.
What happens if someone grabs a refcount and then releases the RCU read
lock before the percpu refkill, and then calls mod_memcg[_lruvec]_state()?
In this case, can we end up reparenting in the middle of a non-hierarchical
stat update, because the update is not covered by an RCU grace period?
Something like
T1 T2
- rcu_read_lock()
- get memcg refcnt
- rcu_read_unlock()
- call mod_memcg_state()
- CSS_IS_DYING is not set
- Set CSS_IS_DYING
- Trigger percpu refkill
- Trigger offline_css()
-> reparent non-hierarchical - update non-hierarchical stats
stats
- put memcg refcount
> > Hmm instead of synchronize_rcu() here, we can use queue_rcu_work() in
> > css_killed_ref_fn(). It would be as simple as the following:
>
> It does look much simpler, will do.
>
> Thanks,
> Qi
--
Cheers,
Harry / Hyeonggon
On 1/29/26 10:10 AM, Harry Yoo wrote:
> On Mon, Jan 19, 2026 at 11:34:53AM +0800, Qi Zheng wrote:
>>
>>
>> On 1/18/26 11:20 AM, Shakeel Butt wrote:
>>> On Wed, Jan 14, 2026 at 07:32:55PM +0800, Qi Zheng wrote:
>>>> From: Qi Zheng <zhengqi.arch@bytedance.com>
>>>>
>>>> To resolve the dying memcg issue, we need to reparent LRU folios of child
>>>> memcg to its parent memcg. The following counts are all non-hierarchical
>>>> and need to be reparented to prevent the counts of parent memcg overflow.
>>>>
>>>> 1. memcg->vmstats->state_local[i]
>>>> 2. pn->lruvec_stats->state_local[i]
>>>>
>>>> This commit implements the specific function, which will be used during
>>>> the reparenting process.
>>>
>>> Please add more explanation which was discussed in the email chain at
>>> https://lore.kernel.org/all/5dsb6q2r4xsi24kk5gcnckljuvgvvp6nwifwvc4wuho5hsifeg@5ukg2dq6ini5/
>>
>> OK, will do.
>>
>>>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>>>> index 70583394f421f..7aa32b97c9f17 100644
>>>> --- a/mm/memcontrol.c
>>>> +++ b/mm/memcontrol.c
>>>> @@ -225,6 +225,28 @@ static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memc
>>>> return objcg;
>>>> }
>>>> +#ifdef CONFIG_MEMCG_V1
>>>> +static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force);
>>>> +
>>>> +static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
>>>> +{
>>>> + if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
>>>> + return;
>>>> +
>>>> + synchronize_rcu();
>>>
>>> Hmm synchrinuze_rcu() is a heavy hammer here. Also you would need rcu
>>> read lock in mod_memcg_state() & mod_memcg_lruvec_state() for this
>>> synchronize_rcu().
>>
>> Since these two functions require memcg or lruvec, they are already
>> within the critical section of the RCU lock.
>
> What happens if someone grabbed a refcount and then release the rcu read
> lock before percpu refkill and then call mod_memcg[_lruvec]_state()?
>
> In this case, can we end up reparenting in the middle of non-hierarchical
> stat update because they don't have RCU grace period?
>
> Something like
>
> T1 T2
>
> - rcu_read_lock()
> - get memcg refcnt
> - rcu_read_unlock()
>
> - call mod_memcg_state()
> - CSS_IS_DYING is not set
> - Set CSS_IS_DYING
> - Trigger percpu refkill
>
> - Trigger offline_css()
> -> reparent non-hierarchical - update non-hierarchical stats
> stats
> - put memcg refcount
Good catch, I think you are right.
The rcu lock should be added to mod_memcg_state() and
mod_memcg_lruvec_state().
I will update to v4 as soon as possible.
Thanks,
Qi
>
>>> Hmm instead of synchronize_rcu() here, we can use queue_rcu_work() in
>>> css_killed_ref_fn(). It would be as simple as the following:
>>
>> It does look much simpler, will do.
>>
>> Thanks,
>> Qi
>
On Thu, Jan 29, 2026 at 04:50:39PM +0800, Qi Zheng wrote:
>
>
> On 1/29/26 10:10 AM, Harry Yoo wrote:
> > On Mon, Jan 19, 2026 at 11:34:53AM +0800, Qi Zheng wrote:
> > >
> > >
> > > On 1/18/26 11:20 AM, Shakeel Butt wrote:
> > > > On Wed, Jan 14, 2026 at 07:32:55PM +0800, Qi Zheng wrote:
> > > > > From: Qi Zheng <zhengqi.arch@bytedance.com>
> > > > >
> > > > > To resolve the dying memcg issue, we need to reparent LRU folios of child
> > > > > memcg to its parent memcg. The following counts are all non-hierarchical
> > > > > and need to be reparented to prevent the counts of parent memcg overflow.
> > > > >
> > > > > 1. memcg->vmstats->state_local[i]
> > > > > 2. pn->lruvec_stats->state_local[i]
> > > > >
> > > > > This commit implements the specific function, which will be used during
> > > > > the reparenting process.
> > > >
> > > > Please add more explanation which was discussed in the email chain at
> > > > https://lore.kernel.org/all/5dsb6q2r4xsi24kk5gcnckljuvgvvp6nwifwvc4wuho5hsifeg@5ukg2dq6ini5/
> > >
> > > OK, will do.
> > >
> > > > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > > > index 70583394f421f..7aa32b97c9f17 100644
> > > > > --- a/mm/memcontrol.c
> > > > > +++ b/mm/memcontrol.c
> > > > > @@ -225,6 +225,28 @@ static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memc
> > > > > return objcg;
> > > > > }
> > > > > +#ifdef CONFIG_MEMCG_V1
> > > > > +static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force);
> > > > > +
> > > > > +static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
> > > > > +{
> > > > > + if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
> > > > > + return;
> > > > > +
> > > > > + synchronize_rcu();
> > > >
> > > > Hmm synchrinuze_rcu() is a heavy hammer here. Also you would need rcu
> > > > read lock in mod_memcg_state() & mod_memcg_lruvec_state() for this
> > > > synchronize_rcu().
> > >
> > > Since these two functions require memcg or lruvec, they are already
> > > within the critical section of the RCU lock.
> >
> > What happens if someone grabbed a refcount and then release the rcu read
> > lock before percpu refkill and then call mod_memcg[_lruvec]_state()?
> >
> > In this case, can we end up reparenting in the middle of non-hierarchical
> > stat update because they don't have RCU grace period?
> >
> > Something like
> >
> > T1 T2
> >
> > - rcu_read_lock()
> > - get memcg refcnt
> > - rcu_read_unlock()
> >
> > - call mod_memcg_state()
> > - CSS_IS_DYING is not set
> > - Set CSS_IS_DYING
> > - Trigger percpu refkill
> >
> > - Trigger offline_css()
> > -> reparent non-hierarchical - update non-hierarchical stats
> > stats
> > - put memcg refcount
>
> Good catch, I think you are right.
>
> The rcu lock should be added to mod_memcg_state() and
> mod_memcg_lruvec_state().
Thanks for confirming!
Because it's quite confusing, let me ask a few more questions...
Q1. Yosry mentioned [1] [2] that stat updates should be done in the same
RCU section that is used to grab a refcount of the cgroup.
But I don't think your work is relying on that. Instead, I guess, it's
relying on the CSS_DYING check from reader side to determine whether it
should update stats of the child or parent memcg, right?
-> That being said, when rcu_read_lock() is called _after_ stats are
reparented, the reader must observe that the CSS_DYING flag is set.
[1] https://lore.kernel.org/all/utl6esq7jz5e4f7kwgrpwdjc2rm3yi33ljb6xkm5nxzufa4o7s@hblq2piu3vnz
[2] https://lore.kernel.org/all/ebdhvcwygvnfejai5azhg3sjudsjorwmlcvmzadpkhexoeq3tb@5gj5y2exdhpn
Q2. When a reader checks CSS_DYING flag, how is the flag change
guaranteed to be visible to the reader without any lock, memory barrier,
or atomic ops involved?
As Shakeel mentioned elsewhere, I hope some explanations for correctness
to be included in the commit message :)
> I will update to v4 as soon as possible.
Thanks a lot!
I'll wait for that and will review carefully to make sure it's correct ;)
> Thanks,
> Qi
>
> > > > Hmm instead of synchronize_rcu() here, we can use queue_rcu_work() in
> > > > css_killed_ref_fn(). It would be as simple as the following:
> > >
> > > It does look much simpler, will do.
> > >
> > > Thanks,
> > > Qi
--
Cheers,
Harry / Hyeonggon
On 1/29/26 8:23 PM, Harry Yoo wrote:
> On Thu, Jan 29, 2026 at 04:50:39PM +0800, Qi Zheng wrote:
>>
>>
>> On 1/29/26 10:10 AM, Harry Yoo wrote:
>>> On Mon, Jan 19, 2026 at 11:34:53AM +0800, Qi Zheng wrote:
>>>>
>>>>
>>>> On 1/18/26 11:20 AM, Shakeel Butt wrote:
>>>>> On Wed, Jan 14, 2026 at 07:32:55PM +0800, Qi Zheng wrote:
>>>>>> From: Qi Zheng <zhengqi.arch@bytedance.com>
>>>>>>
>>>>>> To resolve the dying memcg issue, we need to reparent LRU folios of child
>>>>>> memcg to its parent memcg. The following counts are all non-hierarchical
>>>>>> and need to be reparented to prevent the counts of parent memcg overflow.
>>>>>>
>>>>>> 1. memcg->vmstats->state_local[i]
>>>>>> 2. pn->lruvec_stats->state_local[i]
>>>>>>
>>>>>> This commit implements the specific function, which will be used during
>>>>>> the reparenting process.
>>>>>
>>>>> Please add more explanation which was discussed in the email chain at
>>>>> https://lore.kernel.org/all/5dsb6q2r4xsi24kk5gcnckljuvgvvp6nwifwvc4wuho5hsifeg@5ukg2dq6ini5/
>>>>
>>>> OK, will do.
>>>>
>>>>>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>>>>>> index 70583394f421f..7aa32b97c9f17 100644
>>>>>> --- a/mm/memcontrol.c
>>>>>> +++ b/mm/memcontrol.c
>>>>>> @@ -225,6 +225,28 @@ static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memc
>>>>>> return objcg;
>>>>>> }
>>>>>> +#ifdef CONFIG_MEMCG_V1
>>>>>> +static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force);
>>>>>> +
>>>>>> +static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
>>>>>> +{
>>>>>> + if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
>>>>>> + return;
>>>>>> +
>>>>>> + synchronize_rcu();
>>>>>
>>>>> Hmm synchrinuze_rcu() is a heavy hammer here. Also you would need rcu
>>>>> read lock in mod_memcg_state() & mod_memcg_lruvec_state() for this
>>>>> synchronize_rcu().
>>>>
>>>> Since these two functions require memcg or lruvec, they are already
>>>> within the critical section of the RCU lock.
>>>
>>> What happens if someone grabbed a refcount and then release the rcu read
>>> lock before percpu refkill and then call mod_memcg[_lruvec]_state()?
>>>
>>> In this case, can we end up reparenting in the middle of non-hierarchical
>>> stat update because they don't have RCU grace period?
>>>
>>> Something like
>>>
>>> T1 T2
>>>
>>> - rcu_read_lock()
>>> - get memcg refcnt
>>> - rcu_read_unlock()
>>>
>>> - call mod_memcg_state()
>>> - CSS_IS_DYING is not set
>>> - Set CSS_IS_DYING
>>> - Trigger percpu refkill
>>>
>>> - Trigger offline_css()
>>> -> reparent non-hierarchical - update non-hierarchical stats
>>> stats
>>> - put memcg refcount
>>
>> Good catch, I think you are right.
>>
>> The rcu lock should be added to mod_memcg_state() and
>> mod_memcg_lruvec_state().
>
> Thanks for confirming!
>
> Because it's quite confusing, let me ask few more questions...
>
> Q1. Yosry mentioned [1] [2] that stat updates should be done in the same
> RCU section that is used to grab a refcount of the cgroup.
>
> But I don't think your work is relying on that. Instead, I guess, it's
> relying on the CSS_DYING check from reader side to determine whether it
Only relying on the CSS_DYING check is insufficient. Otherwise, the
following race may occur:
T1 T2
memcg_is_dying is false
Set CSS_IS_DYING
reparent non-hierarchical update non-hierarchical stats for child
So IIUC we should add rcu lock, then:
T1 T2
rcu_read_lock
memcg_is_dying is false
Set CSS_IS_DYING
update non-hierarchical stats for child
rcu_read_unlock
synchronize_rcu or rcu work
--> reparent non-hierarchical
> should update stats of the child or parent memcg, right?
>
> -> That being said, when rcu_read_lock() is called _after_ stats are
> reparented, the reader must observe that the CSS_DYING flag is set.
>
> [1] https://lore.kernel.org/all/utl6esq7jz5e4f7kwgrpwdjc2rm3yi33ljb6xkm5nxzufa4o7s@hblq2piu3vnz
> [2] https://lore.kernel.org/all/ebdhvcwygvnfejai5azhg3sjudsjorwmlcvmzadpkhexoeq3tb@5gj5y2exdhpn
>
> Q2. When a reader checks CSS_DYING flag, how is the flag change
> guaranteed to be visible to the reader without any lock, memory barrier,
> or atomic ops involved?
The main situation requiring the CSS_DYING check is as follows:
T1 T2
Set CSS_IS_DYING
synchronize_rcu or rcu work
--> reparent non-hierarchical
rcu_read_lock()
memcg_is_dying is true
update non-hierarchical stats for parent
Referring to the "Memory-Barrier Guarantees" section in [1],
synchronize_rcu() can guarantee that T2 can see CSS_IS_DYING. Right?
[1].
https://www.kernel.org/doc/Documentation/RCU/Design/Requirements/Requirements.rst
Thanks,
Qi
>
> As Shakeel mentioned elsewhere, I hope some explanations for correctness
> to be included in the commit message :)
>
>> I will update to v4 as soon as possible.
>
> Thanks a lot!
>
> I'll wait for that and will review carefully to make sure it's correct ;)
>
>> Thanks,
>> Qi
>>
>>>>> Hmm instead of synchronize_rcu() here, we can use queue_rcu_work() in
>>>>> css_killed_ref_fn(). It would be as simple as the following:
>>>>
>>>> It does look much simpler, will do.
>>>>
>>>> Thanks,
>>>> Qi
>
On Fri, Jan 30, 2026 at 03:22:20PM +0800, Qi Zheng wrote: > On 1/29/26 8:23 PM, Harry Yoo wrote: > > On Thu, Jan 29, 2026 at 04:50:39PM +0800, Qi Zheng wrote: > > > On 1/29/26 10:10 AM, Harry Yoo wrote: > > > > On Mon, Jan 19, 2026 at 11:34:53AM +0800, Qi Zheng wrote: > > > > > On 1/18/26 11:20 AM, Shakeel Butt wrote: > > > > > > On Wed, Jan 14, 2026 at 07:32:55PM +0800, Qi Zheng wrote: > > > > > Since these two functions require memcg or lruvec, they are already > > > > > within the critical section of the RCU lock. > > > > > > > > What happens if someone grabbed a refcount and then release the rcu read > > > > lock before percpu refkill and then call mod_memcg[_lruvec]_state()? > > > > > > > > In this case, can we end up reparenting in the middle of non-hierarchical > > > > stat update because they don't have RCU grace period? > > > > > > > > Something like > > > > > > > > T1 T2 > > > > > > > > - rcu_read_lock() > > > > - get memcg refcnt > > > > - rcu_read_unlock() > > > > > > > > - call mod_memcg_state() > > > > - CSS_IS_DYING is not set > > > > - Set CSS_IS_DYING > > > > - Trigger percpu refkill > > > > > > > > - Trigger offline_css() > > > > -> reparent non-hierarchical - update non-hierarchical stats > > > > stats > > > > - put memcg refcount > > > > > > Good catch, I think you are right. > > > > > > The rcu lock should be added to mod_memcg_state() and > > > mod_memcg_lruvec_state(). > > > > Thanks for confirming! > > > > Because it's quite confusing, let me ask few more questions... > > > > Q1. Yosry mentioned [1] [2] that stat updates should be done in the same > > RCU section that is used to grab a refcount of the cgroup. > > > > But I don't think your work is relying on that. Instead, I guess, it's > > relying on the CSS_DYING check from reader side to determine whether it > > Only relying the CSS_DYING check is insufficient. 
Otherwise, the > following race may occur: > > T1 T2 > > memcg_is_dying is false > Set CSS_IS_DYING > reparent non-hierarchical update non-hierarchical stats for child > > So IIUC we should add rcu lock, then: Right, RCU here is used to enforce ordering between reparenting non-hierarchical stats and stat updates for child. > T1 T2 > > rcu_read_lock > memcg_is_dying is false > > Set CSS_IS_DYING > update non-hierarchical stats for child > rcu_read_unlock > > synchronize_rcu or rcu work > --> reparent non-hierarchical > > > should update stats of the child or parent memcg, right? > > > > -> That being said, when rcu_read_lock() is called _after_ stats are > > reparented, the reader must observe that the CSS_DYING flag is set. > > > > [1] https://lore.kernel.org/all/utl6esq7jz5e4f7kwgrpwdjc2rm3yi33ljb6xkm5nxzufa4o7s@hblq2piu3vnz > > [2] https://lore.kernel.org/all/ebdhvcwygvnfejai5azhg3sjudsjorwmlcvmzadpkhexoeq3tb@5gj5y2exdhpn > > > > Q2. When a reader checks CSS_DYING flag, how is the flag change > > guaranteed to be visible to the reader without any lock, memory barrier, > > or atomic ops involved? > > The main situation requiring CSS_DYING check is as follow: > > T1 T2 > > Set CSS_IS_DYING > > synchronize_rcu or rcu work > --> reparent non-hierarchical > rcu_read_lock() > memcg_is_dying is true > update non-hierarchical stats for parent > > Referring to the "Memory-Barrier Guarantees" section in [1], Today I learned a little bit about RCU's requirements :) > synchronize_rcu() can guarantee that T2 can see CSS_IS_DYING. Right? Right. It is fair to assume that a read-side critical section either 1) ends before synchronize_rcu(), in this case, the reader might see CSS_DYING flag or not, but it doesn't matter because this reader ends before reparenting the stats. 2) starts after synchronize_rcu() starts, in this case, all stores before the grace period starts must be propagated to all CPUs before the critical section starts. 
Thus, such a reader is guaranteed to see the CSS_DYING flag set. Linux Kernel Memory Model [2] explicitly explains 1) and 2): > RCU (Read-Copy-Update) is a powerful synchronization mechanism. It > rests on two concepts: grace periods and read-side critical sections. > > A grace period is the span of time occupied by a call to > synchronize_rcu(). A read-side critical section (or just critical > section, for short) is a region of code delimited by rcu_read_lock() > at the start and rcu_read_unlock() at the end. Critical sections can > be nested, although we won't make use of this fact. > > As far as memory models are concerned, RCU's main feature is its > Grace-Period Guarantee, which states that a critical section can never > span a full grace period. In more detail, the Guarantee says: > > For any critical section C and any grace period G, at least > one of the following statements must hold: > > (1) C ends before G does, and in addition, every store that > propagates to C's CPU before the end of C must propagate to > every CPU before G ends. > > (2) G starts before C does, and in addition, every store that > propagates to G's CPU before the start of G must propagate > to every CPU before C starts. ^ this [2] https://docs.kernel.org/dev-tools/lkmm/docs/explanation.html > [1]. https://www.kernel.org/doc/Documentation/RCU/Design/Requirements/Requirements.rst > Qi -- Cheers, Harry / Hyeonggon
From: Qi Zheng <zhengqi.arch@bytedance.com>
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
include/linux/memcontrol.h | 2 --
mm/memcontrol-v1.c | 8 --------
mm/memcontrol-v1.h | 5 ++++-
mm/memcontrol.c | 19 ++++++++-----------
4 files changed, 12 insertions(+), 22 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 1fe554eec1e25..e0b84b109b7ac 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -944,8 +944,6 @@ bool memcg_vm_event_item_valid(enum vm_event_item idx);
unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx);
unsigned long lruvec_page_state_local(struct lruvec *lruvec,
enum node_stat_item idx);
-void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
- struct mem_cgroup *parent, int idx);
void mem_cgroup_flush_stats(struct mem_cgroup *memcg);
void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg);
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index 03b924920d6a5..daf9bad8c45ea 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -1909,14 +1909,6 @@ void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *pa
reparent_memcg_state_local(memcg, parent, memcg1_stats[i]);
}
-void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
-{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++)
- reparent_memcg_lruvec_state_local(memcg, parent, memcg1_stats[i]);
-}
-
void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
{
unsigned long memory, memsw;
diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
index 45528195d3578..5b1188f3d4173 100644
--- a/mm/memcontrol-v1.h
+++ b/mm/memcontrol-v1.h
@@ -75,7 +75,6 @@ void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s);
void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
-void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages);
static inline bool memcg1_tcpmem_active(struct mem_cgroup *memcg)
@@ -116,6 +115,10 @@ static inline void memcg1_uncharge_batch(struct mem_cgroup *memcg,
static inline void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) {}
+static inline void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
+{
+}
+
static inline void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages) {}
static inline bool memcg1_tcpmem_active(struct mem_cgroup *memcg) { return false; }
static inline bool memcg1_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7333a37830051..b7b35143d4d2d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -225,13 +225,13 @@ static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memc
return objcg;
}
-#ifdef CONFIG_MEMCG_V1
+static void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
+ struct mem_cgroup *parent, int idx);
static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force);
static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
{
- if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
- return;
+ int i;
synchronize_rcu();
@@ -239,13 +239,10 @@ static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgr
/* The following counts are all non-hierarchical and need to be reparented. */
reparent_memcg1_state_local(memcg, parent);
- reparent_memcg1_lruvec_state_local(memcg, parent);
-}
-#else
-static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
-{
+
+ for (i = 0; i < NR_LRU_LISTS; i++)
+ reparent_memcg_lruvec_state_local(memcg, parent, i);
}
-#endif
static inline void reparent_locks(struct mem_cgroup *memcg, struct mem_cgroup *parent)
{
@@ -510,8 +507,8 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
return x;
}
-void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
- struct mem_cgroup *parent, int idx)
+static void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
+ struct mem_cgroup *parent, int idx)
{
int i = memcg_stats_index(idx);
int nid;
--
2.20.1
On Thu, Jan 15, 2026 at 06:41:38PM +0800, Qi Zheng wrote:
> From: Qi Zheng <zhengqi.arch@bytedance.com>
>
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> ---
> include/linux/memcontrol.h | 2 --
> mm/memcontrol-v1.c | 8 --------
> mm/memcontrol-v1.h | 5 ++++-
> mm/memcontrol.c | 19 ++++++++-----------
> 4 files changed, 12 insertions(+), 22 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 1fe554eec1e25..e0b84b109b7ac 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -944,8 +944,6 @@ bool memcg_vm_event_item_valid(enum vm_event_item idx);
> unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx);
> unsigned long lruvec_page_state_local(struct lruvec *lruvec,
> enum node_stat_item idx);
> -void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
> - struct mem_cgroup *parent, int idx);
>
> void mem_cgroup_flush_stats(struct mem_cgroup *memcg);
> void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg);
> diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
> index 03b924920d6a5..daf9bad8c45ea 100644
> --- a/mm/memcontrol-v1.c
> +++ b/mm/memcontrol-v1.c
> @@ -1909,14 +1909,6 @@ void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *pa
> reparent_memcg_state_local(memcg, parent, memcg1_stats[i]);
> }
>
> -void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
> -{
> - int i;
> -
> - for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++)
> - reparent_memcg_lruvec_state_local(memcg, parent, memcg1_stats[i]);
> -}
> -
> void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
> {
> unsigned long memory, memsw;
> diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
> index 45528195d3578..5b1188f3d4173 100644
> --- a/mm/memcontrol-v1.h
> +++ b/mm/memcontrol-v1.h
> @@ -75,7 +75,6 @@ void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
>
> void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s);
> void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
> -void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
>
> void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages);
> static inline bool memcg1_tcpmem_active(struct mem_cgroup *memcg)
> @@ -116,6 +115,10 @@ static inline void memcg1_uncharge_batch(struct mem_cgroup *memcg,
>
> static inline void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) {}
>
> +static inline void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
> +{
> +}
> +
> static inline void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages) {}
> static inline bool memcg1_tcpmem_active(struct mem_cgroup *memcg) { return false; }
> static inline bool memcg1_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 7333a37830051..b7b35143d4d2d 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -225,13 +225,13 @@ static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memc
> return objcg;
> }
>
> -#ifdef CONFIG_MEMCG_V1
> +static void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
> + struct mem_cgroup *parent, int idx);
> static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force);
>
> static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
> {
No reparenting local stats for v2.
> - if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
> - return;
> + int i;
>
> synchronize_rcu();
>
> @@ -239,13 +239,10 @@ static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgr
>
> /* The following counts are all non-hierarchical and need to be reparented. */
> reparent_memcg1_state_local(memcg, parent);
> - reparent_memcg1_lruvec_state_local(memcg, parent);
> -}
> -#else
> -static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
> -{
> +
> + for (i = 0; i < NR_LRU_LISTS; i++)
> + reparent_memcg_lruvec_state_local(memcg, parent, i);
> }
> -#endif
>
> static inline void reparent_locks(struct mem_cgroup *memcg, struct mem_cgroup *parent)
> {
> @@ -510,8 +507,8 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
> return x;
> }
>
> -void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
> - struct mem_cgroup *parent, int idx)
> +static void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
> + struct mem_cgroup *parent, int idx)
> {
> int i = memcg_stats_index(idx);
> int nid;
> --
> 2.20.1
>
On 1/18/26 11:22 AM, Shakeel Butt wrote:
> On Thu, Jan 15, 2026 at 06:41:38PM +0800, Qi Zheng wrote:
>> From: Qi Zheng <zhengqi.arch@bytedance.com>
>>
>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>> ---
>> include/linux/memcontrol.h | 2 --
>> mm/memcontrol-v1.c | 8 --------
>> mm/memcontrol-v1.h | 5 ++++-
>> mm/memcontrol.c | 19 ++++++++-----------
>> 4 files changed, 12 insertions(+), 22 deletions(-)
>>
>> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
>> index 1fe554eec1e25..e0b84b109b7ac 100644
>> --- a/include/linux/memcontrol.h
>> +++ b/include/linux/memcontrol.h
>> @@ -944,8 +944,6 @@ bool memcg_vm_event_item_valid(enum vm_event_item idx);
>> unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx);
>> unsigned long lruvec_page_state_local(struct lruvec *lruvec,
>> enum node_stat_item idx);
>> -void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
>> - struct mem_cgroup *parent, int idx);
>>
>> void mem_cgroup_flush_stats(struct mem_cgroup *memcg);
>> void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg);
>> diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
>> index 03b924920d6a5..daf9bad8c45ea 100644
>> --- a/mm/memcontrol-v1.c
>> +++ b/mm/memcontrol-v1.c
>> @@ -1909,14 +1909,6 @@ void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *pa
>> reparent_memcg_state_local(memcg, parent, memcg1_stats[i]);
>> }
>>
>> -void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
>> -{
>> - int i;
>> -
>> - for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++)
>> - reparent_memcg_lruvec_state_local(memcg, parent, memcg1_stats[i]);
>> -}
>> -
>> void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
>> {
>> unsigned long memory, memsw;
>> diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
>> index 45528195d3578..5b1188f3d4173 100644
>> --- a/mm/memcontrol-v1.h
>> +++ b/mm/memcontrol-v1.h
>> @@ -75,7 +75,6 @@ void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
>>
>> void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s);
>> void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
>> -void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
>>
>> void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages);
>> static inline bool memcg1_tcpmem_active(struct mem_cgroup *memcg)
>> @@ -116,6 +115,10 @@ static inline void memcg1_uncharge_batch(struct mem_cgroup *memcg,
>>
>> static inline void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) {}
>>
>> +static inline void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
>> +{
>> +}
>> +
>> static inline void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages) {}
>> static inline bool memcg1_tcpmem_active(struct mem_cgroup *memcg) { return false; }
>> static inline bool memcg1_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index 7333a37830051..b7b35143d4d2d 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -225,13 +225,13 @@ static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memc
>> return objcg;
>> }
>>
>> -#ifdef CONFIG_MEMCG_V1
>> +static void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
>> + struct mem_cgroup *parent, int idx);
>> static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force);
>>
>> static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
>> {
>
> No reparenting local stats for v2.
It seems that lruvec_stats->state_local (non-hierarchical) needs to be
reparented in both v1 and v2.
>
>> - if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
>> - return;
>> + int i;
>>
>> synchronize_rcu();
>>
>> @@ -239,13 +239,10 @@ static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgr
>>
>> /* The following counts are all non-hierarchical and need to be reparented. */
>> reparent_memcg1_state_local(memcg, parent);
>> - reparent_memcg1_lruvec_state_local(memcg, parent);
>> -}
>> -#else
>> -static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
>> -{
>> +
>> + for (i = 0; i < NR_LRU_LISTS; i++)
>> + reparent_memcg_lruvec_state_local(memcg, parent, i);
>> }
>> -#endif
>>
>> static inline void reparent_locks(struct mem_cgroup *memcg, struct mem_cgroup *parent)
>> {
>> @@ -510,8 +507,8 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
>> return x;
>> }
>>
>> -void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
>> - struct mem_cgroup *parent, int idx)
>> +static void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
>> + struct mem_cgroup *parent, int idx)
>> {
>> int i = memcg_stats_index(idx);
>> int nid;
>> --
>> 2.20.1
>>
> On Jan 19, 2026, at 11:36, Qi Zheng <qi.zheng@linux.dev> wrote:
>
>
>
> On 1/18/26 11:22 AM, Shakeel Butt wrote:
>> On Thu, Jan 15, 2026 at 06:41:38PM +0800, Qi Zheng wrote:
>>> From: Qi Zheng <zhengqi.arch@bytedance.com>
>>>
>>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>>> ---
>>> include/linux/memcontrol.h | 2 --
>>> mm/memcontrol-v1.c | 8 --------
>>> mm/memcontrol-v1.h | 5 ++++-
>>> mm/memcontrol.c | 19 ++++++++-----------
>>> 4 files changed, 12 insertions(+), 22 deletions(-)
>>>
>>> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
>>> index 1fe554eec1e25..e0b84b109b7ac 100644
>>> --- a/include/linux/memcontrol.h
>>> +++ b/include/linux/memcontrol.h
>>> @@ -944,8 +944,6 @@ bool memcg_vm_event_item_valid(enum vm_event_item idx);
>>> unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx);
>>> unsigned long lruvec_page_state_local(struct lruvec *lruvec,
>>> enum node_stat_item idx);
>>> -void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
>>> - struct mem_cgroup *parent, int idx);
>>> void mem_cgroup_flush_stats(struct mem_cgroup *memcg);
>>> void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg);
>>> diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
>>> index 03b924920d6a5..daf9bad8c45ea 100644
>>> --- a/mm/memcontrol-v1.c
>>> +++ b/mm/memcontrol-v1.c
>>> @@ -1909,14 +1909,6 @@ void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *pa
>>> reparent_memcg_state_local(memcg, parent, memcg1_stats[i]);
>>> }
>>> -void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
>>> -{
>>> - int i;
>>> -
>>> - for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++)
>>> - reparent_memcg_lruvec_state_local(memcg, parent, memcg1_stats[i]);
>>> -}
>>> -
>>> void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
>>> {
>>> unsigned long memory, memsw;
>>> diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
>>> index 45528195d3578..5b1188f3d4173 100644
>>> --- a/mm/memcontrol-v1.h
>>> +++ b/mm/memcontrol-v1.h
>>> @@ -75,7 +75,6 @@ void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
>>> void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s);
>>> void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
>>> -void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
>>> void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages);
>>> static inline bool memcg1_tcpmem_active(struct mem_cgroup *memcg)
>>> @@ -116,6 +115,10 @@ static inline void memcg1_uncharge_batch(struct mem_cgroup *memcg,
>>> static inline void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) {}
>>> +static inline void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
>>> +{
>>> +}
>>> +
>>> static inline void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages) {}
>>> static inline bool memcg1_tcpmem_active(struct mem_cgroup *memcg) { return false; }
>>> static inline bool memcg1_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
>>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>>> index 7333a37830051..b7b35143d4d2d 100644
>>> --- a/mm/memcontrol.c
>>> +++ b/mm/memcontrol.c
>>> @@ -225,13 +225,13 @@ static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memc
>>> return objcg;
>>> }
>>> -#ifdef CONFIG_MEMCG_V1
>>> +static void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
>>> + struct mem_cgroup *parent, int idx);
>>> static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force);
>>> static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
>>> {
>> No reparenting local stats for v2.
>
> It seems that lruvec_stats->state_local (non-hierarchical) needs to be
> relocated in both v1 and v2.
Here we might need to elaborate a bit. Specifically, in the function
`count_shadow_nodes`, the use of `lruvec_page_state_local` to obtain
LRU and SLAB pages seems to also require these logics to work correctly.
For SLAB, it appears that the statistics here have already been
problematic for a while since SLAB pages have been reparented, right?
>
>>> - if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
>>> - return;
>>> + int i;
>>> synchronize_rcu();
>>> @@ -239,13 +239,10 @@ static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgr
>>> /* The following counts are all non-hierarchical and need to be reparented. */
>>> reparent_memcg1_state_local(memcg, parent);
>>> - reparent_memcg1_lruvec_state_local(memcg, parent);
>>> -}
>>> -#else
>>> -static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
>>> -{
>>> +
>>> + for (i = 0; i < NR_LRU_LISTS; i++)
>>> + reparent_memcg_lruvec_state_local(memcg, parent, i);
>>> }
>>> -#endif
>>> static inline void reparent_locks(struct mem_cgroup *memcg, struct mem_cgroup *parent)
>>> {
>>> @@ -510,8 +507,8 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
>>> return x;
>>> }
>>> -void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
>>> - struct mem_cgroup *parent, int idx)
>>> +static void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
>>> + struct mem_cgroup *parent, int idx)
>>> {
>>> int i = memcg_stats_index(idx);
>>> int nid;
>>> --
>>> 2.20.1
On Tue, Jan 20, 2026 at 03:19:00PM +0800, Muchun Song wrote: > > > >> No reparenting local stats for v2. > > > > It seems that lruvec_stats->state_local (non-hierarchical) needs to be > > relocated in both v1 and v2. > > Here we might need to elaborate a bit. Specifically, in the function > `count_shadow_nodes`, the use of `lruvec_page_state_local` to obtain > LRU and SLAB pages seems to also require these logics to work correctly. > For SLAB, it appears that the statistics here have already been > problematic for a while since SLAB pages have been reparented, right? > Thanks a lot, now it is clear and yes it seems like SLAB is problematic but now I am wondering if it is really worth fixing. For LRU pages, how about using lruvec_lru_size() defined in vmscan.c. That would at least keep count_shadow_nodes() working irrespective of LRU reparenting.
On 1/21/26 2:47 AM, Shakeel Butt wrote: > On Tue, Jan 20, 2026 at 03:19:00PM +0800, Muchun Song wrote: >> >> >>>> No reparenting local stats for v2. >>> >>> It seems that lruvec_stats->state_local (non-hierarchical) needs to be >>> relocated in both v1 and v2. >> >> Here we might need to elaborate a bit. Specifically, in the function >> `count_shadow_nodes`, the use of `lruvec_page_state_local` to obtain >> LRU and SLAB pages seems to also require these logics to work correctly. >> For SLAB, it appears that the statistics here have already been >> problematic for a while since SLAB pages have been reparented, right? >> > > Thanks a lot, now it is clear and yes it seems like SLAB is problematic > but now I am wondering if it is really worth fixing. For LRU pages, how > about using lruvec_lru_size() defined in vmscan.c. That would at least > keep count_shadow_nodes() working irrespective of LRU reparenting. Do you mean calling lruvec_lru_size() in count_shadow_nodes()? But numa_stat interface also reads lruvec_stats->state and make it visible to the user. >
On Wed, Jan 21, 2026 at 11:43:50AM +0800, Qi Zheng wrote: > > > On 1/21/26 2:47 AM, Shakeel Butt wrote: > > On Tue, Jan 20, 2026 at 03:19:00PM +0800, Muchun Song wrote: > > > > > > > > > > > No reparenting local stats for v2. > > > > > > > > It seems that lruvec_stats->state_local (non-hierarchical) needs to be > > > > relocated in both v1 and v2. > > > > > > Here we might need to elaborate a bit. Specifically, in the function > > > `count_shadow_nodes`, the use of `lruvec_page_state_local` to obtain > > > LRU and SLAB pages seems to also require these logics to work correctly. > > > For SLAB, it appears that the statistics here have already been > > > problematic for a while since SLAB pages have been reparented, right? > > > > > > > Thanks a lot, now it is clear and yes it seems like SLAB is problematic > > but now I am wondering if it is really worth fixing. For LRU pages, how > > about using lruvec_lru_size() defined in vmscan.c. That would at least > > keep count_shadow_nodes() working irrespective of LRU reparenting. > > Do you mean calling lruvec_lru_size() in count_shadow_nodes()? Yes but I am mainly brainstorming. We can keep the reparenting local stats for both v1 and v2 for now as it is not a performance critical path. I am more worried about the stats update path where upward traversal of memcg for CSS_DYING can be costly and I don't want that in v2. > But > numa_stat interface also reads lruvec_stats->state and make it visible > to the user. > Not sure how this is relevant.
On 1/21/26 4:20 PM, Shakeel Butt wrote: > On Wed, Jan 21, 2026 at 11:43:50AM +0800, Qi Zheng wrote: >> >> >> On 1/21/26 2:47 AM, Shakeel Butt wrote: >>> On Tue, Jan 20, 2026 at 03:19:00PM +0800, Muchun Song wrote: >>>> >>>> >>>>>> No reparenting local stats for v2. >>>>> >>>>> It seems that lruvec_stats->state_local (non-hierarchical) needs to be >>>>> relocated in both v1 and v2. >>>> >>>> Here we might need to elaborate a bit. Specifically, in the function >>>> `count_shadow_nodes`, the use of `lruvec_page_state_local` to obtain >>>> LRU and SLAB pages seems to also require these logics to work correctly. >>>> For SLAB, it appears that the statistics here have already been >>>> problematic for a while since SLAB pages have been reparented, right? >>>> >>> >>> Thanks a lot, now it is clear and yes it seems like SLAB is problematic >>> but now I am wondering if it is really worth fixing. For LRU pages, how >>> about using lruvec_lru_size() defined in vmscan.c. That would at least >>> keep count_shadow_nodes() working irrespective of LRU reparenting. >> >> Do you mean calling lruvec_lru_size() in count_shadow_nodes()? > > Yes but I am mainly brainstorming. We can keep the reparenting local OK, I will take a closer look. > stats for both v1 and v2 for now as it is not a performance critical > path. I am more worried about the stats update path where upward > traversal of memcg for CSS_DYING can be costly and I don't want that in > v2. > >> But >> numa_stat interface also reads lruvec_stats->state and make it visible >> to the user. >> > > Not sure how this is relevant. My mistake, please ignore it. Thanks!
On Thu, 15 Jan 2026 18:41:38 +0800 Qi Zheng <qi.zheng@linux.dev> wrote: > From: Qi Zheng <zhengqi.arch@bytedance.com> > > Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com> What was "fixed"? I queued this and [2/2] as squashable fixes against "mm: memcontrol: prepare for reparenting state_local".
On 1/16/26 1:47 AM, Andrew Morton wrote: > On Thu, 15 Jan 2026 18:41:38 +0800 Qi Zheng <qi.zheng@linux.dev> wrote: > >> From: Qi Zheng <zhengqi.arch@bytedance.com> >> >> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com> > > What was "fixed"? [1/2] fixed the issue reported by https://lore.kernel.org/all/6967cd38.050a0220.58bed.0001.GAE@google.com/. Previously, state_local was protected by ss_rstat_lock. Since [PATCH v3 28/30] added a concurrent path, [2/2] changed state_local to the atomic_long_t type for protection. > > I queued this and [2/2] as squashable fixes against "mm: memcontrol: > prepare for reparenting state_local". Thanks! >
From: Qi Zheng <zhengqi.arch@bytedance.com>
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
mm/memcontrol.c | 21 ++++++++++-----------
1 file changed, 10 insertions(+), 11 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b7b35143d4d2d..c303c483f55a4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -456,7 +456,7 @@ struct lruvec_stats {
long state[NR_MEMCG_NODE_STAT_ITEMS];
/* Non-hierarchical (CPU aggregated) state */
- long state_local[NR_MEMCG_NODE_STAT_ITEMS];
+ atomic_long_t state_local[NR_MEMCG_NODE_STAT_ITEMS];
/* Pending child counts during tree propagation */
long state_pending[NR_MEMCG_NODE_STAT_ITEMS];
@@ -499,7 +499,7 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
return 0;
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
- x = READ_ONCE(pn->lruvec_stats->state_local[i]);
+ x = atomic_long_read(&(pn->lruvec_stats->state_local[i]));
#ifdef CONFIG_SMP
if (x < 0)
x = 0;
@@ -524,8 +524,7 @@ static void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
parent_pn = container_of(parent_lruvec, struct mem_cgroup_per_node, lruvec);
- WRITE_ONCE(parent_pn->lruvec_stats->state_local[i],
- parent_pn->lruvec_stats->state_local[i] + value);
+ atomic_long_add(value, &(parent_pn->lruvec_stats->state_local[i]));
}
}
@@ -620,8 +619,8 @@ struct memcg_vmstats {
unsigned long events[NR_MEMCG_EVENTS];
/* Non-hierarchical (CPU aggregated) page state & events */
- long state_local[MEMCG_VMSTAT_SIZE];
- unsigned long events_local[NR_MEMCG_EVENTS];
+ atomic_long_t state_local[MEMCG_VMSTAT_SIZE];
+ atomic_long_t events_local[NR_MEMCG_EVENTS];
/* Pending child counts during tree propagation */
long state_pending[MEMCG_VMSTAT_SIZE];
@@ -824,7 +823,7 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return 0;
- x = READ_ONCE(memcg->vmstats->state_local[i]);
+ x = atomic_long_read(&(memcg->vmstats->state_local[i]));
#ifdef CONFIG_SMP
if (x < 0)
x = 0;
@@ -841,7 +840,7 @@ void reparent_memcg_state_local(struct mem_cgroup *memcg,
if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return;
- WRITE_ONCE(parent->vmstats->state_local[i], parent->vmstats->state_local[i] + value);
+ atomic_long_add(value, &(parent->vmstats->state_local[i]));
}
#endif
@@ -1001,7 +1000,7 @@ unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, event))
return 0;
- return READ_ONCE(memcg->vmstats->events_local[i]);
+ return atomic_long_read(&(memcg->vmstats->events_local[i]));
}
#endif
@@ -4126,7 +4125,7 @@ struct aggregate_control {
/* pointer to the aggregated (CPU and subtree aggregated) counters */
long *aggregate;
/* pointer to the non-hierarchichal (CPU aggregated) counters */
- long *local;
+ atomic_long_t *local;
/* pointer to the pending child counters during tree propagation */
long *pending;
/* pointer to the parent's pending counters, could be NULL */
@@ -4165,7 +4164,7 @@ static void mem_cgroup_stat_aggregate(struct aggregate_control *ac)
/* Aggregate counts on this level and propagate upwards */
if (delta_cpu)
- ac->local[i] += delta_cpu;
+ atomic_long_add(delta_cpu, &(ac->local[i]));
if (delta) {
ac->aggregate[i] += delta;
--
2.20.1
© 2016 - 2026 Red Hat, Inc.