From: Qi Zheng <zhengqi.arch@bytedance.com>
Similar to traditional LRU folios, in order to solve the dying memcg
problem, we also need to reparent MGLRU folios to the parent memcg when a
memcg is offlined.
However, there are the following challenges:
1. Each lruvec has between MIN_NR_GENS and MAX_NR_GENS generations, so the
   parent and child memcg may have a different number of generations, and
   we cannot simply transfer MGLRU folios from the child memcg to the
   parent memcg as we do for traditional LRU folios.
2. The generation information is stored in folio->flags, but we cannot
   traverse these folios while holding the lru lock, otherwise it may
   cause a softlockup.
3. In walk_update_folio(), the gen of a folio and the corresponding lru
   size may be updated, but the folio is not immediately moved to the
   corresponding lru list. Therefore, there may be folios of different
   generations on an LRU list.
4. In lru_gen_del_folio(), the generation to which the folio belongs is
   determined from the generation information in folio->flags, and the
   corresponding LRU size will be updated. Therefore, we need to update
   the lru size correctly during reparenting, otherwise the lru size may
   be updated incorrectly in lru_gen_del_folio().
Finally, this patch chooses a compromise: splice each lru list in the child
memcg onto the lru list of the same generation in the parent memcg during
reparenting. To ensure that the parent memcg has the corresponding
generations, the number of generations in the parent memcg is increased to
MAX_NR_GENS before reparenting.
Of course, the same generation has different meanings in the parent and
child memcg, so this mixes up the hot and cold information of the spliced
folios. But other than that, this method is simple, the lru size stays
correct, and there is no need to handle certain concurrency issues (such as
lru_gen_del_folio()).
To prepare for the above work, this commit implements the helper functions
that will be used during reparenting.
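For illustration, the intended call pattern during memcg offline is roughly
the following. This is only a schematic sketch of how these helpers are
presumably meant to be used; the actual caller is introduced later in the
series and may differ, and the locking is only indicated by comments:

	/* may sleep, so do it before taking any locks */
	max_lru_gen_memcg(parent);
again:
	/* take the locks needed for splicing (e.g. the lru locks) */
	if (!recheck_lru_gen_max_memcg(parent)) {
		/*
		 * Reclaim may have advanced min_seq in the meantime and
		 * reduced the number of generations in the parent, so drop
		 * the locks, bring the parent back to MAX_NR_GENS and retry.
		 */
		/* ... drop the locks ... */
		max_lru_gen_memcg(parent);
		goto again;
	}
	lru_gen_reparent_memcg(child, parent);
	/* ... drop the locks ... */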
Suggested-by: Harry Yoo <harry.yoo@oracle.com>
Suggested-by: Imran Khan <imran.f.khan@oracle.com>
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
include/linux/mmzone.h | 16 ++++++++
mm/vmscan.c | 86 ++++++++++++++++++++++++++++++++++++++++++
2 files changed, 102 insertions(+)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 0d8776e5b6747..0a71bf015d12b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -628,6 +628,9 @@ void lru_gen_online_memcg(struct mem_cgroup *memcg);
void lru_gen_offline_memcg(struct mem_cgroup *memcg);
void lru_gen_release_memcg(struct mem_cgroup *memcg);
void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid);
+void max_lru_gen_memcg(struct mem_cgroup *memcg);
+bool recheck_lru_gen_max_memcg(struct mem_cgroup *memcg);
+void lru_gen_reparent_memcg(struct mem_cgroup *src, struct mem_cgroup *dst);
#else /* !CONFIG_LRU_GEN */
@@ -668,6 +671,19 @@ static inline void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid)
{
}
+static inline void max_lru_gen_memcg(struct mem_cgroup *memcg)
+{
+}
+
+static inline bool recheck_lru_gen_max_memcg(struct mem_cgroup *memcg)
+{
+ return true;
+}
+
+static inline void lru_gen_reparent_memcg(struct mem_cgroup *src, struct mem_cgroup *dst)
+{
+}
+
#endif /* CONFIG_LRU_GEN */
struct lruvec {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7aa8e1472d10d..3ee7fb96b8aeb 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4468,6 +4468,92 @@ void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid)
lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
}
+bool recheck_lru_gen_max_memcg(struct mem_cgroup *memcg)
+{
+ int nid;
+
+ for_each_node(nid) {
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+ int type;
+
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/*
+ * We need to ensure that the folios of child memcg can be reparented to the
+ * same gen of the parent memcg, so the gens of the parent memcg needed be
+ * incremented to the MAX_NR_GENS before reparenting.
+ */
+void max_lru_gen_memcg(struct mem_cgroup *memcg)
+{
+ int nid;
+
+ for_each_node(nid) {
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+ int type;
+
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ while (get_nr_gens(lruvec, type) < MAX_NR_GENS) {
+ DEFINE_MAX_SEQ(lruvec);
+
+ inc_max_seq(lruvec, max_seq, mem_cgroup_swappiness(memcg));
+ cond_resched();
+ }
+ }
+ }
+}
+
+static void __lru_gen_reparent_memcg(struct lruvec *src_lruvec, struct lruvec *dst_lruvec,
+ int zone, int type)
+{
+ struct lru_gen_folio *src_lrugen, *dst_lrugen;
+ enum lru_list lru = type * LRU_INACTIVE_FILE;
+ int i;
+
+ src_lrugen = &src_lruvec->lrugen;
+ dst_lrugen = &dst_lruvec->lrugen;
+
+ for (i = 0; i < get_nr_gens(src_lruvec, type); i++) {
+ int gen = lru_gen_from_seq(src_lrugen->max_seq - i);
+ int nr_pages = src_lrugen->nr_pages[gen][type][zone];
+ int src_lru_active = lru_gen_is_active(src_lruvec, gen) ? LRU_ACTIVE : 0;
+ int dst_lru_active = lru_gen_is_active(dst_lruvec, gen) ? LRU_ACTIVE : 0;
+
+ list_splice_tail_init(&src_lrugen->folios[gen][type][zone],
+ &dst_lrugen->folios[gen][type][zone]);
+
+ WRITE_ONCE(src_lrugen->nr_pages[gen][type][zone], 0);
+ WRITE_ONCE(dst_lrugen->nr_pages[gen][type][zone],
+ dst_lrugen->nr_pages[gen][type][zone] + nr_pages);
+
+ __update_lru_size(src_lruvec, lru + src_lru_active, zone, -nr_pages);
+ __update_lru_size(dst_lruvec, lru + dst_lru_active, zone, nr_pages);
+ }
+}
+
+void lru_gen_reparent_memcg(struct mem_cgroup *src, struct mem_cgroup *dst)
+{
+ int nid;
+
+ for_each_node(nid) {
+ struct lruvec *src_lruvec, *dst_lruvec;
+ int type, zone;
+
+ src_lruvec = get_lruvec(src, nid);
+ dst_lruvec = get_lruvec(dst, nid);
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++)
+ for (type = 0; type < ANON_AND_FILE; type++)
+ __lru_gen_reparent_memcg(src_lruvec, dst_lruvec, zone, type);
+ }
+}
+
#endif /* CONFIG_MEMCG */
/******************************************************************************
--
2.20.1
On Tue, Oct 28, 2025 at 09:58:36PM +0800, Qi Zheng wrote:
> From: Qi Zheng <zhengqi.arch@bytedance.com>
>
> Similar to traditional LRU folios, in order to solve the dying memcg
> problem, we also need to reparenting MGLRU folios to the parent memcg when
> memcg offline.
>
> However, there are the following challenges:
>
> 1. Each lruvec has between MIN_NR_GENS and MAX_NR_GENS generations, the
> number of generations of the parent and child memcg may be different,
> so we cannot simply transfer MGLRU folios in the child memcg to the
> parent memcg as we did for traditional LRU folios.
> 2. The generation information is stored in folio->flags, but we cannot
> traverse these folios while holding the lru lock, otherwise it may
> cause softlockup.
> 3. In walk_update_folio(), the gen of folio and corresponding lru size
> may be updated, but the folio is not immediately moved to the
> corresponding lru list. Therefore, there may be folios of different
> generations on an LRU list.
> 4. In lru_gen_del_folio(), the generation to which the folio belongs is
> found based on the generation information in folio->flags, and the
> corresponding LRU size will be updated. Therefore, we need to update
> the lru size correctly during reparenting, otherwise the lru size may
> be updated incorrectly in lru_gen_del_folio().
>
> Finally, this patch chose a compromise method, which is to splice the lru
> list in the child memcg to the lru list of the same generation in the
> parent memcg during reparenting. And in order to ensure that the parent
> memcg has the same generation, we need to increase the generations in the
> parent memcg to the MAX_NR_GENS before reparenting.
>
> Of course, the same generation has different meanings in the parent and
> child memcg, this will cause confusion in the hot and cold information of
> folios. But other than that, this method is simple enough, the lru size
> is correct, and there is no need to consider some concurrency issues (such
> as lru_gen_del_folio()).
>
> To prepare for the above work, this commit implements the specific
> functions, which will be used during reparenting.
>
> Suggested-by: Harry Yoo <harry.yoo@oracle.com>
> Suggested-by: Imran Khan <imran.f.khan@oracle.com>
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> ---
> include/linux/mmzone.h | 16 ++++++++
> mm/vmscan.c | 86 ++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 102 insertions(+)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 7aa8e1472d10d..3ee7fb96b8aeb 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -4468,6 +4468,92 @@ void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid)
> lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
> }
>
> +bool recheck_lru_gen_max_memcg(struct mem_cgroup *memcg)
> +{
> + int nid;
> +
> + for_each_node(nid) {
> + struct lruvec *lruvec = get_lruvec(memcg, nid);
> + int type;
> +
> + for (type = 0; type < ANON_AND_FILE; type++) {
> + if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
> + return false;
> + }
> + }
> +
> + return true;
> +}
> +
> +/*
> + * We need to ensure that the folios of child memcg can be reparented to the
> + * same gen of the parent memcg, so the gens of the parent memcg needed be
> + * incremented to the MAX_NR_GENS before reparenting.
> + */
> +void max_lru_gen_memcg(struct mem_cgroup *memcg)
> +{
> + int nid;
> +
> + for_each_node(nid) {
> + struct lruvec *lruvec = get_lruvec(memcg, nid);
> + int type;
> +
I was testing this series and observed two warnings...
> + for (type = 0; type < ANON_AND_FILE; type++) {
> + while (get_nr_gens(lruvec, type) < MAX_NR_GENS) {
> + DEFINE_MAX_SEQ(lruvec);
> +
> + inc_max_seq(lruvec, max_seq, mem_cgroup_swappiness(memcg));
> + cond_resched();
Warning 1) Here we increment max_seq but we skip updating mm_state->seq.
(try_to_inc_max_seq() iterates the mm list and updates mm_state->seq after
an iteration, but since we directly call inc_max_seq(), we don't update it)
When mm_state->seq is more than one generation behind walk->seq, a warning is
triggered in iterate_mm_list():
VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq);
Warning 2) In try_to_inc_max_seq(), the last walker of the mm list
is supposed to succeed in incrementing max_seq by calling inc_max_seq():
if (success) {
success = inc_max_seq(lruvec, seq, swappiness);
WARN_ON_ONCE(!success);
}
But with this patch it may observe the max_seq is already advanced due to
reparenting and thus inc_max_seq() returns false, triggering the warning.
I'm learning MGLRU internals to see whether we can simply remove the warnings
or if we need to do something to advance max_seq without actually iterating
over the mm list.
--
Cheers,
Harry / Hyeonggon
On 11/26/25 9:48 PM, Harry Yoo wrote:
> On Tue, Oct 28, 2025 at 09:58:36PM +0800, Qi Zheng wrote:
>> From: Qi Zheng <zhengqi.arch@bytedance.com>
>>
>> Similar to traditional LRU folios, in order to solve the dying memcg
>> problem, we also need to reparenting MGLRU folios to the parent memcg when
>> memcg offline.
>>
>> However, there are the following challenges:
>>
>> 1. Each lruvec has between MIN_NR_GENS and MAX_NR_GENS generations, the
>> number of generations of the parent and child memcg may be different,
>> so we cannot simply transfer MGLRU folios in the child memcg to the
>> parent memcg as we did for traditional LRU folios.
>> 2. The generation information is stored in folio->flags, but we cannot
>> traverse these folios while holding the lru lock, otherwise it may
>> cause softlockup.
>> 3. In walk_update_folio(), the gen of folio and corresponding lru size
>> may be updated, but the folio is not immediately moved to the
>> corresponding lru list. Therefore, there may be folios of different
>> generations on an LRU list.
>> 4. In lru_gen_del_folio(), the generation to which the folio belongs is
>> found based on the generation information in folio->flags, and the
>> corresponding LRU size will be updated. Therefore, we need to update
>> the lru size correctly during reparenting, otherwise the lru size may
>> be updated incorrectly in lru_gen_del_folio().
>>
>> Finally, this patch chose a compromise method, which is to splice the lru
>> list in the child memcg to the lru list of the same generation in the
>> parent memcg during reparenting. And in order to ensure that the parent
>> memcg has the same generation, we need to increase the generations in the
>> parent memcg to the MAX_NR_GENS before reparenting.
>>
>> Of course, the same generation has different meanings in the parent and
>> child memcg, this will cause confusion in the hot and cold information of
>> folios. But other than that, this method is simple enough, the lru size
>> is correct, and there is no need to consider some concurrency issues (such
>> as lru_gen_del_folio()).
>>
>> To prepare for the above work, this commit implements the specific
>> functions, which will be used during reparenting.
>>
>> Suggested-by: Harry Yoo <harry.yoo@oracle.com>
>> Suggested-by: Imran Khan <imran.f.khan@oracle.com>
>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>> ---
>> include/linux/mmzone.h | 16 ++++++++
>> mm/vmscan.c | 86 ++++++++++++++++++++++++++++++++++++++++++
>> 2 files changed, 102 insertions(+)
>>
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index 7aa8e1472d10d..3ee7fb96b8aeb 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -4468,6 +4468,92 @@ void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid)
>> lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
>> }
>>
>> +bool recheck_lru_gen_max_memcg(struct mem_cgroup *memcg)
>> +{
>> + int nid;
>> +
>> + for_each_node(nid) {
>> + struct lruvec *lruvec = get_lruvec(memcg, nid);
>> + int type;
>> +
>> + for (type = 0; type < ANON_AND_FILE; type++) {
>> + if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
>> + return false;
>> + }
>> + }
>> +
>> + return true;
>> +}
>> +
>> +/*
>> + * We need to ensure that the folios of child memcg can be reparented to the
>> + * same gen of the parent memcg, so the gens of the parent memcg needed be
>> + * incremented to the MAX_NR_GENS before reparenting.
>> + */
>> +void max_lru_gen_memcg(struct mem_cgroup *memcg)
>> +{
>> + int nid;
>> +
>> + for_each_node(nid) {
>> + struct lruvec *lruvec = get_lruvec(memcg, nid);
>> + int type;
>> +
>
> I was testing this series and observed two warnings...
>
>> + for (type = 0; type < ANON_AND_FILE; type++) {
>> + while (get_nr_gens(lruvec, type) < MAX_NR_GENS) {
>> + DEFINE_MAX_SEQ(lruvec);
>> +
>> + inc_max_seq(lruvec, max_seq, mem_cgroup_swappiness(memcg));
>> + cond_resched();
>
> Warning 1) Here we increment max_seq but we skip updating mm_state->seq.
> (try_to_inc_max_seq() iterates the mm list and update mm_state->seq after
> an iteration, but since we directly call inc_max_seq(), we don't update it)
>
> When mm_state->seq is more than one generation behind walk->seq, a warning is
> triggered in iterate_mm_list():
>
> VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq);
mm_state->seq just records the completion of a full traversal of the
mm_list. If we simply delete this warning, this check in iterate_mm_list()
may no longer work as intended:
if (walk->seq <= mm_state->seq)
goto done;
So it seems we can manually increase mm_state->seq during reparenting to
avoid this warning.
However, we cannot directly call iterate_mm_list_nowalk() because we do
not want to reset mm_state->head and mm_state->tail to NULL. Otherwise,
we wouldn't be able to continue iterating over the mm_list.
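Something like the following, perhaps (an untested sketch on my side; it
assumes the existing get_mm_state() helper in mm/vmscan.c, and the function
name is made up). It would be called where max_lru_gen_memcg() has already
advanced max_seq:

	static void lru_gen_sync_mm_seq(struct lruvec *lruvec)
	{
		struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
		DEFINE_MAX_SEQ(lruvec);

		/*
		 * Unlike iterate_mm_list_nowalk(), leave mm_state->head and
		 * mm_state->tail untouched so that the mm_list iteration can
		 * continue from where it left off.
		 */
		if (mm_state && mm_state->seq < max_seq)
			WRITE_ONCE(mm_state->seq, max_seq);
	}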
>
> Warning 2) In try_to_inc_max_seq(), the last walker of mm list
> is supposed to succeed to increment max_seq by calling inc_max_seq():
>
> if (success) {
> success = inc_max_seq(lruvec, seq, swappiness);
> WARN_ON_ONCE(!success);
> }
>
> But with this patch it may observe the max_seq is already advanced due to
> reparenting and thus inc_max_seq() returns false, triggering the warning.
Once we fix the first warning as above, this condition in iterate_mm_list()
will be satisfied:
if (walk->seq <= mm_state->seq)
goto done;
so iterate_mm_list() returns false and this warning is not triggered either.
>
> I'm learning MGLRU internals to see whether we can simply remove the warnings
> or if we need to do something to advance max_seq without actually iterating
> over the mm list.
So IIUC, we can simply increase mm_state->seq during reparenting to
fix these warnings.
>
On Mon, Dec 1, 2025 at 9:41 AM Qi Zheng <qi.zheng@linux.dev> wrote:
> > Warning 1) Here we increment max_seq but we skip updating mm_state->seq.
> > (try_to_inc_max_seq() iterates the mm list and update mm_state->seq after
> > an iteration, but since we directly call inc_max_seq(), we don't update it)
> >
> > When mm_state->seq is more than one generation behind walk->seq, a warning is
> > triggered in iterate_mm_list():
> >
> > VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq);
>
> The mm_state->seq is just to record the completion of a full traversal
> of mm_list. If we simply delete this warning, it may cause this judgment
> in iterate_mm_list to become invalid:
>
> if (walk->seq <= mm_state->seq)
> goto done;
>
> So it seems we can manually increase mm_state->seq during reparenting to
> avoid this warning.

Agreed, don't get rid of the warning as this check is supposed to make
stale walkers exit early.

>
> However, we cannot directly call iterate_mm_list_nowalk() because we do
> not want to reset mm_state->head and mm_state->tail to NULL. Otherwise,
> we wouldn't be able to continue iterating over the mm_list.
>

From the original posting:
> Of course, the same generation has different meanings in the parent and
> child memcg, this will cause confusion in the hot and cold information of
> folios. But other than that, this method is simple enough, the lru size
> is correct, and there is no need to consider some concurrency issues (such
> as lru_gen_del_folio()).

One way to solve this is to map generations based on
lrugen->timestamp, but of course this runs into the reading
folio->flags issue you described. I think the current method is a good
compromise, but the splicing of generations doesn't much make semantic
sense. It would be good to leave a comment somewhere in
__lru_gen_reparent_memcg to note this weirdness.
On 12/2/25 5:50 AM, Yuanchu Xie wrote:
> On Mon, Dec 1, 2025 at 9:41 AM Qi Zheng <qi.zheng@linux.dev> wrote:
>>> Warning 1) Here we increment max_seq but we skip updating mm_state->seq.
>>> (try_to_inc_max_seq() iterates the mm list and update mm_state->seq after
>>> an iteration, but since we directly call inc_max_seq(), we don't update it)
>>>
>>> When mm_state->seq is more than one generation behind walk->seq, a warning is
>>> triggered in iterate_mm_list():
>>>
>>> VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq);
>>
>> The mm_state->seq is just to record the completion of a full traversal
>> of mm_list. If we simply delete this warning, it may cause this judgment
>> in iterate_mm_list to become invalid:
>>
>> if (walk->seq <= mm_state->seq)
>> goto done;
>>
>> So it seems we can manually increase mm_state->seq during reparenting to
>> avoid this warning.
> Agreed, don't get rid of the warning as this check is supposed to make
> stale walkers exit early.

OK, will increase mm_state->seq during reparenting in the next version.

>
>> However, we cannot directly call iterate_mm_list_nowalk() because we do
>> not want to reset mm_state->head and mm_state->tail to NULL. Otherwise,
>> we wouldn't be able to continue iterating over the mm_list.
>>
>
> From the original posting:
>> Of course, the same generation has different meanings in the parent and
>> child memcg, this will cause confusion in the hot and cold information of
>> folios. But other than that, this method is simple enough, the lru size
>> is correct, and there is no need to consider some concurrency issues (such
>> as lru_gen_del_folio()).
> One way to solve this is to map generations based on
> lrugen->timestamp, but of course this runs into the reading
> folio->flags issue you described. I think the current method is a good
> compromise, but the splicing of generations doesn't much make semantic
> sense. It would be good to leave a comment somewhere in
> __lru_gen_reparent_memcg to note this weirdness.

OK, will do.

Thanks!
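For what it's worth, a first draft of such a comment could be something like
the following (exact wording to be decided in the next version):

	/*
	 * Note: folios are spliced by gen index (the gen stored in
	 * folio->flags), not by relative age. The same index may be the
	 * youngest generation in the child but the oldest in the parent,
	 * so the hot/cold information of the reparented folios is only
	 * approximate from the parent's point of view.
	 */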
On 11/26/25 9:48 PM, Harry Yoo wrote:
> On Tue, Oct 28, 2025 at 09:58:36PM +0800, Qi Zheng wrote:
>> From: Qi Zheng <zhengqi.arch@bytedance.com>
>>
>> Similar to traditional LRU folios, in order to solve the dying memcg
>> problem, we also need to reparenting MGLRU folios to the parent memcg when
>> memcg offline.
>>
>> However, there are the following challenges:
>>
>> 1. Each lruvec has between MIN_NR_GENS and MAX_NR_GENS generations, the
>> number of generations of the parent and child memcg may be different,
>> so we cannot simply transfer MGLRU folios in the child memcg to the
>> parent memcg as we did for traditional LRU folios.
>> 2. The generation information is stored in folio->flags, but we cannot
>> traverse these folios while holding the lru lock, otherwise it may
>> cause softlockup.
>> 3. In walk_update_folio(), the gen of folio and corresponding lru size
>> may be updated, but the folio is not immediately moved to the
>> corresponding lru list. Therefore, there may be folios of different
>> generations on an LRU list.
>> 4. In lru_gen_del_folio(), the generation to which the folio belongs is
>> found based on the generation information in folio->flags, and the
>> corresponding LRU size will be updated. Therefore, we need to update
>> the lru size correctly during reparenting, otherwise the lru size may
>> be updated incorrectly in lru_gen_del_folio().
>>
>> Finally, this patch chose a compromise method, which is to splice the lru
>> list in the child memcg to the lru list of the same generation in the
>> parent memcg during reparenting. And in order to ensure that the parent
>> memcg has the same generation, we need to increase the generations in the
>> parent memcg to the MAX_NR_GENS before reparenting.
>>
>> Of course, the same generation has different meanings in the parent and
>> child memcg, this will cause confusion in the hot and cold information of
>> folios. But other than that, this method is simple enough, the lru size
>> is correct, and there is no need to consider some concurrency issues (such
>> as lru_gen_del_folio()).
>>
>> To prepare for the above work, this commit implements the specific
>> functions, which will be used during reparenting.
>>
>> Suggested-by: Harry Yoo <harry.yoo@oracle.com>
>> Suggested-by: Imran Khan <imran.f.khan@oracle.com>
>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>> ---
>> include/linux/mmzone.h | 16 ++++++++
>> mm/vmscan.c | 86 ++++++++++++++++++++++++++++++++++++++++++
>> 2 files changed, 102 insertions(+)
>>
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index 7aa8e1472d10d..3ee7fb96b8aeb 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -4468,6 +4468,92 @@ void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid)
>> lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
>> }
>>
>> +bool recheck_lru_gen_max_memcg(struct mem_cgroup *memcg)
>> +{
>> + int nid;
>> +
>> + for_each_node(nid) {
>> + struct lruvec *lruvec = get_lruvec(memcg, nid);
>> + int type;
>> +
>> + for (type = 0; type < ANON_AND_FILE; type++) {
>> + if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
>> + return false;
>> + }
>> + }
>> +
>> + return true;
>> +}
>> +
>> +/*
>> + * We need to ensure that the folios of child memcg can be reparented to the
>> + * same gen of the parent memcg, so the gens of the parent memcg needed be
>> + * incremented to the MAX_NR_GENS before reparenting.
>> + */
>> +void max_lru_gen_memcg(struct mem_cgroup *memcg)
>> +{
>> + int nid;
>> +
>> + for_each_node(nid) {
>> + struct lruvec *lruvec = get_lruvec(memcg, nid);
>> + int type;
>> +
>
> I was testing this series and observed two warnings...
>
>> + for (type = 0; type < ANON_AND_FILE; type++) {
>> + while (get_nr_gens(lruvec, type) < MAX_NR_GENS) {
>> + DEFINE_MAX_SEQ(lruvec);
>> +
>> + inc_max_seq(lruvec, max_seq, mem_cgroup_swappiness(memcg));
>> + cond_resched();
>
> Warning 1) Here we increment max_seq but we skip updating mm_state->seq.
> (try_to_inc_max_seq() iterates the mm list and update mm_state->seq after
> an iteration, but since we directly call inc_max_seq(), we don't update it)
>
> When mm_state->seq is more than one generation behind walk->seq, a warning is
> triggered in iterate_mm_list():
>
> VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq);
>
> Warning 2) In try_to_inc_max_seq(), the last walker of mm list
> is supposed to succeed to increment max_seq by calling inc_max_seq():
>
> if (success) {
> success = inc_max_seq(lruvec, seq, swappiness);
> WARN_ON_ONCE(!success);
> }
>
> But with this patch it may observe the max_seq is already advanced due to
> reparenting and thus inc_max_seq() returns false, triggering the warning.
Got it. Thanks for testing and reporting!
>
> I'm learning MGLRU internals to see whether we can simply remove the warnings
> or if we need to do something to advance max_seq without actually iterating
> over the mm list.
Thanks! I will also check on this.
>
On Tue, Oct 28, 2025 at 09:58:36PM +0800, Qi Zheng wrote:
> From: Qi Zheng <zhengqi.arch@bytedance.com>
>
> Similar to traditional LRU folios, in order to solve the dying memcg
> problem, we also need to reparenting MGLRU folios to the parent memcg when
> memcg offline.
>
> However, there are the following challenges:
>
> 1. Each lruvec has between MIN_NR_GENS and MAX_NR_GENS generations, the
> number of generations of the parent and child memcg may be different,
> so we cannot simply transfer MGLRU folios in the child memcg to the
> parent memcg as we did for traditional LRU folios.
> 2. The generation information is stored in folio->flags, but we cannot
> traverse these folios while holding the lru lock, otherwise it may
> cause softlockup.
> 3. In walk_update_folio(), the gen of folio and corresponding lru size
> may be updated, but the folio is not immediately moved to the
> corresponding lru list. Therefore, there may be folios of different
> generations on an LRU list.
> 4. In lru_gen_del_folio(), the generation to which the folio belongs is
> found based on the generation information in folio->flags, and the
> corresponding LRU size will be updated. Therefore, we need to update
> the lru size correctly during reparenting, otherwise the lru size may
> be updated incorrectly in lru_gen_del_folio().
>
> Finally, this patch chose a compromise method, which is to splice the lru
> list in the child memcg to the lru list of the same generation in the
> parent memcg during reparenting. And in order to ensure that the parent
> memcg has the same generation, we need to increase the generations in the
> parent memcg to the MAX_NR_GENS before reparenting.
>
> Of course, the same generation has different meanings in the parent and
> child memcg, this will cause confusion in the hot and cold information of
> folios. But other than that, this method is simple enough, the lru size
> is correct, and there is no need to consider some concurrency issues (such
> as lru_gen_del_folio()).
>
> To prepare for the above work, this commit implements the specific
> functions, which will be used during reparenting.
>
> Suggested-by: Harry Yoo <harry.yoo@oracle.com>
> Suggested-by: Imran Khan <imran.f.khan@oracle.com>
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> ---
> include/linux/mmzone.h | 16 ++++++++
> mm/vmscan.c | 86 ++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 102 insertions(+)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 7aa8e1472d10d..3ee7fb96b8aeb 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -4468,6 +4468,92 @@ void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid)
> lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
> }
>
> +bool recheck_lru_gen_max_memcg(struct mem_cgroup *memcg)
> +{
> + int nid;
> +
> + for_each_node(nid) {
> + struct lruvec *lruvec = get_lruvec(memcg, nid);
> + int type;
> +
> + for (type = 0; type < ANON_AND_FILE; type++) {
> + if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
> + return false;
> + }
> + }
> +
> + return true;
> +}
> +
> +/*
> + * We need to ensure that the folios of child memcg can be reparented to the
> + * same gen of the parent memcg, so the gens of the parent memcg needed be
> + * incremented to the MAX_NR_GENS before reparenting.
> + */
> +void max_lru_gen_memcg(struct mem_cgroup *memcg)
> +{
> + int nid;
> +
> + for_each_node(nid) {
> + struct lruvec *lruvec = get_lruvec(memcg, nid);
> + int type;
> +
> + for (type = 0; type < ANON_AND_FILE; type++) {
> + while (get_nr_gens(lruvec, type) < MAX_NR_GENS) {
> + DEFINE_MAX_SEQ(lruvec);
> +
> + inc_max_seq(lruvec, max_seq, mem_cgroup_swappiness(memcg));
> + cond_resched();
> + }
To the best of my knowledge this looks functionally correct.
> + }
> + }
> +}
> +
> +static void __lru_gen_reparent_memcg(struct lruvec *src_lruvec, struct lruvec *dst_lruvec,
> + int zone, int type)
> +{
> + struct lru_gen_folio *src_lrugen, *dst_lrugen;
> + enum lru_list lru = type * LRU_INACTIVE_FILE;
> + int i;
> +
> + src_lrugen = &src_lruvec->lrugen;
> + dst_lrugen = &dst_lruvec->lrugen;
> +
> + for (i = 0; i < get_nr_gens(src_lruvec, type); i++) {
> + int gen = lru_gen_from_seq(src_lrugen->max_seq - i);
> + int nr_pages = src_lrugen->nr_pages[gen][type][zone];
nr_pages should be of long type, since nothing prevents us from reparenting
more than 2 billion pages :)
Otherwise looks correct to me.
--
Cheers,
Harry / Hyeonggon
> + int src_lru_active = lru_gen_is_active(src_lruvec, gen) ? LRU_ACTIVE : 0;
> + int dst_lru_active = lru_gen_is_active(dst_lruvec, gen) ? LRU_ACTIVE : 0;
> +
> + list_splice_tail_init(&src_lrugen->folios[gen][type][zone],
> + &dst_lrugen->folios[gen][type][zone]);
> +
> + WRITE_ONCE(src_lrugen->nr_pages[gen][type][zone], 0);
> + WRITE_ONCE(dst_lrugen->nr_pages[gen][type][zone],
> + dst_lrugen->nr_pages[gen][type][zone] + nr_pages);
> +
> + __update_lru_size(src_lruvec, lru + src_lru_active, zone, -nr_pages);
> + __update_lru_size(dst_lruvec, lru + dst_lru_active, zone, nr_pages);
> + }
> +}
> +
> +void lru_gen_reparent_memcg(struct mem_cgroup *src, struct mem_cgroup *dst)
> +{
> + int nid;
> +
> + for_each_node(nid) {
> + struct lruvec *src_lruvec, *dst_lruvec;
> + int type, zone;
> +
> + src_lruvec = get_lruvec(src, nid);
> + dst_lruvec = get_lruvec(dst, nid);
> +
> + for (zone = 0; zone < MAX_NR_ZONES; zone++)
> + for (type = 0; type < ANON_AND_FILE; type++)
> + __lru_gen_reparent_memcg(src_lruvec, dst_lruvec, zone, type);
> + }
> +}
> +
> #endif /* CONFIG_MEMCG */
>
> /******************************************************************************
> --
> 2.20.1
>
On 11/25/25 5:55 PM, Harry Yoo wrote:
> On Tue, Oct 28, 2025 at 09:58:36PM +0800, Qi Zheng wrote:
>> From: Qi Zheng <zhengqi.arch@bytedance.com>
>>
>> Similar to traditional LRU folios, in order to solve the dying memcg
>> problem, we also need to reparenting MGLRU folios to the parent memcg when
>> memcg offline.
>>
>> However, there are the following challenges:
>>
>> 1. Each lruvec has between MIN_NR_GENS and MAX_NR_GENS generations, the
>> number of generations of the parent and child memcg may be different,
>> so we cannot simply transfer MGLRU folios in the child memcg to the
>> parent memcg as we did for traditional LRU folios.
>> 2. The generation information is stored in folio->flags, but we cannot
>> traverse these folios while holding the lru lock, otherwise it may
>> cause softlockup.
>> 3. In walk_update_folio(), the gen of folio and corresponding lru size
>> may be updated, but the folio is not immediately moved to the
>> corresponding lru list. Therefore, there may be folios of different
>> generations on an LRU list.
>> 4. In lru_gen_del_folio(), the generation to which the folio belongs is
>> found based on the generation information in folio->flags, and the
>> corresponding LRU size will be updated. Therefore, we need to update
>> the lru size correctly during reparenting, otherwise the lru size may
>> be updated incorrectly in lru_gen_del_folio().
>>
>> Finally, this patch chose a compromise method, which is to splice the lru
>> list in the child memcg to the lru list of the same generation in the
>> parent memcg during reparenting. And in order to ensure that the parent
>> memcg has the same generation, we need to increase the generations in the
>> parent memcg to the MAX_NR_GENS before reparenting.
>>
>> Of course, the same generation has different meanings in the parent and
>> child memcg, this will cause confusion in the hot and cold information of
>> folios. But other than that, this method is simple enough, the lru size
>> is correct, and there is no need to consider some concurrency issues (such
>> as lru_gen_del_folio()).
>>
>> To prepare for the above work, this commit implements the specific
>> functions, which will be used during reparenting.
>>
>> Suggested-by: Harry Yoo <harry.yoo@oracle.com>
>> Suggested-by: Imran Khan <imran.f.khan@oracle.com>
>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>> ---
>> include/linux/mmzone.h | 16 ++++++++
>> mm/vmscan.c | 86 ++++++++++++++++++++++++++++++++++++++++++
>> 2 files changed, 102 insertions(+)
>>
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index 7aa8e1472d10d..3ee7fb96b8aeb 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -4468,6 +4468,92 @@ void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid)
>> lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
>> }
>>
>> +bool recheck_lru_gen_max_memcg(struct mem_cgroup *memcg)
>> +{
>> + int nid;
>> +
>> + for_each_node(nid) {
>> + struct lruvec *lruvec = get_lruvec(memcg, nid);
>> + int type;
>> +
>> + for (type = 0; type < ANON_AND_FILE; type++) {
>> + if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
>> + return false;
>> + }
>> + }
>> +
>> + return true;
>> +}
>> +
>> +/*
>> + * We need to ensure that the folios of child memcg can be reparented to the
>> + * same gen of the parent memcg, so the gens of the parent memcg needed be
>> + * incremented to the MAX_NR_GENS before reparenting.
>> + */
>> +void max_lru_gen_memcg(struct mem_cgroup *memcg)
>> +{
>> + int nid;
>> +
>> + for_each_node(nid) {
>> + struct lruvec *lruvec = get_lruvec(memcg, nid);
>> + int type;
>> +
>> + for (type = 0; type < ANON_AND_FILE; type++) {
>> + while (get_nr_gens(lruvec, type) < MAX_NR_GENS) {
>> + DEFINE_MAX_SEQ(lruvec);
>> +
>> + inc_max_seq(lruvec, max_seq, mem_cgroup_swappiness(memcg));
>> + cond_resched();
>> + }
>
> To best of my knowledge this looks functionally correct.
>
>> + }
>> + }
>> +}
>> +
>> +static void __lru_gen_reparent_memcg(struct lruvec *src_lruvec, struct lruvec *dst_lruvec,
>> + int zone, int type)
>> +{
>> + struct lru_gen_folio *src_lrugen, *dst_lrugen;
>> + enum lru_list lru = type * LRU_INACTIVE_FILE;
>> + int i;
>> +
>> + src_lrugen = &src_lruvec->lrugen;
>> + dst_lrugen = &dst_lruvec->lrugen;
>> +
>> + for (i = 0; i < get_nr_gens(src_lruvec, type); i++) {
>> + int gen = lru_gen_from_seq(src_lrugen->max_seq - i);
>> + int nr_pages = src_lrugen->nr_pages[gen][type][zone];
>
> nr_pages should be long type since nothing prevents us from reparenting
> more than 2 billions of pages :)
Right. lru_gen_folio.nr_pages is of long type; I don't know how I ended up
declaring it as an int here.
Will fix it in the next version.
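i.e., something like the following (for reference only, the actual change
will be in the next version):

-		int nr_pages = src_lrugen->nr_pages[gen][type][zone];
+		long nr_pages = src_lrugen->nr_pages[gen][type][zone];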
>
> Otherwise looks correct to me.
Thanks!
>