[PATCH v3 25/30] mm: vmscan: prepare for reparenting traditional LRU folios

Qi Zheng posted 30 patches 3 weeks, 4 days ago
There is a newer version of this series
[PATCH v3 25/30] mm: vmscan: prepare for reparenting traditional LRU folios
Posted by Qi Zheng 3 weeks, 4 days ago
From: Qi Zheng <zhengqi.arch@bytedance.com>

To resolve the dying memcg issue, we need to reparent LRU folios of a child
memcg to its parent memcg. For the traditional LRU list, each lruvec of every
memcg comprises four LRU lists. Due to the symmetry of the LRU lists, it
is feasible to transfer the LRU lists from a memcg to its parent memcg
during the reparenting process.

This commit implements the corresponding function, lru_reparent_memcg(),
which will be used during the reparenting process.

Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
---
 include/linux/mmzone.h |  4 ++++
 include/linux/swap.h   | 19 +++++++++++++++++++
 mm/swap.c              | 37 +++++++++++++++++++++++++++++++++++++
 mm/vmscan.c            | 19 -------------------
 4 files changed, 60 insertions(+), 19 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 6a7db0fee54a3..1014b5a93c09c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -366,6 +366,10 @@ enum lruvec_flags {
 	LRUVEC_NODE_CONGESTED,
 };
 
+#ifdef CONFIG_MEMCG
+void lru_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent);
+#endif /* CONFIG_MEMCG */
+
 #endif /* !__GENERATING_BOUNDS_H */
 
 /*
diff --git a/include/linux/swap.h b/include/linux/swap.h
index e60f45b48e74d..4449d1f371a56 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -636,5 +636,24 @@ static inline bool mem_cgroup_swap_full(struct folio *folio)
 }
 #endif
 
+/* for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to
+ * and including the specified highidx
+ * @zone: The current zone in the iterator
+ * @pgdat: The pgdat which node_zones are being iterated
+ * @idx: The index variable
+ * @highidx: The index of the highest zone to return
+ *
+ * This macro iterates through all managed zones up to and including the specified highidx.
+ * The zone iterator enters an invalid state after macro call and must be reinitialized
+ * before it can be used again.
+ */
+#define for_each_managed_zone_pgdat(zone, pgdat, idx, highidx)	\
+	for ((idx) = 0, (zone) = (pgdat)->node_zones;		\
+	    (idx) <= (highidx);					\
+	    (idx)++, (zone)++)					\
+		if (!managed_zone(zone))			\
+			continue;				\
+		else
+
 #endif /* __KERNEL__*/
 #endif /* _LINUX_SWAP_H */
diff --git a/mm/swap.c b/mm/swap.c
index 7e53479ca1732..cb40e80da53cd 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1090,6 +1090,43 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch)
 	fbatch->nr = j;
 }
 
+#ifdef CONFIG_MEMCG
+static void lruvec_reparent_lru(struct lruvec *child_lruvec,
+				struct lruvec *parent_lruvec,
+				enum lru_list lru, int nid)
+{
+	int zid;
+	struct zone *zone;
+
+	if (lru != LRU_UNEVICTABLE)
+		list_splice_tail_init(&child_lruvec->lists[lru], &parent_lruvec->lists[lru]);
+
+	for_each_managed_zone_pgdat(zone, NODE_DATA(nid), zid, MAX_NR_ZONES - 1) {
+		unsigned long size = mem_cgroup_get_zone_lru_size(child_lruvec, lru, zid);
+
+		mem_cgroup_update_lru_size(parent_lruvec, lru, zid, size);
+	}
+}
+
+void lru_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent)
+{
+	int nid;
+
+	for_each_node(nid) {
+		enum lru_list lru;
+		struct lruvec *child_lruvec, *parent_lruvec;
+
+		child_lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+		parent_lruvec = mem_cgroup_lruvec(parent, NODE_DATA(nid));
+		parent_lruvec->anon_cost += child_lruvec->anon_cost;
+		parent_lruvec->file_cost += child_lruvec->file_cost;
+
+		for_each_lru(lru)
+			lruvec_reparent_lru(child_lruvec, parent_lruvec, lru, nid);
+	}
+}
+#endif
+
 static const struct ctl_table swap_sysctl_table[] = {
 	{
 		.procname	= "page-cluster",
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c48ff6e05e004..e738082874878 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -270,25 +270,6 @@ static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg)
 }
 #endif
 
-/* for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to
- * and including the specified highidx
- * @zone: The current zone in the iterator
- * @pgdat: The pgdat which node_zones are being iterated
- * @idx: The index variable
- * @highidx: The index of the highest zone to return
- *
- * This macro iterates through all managed zones up to and including the specified highidx.
- * The zone iterator enters an invalid state after macro call and must be reinitialized
- * before it can be used again.
- */
-#define for_each_managed_zone_pgdat(zone, pgdat, idx, highidx)	\
-	for ((idx) = 0, (zone) = (pgdat)->node_zones;		\
-	    (idx) <= (highidx);					\
-	    (idx)++, (zone)++)					\
-		if (!managed_zone(zone))			\
-			continue;				\
-		else
-
 static void set_task_reclaim_state(struct task_struct *task,
 				   struct reclaim_state *rs)
 {
-- 
2.20.1
Re: [PATCH v3 25/30] mm: vmscan: prepare for reparenting traditional LRU folios
Posted by Shakeel Butt 3 weeks, 1 day ago
On Wed, Jan 14, 2026 at 07:32:52PM +0800, Qi Zheng wrote:
> From: Qi Zheng <zhengqi.arch@bytedance.com>
> 
[...]
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -1090,6 +1090,43 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch)
>  	fbatch->nr = j;
>  }
>  

Why not define the following two functions in memcontrol.c?

> +#ifdef CONFIG_MEMCG
> +static void lruvec_reparent_lru(struct lruvec *child_lruvec,
> +				struct lruvec *parent_lruvec,
> +				enum lru_list lru, int nid)
> +{
> +	int zid;
> +	struct zone *zone;
> +
> +	if (lru != LRU_UNEVICTABLE)
> +		list_splice_tail_init(&child_lruvec->lists[lru], &parent_lruvec->lists[lru]);
> +
> +	for_each_managed_zone_pgdat(zone, NODE_DATA(nid), zid, MAX_NR_ZONES - 1) {
> +		unsigned long size = mem_cgroup_get_zone_lru_size(child_lruvec, lru, zid);
> +
> +		mem_cgroup_update_lru_size(parent_lruvec, lru, zid, size);
> +	}
> +}
> +
> +void lru_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent)
> +{
> +	int nid;
> +
> +	for_each_node(nid) {
> +		enum lru_list lru;
> +		struct lruvec *child_lruvec, *parent_lruvec;
> +
> +		child_lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
> +		parent_lruvec = mem_cgroup_lruvec(parent, NODE_DATA(nid));
> +		parent_lruvec->anon_cost += child_lruvec->anon_cost;
> +		parent_lruvec->file_cost += child_lruvec->file_cost;
> +
> +		for_each_lru(lru)
> +			lruvec_reparent_lru(child_lruvec, parent_lruvec, lru, nid);
> +	}
> +}
> +#endif
> +
Re: [PATCH v3 25/30] mm: vmscan: prepare for reparenting traditional LRU folios
Posted by Qi Zheng 3 weeks ago

On 1/18/26 9:11 AM, Shakeel Butt wrote:
> On Wed, Jan 14, 2026 at 07:32:52PM +0800, Qi Zheng wrote:
>> From: Qi Zheng <zhengqi.arch@bytedance.com>
>>
> [...]
>> --- a/mm/swap.c
>> +++ b/mm/swap.c
>> @@ -1090,6 +1090,43 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch)
>>   	fbatch->nr = j;
>>   }
>>   
> 
> Why not define the following two functions in memcontrol.c?

Ah, Because Johannes previously suggested [1] putting it in swap.c:

```
Lastly, vmscan.c is the reclaim policy. Mechanical LRU shuffling like
this is better placed in mm/swap.c.
```

[1]. https://lore.kernel.org/all/aUQCfdnoLQDLoVyg@cmpxchg.org/

> 
>> +#ifdef CONFIG_MEMCG
>> +static void lruvec_reparent_lru(struct lruvec *child_lruvec,
>> +				struct lruvec *parent_lruvec,
>> +				enum lru_list lru, int nid)
>> +{
>> +	int zid;
>> +	struct zone *zone;
>> +
>> +	if (lru != LRU_UNEVICTABLE)
>> +		list_splice_tail_init(&child_lruvec->lists[lru], &parent_lruvec->lists[lru]);
>> +
>> +	for_each_managed_zone_pgdat(zone, NODE_DATA(nid), zid, MAX_NR_ZONES - 1) {
>> +		unsigned long size = mem_cgroup_get_zone_lru_size(child_lruvec, lru, zid);
>> +
>> +		mem_cgroup_update_lru_size(parent_lruvec, lru, zid, size);
>> +	}
>> +}
>> +
>> +void lru_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent)
>> +{
>> +	int nid;
>> +
>> +	for_each_node(nid) {
>> +		enum lru_list lru;
>> +		struct lruvec *child_lruvec, *parent_lruvec;
>> +
>> +		child_lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
>> +		parent_lruvec = mem_cgroup_lruvec(parent, NODE_DATA(nid));
>> +		parent_lruvec->anon_cost += child_lruvec->anon_cost;
>> +		parent_lruvec->file_cost += child_lruvec->file_cost;
>> +
>> +		for_each_lru(lru)
>> +			lruvec_reparent_lru(child_lruvec, parent_lruvec, lru, nid);
>> +	}
>> +}
>> +#endif
>> +
> 
>
Re: [PATCH v3 25/30] mm: vmscan: prepare for reparenting traditional LRU folios
Posted by Muchun Song 3 weeks, 2 days ago

On 2026/1/14 19:32, Qi Zheng wrote:
> From: Qi Zheng <zhengqi.arch@bytedance.com>
>
> To resolve the dying memcg issue, we need to reparent LRU folios of child
> memcg to its parent memcg. For traditional LRU list, each lruvec of every
> memcg comprises four LRU lists. Due to the symmetry of the LRU lists, it
> is feasible to transfer the LRU lists from a memcg to its parent memcg
> during the reparenting process.
>
> This commit implements the specific function, which will be used during
> the reparenting process.
>
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
> Acked-by: Johannes Weiner <hannes@cmpxchg.org>

Acked-by: Muchun Song <muchun.song@linux.dev>

With one comment below.
> ---
>   include/linux/mmzone.h |  4 ++++
>   include/linux/swap.h   | 19 +++++++++++++++++++
>   mm/swap.c              | 37 +++++++++++++++++++++++++++++++++++++
>   mm/vmscan.c            | 19 -------------------
>   4 files changed, 60 insertions(+), 19 deletions(-)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 6a7db0fee54a3..1014b5a93c09c 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -366,6 +366,10 @@ enum lruvec_flags {
>   	LRUVEC_NODE_CONGESTED,
>   };
>   
> +#ifdef CONFIG_MEMCG
> +void lru_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent);
> +#endif /* CONFIG_MEMCG */

Can we move it to swap.h since it is declared in swap.c?

> +
>   #endif /* !__GENERATING_BOUNDS_H */
>   
>   /*
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index e60f45b48e74d..4449d1f371a56 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -636,5 +636,24 @@ static inline bool mem_cgroup_swap_full(struct folio *folio)
>   }
>   #endif
>   
> +/* for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to
> + * and including the specified highidx
> + * @zone: The current zone in the iterator
> + * @pgdat: The pgdat which node_zones are being iterated
> + * @idx: The index variable
> + * @highidx: The index of the highest zone to return
> + *
> + * This macro iterates through all managed zones up to and including the specified highidx.
> + * The zone iterator enters an invalid state after macro call and must be reinitialized
> + * before it can be used again.
> + */
> +#define for_each_managed_zone_pgdat(zone, pgdat, idx, highidx)	\
> +	for ((idx) = 0, (zone) = (pgdat)->node_zones;		\
> +	    (idx) <= (highidx);					\
> +	    (idx)++, (zone)++)					\
> +		if (!managed_zone(zone))			\
> +			continue;				\
> +		else
> +
>   #endif /* __KERNEL__*/
>   #endif /* _LINUX_SWAP_H */
> diff --git a/mm/swap.c b/mm/swap.c
> index 7e53479ca1732..cb40e80da53cd 100644
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -1090,6 +1090,43 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch)
>   	fbatch->nr = j;
>   }
>   
> +#ifdef CONFIG_MEMCG
> +static void lruvec_reparent_lru(struct lruvec *child_lruvec,
> +				struct lruvec *parent_lruvec,
> +				enum lru_list lru, int nid)
> +{
> +	int zid;
> +	struct zone *zone;
> +
> +	if (lru != LRU_UNEVICTABLE)
> +		list_splice_tail_init(&child_lruvec->lists[lru], &parent_lruvec->lists[lru]);
> +
> +	for_each_managed_zone_pgdat(zone, NODE_DATA(nid), zid, MAX_NR_ZONES - 1) {
> +		unsigned long size = mem_cgroup_get_zone_lru_size(child_lruvec, lru, zid);
> +
> +		mem_cgroup_update_lru_size(parent_lruvec, lru, zid, size);
> +	}
> +}
> +
> +void lru_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent)
> +{
> +	int nid;
> +
> +	for_each_node(nid) {
> +		enum lru_list lru;
> +		struct lruvec *child_lruvec, *parent_lruvec;
> +
> +		child_lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
> +		parent_lruvec = mem_cgroup_lruvec(parent, NODE_DATA(nid));
> +		parent_lruvec->anon_cost += child_lruvec->anon_cost;
> +		parent_lruvec->file_cost += child_lruvec->file_cost;
> +
> +		for_each_lru(lru)
> +			lruvec_reparent_lru(child_lruvec, parent_lruvec, lru, nid);
> +	}
> +}
> +#endif
> +
>   static const struct ctl_table swap_sysctl_table[] = {
>   	{
>   		.procname	= "page-cluster",
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index c48ff6e05e004..e738082874878 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -270,25 +270,6 @@ static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg)
>   }
>   #endif
>   
> -/* for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to
> - * and including the specified highidx
> - * @zone: The current zone in the iterator
> - * @pgdat: The pgdat which node_zones are being iterated
> - * @idx: The index variable
> - * @highidx: The index of the highest zone to return
> - *
> - * This macro iterates through all managed zones up to and including the specified highidx.
> - * The zone iterator enters an invalid state after macro call and must be reinitialized
> - * before it can be used again.
> - */
> -#define for_each_managed_zone_pgdat(zone, pgdat, idx, highidx)	\
> -	for ((idx) = 0, (zone) = (pgdat)->node_zones;		\
> -	    (idx) <= (highidx);					\
> -	    (idx)++, (zone)++)					\
> -		if (!managed_zone(zone))			\
> -			continue;				\
> -		else
> -
>   static void set_task_reclaim_state(struct task_struct *task,
>   				   struct reclaim_state *rs)
>   {