[RFC PATCH -next 1/2] mm/mglru: use mem_cgroup_iter for global reclaim

Posted by Chen Ridong 2 weeks, 1 day ago
From: Chen Ridong <chenridong@huawei.com>

The memcg LRU was originally introduced for global reclaim to enhance
scalability. However, its implementation complexity has led to performance
regressions when dealing with a large number of memory cgroups [1].

As suggested by Johannes [1], this patch adopts mem_cgroup_iter with
cookie-based iteration for global reclaim, aligning with the approach
already used in shrink_node_memcgs. This simplification removes the
dedicated memcg LRU tracking while maintaining the core functionality.

A stress test based on Yu Zhao's methodology [2] was performed on a
1 TB, 4-node NUMA system. The results are summarized below:

					memcg LRU    memcg iter
stddev(pgsteal) / mean(pgsteal)            91.2%         75.7%
sum(pgsteal) / sum(requested)             216.4%        230.5%

The new implementation demonstrates a significant improvement in
fairness, reducing the standard deviation relative to the mean by
15.5 percentage points, while reclaim accuracy shows a slight
increase in overscan (from 85086871 to 90633890, 6.5%).

The primary benefits of this change are:
1. Simplified codebase by removing custom memcg LRU infrastructure
2. Improved fairness in memory reclaim across multiple cgroups
3. Better performance when creating many memory cgroups

[1] https://lore.kernel.org/r/20251126171513.GC135004@cmpxchg.org
[2] https://lore.kernel.org/r/20221222041905.2431096-7-yuzhao@google.com
Signed-off-by: Chen Ridong <chenridong@huawei.com>
---
 mm/vmscan.c | 117 ++++++++++++++++------------------------------------
 1 file changed, 36 insertions(+), 81 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index fddd168a9737..70b0e7e5393c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4895,27 +4895,14 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 	return nr_to_scan < 0;
 }
 
-static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
+static void shrink_one(struct lruvec *lruvec, struct scan_control *sc)
 {
-	bool success;
 	unsigned long scanned = sc->nr_scanned;
 	unsigned long reclaimed = sc->nr_reclaimed;
-	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 
-	/* lru_gen_age_node() called mem_cgroup_calculate_protection() */
-	if (mem_cgroup_below_min(NULL, memcg))
-		return MEMCG_LRU_YOUNG;
-
-	if (mem_cgroup_below_low(NULL, memcg)) {
-		/* see the comment on MEMCG_NR_GENS */
-		if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL)
-			return MEMCG_LRU_TAIL;
-
-		memcg_memory_event(memcg, MEMCG_LOW);
-	}
-
-	success = try_to_shrink_lruvec(lruvec, sc);
+	try_to_shrink_lruvec(lruvec, sc);
 
 	shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);
 
@@ -4924,86 +4911,55 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
 			   sc->nr_reclaimed - reclaimed);
 
 	flush_reclaim_state(sc);
-
-	if (success && mem_cgroup_online(memcg))
-		return MEMCG_LRU_YOUNG;
-
-	if (!success && lruvec_is_sizable(lruvec, sc))
-		return 0;
-
-	/* one retry if offlined or too small */
-	return READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL ?
-	       MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
 }
 
 static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
 {
-	int op;
-	int gen;
-	int bin;
-	int first_bin;
-	struct lruvec *lruvec;
-	struct lru_gen_folio *lrugen;
+	struct mem_cgroup *target = sc->target_mem_cgroup;
+	struct mem_cgroup_reclaim_cookie reclaim = {
+		.pgdat = pgdat,
+	};
+	struct mem_cgroup_reclaim_cookie *cookie = &reclaim;
 	struct mem_cgroup *memcg;
-	struct hlist_nulls_node *pos;
 
-	gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));
-	bin = first_bin = get_random_u32_below(MEMCG_NR_BINS);
-restart:
-	op = 0;
-	memcg = NULL;
-
-	rcu_read_lock();
+	if (current_is_kswapd() || sc->memcg_full_walk)
+		cookie = NULL;
 
-	hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) {
-		if (op) {
-			lru_gen_rotate_memcg(lruvec, op);
-			op = 0;
-		}
+	memcg = mem_cgroup_iter(target, NULL, cookie);
+	while (memcg) {
+		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
 
-		mem_cgroup_put(memcg);
-		memcg = NULL;
+		cond_resched();
 
-		if (gen != READ_ONCE(lrugen->gen))
-			continue;
+		mem_cgroup_calculate_protection(target, memcg);
 
-		lruvec = container_of(lrugen, struct lruvec, lrugen);
-		memcg = lruvec_memcg(lruvec);
+		if (mem_cgroup_below_min(target, memcg))
+			goto next;
 
-		if (!mem_cgroup_tryget(memcg)) {
-			lru_gen_release_memcg(memcg);
-			memcg = NULL;
-			continue;
+		if (mem_cgroup_below_low(target, memcg)) {
+			if (!sc->memcg_low_reclaim) {
+				sc->memcg_low_skipped = 1;
+				goto next;
+			}
+			memcg_memory_event(memcg, MEMCG_LOW);
 		}
 
-		rcu_read_unlock();
+		shrink_one(lruvec, sc);
 
-		op = shrink_one(lruvec, sc);
-
-		rcu_read_lock();
-
-		if (should_abort_scan(lruvec, sc))
+		if (should_abort_scan(lruvec, sc)) {
+			if (cookie)
+				mem_cgroup_iter_break(target, memcg);
 			break;
-	}
-
-	rcu_read_unlock();
-
-	if (op)
-		lru_gen_rotate_memcg(lruvec, op);
-
-	mem_cgroup_put(memcg);
-
-	if (!is_a_nulls(pos))
-		return;
+		}
 
-	/* restart if raced with lru_gen_rotate_memcg() */
-	if (gen != get_nulls_value(pos))
-		goto restart;
+next:
+		if (cookie && sc->nr_reclaimed >= sc->nr_to_reclaim) {
+			mem_cgroup_iter_break(target, memcg);
+			break;
+		}
 
-	/* try the rest of the bins of the current generation */
-	bin = get_memcg_bin(bin + 1);
-	if (bin != first_bin)
-		goto restart;
+		memcg = mem_cgroup_iter(target, memcg, cookie);
+	}
 }
 
 static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
@@ -5019,8 +4975,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
 
 	set_mm_walk(NULL, sc->proactive);
 
-	if (try_to_shrink_lruvec(lruvec, sc))
-		lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG);
+	try_to_shrink_lruvec(lruvec, sc);
 
 	clear_mm_walk();
 
-- 
2.34.1
Re: [RFC PATCH -next 1/2] mm/mglru: use mem_cgroup_iter for global reclaim
Posted by Shakeel Butt 2 weeks ago
Hi Chen,

On Thu, Dec 04, 2025 at 12:31:23PM +0000, Chen Ridong wrote:
> [...]

Thanks a lot for this awesome work.

> ---
>  mm/vmscan.c | 117 ++++++++++++++++------------------------------------
>  1 file changed, 36 insertions(+), 81 deletions(-)
> 
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index fddd168a9737..70b0e7e5393c 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -4895,27 +4895,14 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
>  	return nr_to_scan < 0;
>  }
>  
> -static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
> +static void shrink_one(struct lruvec *lruvec, struct scan_control *sc)
>  {
> -	bool success;
>  	unsigned long scanned = sc->nr_scanned;
>  	unsigned long reclaimed = sc->nr_reclaimed;
> -	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
>  	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
> +	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
>  
> -	/* lru_gen_age_node() called mem_cgroup_calculate_protection() */
> -	if (mem_cgroup_below_min(NULL, memcg))
> -		return MEMCG_LRU_YOUNG;
> -
> -	if (mem_cgroup_below_low(NULL, memcg)) {
> -		/* see the comment on MEMCG_NR_GENS */
> -		if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL)
> -			return MEMCG_LRU_TAIL;
> -
> -		memcg_memory_event(memcg, MEMCG_LOW);
> -	}
> -
> -	success = try_to_shrink_lruvec(lruvec, sc);
> +	try_to_shrink_lruvec(lruvec, sc);
>  
>  	shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);
>  
> @@ -4924,86 +4911,55 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
>  			   sc->nr_reclaimed - reclaimed);
>  
>  	flush_reclaim_state(sc);

Unrelated to your patch, but why is this flush_reclaim_state() at a
different place from the non-MGLRU code path?

> -
> -	if (success && mem_cgroup_online(memcg))
> -		return MEMCG_LRU_YOUNG;
> -
> -	if (!success && lruvec_is_sizable(lruvec, sc))
> -		return 0;
> -
> -	/* one retry if offlined or too small */
> -	return READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL ?
> -	       MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
>  }
>  
>  static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)

This function has become very similar to the shrink_node_memcgs()
function, other than shrink_one() vs shrink_lruvec(). Can you try to combine
them and see if it looks not-ugly? Otherwise the code looks good to me.
Re: [RFC PATCH -next 1/2] mm/mglru: use mem_cgroup_iter for global reclaim
Posted by Chen Ridong 1 week, 4 days ago

On 2025/12/5 6:29, Shakeel Butt wrote:
> Hi Chen,
> 
> On Thu, Dec 04, 2025 at 12:31:23PM +0000, Chen Ridong wrote:
>> [...]
> 
> Thanks a lot for this awesome work.
> 
>> ---
>>  mm/vmscan.c | 117 ++++++++++++++++------------------------------------
>>  1 file changed, 36 insertions(+), 81 deletions(-)
>>
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index fddd168a9737..70b0e7e5393c 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -4895,27 +4895,14 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
>>  	return nr_to_scan < 0;
>>  }
>>  
>> -static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
>> +static void shrink_one(struct lruvec *lruvec, struct scan_control *sc)
>>  {
>> -	bool success;
>>  	unsigned long scanned = sc->nr_scanned;
>>  	unsigned long reclaimed = sc->nr_reclaimed;
>> -	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
>>  	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
>> +	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
>>  
>> -	/* lru_gen_age_node() called mem_cgroup_calculate_protection() */
>> -	if (mem_cgroup_below_min(NULL, memcg))
>> -		return MEMCG_LRU_YOUNG;
>> -
>> -	if (mem_cgroup_below_low(NULL, memcg)) {
>> -		/* see the comment on MEMCG_NR_GENS */
>> -		if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL)
>> -			return MEMCG_LRU_TAIL;
>> -
>> -		memcg_memory_event(memcg, MEMCG_LOW);
>> -	}
>> -
>> -	success = try_to_shrink_lruvec(lruvec, sc);
>> +	try_to_shrink_lruvec(lruvec, sc);
>>  
>>  	shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);
>>  
>> @@ -4924,86 +4911,55 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
>>  			   sc->nr_reclaimed - reclaimed);
>>  
>>  	flush_reclaim_state(sc);
> 
> Unrelated to your patch, but why is this flush_reclaim_state() at a
> different place from the non-MGLRU code path?
> 

Thank you, Shakeel, for your reply.

IIUC, adding flush_reclaim_state() here makes sense. Currently, shrink_one() is only used for
root-level reclaim in MGLRU, and flush_reclaim_state() is only relevant during root reclaim.
Flushing after each lruvec is shrunk can help the reclaim loop terminate earlier, since
sc->nr_reclaimed += current->reclaim_state->reclaimed may reach nr_to_reclaim sooner.

That said, I'm also wondering whether we should apply flush_reclaim_state() for every iteration in
non-MGLRU reclaim as well. For non-root reclaim, it should be negligible since it effectively does
nothing. But for root-level reclaim under non-MGLRU, it might similarly help stop the iteration earlier.
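
For reference, flush_reclaim_state() is roughly the following (a
simplified sketch of the mm/vmscan.c code, from memory):

static void flush_reclaim_state(struct scan_control *sc)
{
	/*
	 * Fold pages freed outside of vmscan proper (e.g. slab) into
	 * the reclaim totals; only root reclaim takes credit for them.
	 */
	if (current->reclaim_state && root_reclaim(sc)) {
		sc->nr_reclaimed += current->reclaim_state->reclaimed;
		current->reclaim_state->reclaimed = 0;
	}
}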

>> -
>> -	if (success && mem_cgroup_online(memcg))
>> -		return MEMCG_LRU_YOUNG;
>> -
>> -	if (!success && lruvec_is_sizable(lruvec, sc))
>> -		return 0;
>> -
>> -	/* one retry if offlined or too small */
>> -	return READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL ?
>> -	       MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
>>  }
>>  
>>  static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
> 
> This function has become very similar to the shrink_node_memcgs()
> function, other than shrink_one() vs shrink_lruvec(). Can you try to combine
> them and see if it looks not-ugly? Otherwise the code looks good to me.
> 

Will try to.
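
A very rough sketch of the direction (hypothetical and untested; the
memcg low/min protection details and the abort/stop checks are elided
for brevity):

/*
 * Hypothetical helper: the common cookie-based walk, with the
 * per-memcg shrinker passed in by shrink_many() (shrink_one) and
 * shrink_node_memcgs() (shrink_lruvec).
 */
static void walk_node_memcgs(struct pglist_data *pgdat,
			     struct scan_control *sc,
			     void (*shrink)(struct lruvec *lruvec,
					    struct scan_control *sc))
{
	struct mem_cgroup *target = sc->target_mem_cgroup;
	struct mem_cgroup_reclaim_cookie reclaim = {
		.pgdat = pgdat,
	};
	struct mem_cgroup_reclaim_cookie *cookie = &reclaim;
	struct mem_cgroup *memcg;

	if (current_is_kswapd() || sc->memcg_full_walk)
		cookie = NULL;

	memcg = mem_cgroup_iter(target, NULL, cookie);
	while (memcg) {
		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);

		cond_resched();

		mem_cgroup_calculate_protection(target, memcg);

		if (!mem_cgroup_below_min(target, memcg))
			shrink(lruvec, sc);

		memcg = mem_cgroup_iter(target, memcg, cookie);
	}
}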

-- 
Best regards,
Ridong
Re: [RFC PATCH -next 1/2] mm/mglru: use mem_cgroup_iter for global reclaim
Posted by Chen Ridong 1 week, 4 days ago

On 2025/12/5 6:29, Shakeel Butt wrote:
> Hi Chen,
> 
> On Thu, Dec 04, 2025 at 12:31:23PM +0000, Chen Ridong wrote:
>> [...]
> 
> Thanks a lot for this awesome work.
> 

Hello Shakeel and Johannes,

I apologize for the incorrect results I provided earlier. I initially used an AI tool to process the
data (I admit that was lazy of me; please forget that). When I re-ran the test, re-extracted the
refault data, and processed it again, I found that the AI tool had given me the wrong output.

I have now processed the data manually in Excel, and the correct results are:

pgsteal:
						memcg LRU    memcg iter
stddev(pgsteal) / mean(pgsteal)			106.03%		93.20%
sum(pgsteal) / sum(requested)			98.10%		99.28%

workingset_refault_anon:
						memcg LRU    memcg iter
stddev(refault) / mean(refault)			193.97%		134.67%
sum(refault)					1963229		2027567

I believe these final results are much better than the previous incorrect ones, especially since the
pgsteal ratio is now close to 100%, indicating we are not over-scanning. Additionally, refaults
increased by 64,338 (a 3.3% rise).
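
(For reference, the ratios above follow directly from the per-run
summaries below, e.g. stddev(pgsteal) / mean(pgsteal) for memcg LRU is
319518.5965 / 301349.25 ≈ 106.03%, and for memcg iter it is
284226.526 / 304975.4922 ≈ 93.20%.)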

Let me know if you have any questions.

----------------------------------------------------------------------

The original data (memcg LRU):
pgsteal:
SUM: 38572704 AVERAGE: 301349.25 STDEV: 319518.5965
refault:
SUM: 1963229 AVERAGE: 15337.72656 STDEV: 29750.03391

pgsteal	655392				workingset_refault_anon	17131
pgsteal	657308				workingset_refault_anon	24841
pgsteal	103777				workingset_refault_anon	430
pgsteal	103134				workingset_refault_anon	884
pgsteal	964772				workingset_refault_anon	117159
pgsteal	103462				workingset_refault_anon	539
pgsteal	102878				workingset_refault_anon	25
pgsteal	707851				workingset_refault_anon	30634
pgsteal	103925				workingset_refault_anon	497
pgsteal	103913				workingset_refault_anon	953
pgsteal	103020				workingset_refault_anon	110
pgsteal	102871				workingset_refault_anon	607
pgsteal	697775				workingset_refault_anon	21529
pgsteal	102944				workingset_refault_anon	57
pgsteal	103090				workingset_refault_anon	819
pgsteal	102988				workingset_refault_anon	583
pgsteal	102987				workingset_refault_anon	108
pgsteal	103093				workingset_refault_anon	17
pgsteal	778016				workingset_refault_anon	79000
pgsteal	102920				workingset_refault_anon	14
pgsteal	655447				workingset_refault_anon	9069
pgsteal	102869				workingset_refault_anon	6
pgsteal	699920				workingset_refault_anon	34409
pgsteal	103127				workingset_refault_anon	223
pgsteal	102876				workingset_refault_anon	646
pgsteal	103642				workingset_refault_anon	439
pgsteal	102881				workingset_refault_anon	110
pgsteal	863202				workingset_refault_anon	77605
pgsteal	651786				workingset_refault_anon	8322
pgsteal	102981				workingset_refault_anon	51
pgsteal	103380				workingset_refault_anon	877
pgsteal	706377				workingset_refault_anon	27729
pgsteal	103436				workingset_refault_anon	682
pgsteal	103839				workingset_refault_anon	336
pgsteal	103012				workingset_refault_anon	23
pgsteal	103476				workingset_refault_anon	729
pgsteal	102867				workingset_refault_anon	12
pgsteal	102914				workingset_refault_anon	122
pgsteal	102886				workingset_refault_anon	627
pgsteal	103736				workingset_refault_anon	514
pgsteal	102879				workingset_refault_anon	618
pgsteal	102860				workingset_refault_anon	3
pgsteal	102877				workingset_refault_anon	27
pgsteal	103255				workingset_refault_anon	384
pgsteal	982183				workingset_refault_anon	85362
pgsteal	102947				workingset_refault_anon	158
pgsteal	102880				workingset_refault_anon	651
pgsteal	973764				workingset_refault_anon	81542
pgsteal	923711				workingset_refault_anon	94596
pgsteal	102938				workingset_refault_anon	660
pgsteal	888882				workingset_refault_anon	69549
pgsteal	102868				workingset_refault_anon	14
pgsteal	103130				workingset_refault_anon	166
pgsteal	103388				workingset_refault_anon	467
pgsteal	102965				workingset_refault_anon	197
pgsteal	964699				workingset_refault_anon	74903
pgsteal	103263				workingset_refault_anon	373
pgsteal	103614				workingset_refault_anon	781
pgsteal	962228				workingset_refault_anon	72108
pgsteal	672174				workingset_refault_anon	19739
pgsteal	102920				workingset_refault_anon	19
pgsteal	670248				workingset_refault_anon	18411
pgsteal	102877				workingset_refault_anon	581
pgsteal	103758				workingset_refault_anon	871
pgsteal	102874				workingset_refault_anon	609
pgsteal	103075				workingset_refault_anon	274
pgsteal	103550				workingset_refault_anon	102
pgsteal	755180				workingset_refault_anon	44303
pgsteal	951252				workingset_refault_anon	84566
pgsteal	929144				workingset_refault_anon	99081
pgsteal	103207				workingset_refault_anon	30
pgsteal	103292				workingset_refault_anon	427
pgsteal	103271				workingset_refault_anon	332
pgsteal	102865				workingset_refault_anon	4
pgsteal	923280				workingset_refault_anon	72715
pgsteal	104682				workingset_refault_anon	372
pgsteal	102870				workingset_refault_anon	7
pgsteal	102902				workingset_refault_anon	661
pgsteal	103053				workingset_refault_anon	40
pgsteal	103685				workingset_refault_anon	540
pgsteal	103857				workingset_refault_anon	970
pgsteal	109210				workingset_refault_anon	2806
pgsteal	103627				workingset_refault_anon	319
pgsteal	104029				workingset_refault_anon	42
pgsteal	918361				workingset_refault_anon	90387
pgsteal	103489				workingset_refault_anon	626
pgsteal	103188				workingset_refault_anon	801
pgsteal	102875				workingset_refault_anon	11
pgsteal	102994				workingset_refault_anon	79
pgsteal	102910				workingset_refault_anon	43
pgsteal	102922				workingset_refault_anon	687
pgsteal	103941				workingset_refault_anon	1219
pgsteal	903622				workingset_refault_anon	113751
pgsteal	664357				workingset_refault_anon	27959
pgsteal	104947				workingset_refault_anon	11
pgsteal	701084				workingset_refault_anon	30665
pgsteal	650719				workingset_refault_anon	20810
pgsteal	641924				workingset_refault_anon	17137
pgsteal	933870				workingset_refault_anon	98393
pgsteal	633231				workingset_refault_anon	15924
pgsteal	102936				workingset_refault_anon	34
pgsteal	104020				workingset_refault_anon	781
pgsteal	104274				workingset_refault_anon	1841
pgsteal	621672				workingset_refault_anon	5891
pgsteal	103307				workingset_refault_anon	474
pgsteal	103386				workingset_refault_anon	27
pgsteal	103266				workingset_refault_anon	243
pgsteal	102896				workingset_refault_anon	15
pgsteal	103905				workingset_refault_anon	988
pgsteal	103104				workingset_refault_anon	304
pgsteal	104277				workingset_refault_anon	285
pgsteal	696374				workingset_refault_anon	24971
pgsteal	103009				workingset_refault_anon	775
pgsteal	103849				workingset_refault_anon	747
pgsteal	102867				workingset_refault_anon	9
pgsteal	700211				workingset_refault_anon	35289
pgsteal	102923				workingset_refault_anon	88
pgsteal	104139				workingset_refault_anon	789
pgsteal	105152				workingset_refault_anon	1257
pgsteal	102945				workingset_refault_anon	76
pgsteal	103227				workingset_refault_anon	343
pgsteal	102880				workingset_refault_anon	95
pgsteal	102967				workingset_refault_anon	101
pgsteal	989176				workingset_refault_anon	89597
pgsteal	694181				workingset_refault_anon	22499
pgsteal	784354				workingset_refault_anon	68311
pgsteal	102882				workingset_refault_anon	24
pgsteal	103108				workingset_refault_anon	24

-------------------------------------------------------------------

The original data (memcg iter):
pgsteal:
SUM: 39036863 AVERAGE: 304975.4922 STDEV: 284226.526
refault:
SUM: 2027567 AVERAGE: 15840.36719 STDEV: 21332.00262

pgsteal	103167				workingset_refault_anon	203
pgsteal	714044				workingset_refault_anon	42633
pgsteal	103209				workingset_refault_anon	581
pgsteal	103605				workingset_refault_anon	240
pgsteal	740909				workingset_refault_anon	53177
pgsteal	103089				workingset_refault_anon	141
pgsteal	726760				workingset_refault_anon	32624
pgsteal	104039				workingset_refault_anon	397
pgsteal	754667				workingset_refault_anon	56144
pgsteal	713916				workingset_refault_anon	41813
pgsteal	104104				workingset_refault_anon	307
pgsteal	109567				workingset_refault_anon	244
pgsteal	714194				workingset_refault_anon	47076
pgsteal	711693				workingset_refault_anon	35616
pgsteal	105026				workingset_refault_anon	2221
pgsteal	103442				workingset_refault_anon	269
pgsteal	112773				workingset_refault_anon	5086
pgsteal	715969				workingset_refault_anon	32457
pgsteal	127828				workingset_refault_anon	9579
pgsteal	102885				workingset_refault_anon	109
pgsteal	112156				workingset_refault_anon	2974
pgsteal	104242				workingset_refault_anon	948
pgsteal	701184				workingset_refault_anon	47940
pgsteal	104080				workingset_refault_anon	836
pgsteal	106606				workingset_refault_anon	2420
pgsteal	103666				workingset_refault_anon	129
pgsteal	103330				workingset_refault_anon	532
pgsteal	103639				workingset_refault_anon	275
pgsteal	108494				workingset_refault_anon	3814
pgsteal	103626				workingset_refault_anon	412
pgsteal	103697				workingset_refault_anon	577
pgsteal	103736				workingset_refault_anon	582
pgsteal	103360				workingset_refault_anon	281
pgsteal	116733				workingset_refault_anon	6674
pgsteal	102978				workingset_refault_anon	5
pgsteal	108945				workingset_refault_anon	3141
pgsteal	706630				workingset_refault_anon	33241
pgsteal	103426				workingset_refault_anon	134
pgsteal	715070				workingset_refault_anon	33575
pgsteal	102871				workingset_refault_anon	12
pgsteal	103617				workingset_refault_anon	776
pgsteal	767084				workingset_refault_anon	64710
pgsteal	104197				workingset_refault_anon	176
pgsteal	104488				workingset_refault_anon	1469
pgsteal	103253				workingset_refault_anon	228
pgsteal	702800				workingset_refault_anon	26424
pgsteal	107469				workingset_refault_anon	2838
pgsteal	104441				workingset_refault_anon	1562
pgsteal	123013				workingset_refault_anon	13117
pgsteal	737817				workingset_refault_anon	53330
pgsteal	103939				workingset_refault_anon	759
pgsteal	103568				workingset_refault_anon	783
pgsteal	122707				workingset_refault_anon	11944
pgsteal	103690				workingset_refault_anon	885
pgsteal	103456				workingset_refault_anon	145
pgsteal	104068				workingset_refault_anon	632
pgsteal	319368				workingset_refault_anon	12579
pgsteal	103912				workingset_refault_anon	304
pgsteal	119416				workingset_refault_anon	3350
pgsteal	717107				workingset_refault_anon	34764
pgsteal	107163				workingset_refault_anon	535
pgsteal	103299				workingset_refault_anon	142
pgsteal	103825				workingset_refault_anon	176
pgsteal	408564				workingset_refault_anon	14606
pgsteal	115785				workingset_refault_anon	4622
pgsteal	119234				workingset_refault_anon	9225
pgsteal	729060				workingset_refault_anon	54309
pgsteal	107149				workingset_refault_anon	536
pgsteal	708839				workingset_refault_anon	43133
pgsteal	695961				workingset_refault_anon	40182
pgsteal	723303				workingset_refault_anon	32298
pgsteal	103581				workingset_refault_anon	1305
pgsteal	699646				workingset_refault_anon	49924
pgsteal	717867				workingset_refault_anon	39229
pgsteal	104148				workingset_refault_anon	1318
pgsteal	104127				workingset_refault_anon	568
pgsteal	103168				workingset_refault_anon	322
pgsteal	103477				workingset_refault_anon	538
pgsteal	103022				workingset_refault_anon	60
pgsteal	103305				workingset_refault_anon	323
pgsteal	103812				workingset_refault_anon	1324
pgsteal	103139				workingset_refault_anon	126
pgsteal	723251				workingset_refault_anon	34206
pgsteal	103068				workingset_refault_anon	861
pgsteal	742515				workingset_refault_anon	54439
pgsteal	762161				workingset_refault_anon	52654
pgsteal	103934				workingset_refault_anon	889
pgsteal	104065				workingset_refault_anon	315
pgsteal	383893				workingset_refault_anon	25036
pgsteal	107929				workingset_refault_anon	2367
pgsteal	726127				workingset_refault_anon	45809
pgsteal	675291				workingset_refault_anon	66534
pgsteal	105585				workingset_refault_anon	2323
pgsteal	105098				workingset_refault_anon	1625
pgsteal	104264				workingset_refault_anon	718
pgsteal	741873				workingset_refault_anon	47045
pgsteal	103466				workingset_refault_anon	70
pgsteal	723870				workingset_refault_anon	58780
pgsteal	104740				workingset_refault_anon	521
pgsteal	740739				workingset_refault_anon	45099
pgsteal	752994				workingset_refault_anon	53713
pgsteal	110164				workingset_refault_anon	2572
pgsteal	711304				workingset_refault_anon	41135
pgsteal	746870				workingset_refault_anon	60298
pgsteal	729166				workingset_refault_anon	42594
pgsteal	110138				workingset_refault_anon	1511
pgsteal	103836				workingset_refault_anon	675
pgsteal	116821				workingset_refault_anon	3952
pgsteal	104967				workingset_refault_anon	2035
pgsteal	711362				workingset_refault_anon	31458
pgsteal	103835				workingset_refault_anon	507
pgsteal	113846				workingset_refault_anon	2997
pgsteal	104406				workingset_refault_anon	1724
pgsteal	103551				workingset_refault_anon	1293
pgsteal	705340				workingset_refault_anon	44234
pgsteal	728076				workingset_refault_anon	29849
pgsteal	103829				workingset_refault_anon	254
pgsteal	103700				workingset_refault_anon	712
pgsteal	103382				workingset_refault_anon	506
pgsteal	728881				workingset_refault_anon	60152
pgsteal	614645				workingset_refault_anon	43956
pgsteal	107672				workingset_refault_anon	2768
pgsteal	123550				workingset_refault_anon	11937
pgsteal	103747				workingset_refault_anon	899
pgsteal	747657				workingset_refault_anon	50264
pgsteal	110949				workingset_refault_anon	1422
pgsteal	103596				workingset_refault_anon	278
pgsteal	742471				workingset_refault_anon	69586

-- 
Best regards,
Ridong

Re: [RFC PATCH -next 1/2] mm/mglru: use mem_cgroup_iter for global reclaim
Posted by Johannes Weiner 2 weeks ago
On Thu, Dec 04, 2025 at 12:31:23PM +0000, Chen Ridong wrote:
> [...]

Acked-by: Johannes Weiner <hannes@cmpxchg.org>

The diff and the test results look good to me. Comparing the resulting
shrink_many() with shrink_node_memcgs(), this also looks like a great
step towards maintainability and unification.

Thanks!