From: Chen Ridong <chenridong@huawei.com>
The memcg LRU was originally introduced for global reclaim to enhance
scalability. However, its implementation complexity has led to performance
regressions when dealing with a large number of memory cgroups [1].
As suggested by Johannes [1], this patch adopts mem_cgroup_iter with
cookie-based iteration for global reclaim, aligning with the approach
already used in shrink_node_memcgs. This simplification removes the
dedicated memcg LRU tracking while maintaining the core functionality.
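
For illustration, this is the shape of the cookie-based walk the patch
converts to (a simplified sketch only, where target, pgdat and sc stand for
the reclaim target cgroup, the node and the scan_control as in the diff
below; the actual change is in the diff):

	struct mem_cgroup_reclaim_cookie reclaim = { .pgdat = pgdat };
	struct mem_cgroup *memcg;

	memcg = mem_cgroup_iter(target, NULL, &reclaim);
	while (memcg) {
		/* reclaim from mem_cgroup_lruvec(memcg, pgdat) ... */

		if (sc->nr_reclaimed >= sc->nr_to_reclaim) {
			mem_cgroup_iter_break(target, memcg);
			break;
		}
		memcg = mem_cgroup_iter(target, memcg, &reclaim);
	}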
I performed a stress test based on Yu Zhao's methodology [2] on a
1 TB, 4-node NUMA system. The results are summarized below:
                                  memcg LRU    memcg iter
stddev(pgsteal) / mean(pgsteal)       91.2%         75.7%
sum(pgsteal) / sum(requested)        216.4%        230.5%
The new implementation demonstrates a significant improvement in
fairness, reducing the standard deviation relative to the mean by
15.5 percentage points, while reclaim accuracy shows a slight
increase in overscan (from 85086871 to 90633890, about 6.5%).
The primary benefits of this change are:
1. Simplified codebase by removing custom memcg LRU infrastructure
2. Improved fairness in memory reclaim across multiple cgroups
3. Better performance when creating many memory cgroups
[1] https://lore.kernel.org/r/20251126171513.GC135004@cmpxchg.org
[2] https://lore.kernel.org/r/20221222041905.2431096-7-yuzhao@google.com
Signed-off-by: Chen Ridong <chenridong@huawei.com>
---
mm/vmscan.c | 117 ++++++++++++++++------------------------------------
1 file changed, 36 insertions(+), 81 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index fddd168a9737..70b0e7e5393c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4895,27 +4895,14 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
return nr_to_scan < 0;
}
-static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
+static void shrink_one(struct lruvec *lruvec, struct scan_control *sc)
{
- bool success;
unsigned long scanned = sc->nr_scanned;
unsigned long reclaimed = sc->nr_reclaimed;
- struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
- /* lru_gen_age_node() called mem_cgroup_calculate_protection() */
- if (mem_cgroup_below_min(NULL, memcg))
- return MEMCG_LRU_YOUNG;
-
- if (mem_cgroup_below_low(NULL, memcg)) {
- /* see the comment on MEMCG_NR_GENS */
- if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL)
- return MEMCG_LRU_TAIL;
-
- memcg_memory_event(memcg, MEMCG_LOW);
- }
-
- success = try_to_shrink_lruvec(lruvec, sc);
+ try_to_shrink_lruvec(lruvec, sc);
shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);
@@ -4924,86 +4911,55 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
sc->nr_reclaimed - reclaimed);
flush_reclaim_state(sc);
-
- if (success && mem_cgroup_online(memcg))
- return MEMCG_LRU_YOUNG;
-
- if (!success && lruvec_is_sizable(lruvec, sc))
- return 0;
-
- /* one retry if offlined or too small */
- return READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL ?
- MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
}
static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
{
- int op;
- int gen;
- int bin;
- int first_bin;
- struct lruvec *lruvec;
- struct lru_gen_folio *lrugen;
+ struct mem_cgroup *target = sc->target_mem_cgroup;
+ struct mem_cgroup_reclaim_cookie reclaim = {
+ .pgdat = pgdat,
+ };
+ struct mem_cgroup_reclaim_cookie *cookie = &reclaim;
struct mem_cgroup *memcg;
- struct hlist_nulls_node *pos;
- gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));
- bin = first_bin = get_random_u32_below(MEMCG_NR_BINS);
-restart:
- op = 0;
- memcg = NULL;
-
- rcu_read_lock();
+ if (current_is_kswapd() || sc->memcg_full_walk)
+ cookie = NULL;
- hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) {
- if (op) {
- lru_gen_rotate_memcg(lruvec, op);
- op = 0;
- }
+ memcg = mem_cgroup_iter(target, NULL, cookie);
+ while (memcg) {
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
- mem_cgroup_put(memcg);
- memcg = NULL;
+ cond_resched();
- if (gen != READ_ONCE(lrugen->gen))
- continue;
+ mem_cgroup_calculate_protection(target, memcg);
- lruvec = container_of(lrugen, struct lruvec, lrugen);
- memcg = lruvec_memcg(lruvec);
+ if (mem_cgroup_below_min(target, memcg))
+ goto next;
- if (!mem_cgroup_tryget(memcg)) {
- lru_gen_release_memcg(memcg);
- memcg = NULL;
- continue;
+ if (mem_cgroup_below_low(target, memcg)) {
+ if (!sc->memcg_low_reclaim) {
+ sc->memcg_low_skipped = 1;
+ goto next;
+ }
+ memcg_memory_event(memcg, MEMCG_LOW);
}
- rcu_read_unlock();
+ shrink_one(lruvec, sc);
- op = shrink_one(lruvec, sc);
-
- rcu_read_lock();
-
- if (should_abort_scan(lruvec, sc))
+ if (should_abort_scan(lruvec, sc)) {
+ if (cookie)
+ mem_cgroup_iter_break(target, memcg);
break;
- }
-
- rcu_read_unlock();
-
- if (op)
- lru_gen_rotate_memcg(lruvec, op);
-
- mem_cgroup_put(memcg);
-
- if (!is_a_nulls(pos))
- return;
+ }
- /* restart if raced with lru_gen_rotate_memcg() */
- if (gen != get_nulls_value(pos))
- goto restart;
+next:
+ if (cookie && sc->nr_reclaimed >= sc->nr_to_reclaim) {
+ mem_cgroup_iter_break(target, memcg);
+ break;
+ }
- /* try the rest of the bins of the current generation */
- bin = get_memcg_bin(bin + 1);
- if (bin != first_bin)
- goto restart;
+ memcg = mem_cgroup_iter(target, memcg, cookie);
+ }
}
static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
@@ -5019,8 +4975,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
set_mm_walk(NULL, sc->proactive);
- if (try_to_shrink_lruvec(lruvec, sc))
- lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG);
+ try_to_shrink_lruvec(lruvec, sc);
clear_mm_walk();
--
2.34.1
Hi Chen,
On Thu, Dec 04, 2025 at 12:31:23PM +0000, Chen Ridong wrote:
> From: Chen Ridong <chenridong@huawei.com>
>
> The memcg LRU was originally introduced for global reclaim to enhance
> scalability. However, its implementation complexity has led to performance
> regressions when dealing with a large number of memory cgroups [1].
>
> As suggested by Johannes [1], this patch adopts mem_cgroup_iter with
> cookie-based iteration for global reclaim, aligning with the approach
> already used in shrink_node_memcgs. This simplification removes the
> dedicated memcg LRU tracking while maintaining the core functionality.
>
> I performed a stress test based on Yu Zhao's methodology [2] on a
> 1 TB, 4-node NUMA system. The results are summarized below:
>
>                                   memcg LRU    memcg iter
> stddev(pgsteal) / mean(pgsteal)       91.2%         75.7%
> sum(pgsteal) / sum(requested)        216.4%        230.5%
>
> The new implementation demonstrates a significant improvement in
> fairness, reducing the standard deviation relative to the mean by
> 15.5 percentage points, while reclaim accuracy shows a slight
> increase in overscan (from 85086871 to 90633890, about 6.5%).
>
> The primary benefits of this change are:
> 1. Simplified codebase by removing custom memcg LRU infrastructure
> 2. Improved fairness in memory reclaim across multiple cgroups
> 3. Better performance when creating many memory cgroups
>
> [1] https://lore.kernel.org/r/20251126171513.GC135004@cmpxchg.org
> [2] https://lore.kernel.org/r/20221222041905.2431096-7-yuzhao@google.com
> Signed-off-by: Chen Ridong <chenridong@huawei.com>
Thanks a lot for this awesome work.
> ---
> mm/vmscan.c | 117 ++++++++++++++++------------------------------------
> 1 file changed, 36 insertions(+), 81 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index fddd168a9737..70b0e7e5393c 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -4895,27 +4895,14 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
> return nr_to_scan < 0;
> }
>
> -static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
> +static void shrink_one(struct lruvec *lruvec, struct scan_control *sc)
> {
> - bool success;
> unsigned long scanned = sc->nr_scanned;
> unsigned long reclaimed = sc->nr_reclaimed;
> - struct mem_cgroup *memcg = lruvec_memcg(lruvec);
> struct pglist_data *pgdat = lruvec_pgdat(lruvec);
> + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
>
> - /* lru_gen_age_node() called mem_cgroup_calculate_protection() */
> - if (mem_cgroup_below_min(NULL, memcg))
> - return MEMCG_LRU_YOUNG;
> -
> - if (mem_cgroup_below_low(NULL, memcg)) {
> - /* see the comment on MEMCG_NR_GENS */
> - if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL)
> - return MEMCG_LRU_TAIL;
> -
> - memcg_memory_event(memcg, MEMCG_LOW);
> - }
> -
> - success = try_to_shrink_lruvec(lruvec, sc);
> + try_to_shrink_lruvec(lruvec, sc);
>
> shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);
>
> @@ -4924,86 +4911,55 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
> sc->nr_reclaimed - reclaimed);
>
> flush_reclaim_state(sc);
Unrelated to your patch, but why is this flush_reclaim_state() at a
different place than in the non-MGLRU code path?
> -
> - if (success && mem_cgroup_online(memcg))
> - return MEMCG_LRU_YOUNG;
> -
> - if (!success && lruvec_is_sizable(lruvec, sc))
> - return 0;
> -
> - /* one retry if offlined or too small */
> - return READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL ?
> - MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
> }
>
> static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
This function has kind of become very similar to shrink_node_memcgs(),
other than shrink_one() vs shrink_lruvec(). Can you try to combine
them and see if it looks not-ugly? Otherwise the code looks good to me.
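
(Purely to illustrate the idea, here is a rough hypothetical sketch of a
shared walker with the per-memcg work passed in as a callback; the name and
shape are made up, and the protection/abort/slab handling is omitted:

	static void walk_node_memcgs(struct pglist_data *pgdat,
				     struct scan_control *sc,
				     void (*shrink)(struct lruvec *lruvec,
						    struct scan_control *sc))
	{
		struct mem_cgroup *target = sc->target_mem_cgroup;
		struct mem_cgroup *memcg = mem_cgroup_iter(target, NULL, NULL);

		do {
			struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);

			/* min/low protection checks would go here */
			shrink(lruvec, sc);
		} while ((memcg = mem_cgroup_iter(target, memcg, NULL)));
	}

shrink_node_memcgs() would pass shrink_lruvec() and the MGLRU path would pass
shrink_one().)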
On 2025/12/5 6:29, Shakeel Butt wrote:
> Hi Chen,
>
> On Thu, Dec 04, 2025 at 12:31:23PM +0000, Chen Ridong wrote:
>> From: Chen Ridong <chenridong@huawei.com>
>>
>> The memcg LRU was originally introduced for global reclaim to enhance
>> scalability. However, its implementation complexity has led to performance
>> regressions when dealing with a large number of memory cgroups [1].
>>
>> As suggested by Johannes [1], this patch adopts mem_cgroup_iter with
>> cookie-based iteration for global reclaim, aligning with the approach
>> already used in shrink_node_memcgs. This simplification removes the
>> dedicated memcg LRU tracking while maintaining the core functionality.
>>
>> I performed a stress test based on Yu Zhao's methodology [2] on a
>> 1 TB, 4-node NUMA system. The results are summarized below:
>>
>>                                   memcg LRU    memcg iter
>> stddev(pgsteal) / mean(pgsteal)       91.2%         75.7%
>> sum(pgsteal) / sum(requested)        216.4%        230.5%
>>
>> The new implementation demonstrates a significant improvement in
>> fairness, reducing the standard deviation relative to the mean by
>> 15.5 percentage points, while reclaim accuracy shows a slight
>> increase in overscan (from 85086871 to 90633890, about 6.5%).
>>
>> The primary benefits of this change are:
>> 1. Simplified codebase by removing custom memcg LRU infrastructure
>> 2. Improved fairness in memory reclaim across multiple cgroups
>> 3. Better performance when creating many memory cgroups
>>
>> [1] https://lore.kernel.org/r/20251126171513.GC135004@cmpxchg.org
>> [2] https://lore.kernel.org/r/20221222041905.2431096-7-yuzhao@google.com
>> Signed-off-by: Chen Ridong <chenridong@huawei.com>
>
> Thanks a lot for this awesome work.
>
>> ---
>> mm/vmscan.c | 117 ++++++++++++++++------------------------------------
>> 1 file changed, 36 insertions(+), 81 deletions(-)
>>
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index fddd168a9737..70b0e7e5393c 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -4895,27 +4895,14 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
>> return nr_to_scan < 0;
>> }
>>
>> -static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
>> +static void shrink_one(struct lruvec *lruvec, struct scan_control *sc)
>> {
>> - bool success;
>> unsigned long scanned = sc->nr_scanned;
>> unsigned long reclaimed = sc->nr_reclaimed;
>> - struct mem_cgroup *memcg = lruvec_memcg(lruvec);
>> struct pglist_data *pgdat = lruvec_pgdat(lruvec);
>> + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
>>
>> - /* lru_gen_age_node() called mem_cgroup_calculate_protection() */
>> - if (mem_cgroup_below_min(NULL, memcg))
>> - return MEMCG_LRU_YOUNG;
>> -
>> - if (mem_cgroup_below_low(NULL, memcg)) {
>> - /* see the comment on MEMCG_NR_GENS */
>> - if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL)
>> - return MEMCG_LRU_TAIL;
>> -
>> - memcg_memory_event(memcg, MEMCG_LOW);
>> - }
>> -
>> - success = try_to_shrink_lruvec(lruvec, sc);
>> + try_to_shrink_lruvec(lruvec, sc);
>>
>> shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);
>>
>> @@ -4924,86 +4911,55 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
>> sc->nr_reclaimed - reclaimed);
>>
>> flush_reclaim_state(sc);
>
> Unrelated to your patch, but why is this flush_reclaim_state() at a
> different place than in the non-MGLRU code path?
>
Thank you, Shakeel, for your reply.

IIUC, I think adding flush_reclaim_state() here makes sense. Currently, shrink_one() is only used
for root-level reclaim in MGLRU, and flush_reclaim_state() is only relevant during root reclaim.
Flushing after each lruvec is shrunk can help the reclaim loop terminate earlier, since
sc->nr_reclaimed += current->reclaim_state->reclaimed; may reach nr_to_reclaim sooner.

That said, I'm also wondering whether we should call flush_reclaim_state() on every iteration in
non-MGLRU reclaim as well. For non-root reclaim it should be negligible, since it effectively does
nothing there. But for root-level reclaim under non-MGLRU, it might similarly help stop the
iteration earlier.
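
For context, roughly what that flush does (paraphrased from my reading of
mm/vmscan.c and abridged, so treat it as a sketch rather than the exact code):

	static void flush_reclaim_state(struct scan_control *sc)
	{
		/*
		 * reclaim_state->reclaimed tracks pages freed outside of
		 * vmscan proper (e.g. slab), and only root/global reclaim
		 * is allowed to credit them.
		 */
		if (current->reclaim_state && root_reclaim(sc)) {
			sc->nr_reclaimed += current->reclaim_state->reclaimed;
			current->reclaim_state->reclaimed = 0;
		}
	}

Crediting this after every shrink_one() call lets sc->nr_reclaimed reach
nr_to_reclaim (and abort the memcg walk) one iteration earlier than if it
were flushed only once at the end.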
>> -
>> - if (success && mem_cgroup_online(memcg))
>> - return MEMCG_LRU_YOUNG;
>> -
>> - if (!success && lruvec_is_sizable(lruvec, sc))
>> - return 0;
>> -
>> - /* one retry if offlined or too small */
>> - return READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL ?
>> - MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
>> }
>>
>> static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
>
> This function has kind of become very similar to shrink_node_memcgs(),
> other than shrink_one() vs shrink_lruvec(). Can you try to combine
> them and see if it looks not-ugly? Otherwise the code looks good to me.
>
Will try to.
--
Best regards,
Ridong
On 2025/12/5 6:29, Shakeel Butt wrote:
> Hi Chen,
>
> On Thu, Dec 04, 2025 at 12:31:23PM +0000, Chen Ridong wrote:
>> From: Chen Ridong <chenridong@huawei.com>
>>
>> The memcg LRU was originally introduced for global reclaim to enhance
>> scalability. However, its implementation complexity has led to performance
>> regressions when dealing with a large number of memory cgroups [1].
>>
>> As suggested by Johannes [1], this patch adopts mem_cgroup_iter with
>> cookie-based iteration for global reclaim, aligning with the approach
>> already used in shrink_node_memcgs. This simplification removes the
>> dedicated memcg LRU tracking while maintaining the core functionality.
>>
>> I performed a stress test based on Yu Zhao's methodology [2] on a
>> 1 TB, 4-node NUMA system. The results are summarized below:
>>
>>                                   memcg LRU    memcg iter
>> stddev(pgsteal) / mean(pgsteal)       91.2%         75.7%
>> sum(pgsteal) / sum(requested)        216.4%        230.5%
>>
>> The new implementation demonstrates a significant improvement in
>> fairness, reducing the standard deviation relative to the mean by
>> 15.5 percentage points, while reclaim accuracy shows a slight
>> increase in overscan (from 85086871 to 90633890, about 6.5%).
>>
>> The primary benefits of this change are:
>> 1. Simplified codebase by removing custom memcg LRU infrastructure
>> 2. Improved fairness in memory reclaim across multiple cgroups
>> 3. Better performance when creating many memory cgroups
>>
>> [1] https://lore.kernel.org/r/20251126171513.GC135004@cmpxchg.org
>> [2] https://lore.kernel.org/r/20221222041905.2431096-7-yuzhao@google.com
>> Signed-off-by: Chen Ridong <chenridong@huawei.com>
>
> Thanks a lot for this awesome work.
>

Hello Shakeel and Johannes,

I apologize for the incorrect results I provided earlier. I initially used an
AI tool to process the data (I admit that was lazy of me; please forget that).
When I re-ran the test, re-extracted the refault data, and processed it again,
I found that the AI tool had given me the wrong output.

I have now processed the data manually in Excel, and the correct results are:

pgsteal:
                                  memcg LRU    memcg iter
stddev(pgsteal) / mean(pgsteal)     106.03%        93.20%
sum(pgsteal) / sum(requested)        98.10%        99.28%

workingset_refault_anon:
                                  memcg LRU    memcg iter
stddev(refault) / mean(refault)     193.97%       134.67%
sum(refault)                        1963229       2027567

I believe these final results are much better than the previous incorrect
ones, especially since the pgsteal ratio is now close to 100%, indicating we
are not over-scanning. Additionally, refaults increased by 64,338 (a 3.2%
rise).

Let me know if you have any questions.
---------------------------------------------------------------------- The original data memcg LRU: pgsteal: SUM: 38572704 AVERAGE: 301349.25 STDEV: 319518.5965 refault: SUM: 1963229 AVERAGE: 15337.72656 STDEV: 29750.03391 pgsteal 655392 workingset_refault_anon 17131 pgsteal 657308 workingset_refault_anon 24841 pgsteal 103777 workingset_refault_anon 430 pgsteal 103134 workingset_refault_anon 884 pgsteal 964772 workingset_refault_anon 117159 pgsteal 103462 workingset_refault_anon 539 pgsteal 102878 workingset_refault_anon 25 pgsteal 707851 workingset_refault_anon 30634 pgsteal 103925 workingset_refault_anon 497 pgsteal 103913 workingset_refault_anon 953 pgsteal 103020 workingset_refault_anon 110 pgsteal 102871 workingset_refault_anon 607 pgsteal 697775 workingset_refault_anon 21529 pgsteal 102944 workingset_refault_anon 57 pgsteal 103090 workingset_refault_anon 819 pgsteal 102988 workingset_refault_anon 583 pgsteal 102987 workingset_refault_anon 108 pgsteal 103093 workingset_refault_anon 17 pgsteal 778016 workingset_refault_anon 79000 pgsteal 102920 workingset_refault_anon 14 pgsteal 655447 workingset_refault_anon 9069 pgsteal 102869 workingset_refault_anon 6 pgsteal 699920 workingset_refault_anon 34409 pgsteal 103127 workingset_refault_anon 223 pgsteal 102876 workingset_refault_anon 646 pgsteal 103642 workingset_refault_anon 439 pgsteal 102881 workingset_refault_anon 110 pgsteal 863202 workingset_refault_anon 77605 pgsteal 651786 workingset_refault_anon 8322 pgsteal 102981 workingset_refault_anon 51 pgsteal 103380 workingset_refault_anon 877 pgsteal 706377 workingset_refault_anon 27729 pgsteal 103436 workingset_refault_anon 682 pgsteal 103839 workingset_refault_anon 336 pgsteal 103012 workingset_refault_anon 23 pgsteal 103476 workingset_refault_anon 729 pgsteal 102867 workingset_refault_anon 12 pgsteal 102914 workingset_refault_anon 122 pgsteal 102886 workingset_refault_anon 627 pgsteal 103736 workingset_refault_anon 514 pgsteal 102879 workingset_refault_anon 618 pgsteal 102860 workingset_refault_anon 3 pgsteal 102877 workingset_refault_anon 27 pgsteal 103255 workingset_refault_anon 384 pgsteal 982183 workingset_refault_anon 85362 pgsteal 102947 workingset_refault_anon 158 pgsteal 102880 workingset_refault_anon 651 pgsteal 973764 workingset_refault_anon 81542 pgsteal 923711 workingset_refault_anon 94596 pgsteal 102938 workingset_refault_anon 660 pgsteal 888882 workingset_refault_anon 69549 pgsteal 102868 workingset_refault_anon 14 pgsteal 103130 workingset_refault_anon 166 pgsteal 103388 workingset_refault_anon 467 pgsteal 102965 workingset_refault_anon 197 pgsteal 964699 workingset_refault_anon 74903 pgsteal 103263 workingset_refault_anon 373 pgsteal 103614 workingset_refault_anon 781 pgsteal 962228 workingset_refault_anon 72108 pgsteal 672174 workingset_refault_anon 19739 pgsteal 102920 workingset_refault_anon 19 pgsteal 670248 workingset_refault_anon 18411 pgsteal 102877 workingset_refault_anon 581 pgsteal 103758 workingset_refault_anon 871 pgsteal 102874 workingset_refault_anon 609 pgsteal 103075 workingset_refault_anon 274 pgsteal 103550 workingset_refault_anon 102 pgsteal 755180 workingset_refault_anon 44303 pgsteal 951252 workingset_refault_anon 84566 pgsteal 929144 workingset_refault_anon 99081 pgsteal 103207 workingset_refault_anon 30 pgsteal 103292 workingset_refault_anon 427 pgsteal 103271 workingset_refault_anon 332 pgsteal 102865 workingset_refault_anon 4 pgsteal 923280 workingset_refault_anon 72715 pgsteal 104682 workingset_refault_anon 372 pgsteal 102870 
workingset_refault_anon 7 pgsteal 102902 workingset_refault_anon 661 pgsteal 103053 workingset_refault_anon 40 pgsteal 103685 workingset_refault_anon 540 pgsteal 103857 workingset_refault_anon 970 pgsteal 109210 workingset_refault_anon 2806 pgsteal 103627 workingset_refault_anon 319 pgsteal 104029 workingset_refault_anon 42 pgsteal 918361 workingset_refault_anon 90387 pgsteal 103489 workingset_refault_anon 626 pgsteal 103188 workingset_refault_anon 801 pgsteal 102875 workingset_refault_anon 11 pgsteal 102994 workingset_refault_anon 79 pgsteal 102910 workingset_refault_anon 43 pgsteal 102922 workingset_refault_anon 687 pgsteal 103941 workingset_refault_anon 1219 pgsteal 903622 workingset_refault_anon 113751 pgsteal 664357 workingset_refault_anon 27959 pgsteal 104947 workingset_refault_anon 11 pgsteal 701084 workingset_refault_anon 30665 pgsteal 650719 workingset_refault_anon 20810 pgsteal 641924 workingset_refault_anon 17137 pgsteal 933870 workingset_refault_anon 98393 pgsteal 633231 workingset_refault_anon 15924 pgsteal 102936 workingset_refault_anon 34 pgsteal 104020 workingset_refault_anon 781 pgsteal 104274 workingset_refault_anon 1841 pgsteal 621672 workingset_refault_anon 5891 pgsteal 103307 workingset_refault_anon 474 pgsteal 103386 workingset_refault_anon 27 pgsteal 103266 workingset_refault_anon 243 pgsteal 102896 workingset_refault_anon 15 pgsteal 103905 workingset_refault_anon 988 pgsteal 103104 workingset_refault_anon 304 pgsteal 104277 workingset_refault_anon 285 pgsteal 696374 workingset_refault_anon 24971 pgsteal 103009 workingset_refault_anon 775 pgsteal 103849 workingset_refault_anon 747 pgsteal 102867 workingset_refault_anon 9 pgsteal 700211 workingset_refault_anon 35289 pgsteal 102923 workingset_refault_anon 88 pgsteal 104139 workingset_refault_anon 789 pgsteal 105152 workingset_refault_anon 1257 pgsteal 102945 workingset_refault_anon 76 pgsteal 103227 workingset_refault_anon 343 pgsteal 102880 workingset_refault_anon 95 pgsteal 102967 workingset_refault_anon 101 pgsteal 989176 workingset_refault_anon 89597 pgsteal 694181 workingset_refault_anon 22499 pgsteal 784354 workingset_refault_anon 68311 pgsteal 102882 workingset_refault_anon 24 pgsteal 103108 workingset_refault_anon 24 ------------------------------------------------------------------- The original data memcg iter: pgsteal: SUM: 39036863 AVERAGE: 304975.4922 STDEV: 284226.526 refault: SUM: 2027567 AVERAGE: 15840.36719 STDEV: 21332.00262 pgsteal 103167 workingset_refault_anon 203 pgsteal 714044 workingset_refault_anon 42633 pgsteal 103209 workingset_refault_anon 581 pgsteal 103605 workingset_refault_anon 240 pgsteal 740909 workingset_refault_anon 53177 pgsteal 103089 workingset_refault_anon 141 pgsteal 726760 workingset_refault_anon 32624 pgsteal 104039 workingset_refault_anon 397 pgsteal 754667 workingset_refault_anon 56144 pgsteal 713916 workingset_refault_anon 41813 pgsteal 104104 workingset_refault_anon 307 pgsteal 109567 workingset_refault_anon 244 pgsteal 714194 workingset_refault_anon 47076 pgsteal 711693 workingset_refault_anon 35616 pgsteal 105026 workingset_refault_anon 2221 pgsteal 103442 workingset_refault_anon 269 pgsteal 112773 workingset_refault_anon 5086 pgsteal 715969 workingset_refault_anon 32457 pgsteal 127828 workingset_refault_anon 9579 pgsteal 102885 workingset_refault_anon 109 pgsteal 112156 workingset_refault_anon 2974 pgsteal 104242 workingset_refault_anon 948 pgsteal 701184 workingset_refault_anon 47940 pgsteal 104080 workingset_refault_anon 836 pgsteal 106606 workingset_refault_anon 2420 
pgsteal 103666 workingset_refault_anon 129 pgsteal 103330 workingset_refault_anon 532 pgsteal 103639 workingset_refault_anon 275 pgsteal 108494 workingset_refault_anon 3814 pgsteal 103626 workingset_refault_anon 412 pgsteal 103697 workingset_refault_anon 577 pgsteal 103736 workingset_refault_anon 582 pgsteal 103360 workingset_refault_anon 281 pgsteal 116733 workingset_refault_anon 6674 pgsteal 102978 workingset_refault_anon 5 pgsteal 108945 workingset_refault_anon 3141 pgsteal 706630 workingset_refault_anon 33241 pgsteal 103426 workingset_refault_anon 134 pgsteal 715070 workingset_refault_anon 33575 pgsteal 102871 workingset_refault_anon 12 pgsteal 103617 workingset_refault_anon 776 pgsteal 767084 workingset_refault_anon 64710 pgsteal 104197 workingset_refault_anon 176 pgsteal 104488 workingset_refault_anon 1469 pgsteal 103253 workingset_refault_anon 228 pgsteal 702800 workingset_refault_anon 26424 pgsteal 107469 workingset_refault_anon 2838 pgsteal 104441 workingset_refault_anon 1562 pgsteal 123013 workingset_refault_anon 13117 pgsteal 737817 workingset_refault_anon 53330 pgsteal 103939 workingset_refault_anon 759 pgsteal 103568 workingset_refault_anon 783 pgsteal 122707 workingset_refault_anon 11944 pgsteal 103690 workingset_refault_anon 885 pgsteal 103456 workingset_refault_anon 145 pgsteal 104068 workingset_refault_anon 632 pgsteal 319368 workingset_refault_anon 12579 pgsteal 103912 workingset_refault_anon 304 pgsteal 119416 workingset_refault_anon 3350 pgsteal 717107 workingset_refault_anon 34764 pgsteal 107163 workingset_refault_anon 535 pgsteal 103299 workingset_refault_anon 142 pgsteal 103825 workingset_refault_anon 176 pgsteal 408564 workingset_refault_anon 14606 pgsteal 115785 workingset_refault_anon 4622 pgsteal 119234 workingset_refault_anon 9225 pgsteal 729060 workingset_refault_anon 54309 pgsteal 107149 workingset_refault_anon 536 pgsteal 708839 workingset_refault_anon 43133 pgsteal 695961 workingset_refault_anon 40182 pgsteal 723303 workingset_refault_anon 32298 pgsteal 103581 workingset_refault_anon 1305 pgsteal 699646 workingset_refault_anon 49924 pgsteal 717867 workingset_refault_anon 39229 pgsteal 104148 workingset_refault_anon 1318 pgsteal 104127 workingset_refault_anon 568 pgsteal 103168 workingset_refault_anon 322 pgsteal 103477 workingset_refault_anon 538 pgsteal 103022 workingset_refault_anon 60 pgsteal 103305 workingset_refault_anon 323 pgsteal 103812 workingset_refault_anon 1324 pgsteal 103139 workingset_refault_anon 126 pgsteal 723251 workingset_refault_anon 34206 pgsteal 103068 workingset_refault_anon 861 pgsteal 742515 workingset_refault_anon 54439 pgsteal 762161 workingset_refault_anon 52654 pgsteal 103934 workingset_refault_anon 889 pgsteal 104065 workingset_refault_anon 315 pgsteal 383893 workingset_refault_anon 25036 pgsteal 107929 workingset_refault_anon 2367 pgsteal 726127 workingset_refault_anon 45809 pgsteal 675291 workingset_refault_anon 66534 pgsteal 105585 workingset_refault_anon 2323 pgsteal 105098 workingset_refault_anon 1625 pgsteal 104264 workingset_refault_anon 718 pgsteal 741873 workingset_refault_anon 47045 pgsteal 103466 workingset_refault_anon 70 pgsteal 723870 workingset_refault_anon 58780 pgsteal 104740 workingset_refault_anon 521 pgsteal 740739 workingset_refault_anon 45099 pgsteal 752994 workingset_refault_anon 53713 pgsteal 110164 workingset_refault_anon 2572 pgsteal 711304 workingset_refault_anon 41135 pgsteal 746870 workingset_refault_anon 60298 pgsteal 729166 workingset_refault_anon 42594 pgsteal 110138 workingset_refault_anon 1511 
pgsteal 103836 workingset_refault_anon 675 pgsteal 116821 workingset_refault_anon 3952 pgsteal 104967 workingset_refault_anon 2035 pgsteal 711362 workingset_refault_anon 31458 pgsteal 103835 workingset_refault_anon 507 pgsteal 113846 workingset_refault_anon 2997 pgsteal 104406 workingset_refault_anon 1724 pgsteal 103551 workingset_refault_anon 1293 pgsteal 705340 workingset_refault_anon 44234 pgsteal 728076 workingset_refault_anon 29849 pgsteal 103829 workingset_refault_anon 254 pgsteal 103700 workingset_refault_anon 712 pgsteal 103382 workingset_refault_anon 506 pgsteal 728881 workingset_refault_anon 60152 pgsteal 614645 workingset_refault_anon 43956 pgsteal 107672 workingset_refault_anon 2768 pgsteal 123550 workingset_refault_anon 11937 pgsteal 103747 workingset_refault_anon 899 pgsteal 747657 workingset_refault_anon 50264 pgsteal 110949 workingset_refault_anon 1422 pgsteal 103596 workingset_refault_anon 278 pgsteal 742471 workingset_refault_anon 69586 -- Best regards, Ridong
On Thu, Dec 04, 2025 at 12:31:23PM +0000, Chen Ridong wrote:
> From: Chen Ridong <chenridong@huawei.com>
>
> The memcg LRU was originally introduced for global reclaim to enhance
> scalability. However, its implementation complexity has led to performance
> regressions when dealing with a large number of memory cgroups [1].
>
> As suggested by Johannes [1], this patch adopts mem_cgroup_iter with
> cookie-based iteration for global reclaim, aligning with the approach
> already used in shrink_node_memcgs. This simplification removes the
> dedicated memcg LRU tracking while maintaining the core functionality.
>
> I performed a stress test based on Yu Zhao's methodology [2] on a
> 1 TB, 4-node NUMA system. The results are summarized below:
>
>                                   memcg LRU    memcg iter
> stddev(pgsteal) / mean(pgsteal)       91.2%         75.7%
> sum(pgsteal) / sum(requested)        216.4%        230.5%
>
> The new implementation demonstrates a significant improvement in
> fairness, reducing the standard deviation relative to the mean by
> 15.5 percentage points, while reclaim accuracy shows a slight
> increase in overscan (from 85086871 to 90633890, about 6.5%).
>
> The primary benefits of this change are:
> 1. Simplified codebase by removing custom memcg LRU infrastructure
> 2. Improved fairness in memory reclaim across multiple cgroups
> 3. Better performance when creating many memory cgroups
>
> [1] https://lore.kernel.org/r/20251126171513.GC135004@cmpxchg.org
> [2] https://lore.kernel.org/r/20221222041905.2431096-7-yuzhao@google.com
> Signed-off-by: Chen Ridong <chenridong@huawei.com>

Acked-by: Johannes Weiner <hannes@cmpxchg.org>

The diff and the test results look good to me. Comparing the resulting
shrink_many() with shrink_node_memcgs(), this also looks like a great step
towards maintainability and unification.

Thanks!