From: Chen Ridong <chenridong@huawei.com>
With LRU_GEN=y and LRU_GEN_ENABLED=n, a performance regression occurs
when creating a large number of memory cgroups (memcgs):
# time mkdir testcg_{1..10000}
real 0m7.167s
user 0m0.037s
sys 0m6.773s
# time mkdir testcg_{1..20000}
real 0m27.158s
user 0m0.079s
sys 0m26.270s
In contrast, with LRU_GEN=n, creation of the same number of memcgs
performs better:
# time mkdir testcg_{1..10000}
real 0m3.386s
user 0m0.044s
sys 0m3.009s
# time mkdir testcg_{1..20000}
real 0m6.876s
user 0m0.075s
sys 0m6.121s
The root cause is that memcg onlining for lru_gen (lru_gen_online_memcg()) uses
hlist_nulls_add_tail_rcu(), which must traverse the entire per-bin list to find
its tail. Each new memcg therefore pays a cost that is linear in the number of
memcgs already queued, so bulk creation is roughly quadratic, and this happens
even when LRU_GEN is runtime-disabled.
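To make the cost concrete, here is a minimal sketch of why appending to a
singly linked, head-only list is linear in its length (simplified types, no
RCU or nulls markers; all names here are invented for illustration):

struct node {
	struct node *next;
};

struct slist {
	struct node *first;
};

/* append at the tail: must walk every existing node to find the end */
static void slist_add_tail(struct slist *h, struct node *n)
{
	struct node **pos = &h->first;

	while (*pos)
		pos = &(*pos)->next;

	n->next = NULL;
	*pos = n;
}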
Fix this by caching a tail pointer for each fifo[gen][bin] list in struct
lru_gen_memcg. Appending a new node now uses the cached tail directly,
eliminating the full list traversal.
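For comparison, a minimal sketch of the cached-tail idea, reusing the invented
struct node above and again omitting the RCU and nulls-marker handling the real
patch has to deal with:

struct slist_cached {
	struct node *first;
	struct node *tail;	/* NULL when the list is empty */
};

/* append at the tail in O(1) via the cached tail pointer */
static void slist_cached_add_tail(struct slist_cached *h, struct node *n)
{
	n->next = NULL;

	if (h->tail)
		h->tail->next = n;
	else
		h->first = n;

	h->tail = n;
}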
After applying this patch, memcg creation with LRU_GEN=y matches the
LRU_GEN=n baseline:
# time mkdir testcg_{1..10000}
real 0m3.368s
user 0m0.025s
sys 0m3.012s
# time mkdir testcg_{1..20000}
real 0m6.742s
user 0m0.085s
sys 0m5.995s
Signed-off-by: Chen Ridong <chenridong@huawei.com>
---
include/linux/mmzone.h | 4 +++
mm/vmscan.c | 78 ++++++++++++++++++++++++++++++++++++++----
2 files changed, 75 insertions(+), 7 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 4398e027f450..bdee57b35126 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -513,6 +513,8 @@ struct lru_gen_folio {
u8 gen;
/* the list segment this lru_gen_folio belongs to */
u8 seg;
+ /* the bin index this lru_gen_folio is queued on */
+ u8 bin;
/* per-node lru_gen_folio list for global reclaim */
struct hlist_nulls_node list;
};
@@ -610,6 +612,8 @@ struct lru_gen_memcg {
unsigned long nr_memcgs[MEMCG_NR_GENS];
/* per-node lru_gen_folio list for global reclaim */
struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS];
+ /* cached tails to speed up enqueueing */
+ struct hlist_nulls_node *tails[MEMCG_NR_GENS][MEMCG_NR_BINS];
/* protects the above */
spinlock_t lock;
};
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8890f4b58673..6c2665e48f19 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4299,6 +4299,66 @@ enum {
MEMCG_LRU_YOUNG,
};
+static void memcg_lru_add_head_locked(struct pglist_data *pgdat,
+ struct lruvec *lruvec, int gen, int bin)
+{
+ struct lru_gen_memcg *memcg_lru = &pgdat->memcg_lru;
+ struct hlist_nulls_head *head = &memcg_lru->fifo[gen][bin];
+ struct hlist_nulls_node *node = &lruvec->lrugen.list;
+ bool empty = !memcg_lru->tails[gen][bin];
+
+ hlist_nulls_add_head_rcu(node, head);
+ lruvec->lrugen.bin = bin;
+
+ if (empty)
+ memcg_lru->tails[gen][bin] = node;
+}
+
+static void memcg_lru_add_tail_locked(struct pglist_data *pgdat,
+ struct lruvec *lruvec, int gen, int bin)
+{
+ struct lru_gen_memcg *memcg_lru = &pgdat->memcg_lru;
+ struct hlist_nulls_head *head = &memcg_lru->fifo[gen][bin];
+ struct hlist_nulls_node *node = &lruvec->lrugen.list;
+ struct hlist_nulls_node *tail = memcg_lru->tails[gen][bin];
+
+ if (tail) {
+ WRITE_ONCE(node->next, tail->next);
+ WRITE_ONCE(node->pprev, &tail->next);
+ rcu_assign_pointer(hlist_nulls_next_rcu(tail), node);
+ } else {
+ hlist_nulls_add_head_rcu(node, head);
+ }
+
+ memcg_lru->tails[gen][bin] = node;
+ lruvec->lrugen.bin = bin;
+}
+
+static void memcg_lru_del_locked(struct pglist_data *pgdat, struct lruvec *lruvec,
+ bool reinit)
+{
+ int gen = lruvec->lrugen.gen;
+ int bin = lruvec->lrugen.bin;
+ struct lru_gen_memcg *memcg_lru = &pgdat->memcg_lru;
+ struct hlist_nulls_head *head = &memcg_lru->fifo[gen][bin];
+ struct hlist_nulls_node *node = &lruvec->lrugen.list;
+ struct hlist_nulls_node *prev = NULL;
+
+ if (hlist_nulls_unhashed(node))
+ return;
+
+ if (memcg_lru->tails[gen][bin] == node) {
+ if (node->pprev != &head->first)
+ prev = container_of(node->pprev, struct hlist_nulls_node, next);
+ memcg_lru->tails[gen][bin] = prev;
+ }
+
+ if (reinit)
+ hlist_nulls_del_init_rcu(node);
+ else
+ hlist_nulls_del_rcu(node);
+}
+
static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
{
int seg;
@@ -4326,15 +4386,15 @@ static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
else
VM_WARN_ON_ONCE(true);
+ memcg_lru_del_locked(pgdat, lruvec, false);
+
WRITE_ONCE(lruvec->lrugen.seg, seg);
WRITE_ONCE(lruvec->lrugen.gen, new);
- hlist_nulls_del_rcu(&lruvec->lrugen.list);
-
if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
- hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
+ memcg_lru_add_head_locked(pgdat, lruvec, new, bin);
else
- hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
+ memcg_lru_add_tail_locked(pgdat, lruvec, new, bin);
pgdat->memcg_lru.nr_memcgs[old]--;
pgdat->memcg_lru.nr_memcgs[new]++;
@@ -4365,7 +4425,7 @@ void lru_gen_online_memcg(struct mem_cgroup *memcg)
lruvec->lrugen.gen = gen;
- hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
+ memcg_lru_add_tail_locked(pgdat, lruvec, gen, bin);
pgdat->memcg_lru.nr_memcgs[gen]++;
spin_unlock_irq(&pgdat->memcg_lru.lock);
@@ -4399,7 +4459,7 @@ void lru_gen_release_memcg(struct mem_cgroup *memcg)
gen = lruvec->lrugen.gen;
- hlist_nulls_del_init_rcu(&lruvec->lrugen.list);
+ memcg_lru_del_locked(pgdat, lruvec, true);
pgdat->memcg_lru.nr_memcgs[gen]--;
if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
@@ -5664,8 +5724,10 @@ void lru_gen_init_pgdat(struct pglist_data *pgdat)
spin_lock_init(&pgdat->memcg_lru.lock);
for (i = 0; i < MEMCG_NR_GENS; i++) {
- for (j = 0; j < MEMCG_NR_BINS; j++)
+ for (j = 0; j < MEMCG_NR_BINS; j++) {
INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i);
+ pgdat->memcg_lru.tails[i][j] = NULL;
+ }
}
}
@@ -5687,6 +5749,8 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
if (mm_state)
mm_state->seq = MIN_NR_GENS;
+
+ lrugen->bin = 0;
}
#ifdef CONFIG_MEMCG
--
2.34.1
On Wed, Nov 19, 2025 at 08:37:22AM +0000, Chen Ridong wrote:
> From: Chen Ridong <chenridong@huawei.com>
>
> With LRU_GEN=y and LRU_GEN_ENABLED=n, a performance regression occurs
> when creating a large number of memory cgroups (memcgs):
>
> [...]
>
> The root cause is that lru_gen node onlining uses hlist_nulls_add_tail_rcu,
> which traverses the entire list to find the tail. This traversal scales
> with the number of memcgs, even when LRU_GEN is runtime-disabled.
Can you please look into removing the memcg LRU instead?
Use mem_cgroup_iter() with a reclaim cookie in shrink_many(), like we
do in shrink_node_memcgs().
The memcg LRU is complicated, and it only works for global reclaim; if
you have a subtree with a memory.max at the top, it'll go through
shrink_node_memcgs() already anyway.
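A rough, untested sketch of what such an iteration could look like, loosely
modeled on shrink_node_memcgs(); shrink_many_sketch() is an invented name, and
shrink_one() is assumed to keep roughly its current per-lruvec role:

static void shrink_many_sketch(struct pglist_data *pgdat, struct scan_control *sc)
{
	struct mem_cgroup *memcg;
	struct mem_cgroup_reclaim_cookie reclaim = {
		.pgdat = pgdat,
	};

	/* the reclaim cookie resumes the walk where the last cycle left off */
	memcg = mem_cgroup_iter(NULL, NULL, &reclaim);
	do {
		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);

		shrink_one(lruvec, sc);

		if (sc->nr_reclaimed >= sc->nr_to_reclaim) {
			mem_cgroup_iter_break(NULL, memcg);
			break;
		}
	} while ((memcg = mem_cgroup_iter(NULL, memcg, &reclaim)));
}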
On 2025/11/27 1:15, Johannes Weiner wrote:
> On Wed, Nov 19, 2025 at 08:37:22AM +0000, Chen Ridong wrote:
>> [...]
>
> Can you please look into removing the memcg LRU instead?
>
Thanks Johannes, this is indeed a promising approach.
The memcg LRU was originally designed exclusively for global reclaim scenarios. Before we move
forward with its removal, I'd like to hear Yu's thoughts on this.
Hello Yu,
Do you have any opinions on removing the memcg LRU?
> Use mem_cgroup_iter() with a reclaim cookie in shrink_many(), like we
> do in shrink_node_memcgs().
>
> The memcg LRU is complicated, and it only works for global reclaim; if
> you have a subtree with a memory.max at the top, it'll go through
> shrink_node_memcgs() already anyway.
--
Best regards,
Ridong
On 2025/11/19 16:37, Chen Ridong wrote:
> [...]
Hello all,
Is anyone interested in this issue?
Any better ideas or suggestions are welcome.
--
Best regards,
Ridong