The deferred split queue handles cgroups in a suboptimal fashion. The
queue is per-NUMA node or per-cgroup, not the intersection. That means
on a cgrouped system, a node-restricted allocation entering reclaim
can end up splitting large pages on other nodes:
alloc/unmap
deferred_split_folio()
list_add_tail(memcg->split_queue)
set_shrinker_bit(memcg, node, deferred_shrinker_id)
for_each_zone_zonelist_nodemask(restricted_nodes)
mem_cgroup_iter()
shrink_slab(node, memcg)
shrink_slab_memcg(node, memcg)
if test_shrinker_bit(memcg, node, deferred_shrinker_id)
deferred_split_scan()
walks memcg->split_queue
The shrinker bit adds an imperfect guard rail. As soon as the cgroup
has a single large page on the node of interest, all large pages owned
by that memcg, including those on other nodes, will be split.
list_lru properly sets up per-node, per-cgroup lists. As a bonus, it
streamlines a lot of the list operations and reclaim walks. It's used
widely by other major shrinkers already. Convert the deferred split
queue as well.
The list_lru per-memcg heads are instantiated on demand when the first
object of interest is allocated for a cgroup, by calling
memcg_list_lru_alloc_folio(). Add calls to where splittable pages are
created: anon faults, swapin faults, khugepaged collapse.
These calls create all possible node heads for the cgroup at once, so
the migration code (between nodes) doesn't need any special care.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
include/linux/huge_mm.h | 6 +-
include/linux/memcontrol.h | 4 -
include/linux/mmzone.h | 12 --
mm/huge_memory.c | 330 +++++++++++--------------------------
mm/internal.h | 2 +-
mm/khugepaged.c | 7 +
mm/memcontrol.c | 12 +-
mm/memory.c | 52 +++---
mm/mm_init.c | 15 --
9 files changed, 140 insertions(+), 300 deletions(-)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index a4d9f964dfde..2d0d0c797dd8 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -414,10 +414,9 @@ static inline int split_huge_page(struct page *page)
{
return split_huge_page_to_list_to_order(page, NULL, 0);
}
+
+extern struct list_lru deferred_split_lru;
void deferred_split_folio(struct folio *folio, bool partially_mapped);
-#ifdef CONFIG_MEMCG
-void reparent_deferred_split_queue(struct mem_cgroup *memcg);
-#endif
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long address, bool freeze);
@@ -650,7 +649,6 @@ static inline int try_folio_split_to_order(struct folio *folio,
}
static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
-static inline void reparent_deferred_split_queue(struct mem_cgroup *memcg) {}
#define split_huge_pmd(__vma, __pmd, __address) \
do { } while (0)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 086158969529..0782c72a1997 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -277,10 +277,6 @@ struct mem_cgroup {
struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT];
#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- struct deferred_split deferred_split_queue;
-#endif
-
#ifdef CONFIG_LRU_GEN_WALKS_MMU
/* per-memcg mm_struct list */
struct lru_gen_mm_list mm_list;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 7bd0134c241c..232b7a71fd69 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1429,14 +1429,6 @@ struct zonelist {
*/
extern struct page *mem_map;
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-struct deferred_split {
- spinlock_t split_queue_lock;
- struct list_head split_queue;
- unsigned long split_queue_len;
-};
-#endif
-
#ifdef CONFIG_MEMORY_FAILURE
/*
* Per NUMA node memory failure handling statistics.
@@ -1562,10 +1554,6 @@ typedef struct pglist_data {
unsigned long first_deferred_pfn;
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- struct deferred_split deferred_split_queue;
-#endif
-
#ifdef CONFIG_NUMA_BALANCING
/* start time in ms of current promote rate limit period */
unsigned int nbp_rl_start;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7d0a64033b18..ed9b98e2e166 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -14,6 +14,7 @@
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
+#include <linux/list_lru.h>
#include <linux/shrinker.h>
#include <linux/mm_inline.h>
#include <linux/swapops.h>
@@ -67,6 +68,7 @@ unsigned long transparent_hugepage_flags __read_mostly =
(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
+struct list_lru deferred_split_lru;
static struct shrinker *deferred_split_shrinker;
static unsigned long deferred_split_count(struct shrinker *shrink,
struct shrink_control *sc);
@@ -866,6 +868,11 @@ static int __init thp_shrinker_init(void)
if (!deferred_split_shrinker)
return -ENOMEM;
+ if (list_lru_init_memcg(&deferred_split_lru, deferred_split_shrinker)) {
+ shrinker_free(deferred_split_shrinker);
+ return -ENOMEM;
+ }
+
deferred_split_shrinker->count_objects = deferred_split_count;
deferred_split_shrinker->scan_objects = deferred_split_scan;
shrinker_register(deferred_split_shrinker);
@@ -886,6 +893,7 @@ static int __init thp_shrinker_init(void)
huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero");
if (!huge_zero_folio_shrinker) {
+ list_lru_destroy(&deferred_split_lru);
shrinker_free(deferred_split_shrinker);
return -ENOMEM;
}
@@ -900,6 +908,7 @@ static int __init thp_shrinker_init(void)
static void __init thp_shrinker_exit(void)
{
shrinker_free(huge_zero_folio_shrinker);
+ list_lru_destroy(&deferred_split_lru);
shrinker_free(deferred_split_shrinker);
}
@@ -1080,119 +1089,6 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
return pmd;
}
-static struct deferred_split *split_queue_node(int nid)
-{
- struct pglist_data *pgdata = NODE_DATA(nid);
-
- return &pgdata->deferred_split_queue;
-}
-
-#ifdef CONFIG_MEMCG
-static inline
-struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
- struct deferred_split *queue)
-{
- if (mem_cgroup_disabled())
- return NULL;
- if (split_queue_node(folio_nid(folio)) == queue)
- return NULL;
- return container_of(queue, struct mem_cgroup, deferred_split_queue);
-}
-
-static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg)
-{
- return memcg ? &memcg->deferred_split_queue : split_queue_node(nid);
-}
-#else
-static inline
-struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
- struct deferred_split *queue)
-{
- return NULL;
-}
-
-static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg)
-{
- return split_queue_node(nid);
-}
-#endif
-
-static struct deferred_split *split_queue_lock(int nid, struct mem_cgroup *memcg)
-{
- struct deferred_split *queue;
-
-retry:
- queue = memcg_split_queue(nid, memcg);
- spin_lock(&queue->split_queue_lock);
- /*
- * There is a period between setting memcg to dying and reparenting
- * deferred split queue, and during this period the THPs in the deferred
- * split queue will be hidden from the shrinker side.
- */
- if (unlikely(memcg_is_dying(memcg))) {
- spin_unlock(&queue->split_queue_lock);
- memcg = parent_mem_cgroup(memcg);
- goto retry;
- }
-
- return queue;
-}
-
-static struct deferred_split *
-split_queue_lock_irqsave(int nid, struct mem_cgroup *memcg, unsigned long *flags)
-{
- struct deferred_split *queue;
-
-retry:
- queue = memcg_split_queue(nid, memcg);
- spin_lock_irqsave(&queue->split_queue_lock, *flags);
- if (unlikely(memcg_is_dying(memcg))) {
- spin_unlock_irqrestore(&queue->split_queue_lock, *flags);
- memcg = parent_mem_cgroup(memcg);
- goto retry;
- }
-
- return queue;
-}
-
-static struct deferred_split *folio_split_queue_lock(struct folio *folio)
-{
- struct deferred_split *queue;
-
- rcu_read_lock();
- queue = split_queue_lock(folio_nid(folio), folio_memcg(folio));
- /*
- * The memcg destruction path is acquiring the split queue lock for
- * reparenting. Once you have it locked, it's safe to drop the rcu lock.
- */
- rcu_read_unlock();
-
- return queue;
-}
-
-static struct deferred_split *
-folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags)
-{
- struct deferred_split *queue;
-
- rcu_read_lock();
- queue = split_queue_lock_irqsave(folio_nid(folio), folio_memcg(folio), flags);
- rcu_read_unlock();
-
- return queue;
-}
-
-static inline void split_queue_unlock(struct deferred_split *queue)
-{
- spin_unlock(&queue->split_queue_lock);
-}
-
-static inline void split_queue_unlock_irqrestore(struct deferred_split *queue,
- unsigned long flags)
-{
- spin_unlock_irqrestore(&queue->split_queue_lock, flags);
-}
-
static inline bool is_transparent_hugepage(const struct folio *folio)
{
if (!folio_test_large(folio))
@@ -1293,6 +1189,14 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
return NULL;
}
+
+ if (memcg_list_lru_alloc_folio(folio, &deferred_split_lru, gfp)) {
+ folio_put(folio);
+ count_vm_event(THP_FAULT_FALLBACK);
+ count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
+ return NULL;
+ }
+
folio_throttle_swaprate(folio, gfp);
/*
@@ -3802,33 +3706,28 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
struct folio *new_folio, *next;
int old_order = folio_order(folio);
int ret = 0;
- struct deferred_split *ds_queue;
+ struct list_lru_one *l;
VM_WARN_ON_ONCE(!mapping && end);
/* Prevent deferred_split_scan() touching ->_refcount */
- ds_queue = folio_split_queue_lock(folio);
+ rcu_read_lock();
+ l = list_lru_lock(&deferred_split_lru, folio_nid(folio), folio_memcg(folio));
if (folio_ref_freeze(folio, folio_cache_ref_count(folio) + 1)) {
struct swap_cluster_info *ci = NULL;
struct lruvec *lruvec;
if (old_order > 1) {
- if (!list_empty(&folio->_deferred_list)) {
- ds_queue->split_queue_len--;
- /*
- * Reinitialize page_deferred_list after removing the
- * page from the split_queue, otherwise a subsequent
- * split will see list corruption when checking the
- * page_deferred_list.
- */
- list_del_init(&folio->_deferred_list);
- }
+ __list_lru_del(&deferred_split_lru, l,
+ &folio->_deferred_list, folio_nid(folio));
if (folio_test_partially_mapped(folio)) {
folio_clear_partially_mapped(folio);
mod_mthp_stat(old_order,
MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
}
}
- split_queue_unlock(ds_queue);
+ list_lru_unlock(l);
+ rcu_read_unlock();
+
if (mapping) {
int nr = folio_nr_pages(folio);
@@ -3929,7 +3828,8 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
if (ci)
swap_cluster_unlock(ci);
} else {
- split_queue_unlock(ds_queue);
+ list_lru_unlock(l);
+ rcu_read_unlock();
return -EAGAIN;
}
@@ -4296,33 +4196,35 @@ int split_folio_to_list(struct folio *folio, struct list_head *list)
* queueing THP splits, and that list is (racily observed to be) non-empty.
*
* It is unsafe to call folio_unqueue_deferred_split() until folio refcount is
- * zero: because even when split_queue_lock is held, a non-empty _deferred_list
- * might be in use on deferred_split_scan()'s unlocked on-stack list.
+ * zero: because even when the list_lru lock is held, a non-empty
+ * _deferred_list might be in use on deferred_split_scan()'s unlocked
+ * on-stack list.
*
- * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is
- * therefore important to unqueue deferred split before changing folio memcg.
+ * The list_lru sublist is determined by folio's memcg: it is therefore
+ * important to unqueue deferred split before changing folio memcg.
*/
bool __folio_unqueue_deferred_split(struct folio *folio)
{
- struct deferred_split *ds_queue;
+ struct list_lru_one *l;
+ int nid = folio_nid(folio);
unsigned long flags;
bool unqueued = false;
WARN_ON_ONCE(folio_ref_count(folio));
WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg_charged(folio));
- ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
- if (!list_empty(&folio->_deferred_list)) {
- ds_queue->split_queue_len--;
+ rcu_read_lock();
+ l = list_lru_lock_irqsave(&deferred_split_lru, nid, folio_memcg(folio), &flags);
+ if (__list_lru_del(&deferred_split_lru, l, &folio->_deferred_list, nid)) {
if (folio_test_partially_mapped(folio)) {
folio_clear_partially_mapped(folio);
mod_mthp_stat(folio_order(folio),
MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
}
- list_del_init(&folio->_deferred_list);
unqueued = true;
}
- split_queue_unlock_irqrestore(ds_queue, flags);
+ list_lru_unlock_irqrestore(l, &flags);
+ rcu_read_unlock();
return unqueued; /* useful for debug warnings */
}
@@ -4330,7 +4232,9 @@ bool __folio_unqueue_deferred_split(struct folio *folio)
/* partially_mapped=false won't clear PG_partially_mapped folio flag */
void deferred_split_folio(struct folio *folio, bool partially_mapped)
{
- struct deferred_split *ds_queue;
+ struct list_lru_one *l;
+ int nid;
+ struct mem_cgroup *memcg;
unsigned long flags;
/*
@@ -4353,7 +4257,11 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
if (folio_test_swapcache(folio))
return;
- ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
+ nid = folio_nid(folio);
+
+ rcu_read_lock();
+ memcg = folio_memcg(folio);
+ l = list_lru_lock_irqsave(&deferred_split_lru, nid, memcg, &flags);
if (partially_mapped) {
if (!folio_test_partially_mapped(folio)) {
folio_set_partially_mapped(folio);
@@ -4361,36 +4269,20 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
count_vm_event(THP_DEFERRED_SPLIT_PAGE);
count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, 1);
-
}
} else {
/* partially mapped folios cannot become non-partially mapped */
VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
}
- if (list_empty(&folio->_deferred_list)) {
- struct mem_cgroup *memcg;
-
- memcg = folio_split_queue_memcg(folio, ds_queue);
- list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
- ds_queue->split_queue_len++;
- if (memcg)
- set_shrinker_bit(memcg, folio_nid(folio),
- shrinker_id(deferred_split_shrinker));
- }
- split_queue_unlock_irqrestore(ds_queue, flags);
+ __list_lru_add(&deferred_split_lru, l, &folio->_deferred_list, nid, memcg);
+ list_lru_unlock_irqrestore(l, &flags);
+ rcu_read_unlock();
}
static unsigned long deferred_split_count(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct pglist_data *pgdata = NODE_DATA(sc->nid);
- struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
-
-#ifdef CONFIG_MEMCG
- if (sc->memcg)
- ds_queue = &sc->memcg->deferred_split_queue;
-#endif
- return READ_ONCE(ds_queue->split_queue_len);
+ return list_lru_shrink_count(&deferred_split_lru, sc);
}
static bool thp_underused(struct folio *folio)
@@ -4420,45 +4312,47 @@ static bool thp_underused(struct folio *folio)
return false;
}
+static enum lru_status deferred_split_isolate(struct list_head *item,
+ struct list_lru_one *lru,
+ void *cb_arg)
+{
+ struct folio *folio = container_of(item, struct folio, _deferred_list);
+ struct list_head *freeable = cb_arg;
+
+ if (folio_try_get(folio)) {
+ list_lru_isolate_move(lru, item, freeable);
+ return LRU_REMOVED;
+ }
+
+ /* We lost race with folio_put() */
+ list_lru_isolate(lru, item);
+ if (folio_test_partially_mapped(folio)) {
+ folio_clear_partially_mapped(folio);
+ mod_mthp_stat(folio_order(folio),
+ MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
+ }
+ return LRU_REMOVED;
+}
+
static unsigned long deferred_split_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct deferred_split *ds_queue;
- unsigned long flags;
+ LIST_HEAD(dispose);
struct folio *folio, *next;
- int split = 0, i;
- struct folio_batch fbatch;
-
- folio_batch_init(&fbatch);
+ int split = 0;
+ unsigned long isolated;
-retry:
- ds_queue = split_queue_lock_irqsave(sc->nid, sc->memcg, &flags);
- /* Take pin on all head pages to avoid freeing them under us */
- list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
- _deferred_list) {
- if (folio_try_get(folio)) {
- folio_batch_add(&fbatch, folio);
- } else if (folio_test_partially_mapped(folio)) {
- /* We lost race with folio_put() */
- folio_clear_partially_mapped(folio);
- mod_mthp_stat(folio_order(folio),
- MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
- }
- list_del_init(&folio->_deferred_list);
- ds_queue->split_queue_len--;
- if (!--sc->nr_to_scan)
- break;
- if (!folio_batch_space(&fbatch))
- break;
- }
- split_queue_unlock_irqrestore(ds_queue, flags);
+ isolated = list_lru_shrink_walk_irq(&deferred_split_lru, sc,
+ deferred_split_isolate, &dispose);
- for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ list_for_each_entry_safe(folio, next, &dispose, _deferred_list) {
bool did_split = false;
bool underused = false;
- struct deferred_split *fqueue;
+ struct list_lru_one *l;
+ unsigned long flags;
+
+ list_del_init(&folio->_deferred_list);
- folio = fbatch.folios[i];
if (!folio_test_partially_mapped(folio)) {
/*
* See try_to_map_unused_to_zeropage(): we cannot
@@ -4481,64 +4375,32 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
}
folio_unlock(folio);
next:
- if (did_split || !folio_test_partially_mapped(folio))
- continue;
/*
* Only add back to the queue if folio is partially mapped.
* If thp_underused returns false, or if split_folio fails
* in the case it was underused, then consider it used and
* don't add it back to split_queue.
*/
- fqueue = folio_split_queue_lock_irqsave(folio, &flags);
- if (list_empty(&folio->_deferred_list)) {
- list_add_tail(&folio->_deferred_list, &fqueue->split_queue);
- fqueue->split_queue_len++;
+ if (!did_split && folio_test_partially_mapped(folio)) {
+ rcu_read_lock();
+ l = list_lru_lock_irqsave(&deferred_split_lru,
+ folio_nid(folio),
+ folio_memcg(folio),
+ &flags);
+ __list_lru_add(&deferred_split_lru, l,
+ &folio->_deferred_list,
+ folio_nid(folio), folio_memcg(folio));
+ list_lru_unlock_irqrestore(l, &flags);
+ rcu_read_unlock();
}
- split_queue_unlock_irqrestore(fqueue, flags);
- }
- folios_put(&fbatch);
-
- if (sc->nr_to_scan && !list_empty(&ds_queue->split_queue)) {
- cond_resched();
- goto retry;
+ folio_put(folio);
}
- /*
- * Stop shrinker if we didn't split any page, but the queue is empty.
- * This can happen if pages were freed under us.
- */
- if (!split && list_empty(&ds_queue->split_queue))
+ if (!split && !isolated)
return SHRINK_STOP;
return split;
}
-#ifdef CONFIG_MEMCG
-void reparent_deferred_split_queue(struct mem_cgroup *memcg)
-{
- struct mem_cgroup *parent = parent_mem_cgroup(memcg);
- struct deferred_split *ds_queue = &memcg->deferred_split_queue;
- struct deferred_split *parent_ds_queue = &parent->deferred_split_queue;
- int nid;
-
- spin_lock_irq(&ds_queue->split_queue_lock);
- spin_lock_nested(&parent_ds_queue->split_queue_lock, SINGLE_DEPTH_NESTING);
-
- if (!ds_queue->split_queue_len)
- goto unlock;
-
- list_splice_tail_init(&ds_queue->split_queue, &parent_ds_queue->split_queue);
- parent_ds_queue->split_queue_len += ds_queue->split_queue_len;
- ds_queue->split_queue_len = 0;
-
- for_each_node(nid)
- set_shrinker_bit(parent, nid, shrinker_id(deferred_split_shrinker));
-
-unlock:
- spin_unlock(&parent_ds_queue->split_queue_lock);
- spin_unlock_irq(&ds_queue->split_queue_lock);
-}
-#endif
-
#ifdef CONFIG_DEBUG_FS
static void split_huge_pages_all(void)
{
diff --git a/mm/internal.h b/mm/internal.h
index 95b583e7e4f7..71d2605f8040 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -857,7 +857,7 @@ static inline bool folio_unqueue_deferred_split(struct folio *folio)
/*
* At this point, there is no one trying to add the folio to
* deferred_list. If folio is not in deferred_list, it's safe
- * to check without acquiring the split_queue_lock.
+ * to check without acquiring the list_lru lock.
*/
if (data_race(list_empty(&folio->_deferred_list)))
return false;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index b7b4680d27ab..01fd3d5933c5 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1076,6 +1076,7 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
}
count_vm_event(THP_COLLAPSE_ALLOC);
+
if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
folio_put(folio);
*foliop = NULL;
@@ -1084,6 +1085,12 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1);
+ if (memcg_list_lru_alloc_folio(folio, &deferred_split_lru, gfp)) {
+ folio_put(folio);
+ *foliop = NULL;
+ return SCAN_CGROUP_CHARGE_FAIL;
+ }
+
*foliop = folio;
return SCAN_SUCCEED;
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a47fb68dd65f..f381cb6bdff1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4015,11 +4015,6 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
memcg->cgwb_frn[i].done =
__WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
-#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
- INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
- memcg->deferred_split_queue.split_queue_len = 0;
#endif
lru_gen_init_memcg(memcg);
return memcg;
@@ -4167,11 +4162,10 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
zswap_memcg_offline_cleanup(memcg);
memcg_offline_kmem(memcg);
- reparent_deferred_split_queue(memcg);
/*
- * The reparenting of objcg must be after the reparenting of the
- * list_lru and deferred_split_queue above, which ensures that they will
- * not mistakenly get the parent list_lru and deferred_split_queue.
+ * The reparenting of objcg must be after the reparenting of
+ * the list_lru in memcg_offline_kmem(), which ensures that
+ * they will not mistakenly get the parent list_lru.
*/
memcg_reparent_objcgs(memcg);
reparent_shrinker_deferred(memcg);
diff --git a/mm/memory.c b/mm/memory.c
index 38062f8e1165..4dad1a7890aa 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4651,13 +4651,19 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
while (orders) {
addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
folio = vma_alloc_folio(gfp, order, vma, addr);
- if (folio) {
- if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
- gfp, entry))
- return folio;
+ if (!folio)
+ goto next;
+ if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, gfp, entry)) {
count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
folio_put(folio);
+ goto next;
}
+ if (memcg_list_lru_alloc_folio(folio, &deferred_split_lru, gfp)) {
+ folio_put(folio);
+ goto fallback;
+ }
+ return folio;
+next:
count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK);
order = next_order(&orders, order);
}
@@ -5168,24 +5174,28 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
while (orders) {
addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
folio = vma_alloc_folio(gfp, order, vma, addr);
- if (folio) {
- if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
- count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
- folio_put(folio);
- goto next;
- }
- folio_throttle_swaprate(folio, gfp);
- /*
- * When a folio is not zeroed during allocation
- * (__GFP_ZERO not used) or user folios require special
- * handling, folio_zero_user() is used to make sure
- * that the page corresponding to the faulting address
- * will be hot in the cache after zeroing.
- */
- if (user_alloc_needs_zeroing())
- folio_zero_user(folio, vmf->address);
- return folio;
+ if (!folio)
+ goto next;
+ if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
+ count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
+ folio_put(folio);
+ goto next;
}
+ if (memcg_list_lru_alloc_folio(folio, &deferred_split_lru, gfp)) {
+ folio_put(folio);
+ goto fallback;
+ }
+ folio_throttle_swaprate(folio, gfp);
+ /*
+ * When a folio is not zeroed during allocation
+ * (__GFP_ZERO not used) or user folios require special
+ * handling, folio_zero_user() is used to make sure
+ * that the page corresponding to the faulting address
+ * will be hot in the cache after zeroing.
+ */
+ if (user_alloc_needs_zeroing())
+ folio_zero_user(folio, vmf->address);
+ return folio;
next:
count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
order = next_order(&orders, order);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index cec7bb758bdd..f293a62e652a 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1388,19 +1388,6 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat,
pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
}
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void pgdat_init_split_queue(struct pglist_data *pgdat)
-{
- struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
-
- spin_lock_init(&ds_queue->split_queue_lock);
- INIT_LIST_HEAD(&ds_queue->split_queue);
- ds_queue->split_queue_len = 0;
-}
-#else
-static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
-#endif
-
#ifdef CONFIG_COMPACTION
static void pgdat_init_kcompactd(struct pglist_data *pgdat)
{
@@ -1416,8 +1403,6 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
pgdat_resize_init(pgdat);
pgdat_kswapd_lock_init(pgdat);
-
- pgdat_init_split_queue(pgdat);
pgdat_init_kcompactd(pgdat);
init_waitqueue_head(&pgdat->kswapd_wait);
--
2.53.0
On 3/12/26 21:51, Johannes Weiner wrote:
> The deferred split queue handles cgroups in a suboptimal fashion. The
> queue is per-NUMA node or per-cgroup, not the intersection. That means
> on a cgrouped system, a node-restricted allocation entering reclaim
> can end up splitting large pages on other nodes:
>
> alloc/unmap
> deferred_split_folio()
> list_add_tail(memcg->split_queue)
> set_shrinker_bit(memcg, node, deferred_shrinker_id)
>
> for_each_zone_zonelist_nodemask(restricted_nodes)
> mem_cgroup_iter()
> shrink_slab(node, memcg)
> shrink_slab_memcg(node, memcg)
> if test_shrinker_bit(memcg, node, deferred_shrinker_id)
> deferred_split_scan()
> walks memcg->split_queue
>
> The shrinker bit adds an imperfect guard rail. As soon as the cgroup
> has a single large page on the node of interest, all large pages owned
> by that memcg, including those on other nodes, will be split.
>
> list_lru properly sets up per-node, per-cgroup lists. As a bonus, it
> streamlines a lot of the list operations and reclaim walks. It's used
> widely by other major shrinkers already. Convert the deferred split
> queue as well.
>
> The list_lru per-memcg heads are instantiated on demand when the first
> object of interest is allocated for a cgroup, by calling
> memcg_list_lru_alloc_folio(). Add calls to where splittable pages are
> created: anon faults, swapin faults, khugepaged collapse.
>
> These calls create all possible node heads for the cgroup at once, so
> the migration code (between nodes) doesn't need any special care.
[...]
> -
> static inline bool is_transparent_hugepage(const struct folio *folio)
> {
> if (!folio_test_large(folio))
> @@ -1293,6 +1189,14 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
> count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> return NULL;
> }
> +
> + if (memcg_list_lru_alloc_folio(folio, &deferred_split_lru, gfp)) {
> + folio_put(folio);
> + count_vm_event(THP_FAULT_FALLBACK);
> + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
> + return NULL;
> + }
So, in all anon alloc paths, we essentially have
1) vma_alloc_folio / __folio_alloc (khugepaged being odd)
2) mem_cgroup_charge / mem_cgroup_swapin_charge_folio
3) memcg_list_lru_alloc_folio
I wonder if we could do better in most cases and have something like a
vma_alloc_anon_folio()
That wraps the vma_alloc_folio() + memcg_list_lru_alloc_folio(), but
still leaves the charging to the caller?
That would at least combine 1) and 3) in a single API. (except for the
odd cases without a VMA).
I guess we would want to skip the memcg_list_lru_alloc_folio() for
order-0 folios, correct?
> +
> folio_throttle_swaprate(folio, gfp);
>
> /*
> @@ -3802,33 +3706,28 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
> struct folio *new_folio, *next;
> int old_order = folio_order(folio);
> int ret = 0;
> - struct deferred_split *ds_queue;
> + struct list_lru_one *l;
>
> VM_WARN_ON_ONCE(!mapping && end);
> /* Prevent deferred_split_scan() touching ->_refcount */
> - ds_queue = folio_split_queue_lock(folio);
> + rcu_read_lock();
The RCU lock is for the folio_memcg(), right?
I recall I raised in the past that some get/put-like logic (that wraps
the rcu_read_lock() + folio_memcg()) might make this a lot easier to get.
memcg = folio_memcg_lookup(folio)
... do stuff
folio_memcg_putback(folio, memcg);
Or sth like that.
Alternativey, you could have some helpers that do the
list_lru_lock+unlock etc.
folio_memcg_list_lru_lock()
...
folio_memcg_list_lru_unlock(l);
Just some thoughts as inspiration :)
> + l = list_lru_lock(&deferred_split_lru, folio_nid(folio), folio_memcg(folio));
> if (folio_ref_freeze(folio, folio_cache_ref_count(folio) + 1)) {
> struct swap_cluster_info *ci = NULL;
> struct lruvec *lruvec;
>
> if (old_order > 1) {
> - if (!list_empty(&folio->_deferred_list)) {
> - ds_queue->split_queue_len--;
> - /*
> - * Reinitialize page_deferred_list after removing the
> - * page from the split_queue, otherwise a subsequent
> - * split will see list corruption when checking the
> - * page_deferred_list.
> - */
> - list_del_init(&folio->_deferred_list);
> - }
> + __list_lru_del(&deferred_split_lru, l,
> + &folio->_deferred_list, folio_nid(folio));
> if (folio_test_partially_mapped(folio)) {
> folio_clear_partially_mapped(folio);
> mod_mthp_stat(old_order,
> MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
> }
> }
> - split_queue_unlock(ds_queue);
> + list_lru_unlock(l);
> + rcu_read_unlock();
> +
> if (mapping) {
[...]
Most changes here look mostly mechanical, quite nice. I'll probably
have to go over some bits once again with a fresh mind :)
--
Cheers,
David
On Wed, Mar 18, 2026 at 09:25:17PM +0100, David Hildenbrand (Arm) wrote:
> On 3/12/26 21:51, Johannes Weiner wrote:
> > The deferred split queue handles cgroups in a suboptimal fashion. The
> > queue is per-NUMA node or per-cgroup, not the intersection. That means
> > on a cgrouped system, a node-restricted allocation entering reclaim
> > can end up splitting large pages on other nodes:
> >
> > alloc/unmap
> > deferred_split_folio()
> > list_add_tail(memcg->split_queue)
> > set_shrinker_bit(memcg, node, deferred_shrinker_id)
> >
> > for_each_zone_zonelist_nodemask(restricted_nodes)
> > mem_cgroup_iter()
> > shrink_slab(node, memcg)
> > shrink_slab_memcg(node, memcg)
> > if test_shrinker_bit(memcg, node, deferred_shrinker_id)
> > deferred_split_scan()
> > walks memcg->split_queue
> >
> > The shrinker bit adds an imperfect guard rail. As soon as the cgroup
> > has a single large page on the node of interest, all large pages owned
> > by that memcg, including those on other nodes, will be split.
> >
> > list_lru properly sets up per-node, per-cgroup lists. As a bonus, it
> > streamlines a lot of the list operations and reclaim walks. It's used
> > widely by other major shrinkers already. Convert the deferred split
> > queue as well.
> >
> > The list_lru per-memcg heads are instantiated on demand when the first
> > object of interest is allocated for a cgroup, by calling
> > memcg_list_lru_alloc_folio(). Add calls to where splittable pages are
> > created: anon faults, swapin faults, khugepaged collapse.
> >
> > These calls create all possible node heads for the cgroup at once, so
> > the migration code (between nodes) doesn't need any special care.
>
>
> [...]
>
> > -
> > static inline bool is_transparent_hugepage(const struct folio *folio)
> > {
> > if (!folio_test_large(folio))
> > @@ -1293,6 +1189,14 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
> > count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> > return NULL;
> > }
> > +
> > + if (memcg_list_lru_alloc_folio(folio, &deferred_split_lru, gfp)) {
> > + folio_put(folio);
> > + count_vm_event(THP_FAULT_FALLBACK);
> > + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
> > + return NULL;
> > + }
>
> So, in all anon alloc paths, we essentially have
>
> 1) vma_alloc_folio / __folio_alloc (khugepaged being odd)
> 2) mem_cgroup_charge / mem_cgroup_swapin_charge_folio
> 3) memcg_list_lru_alloc_folio
>
> I wonder if we could do better in most cases and have something like a
>
> vma_alloc_anon_folio()
>
> That wraps the vma_alloc_folio() + memcg_list_lru_alloc_folio(), but
> still leaves the charging to the caller?
Hm, but it's the charging that figures out the memcg and sets
folio_memcg() :(
> That would at least combine 1) and 3) in a single API. (except for the
> odd cases without a VMA).
>
> I guess we would want to skip the memcg_list_lru_alloc_folio() for
> order-0 folios, correct?
Yeah, we don't use the queue for < order-1. In deferred_split_folio():
/*
* Order 1 folios have no space for a deferred list, but we also
* won't waste much memory by not adding them to the deferred list.
*/
if (folio_order(folio) <= 1)
return;
> > @@ -3802,33 +3706,28 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
> > struct folio *new_folio, *next;
> > int old_order = folio_order(folio);
> > int ret = 0;
> > - struct deferred_split *ds_queue;
> > + struct list_lru_one *l;
> >
> > VM_WARN_ON_ONCE(!mapping && end);
> > /* Prevent deferred_split_scan() touching ->_refcount */
> > - ds_queue = folio_split_queue_lock(folio);
> > + rcu_read_lock();
>
> The RCU lock is for the folio_memcg(), right?
>
> I recall I raised in the past that some get/put-like logic (that wraps
> the rcu_read_lock() + folio_memcg()) might make this a lot easier to get.
>
>
> memcg = folio_memcg_lookup(folio)
>
> ... do stuff
>
> folio_memcg_putback(folio, memcg);
>
> Or sth like that.
>
>
> Alternatively, you could have some helpers that do the
> list_lru_lock+unlock etc.
>
> folio_memcg_list_lru_lock()
> ...
> folio_memcg_list_lru_unlock(l);
>
> Just some thoughts as inspiration :)
I remember you raising this in the objcg + reparenting patches. There
are a few more instances of
rcu_read_lock()
foo = folio_memcg()
...
rcu_read_unlock()
in other parts of the code not touched by these patches here, so the
first pattern is a more universal encapsulation.
Let me look into this. Would you be okay with a follow-up that covers
the others as well?
>> >> So, in all anon alloc paths, we essentialy have >> >> 1) vma_alloc_folio / __folio_alloc (khugepaged being odd) >> 2) mem_cgroup_charge / mem_cgroup_swapin_charge_folio >> 3) memcg_list_lru_alloc_folio >> >> I wonder if we could do better in most cases and have something like a >> >> vma_alloc_anon_folio() >> >> That wraps the vma_alloc_folio() + memcg_list_lru_alloc_folio(), but >> still leaves the charging to the caller? > > Hm, but it's the charging that figures out the memcg and sets > folio_memcg() :( Oh ... right. I guess we would then have to do all 3 things at the same time, which makes the helper a bit more involved. I'll note that collapse_file() also calls alloc_charge_folio(), but not for allocating an anonymous folio that would have to be placed on the deferred split queue. > >> The would at least combine 1) and 3) in a single API. (except for the >> odd cases without a VMA). >> >> I guess we would want to skip the memcg_list_lru_alloc_folio() for >> order-0 folios, correct? > > Yeah, we don't use the queue for < order-1. In deferred_split_folio(): > > /* > * Order 1 folios have no space for a deferred list, but we also > * won't waste much memory by not adding them to the deferred list. > */ > if (folio_order(folio) <= 1) > return; > >>> @@ -3802,33 +3706,28 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n >>> struct folio *new_folio, *next; >>> int old_order = folio_order(folio); >>> int ret = 0; >>> - struct deferred_split *ds_queue; >>> + struct list_lru_one *l; >>> >>> VM_WARN_ON_ONCE(!mapping && end); >>> /* Prevent deferred_split_scan() touching ->_refcount */ >>> - ds_queue = folio_split_queue_lock(folio); >>> + rcu_read_lock(); >> >> The RCU lock is for the folio_memcg(), right? >> >> I recall I raised in the past that some get/put-like logic (that wraps >> the rcu_read_lock() + folio_memcg()) might make this a lot easier to get. >> >> >> memcg = folio_memcg_lookup(folio) >> >> ... 
do stuff >> >> folio_memcg_putback(folio, memcg); >> >> Or sth like that. >> >> >> Alternativey, you could have some helpers that do the >> list_lru_lock+unlock etc. >> >> folio_memcg_list_lru_lock() >> ... >> folio_memcg_list_ru_unlock(l); >> >> Just some thoughts as inspiration :) > > I remember you raising this in the objcg + reparenting patches. There > are a few more instances of > > rcu_read_lock() > foo = folio_memcg() > ... > rcu_read_unlock() > > in other parts of the code not touched by these patches here, so the > first pattern is a more universal encapsulation. > > Let me look into this. Would you be okay with a follow-up that covers > the others as well? Of course :) If list_lru lock helpers would be the right thing to do, it might be better placed in this series. -- Cheers, David
On Thu, Mar 19, 2026 at 08:21:21AM +0100, David Hildenbrand (Arm) wrote:
> Of course :) If list_lru lock helpers would be the right thing to do, it
> might be better placed in this series.
I think this is slightly more promising. See below. The callsites in
huge_memory.c look nicer. But the double folio_nid() and folio_memcg()
lookups (when the caller needs them too) are kind of unfortunate; and
it feels like a lot of API for 4 callsites. Thoughts?
include/linux/list_lru.h | 8 ++++++++
mm/huge_memory.c | 43 +++++++++++++++----------------------------
mm/list_lru.c | 29 +++++++++++++++++++++++++++++
3 files changed, 52 insertions(+), 28 deletions(-)
diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index 4bd29b61c59a..6b734d08fa1b 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -123,6 +123,14 @@ struct list_lru_one *list_lru_lock_irqsave(struct list_lru *lru, int nid,
void list_lru_unlock_irqrestore(struct list_lru_one *l,
unsigned long *irq_flags);
+struct list_lru_one *folio_list_lru_lock(struct folio *folio,
+ struct list_lru *lru);
+void folio_list_lru_unlock(struct folio *folio, struct list_lru_one *l);
+struct list_lru_one *folio_list_lru_lock_irqsave(struct folio *folio,
+ struct list_lru *lru, unsigned long *flags);
+void folio_list_lru_unlock_irqrestore(struct folio *folio,
+ struct list_lru_one *l, unsigned long *flags);
+
/* Caller-locked variants, see list_lru_add() etc for documentation */
bool __list_lru_add(struct list_lru *lru, struct list_lru_one *l,
struct list_head *item, int nid, struct mem_cgroup *memcg);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e90d08db219d..6996ef224e24 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3768,11 +3768,8 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
VM_WARN_ON_ONCE(!mapping && end);
/* Prevent deferred_split_scan() touching ->_refcount */
dequeue_deferred = folio_test_anon(folio) && old_order > 1;
- if (dequeue_deferred) {
- rcu_read_lock();
- l = list_lru_lock(&deferred_split_lru,
- folio_nid(folio), folio_memcg(folio));
- }
+ if (dequeue_deferred)
+ l = folio_list_lru_lock(folio, &deferred_split_lru);
if (folio_ref_freeze(folio, folio_cache_ref_count(folio) + 1)) {
struct swap_cluster_info *ci = NULL;
struct lruvec *lruvec;
@@ -3785,8 +3782,7 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
mod_mthp_stat(old_order,
MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
}
- list_lru_unlock(l);
- rcu_read_unlock();
+ folio_list_lru_unlock(folio, l);
}
if (mapping) {
@@ -3889,10 +3885,8 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
if (ci)
swap_cluster_unlock(ci);
} else {
- if (dequeue_deferred) {
- list_lru_unlock(l);
- rcu_read_unlock();
- }
+ if (dequeue_deferred)
+ folio_list_lru_unlock(folio, l);
return -EAGAIN;
}
@@ -4276,8 +4270,7 @@ bool __folio_unqueue_deferred_split(struct folio *folio)
WARN_ON_ONCE(folio_ref_count(folio));
WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg_charged(folio));
- rcu_read_lock();
- l = list_lru_lock_irqsave(&deferred_split_lru, nid, folio_memcg(folio), &flags);
+ l = folio_list_lru_lock_irqsave(folio, &deferred_split_lru, &flags);
if (__list_lru_del(&deferred_split_lru, l, &folio->_deferred_list, nid)) {
if (folio_test_partially_mapped(folio)) {
folio_clear_partially_mapped(folio);
@@ -4286,7 +4279,7 @@ bool __folio_unqueue_deferred_split(struct folio *folio)
}
unqueued = true;
}
- list_lru_unlock_irqrestore(l, &flags);
+ folio_list_lru_unlock_irqrestore(folio, l, &flags);
rcu_read_unlock();
return unqueued; /* useful for debug warnings */
@@ -4297,7 +4290,6 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
{
struct list_lru_one *l;
int nid;
- struct mem_cgroup *memcg;
unsigned long flags;
/*
@@ -4322,9 +4314,7 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
nid = folio_nid(folio);
- rcu_read_lock();
- memcg = folio_memcg(folio);
- l = list_lru_lock_irqsave(&deferred_split_lru, nid, memcg, &flags);
+ l = folio_list_lru_lock_irqsave(folio, &deferred_split_lru, &flags);
if (partially_mapped) {
if (!folio_test_partially_mapped(folio)) {
folio_set_partially_mapped(folio);
@@ -4337,9 +4327,9 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
/* partially mapped folios cannot become non-partially mapped */
VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
}
- __list_lru_add(&deferred_split_lru, l, &folio->_deferred_list, nid, memcg);
- list_lru_unlock_irqrestore(l, &flags);
- rcu_read_unlock();
+ __list_lru_add(&deferred_split_lru, l, &folio->_deferred_list, nid,
+ folio_memcg(folio));
+ folio_list_lru_unlock_irqrestore(folio, l, &flags);
}
static unsigned long deferred_split_count(struct shrinker *shrink,
@@ -4445,16 +4435,13 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
* don't add it back to split_queue.
*/
if (!did_split && folio_test_partially_mapped(folio)) {
- rcu_read_lock();
- l = list_lru_lock_irqsave(&deferred_split_lru,
- folio_nid(folio),
- folio_memcg(folio),
- &flags);
+ l = folio_list_lru_lock_irqsave(folio,
+ &deferred_split_lru,
+ &flags);
__list_lru_add(&deferred_split_lru, l,
&folio->_deferred_list,
folio_nid(folio), folio_memcg(folio));
- list_lru_unlock_irqrestore(l, &flags);
- rcu_read_unlock();
+ folio_list_lru_unlock_irqrestore(folio, l, &flags);
}
folio_put(folio);
}
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 1ccdd45b1d14..8d50741ef18d 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -173,6 +173,35 @@ void list_lru_unlock_irqrestore(struct list_lru_one *l, unsigned long *flags)
unlock_list_lru(l, /*irq_off=*/true, /*irq_flags=*/flags);
}
+struct list_lru_one *folio_list_lru_lock(struct folio *folio, struct list_lru *lru)
+{
+ rcu_read_lock();
+ return list_lru_lock(lru, folio_nid(folio), folio_memcg(folio));
+}
+
+void folio_list_lru_unlock(struct folio *folio, struct list_lru_one *l)
+{
+ list_lru_unlock(l);
+ rcu_read_unlock();
+}
+
+struct list_lru_one *folio_list_lru_lock_irqsave(struct folio *folio,
+ struct list_lru *lru,
+ unsigned long *flags)
+{
+ rcu_read_lock();
+ return list_lru_lock_irqsave(lru, folio_nid(folio),
+ folio_memcg(folio), flags);
+}
+
+void folio_list_lru_unlock_irqrestore(struct folio *folio,
+ struct list_lru_one *l,
+ unsigned long *flags)
+{
+ list_lru_unlock_irqrestore(l, flags);
+ rcu_read_unlock();
+}
+
bool __list_lru_add(struct list_lru *lru, struct list_lru_one *l,
struct list_head *item, int nid,
struct mem_cgroup *memcg)
On 3/20/26 17:07, Johannes Weiner wrote: > On Thu, Mar 19, 2026 at 08:21:21AM +0100, David Hildenbrand (Arm) wrote: >> Of course :) If list_lru lock helpers would be the right thing to do, it >> might be better placed in this series. > > I think this is slightly more promising. See below. The callsites in > huge_memory.c look nicer. But the double folio_nid() and folio_memcg() > lookups (when the caller needs them too) are kind of unfortunate; and > it feels like a lot of API for 4 callsites. Thoughts? I like that. Could we just put the implementation inline into list_lru.h, such that the compiler could try reusing nid+memcg? [...] > __list_lru_add(&deferred_split_lru, l, > &folio->_deferred_list, > folio_nid(folio), folio_memcg(folio)); > - list_lru_unlock_irqrestore(l, &flags); > - rcu_read_unlock(); > + folio_list_lru_unlock_irqrestore(folio, l, &flags); I guess it would look even cleaner if we would wrap the __list_lru_add() that needs the memcg + nid in an own helper. These are the only remaining users of memcg+nid, right? -- Cheers, David
On Thu, Mar 19, 2026 at 08:21:21AM +0100, David Hildenbrand (Arm) wrote:
> >>> @@ -3802,33 +3706,28 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
> >>> struct folio *new_folio, *next;
> >>> int old_order = folio_order(folio);
> >>> int ret = 0;
> >>> - struct deferred_split *ds_queue;
> >>> + struct list_lru_one *l;
> >>>
> >>> VM_WARN_ON_ONCE(!mapping && end);
> >>> /* Prevent deferred_split_scan() touching ->_refcount */
> >>> - ds_queue = folio_split_queue_lock(folio);
> >>> + rcu_read_lock();
> >>
> >> The RCU lock is for the folio_memcg(), right?
> >>
> >> I recall I raised in the past that some get/put-like logic (that wraps
> >> the rcu_read_lock() + folio_memcg()) might make this a lot easier to get.
> >>
> >>
> >> memcg = folio_memcg_lookup(folio)
> >>
> >> ... do stuff
> >>
> >> folio_memcg_putback(folio, memcg);
> >>
> >> Or sth like that.
> >>
> >>
> >> Alternatively, you could have some helpers that do the
> >> list_lru_lock+unlock etc.
> >>
> >> folio_memcg_list_lru_lock()
> >> ...
> >> folio_memcg_list_lru_unlock(l);
> >>
> >> Just some thoughts as inspiration :)
> >
> > I remember you raising this in the objcg + reparenting patches. There
> > are a few more instances of
> >
> > rcu_read_lock()
> > foo = folio_memcg()
> > ...
> > rcu_read_unlock()
> >
> > in other parts of the code not touched by these patches here, so the
> > first pattern is a more universal encapsulation.
> >
> > Let me look into this. Would you be okay with a follow-up that covers
> > the others as well?
>
> Of course :) If list_lru lock helpers would be the right thing to do, it
> might be better placed in this series.
I'm playing around with the below. But there are a few things that
seem suboptimal:
- We need a local @memcg, which makes sites that just pass
folio_memcg() somewhere else fatter. More site LOC on average.
- Despite being more verbose, it communicates less. rcu_read_lock()
is universally understood, folio_memcg_foo() is cryptic.
- It doesn't cover similar accessors with the same lifetime rules,
like folio_lruvec(), folio_memcg_check()
include/linux/memcontrol.h | 35 ++++++++++++++++++++++++++---------
mm/huge_memory.c | 34 ++++++++++++++++++----------------
mm/list_lru.c | 5 ++---
mm/memcontrol.c | 17 +++++++----------
mm/migrate.c | 5 ++---
mm/page_io.c | 12 ++++++------
mm/vmscan.c | 7 ++++---
mm/workingset.c | 5 ++---
mm/zswap.c | 11 ++++++-----
9 files changed, 73 insertions(+), 58 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0782c72a1997..5162145b9322 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -430,6 +430,17 @@ static inline struct mem_cgroup *folio_memcg(struct folio *folio)
return objcg ? obj_cgroup_memcg(objcg) : NULL;
}
+static inline struct mem_cgroup *folio_memcg_begin(struct folio *folio)
+{
+ rcu_read_lock();
+ return folio_memcg(folio);
+}
+
+static inline void folio_memcg_end(void)
+{
+ rcu_read_unlock();
+}
+
/*
* folio_memcg_charged - If a folio is charged to a memory cgroup.
* @folio: Pointer to the folio.
@@ -917,11 +928,10 @@ static inline void mod_memcg_page_state(struct page *page,
if (mem_cgroup_disabled())
return;
- rcu_read_lock();
- memcg = folio_memcg(page_folio(page));
+ memcg = folio_memcg_begin(page_folio(page));
if (memcg)
mod_memcg_state(memcg, idx, val);
- rcu_read_unlock();
+ folio_memcg_end();
}
unsigned long memcg_events(struct mem_cgroup *memcg, int event);
@@ -949,10 +959,9 @@ static inline void count_memcg_folio_events(struct folio *folio,
if (!folio_memcg_charged(folio))
return;
- rcu_read_lock();
- memcg = folio_memcg(folio);
+ memcg = folio_memcg_begin(folio);
count_memcg_events(memcg, idx, nr);
- rcu_read_unlock();
+ folio_memcg_end();
}
static inline void count_memcg_events_mm(struct mm_struct *mm,
@@ -1035,6 +1044,15 @@ static inline struct mem_cgroup *folio_memcg(struct folio *folio)
return NULL;
}
+static inline struct mem_cgroup *folio_memcg_begin(struct folio *folio)
+{
+ return NULL;
+}
+
+static inline void folio_memcg_end(void)
+{
+}
+
static inline bool folio_memcg_charged(struct folio *folio)
{
return false;
@@ -1546,11 +1564,10 @@ static inline void mem_cgroup_track_foreign_dirty(struct folio *folio,
if (!folio_memcg_charged(folio))
return;
- rcu_read_lock();
- memcg = folio_memcg(folio);
+ memcg = folio_memcg_begin(folio);
if (unlikely(&memcg->css != wb->memcg_css))
mem_cgroup_track_foreign_dirty_slowpath(folio, wb);
- rcu_read_unlock();
+ folio_memcg_end();
}
void mem_cgroup_flush_foreign(struct bdi_writeback *wb);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e90d08db219d..1aa20c1dd0c1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3769,9 +3769,10 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
/* Prevent deferred_split_scan() touching ->_refcount */
dequeue_deferred = folio_test_anon(folio) && old_order > 1;
if (dequeue_deferred) {
- rcu_read_lock();
- l = list_lru_lock(&deferred_split_lru,
- folio_nid(folio), folio_memcg(folio));
+ struct mem_cgroup *memcg;
+
+ memcg = folio_memcg_begin(folio);
+ l = list_lru_lock(&deferred_split_lru, folio_nid(folio), memcg);
}
if (folio_ref_freeze(folio, folio_cache_ref_count(folio) + 1)) {
struct swap_cluster_info *ci = NULL;
@@ -3786,7 +3787,7 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
}
list_lru_unlock(l);
- rcu_read_unlock();
+ folio_memcg_end();
}
if (mapping) {
@@ -3891,7 +3892,7 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
} else {
if (dequeue_deferred) {
list_lru_unlock(l);
- rcu_read_unlock();
+ folio_memcg_end();
}
return -EAGAIN;
}
@@ -4272,12 +4273,13 @@ bool __folio_unqueue_deferred_split(struct folio *folio)
int nid = folio_nid(folio);
unsigned long flags;
bool unqueued = false;
+ struct mem_cgroup *memcg;
WARN_ON_ONCE(folio_ref_count(folio));
WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg_charged(folio));
- rcu_read_lock();
- l = list_lru_lock_irqsave(&deferred_split_lru, nid, folio_memcg(folio), &flags);
+ memcg = folio_memcg_begin(folio);
+ l = list_lru_lock_irqsave(&deferred_split_lru, nid, memcg, &flags);
if (__list_lru_del(&deferred_split_lru, l, &folio->_deferred_list, nid)) {
if (folio_test_partially_mapped(folio)) {
folio_clear_partially_mapped(folio);
@@ -4287,7 +4289,7 @@ bool __folio_unqueue_deferred_split(struct folio *folio)
unqueued = true;
}
list_lru_unlock_irqrestore(l, &flags);
- rcu_read_unlock();
+ folio_memcg_end();
return unqueued; /* useful for debug warnings */
}
@@ -4322,8 +4324,7 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
nid = folio_nid(folio);
- rcu_read_lock();
- memcg = folio_memcg(folio);
+ memcg = folio_memcg_begin(folio);
l = list_lru_lock_irqsave(&deferred_split_lru, nid, memcg, &flags);
if (partially_mapped) {
if (!folio_test_partially_mapped(folio)) {
@@ -4339,7 +4340,7 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
}
__list_lru_add(&deferred_split_lru, l, &folio->_deferred_list, nid, memcg);
list_lru_unlock_irqrestore(l, &flags);
- rcu_read_unlock();
+ folio_memcg_end();
}
static unsigned long deferred_split_count(struct shrinker *shrink,
@@ -4445,16 +4446,17 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
* don't add it back to split_queue.
*/
if (!did_split && folio_test_partially_mapped(folio)) {
- rcu_read_lock();
+ struct mem_cgroup *memcg;
+
+ memcg = folio_memcg_begin(folio);
l = list_lru_lock_irqsave(&deferred_split_lru,
- folio_nid(folio),
- folio_memcg(folio),
+ folio_nid(folio), memcg,
&flags);
__list_lru_add(&deferred_split_lru, l,
&folio->_deferred_list,
- folio_nid(folio), folio_memcg(folio));
+ folio_nid(folio), memcg);
list_lru_unlock_irqrestore(l, &flags);
- rcu_read_unlock();
+ folio_memcg_end();
}
folio_put(folio);
}
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 1ccdd45b1d14..638d084bb0f5 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -604,10 +604,9 @@ int folio_memcg_list_lru_alloc(struct folio *folio, struct list_lru *lru,
return 0;
/* Fast path when list_lru heads already exist */
- rcu_read_lock();
- memcg = folio_memcg(folio);
+ memcg = folio_memcg_begin(folio);
res = memcg_list_lru_allocated(memcg, lru);
- rcu_read_unlock();
+ folio_memcg_end();
if (likely(res))
return 0;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f381cb6bdff1..14732f1542f2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -965,18 +965,17 @@ void lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx,
pg_data_t *pgdat = folio_pgdat(folio);
struct lruvec *lruvec;
- rcu_read_lock();
- memcg = folio_memcg(folio);
+ memcg = folio_memcg_begin(folio);
/* Untracked pages have no memcg, no lruvec. Update only the node */
if (!memcg) {
- rcu_read_unlock();
+ folio_memcg_end();
mod_node_page_state(pgdat, idx, val);
return;
}
lruvec = mem_cgroup_lruvec(memcg, pgdat);
mod_lruvec_state(lruvec, idx, val);
- rcu_read_unlock();
+ folio_memcg_end();
}
EXPORT_SYMBOL(lruvec_stat_mod_folio);
@@ -1170,11 +1169,10 @@ struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio)
if (!folio_memcg_charged(folio))
return root_mem_cgroup;
- rcu_read_lock();
do {
- memcg = folio_memcg(folio);
+ memcg = folio_memcg_begin(folio);
} while (unlikely(!css_tryget(&memcg->css)));
- rcu_read_unlock();
+ folio_memcg_end();
return memcg;
}
@@ -5535,8 +5533,7 @@ bool mem_cgroup_swap_full(struct folio *folio)
if (do_memsw_account() || !folio_memcg_charged(folio))
return ret;
- rcu_read_lock();
- memcg = folio_memcg(folio);
+ memcg = folio_memcg_begin(folio);
for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
unsigned long usage = page_counter_read(&memcg->swap);
@@ -5546,7 +5543,7 @@ bool mem_cgroup_swap_full(struct folio *folio)
break;
}
}
- rcu_read_unlock();
+ folio_memcg_end();
return ret;
}
diff --git a/mm/migrate.c b/mm/migrate.c
index fdbb20163f66..a2d542ebf3ed 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -672,8 +672,7 @@ static int __folio_migrate_mapping(struct address_space *mapping,
struct lruvec *old_lruvec, *new_lruvec;
struct mem_cgroup *memcg;
- rcu_read_lock();
- memcg = folio_memcg(folio);
+ memcg = folio_memcg_begin(folio);
old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
@@ -700,7 +699,7 @@ static int __folio_migrate_mapping(struct address_space *mapping,
mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr);
__mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr);
}
- rcu_read_unlock();
+ folio_memcg_end();
}
local_irq_enable();
diff --git a/mm/page_io.c b/mm/page_io.c
index 63b262f4c5a9..862135a65848 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -239,6 +239,7 @@ static void swap_zeromap_folio_clear(struct folio *folio)
*/
int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
{
+ struct mem_cgroup *memcg;
int ret = 0;
if (folio_free_swap(folio))
@@ -277,13 +278,13 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
goto out_unlock;
}
- rcu_read_lock();
- if (!mem_cgroup_zswap_writeback_enabled(folio_memcg(folio))) {
+ memcg = folio_memcg_begin(folio);
+ if (!mem_cgroup_zswap_writeback_enabled(memcg)) {
rcu_read_unlock();
folio_mark_dirty(folio);
return AOP_WRITEPAGE_ACTIVATE;
}
- rcu_read_unlock();
+ folio_memcg_end();
__swap_writepage(folio, swap_plug);
return 0;
@@ -314,11 +315,10 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct folio *folio)
if (!folio_memcg_charged(folio))
return;
- rcu_read_lock();
- memcg = folio_memcg(folio);
+ memcg = folio_memcg_begin(folio);
css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys);
bio_associate_blkg_from_css(bio, css);
- rcu_read_unlock();
+ folio_memcg_end();
}
#else
#define bio_associate_blkg_from_page(bio, folio) do { } while (0)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 33287ba4a500..12ad40fa7d60 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3407,6 +3407,7 @@ static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
struct pglist_data *pgdat)
{
struct folio *folio = pfn_folio(pfn);
+ struct mem_cgroup *this_memcg;
if (folio_lru_gen(folio) < 0)
return NULL;
@@ -3414,10 +3415,10 @@ static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
if (folio_nid(folio) != pgdat->node_id)
return NULL;
- rcu_read_lock();
- if (folio_memcg(folio) != memcg)
+ this_memcg = folio_memcg_begin(folio);
+ if (this_memcg != memcg)
folio = NULL;
- rcu_read_unlock();
+ folio_memcg_end();
return folio;
}
diff --git a/mm/workingset.c b/mm/workingset.c
index 07e6836d0502..77bfec58b797 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -251,8 +251,7 @@ static void *lru_gen_eviction(struct folio *folio)
BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH >
BITS_PER_LONG - max(EVICTION_SHIFT, EVICTION_SHIFT_ANON));
- rcu_read_lock();
- memcg = folio_memcg(folio);
+ memcg = folio_memcg_begin(folio);
lruvec = mem_cgroup_lruvec(memcg, pgdat);
lrugen = &lruvec->lrugen;
min_seq = READ_ONCE(lrugen->min_seq[type]);
@@ -261,7 +260,7 @@ static void *lru_gen_eviction(struct folio *folio)
hist = lru_hist_from_seq(min_seq);
atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
memcg_id = mem_cgroup_private_id(memcg);
- rcu_read_unlock();
+ folio_memcg_end();
return pack_shadow(memcg_id, pgdat, token, workingset, type);
}
diff --git a/mm/zswap.c b/mm/zswap.c
index 4f2e652e8ad3..fb035dd70d8b 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -895,14 +895,15 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
* to the active LRU list in the case.
*/
if (comp_ret || !dlen || dlen >= PAGE_SIZE) {
- rcu_read_lock();
- if (!mem_cgroup_zswap_writeback_enabled(
- folio_memcg(page_folio(page)))) {
- rcu_read_unlock();
+ struct mem_cgroup *memcg;
+
+ memcg = folio_memcg_begin(page_folio(page));
+ if (!mem_cgroup_zswap_writeback_enabled(memcg)) {
+ folio_memcg_end();
comp_ret = comp_ret ? comp_ret : -EINVAL;
goto unlock;
}
- rcu_read_unlock();
+ folio_memcg_end();
comp_ret = 0;
dlen = PAGE_SIZE;
dst = kmap_local_page(page);
On 3/20/26 17:02, Johannes Weiner wrote: > On Thu, Mar 19, 2026 at 08:21:21AM +0100, David Hildenbrand (Arm) wrote: >>> >>> I remember you raising this in the objcg + reparenting patches. There >>> are a few more instances of >>> >>> rcu_read_lock() >>> foo = folio_memcg() >>> ... >>> rcu_read_unlock() >>> >>> in other parts of the code not touched by these patches here, so the >>> first pattern is a more universal encapsulation. >>> >>> Let me look into this. Would you be okay with a follow-up that covers >>> the others as well? >> >> Of course :) If list_lru lock helpers would be the right thing to do, it >> might be better placed in this series. > > I'm playing around with the below. But there are a few things that > seem suboptimal: I like that as well (and could even be applied on top of the other proposal later). > > - We need a local @memcg, which makes sites that just pass > folio_memcg() somewhere else fatter. More site LOC on average. The LOC is really mostly just from the helper functions IIUC. > - Despite being more verbose, it communicates less. rcu_read_lock() > is universally understood, folio_memcg_foo() is cryptic. begin/end is pretty clear IMHO. Not sure about the "cryptic" part. Taste differs I guess. :) > - It doesn't cover similar accessors with the same lifetime rules, > like folio_lruvec(), folio_memcg_check() I think it gets inetresting once the RCU would implicitly protect other stuff as well. And that would be my point: from the code alone it's not quite clear what the RCU actually protects and what new code should be using. But I won't push for that if you/others prefer to spell out the RCU stuff :) Thanks for playing with the code! -- Cheers, David
© 2016 - 2026 Red Hat, Inc.