[PATCH v3 7/7] mm: switch deferred split shrinker to list_lru

Johannes Weiner posted 7 patches 2 weeks, 4 days ago
[PATCH v3 7/7] mm: switch deferred split shrinker to list_lru
Posted by Johannes Weiner 2 weeks, 4 days ago
The deferred split queue handles cgroups in a suboptimal fashion. The
queue is per-NUMA node or per-cgroup, not the intersection. That means
on a cgrouped system, a node-restricted allocation entering reclaim
can end up splitting large pages on other nodes:

	alloc/unmap
	  deferred_split_folio()
	    list_add_tail(memcg->split_queue)
	    set_shrinker_bit(memcg, node, deferred_shrinker_id)

	for_each_zone_zonelist_nodemask(restricted_nodes)
	  mem_cgroup_iter()
	    shrink_slab(node, memcg)
	      shrink_slab_memcg(node, memcg)
	        if test_shrinker_bit(memcg, node, deferred_shrinker_id)
	          deferred_split_scan()
	            walks memcg->split_queue

The shrinker bit adds an imperfect guard rail. As soon as the cgroup
has a single large page on the node of interest, all large pages owned
by that memcg, including those on other nodes, will be split.

list_lru properly sets up per-node, per-cgroup lists. As a bonus, it
streamlines a lot of the list operations and reclaim walks. It's used
widely by other major shrinkers already. Convert the deferred split
queue as well.

The list_lru per-memcg heads are instantiated on demand when the first
object of interest is allocated for a cgroup, by calling
folio_memcg_list_lru_alloc(). Add calls to where splittable pages are
created: anon faults, swapin faults, khugepaged collapse.

These calls create all possible node heads for the cgroup at once, so
the migration code (between nodes) doesn't need any special care.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 include/linux/huge_mm.h    |   6 +-
 include/linux/memcontrol.h |   4 -
 include/linux/mmzone.h     |  12 --
 mm/huge_memory.c           | 342 ++++++++++++-------------------------
 mm/internal.h              |   2 +-
 mm/khugepaged.c            |   7 +
 mm/memcontrol.c            |  12 +-
 mm/memory.c                |  52 +++---
 mm/mm_init.c               |  15 --
 9 files changed, 151 insertions(+), 301 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index bd7f0e1d8094..8d801ed378db 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -414,10 +414,9 @@ static inline int split_huge_page(struct page *page)
 {
 	return split_huge_page_to_list_to_order(page, NULL, 0);
 }
+
+extern struct list_lru deferred_split_lru;
 void deferred_split_folio(struct folio *folio, bool partially_mapped);
-#ifdef CONFIG_MEMCG
-void reparent_deferred_split_queue(struct mem_cgroup *memcg);
-#endif
 
 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long address, bool freeze);
@@ -650,7 +649,6 @@ static inline int try_folio_split_to_order(struct folio *folio,
 }
 
 static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
-static inline void reparent_deferred_split_queue(struct mem_cgroup *memcg) {}
 #define split_huge_pmd(__vma, __pmd, __address)	\
 	do { } while (0)
 
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 086158969529..0782c72a1997 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -277,10 +277,6 @@ struct mem_cgroup {
 	struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT];
 #endif
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	struct deferred_split deferred_split_queue;
-#endif
-
 #ifdef CONFIG_LRU_GEN_WALKS_MMU
 	/* per-memcg mm_struct list */
 	struct lru_gen_mm_list mm_list;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 7bd0134c241c..232b7a71fd69 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1429,14 +1429,6 @@ struct zonelist {
  */
 extern struct page *mem_map;
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-struct deferred_split {
-	spinlock_t split_queue_lock;
-	struct list_head split_queue;
-	unsigned long split_queue_len;
-};
-#endif
-
 #ifdef CONFIG_MEMORY_FAILURE
 /*
  * Per NUMA node memory failure handling statistics.
@@ -1562,10 +1554,6 @@ typedef struct pglist_data {
 	unsigned long first_deferred_pfn;
 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	struct deferred_split deferred_split_queue;
-#endif
-
 #ifdef CONFIG_NUMA_BALANCING
 	/* start time in ms of current promote rate limit period */
 	unsigned int nbp_rl_start;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3fc02913b63e..e90d08db219d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -14,6 +14,7 @@
 #include <linux/mmu_notifier.h>
 #include <linux/rmap.h>
 #include <linux/swap.h>
+#include <linux/list_lru.h>
 #include <linux/shrinker.h>
 #include <linux/mm_inline.h>
 #include <linux/swapops.h>
@@ -67,6 +68,8 @@ unsigned long transparent_hugepage_flags __read_mostly =
 	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
 	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
 
+static struct lock_class_key deferred_split_key;
+struct list_lru deferred_split_lru;
 static struct shrinker *deferred_split_shrinker;
 static unsigned long deferred_split_count(struct shrinker *shrink,
 					  struct shrink_control *sc);
@@ -919,6 +922,13 @@ static int __init thp_shrinker_init(void)
 	if (!deferred_split_shrinker)
 		return -ENOMEM;
 
+	if (list_lru_init_memcg_key(&deferred_split_lru,
+				    deferred_split_shrinker,
+				    &deferred_split_key)) {
+		shrinker_free(deferred_split_shrinker);
+		return -ENOMEM;
+	}
+
 	deferred_split_shrinker->count_objects = deferred_split_count;
 	deferred_split_shrinker->scan_objects = deferred_split_scan;
 	shrinker_register(deferred_split_shrinker);
@@ -939,6 +949,7 @@ static int __init thp_shrinker_init(void)
 
 	huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero");
 	if (!huge_zero_folio_shrinker) {
+		list_lru_destroy(&deferred_split_lru);
 		shrinker_free(deferred_split_shrinker);
 		return -ENOMEM;
 	}
@@ -953,6 +964,7 @@ static int __init thp_shrinker_init(void)
 static void __init thp_shrinker_exit(void)
 {
 	shrinker_free(huge_zero_folio_shrinker);
+	list_lru_destroy(&deferred_split_lru);
 	shrinker_free(deferred_split_shrinker);
 }
 
@@ -1133,119 +1145,6 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
 	return pmd;
 }
 
-static struct deferred_split *split_queue_node(int nid)
-{
-	struct pglist_data *pgdata = NODE_DATA(nid);
-
-	return &pgdata->deferred_split_queue;
-}
-
-#ifdef CONFIG_MEMCG
-static inline
-struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
-					   struct deferred_split *queue)
-{
-	if (mem_cgroup_disabled())
-		return NULL;
-	if (split_queue_node(folio_nid(folio)) == queue)
-		return NULL;
-	return container_of(queue, struct mem_cgroup, deferred_split_queue);
-}
-
-static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg)
-{
-	return memcg ? &memcg->deferred_split_queue : split_queue_node(nid);
-}
-#else
-static inline
-struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
-					   struct deferred_split *queue)
-{
-	return NULL;
-}
-
-static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg)
-{
-	return split_queue_node(nid);
-}
-#endif
-
-static struct deferred_split *split_queue_lock(int nid, struct mem_cgroup *memcg)
-{
-	struct deferred_split *queue;
-
-retry:
-	queue = memcg_split_queue(nid, memcg);
-	spin_lock(&queue->split_queue_lock);
-	/*
-	 * There is a period between setting memcg to dying and reparenting
-	 * deferred split queue, and during this period the THPs in the deferred
-	 * split queue will be hidden from the shrinker side.
-	 */
-	if (unlikely(memcg_is_dying(memcg))) {
-		spin_unlock(&queue->split_queue_lock);
-		memcg = parent_mem_cgroup(memcg);
-		goto retry;
-	}
-
-	return queue;
-}
-
-static struct deferred_split *
-split_queue_lock_irqsave(int nid, struct mem_cgroup *memcg, unsigned long *flags)
-{
-	struct deferred_split *queue;
-
-retry:
-	queue = memcg_split_queue(nid, memcg);
-	spin_lock_irqsave(&queue->split_queue_lock, *flags);
-	if (unlikely(memcg_is_dying(memcg))) {
-		spin_unlock_irqrestore(&queue->split_queue_lock, *flags);
-		memcg = parent_mem_cgroup(memcg);
-		goto retry;
-	}
-
-	return queue;
-}
-
-static struct deferred_split *folio_split_queue_lock(struct folio *folio)
-{
-	struct deferred_split *queue;
-
-	rcu_read_lock();
-	queue = split_queue_lock(folio_nid(folio), folio_memcg(folio));
-	/*
-	 * The memcg destruction path is acquiring the split queue lock for
-	 * reparenting. Once you have it locked, it's safe to drop the rcu lock.
-	 */
-	rcu_read_unlock();
-
-	return queue;
-}
-
-static struct deferred_split *
-folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags)
-{
-	struct deferred_split *queue;
-
-	rcu_read_lock();
-	queue = split_queue_lock_irqsave(folio_nid(folio), folio_memcg(folio), flags);
-	rcu_read_unlock();
-
-	return queue;
-}
-
-static inline void split_queue_unlock(struct deferred_split *queue)
-{
-	spin_unlock(&queue->split_queue_lock);
-}
-
-static inline void split_queue_unlock_irqrestore(struct deferred_split *queue,
-						 unsigned long flags)
-{
-	spin_unlock_irqrestore(&queue->split_queue_lock, flags);
-}
-
 static inline bool is_transparent_hugepage(const struct folio *folio)
 {
 	if (!folio_test_large(folio))
@@ -1346,6 +1245,14 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
 		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
 		return NULL;
 	}
+
+	if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {
+		folio_put(folio);
+		count_vm_event(THP_FAULT_FALLBACK);
+		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
+		return NULL;
+	}
+
 	folio_throttle_swaprate(folio, gfp);
 
        /*
@@ -3854,34 +3761,34 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
 	struct folio *end_folio = folio_next(folio);
 	struct folio *new_folio, *next;
 	int old_order = folio_order(folio);
+	struct list_lru_one *l;
+	bool dequeue_deferred;
 	int ret = 0;
-	struct deferred_split *ds_queue;
 
 	VM_WARN_ON_ONCE(!mapping && end);
 	/* Prevent deferred_split_scan() touching ->_refcount */
-	ds_queue = folio_split_queue_lock(folio);
+	dequeue_deferred = folio_test_anon(folio) && old_order > 1;
+	if (dequeue_deferred) {
+		rcu_read_lock();
+		l = list_lru_lock(&deferred_split_lru,
+				  folio_nid(folio), folio_memcg(folio));
+	}
 	if (folio_ref_freeze(folio, folio_cache_ref_count(folio) + 1)) {
 		struct swap_cluster_info *ci = NULL;
 		struct lruvec *lruvec;
 
-		if (old_order > 1) {
-			if (!list_empty(&folio->_deferred_list)) {
-				ds_queue->split_queue_len--;
-				/*
-				 * Reinitialize page_deferred_list after removing the
-				 * page from the split_queue, otherwise a subsequent
-				 * split will see list corruption when checking the
-				 * page_deferred_list.
-				 */
-				list_del_init(&folio->_deferred_list);
-			}
+		if (dequeue_deferred) {
+			__list_lru_del(&deferred_split_lru, l,
+				       &folio->_deferred_list, folio_nid(folio));
 			if (folio_test_partially_mapped(folio)) {
 				folio_clear_partially_mapped(folio);
 				mod_mthp_stat(old_order,
 					MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
 			}
+			list_lru_unlock(l);
+			rcu_read_unlock();
 		}
-		split_queue_unlock(ds_queue);
+
 		if (mapping) {
 			int nr = folio_nr_pages(folio);
 
@@ -3982,7 +3889,10 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
 		if (ci)
 			swap_cluster_unlock(ci);
 	} else {
-		split_queue_unlock(ds_queue);
+		if (dequeue_deferred) {
+			list_lru_unlock(l);
+			rcu_read_unlock();
+		}
 		return -EAGAIN;
 	}
 
@@ -4349,33 +4259,35 @@ int split_folio_to_list(struct folio *folio, struct list_head *list)
  * queueing THP splits, and that list is (racily observed to be) non-empty.
  *
  * It is unsafe to call folio_unqueue_deferred_split() until folio refcount is
- * zero: because even when split_queue_lock is held, a non-empty _deferred_list
- * might be in use on deferred_split_scan()'s unlocked on-stack list.
+ * zero: because even when the list_lru lock is held, a non-empty
+ * _deferred_list might be in use on deferred_split_scan()'s unlocked
+ * on-stack list.
  *
- * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is
- * therefore important to unqueue deferred split before changing folio memcg.
+ * The list_lru sublist is determined by folio's memcg: it is therefore
+ * important to unqueue deferred split before changing folio memcg.
  */
 bool __folio_unqueue_deferred_split(struct folio *folio)
 {
-	struct deferred_split *ds_queue;
+	struct list_lru_one *l;
+	int nid = folio_nid(folio);
 	unsigned long flags;
 	bool unqueued = false;
 
 	WARN_ON_ONCE(folio_ref_count(folio));
 	WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg_charged(folio));
 
-	ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
-	if (!list_empty(&folio->_deferred_list)) {
-		ds_queue->split_queue_len--;
+	rcu_read_lock();
+	l = list_lru_lock_irqsave(&deferred_split_lru, nid, folio_memcg(folio), &flags);
+	if (__list_lru_del(&deferred_split_lru, l, &folio->_deferred_list, nid)) {
 		if (folio_test_partially_mapped(folio)) {
 			folio_clear_partially_mapped(folio);
 			mod_mthp_stat(folio_order(folio),
 				      MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
 		}
-		list_del_init(&folio->_deferred_list);
 		unqueued = true;
 	}
-	split_queue_unlock_irqrestore(ds_queue, flags);
+	list_lru_unlock_irqrestore(l, &flags);
+	rcu_read_unlock();
 
 	return unqueued;	/* useful for debug warnings */
 }
@@ -4383,7 +4295,9 @@ bool __folio_unqueue_deferred_split(struct folio *folio)
 /* partially_mapped=false won't clear PG_partially_mapped folio flag */
 void deferred_split_folio(struct folio *folio, bool partially_mapped)
 {
-	struct deferred_split *ds_queue;
+	struct list_lru_one *l;
+	int nid;
+	struct mem_cgroup *memcg;
 	unsigned long flags;
 
 	/*
@@ -4406,7 +4320,11 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
 	if (folio_test_swapcache(folio))
 		return;
 
-	ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
+	nid = folio_nid(folio);
+
+	rcu_read_lock();
+	memcg = folio_memcg(folio);
+	l = list_lru_lock_irqsave(&deferred_split_lru, nid, memcg, &flags);
 	if (partially_mapped) {
 		if (!folio_test_partially_mapped(folio)) {
 			folio_set_partially_mapped(folio);
@@ -4414,36 +4332,20 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
 				count_vm_event(THP_DEFERRED_SPLIT_PAGE);
 			count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
 			mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, 1);
-
 		}
 	} else {
 		/* partially mapped folios cannot become non-partially mapped */
 		VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
 	}
-	if (list_empty(&folio->_deferred_list)) {
-		struct mem_cgroup *memcg;
-
-		memcg = folio_split_queue_memcg(folio, ds_queue);
-		list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
-		ds_queue->split_queue_len++;
-		if (memcg)
-			set_shrinker_bit(memcg, folio_nid(folio),
-					 shrinker_id(deferred_split_shrinker));
-	}
-	split_queue_unlock_irqrestore(ds_queue, flags);
+	__list_lru_add(&deferred_split_lru, l, &folio->_deferred_list, nid, memcg);
+	list_lru_unlock_irqrestore(l, &flags);
+	rcu_read_unlock();
 }
 
 static unsigned long deferred_split_count(struct shrinker *shrink,
 		struct shrink_control *sc)
 {
-	struct pglist_data *pgdata = NODE_DATA(sc->nid);
-	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
-
-#ifdef CONFIG_MEMCG
-	if (sc->memcg)
-		ds_queue = &sc->memcg->deferred_split_queue;
-#endif
-	return READ_ONCE(ds_queue->split_queue_len);
+	return list_lru_shrink_count(&deferred_split_lru, sc);
 }
 
 static bool thp_underused(struct folio *folio)
@@ -4473,45 +4375,47 @@ static bool thp_underused(struct folio *folio)
 	return false;
 }
 
+static enum lru_status deferred_split_isolate(struct list_head *item,
+					      struct list_lru_one *lru,
+					      void *cb_arg)
+{
+	struct folio *folio = container_of(item, struct folio, _deferred_list);
+	struct list_head *freeable = cb_arg;
+
+	if (folio_try_get(folio)) {
+		list_lru_isolate_move(lru, item, freeable);
+		return LRU_REMOVED;
+	}
+
+	/* We lost race with folio_put() */
+	list_lru_isolate(lru, item);
+	if (folio_test_partially_mapped(folio)) {
+		folio_clear_partially_mapped(folio);
+		mod_mthp_stat(folio_order(folio),
+			      MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
+	}
+	return LRU_REMOVED;
+}
+
 static unsigned long deferred_split_scan(struct shrinker *shrink,
 		struct shrink_control *sc)
 {
-	struct deferred_split *ds_queue;
-	unsigned long flags;
+	LIST_HEAD(dispose);
 	struct folio *folio, *next;
-	int split = 0, i;
-	struct folio_batch fbatch;
+	int split = 0;
+	unsigned long isolated;
 
-	folio_batch_init(&fbatch);
+	isolated = list_lru_shrink_walk_irq(&deferred_split_lru, sc,
+					    deferred_split_isolate, &dispose);
 
-retry:
-	ds_queue = split_queue_lock_irqsave(sc->nid, sc->memcg, &flags);
-	/* Take pin on all head pages to avoid freeing them under us */
-	list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
-							_deferred_list) {
-		if (folio_try_get(folio)) {
-			folio_batch_add(&fbatch, folio);
-		} else if (folio_test_partially_mapped(folio)) {
-			/* We lost race with folio_put() */
-			folio_clear_partially_mapped(folio);
-			mod_mthp_stat(folio_order(folio),
-				      MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
-		}
-		list_del_init(&folio->_deferred_list);
-		ds_queue->split_queue_len--;
-		if (!--sc->nr_to_scan)
-			break;
-		if (!folio_batch_space(&fbatch))
-			break;
-	}
-	split_queue_unlock_irqrestore(ds_queue, flags);
-
-	for (i = 0; i < folio_batch_count(&fbatch); i++) {
+	list_for_each_entry_safe(folio, next, &dispose, _deferred_list) {
 		bool did_split = false;
 		bool underused = false;
-		struct deferred_split *fqueue;
+		struct list_lru_one *l;
+		unsigned long flags;
+
+		list_del_init(&folio->_deferred_list);
 
-		folio = fbatch.folios[i];
 		if (!folio_test_partially_mapped(folio)) {
 			/*
 			 * See try_to_map_unused_to_zeropage(): we cannot
@@ -4534,64 +4438,32 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
 		}
 		folio_unlock(folio);
 next:
-		if (did_split || !folio_test_partially_mapped(folio))
-			continue;
 		/*
 		 * Only add back to the queue if folio is partially mapped.
 		 * If thp_underused returns false, or if split_folio fails
 		 * in the case it was underused, then consider it used and
 		 * don't add it back to split_queue.
 		 */
-		fqueue = folio_split_queue_lock_irqsave(folio, &flags);
-		if (list_empty(&folio->_deferred_list)) {
-			list_add_tail(&folio->_deferred_list, &fqueue->split_queue);
-			fqueue->split_queue_len++;
+		if (!did_split && folio_test_partially_mapped(folio)) {
+			rcu_read_lock();
+			l = list_lru_lock_irqsave(&deferred_split_lru,
+						  folio_nid(folio),
+						  folio_memcg(folio),
+						  &flags);
+			__list_lru_add(&deferred_split_lru, l,
+				       &folio->_deferred_list,
+				       folio_nid(folio), folio_memcg(folio));
+			list_lru_unlock_irqrestore(l, &flags);
+			rcu_read_unlock();
 		}
-		split_queue_unlock_irqrestore(fqueue, flags);
-	}
-	folios_put(&fbatch);
-
-	if (sc->nr_to_scan && !list_empty(&ds_queue->split_queue)) {
-		cond_resched();
-		goto retry;
+		folio_put(folio);
 	}
 
-	/*
-	 * Stop shrinker if we didn't split any page, but the queue is empty.
-	 * This can happen if pages were freed under us.
-	 */
-	if (!split && list_empty(&ds_queue->split_queue))
+	if (!split && !isolated)
 		return SHRINK_STOP;
 	return split;
 }
 
-#ifdef CONFIG_MEMCG
-void reparent_deferred_split_queue(struct mem_cgroup *memcg)
-{
-	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
-	struct deferred_split *ds_queue = &memcg->deferred_split_queue;
-	struct deferred_split *parent_ds_queue = &parent->deferred_split_queue;
-	int nid;
-
-	spin_lock_irq(&ds_queue->split_queue_lock);
-	spin_lock_nested(&parent_ds_queue->split_queue_lock, SINGLE_DEPTH_NESTING);
-
-	if (!ds_queue->split_queue_len)
-		goto unlock;
-
-	list_splice_tail_init(&ds_queue->split_queue, &parent_ds_queue->split_queue);
-	parent_ds_queue->split_queue_len += ds_queue->split_queue_len;
-	ds_queue->split_queue_len = 0;
-
-	for_each_node(nid)
-		set_shrinker_bit(parent, nid, shrinker_id(deferred_split_shrinker));
-
-unlock:
-	spin_unlock(&parent_ds_queue->split_queue_lock);
-	spin_unlock_irq(&ds_queue->split_queue_lock);
-}
-#endif
-
 #ifdef CONFIG_DEBUG_FS
 static void split_huge_pages_all(void)
 {
diff --git a/mm/internal.h b/mm/internal.h
index f98f4746ac41..d8c737338df5 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -863,7 +863,7 @@ static inline bool folio_unqueue_deferred_split(struct folio *folio)
 	/*
 	 * At this point, there is no one trying to add the folio to
 	 * deferred_list. If folio is not in deferred_list, it's safe
-	 * to check without acquiring the split_queue_lock.
+	 * to check without acquiring the list_lru lock.
 	 */
 	if (data_race(list_empty(&folio->_deferred_list)))
 		return false;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 4b0e59c7c0e6..b2ac28ddd480 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1081,6 +1081,7 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
 	}
 
 	count_vm_event(THP_COLLAPSE_ALLOC);
+
 	if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
 		folio_put(folio);
 		*foliop = NULL;
@@ -1089,6 +1090,12 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
 
 	count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1);
 
+	if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {
+		folio_put(folio);
+		*foliop = NULL;
+		return SCAN_CGROUP_CHARGE_FAIL;
+	}
+
 	*foliop = folio;
 	return SCAN_SUCCEED;
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a47fb68dd65f..f381cb6bdff1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4015,11 +4015,6 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
 	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
 		memcg->cgwb_frn[i].done =
 			__WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
-#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
-	INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
-	memcg->deferred_split_queue.split_queue_len = 0;
 #endif
 	lru_gen_init_memcg(memcg);
 	return memcg;
@@ -4167,11 +4162,10 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 	zswap_memcg_offline_cleanup(memcg);
 
 	memcg_offline_kmem(memcg);
-	reparent_deferred_split_queue(memcg);
 	/*
-	 * The reparenting of objcg must be after the reparenting of the
-	 * list_lru and deferred_split_queue above, which ensures that they will
-	 * not mistakenly get the parent list_lru and deferred_split_queue.
+	 * The reparenting of objcg must be after the reparenting of
+	 * the list_lru in memcg_offline_kmem(), which ensures that
+	 * they will not mistakenly get the parent list_lru.
 	 */
 	memcg_reparent_objcgs(memcg);
 	reparent_shrinker_deferred(memcg);
diff --git a/mm/memory.c b/mm/memory.c
index 219b9bf6cae0..e68ceb4aa624 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4651,13 +4651,19 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
 	while (orders) {
 		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
 		folio = vma_alloc_folio(gfp, order, vma, addr);
-		if (folio) {
-			if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
-							    gfp, entry))
-				return folio;
+		if (!folio)
+			goto next;
+		if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, gfp, entry)) {
 			count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
 			folio_put(folio);
+			goto next;
 		}
+		if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {
+			folio_put(folio);
+			goto fallback;
+		}
+		return folio;
+next:
 		count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK);
 		order = next_order(&orders, order);
 	}
@@ -5169,24 +5175,28 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
 	while (orders) {
 		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
 		folio = vma_alloc_folio(gfp, order, vma, addr);
-		if (folio) {
-			if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
-				count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
-				folio_put(folio);
-				goto next;
-			}
-			folio_throttle_swaprate(folio, gfp);
-			/*
-			 * When a folio is not zeroed during allocation
-			 * (__GFP_ZERO not used) or user folios require special
-			 * handling, folio_zero_user() is used to make sure
-			 * that the page corresponding to the faulting address
-			 * will be hot in the cache after zeroing.
-			 */
-			if (user_alloc_needs_zeroing())
-				folio_zero_user(folio, vmf->address);
-			return folio;
+		if (!folio)
+			goto next;
+		if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
+			count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
+			folio_put(folio);
+			goto next;
 		}
+		if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {
+			folio_put(folio);
+			goto fallback;
+		}
+		folio_throttle_swaprate(folio, gfp);
+		/*
+		 * When a folio is not zeroed during allocation
+		 * (__GFP_ZERO not used) or user folios require special
+		 * handling, folio_zero_user() is used to make sure
+		 * that the page corresponding to the faulting address
+		 * will be hot in the cache after zeroing.
+		 */
+		if (user_alloc_needs_zeroing())
+			folio_zero_user(folio, vmf->address);
+		return folio;
 next:
 		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
 		order = next_order(&orders, order);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index cec7bb758bdd..f293a62e652a 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1388,19 +1388,6 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat,
 	pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
 }
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void pgdat_init_split_queue(struct pglist_data *pgdat)
-{
-	struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
-
-	spin_lock_init(&ds_queue->split_queue_lock);
-	INIT_LIST_HEAD(&ds_queue->split_queue);
-	ds_queue->split_queue_len = 0;
-}
-#else
-static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
-#endif
-
 #ifdef CONFIG_COMPACTION
 static void pgdat_init_kcompactd(struct pglist_data *pgdat)
 {
@@ -1416,8 +1403,6 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
 
 	pgdat_resize_init(pgdat);
 	pgdat_kswapd_lock_init(pgdat);
-
-	pgdat_init_split_queue(pgdat);
 	pgdat_init_kcompactd(pgdat);
 
 	init_waitqueue_head(&pgdat->kswapd_wait);
-- 
2.53.0
Re: [PATCH v3 7/7] mm: switch deferred split shrinker to list_lru - [s390] panic in __memcg_list_lru_alloc
Posted by Mikhail Zaslonko 1 week ago

On 18-Mar-26 20:53, Johannes Weiner wrote:
> The deferred split queue handles cgroups in a suboptimal fashion. 
> The queue is per-NUMA node or per-cgroup, not the intersection. That 
> means on a cgrouped system, a node-restricted allocation entering 
> reclaim can end up splitting large pages on other nodes:
> 
> alloc/unmap deferred_split_folio() list_add_tail(memcg-
> >split_queue) set_shrinker_bit(memcg, node, deferred_shrinker_id)
> 
> for_each_zone_zonelist_nodemask(restricted_nodes) mem_cgroup_iter() 
> shrink_slab(node, memcg) shrink_slab_memcg(node, memcg) if 
> test_shrinker_bit(memcg, node, deferred_shrinker_id) 
> deferred_split_scan() walks memcg->split_queue
> 
> The shrinker bit adds an imperfect guard rail. As soon as the
> cgroup has a single large page on the node of interest, all large
> pages owned by that memcg, including those on other nodes, will be
> split.
> 
> list_lru properly sets up per-node, per-cgroup lists. As a bonus,
> it streamlines a lot of the list operations and reclaim walks. It's 
> used widely by other major shrinkers already. Convert the deferred 
> split queue as well.
> 
> The list_lru per-memcg heads are instantiated on demand when the 
> first object of interest is allocated for a cgroup, by calling 
> folio_memcg_list_lru_alloc(). Add calls to where splittable pages 
> are created: anon faults, swapin faults, khugepaged collapse.
> 
> These calls create all possible node heads for the cgroup at once, 
> so the migration code (between nodes) doesn't need any special care.
> 
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org> --- include/ 
> linux/huge_mm.h    |   6 +- include/linux/memcontrol.h |   4 - 
> include/linux/mmzone.h     |  12 -- mm/huge_memory.c           | 342 
> ++++++++++++------------------------- mm/internal.h              | 2
> +- mm/khugepaged.c            |   7 + mm/memcontrol.c |  12 +- mm/
> memory.c                |  52 +++--- mm/ mm_init.c               |
> 15 -- 9 files changed, 151 insertions(+), 301 deletions(-)
> 

Hi,

with this series in linux-next (since next-20260324) I see a reproducible panic on s390 in the
dump kernel when running NVMe standalone dump (ngdump).
This only happens in the 'capture kernel', normal boot of the same kernel works fine.

[   14.350676] Unable to handle kernel pointer dereference in virtual kernel address space
[   14.350682] Failing address: 4000000000000000 TEID: 4000000000000803 ESOP-2 FSI
[   14.350686] Fault in home space mode while using kernel ASCE.
[   14.350689] AS:0000000002798007 R3:000000002d2c4007 S:000000002d2c3001 P:000000000000013d 
[   14.350730] Oops: 0038 ilc:3 [#1]SMP 
[   14.350735] Modules linked in: dm_service_time zfcp scsi_transport_fc uvdevice diag288_wdt nvme prng aes_s390 nvme_core des_s390 libdes zcrypt_cex4 dm_mirror dm_region_hash dm_log scsi_dh_rdac scsi_dh_emc scsi_dh_alua paes_s390 crypto_engine pkey_cca pkey_ep11 zcrypt rng_core pkey_pckmo pkey dm_multipath autofs4
[   14.350760] CPU: 0 UID: 0 PID: 32 Comm: khugepaged Not tainted 7.0.0-rc5-next-20260324 
[   14.350762] Hardware name: IBM 3931 A01 704 (LPAR)
[   14.350764] Krnl PSW : 0704d00180000000 000003ffe0443a82 (__memcg_list_lru_alloc+0x52/0x1d0)
[   14.350774]            R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:1 PM:0 RI:0 EA:3
[   14.350776] Krnl GPRS: 0000000000000402 00000000000bece0 0000000000000000 000003ffe1c17928
[   14.350778]            00000000001c24ca 0000000000000000 0000000000000000 000003ffe1c17948
[   14.350780]            0000000000000000 00000000000824c0 0000037200098000 4000000000000000
[   14.350782]            0000000000782400 0000000000000001 0000037fe00f39b8 0000037fe00f3918
[   14.350788] Krnl Code: 000003ffe0443a72: a7690000            lghi    %r6,0
[   14.350788]            000003ffe0443a76: e380f0a00004        lg      %r8,160(%r15)
[   14.350788]           *000003ffe0443a7c: e3b080b80004        lg      %r11,184(%r8)
[   14.350788]           >000003ffe0443a82: e330b9400012        lt      %r3,2368(%r11)
[   14.350788]            000003ffe0443a88: a7a40065            brc     10,000003ffe0443b52
[   14.350788]            000003ffe0443a8c: e3b0f0a00004        lg      %r11,160(%r15)
[   14.350788]            000003ffe0443a92: ec68006f007c        cgij    %r6,0,8,000003ffe0443b70
[   14.350788]            000003ffe0443a98: e300b9400014        lgf     %r0,2368(%r11)
[   14.350825] Call Trace:
[   14.350826]  [<000003ffe0443a82>] __memcg_list_lru_alloc+0x52/0x1d0 
[   14.350831]  [<000003ffe044529a>] folio_memcg_list_lru_alloc+0xba/0x150 
[   14.350834]  [<000003ffe04f279a>] alloc_charge_folio+0x18a/0x250 
[   14.350839]  [<000003ffe04f34dc>] collapse_huge_page+0x8c/0x890 
[   14.350841]  [<000003ffe04f4222>] collapse_scan_pmd+0x542/0x690 
[   14.350844]  [<000003ffe04f65b4>] collapse_single_pmd+0x144/0x240 
[   14.350847]  [<000003ffe04f69ce>] collapse_scan_mm_slot.constprop.0+0x31e/0x480 
[   14.350849]  [<000003ffe04f6d3c>] khugepaged+0x20c/0x210 
[   14.350852]  [<000003ffe019b0a8>] kthread+0x148/0x170 
[   14.350856]  [<000003ffe0119fec>] __ret_from_fork+0x3c/0x240 
[   14.350860]  [<000003ffe0ffa4b2>] ret_from_fork+0xa/0x30 
[   14.350865] Last Breaking-Event-Address:
[   14.350865]  [<000003ffe0445294>] folio_memcg_list_lru_alloc+0xb4/0x150
[   14.350870] Kernel panic - not syncing: Fatal exception: panic_on_oops


Environment:
Arch: s390x (IBM LPAR)
Kernel: next-20260324
Config: (can provide if needed)
Reproducible: always
Steps to Reproduce: 
  Install ngdump to an NVMe device partition via 'zipl -d' and initiate a dump (same issue with DASD ldipl-dump).

I have bisected to this specific commit. 
  good: 230bbdc110b3 ("mm: list_lru: introduce folio_memcg_list_lru_alloc()")
  bad:  b0f512f6e36c ("mm: switch deferred split shrinker to list_lru")
Reverting it on top of linux-next <next-20260327> restores normal Standalone Dump operation.
 
Let me know if I can provide any other data.

Thanks,
Mikhail
Re: [PATCH v3 7/7] mm: switch deferred split shrinker to list_lru - [s390] panic in __memcg_list_lru_alloc
Posted by Johannes Weiner 6 days, 20 hours ago
Hello Mikhail,

On Mon, Mar 30, 2026 at 06:37:01PM +0200, Mikhail Zaslonko wrote:
> with this series in linux-next (since next-20260324) I see a reproducible panic on s390 in the
> dump kernel when running NVMe standalone dump (ngdump).
> This only happens in the 'capture kernel', normal boot of the same kernel works fine.
> 
> [   14.350676] Unable to handle kernel pointer dereference in virtual kernel address space
> [   14.350682] Failing address: 4000000000000000 TEID: 4000000000000803 ESOP-2 FSI
> [   14.350686] Fault in home space mode while using kernel ASCE.
> [   14.350689] AS:0000000002798007 R3:000000002d2c4007 S:000000002d2c3001 P:000000000000013d 
> [   14.350730] Oops: 0038 ilc:3 [#1]SMP 
> [   14.350735] Modules linked in: dm_service_time zfcp scsi_transport_fc uvdevice diag288_wdt nvme prng aes_s390 nvme_core des_s390 libdes zcrypt_cex4 dm_mirror dm_region_hash dm_log scsi_dh_rdac scsi_dh_emc scsi_dh_alua paes_s390 crypto_engine pkey_cca pkey_ep11 zcrypt rng_core pkey_pckmo pkey dm_multipath autofs4
> [   14.350760] CPU: 0 UID: 0 PID: 32 Comm: khugepaged Not tainted 7.0.0-rc5-next-20260324 
> [   14.350762] Hardware name: IBM 3931 A01 704 (LPAR)
> [   14.350764] Krnl PSW : 0704d00180000000 000003ffe0443a82 (__memcg_list_lru_alloc+0x52/0x1d0)
> [   14.350774]            R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:1 PM:0 RI:0 EA:3
> [   14.350776] Krnl GPRS: 0000000000000402 00000000000bece0 0000000000000000 000003ffe1c17928
> [   14.350778]            00000000001c24ca 0000000000000000 0000000000000000 000003ffe1c17948
> [   14.350780]            0000000000000000 00000000000824c0 0000037200098000 4000000000000000
> [   14.350782]            0000000000782400 0000000000000001 0000037fe00f39b8 0000037fe00f3918
> [   14.350788] Krnl Code: 000003ffe0443a72: a7690000            lghi    %r6,0
> [   14.350788]            000003ffe0443a76: e380f0a00004        lg      %r8,160(%r15)
> [   14.350788]           *000003ffe0443a7c: e3b080b80004        lg      %r11,184(%r8)
> [   14.350788]           >000003ffe0443a82: e330b9400012        lt      %r3,2368(%r11)
> [   14.350788]            000003ffe0443a88: a7a40065            brc     10,000003ffe0443b52
> [   14.350788]            000003ffe0443a8c: e3b0f0a00004        lg      %r11,160(%r15)
> [   14.350788]            000003ffe0443a92: ec68006f007c        cgij    %r6,0,8,000003ffe0443b70
> [   14.350788]            000003ffe0443a98: e300b9400014        lgf     %r0,2368(%r11)
> [   14.350825] Call Trace:
> [   14.350826]  [<000003ffe0443a82>] __memcg_list_lru_alloc+0x52/0x1d0 
> [   14.350831]  [<000003ffe044529a>] folio_memcg_list_lru_alloc+0xba/0x150 
> [   14.350834]  [<000003ffe04f279a>] alloc_charge_folio+0x18a/0x250 
> [   14.350839]  [<000003ffe04f34dc>] collapse_huge_page+0x8c/0x890 
> [   14.350841]  [<000003ffe04f4222>] collapse_scan_pmd+0x542/0x690 
> [   14.350844]  [<000003ffe04f65b4>] collapse_single_pmd+0x144/0x240 
> [   14.350847]  [<000003ffe04f69ce>] collapse_scan_mm_slot.constprop.0+0x31e/0x480 
> [   14.350849]  [<000003ffe04f6d3c>] khugepaged+0x20c/0x210 
> [   14.350852]  [<000003ffe019b0a8>] kthread+0x148/0x170 
> [   14.350856]  [<000003ffe0119fec>] __ret_from_fork+0x3c/0x240 
> [   14.350860]  [<000003ffe0ffa4b2>] ret_from_fork+0xa/0x30 
> [   14.350865] Last Breaking-Event-Address:
> [   14.350865]  [<000003ffe0445294>] folio_memcg_list_lru_alloc+0xb4/0x150
> [   14.350870] Kernel panic - not syncing: Fatal exception: panic_on_oops

Thanks for the report. Would you be able to correlate the function
offset in __memcg_list_lru_alloc() to C code? It would be very useful
to understand *which* pointer or data structure it's tripping over.

thp_shrinker_init() is called before start_stop_khugepaged(), so I am
not sure how we could have invalid/uninitialized list_lru structures.

Could you extract the instruction stream sequence surrounding the
crash? I don't see a Code: line, but maybe you can correlate it with
objdump? And then correlate it with C code (e.g. make mm/list_lru.lst)

That would be super helpful. Sorry about the breakage.
Re: [PATCH v3 7/7] mm: switch deferred split shrinker to list_lru - [s390] panic in __memcg_list_lru_alloc
Posted by Johannes Weiner 6 days, 19 hours ago
On Mon, Mar 30, 2026 at 04:41:16PM -0400, Johannes Weiner wrote:
> Hello Mikhail,
> 
> On Mon, Mar 30, 2026 at 06:37:01PM +0200, Mikhail Zaslonko wrote:
> > with this series in linux-next (since next-20260324) I see a reproducible panic on s390 in the
> > dump kernel when running NVMe standalone dump (ngdump).
> > This only happens in the 'capture kernel', normal boot of the same kernel works fine.
> > 
> > [   14.350676] Unable to handle kernel pointer dereference in virtual kernel address space
> > [   14.350682] Failing address: 4000000000000000 TEID: 4000000000000803 ESOP-2 FSI
> > [   14.350686] Fault in home space mode while using kernel ASCE.
> > [   14.350689] AS:0000000002798007 R3:000000002d2c4007 S:000000002d2c3001 P:000000000000013d 
> > [   14.350730] Oops: 0038 ilc:3 [#1]SMP 
> > [   14.350735] Modules linked in: dm_service_time zfcp scsi_transport_fc uvdevice diag288_wdt nvme prng aes_s390 nvme_core des_s390 libdes zcrypt_cex4 dm_mirror dm_region_hash dm_log scsi_dh_rdac scsi_dh_emc scsi_dh_alua paes_s390 crypto_engine pkey_cca pkey_ep11 zcrypt rng_core pkey_pckmo pkey dm_multipath autofs4
> > [   14.350760] CPU: 0 UID: 0 PID: 32 Comm: khugepaged Not tainted 7.0.0-rc5-next-20260324 
> > [   14.350762] Hardware name: IBM 3931 A01 704 (LPAR)
> > [   14.350764] Krnl PSW : 0704d00180000000 000003ffe0443a82 (__memcg_list_lru_alloc+0x52/0x1d0)
> > [   14.350774]            R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:1 PM:0 RI:0 EA:3
> > [   14.350776] Krnl GPRS: 0000000000000402 00000000000bece0 0000000000000000 000003ffe1c17928
> > [   14.350778]            00000000001c24ca 0000000000000000 0000000000000000 000003ffe1c17948
> > [   14.350780]            0000000000000000 00000000000824c0 0000037200098000 4000000000000000
> > [   14.350782]            0000000000782400 0000000000000001 0000037fe00f39b8 0000037fe00f3918
> > [   14.350788] Krnl Code: 000003ffe0443a72: a7690000            lghi    %r6,0
> > [   14.350788]            000003ffe0443a76: e380f0a00004        lg      %r8,160(%r15)
> > [   14.350788]           *000003ffe0443a7c: e3b080b80004        lg      %r11,184(%r8)
> > [   14.350788]           >000003ffe0443a82: e330b9400012        lt      %r3,2368(%r11)
> > [   14.350788]            000003ffe0443a88: a7a40065            brc     10,000003ffe0443b52
> > [   14.350788]            000003ffe0443a8c: e3b0f0a00004        lg      %r11,160(%r15)
> > [   14.350788]            000003ffe0443a92: ec68006f007c        cgij    %r6,0,8,000003ffe0443b70
> > [   14.350788]            000003ffe0443a98: e300b9400014        lgf     %r0,2368(%r11)
> > [   14.350825] Call Trace:
> > [   14.350826]  [<000003ffe0443a82>] __memcg_list_lru_alloc+0x52/0x1d0 
> > [   14.350831]  [<000003ffe044529a>] folio_memcg_list_lru_alloc+0xba/0x150 
> > [   14.350834]  [<000003ffe04f279a>] alloc_charge_folio+0x18a/0x250 
> > [   14.350839]  [<000003ffe04f34dc>] collapse_huge_page+0x8c/0x890 
> > [   14.350841]  [<000003ffe04f4222>] collapse_scan_pmd+0x542/0x690 
> > [   14.350844]  [<000003ffe04f65b4>] collapse_single_pmd+0x144/0x240 
> > [   14.350847]  [<000003ffe04f69ce>] collapse_scan_mm_slot.constprop.0+0x31e/0x480 
> > [   14.350849]  [<000003ffe04f6d3c>] khugepaged+0x20c/0x210 
> > [   14.350852]  [<000003ffe019b0a8>] kthread+0x148/0x170 
> > [   14.350856]  [<000003ffe0119fec>] __ret_from_fork+0x3c/0x240 
> > [   14.350860]  [<000003ffe0ffa4b2>] ret_from_fork+0xa/0x30 
> > [   14.350865] Last Breaking-Event-Address:
> > [   14.350865]  [<000003ffe0445294>] folio_memcg_list_lru_alloc+0xb4/0x150
> > [   14.350870] Kernel panic - not syncing: Fatal exception: panic_on_oops

Can you verify whether the kdump kernel boots with
cgroup_disable=memory?

I think there is an issue with how we call __list_lru_init(). The
existing callsites had their own memcg_kmem_online() guards. But the
THP one does not, so we're creating a memcg-aware list_lru, but the
do-while hierarchy walk in __memcg_list_lru_alloc() runs into a NULL
memcg.

Can you try the below on top of that -next checkout?

diff --git a/mm/list_lru.c b/mm/list_lru.c
index 1ccdd45b1d14..7c7024e33653 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -637,7 +637,7 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct shrinker *shr
 	else
 		lru->shrinker_id = -1;
 
-	if (mem_cgroup_kmem_disabled())
+	if (mem_cgroup_disabled() || mem_cgroup_kmem_disabled())
 		memcg_aware = false;
 #endif
Re: [PATCH v3 7/7] mm: switch deferred split shrinker to list_lru - [s390] panic in __memcg_list_lru_alloc
Posted by Mikhail Zaslonko 6 days, 8 hours ago

On 30-Mar-26 22:56, Johannes Weiner wrote:
> On Mon, Mar 30, 2026 at 04:41:16PM -0400, Johannes Weiner wrote:
>> Hello Mikhail,
>>

> 
> Can you verify whether the kdump kernel boots with
> cgroup_disable=memory?

It was not a kdump, but an s390-specific dump tool. Yes, here is the cmdline (I forgot to include it):
kernel parmline...: 'reset_devices cgroup_disable=memory nokaslr numa=off irqpoll nr_cpus=1'
> 
> I think there is an issue with how we call __list_lru_init(). The
> existing callsites had their own memcg_kmem_online() guards. But the
> THP one does not, so we're creating a memcg-aware list_lru, but the
> do-while hierarchy walk in __memcg_list_lru_alloc() runs into a NULL
> memcg.
> 
> Can you try the below on top of that -next checkout?
> 
> diff --git a/mm/list_lru.c b/mm/list_lru.c
> index 1ccdd45b1d14..7c7024e33653 100644
> --- a/mm/list_lru.c
> +++ b/mm/list_lru.c
> @@ -637,7 +637,7 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct shrinker *shr
>  	else
>  		lru->shrinker_id = -1;
>  
> -	if (mem_cgroup_kmem_disabled())
> +	if (mem_cgroup_disabled() || mem_cgroup_kmem_disabled())
>  		memcg_aware = false;
>  #endif
> 

I confirm, this resolves the issue.

Thanks.
Re: [PATCH v3 7/7] mm: switch deferred split shrinker to list_lru - [s390] panic in __memcg_list_lru_alloc
Posted by Vasily Gorbik 6 days, 18 hours ago
On Mon, Mar 30, 2026 at 04:56:56PM -0400, Johannes Weiner wrote:
> On Mon, Mar 30, 2026 at 04:41:16PM -0400, Johannes Weiner wrote:
> > Hello Mikhail,
> > 
> > On Mon, Mar 30, 2026 at 06:37:01PM +0200, Mikhail Zaslonko wrote:
> > > with this series in linux-next (since next-20260324) I see a reproducible panic on s390 in the
> > > dump kernel when running NVMe standalone dump (ngdump).
> > > This only happens in the 'capture kernel', normal boot of the same kernel works fine.
> 
> Can you verify whether the kdump kernel boots with
> cgroup_disable=memory?

Yes, that is the only special thing about that dump kernel.
Sorry for not including the command line. x86 crashes more or less the
same way with cgroup_disable=memory. But while I was doing the repro on
x86, you already found the root cause.

> Can you try the below on top of that -next checkout?
> 
> diff --git a/mm/list_lru.c b/mm/list_lru.c
> index 1ccdd45b1d14..7c7024e33653 100644
> --- a/mm/list_lru.c
> +++ b/mm/list_lru.c
> @@ -637,7 +637,7 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct shrinker *shr
>  	else
>  		lru->shrinker_id = -1;
>  
> -	if (mem_cgroup_kmem_disabled())
> +	if (mem_cgroup_disabled() || mem_cgroup_kmem_disabled())
>  		memcg_aware = false;
>  #endif

Yes, that fixes it for me on s390 and x86. Thank you!
Re: [PATCH v3 7/7] mm: switch deferred split shrinker to list_lru - [s390] panic in __memcg_list_lru_alloc
Posted by Andrew Morton 6 days, 21 hours ago
On Mon, 30 Mar 2026 18:37:01 +0200 Mikhail Zaslonko <zaslonko@linux.ibm.com> wrote:

> with this series in linux-next (since next-20260324) I see a reproducible panic on s390 in the
> dump kernel when running NVMe standalone dump (ngdump).
> This only happens in the 'capture kernel', normal boot of the same kernel works fine.

Thanks.  It seems that hannes is working on a significant respin, so
I'll remove this version of the series from mm.git/linux-next.
Re: [PATCH v3 7/7] mm: switch deferred split shrinker to list_lru
Posted by Kairui Song 1 week, 3 days ago
On Thu, Mar 19, 2026 at 4:05 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> The deferred split queue handles cgroups in a suboptimal fashion. The
> queue is per-NUMA node or per-cgroup, not the intersection. That means
> on a cgrouped system, a node-restricted allocation entering reclaim
> can end up splitting large pages on other nodes:
>
>         alloc/unmap
>           deferred_split_folio()
>             list_add_tail(memcg->split_queue)
>             set_shrinker_bit(memcg, node, deferred_shrinker_id)
>
>         for_each_zone_zonelist_nodemask(restricted_nodes)
>           mem_cgroup_iter()
>             shrink_slab(node, memcg)
>               shrink_slab_memcg(node, memcg)
>                 if test_shrinker_bit(memcg, node, deferred_shrinker_id)
>                   deferred_split_scan()
>                     walks memcg->split_queue
>
> The shrinker bit adds an imperfect guard rail. As soon as the cgroup
> has a single large page on the node of interest, all large pages owned
> by that memcg, including those on other nodes, will be split.
>
> list_lru properly sets up per-node, per-cgroup lists. As a bonus, it
> streamlines a lot of the list operations and reclaim walks. It's used
> widely by other major shrinkers already. Convert the deferred split
> queue as well.
>
> The list_lru per-memcg heads are instantiated on demand when the first
> object of interest is allocated for a cgroup, by calling
> folio_memcg_list_lru_alloc(). Add calls to where splittable pages are
> created: anon faults, swapin faults, khugepaged collapse.
>
> These calls create all possible node heads for the cgroup at once, so
> the migration code (between nodes) doesn't need any special care.
>
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> ---
>  include/linux/huge_mm.h    |   6 +-
>  include/linux/memcontrol.h |   4 -
>  include/linux/mmzone.h     |  12 --
>  mm/huge_memory.c           | 342 ++++++++++++-------------------------
>  mm/internal.h              |   2 +-
>  mm/khugepaged.c            |   7 +
>  mm/memcontrol.c            |  12 +-
>  mm/memory.c                |  52 +++---
>  mm/mm_init.c               |  15 --
>  9 files changed, 151 insertions(+), 301 deletions(-)
>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index bd7f0e1d8094..8d801ed378db 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -414,10 +414,9 @@ static inline int split_huge_page(struct page *page)
>  {
>         return split_huge_page_to_list_to_order(page, NULL, 0);
>  }
> +
> +extern struct list_lru deferred_split_lru;
>  void deferred_split_folio(struct folio *folio, bool partially_mapped);
> -#ifdef CONFIG_MEMCG
> -void reparent_deferred_split_queue(struct mem_cgroup *memcg);
> -#endif
>
>  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
>                 unsigned long address, bool freeze);
> @@ -650,7 +649,6 @@ static inline int try_folio_split_to_order(struct folio *folio,
>  }
>
>  static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
> -static inline void reparent_deferred_split_queue(struct mem_cgroup *memcg) {}
>  #define split_huge_pmd(__vma, __pmd, __address)        \
>         do { } while (0)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 086158969529..0782c72a1997 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -277,10 +277,6 @@ struct mem_cgroup {
>         struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT];
>  #endif
>
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -       struct deferred_split deferred_split_queue;
> -#endif
> -
>  #ifdef CONFIG_LRU_GEN_WALKS_MMU
>         /* per-memcg mm_struct list */
>         struct lru_gen_mm_list mm_list;
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 7bd0134c241c..232b7a71fd69 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -1429,14 +1429,6 @@ struct zonelist {
>   */
>  extern struct page *mem_map;
>
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -struct deferred_split {
> -       spinlock_t split_queue_lock;
> -       struct list_head split_queue;
> -       unsigned long split_queue_len;
> -};
> -#endif
> -
>  #ifdef CONFIG_MEMORY_FAILURE
>  /*
>   * Per NUMA node memory failure handling statistics.
> @@ -1562,10 +1554,6 @@ typedef struct pglist_data {
>         unsigned long first_deferred_pfn;
>  #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
>
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -       struct deferred_split deferred_split_queue;
> -#endif
> -
>  #ifdef CONFIG_NUMA_BALANCING
>         /* start time in ms of current promote rate limit period */
>         unsigned int nbp_rl_start;
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 3fc02913b63e..e90d08db219d 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -14,6 +14,7 @@
>  #include <linux/mmu_notifier.h>
>  #include <linux/rmap.h>
>  #include <linux/swap.h>
> +#include <linux/list_lru.h>
>  #include <linux/shrinker.h>
>  #include <linux/mm_inline.h>
>  #include <linux/swapops.h>
> @@ -67,6 +68,8 @@ unsigned long transparent_hugepage_flags __read_mostly =
>         (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
>         (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
>
> +static struct lock_class_key deferred_split_key;
> +struct list_lru deferred_split_lru;
>  static struct shrinker *deferred_split_shrinker;
>  static unsigned long deferred_split_count(struct shrinker *shrink,
>                                           struct shrink_control *sc);
> @@ -919,6 +922,13 @@ static int __init thp_shrinker_init(void)
>         if (!deferred_split_shrinker)
>                 return -ENOMEM;
>
> +       if (list_lru_init_memcg_key(&deferred_split_lru,
> +                                   deferred_split_shrinker,
> +                                   &deferred_split_key)) {
> +               shrinker_free(deferred_split_shrinker);
> +               return -ENOMEM;
> +       }
> +
>         deferred_split_shrinker->count_objects = deferred_split_count;
>         deferred_split_shrinker->scan_objects = deferred_split_scan;
>         shrinker_register(deferred_split_shrinker);
> @@ -939,6 +949,7 @@ static int __init thp_shrinker_init(void)
>
>         huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero");
>         if (!huge_zero_folio_shrinker) {
> +               list_lru_destroy(&deferred_split_lru);
>                 shrinker_free(deferred_split_shrinker);
>                 return -ENOMEM;
>         }
> @@ -953,6 +964,7 @@ static int __init thp_shrinker_init(void)
>  static void __init thp_shrinker_exit(void)
>  {
>         shrinker_free(huge_zero_folio_shrinker);
> +       list_lru_destroy(&deferred_split_lru);
>         shrinker_free(deferred_split_shrinker);
>  }
>
> @@ -1133,119 +1145,6 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
>         return pmd;
>  }
>
> -static struct deferred_split *split_queue_node(int nid)
> -{
> -       struct pglist_data *pgdata = NODE_DATA(nid);
> -
> -       return &pgdata->deferred_split_queue;
> -}
> -
> -#ifdef CONFIG_MEMCG
> -static inline
> -struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
> -                                          struct deferred_split *queue)
> -{
> -       if (mem_cgroup_disabled())
> -               return NULL;
> -       if (split_queue_node(folio_nid(folio)) == queue)
> -               return NULL;
> -       return container_of(queue, struct mem_cgroup, deferred_split_queue);
> -}
> -
> -static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg)
> -{
> -       return memcg ? &memcg->deferred_split_queue : split_queue_node(nid);
> -}
> -#else
> -static inline
> -struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
> -                                          struct deferred_split *queue)
> -{
> -       return NULL;
> -}
> -
> -static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg)
> -{
> -       return split_queue_node(nid);
> -}
> -#endif
> -
> -static struct deferred_split *split_queue_lock(int nid, struct mem_cgroup *memcg)
> -{
> -       struct deferred_split *queue;
> -
> -retry:
> -       queue = memcg_split_queue(nid, memcg);
> -       spin_lock(&queue->split_queue_lock);
> -       /*
> -        * There is a period between setting memcg to dying and reparenting
> -        * deferred split queue, and during this period the THPs in the deferred
> -        * split queue will be hidden from the shrinker side.
> -        */
> -       if (unlikely(memcg_is_dying(memcg))) {
> -               spin_unlock(&queue->split_queue_lock);
> -               memcg = parent_mem_cgroup(memcg);
> -               goto retry;
> -       }
> -
> -       return queue;
> -}
> -
> -static struct deferred_split *
> -split_queue_lock_irqsave(int nid, struct mem_cgroup *memcg, unsigned long *flags)
> -{
> -       struct deferred_split *queue;
> -
> -retry:
> -       queue = memcg_split_queue(nid, memcg);
> -       spin_lock_irqsave(&queue->split_queue_lock, *flags);
> -       if (unlikely(memcg_is_dying(memcg))) {
> -               spin_unlock_irqrestore(&queue->split_queue_lock, *flags);
> -               memcg = parent_mem_cgroup(memcg);
> -               goto retry;
> -       }
> -
> -       return queue;
> -}
> -
> -static struct deferred_split *folio_split_queue_lock(struct folio *folio)
> -{
> -       struct deferred_split *queue;
> -
> -       rcu_read_lock();
> -       queue = split_queue_lock(folio_nid(folio), folio_memcg(folio));
> -       /*
> -        * The memcg destruction path is acquiring the split queue lock for
> -        * reparenting. Once you have it locked, it's safe to drop the rcu lock.
> -        */
> -       rcu_read_unlock();
> -
> -       return queue;
> -}
> -
> -static struct deferred_split *
> -folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags)
> -{
> -       struct deferred_split *queue;
> -
> -       rcu_read_lock();
> -       queue = split_queue_lock_irqsave(folio_nid(folio), folio_memcg(folio), flags);
> -       rcu_read_unlock();
> -
> -       return queue;
> -}
> -
> -static inline void split_queue_unlock(struct deferred_split *queue)
> -{
> -       spin_unlock(&queue->split_queue_lock);
> -}
> -
> -static inline void split_queue_unlock_irqrestore(struct deferred_split *queue,
> -                                                unsigned long flags)
> -{
> -       spin_unlock_irqrestore(&queue->split_queue_lock, flags);
> -}
> -
>  static inline bool is_transparent_hugepage(const struct folio *folio)
>  {
>         if (!folio_test_large(folio))
> @@ -1346,6 +1245,14 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
>                 count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>                 return NULL;
>         }
> +
> +       if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {
> +               folio_put(folio);
> +               count_vm_event(THP_FAULT_FALLBACK);
> +               count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
> +               return NULL;
> +       }
> +
>         folio_throttle_swaprate(folio, gfp);
>
>         /*
> @@ -3854,34 +3761,34 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
>         struct folio *end_folio = folio_next(folio);
>         struct folio *new_folio, *next;
>         int old_order = folio_order(folio);
> +       struct list_lru_one *l;
> +       bool dequeue_deferred;
>         int ret = 0;
> -       struct deferred_split *ds_queue;
>
>         VM_WARN_ON_ONCE(!mapping && end);
>         /* Prevent deferred_split_scan() touching ->_refcount */
> -       ds_queue = folio_split_queue_lock(folio);
> +       dequeue_deferred = folio_test_anon(folio) && old_order > 1;
> +       if (dequeue_deferred) {
> +               rcu_read_lock();
> +               l = list_lru_lock(&deferred_split_lru,
> +                                 folio_nid(folio), folio_memcg(folio));
> +       }
>         if (folio_ref_freeze(folio, folio_cache_ref_count(folio) + 1)) {
>                 struct swap_cluster_info *ci = NULL;
>                 struct lruvec *lruvec;
>
> -               if (old_order > 1) {
> -                       if (!list_empty(&folio->_deferred_list)) {
> -                               ds_queue->split_queue_len--;
> -                               /*
> -                                * Reinitialize page_deferred_list after removing the
> -                                * page from the split_queue, otherwise a subsequent
> -                                * split will see list corruption when checking the
> -                                * page_deferred_list.
> -                                */
> -                               list_del_init(&folio->_deferred_list);
> -                       }
> +               if (dequeue_deferred) {
> +                       __list_lru_del(&deferred_split_lru, l,
> +                                      &folio->_deferred_list, folio_nid(folio));
>                         if (folio_test_partially_mapped(folio)) {
>                                 folio_clear_partially_mapped(folio);
>                                 mod_mthp_stat(old_order,
>                                         MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
>                         }
> +                       list_lru_unlock(l);
> +                       rcu_read_unlock();
>                 }
> -               split_queue_unlock(ds_queue);
> +
>                 if (mapping) {
>                         int nr = folio_nr_pages(folio);
>
> @@ -3982,7 +3889,10 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
>                 if (ci)
>                         swap_cluster_unlock(ci);
>         } else {
> -               split_queue_unlock(ds_queue);
> +               if (dequeue_deferred) {
> +                       list_lru_unlock(l);
> +                       rcu_read_unlock();
> +               }
>                 return -EAGAIN;
>         }
>
> @@ -4349,33 +4259,35 @@ int split_folio_to_list(struct folio *folio, struct list_head *list)
>   * queueing THP splits, and that list is (racily observed to be) non-empty.
>   *
>   * It is unsafe to call folio_unqueue_deferred_split() until folio refcount is
> - * zero: because even when split_queue_lock is held, a non-empty _deferred_list
> - * might be in use on deferred_split_scan()'s unlocked on-stack list.
> + * zero: because even when the list_lru lock is held, a non-empty
> + * _deferred_list might be in use on deferred_split_scan()'s unlocked
> + * on-stack list.
>   *
> - * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is
> - * therefore important to unqueue deferred split before changing folio memcg.
> + * The list_lru sublist is determined by folio's memcg: it is therefore
> + * important to unqueue deferred split before changing folio memcg.
>   */
>  bool __folio_unqueue_deferred_split(struct folio *folio)
>  {
> -       struct deferred_split *ds_queue;
> +       struct list_lru_one *l;
> +       int nid = folio_nid(folio);
>         unsigned long flags;
>         bool unqueued = false;
>
>         WARN_ON_ONCE(folio_ref_count(folio));
>         WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg_charged(folio));
>
> -       ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
> -       if (!list_empty(&folio->_deferred_list)) {
> -               ds_queue->split_queue_len--;
> +       rcu_read_lock();
> +       l = list_lru_lock_irqsave(&deferred_split_lru, nid, folio_memcg(folio), &flags);
> +       if (__list_lru_del(&deferred_split_lru, l, &folio->_deferred_list, nid)) {
>                 if (folio_test_partially_mapped(folio)) {
>                         folio_clear_partially_mapped(folio);
>                         mod_mthp_stat(folio_order(folio),
>                                       MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
>                 }
> -               list_del_init(&folio->_deferred_list);
>                 unqueued = true;
>         }
> -       split_queue_unlock_irqrestore(ds_queue, flags);
> +       list_lru_unlock_irqrestore(l, &flags);
> +       rcu_read_unlock();
>
>         return unqueued;        /* useful for debug warnings */
>  }
> @@ -4383,7 +4295,9 @@ bool __folio_unqueue_deferred_split(struct folio *folio)
>  /* partially_mapped=false won't clear PG_partially_mapped folio flag */
>  void deferred_split_folio(struct folio *folio, bool partially_mapped)
>  {
> -       struct deferred_split *ds_queue;
> +       struct list_lru_one *l;
> +       int nid;
> +       struct mem_cgroup *memcg;
>         unsigned long flags;
>
>         /*
> @@ -4406,7 +4320,11 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
>         if (folio_test_swapcache(folio))
>                 return;
>
> -       ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
> +       nid = folio_nid(folio);
> +
> +       rcu_read_lock();
> +       memcg = folio_memcg(folio);
> +       l = list_lru_lock_irqsave(&deferred_split_lru, nid, memcg, &flags);
>         if (partially_mapped) {
>                 if (!folio_test_partially_mapped(folio)) {
>                         folio_set_partially_mapped(folio);
> @@ -4414,36 +4332,20 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
>                                 count_vm_event(THP_DEFERRED_SPLIT_PAGE);
>                         count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
>                         mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, 1);
> -
>                 }
>         } else {
>                 /* partially mapped folios cannot become non-partially mapped */
>                 VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
>         }
> -       if (list_empty(&folio->_deferred_list)) {
> -               struct mem_cgroup *memcg;
> -
> -               memcg = folio_split_queue_memcg(folio, ds_queue);
> -               list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
> -               ds_queue->split_queue_len++;
> -               if (memcg)
> -                       set_shrinker_bit(memcg, folio_nid(folio),
> -                                        shrinker_id(deferred_split_shrinker));
> -       }
> -       split_queue_unlock_irqrestore(ds_queue, flags);
> +       __list_lru_add(&deferred_split_lru, l, &folio->_deferred_list, nid, memcg);
> +       list_lru_unlock_irqrestore(l, &flags);
> +       rcu_read_unlock();
>  }
>
>  static unsigned long deferred_split_count(struct shrinker *shrink,
>                 struct shrink_control *sc)
>  {
> -       struct pglist_data *pgdata = NODE_DATA(sc->nid);
> -       struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
> -
> -#ifdef CONFIG_MEMCG
> -       if (sc->memcg)
> -               ds_queue = &sc->memcg->deferred_split_queue;
> -#endif
> -       return READ_ONCE(ds_queue->split_queue_len);
> +       return list_lru_shrink_count(&deferred_split_lru, sc);
>  }
>
>  static bool thp_underused(struct folio *folio)
> @@ -4473,45 +4375,47 @@ static bool thp_underused(struct folio *folio)
>         return false;
>  }
>
> +static enum lru_status deferred_split_isolate(struct list_head *item,
> +                                             struct list_lru_one *lru,
> +                                             void *cb_arg)
> +{
> +       struct folio *folio = container_of(item, struct folio, _deferred_list);
> +       struct list_head *freeable = cb_arg;
> +
> +       if (folio_try_get(folio)) {
> +               list_lru_isolate_move(lru, item, freeable);
> +               return LRU_REMOVED;
> +       }
> +
> +       /* We lost race with folio_put() */
> +       list_lru_isolate(lru, item);
> +       if (folio_test_partially_mapped(folio)) {
> +               folio_clear_partially_mapped(folio);
> +               mod_mthp_stat(folio_order(folio),
> +                             MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
> +       }
> +       return LRU_REMOVED;
> +}
> +
>  static unsigned long deferred_split_scan(struct shrinker *shrink,
>                 struct shrink_control *sc)
>  {
> -       struct deferred_split *ds_queue;
> -       unsigned long flags;
> +       LIST_HEAD(dispose);
>         struct folio *folio, *next;
> -       int split = 0, i;
> -       struct folio_batch fbatch;
> +       int split = 0;
> +       unsigned long isolated;
>
> -       folio_batch_init(&fbatch);
> +       isolated = list_lru_shrink_walk_irq(&deferred_split_lru, sc,
> +                                           deferred_split_isolate, &dispose);
>
> -retry:
> -       ds_queue = split_queue_lock_irqsave(sc->nid, sc->memcg, &flags);
> -       /* Take pin on all head pages to avoid freeing them under us */
> -       list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
> -                                                       _deferred_list) {
> -               if (folio_try_get(folio)) {
> -                       folio_batch_add(&fbatch, folio);
> -               } else if (folio_test_partially_mapped(folio)) {
> -                       /* We lost race with folio_put() */
> -                       folio_clear_partially_mapped(folio);
> -                       mod_mthp_stat(folio_order(folio),
> -                                     MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
> -               }
> -               list_del_init(&folio->_deferred_list);
> -               ds_queue->split_queue_len--;
> -               if (!--sc->nr_to_scan)
> -                       break;
> -               if (!folio_batch_space(&fbatch))
> -                       break;
> -       }
> -       split_queue_unlock_irqrestore(ds_queue, flags);
> -
> -       for (i = 0; i < folio_batch_count(&fbatch); i++) {
> +       list_for_each_entry_safe(folio, next, &dispose, _deferred_list) {
>                 bool did_split = false;
>                 bool underused = false;
> -               struct deferred_split *fqueue;
> +               struct list_lru_one *l;
> +               unsigned long flags;
> +
> +               list_del_init(&folio->_deferred_list);
>
> -               folio = fbatch.folios[i];
>                 if (!folio_test_partially_mapped(folio)) {
>                         /*
>                          * See try_to_map_unused_to_zeropage(): we cannot
> @@ -4534,64 +4438,32 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
>                 }
>                 folio_unlock(folio);
>  next:
> -               if (did_split || !folio_test_partially_mapped(folio))
> -                       continue;
>                 /*
>                  * Only add back to the queue if folio is partially mapped.
>                  * If thp_underused returns false, or if split_folio fails
>                  * in the case it was underused, then consider it used and
>                  * don't add it back to split_queue.
>                  */
> -               fqueue = folio_split_queue_lock_irqsave(folio, &flags);
> -               if (list_empty(&folio->_deferred_list)) {
> -                       list_add_tail(&folio->_deferred_list, &fqueue->split_queue);
> -                       fqueue->split_queue_len++;
> +               if (!did_split && folio_test_partially_mapped(folio)) {
> +                       rcu_read_lock();
> +                       l = list_lru_lock_irqsave(&deferred_split_lru,
> +                                                 folio_nid(folio),
> +                                                 folio_memcg(folio),
> +                                                 &flags);
> +                       __list_lru_add(&deferred_split_lru, l,
> +                                      &folio->_deferred_list,
> +                                      folio_nid(folio), folio_memcg(folio));
> +                       list_lru_unlock_irqrestore(l, &flags);
> +                       rcu_read_unlock();
>                 }
> -               split_queue_unlock_irqrestore(fqueue, flags);
> -       }
> -       folios_put(&fbatch);
> -
> -       if (sc->nr_to_scan && !list_empty(&ds_queue->split_queue)) {
> -               cond_resched();
> -               goto retry;
> +               folio_put(folio);
>         }
>
> -       /*
> -        * Stop shrinker if we didn't split any page, but the queue is empty.
> -        * This can happen if pages were freed under us.
> -        */
> -       if (!split && list_empty(&ds_queue->split_queue))
> +       if (!split && !isolated)
>                 return SHRINK_STOP;
>         return split;
>  }
>
> -#ifdef CONFIG_MEMCG
> -void reparent_deferred_split_queue(struct mem_cgroup *memcg)
> -{
> -       struct mem_cgroup *parent = parent_mem_cgroup(memcg);
> -       struct deferred_split *ds_queue = &memcg->deferred_split_queue;
> -       struct deferred_split *parent_ds_queue = &parent->deferred_split_queue;
> -       int nid;
> -
> -       spin_lock_irq(&ds_queue->split_queue_lock);
> -       spin_lock_nested(&parent_ds_queue->split_queue_lock, SINGLE_DEPTH_NESTING);
> -
> -       if (!ds_queue->split_queue_len)
> -               goto unlock;
> -
> -       list_splice_tail_init(&ds_queue->split_queue, &parent_ds_queue->split_queue);
> -       parent_ds_queue->split_queue_len += ds_queue->split_queue_len;
> -       ds_queue->split_queue_len = 0;
> -
> -       for_each_node(nid)
> -               set_shrinker_bit(parent, nid, shrinker_id(deferred_split_shrinker));
> -
> -unlock:
> -       spin_unlock(&parent_ds_queue->split_queue_lock);
> -       spin_unlock_irq(&ds_queue->split_queue_lock);
> -}
> -#endif
> -
>  #ifdef CONFIG_DEBUG_FS
>  static void split_huge_pages_all(void)
>  {
> diff --git a/mm/internal.h b/mm/internal.h
> index f98f4746ac41..d8c737338df5 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -863,7 +863,7 @@ static inline bool folio_unqueue_deferred_split(struct folio *folio)
>         /*
>          * At this point, there is no one trying to add the folio to
>          * deferred_list. If folio is not in deferred_list, it's safe
> -        * to check without acquiring the split_queue_lock.
> +        * to check without acquiring the list_lru lock.
>          */
>         if (data_race(list_empty(&folio->_deferred_list)))
>                 return false;
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 4b0e59c7c0e6..b2ac28ddd480 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1081,6 +1081,7 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
>         }
>
>         count_vm_event(THP_COLLAPSE_ALLOC);
> +
>         if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
>                 folio_put(folio);
>                 *foliop = NULL;
> @@ -1089,6 +1090,12 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
>
>         count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1);
>
> +       if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {
> +               folio_put(folio);
> +               *foliop = NULL;
> +               return SCAN_CGROUP_CHARGE_FAIL;
> +       }
> +
>         *foliop = folio;
>         return SCAN_SUCCEED;
>  }
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index a47fb68dd65f..f381cb6bdff1 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -4015,11 +4015,6 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
>         for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
>                 memcg->cgwb_frn[i].done =
>                         __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
> -#endif
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -       spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
> -       INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
> -       memcg->deferred_split_queue.split_queue_len = 0;
>  #endif
>         lru_gen_init_memcg(memcg);
>         return memcg;
> @@ -4167,11 +4162,10 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
>         zswap_memcg_offline_cleanup(memcg);
>
>         memcg_offline_kmem(memcg);
> -       reparent_deferred_split_queue(memcg);
>         /*
> -        * The reparenting of objcg must be after the reparenting of the
> -        * list_lru and deferred_split_queue above, which ensures that they will
> -        * not mistakenly get the parent list_lru and deferred_split_queue.
> +        * The reparenting of objcg must be after the reparenting of
> +        * the list_lru in memcg_offline_kmem(), which ensures that
> +        * they will not mistakenly get the parent list_lru.
>          */
>         memcg_reparent_objcgs(memcg);
>         reparent_shrinker_deferred(memcg);
> diff --git a/mm/memory.c b/mm/memory.c
> index 219b9bf6cae0..e68ceb4aa624 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4651,13 +4651,19 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
>         while (orders) {
>                 addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>                 folio = vma_alloc_folio(gfp, order, vma, addr);
> -               if (folio) {
> -                       if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
> -                                                           gfp, entry))
> -                               return folio;
> +               if (!folio)
> +                       goto next;
> +               if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, gfp, entry)) {
>                         count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
>                         folio_put(folio);
> +                       goto next;
>                 }
> +               if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {
> +                       folio_put(folio);
> +                       goto fallback;
> +               }

Hi Johannes,

Haven't checked every detail yet, but one question here, which might be
trivial: would it be better to fall back to the next lower order instead
of falling back to order 0 directly? Suppose this is a 2M allocation and
a 1M fallback is allowed: releasing that folio and falling back to 1M
would free 1M of memory, which should be enough for the list_lru
metadata to be allocated.
Re: [PATCH v3 7/7] mm: switch deferred split shrinker to list_lru
Posted by Johannes Weiner 1 week ago
On Fri, Mar 27, 2026 at 03:51:07PM +0800, Kairui Song wrote:
> On Thu, Mar 19, 2026 at 4:05 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > @@ -4651,13 +4651,19 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
> >         while (orders) {
> >                 addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
> >                 folio = vma_alloc_folio(gfp, order, vma, addr);
> > -               if (folio) {
> > -                       if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
> > -                                                           gfp, entry))
> > -                               return folio;
> > +               if (!folio)
> > +                       goto next;
> > +               if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, gfp, entry)) {
> >                         count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
> >                         folio_put(folio);
> > +                       goto next;
> >                 }
> > +               if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {
> > +                       folio_put(folio);
> > +                       goto fallback;
> > +               }
> 
> Hi Johannes,
> 
> Haven't checked every detail yet, but one question here, might be
> trivial, will it be better if we fallback to the next order instead of
> fallback to 0 order directly? Suppose this is a 2M allocation and 1M
> fallback is allowed, releasing that folio and fallback to 1M will free
> 1M memory which would be enough for the list lru metadata to be
> allocated.

I would be surprised if that mattered. If we can get a 2M folio but
fail a couple of small slab requests, there are probably such extreme
levels of concurrency and pressure on the freelists that the fault has
a good chance of failing altogether and OOMing.

And if it doesn't matter, then let's consider it from a code clarity
point of view. For folio allocation and charging, we reduce the size
to try again. But the list_lru allocation is always the same size - it
would look weird to just try again on failure. If we do so based on
the logic you lay out above, now it needs a comment too...
Re: [PATCH v3 7/7] mm: switch deferred split shrinker to list_lru
Posted by Lorenzo Stoakes (Oracle) 1 week, 6 days ago
On Wed, Mar 18, 2026 at 03:53:25PM -0400, Johannes Weiner wrote:
> The deferred split queue handles cgroups in a suboptimal fashion. The
> queue is per-NUMA node or per-cgroup, not the intersection. That means
> on a cgrouped system, a node-restricted allocation entering reclaim
> can end up splitting large pages on other nodes:
>
> 	alloc/unmap
> 	  deferred_split_folio()
> 	    list_add_tail(memcg->split_queue)
> 	    set_shrinker_bit(memcg, node, deferred_shrinker_id)

So here it's:

__do_huge_pmd_anonymous_page() / do_huge_zero_wp_pmd()
-> map_anon_folio_pmd_pf()
-> map_anon_folio_pmd_nopf()
-> deferred_split_folio()
-> set_shrinker_bit()

Yeah it makes sense to make the first bit succinct anyway :)

>
> 	for_each_zone_zonelist_nodemask(restricted_nodes)
> 	  mem_cgroup_iter()
> 	    shrink_slab(node, memcg)
> 	      shrink_slab_memcg(node, memcg)
> 	        if test_shrinker_bit(memcg, node, deferred_shrinker_id)

Hmm, there's no such function — this is kind of pseudocode, essentially?

Wouldn't it be clearer to reference:

shrink_slab_memcg() -> do_shrink_slab() -> shrinker->scan_objects ->
deferred_split_scan()?

Though I get that it adds verbosity :)

Sorry, I'm not (just) being pedantic here — this is also so I can understand it myself :)

> 	          deferred_split_scan()
> 	            walks memcg->split_queue

Ok so overall there's a per-memcg memcg->split_queue, but we set a bit in
memcg->nodeinfo[nid]->shrinker_info->unit[blah]->map for it, and when we
enter shrink_slab_memcg(), we figure out the shrink_control from the
for_each_set_bit() across memcg->...->unit->map?

> The shrinker bit adds an imperfect guard rail. As soon as the cgroup
> has a single large page on the node of interest, all large pages owned
> by that memcg, including those on other nodes, will be split.

So at this point, regardless of node, we are invoking deferred_split_scan()
and it's the same memcg->split_queue we are walking, regardless of node?

Do let me know if my understanding is correct here!

Hmm that does sound sub-optimal.

>
> list_lru properly sets up per-node, per-cgroup lists. As a bonus, it
> streamlines a lot of the list operations and reclaim walks. It's used
> widely by other major shrinkers already. Convert the deferred split
> queue as well.

It's odd that it wasn't used before?

>
> The list_lru per-memcg heads are instantiated on demand when the first
> object of interest is allocated for a cgroup, by calling
> folio_memcg_list_lru_alloc(). Add calls to where splittable pages are
> created: anon faults, swapin faults, khugepaged collapse.

OK that makes sense.

>
> These calls create all possible node heads for the cgroup at once, so
> the migration code (between nodes) doesn't need any special care.

Nice!

>
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>

Hmm you saved the big one for last :)

> ---
>  include/linux/huge_mm.h    |   6 +-
>  include/linux/memcontrol.h |   4 -
>  include/linux/mmzone.h     |  12 --
>  mm/huge_memory.c           | 342 ++++++++++++-------------------------
>  mm/internal.h              |   2 +-
>  mm/khugepaged.c            |   7 +
>  mm/memcontrol.c            |  12 +-
>  mm/memory.c                |  52 +++---
>  mm/mm_init.c               |  15 --
>  9 files changed, 151 insertions(+), 301 deletions(-)
>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index bd7f0e1d8094..8d801ed378db 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -414,10 +414,9 @@ static inline int split_huge_page(struct page *page)
>  {
>  	return split_huge_page_to_list_to_order(page, NULL, 0);
>  }
> +
> +extern struct list_lru deferred_split_lru;

It might be nice for the sake of avoiding a global to instead expose this
as a getter?

Or actually better, since every caller outside of huge_memory.c that
references this uses folio_memcg_list_lru_alloc(), do something like:

int folio_memcg_alloc_deferred(struct folio *folio, gfp_t gfp);

in mm/huge_memory.c:

/**
 * blah blah blah put on error blah
 */
int folio_memcg_alloc_deferred(struct folio *folio, gfp_t gfp)
{
	int err;

	err = folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp);
	if (err) {
		folio_put(folio);
		return err;
	}

	return 0;
}

And then the callers can just invoke this, and you can make
deferred_split_lru static in mm/huge_memory.c?


>  void deferred_split_folio(struct folio *folio, bool partially_mapped);
> -#ifdef CONFIG_MEMCG
> -void reparent_deferred_split_queue(struct mem_cgroup *memcg);
> -#endif
>
>  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
>  		unsigned long address, bool freeze);
> @@ -650,7 +649,6 @@ static inline int try_folio_split_to_order(struct folio *folio,
>  }
>
>  static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
> -static inline void reparent_deferred_split_queue(struct mem_cgroup *memcg) {}
>  #define split_huge_pmd(__vma, __pmd, __address)	\
>  	do { } while (0)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 086158969529..0782c72a1997 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -277,10 +277,6 @@ struct mem_cgroup {
>  	struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT];
>  #endif
>
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -	struct deferred_split deferred_split_queue;
> -#endif
> -
>  #ifdef CONFIG_LRU_GEN_WALKS_MMU
>  	/* per-memcg mm_struct list */
>  	struct lru_gen_mm_list mm_list;
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 7bd0134c241c..232b7a71fd69 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -1429,14 +1429,6 @@ struct zonelist {
>   */
>  extern struct page *mem_map;
>
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -struct deferred_split {
> -	spinlock_t split_queue_lock;
> -	struct list_head split_queue;
> -	unsigned long split_queue_len;
> -};
> -#endif
> -
>  #ifdef CONFIG_MEMORY_FAILURE
>  /*
>   * Per NUMA node memory failure handling statistics.
> @@ -1562,10 +1554,6 @@ typedef struct pglist_data {
>  	unsigned long first_deferred_pfn;
>  #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
>
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -	struct deferred_split deferred_split_queue;
> -#endif
> -
>  #ifdef CONFIG_NUMA_BALANCING
>  	/* start time in ms of current promote rate limit period */
>  	unsigned int nbp_rl_start;
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 3fc02913b63e..e90d08db219d 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -14,6 +14,7 @@
>  #include <linux/mmu_notifier.h>
>  #include <linux/rmap.h>
>  #include <linux/swap.h>
> +#include <linux/list_lru.h>
>  #include <linux/shrinker.h>
>  #include <linux/mm_inline.h>
>  #include <linux/swapops.h>
> @@ -67,6 +68,8 @@ unsigned long transparent_hugepage_flags __read_mostly =
>  	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
>  	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
>
> +static struct lock_class_key deferred_split_key;
> +struct list_lru deferred_split_lru;
>  static struct shrinker *deferred_split_shrinker;
>  static unsigned long deferred_split_count(struct shrinker *shrink,
>  					  struct shrink_control *sc);
> @@ -919,6 +922,13 @@ static int __init thp_shrinker_init(void)
>  	if (!deferred_split_shrinker)
>  		return -ENOMEM;
>
> +	if (list_lru_init_memcg_key(&deferred_split_lru,
> +				    deferred_split_shrinker,
> +				    &deferred_split_key)) {
> +		shrinker_free(deferred_split_shrinker);
> +		return -ENOMEM;
> +	}
> +

It's kind of out of scope for the series, but I hate that the huge zero
folio stuff has an early exit for persistent but if not, falls through to
trying to set that up - it'd be nice to split out the huge zero folio stuff
into another function.

But probably one for a follow up!

>  	deferred_split_shrinker->count_objects = deferred_split_count;
>  	deferred_split_shrinker->scan_objects = deferred_split_scan;
>  	shrinker_register(deferred_split_shrinker);
> @@ -939,6 +949,7 @@ static int __init thp_shrinker_init(void)
>
>  	huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero");
>  	if (!huge_zero_folio_shrinker) {
> +		list_lru_destroy(&deferred_split_lru);
>  		shrinker_free(deferred_split_shrinker);

Presumably no probably-impossible-in-reality race on somebody entering the
shrinker and referencing the deferred_split_lru before the shrinker is freed?

>  		return -ENOMEM;
>  	}
> @@ -953,6 +964,7 @@ static int __init thp_shrinker_init(void)
>  static void __init thp_shrinker_exit(void)
>  {
>  	shrinker_free(huge_zero_folio_shrinker);
> +	list_lru_destroy(&deferred_split_lru);

Same question above as to race/ordering.

>  	shrinker_free(deferred_split_shrinker);
>  }



>
> @@ -1133,119 +1145,6 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
>  	return pmd;
>  }
>
> -static struct deferred_split *split_queue_node(int nid)
> -{
> -	struct pglist_data *pgdata = NODE_DATA(nid);
> -
> -	return &pgdata->deferred_split_queue;
> -}
> -
> -#ifdef CONFIG_MEMCG
> -static inline
> -struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
> -					   struct deferred_split *queue)
> -{
> -	if (mem_cgroup_disabled())
> -		return NULL;
> -	if (split_queue_node(folio_nid(folio)) == queue)
> -		return NULL;
> -	return container_of(queue, struct mem_cgroup, deferred_split_queue);
> -}
> -
> -static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg)
> -{
> -	return memcg ? &memcg->deferred_split_queue : split_queue_node(nid);
> -}
> -#else
> -static inline
> -struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
> -					   struct deferred_split *queue)
> -{
> -	return NULL;
> -}
> -
> -static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg)
> -{
> -	return split_queue_node(nid);
> -}
> -#endif
> -
> -static struct deferred_split *split_queue_lock(int nid, struct mem_cgroup *memcg)
> -{
> -	struct deferred_split *queue;
> -
> -retry:
> -	queue = memcg_split_queue(nid, memcg);
> -	spin_lock(&queue->split_queue_lock);
> -	/*
> -	 * There is a period between setting memcg to dying and reparenting
> -	 * deferred split queue, and during this period the THPs in the deferred
> -	 * split queue will be hidden from the shrinker side.
> -	 */
> -	if (unlikely(memcg_is_dying(memcg))) {
> -		spin_unlock(&queue->split_queue_lock);
> -		memcg = parent_mem_cgroup(memcg);
> -		goto retry;
> -	}
> -
> -	return queue;
> -}
> -
> -static struct deferred_split *
> -split_queue_lock_irqsave(int nid, struct mem_cgroup *memcg, unsigned long *flags)
> -{
> -	struct deferred_split *queue;
> -
> -retry:
> -	queue = memcg_split_queue(nid, memcg);
> -	spin_lock_irqsave(&queue->split_queue_lock, *flags);
> -	if (unlikely(memcg_is_dying(memcg))) {
> -		spin_unlock_irqrestore(&queue->split_queue_lock, *flags);
> -		memcg = parent_mem_cgroup(memcg);
> -		goto retry;
> -	}
> -
> -	return queue;
> -}
> -
> -static struct deferred_split *folio_split_queue_lock(struct folio *folio)
> -{
> -	struct deferred_split *queue;
> -
> -	rcu_read_lock();
> -	queue = split_queue_lock(folio_nid(folio), folio_memcg(folio));
> -	/*
> -	 * The memcg destruction path is acquiring the split queue lock for
> -	 * reparenting. Once you have it locked, it's safe to drop the rcu lock.
> -	 */
> -	rcu_read_unlock();
> -
> -	return queue;
> -}
> -
> -static struct deferred_split *
> -folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags)
> -{
> -	struct deferred_split *queue;
> -
> -	rcu_read_lock();
> -	queue = split_queue_lock_irqsave(folio_nid(folio), folio_memcg(folio), flags);
> -	rcu_read_unlock();
> -
> -	return queue;
> -}
> -
> -static inline void split_queue_unlock(struct deferred_split *queue)
> -{
> -	spin_unlock(&queue->split_queue_lock);
> -}
> -
> -static inline void split_queue_unlock_irqrestore(struct deferred_split *queue,
> -						 unsigned long flags)
> -{
> -	spin_unlock_irqrestore(&queue->split_queue_lock, flags);
> -}
> -

Lots of red, love to see it :)

>  static inline bool is_transparent_hugepage(const struct folio *folio)
>  {
>  	if (!folio_test_large(folio))
> @@ -1346,6 +1245,14 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
>  		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>  		return NULL;
>  	}
> +
> +	if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {
> +		folio_put(folio);
> +		count_vm_event(THP_FAULT_FALLBACK);
> +		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
> +		return NULL;
> +	}
> +

As mentioned above, would be good to separate out into helper in mm/huge_memory.c.

>  	folio_throttle_swaprate(folio, gfp);
>
>         /*
> @@ -3854,34 +3761,34 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
>  	struct folio *end_folio = folio_next(folio);
>  	struct folio *new_folio, *next;
>  	int old_order = folio_order(folio);
> +	struct list_lru_one *l;

Nit, and maybe this is a convention, but I hate single-letter variable
names — 'lru' or something might be nicer?

> +	bool dequeue_deferred;
>  	int ret = 0;
> -	struct deferred_split *ds_queue;
>
>  	VM_WARN_ON_ONCE(!mapping && end);
>  	/* Prevent deferred_split_scan() touching ->_refcount */
> -	ds_queue = folio_split_queue_lock(folio);
> +	dequeue_deferred = folio_test_anon(folio) && old_order > 1;

Why anon? (This review is partly me learning about the shrinker, an area
I'm weak on :)

> +	if (dequeue_deferred) {
> +		rcu_read_lock();
> +		l = list_lru_lock(&deferred_split_lru,
> +				  folio_nid(folio), folio_memcg(folio));

Hmm, I don't adore this sort of almost-'hidden' RCU lock here, but this
function is pretty disgusting and needs serious work in general.

And any function that took the RCU lock and list_lru lock/did the unlock
equivalent would be equally horrible so yeah, I guess needs deferring to a
refactor.

OTOH, this could be a good excuse for us to pay down some technical debt
and split out for instance the folio_ref_freeze() bits?

Could we do something like:

	bool frozen;

	...

	dequeue_deferred = folio_test_anon(folio) && old_order > 1;
	frozen = folio_ref_freeze(folio, folio_cache_ref_count(folio) + 1);

	if (dequeue_deferred && frozen) {
		struct list_lru_one *lru;

		rcu_read_lock();
		lru = list_lru_lock(&deferred_split_lru,
				    folio_nid(folio), folio_memcg(folio));
		__list_lru_del(&deferred_split_lru, lru,
			       &folio->_deferred_list, folio_nid(folio));
		if (folio_test_partially_mapped(folio)) {
			folio_clear_partially_mapped(folio);
			mod_mthp_stat(old_order,
				MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
		}
		list_lru_unlock(lru);
		rcu_read_unlock();
	}

	if (!frozen)
		return -EAGAIN;

	... rest of logic now one indent level lower ...

Or maybe factor that out into a helper function or something?

static void execute_deferred_dequeue(...) { ... }

With this implemented either way you'd be able to get rid of the else block
too.

obviously only valid if you are able to do the freezing earlier?


> +	}
>  	if (folio_ref_freeze(folio, folio_cache_ref_count(folio) + 1)) {
>  		struct swap_cluster_info *ci = NULL;
>  		struct lruvec *lruvec;
>
> -		if (old_order > 1) {

Before was this also applicable to non-anon folios?

> -			if (!list_empty(&folio->_deferred_list)) {
> -				ds_queue->split_queue_len--;
> -				/*
> -				 * Reinitialize page_deferred_list after removing the
> -				 * page from the split_queue, otherwise a subsequent
> -				 * split will see list corruption when checking the
> -				 * page_deferred_list.
> -				 */
> -				list_del_init(&folio->_deferred_list);
> -			}
> +		if (dequeue_deferred) {
> +			__list_lru_del(&deferred_split_lru, l,
> +				       &folio->_deferred_list, folio_nid(folio));
>  			if (folio_test_partially_mapped(folio)) {
>  				folio_clear_partially_mapped(folio);
>  				mod_mthp_stat(old_order,
>  					MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
>  			}
> +			list_lru_unlock(l);
> +			rcu_read_unlock();
>  		}
> -		split_queue_unlock(ds_queue);
> +
>  		if (mapping) {
>  			int nr = folio_nr_pages(folio);
>
> @@ -3982,7 +3889,10 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
>  		if (ci)
>  			swap_cluster_unlock(ci);
>  	} else {
> -		split_queue_unlock(ds_queue);
> +		if (dequeue_deferred) {
> +			list_lru_unlock(l);
> +			rcu_read_unlock();
> +		}
>  		return -EAGAIN;
>  	}
>
> @@ -4349,33 +4259,35 @@ int split_folio_to_list(struct folio *folio, struct list_head *list)
>   * queueing THP splits, and that list is (racily observed to be) non-empty.
>   *
>   * It is unsafe to call folio_unqueue_deferred_split() until folio refcount is
> - * zero: because even when split_queue_lock is held, a non-empty _deferred_list
> - * might be in use on deferred_split_scan()'s unlocked on-stack list.
> + * zero: because even when the list_lru lock is held, a non-empty
> + * _deferred_list might be in use on deferred_split_scan()'s unlocked
> + * on-stack list.
>   *
> - * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is
> - * therefore important to unqueue deferred split before changing folio memcg.
> + * The list_lru sublist is determined by folio's memcg: it is therefore
> + * important to unqueue deferred split before changing folio memcg.
>   */
>  bool __folio_unqueue_deferred_split(struct folio *folio)
>  {
> -	struct deferred_split *ds_queue;
> +	struct list_lru_one *l;

Again nitty thing about single letter var name, OTOH if that's somehow the
convention maybe fine. 'l' is especially problematic as it looks like a 1
in many fonts :)

> +	int nid = folio_nid(folio);
>  	unsigned long flags;
>  	bool unqueued = false;
>
>  	WARN_ON_ONCE(folio_ref_count(folio));
>  	WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg_charged(folio));
>
> -	ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
> -	if (!list_empty(&folio->_deferred_list)) {
> -		ds_queue->split_queue_len--;
> +	rcu_read_lock();
> +	l = list_lru_lock_irqsave(&deferred_split_lru, nid, folio_memcg(folio), &flags);
> +	if (__list_lru_del(&deferred_split_lru, l, &folio->_deferred_list, nid)) {

Maybe worth factoring __list_lru_del() into something that explicitly
references &folio->_deferred_list rather than open coding it in both places?

>  		if (folio_test_partially_mapped(folio)) {
>  			folio_clear_partially_mapped(folio);
>  			mod_mthp_stat(folio_order(folio),
>  				      MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
>  		}
> -		list_del_init(&folio->_deferred_list);
>  		unqueued = true;
>  	}
> -	split_queue_unlock_irqrestore(ds_queue, flags);
> +	list_lru_unlock_irqrestore(l, &flags);
> +	rcu_read_unlock();
>
>  	return unqueued;	/* useful for debug warnings */
>  }
> @@ -4383,7 +4295,9 @@ bool __folio_unqueue_deferred_split(struct folio *folio)
>  /* partially_mapped=false won't clear PG_partially_mapped folio flag */
>  void deferred_split_folio(struct folio *folio, bool partially_mapped)
>  {
> -	struct deferred_split *ds_queue;
> +	struct list_lru_one *l;
> +	int nid;
> +	struct mem_cgroup *memcg;
>  	unsigned long flags;
>
>  	/*
> @@ -4406,7 +4320,11 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
>  	if (folio_test_swapcache(folio))
>  		return;
>
> -	ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
> +	nid = folio_nid(folio);
> +
> +	rcu_read_lock();
> +	memcg = folio_memcg(folio);
> +	l = list_lru_lock_irqsave(&deferred_split_lru, nid, memcg, &flags);

Do we really need to hold the lock over all the below, but hmm we can't do
an irqsave/restore list_lru_add(), and maybe not worth adding one either,
OK.

Just seems odd to have <lock> <__unlocked_add_variant> <unlock>,
instinctively feels like it should be just <locked_add_variant>.

>  	if (partially_mapped) {
>  		if (!folio_test_partially_mapped(folio)) {
>  			folio_set_partially_mapped(folio);
> @@ -4414,36 +4332,20 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
>  				count_vm_event(THP_DEFERRED_SPLIT_PAGE);
>  			count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
>  			mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, 1);
> -

Always nice to remove these annoying random newlines :)

>  		}
>  	} else {
>  		/* partially mapped folios cannot become non-partially mapped */
>  		VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
>  	}
> -	if (list_empty(&folio->_deferred_list)) {
> -		struct mem_cgroup *memcg;
> -
> -		memcg = folio_split_queue_memcg(folio, ds_queue);
> -		list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
> -		ds_queue->split_queue_len++;
> -		if (memcg)
> -			set_shrinker_bit(memcg, folio_nid(folio),
> -					 shrinker_id(deferred_split_shrinker));
> -	}
> -	split_queue_unlock_irqrestore(ds_queue, flags);
> +	__list_lru_add(&deferred_split_lru, l, &folio->_deferred_list, nid, memcg);
> +	list_lru_unlock_irqrestore(l, &flags);
> +	rcu_read_unlock();
>  }
>
>  static unsigned long deferred_split_count(struct shrinker *shrink,
>  		struct shrink_control *sc)
>  {
> -	struct pglist_data *pgdata = NODE_DATA(sc->nid);
> -	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
> -
> -#ifdef CONFIG_MEMCG
> -	if (sc->memcg)
> -		ds_queue = &sc->memcg->deferred_split_queue;
> -#endif
> -	return READ_ONCE(ds_queue->split_queue_len);
> +	return list_lru_shrink_count(&deferred_split_lru, sc);

That's nice :)

>  }
>
>  static bool thp_underused(struct folio *folio)
> @@ -4473,45 +4375,47 @@ static bool thp_underused(struct folio *folio)
>  	return false;
>  }
>
> +static enum lru_status deferred_split_isolate(struct list_head *item,
> +					      struct list_lru_one *lru,
> +					      void *cb_arg)
> +{
> +	struct folio *folio = container_of(item, struct folio, _deferred_list);
> +	struct list_head *freeable = cb_arg;
> +
> +	if (folio_try_get(folio)) {
> +		list_lru_isolate_move(lru, item, freeable);
> +		return LRU_REMOVED;
> +	}
> +
> +	/* We lost race with folio_put() */

Hmm, in the original code, this comment is associated with the partially
mapped logic, BUT this seems actually correct: folio_try_get(), since it
does folio_ref_add_unless_zero(), only fails if the folio lost the race
with folio_put().

So I think you're more correct here, right?

> +	list_lru_isolate(lru, item);
> +	if (folio_test_partially_mapped(folio)) {
> +		folio_clear_partially_mapped(folio);
> +		mod_mthp_stat(folio_order(folio),
> +			      MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
> +	}
> +	return LRU_REMOVED;
> +}
> +
>  static unsigned long deferred_split_scan(struct shrinker *shrink,
>  		struct shrink_control *sc)
>  {
> -	struct deferred_split *ds_queue;
> -	unsigned long flags;
> +	LIST_HEAD(dispose);
>  	struct folio *folio, *next;
> -	int split = 0, i;
> -	struct folio_batch fbatch;
> +	int split = 0;
> +	unsigned long isolated;
>
> -	folio_batch_init(&fbatch);
> +	isolated = list_lru_shrink_walk_irq(&deferred_split_lru, sc,
> +					    deferred_split_isolate, &dispose);
>
> -retry:
> -	ds_queue = split_queue_lock_irqsave(sc->nid, sc->memcg, &flags);
> -	/* Take pin on all head pages to avoid freeing them under us */
> -	list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
> -							_deferred_list) {
> -		if (folio_try_get(folio)) {
> -			folio_batch_add(&fbatch, folio);
> -		} else if (folio_test_partially_mapped(folio)) {
> -			/* We lost race with folio_put() */
> -			folio_clear_partially_mapped(folio);
> -			mod_mthp_stat(folio_order(folio),
> -				      MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
> -		}
> -		list_del_init(&folio->_deferred_list);
> -		ds_queue->split_queue_len--;
> -		if (!--sc->nr_to_scan)
> -			break;
> -		if (!folio_batch_space(&fbatch))
> -			break;
> -	}
> -	split_queue_unlock_irqrestore(ds_queue, flags);
> -
> -	for (i = 0; i < folio_batch_count(&fbatch); i++) {
> +	list_for_each_entry_safe(folio, next, &dispose, _deferred_list) {
>  		bool did_split = false;
>  		bool underused = false;
> -		struct deferred_split *fqueue;
> +		struct list_lru_one *l;
> +		unsigned long flags;
> +
> +		list_del_init(&folio->_deferred_list);
>
> -		folio = fbatch.folios[i];
>  		if (!folio_test_partially_mapped(folio)) {
>  			/*
>  			 * See try_to_map_unused_to_zeropage(): we cannot
> @@ -4534,64 +4438,32 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
>  		}
>  		folio_unlock(folio);
>  next:
> -		if (did_split || !folio_test_partially_mapped(folio))
> -			continue;
>  		/*
>  		 * Only add back to the queue if folio is partially mapped.
>  		 * If thp_underused returns false, or if split_folio fails
>  		 * in the case it was underused, then consider it used and
>  		 * don't add it back to split_queue.
>  		 */
> -		fqueue = folio_split_queue_lock_irqsave(folio, &flags);
> -		if (list_empty(&folio->_deferred_list)) {
> -			list_add_tail(&folio->_deferred_list, &fqueue->split_queue);
> -			fqueue->split_queue_len++;
> +		if (!did_split && folio_test_partially_mapped(folio)) {
> +			rcu_read_lock();
> +			l = list_lru_lock_irqsave(&deferred_split_lru,
> +						  folio_nid(folio),
> +						  folio_memcg(folio),
> +						  &flags);
> +			__list_lru_add(&deferred_split_lru, l,
> +				       &folio->_deferred_list,
> +				       folio_nid(folio), folio_memcg(folio));
> +			list_lru_unlock_irqrestore(l, &flags);

Hmm this does make me think it'd be nice to have a list_lru_add() variant
for irqsave/restore then, since it's a repeating pattern!

> +			rcu_read_unlock();
>  		}
> -		split_queue_unlock_irqrestore(fqueue, flags);
> -	}
> -	folios_put(&fbatch);
> -
> -	if (sc->nr_to_scan && !list_empty(&ds_queue->split_queue)) {
> -		cond_resched();
> -		goto retry;
> +		folio_put(folio);
>  	}
>
> -	/*
> -	 * Stop shrinker if we didn't split any page, but the queue is empty.
> -	 * This can happen if pages were freed under us.
> -	 */
> -	if (!split && list_empty(&ds_queue->split_queue))
> +	if (!split && !isolated)
>  		return SHRINK_STOP;
>  	return split;
>  }
>
> -#ifdef CONFIG_MEMCG
> -void reparent_deferred_split_queue(struct mem_cgroup *memcg)
> -{
> -	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
> -	struct deferred_split *ds_queue = &memcg->deferred_split_queue;
> -	struct deferred_split *parent_ds_queue = &parent->deferred_split_queue;
> -	int nid;
> -
> -	spin_lock_irq(&ds_queue->split_queue_lock);
> -	spin_lock_nested(&parent_ds_queue->split_queue_lock, SINGLE_DEPTH_NESTING);
> -
> -	if (!ds_queue->split_queue_len)
> -		goto unlock;
> -
> -	list_splice_tail_init(&ds_queue->split_queue, &parent_ds_queue->split_queue);
> -	parent_ds_queue->split_queue_len += ds_queue->split_queue_len;
> -	ds_queue->split_queue_len = 0;
> -
> -	for_each_node(nid)
> -		set_shrinker_bit(parent, nid, shrinker_id(deferred_split_shrinker));
> -
> -unlock:
> -	spin_unlock(&parent_ds_queue->split_queue_lock);
> -	spin_unlock_irq(&ds_queue->split_queue_lock);
> -}
> -#endif
> -
>  #ifdef CONFIG_DEBUG_FS
>  static void split_huge_pages_all(void)
>  {
> diff --git a/mm/internal.h b/mm/internal.h
> index f98f4746ac41..d8c737338df5 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -863,7 +863,7 @@ static inline bool folio_unqueue_deferred_split(struct folio *folio)
>  	/*
>  	 * At this point, there is no one trying to add the folio to
>  	 * deferred_list. If folio is not in deferred_list, it's safe
> -	 * to check without acquiring the split_queue_lock.
> +	 * to check without acquiring the list_lru lock.
>  	 */
>  	if (data_race(list_empty(&folio->_deferred_list)))
>  		return false;
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 4b0e59c7c0e6..b2ac28ddd480 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1081,6 +1081,7 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
>  	}
>
>  	count_vm_event(THP_COLLAPSE_ALLOC);
> +
>  	if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
>  		folio_put(folio);
>  		*foliop = NULL;
> @@ -1089,6 +1090,12 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
>
>  	count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1);

Do we want to put this stat counter underneath the below so it's not
incremented on fail?

>
> +	if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {
> +		folio_put(folio);
> +		*foliop = NULL;
> +		return SCAN_CGROUP_CHARGE_FAIL;

Do we not need to uncharge here?

> +	}
> +
>  	*foliop = folio;
>  	return SCAN_SUCCEED;
>  }
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index a47fb68dd65f..f381cb6bdff1 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -4015,11 +4015,6 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
>  	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
>  		memcg->cgwb_frn[i].done =
>  			__WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
> -#endif
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -	spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
> -	INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
> -	memcg->deferred_split_queue.split_queue_len = 0;
>  #endif
>  	lru_gen_init_memcg(memcg);
>  	return memcg;
> @@ -4167,11 +4162,10 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
>  	zswap_memcg_offline_cleanup(memcg);
>
>  	memcg_offline_kmem(memcg);
> -	reparent_deferred_split_queue(memcg);
>  	/*
> -	 * The reparenting of objcg must be after the reparenting of the
> -	 * list_lru and deferred_split_queue above, which ensures that they will
> -	 * not mistakenly get the parent list_lru and deferred_split_queue.
> +	 * The reparenting of objcg must be after the reparenting of
> +	 * the list_lru in memcg_offline_kmem(), which ensures that
> +	 * they will not mistakenly get the parent list_lru.
>  	 */
>  	memcg_reparent_objcgs(memcg);
>  	reparent_shrinker_deferred(memcg);
> diff --git a/mm/memory.c b/mm/memory.c
> index 219b9bf6cae0..e68ceb4aa624 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4651,13 +4651,19 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
>  	while (orders) {
>  		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>  		folio = vma_alloc_folio(gfp, order, vma, addr);
> -		if (folio) {
> -			if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
> -							    gfp, entry))
> -				return folio;
> +		if (!folio)
> +			goto next;
> +		if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, gfp, entry)) {
>  			count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
>  			folio_put(folio);
> +			goto next;
>  		}
> +		if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {

Do we need to uncharge here?

> +			folio_put(folio);
> +			goto fallback;
> +		}
> +		return folio;
> +next:
>  		count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK);
>  		order = next_order(&orders, order);
>  	}
> @@ -5169,24 +5175,28 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
>  	while (orders) {
>  		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>  		folio = vma_alloc_folio(gfp, order, vma, addr);
> -		if (folio) {
> -			if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
> -				count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> -				folio_put(folio);
> -				goto next;
> -			}
> -			folio_throttle_swaprate(folio, gfp);
> -			/*
> -			 * When a folio is not zeroed during allocation
> -			 * (__GFP_ZERO not used) or user folios require special
> -			 * handling, folio_zero_user() is used to make sure
> -			 * that the page corresponding to the faulting address
> -			 * will be hot in the cache after zeroing.
> -			 */
> -			if (user_alloc_needs_zeroing())
> -				folio_zero_user(folio, vmf->address);
> -			return folio;
> +		if (!folio)
> +			goto next;

This applies to the above equivalent refactorings, but maybe worth
separating out this:

if (folio) { ... big branch ... } ->
if (!folio) goto next; ... what was big branch ...

Refactoring into a separate patch? Makes it easier to see pertinent logic
changes + helps review/bisectability/fixes/etc. etc.


> +		if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
> +			count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> +			folio_put(folio);
> +			goto next;
>  		}
> +		if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {

Again, do we need to uncharge here?

> +			folio_put(folio);
> +			goto fallback;
> +		}
> +		folio_throttle_swaprate(folio, gfp);
> +		/*
> +		 * When a folio is not zeroed during allocation
> +		 * (__GFP_ZERO not used) or user folios require special
> +		 * handling, folio_zero_user() is used to make sure
> +		 * that the page corresponding to the faulting address
> +		 * will be hot in the cache after zeroing.
> +		 */
> +		if (user_alloc_needs_zeroing())
> +			folio_zero_user(folio, vmf->address);
> +		return folio;
>  next:
>  		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
>  		order = next_order(&orders, order);
> diff --git a/mm/mm_init.c b/mm/mm_init.c
> index cec7bb758bdd..f293a62e652a 100644
> --- a/mm/mm_init.c
> +++ b/mm/mm_init.c
> @@ -1388,19 +1388,6 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat,
>  	pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
>  }
>
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -static void pgdat_init_split_queue(struct pglist_data *pgdat)
> -{
> -	struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
> -
> -	spin_lock_init(&ds_queue->split_queue_lock);
> -	INIT_LIST_HEAD(&ds_queue->split_queue);
> -	ds_queue->split_queue_len = 0;
> -}
> -#else
> -static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
> -#endif
> -
>  #ifdef CONFIG_COMPACTION
>  static void pgdat_init_kcompactd(struct pglist_data *pgdat)
>  {
> @@ -1416,8 +1403,6 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
>
>  	pgdat_resize_init(pgdat);
>  	pgdat_kswapd_lock_init(pgdat);
> -
> -	pgdat_init_split_queue(pgdat);
>  	pgdat_init_kcompactd(pgdat);
>
>  	init_waitqueue_head(&pgdat->kswapd_wait);
> --
> 2.53.0
>

Overall a lovely amount of code deletion here :)

Thanks for doing this, Cheers Lorenzo
Re: [PATCH v3 7/7] mm: switch deferred split shrinker to list_lru
Posted by Johannes Weiner 1 week ago
Hey Lorenzo,

Thanks for taking an in-depth look at this!

Apologies for the late reply, I was traveling last week.

On Tue, Mar 24, 2026 at 01:48:06PM +0000, Lorenzo Stoakes (Oracle) wrote:
> On Wed, Mar 18, 2026 at 03:53:25PM -0400, Johannes Weiner wrote:
> > The deferred split queue handles cgroups in a suboptimal fashion. The
> > queue is per-NUMA node or per-cgroup, not the intersection. That means
> > on a cgrouped system, a node-restricted allocation entering reclaim
> > can end up splitting large pages on other nodes:
> >
> > 	alloc/unmap
> > 	  deferred_split_folio()
> > 	    list_add_tail(memcg->split_queue)
> > 	    set_shrinker_bit(memcg, node, deferred_shrinker_id)
> 
> So here it's:
> 
> __do_huge_pmd_anonymous_page() / do_huge_zero_wp_pmd()
> -> map_anon_folio_pmd_pf()
> -> map_anon_folio_pmd_nopf()
> -> deferred_split_folio()
> -> set_shrinker_bit()
> 
> Yeah it makes sense to make the first bit succinct anyway :)

Yep I was trying to keep the focus on those parts that interact with
the other stacks listed below. Not biblically accurate
deferred_split_folio() callsites ;)

> > 	for_each_zone_zonelist_nodemask(restricted_nodes)
> > 	  mem_cgroup_iter()
> > 	    shrink_slab(node, memcg)
> > 	      shrink_slab_memcg(node, memcg)
> > 	        if test_shrinker_bit(memcg, node, deferred_shrinker_id)
> 
> Hmm there's no such function, so this is kind of pseudocode, essentially?
> 
> Wouldn't it be clearer to reference:
> 
> shrink_slab_memcg() -> do_shrink_slab() -> shrinker->scan_objects ->
> deferred_split_scan()?
> 
> Though I get that it adds verbosity :)
> 
> Sorry not (just) being pedantic here, also so I can understand myself :)

Yes, it's pseudocode. Direct reclaim (shrink_zones) and kswapd
(wake_all_kswapds) do the "for all nodes" loops. Nested within them
are the loops that iterate the cgroup tree (shrink_node_memcgs,
shrink_many, depending on LRU vs MGLRU). And then nested within
*those* we have the shrink_slab() calls.

Again, I tried to keep the boilerplate low and to the point. The most
important thing is: 1) the shrinker bit gets set for each node, memcg
pair that has objects, 2) shrinker visits all nodes x memcgs with it
set, 3) but the shrinker scans happen on a per-memcg queue that is not node-aware.

Let me know if that works as is, or if you would like me to rephrase
that. I would like to keep it as simple as possible, but no simpler
than that ;)

> > 	          deferred_split_scan()
> > 	            walks memcg->split_queue
> 
> Ok so overall there's a per-memcg memcg->split_queue, but we set a bit in
> memcg->nodeinfo[nid]->shrinker_info->unit[blah]->map for it, and when we
> enter shrink_slab_memcg(), we figure out the shrink_control from the
> for_each_set_bit() across memcg->...->unit->map?
> 
> > The shrinker bit adds an imperfect guard rail. As soon as the cgroup
> > has a single large page on the node of interest, all large pages owned
> > by that memcg, including those on other nodes, will be split.
> 
> So at this point, regardless of node, we are invoking deferred_split_scan()
> and it's the same memcg->split_queue we are walking, regardless of node?
> 
> Do let me know if my understanding is correct here!
> 
> Hmm that does sound sub-optimal.

That is exactly what's happening. We have all this dance to be precise
about the shrinker bit, but the queue itself is not node-aware.

That makes for some pretty erratic behavior.

> > list_lru properly sets up per-node, per-cgroup lists. As a bonus, it
> > streamlines a lot of the list operations and reclaim walks. It's used
> > widely by other major shrinkers already. Convert the deferred split
> > queue as well.
> 
> It's odd that it wasn't used before?

The list_lru API was initially explicitly for *slab objects*, doing
virt_to_page() and mem_cgroup_from_slab_obj() on the passed in list
heads to derive the proper queue. It was only extended more recently
in 0a97c01cd20b ("list_lru: allow explicit memcg and NUMA node
selection") to allow passing in the queue routing information and
embedding the list heads in other things (such as folios).

The only missing piece now was the support for callside locking of the
list_lru structures.

> >  include/linux/huge_mm.h    |   6 +-
> >  include/linux/memcontrol.h |   4 -
> >  include/linux/mmzone.h     |  12 --
> >  mm/huge_memory.c           | 342 ++++++++++++-------------------------
> >  mm/internal.h              |   2 +-
> >  mm/khugepaged.c            |   7 +
> >  mm/memcontrol.c            |  12 +-
> >  mm/memory.c                |  52 +++---
> >  mm/mm_init.c               |  15 --
> >  9 files changed, 151 insertions(+), 301 deletions(-)
> >
> > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > index bd7f0e1d8094..8d801ed378db 100644
> > --- a/include/linux/huge_mm.h
> > +++ b/include/linux/huge_mm.h
> > @@ -414,10 +414,9 @@ static inline int split_huge_page(struct page *page)
> >  {
> >  	return split_huge_page_to_list_to_order(page, NULL, 0);
> >  }
> > +
> > +extern struct list_lru deferred_split_lru;
> 
> It might be nice for the sake of avoiding a global to instead expose this
> as a getter?
> 
> Or actually better, since every caller outside of huge_memory.c that
> references this uses folio_memcg_list_lru_alloc(), do something like:
> 
> int folio_memcg_alloc_deferred(struct folio *folio, gfp_t gfp);
> 
> in mm/huge_memory.c:
> 
> /**
>  * blah blah blah put on error blah
>  */
> int folio_memcg_alloc_deferred(struct folio *folio, gfp_t gfp)
> {
> 	int err;
> 
> 	err = folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfP);
> 	if (err) {
> 		folio_put(folio);
> 		return err;
> 	}
> 
> 	return 0;
> }
> 
> And then the callers can just invoke this, and you can make
> deferred_split_lru static in mm/huge_memory.c?

That sounds reasonable. Let me make this change.

> >  void deferred_split_folio(struct folio *folio, bool partially_mapped);
> > -#ifdef CONFIG_MEMCG
> > -void reparent_deferred_split_queue(struct mem_cgroup *memcg);
> > -#endif
> >
> >  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
> >  		unsigned long address, bool freeze);
> > @@ -650,7 +649,6 @@ static inline int try_folio_split_to_order(struct folio *folio,
> >  }
> >
> >  static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
> > -static inline void reparent_deferred_split_queue(struct mem_cgroup *memcg) {}
> >  #define split_huge_pmd(__vma, __pmd, __address)	\
> >  	do { } while (0)
> >
> > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> > index 086158969529..0782c72a1997 100644
> > --- a/include/linux/memcontrol.h
> > +++ b/include/linux/memcontrol.h
> > @@ -277,10 +277,6 @@ struct mem_cgroup {
> >  	struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT];
> >  #endif
> >
> > -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> > -	struct deferred_split deferred_split_queue;
> > -#endif
> > -
> >  #ifdef CONFIG_LRU_GEN_WALKS_MMU
> >  	/* per-memcg mm_struct list */
> >  	struct lru_gen_mm_list mm_list;
> > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> > index 7bd0134c241c..232b7a71fd69 100644
> > --- a/include/linux/mmzone.h
> > +++ b/include/linux/mmzone.h
> > @@ -1429,14 +1429,6 @@ struct zonelist {
> >   */
> >  extern struct page *mem_map;
> >
> > -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> > -struct deferred_split {
> > -	spinlock_t split_queue_lock;
> > -	struct list_head split_queue;
> > -	unsigned long split_queue_len;
> > -};
> > -#endif
> > -
> >  #ifdef CONFIG_MEMORY_FAILURE
> >  /*
> >   * Per NUMA node memory failure handling statistics.
> > @@ -1562,10 +1554,6 @@ typedef struct pglist_data {
> >  	unsigned long first_deferred_pfn;
> >  #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
> >
> > -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> > -	struct deferred_split deferred_split_queue;
> > -#endif
> > -
> >  #ifdef CONFIG_NUMA_BALANCING
> >  	/* start time in ms of current promote rate limit period */
> >  	unsigned int nbp_rl_start;
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 3fc02913b63e..e90d08db219d 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -14,6 +14,7 @@
> >  #include <linux/mmu_notifier.h>
> >  #include <linux/rmap.h>
> >  #include <linux/swap.h>
> > +#include <linux/list_lru.h>
> >  #include <linux/shrinker.h>
> >  #include <linux/mm_inline.h>
> >  #include <linux/swapops.h>
> > @@ -67,6 +68,8 @@ unsigned long transparent_hugepage_flags __read_mostly =
> >  	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
> >  	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
> >
> > +static struct lock_class_key deferred_split_key;
> > +struct list_lru deferred_split_lru;
> >  static struct shrinker *deferred_split_shrinker;
> >  static unsigned long deferred_split_count(struct shrinker *shrink,
> >  					  struct shrink_control *sc);
> > @@ -919,6 +922,13 @@ static int __init thp_shrinker_init(void)
> >  	if (!deferred_split_shrinker)
> >  		return -ENOMEM;
> >
> > +	if (list_lru_init_memcg_key(&deferred_split_lru,
> > +				    deferred_split_shrinker,
> > +				    &deferred_split_key)) {
> > +		shrinker_free(deferred_split_shrinker);
> > +		return -ENOMEM;
> > +	}
> > +
> 
> It's kind of out of scope for the series, but I hate that the huge zero
> folio stuff has an early exit for persistent but if not, falls through to
> trying to set that up - it'd be nice to split out the huge zero folio stuff
> into another function.
> 
> But probably one for a follow up!

It doesn't bother me personally, but I wouldn't object to it being
moved to its own function and have the caller do necessary cleanups.

But yeah sounds like an unrelated refactor :)

> >  	deferred_split_shrinker->count_objects = deferred_split_count;
> >  	deferred_split_shrinker->scan_objects = deferred_split_scan;
> >  	shrinker_register(deferred_split_shrinker);
> > @@ -939,6 +949,7 @@ static int __init thp_shrinker_init(void)
> >
> >  	huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero");
> >  	if (!huge_zero_folio_shrinker) {
> > +		list_lru_destroy(&deferred_split_lru);
> >  		shrinker_free(deferred_split_shrinker);
> 
> Presumably no probably-impossible-in-reality race on somebody entering the
> shrinker and referencing the deferred_split_lru before the shrinker is freed?

Ah right, I think for clarity it would indeed be better to destroy the
shrinker, then the queue. Let me re-order this one.

But yes, in practice, none of the above fails. If we have trouble
doing a couple of small kmallocs during a subsys_initcall(), that
machine is unlikely to finish booting, let alone allocate enough
memory to enter the THP shrinker.

> > @@ -953,6 +964,7 @@ static int __init thp_shrinker_init(void)
> >  static void __init thp_shrinker_exit(void)
> >  {
> >  	shrinker_free(huge_zero_folio_shrinker);
> > +	list_lru_destroy(&deferred_split_lru);
> 
> Same question above as to race/ordering.

... and this one as well.

> >  	shrinker_free(deferred_split_shrinker);
> >  }

> > @@ -3854,34 +3761,34 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
> >  	struct folio *end_folio = folio_next(folio);
> >  	struct folio *new_folio, *next;
> >  	int old_order = folio_order(folio);
> > +	struct list_lru_one *l;
> 
> Nit, and maybe this is a convention, but hate single letter variable names,
> 'lru' or something might be nicer?

Yeah I stuck with the list_lru internal naming, which uses `lru` for
the struct list_lru, and `l` for struct list_lru_one. I suppose that
was fine for the very domain-specific code and short functions in
there, but it's grating in large, general MM functions like these.

Since `lru` is taken, any preferences? llo?

> > +	bool dequeue_deferred;
> >  	int ret = 0;
> > -	struct deferred_split *ds_queue;
> >
> >  	VM_WARN_ON_ONCE(!mapping && end);
> >  	/* Prevent deferred_split_scan() touching ->_refcount */
> > -	ds_queue = folio_split_queue_lock(folio);
> > +	dequeue_deferred = folio_test_anon(folio) && old_order > 1;
> 
> Why anon? (This review is partly me learning about the shrinker, an area
> I'm weak on :)

This is the type of folios that can be queued through
deferred_split_folio(). The order check is inside that function
itself; and the function is only called for anon pages.

File pages are split differently, they don't use the deferred split
shrinker and we don't allocate list_lru heads for them. However, they
still come through this split path here, and so I need to gate whether
it's safe to do list_lru_lock(), __list_lru_del() etc.

> > +	if (dequeue_deferred) {
> > +		rcu_read_lock();
> > +		l = list_lru_lock(&deferred_split_lru,
> > +				  folio_nid(folio), folio_memcg(folio));
> 
> Hm don't adore this sort of almost 'hidden' RCU lock here, but this
> function is pretty disgusting and needs serious work in general.
> 
> And any function that took the RCU lock and list_lru lock/did the unlock
> equivalent would be equally horrible so yeah, I guess needs deferring to a
> refactor.
> 
> OTOH, this could be a good excuse for us to pay down some technical debt
> and split out for instance the folio_ref_freeze() bits?
> 
> Could we do something like:
> 
> 	bool frozen;
> 
> 	...
> 
> 	dequeue_deferred = folio_test_anon(folio) && old_order > 1;
> 	frozen = folio_ref_freeze(folio, folio_cache_ref_count(folio) + 1);
> 
> 	if (dequeue_deferred && frozen) {
> 		struct list_lru_one *lru;
> 
> 		rcu_read_lock();
> 		lru = list_lru_lock(&deferred_split_lru,
> 				    folio_nid(folio), folio_memcg(folio));
> 		__list_lru_del(&deferred_split_lru, lru,
> 			       &folio->_deferred_list, folio_nid(folio));
> 		if (folio_test_partially_mapped(folio)) {
> 			folio_clear_partially_mapped(folio);
> 			mod_mthp_stat(old_order,
> 				MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
> 		}
> 		list_lru_unlock(lru);
> 		rcu_read_unlock();
> 	}
> 
> 	if (!frozen)
> 		return -EAGAIN;
> 
> 	... rest of logic now one indent level lower ...
> 
> Or maybe factor that out into a helper function or something?
> 
> static void execute_deferred_dequeue(...) { ... }
> 
> With this implemented either way you'd be able to get rid of the else block
> too.
> 
> obviously only valid if you are able to do the freezing earlier?

The reason it locks the list_lru first is because there can be a race
between the shrinker and a synchronous split attempt.

This is what that comment above it is about:

	/* Prevent deferred_split_scan() touching ->_refcount */

If the shrinker fails to "get" the folio while holding the shrinker
lock, it thinks it beat the free path to the list_lru lock and will
feel responsible for cleaning up the deferred split state:

/* caller holds the list_lru lock already */
static enum lru_status deferred_split_isolate(struct list_head *item,
					      struct list_lru_one *lru,
					      void *cb_arg)
{
	struct folio *folio = container_of(item, struct folio, _deferred_list);
	struct list_head *freeable = cb_arg;

	if (folio_try_get(folio)) {
		list_lru_isolate_move(lru, item, freeable);
		return LRU_REMOVED;
	}

	/* We lost race with folio_put() */
	list_lru_isolate(lru, item);
	if (folio_test_partially_mapped(folio)) {
		folio_clear_partially_mapped(folio);
		mod_mthp_stat(folio_order(folio),
			      MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
	}
	return LRU_REMOVED;
}

So if the split path freezes before acquiring the list_lru lock, we
get them both stepping on the deferred split state.

This may or may not be safe given the current manifestation of
list_lru_del() and the folio_test_partially_mapped() tests. But it
feels pretty hairy to let them race and rely on individual tests into
what should be a coherent aggregate state.

> > +	}
> >  	if (folio_ref_freeze(folio, folio_cache_ref_count(folio) + 1)) {
> >  		struct swap_cluster_info *ci = NULL;
> >  		struct lruvec *lruvec;
> >
> > -		if (old_order > 1) {
> 
> Before was this also applicable to non-anon folios?

This branch, yes. But non-anon pages would then fail this:

> > -			if (!list_empty(&folio->_deferred_list)) {
> > -				ds_queue->split_queue_len--;
> > -				/*
> > -				 * Reinitialize page_deferred_list after removing the
> > -				 * page from the split_queue, otherwise a subsequent
> > -				 * split will see list corruption when checking the
> > -				 * page_deferred_list.
> > -				 */
> > -				list_del_init(&folio->_deferred_list);
> > -			}

..since they aren't ever added. This was okay because accessing that
list head is always safe. Now I need to be explicit to determine if
it's safe to call __list_lru_del() which does all these list_lru
references, which aren't allocated for file.

> > +		if (dequeue_deferred) {
> > +			__list_lru_del(&deferred_split_lru, l,
> > +				       &folio->_deferred_list, folio_nid(folio));
> >  			if (folio_test_partially_mapped(folio)) {
> >  				folio_clear_partially_mapped(folio);
> >  				mod_mthp_stat(old_order,
> >  					MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
> >  			}

And honestly, I think it's nicer to be explicit here about the
expectations of which pages get which treatment.

Filtering file pages on !list_empty() was not very obvious.

> > +	int nid = folio_nid(folio);
> >  	unsigned long flags;
> >  	bool unqueued = false;
> >
> >  	WARN_ON_ONCE(folio_ref_count(folio));
> >  	WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg_charged(folio));
> >
> > -	ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
> > -	if (!list_empty(&folio->_deferred_list)) {
> > -		ds_queue->split_queue_len--;
> > +	rcu_read_lock();
> > +	l = list_lru_lock_irqsave(&deferred_split_lru, nid, folio_memcg(folio), &flags);
> > +	if (__list_lru_del(&deferred_split_lru, l, &folio->_deferred_list, nid)) {
> 
> Maybe worth factoring __list_lru_del() into something that explicitly
> references &folio->_deferred_list rather than open codingin both places?

Hm, I wouldn't want to encode this into list_lru API, but we could do
a huge_memory.c-local helper?

folio_deferred_split_del(folio, l, nid)

> > @@ -4406,7 +4320,11 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
> >  	if (folio_test_swapcache(folio))
> >  		return;
> >
> > -	ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
> > +	nid = folio_nid(folio);
> > +
> > +	rcu_read_lock();
> > +	memcg = folio_memcg(folio);
> > +	l = list_lru_lock_irqsave(&deferred_split_lru, nid, memcg, &flags);
> 
> Do we really need to hold the lock over all the below, but hmm we can't do
> an irqsave/restore list_lru_add(), and maybe not worth adding one either,
> OK.
> 
> Just seems odd to have <lock> <__unlocked_add_variant> <unlock>,
> instinctively feels like it should be just <locked_add_variant>.

It protects that auxiliary folio_test_partially_mapped() state that
the shrinker also touches. LRU + that page flag + the counts is what
that lock protects.

> > @@ -4473,45 +4375,47 @@ static bool thp_underused(struct folio *folio)
> >  	return false;
> >  }
> >
> > +static enum lru_status deferred_split_isolate(struct list_head *item,
> > +					      struct list_lru_one *lru,
> > +					      void *cb_arg)
> > +{
> > +	struct folio *folio = container_of(item, struct folio, _deferred_list);
> > +	struct list_head *freeable = cb_arg;
> > +
> > +	if (folio_try_get(folio)) {
> > +		list_lru_isolate_move(lru, item, freeable);
> > +		return LRU_REMOVED;
> > +	}
> > +
> > +	/* We lost race with folio_put() */
> 
> Hmm, in the original code, this comment is associated with the partially
> mapped logic, BUT this seems actually correct, because folio_try_get()
> because it does folio_ref_add_unless_zero() only fails if the folio lost
> the race.
> 
> So I think you're more correct right?

I think the original placement was because that else if made it
awkward to place where it should be. But yes, the above is more
correct: the comment refers to what happens when the "get" fails.

> > -	ds_queue = split_queue_lock_irqsave(sc->nid, sc->memcg, &flags);
> > -	/* Take pin on all head pages to avoid freeing them under us */
> > -	list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
> > -							_deferred_list) {
> > -		if (folio_try_get(folio)) {
> > -			folio_batch_add(&fbatch, folio);
> > -		} else if (folio_test_partially_mapped(folio)) {
> > -			/* We lost race with folio_put() */
> > -			folio_clear_partially_mapped(folio);
> > -			mod_mthp_stat(folio_order(folio),
> > -				      MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
> > -		}

> > @@ -4534,64 +4438,32 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
> >  		}
> >  		folio_unlock(folio);
> >  next:
> > -		if (did_split || !folio_test_partially_mapped(folio))
> > -			continue;
> >  		/*
> >  		 * Only add back to the queue if folio is partially mapped.
> >  		 * If thp_underused returns false, or if split_folio fails
> >  		 * in the case it was underused, then consider it used and
> >  		 * don't add it back to split_queue.
> >  		 */
> > -		fqueue = folio_split_queue_lock_irqsave(folio, &flags);
> > -		if (list_empty(&folio->_deferred_list)) {
> > -			list_add_tail(&folio->_deferred_list, &fqueue->split_queue);
> > -			fqueue->split_queue_len++;
> > +		if (!did_split && folio_test_partially_mapped(folio)) {
> > +			rcu_read_lock();
> > +			l = list_lru_lock_irqsave(&deferred_split_lru,
> > +						  folio_nid(folio),
> > +						  folio_memcg(folio),
> > +						  &flags);
> > +			__list_lru_add(&deferred_split_lru, l,
> > +				       &folio->_deferred_list,
> > +				       folio_nid(folio), folio_memcg(folio));
> > +			list_lru_unlock_irqrestore(l, &flags);
> 
> Hmm this does make me think it'd be nice to have a list_lru_add() variant
> for irqsave/restore then, since it's a repeating pattern!

Yeah, this site calls for it the most :( I tried to balance callsite
prettiness with the need to extend the list_lru api; it's just one
caller. And the possible mutations and variants with these locks is
seemingly endless once you open that can of worms...

Case in point: this is process context and we could use
spin_lock_irq() here. I'm just using list_lru_lock_irqsave() because
that's the common variant used by the add and del paths already.

If I went with a helper, I could do list_lru_add_irq().

I think it would actually nicely mirror the list_lru_shrink_walk_irq()
a few lines up.

> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -1081,6 +1081,7 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
> >  	}
> >
> >  	count_vm_event(THP_COLLAPSE_ALLOC);
> > +
> >  	if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
> >  		folio_put(folio);
> >  		*foliop = NULL;
> > @@ -1089,6 +1090,12 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
> >
> >  	count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1);
> 
> Do we want to put this stat counter underneath the below so it's not
> incremented on fail?

We currently increment it also when the cgroup charge fails, which is
common. The list_lru allocation (a smaller kmalloc) in turn should
basically never fail - we just managed to get a PMD-sized folio.

Executive summary being: the way we bump them currently is weird. I
tried to stay out of it to keep this series on track ;)

> > +	if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {
> > +		folio_put(folio);
> > +		*foliop = NULL;
> > +		return SCAN_CGROUP_CHARGE_FAIL;
> 
> Do we not need to uncharge here?

folio_put() does that. We could do it for symmetry, but there is no
need. And the hard and fast rule is that folio_memcg() is immutable
until the put hits 0, so it would look weird/unexpected.

> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index a47fb68dd65f..f381cb6bdff1 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -4015,11 +4015,6 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
> >  	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
> >  		memcg->cgwb_frn[i].done =
> >  			__WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
> > -#endif
> > -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> > -	spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
> > -	INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
> > -	memcg->deferred_split_queue.split_queue_len = 0;
> >  #endif
> >  	lru_gen_init_memcg(memcg);
> >  	return memcg;
> > @@ -4167,11 +4162,10 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
> >  	zswap_memcg_offline_cleanup(memcg);
> >
> >  	memcg_offline_kmem(memcg);
> > -	reparent_deferred_split_queue(memcg);
> >  	/*
> > -	 * The reparenting of objcg must be after the reparenting of the
> > -	 * list_lru and deferred_split_queue above, which ensures that they will
> > -	 * not mistakenly get the parent list_lru and deferred_split_queue.
> > +	 * The reparenting of objcg must be after the reparenting of
> > +	 * the list_lru in memcg_offline_kmem(), which ensures that
> > +	 * they will not mistakenly get the parent list_lru.
> >  	 */
> >  	memcg_reparent_objcgs(memcg);
> >  	reparent_shrinker_deferred(memcg);
> > diff --git a/mm/memory.c b/mm/memory.c
> > index 219b9bf6cae0..e68ceb4aa624 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -4651,13 +4651,19 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
> >  	while (orders) {
> >  		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
> >  		folio = vma_alloc_folio(gfp, order, vma, addr);
> > -		if (folio) {
> > -			if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
> > -							    gfp, entry))
> > -				return folio;
> > +		if (!folio)
> > +			goto next;
> > +		if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, gfp, entry)) {
> >  			count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
> >  			folio_put(folio);
> > +			goto next;
> >  		}
> > +		if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {
> 
> Do we need to uncharge here?

Same here. It's folio state that the put cleans up.

> > +			folio_put(folio);
> > +			goto fallback;
> > +		}
> > +		return folio;
> > +next:
> >  		count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK);
> >  		order = next_order(&orders, order);
> >  	}
> > @@ -5169,24 +5175,28 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
> >  	while (orders) {
> >  		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
> >  		folio = vma_alloc_folio(gfp, order, vma, addr);
> > -		if (folio) {
> > -			if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
> > -				count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> > -				folio_put(folio);
> > -				goto next;
> > -			}
> > -			folio_throttle_swaprate(folio, gfp);
> > -			/*
> > -			 * When a folio is not zeroed during allocation
> > -			 * (__GFP_ZERO not used) or user folios require special
> > -			 * handling, folio_zero_user() is used to make sure
> > -			 * that the page corresponding to the faulting address
> > -			 * will be hot in the cache after zeroing.
> > -			 */
> > -			if (user_alloc_needs_zeroing())
> > -				folio_zero_user(folio, vmf->address);
> > -			return folio;
> > +		if (!folio)
> > +			goto next;
> 
> This applies to the above equivalent refactorings, but maybe worth
> separating out this:
> 
> if (folio) { ... big branch ... } ->
> if (!folio) goto next; ... what was big branch ...
> 
> Refactoring into a separate patch? Makes it easier to see pertinent logic
> changes + helps review/bisectability/fixes/etc. etc.

Will do.

> > +		if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
> > +			count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> > +			folio_put(folio);
> > +			goto next;
> >  		}
> > +		if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {
> 
> Again, do we need to uncharge here?

folio_put() does it :)

> > +			folio_put(folio);
> > +			goto fallback;
> > +		}
> > +		folio_throttle_swaprate(folio, gfp);
> > +		/*
> > +		 * When a folio is not zeroed during allocation
> > +		 * (__GFP_ZERO not used) or user folios require special
> > +		 * handling, folio_zero_user() is used to make sure
> > +		 * that the page corresponding to the faulting address
> > +		 * will be hot in the cache after zeroing.
> > +		 */
> > +		if (user_alloc_needs_zeroing())
> > +			folio_zero_user(folio, vmf->address);
> > +		return folio;
> >  next:
> >  		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
> >  		order = next_order(&orders, order);
> > diff --git a/mm/mm_init.c b/mm/mm_init.c
> > index cec7bb758bdd..f293a62e652a 100644
> > --- a/mm/mm_init.c
> > +++ b/mm/mm_init.c
> > @@ -1388,19 +1388,6 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat,
> >  	pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
> >  }
> >
> > -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> > -static void pgdat_init_split_queue(struct pglist_data *pgdat)
> > -{
> > -	struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
> > -
> > -	spin_lock_init(&ds_queue->split_queue_lock);
> > -	INIT_LIST_HEAD(&ds_queue->split_queue);
> > -	ds_queue->split_queue_len = 0;
> > -}
> > -#else
> > -static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
> > -#endif
> > -
> >  #ifdef CONFIG_COMPACTION
> >  static void pgdat_init_kcompactd(struct pglist_data *pgdat)
> >  {
> > @@ -1416,8 +1403,6 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
> >
> >  	pgdat_resize_init(pgdat);
> >  	pgdat_kswapd_lock_init(pgdat);
> > -
> > -	pgdat_init_split_queue(pgdat);
> >  	pgdat_init_kcompactd(pgdat);
> >
> >  	init_waitqueue_head(&pgdat->kswapd_wait);
> > --
> > 2.53.0
> >
> 
> Overall a lovely amount of code deletion here :)
> 
> Thanks for doing this, Cheers Lorenzo

Thanks for your feedback!
Re: [PATCH v3 7/7] mm: switch deferred split shrinker to list_lru
Posted by Lorenzo Stoakes (Oracle) 4 days, 23 hours ago
On Mon, Mar 30, 2026 at 12:40:22PM -0400, Johannes Weiner wrote:
> Hey Lorenzo,
>
> Thanks for taking an in-depth look at this!
>
> Apologies for the late reply, I was traveling last week.
>
> On Tue, Mar 24, 2026 at 01:48:06PM +0000, Lorenzo Stoakes (Oracle) wrote:
> > On Wed, Mar 18, 2026 at 03:53:25PM -0400, Johannes Weiner wrote:
> > > The deferred split queue handles cgroups in a suboptimal fashion. The
> > > queue is per-NUMA node or per-cgroup, not the intersection. That means
> > > on a cgrouped system, a node-restricted allocation entering reclaim
> > > can end up splitting large pages on other nodes:
> > >
> > > 	alloc/unmap
> > > 	  deferred_split_folio()
> > > 	    list_add_tail(memcg->split_queue)
> > > 	    set_shrinker_bit(memcg, node, deferred_shrinker_id)
> >
> > So here it's:
> >
> > __do_huge_pmd_anonymous_page() / do_huge_zero_wp_pmd()
> > -> map_anon_folio_pmd_pf()
> > -> map_anon_folio_pmd_nopf()
> > -> deferred_split_folio()
> > -> set_shrinker_bit()
> >
> > Yeah it makes sense to make the first bit succinct anyway :)
>
> Yep I was trying to keep the focus on those parts that interact with
> the other stacks listed below. Not biblically accurate
> deferred_split_folio() callsites ;)
>
> > > 	for_each_zone_zonelist_nodemask(restricted_nodes)
> > > 	  mem_cgroup_iter()
> > > 	    shrink_slab(node, memcg)
> > > 	      shrink_slab_memcg(node, memcg)
> > > 	        if test_shrinker_bit(memcg, node, deferred_shrinker_id)
> >
> > Hmm there's no such function - is this kind of pseudocode, essentially?
> >
> > Wouldn't it be clearer to reference:
> >
> > shrink_slab_memcg() -> do_shrink_slab() -> shrinker->scan_objects ->
> > deferred_split_scan()?
> >
> > Though I get that it adds verbosity :)
> >
> > Sorry not (just) being pedantic here, also so I can understand myself :)
>
> Yes, it's pseudocode. Direct reclaim (shrink_zones) and kswapd
> (wake_all_kswapds) do the "for all nodes" loops. Nested within them
> are the loops that iterate the cgroup tree (shrink_node_memcgs,
> shrink_many, depending on LRU vs MGLRU). And then nested within
> *those* we have the shrink_slab() calls.
>
> Again, I tried to keep the boiler plate low and to the point. The most
> important thing is: 1) the shrinker bit gets set for each node, memcg
> pair that has objects, 2) shrinker visits all nodes x memcgs with it
> set 3) but the shrinker scans happen on per-memcg queue, not node-aware.
>
> Let me know if that works as is, or if you would like me to rephrase
> that. I would like to keep it as simple as possible, but no simpler
> than that ;)

Yeah that's fine, maybe just add a little note to say 'simplified for clarity'
or something so very-literal nutters like me don't get confused ;)

>
> > > 	          deferred_split_scan()
> > > 	            walks memcg->split_queue
> >
> > Ok so overall there's a per-memcg memcg->split_queue, but we set a bit in
> > memcg->nodeinfo[nid]->shrinker_info->unit[blah]->map for it, and when we
> > enter shrink_slab_memcg(), we figure out the shrink_control from the
> > for_each_set_bit() across memcg->...->unit->map?
> >
> > > The shrinker bit adds an imperfect guard rail. As soon as the cgroup
> > > has a single large page on the node of interest, all large pages owned
> > > by that memcg, including those on other nodes, will be split.
> >
> > So at this point, regardless of node, we are invoking deferred_split_scan()
> > and it's the same memcg->split_queue we are walking, regardless of node?
> >
> > Do let me know if my understanding is correct here!
> >
> > Hmm that does sound sub-optimal.
>
> That is exactly what's happening. We have all this dance to be precise
> about the shrinker bit, but the queue itself is not node-aware.
>
> That makes for some pretty erratic behavior.

Yeah that sucks!

>
> > > list_lru properly sets up per-node, per-cgroup lists. As a bonus, it
> > > streamlines a lot of the list operations and reclaim walks. It's used
> > > widely by other major shrinkers already. Convert the deferred split
> > > queue as well.
> >
> > It's odd that it wasn't used before?
>
> The list_lru API was initially explicitly for *slab objects*, doing
> virt_to_page() and mem_cgroup_from_slab_obj() on the passed in list
> heads to derive the proper queue. It was only extended more recently
> in 0a97c01cd20b ("list_lru: allow explicit memcg and NUMA node
> selection") to allow passing in the queue routing information and
> embedding the list heads in other things (such as folios).
>
> The only missing piece now was the support for callside locking of the
> list_lru structures.

Ack

>
> > >  include/linux/huge_mm.h    |   6 +-
> > >  include/linux/memcontrol.h |   4 -
> > >  include/linux/mmzone.h     |  12 --
> > >  mm/huge_memory.c           | 342 ++++++++++++-------------------------
> > >  mm/internal.h              |   2 +-
> > >  mm/khugepaged.c            |   7 +
> > >  mm/memcontrol.c            |  12 +-
> > >  mm/memory.c                |  52 +++---
> > >  mm/mm_init.c               |  15 --
> > >  9 files changed, 151 insertions(+), 301 deletions(-)
> > >
> > > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > > index bd7f0e1d8094..8d801ed378db 100644
> > > --- a/include/linux/huge_mm.h
> > > +++ b/include/linux/huge_mm.h
> > > @@ -414,10 +414,9 @@ static inline int split_huge_page(struct page *page)
> > >  {
> > >  	return split_huge_page_to_list_to_order(page, NULL, 0);
> > >  }
> > > +
> > > +extern struct list_lru deferred_split_lru;
> >
> > It might be nice for the sake of avoiding a global to instead expose this
> > as a getter?
> >
> > Or actually better, since every caller outside of huge_memory.c that
> > references this uses folio_memcg_list_lru_alloc(), do something like:
> >
> > int folio_memcg_alloc_deferred(struct folio *folio, gfp_t gfp);
> >
> > in mm/huge_memory.c:
> >
> > /**
> >  * blah blah blah put on error blah
> >  */
> > int folio_memcg_alloc_deferred(struct folio *folio, gfp_t gfp)
> > {
> > 	int err;
> >
> > 	err = folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfP);
> > 	if (err) {
> > 		folio_put(folio);
> > 		return err;
> > 	}
> >
> > 	return 0;
> > }
> >
> > And then the callers can just invoke this, and you can make
> > deferred_split_lru static in mm/huge_memory.c?
>
> That sounds reasonable. Let me make this change.

Thanks!

>
> > >  void deferred_split_folio(struct folio *folio, bool partially_mapped);
> > > -#ifdef CONFIG_MEMCG
> > > -void reparent_deferred_split_queue(struct mem_cgroup *memcg);
> > > -#endif
> > >
> > >  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
> > >  		unsigned long address, bool freeze);
> > > @@ -650,7 +649,6 @@ static inline int try_folio_split_to_order(struct folio *folio,
> > >  }
> > >
> > >  static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
> > > -static inline void reparent_deferred_split_queue(struct mem_cgroup *memcg) {}
> > >  #define split_huge_pmd(__vma, __pmd, __address)	\
> > >  	do { } while (0)
> > >
> > > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> > > index 086158969529..0782c72a1997 100644
> > > --- a/include/linux/memcontrol.h
> > > +++ b/include/linux/memcontrol.h
> > > @@ -277,10 +277,6 @@ struct mem_cgroup {
> > >  	struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT];
> > >  #endif
> > >
> > > -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> > > -	struct deferred_split deferred_split_queue;
> > > -#endif
> > > -
> > >  #ifdef CONFIG_LRU_GEN_WALKS_MMU
> > >  	/* per-memcg mm_struct list */
> > >  	struct lru_gen_mm_list mm_list;
> > > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> > > index 7bd0134c241c..232b7a71fd69 100644
> > > --- a/include/linux/mmzone.h
> > > +++ b/include/linux/mmzone.h
> > > @@ -1429,14 +1429,6 @@ struct zonelist {
> > >   */
> > >  extern struct page *mem_map;
> > >
> > > -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> > > -struct deferred_split {
> > > -	spinlock_t split_queue_lock;
> > > -	struct list_head split_queue;
> > > -	unsigned long split_queue_len;
> > > -};
> > > -#endif
> > > -
> > >  #ifdef CONFIG_MEMORY_FAILURE
> > >  /*
> > >   * Per NUMA node memory failure handling statistics.
> > > @@ -1562,10 +1554,6 @@ typedef struct pglist_data {
> > >  	unsigned long first_deferred_pfn;
> > >  #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
> > >
> > > -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> > > -	struct deferred_split deferred_split_queue;
> > > -#endif
> > > -
> > >  #ifdef CONFIG_NUMA_BALANCING
> > >  	/* start time in ms of current promote rate limit period */
> > >  	unsigned int nbp_rl_start;
> > > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > > index 3fc02913b63e..e90d08db219d 100644
> > > --- a/mm/huge_memory.c
> > > +++ b/mm/huge_memory.c
> > > @@ -14,6 +14,7 @@
> > >  #include <linux/mmu_notifier.h>
> > >  #include <linux/rmap.h>
> > >  #include <linux/swap.h>
> > > +#include <linux/list_lru.h>
> > >  #include <linux/shrinker.h>
> > >  #include <linux/mm_inline.h>
> > >  #include <linux/swapops.h>
> > > @@ -67,6 +68,8 @@ unsigned long transparent_hugepage_flags __read_mostly =
> > >  	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
> > >  	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
> > >
> > > +static struct lock_class_key deferred_split_key;
> > > +struct list_lru deferred_split_lru;
> > >  static struct shrinker *deferred_split_shrinker;
> > >  static unsigned long deferred_split_count(struct shrinker *shrink,
> > >  					  struct shrink_control *sc);
> > > @@ -919,6 +922,13 @@ static int __init thp_shrinker_init(void)
> > >  	if (!deferred_split_shrinker)
> > >  		return -ENOMEM;
> > >
> > > +	if (list_lru_init_memcg_key(&deferred_split_lru,
> > > +				    deferred_split_shrinker,
> > > +				    &deferred_split_key)) {
> > > +		shrinker_free(deferred_split_shrinker);
> > > +		return -ENOMEM;
> > > +	}
> > > +
> >
> > It's kind of out of scope for the series, but I hate that the huge zero
> > folio stuff has an early exit for persistent but if not, falls through to
> > trying to set that up - it'd be nice to split out the huge zero folio stuff
> > into another function.
> >
> > But probably one for a follow up!
>
> It doesn't bother me personally, but I wouldn't object to it being
> moved to its own function and have the caller do necessary cleanups.
>
> But yeah sounds like an unrelated refactor :)

Yeah it is entirely unrelated for sure, can be an optional follow up!

>
> > >  	deferred_split_shrinker->count_objects = deferred_split_count;
> > >  	deferred_split_shrinker->scan_objects = deferred_split_scan;
> > >  	shrinker_register(deferred_split_shrinker);
> > > @@ -939,6 +949,7 @@ static int __init thp_shrinker_init(void)
> > >
> > >  	huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero");
> > >  	if (!huge_zero_folio_shrinker) {
> > > +		list_lru_destroy(&deferred_split_lru);
> > >  		shrinker_free(deferred_split_shrinker);
> >
> > Presumably no probably-impossible-in-reality race on somebody entering the
> > shrinker and referencing the deferred_split_lru before the shrinker is freed?
>
> Ah right, I think for clarity it would indeed be better to destroy the
> shrinker, then the queue. Let me re-order this one.
>
> But yes, in practice, none of the above fails. If we have trouble
> doing a couple of small kmallocs during a subsys_initcall(), that
> machine is unlikely to finish booting, let alone allocate enough
> memory to enter the THP shrinker.

Yeah I thought that might be the case, but seems more logical killing shrinker
first, thanks!

>
> > > @@ -953,6 +964,7 @@ static int __init thp_shrinker_init(void)
> > >  static void __init thp_shrinker_exit(void)
> > >  {
> > >  	shrinker_free(huge_zero_folio_shrinker);
> > > +	list_lru_destroy(&deferred_split_lru);
> >
> > Same question above as to race/ordering.
>
> ... and this one as well.
>
> > >  	shrinker_free(deferred_split_shrinker);
> > >  }
>
> > > @@ -3854,34 +3761,34 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
> > >  	struct folio *end_folio = folio_next(folio);
> > >  	struct folio *new_folio, *next;
> > >  	int old_order = folio_order(folio);
> > > +	struct list_lru_one *l;
> >
> > Nit, and maybe this is a convention, but hate single letter variable names,
> > 'lru' or something might be nicer?
>
> Yeah I stuck with the list_lru internal naming, which uses `lru` for
> the struct list_lru, and `l` for struct list_lru_one. I suppose that
> was fine for the very domain-specific code and short functions in
> there, but it's grating in large, general MM functions like these.
>
> Since `lru` is taken, any preferences? llo?

ljs? ;)

Could be list?

>
> > > +	bool dequeue_deferred;
> > >  	int ret = 0;
> > > -	struct deferred_split *ds_queue;
> > >
> > >  	VM_WARN_ON_ONCE(!mapping && end);
> > >  	/* Prevent deferred_split_scan() touching ->_refcount */
> > > -	ds_queue = folio_split_queue_lock(folio);
> > > +	dequeue_deferred = folio_test_anon(folio) && old_order > 1;
> >
> > Why anon? (This review is partly me learning about the shrinker, an area
> > I'm weak on :)
>
> This is the type of folios that can be queued through
> deferred_split_folio(). The order check is inside that function
> itself; and the function is only called for anon pages.
>
> File pages are split differently, they don't use the deferred split
> shrinker and we don't allocate list_lru heads for them. However, they
> still come through this split path here, and so I need to gate whether
> it's safe to do list_lru_lock(), __list_lru_del() etc.
>
> > > +	if (dequeue_deferred) {
> > > +		rcu_read_lock();
> > > +		l = list_lru_lock(&deferred_split_lru,
> > > +				  folio_nid(folio), folio_memcg(folio));
> >
> > Hm don't adore this sort of almost 'hidden' RCU lock here, but this
> > function is pretty disgusting and needs serious work in general.
> >
> > And any function that took the RCU lock and list_lru lock/did the unlock
> > equivalent would be equally horrible so yeah, I guess needs deferring to a
> > refactor.
> >
> > OTOH, this could be a good excuse for us to pay down some technical debt
> > and split out for instance the folio_ref_freeze() bits?
> >
> > Could we do something like:
> >
> > 	bool frozen;
> >
> > 	...
> >
> > 	dequeue_deferred = folio_test_anon(folio) && old_order > 1;
> > 	frozen = folio_ref_freeze(folio, folio_cache_ref_count(folio) + 1);
> >
> > 	if (dequeue_deferred && frozen) {
> > 		struct list_lru_one *lru;
> >
> > 		rcu_read_lock();
> > 		lru = list_lru_lock(&deferred_split_lru,
> > 				    folio_nid(folio), folio_memcg(folio));
> > 		__list_lru_del(&deferred_split_lru, lru,
> > 			       &folio->_deferred_list, folio_nid(folio));
> > 		if (folio_test_partially_mapped(folio)) {
> > 			folio_clear_partially_mapped(folio);
> > 			mod_mthp_stat(old_order,
> > 				MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
> > 		}
> > 		list_lru_unlock(lru);
> > 		rcu_read_unlock();
> > 	}
> >
> > 	if (!frozen)
> > 		return -EAGAIN;
> >
> > 	... rest of logic now one indent level lower ...
> >
> > Or maybe factor that out into a helper function or something?
> >
> > static void execute_deferred_dequeue(...) { ... }
> >
> > With this implemented either way you'd be able to get rid of the else block
> > too.
> >
> > obviously only valid if you are able to do the freezing earlier?
>
> The reason it locks the list_lru first is because there can be a race
> between the shrinker and a synchronous split attempt.

Ahh!

>
> This is what that comment above it is about:
>
> 	/* Prevent deferred_split_scan() touching ->_refcount */

I _may_ have glossed over this :)

>
> If the shrinker fails to "get" the folio while holding the shrinker
> lock, it thinks it beat the free path to the list_lru lock and will
> feel responsible for cleaning up the deferred split state:
>
> /* caller holds the list_lru lock already */
> static enum lru_status deferred_split_isolate(struct list_head *item,
> 					      struct list_lru_one *lru,
> 					      void *cb_arg)
> {
> 	struct folio *folio = container_of(item, struct folio, _deferred_list);
> 	struct list_head *freeable = cb_arg;
>
> 	if (folio_try_get(folio)) {
> 		list_lru_isolate_move(lru, item, freeable);
> 		return LRU_REMOVED;
> 	}
>
> 	/* We lost race with folio_put() */
> 	list_lru_isolate(lru, item);
> 	if (folio_test_partially_mapped(folio)) {
> 		folio_clear_partially_mapped(folio);
> 		mod_mthp_stat(folio_order(folio),
> 			      MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
> 	}
> 	return LRU_REMOVED;
> }
>
> So if the split path freezes before acquiring the list_lru lock, we
> get them both stepping on the deferred split state.

So we're sort of intentionally doing a lock inversion on the other path (give me
some rope here, I mean broadly speaking :) to avoid this?

>
> This may or may not be safe given the current manifestation of
> list_lru_del() and the folio_test_partially_mapped() tests. But it
> feels pretty hairy to let them race and rely on individual tests into
> what should be a coherent aggregate state.

Yeah sure best not.

But, and I _know_ it's nitty sorry, but maybe worth expanding that comment to
explain that e.g. 'we must take the folio lock prior to the list_lru lock to
avoid racing with deferred_split_scan() in accessing the folio reference count'
or similar?

>
> > > +	}
> > >  	if (folio_ref_freeze(folio, folio_cache_ref_count(folio) + 1)) {
> > >  		struct swap_cluster_info *ci = NULL;
> > >  		struct lruvec *lruvec;
> > >
> > > -		if (old_order > 1) {
> >
> > Before was this also applicable to non-anon folios?
>
> This branch, yes. But non-anon pages would then fail this:
>
> > > -			if (!list_empty(&folio->_deferred_list)) {
> > > -				ds_queue->split_queue_len--;
> > > -				/*
> > > -				 * Reinitialize page_deferred_list after removing the
> > > -				 * page from the split_queue, otherwise a subsequent
> > > -				 * split will see list corruption when checking the
> > > -				 * page_deferred_list.
> > > -				 */
> > > -				list_del_init(&folio->_deferred_list);
> > > -			}
>
> ..since they aren't ever added. This was okay because accessing that
> list head is always safe. Now I need to be explicit to determine if
> it's safe to call __list_lru_del() which does all these list_lru
> references, which aren't allocated for file.
>
> > > +		if (dequeue_deferred) {
> > > +			__list_lru_del(&deferred_split_lru, l,
> > > +				       &folio->_deferred_list, folio_nid(folio));
> > >  			if (folio_test_partially_mapped(folio)) {
> > >  				folio_clear_partially_mapped(folio);
> > >  				mod_mthp_stat(old_order,
> > >  					MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
> > >  			}
>
> And honestly, I think it's nicer to be explicit here about the
> expectations of which pages get which treatment.

Yeah for sure.

>
> Filtering file pages on !list_empty() was not very obvious.

Yes :)

>
> > > +	int nid = folio_nid(folio);
> > >  	unsigned long flags;
> > >  	bool unqueued = false;
> > >
> > >  	WARN_ON_ONCE(folio_ref_count(folio));
> > >  	WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg_charged(folio));
> > >
> > > -	ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
> > > -	if (!list_empty(&folio->_deferred_list)) {
> > > -		ds_queue->split_queue_len--;
> > > +	rcu_read_lock();
> > > +	l = list_lru_lock_irqsave(&deferred_split_lru, nid, folio_memcg(folio), &flags);
> > > +	if (__list_lru_del(&deferred_split_lru, l, &folio->_deferred_list, nid)) {
> >
> > Maybe worth factoring __list_lru_del() into something that explicitly
> > references &folio->_deferred_list rather than open codingin both places?
>
> Hm, I wouldn't want to encode this into list_lru API, but we could do
> a huge_memory.c-local helper?
>
> folio_deferred_split_del(folio, l, nid)

Well, I kind of hate how we're using the global deferred_split_lru all over the
place, so a helper would be preferable but one that also could be used for
khugepaged.c and memory.c also?

>
> > > @@ -4406,7 +4320,11 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
> > >  	if (folio_test_swapcache(folio))
> > >  		return;
> > >
> > > -	ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
> > > +	nid = folio_nid(folio);
> > > +
> > > +	rcu_read_lock();
> > > +	memcg = folio_memcg(folio);
> > > +	l = list_lru_lock_irqsave(&deferred_split_lru, nid, memcg, &flags);
> >
> > Do we really need to hold the lock over all the below, but hmm we can't do
> > an irqsave/restore list_lru_add(), and maybe not worth adding one either,
> > OK.
> >
> > Just seems odd to have <lock> <__unlocked_add_variant> <unlock>,
> > instinctively feels like it should be just <locked_add_variant>.
>
> It protects that auxiliary folio_test_partially_mapped() state that
> the shrinker also touches. LRU + that page flag + the counts is what
> that lock protects.

Ack thanks

>
> > > @@ -4473,45 +4375,47 @@ static bool thp_underused(struct folio *folio)
> > >  	return false;
> > >  }
> > >
> > > +static enum lru_status deferred_split_isolate(struct list_head *item,
> > > +					      struct list_lru_one *lru,
> > > +					      void *cb_arg)
> > > +{
> > > +	struct folio *folio = container_of(item, struct folio, _deferred_list);
> > > +	struct list_head *freeable = cb_arg;
> > > +
> > > +	if (folio_try_get(folio)) {
> > > +		list_lru_isolate_move(lru, item, freeable);
> > > +		return LRU_REMOVED;
> > > +	}
> > > +
> > > +	/* We lost race with folio_put() */
> >
> > Hmm, in the original code, this comment is associated with the partially
> > mapped logic, BUT this seems actually correct, because folio_try_get()
> > because it does folio_ref_add_unless_zero() only fails if the folio lost
> > the race.
> >
> > So I think you're more correct right?
>
> I think the original placement was because that else if made it
> awkward to place where it should be. But yes, the above is more
> correct: the comment refers to what happens when the "get" fails.

Right yeah

>
> > > -	ds_queue = split_queue_lock_irqsave(sc->nid, sc->memcg, &flags);
> > > -	/* Take pin on all head pages to avoid freeing them under us */
> > > -	list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
> > > -							_deferred_list) {
> > > -		if (folio_try_get(folio)) {
> > > -			folio_batch_add(&fbatch, folio);
> > > -		} else if (folio_test_partially_mapped(folio)) {
> > > -			/* We lost race with folio_put() */
> > > -			folio_clear_partially_mapped(folio);
> > > -			mod_mthp_stat(folio_order(folio),
> > > -				      MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
> > > -		}
>
> > > @@ -4534,64 +4438,32 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
> > >  		}
> > >  		folio_unlock(folio);
> > >  next:
> > > -		if (did_split || !folio_test_partially_mapped(folio))
> > > -			continue;
> > >  		/*
> > >  		 * Only add back to the queue if folio is partially mapped.
> > >  		 * If thp_underused returns false, or if split_folio fails
> > >  		 * in the case it was underused, then consider it used and
> > >  		 * don't add it back to split_queue.
> > >  		 */
> > > -		fqueue = folio_split_queue_lock_irqsave(folio, &flags);
> > > -		if (list_empty(&folio->_deferred_list)) {
> > > -			list_add_tail(&folio->_deferred_list, &fqueue->split_queue);
> > > -			fqueue->split_queue_len++;
> > > +		if (!did_split && folio_test_partially_mapped(folio)) {
> > > +			rcu_read_lock();
> > > +			l = list_lru_lock_irqsave(&deferred_split_lru,
> > > +						  folio_nid(folio),
> > > +						  folio_memcg(folio),
> > > +						  &flags);
> > > +			__list_lru_add(&deferred_split_lru, l,
> > > +				       &folio->_deferred_list,
> > > +				       folio_nid(folio), folio_memcg(folio));
> > > +			list_lru_unlock_irqrestore(l, &flags);
> >
> > Hmm this does make me think it'd be nice to have a list_lru_add() variant
> > for irqsave/restore then, since it's a repeating pattern!
>
> Yeah, this site calls for it the most :( I tried to balance callsite
> prettiness with the need to extend the list_lru api; it's just one
> caller. And the possible mutations and variants with these locks is
> seemingly endless once you open that can of worms...

True...

>
> Case in point: this is process context and we could use
> spin_lock_irq() here. I'm just using list_lru_lock_irqsave() because
> that's the common variant used by the add and del paths already.
>
> If I went with a helper, I could do list_lru_add_irq().
>
> I think it would actually nicely mirror the list_lru_shrink_walk_irq()
> a few lines up.

Yeah, I mean I'm pretty sure this repeats quite a few times so is worthy of a
helper.

>
> > > --- a/mm/khugepaged.c
> > > +++ b/mm/khugepaged.c
> > > @@ -1081,6 +1081,7 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
> > >  	}
> > >
> > >  	count_vm_event(THP_COLLAPSE_ALLOC);
> > > +
> > >  	if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
> > >  		folio_put(folio);
> > >  		*foliop = NULL;
> > > @@ -1089,6 +1090,12 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
> > >
> > >  	count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1);
> >
> > Do we want to put this stat counter underneath the below so it's not
> > incremented on fail?
>
> We currently increment it also when the cgroup charge fails, which is
> common. The list_lru allocation (a smaller kmalloc) in turn should
> basically never fail - we just managed to get a PMD-sized folio.
>
> Executive summary being: the way we bump them currently is weird. I
> tried to stay out of it to keep this series on track ;)

Ack ok fair enough :)

>
> > > +	if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {
> > > +		folio_put(folio);
> > > +		*foliop = NULL;
> > > +		return SCAN_CGROUP_CHARGE_FAIL;
> >
> > Do we not need to uncharge here?
>
> folio_put() does that. We could do it for symmetry, but there is no
> need. And the hard and fast rule is that folio_memcg() is immutable
> until the put hits 0, so it would look weird/unexpected.

So it does!

void __folio_put(struct folio *folio)
{
	...
	mem_cgroup_uncharge(folio);
	...
}

I didn't _think_ it would but I guess I didn't realise how much work
__folio_put() was doing :)

>
> > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > index a47fb68dd65f..f381cb6bdff1 100644
> > > --- a/mm/memcontrol.c
> > > +++ b/mm/memcontrol.c
> > > @@ -4015,11 +4015,6 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
> > >  	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
> > >  		memcg->cgwb_frn[i].done =
> > >  			__WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
> > > -#endif
> > > -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> > > -	spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
> > > -	INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
> > > -	memcg->deferred_split_queue.split_queue_len = 0;
> > >  #endif
> > >  	lru_gen_init_memcg(memcg);
> > >  	return memcg;
> > > @@ -4167,11 +4162,10 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
> > >  	zswap_memcg_offline_cleanup(memcg);
> > >
> > >  	memcg_offline_kmem(memcg);
> > > -	reparent_deferred_split_queue(memcg);
> > >  	/*
> > > -	 * The reparenting of objcg must be after the reparenting of the
> > > -	 * list_lru and deferred_split_queue above, which ensures that they will
> > > -	 * not mistakenly get the parent list_lru and deferred_split_queue.
> > > +	 * The reparenting of objcg must be after the reparenting of
> > > +	 * the list_lru in memcg_offline_kmem(), which ensures that
> > > +	 * they will not mistakenly get the parent list_lru.
> > >  	 */
> > >  	memcg_reparent_objcgs(memcg);
> > >  	reparent_shrinker_deferred(memcg);
> > > diff --git a/mm/memory.c b/mm/memory.c
> > > index 219b9bf6cae0..e68ceb4aa624 100644
> > > --- a/mm/memory.c
> > > +++ b/mm/memory.c
> > > @@ -4651,13 +4651,19 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
> > >  	while (orders) {
> > >  		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
> > >  		folio = vma_alloc_folio(gfp, order, vma, addr);
> > > -		if (folio) {
> > > -			if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
> > > -							    gfp, entry))
> > > -				return folio;
> > > +		if (!folio)
> > > +			goto next;
> > > +		if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, gfp, entry)) {
> > >  			count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
> > >  			folio_put(folio);
> > > +			goto next;
> > >  		}
> > > +		if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {
> >
> > Do we need to uncharge here?
>
> Same here. It's folio state that the put cleans up.

Ack, indeed!

>
> > > +			folio_put(folio);
> > > +			goto fallback;
> > > +		}
> > > +		return folio;
> > > +next:
> > >  		count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK);
> > >  		order = next_order(&orders, order);
> > >  	}
> > > @@ -5169,24 +5175,28 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
> > >  	while (orders) {
> > >  		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
> > >  		folio = vma_alloc_folio(gfp, order, vma, addr);
> > > -		if (folio) {
> > > -			if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
> > > -				count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> > > -				folio_put(folio);
> > > -				goto next;
> > > -			}
> > > -			folio_throttle_swaprate(folio, gfp);
> > > -			/*
> > > -			 * When a folio is not zeroed during allocation
> > > -			 * (__GFP_ZERO not used) or user folios require special
> > > -			 * handling, folio_zero_user() is used to make sure
> > > -			 * that the page corresponding to the faulting address
> > > -			 * will be hot in the cache after zeroing.
> > > -			 */
> > > -			if (user_alloc_needs_zeroing())
> > > -				folio_zero_user(folio, vmf->address);
> > > -			return folio;
> > > +		if (!folio)
> > > +			goto next;
> >
> > This applies to the above equivalent refactorings, but maybe worth
> > separating out this:
> >
> > if (folio) { ... big branch ... } ->
> > if (!folio) goto next; ... what was big branch ...
> >
> > Refactoring into a separate patch? Makes it easier to see pertinent logic
> > changes + helps review/bisectability/fixes/etc. etc.
>
> Will do.

Thanks!

>
> > > +		if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
> > > +			count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> > > +			folio_put(folio);
> > > +			goto next;
> > >  		}
> > > +		if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {
> >
> > Again, do we need to uncharge here?
>
> folio_put() does it :)

Yeah, I should have just checked __folio_put() :>)

<Insert quote about good programmers, lazy, hubris etc. here :P>

>
> > > +			folio_put(folio);
> > > +			goto fallback;
> > > +		}
> > > +		folio_throttle_swaprate(folio, gfp);
> > > +		/*
> > > +		 * When a folio is not zeroed during allocation
> > > +		 * (__GFP_ZERO not used) or user folios require special
> > > +		 * handling, folio_zero_user() is used to make sure
> > > +		 * that the page corresponding to the faulting address
> > > +		 * will be hot in the cache after zeroing.
> > > +		 */
> > > +		if (user_alloc_needs_zeroing())
> > > +			folio_zero_user(folio, vmf->address);
> > > +		return folio;
> > >  next:
> > >  		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
> > >  		order = next_order(&orders, order);
> > > diff --git a/mm/mm_init.c b/mm/mm_init.c
> > > index cec7bb758bdd..f293a62e652a 100644
> > > --- a/mm/mm_init.c
> > > +++ b/mm/mm_init.c
> > > @@ -1388,19 +1388,6 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat,
> > >  	pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
> > >  }
> > >
> > > -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> > > -static void pgdat_init_split_queue(struct pglist_data *pgdat)
> > > -{
> > > -	struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
> > > -
> > > -	spin_lock_init(&ds_queue->split_queue_lock);
> > > -	INIT_LIST_HEAD(&ds_queue->split_queue);
> > > -	ds_queue->split_queue_len = 0;
> > > -}
> > > -#else
> > > -static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
> > > -#endif
> > > -
> > >  #ifdef CONFIG_COMPACTION
> > >  static void pgdat_init_kcompactd(struct pglist_data *pgdat)
> > >  {
> > > @@ -1416,8 +1403,6 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
> > >
> > >  	pgdat_resize_init(pgdat);
> > >  	pgdat_kswapd_lock_init(pgdat);
> > > -
> > > -	pgdat_init_split_queue(pgdat);
> > >  	pgdat_init_kcompactd(pgdat);
> > >
> > >  	init_waitqueue_head(&pgdat->kswapd_wait);
> > > --
> > > 2.53.0
> > >
> >
> > Overall a lovely amount of code deletion here :)
> >
> > Thanks for doing this, Cheers Lorenzo
>
> Thanks for your feedback!

No worries, thanks for the series :)

Cheers, Lorenzo
Re: [PATCH v3 7/7] mm: switch deferred split shrinker to list_lru
Posted by Shakeel Butt 2 weeks, 4 days ago
On Wed, Mar 18, 2026 at 03:53:25PM -0400, Johannes Weiner wrote:
> The deferred split queue handles cgroups in a suboptimal fashion. The
> queue is per-NUMA node or per-cgroup, not the intersection. That means
> on a cgrouped system, a node-restricted allocation entering reclaim
> can end up splitting large pages on other nodes:
> 
> 	alloc/unmap
> 	  deferred_split_folio()
> 	    list_add_tail(memcg->split_queue)
> 	    set_shrinker_bit(memcg, node, deferred_shrinker_id)
> 
> 	for_each_zone_zonelist_nodemask(restricted_nodes)
> 	  mem_cgroup_iter()
> 	    shrink_slab(node, memcg)
> 	      shrink_slab_memcg(node, memcg)
> 	        if test_shrinker_bit(memcg, node, deferred_shrinker_id)
> 	          deferred_split_scan()
> 	            walks memcg->split_queue
> 
> The shrinker bit adds an imperfect guard rail. As soon as the cgroup
> has a single large page on the node of interest, all large pages owned
> by that memcg, including those on other nodes, will be split.
> 
> list_lru properly sets up per-node, per-cgroup lists. As a bonus, it
> streamlines a lot of the list operations and reclaim walks. It's used
> widely by other major shrinkers already. Convert the deferred split
> queue as well.
> 
> The list_lru per-memcg heads are instantiated on demand when the first
> object of interest is allocated for a cgroup, by calling
> folio_memcg_list_lru_alloc(). Add calls to where splittable pages are
> created: anon faults, swapin faults, khugepaged collapse.
> 
> These calls create all possible node heads for the cgroup at once, so
> the migration code (between nodes) doesn't need any special care.
> 
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>

Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Re: [PATCH v3 7/7] mm: switch deferred split shrinker to list_lru
Posted by David Hildenbrand (Arm) 2 weeks, 4 days ago
On 3/18/26 20:53, Johannes Weiner wrote:
> The deferred split queue handles cgroups in a suboptimal fashion. The
> queue is per-NUMA node or per-cgroup, not the intersection. That means
> on a cgrouped system, a node-restricted allocation entering reclaim
> can end up splitting large pages on other nodes:
> 
> 	alloc/unmap
> 	  deferred_split_folio()
> 	    list_add_tail(memcg->split_queue)
> 	    set_shrinker_bit(memcg, node, deferred_shrinker_id)
> 
> 	for_each_zone_zonelist_nodemask(restricted_nodes)
> 	  mem_cgroup_iter()
> 	    shrink_slab(node, memcg)
> 	      shrink_slab_memcg(node, memcg)
> 	        if test_shrinker_bit(memcg, node, deferred_shrinker_id)
> 	          deferred_split_scan()
> 	            walks memcg->split_queue
> 
> The shrinker bit adds an imperfect guard rail. As soon as the cgroup
> has a single large page on the node of interest, all large pages owned
> by that memcg, including those on other nodes, will be split.
> 
> list_lru properly sets up per-node, per-cgroup lists. As a bonus, it
> streamlines a lot of the list operations and reclaim walks. It's used
> widely by other major shrinkers already. Convert the deferred split
> queue as well.
> 
> The list_lru per-memcg heads are instantiated on demand when the first
> object of interest is allocated for a cgroup, by calling
> folio_memcg_list_lru_alloc(). Add calls to where splittable pages are
> created: anon faults, swapin faults, khugepaged collapse.
> 
> These calls create all possible node heads for the cgroup at once, so
> the migration code (between nodes) doesn't need any special care.
> 
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> ---

Was just hitting sent on a reply on v2 when I spotted this in my inbox.

-- 
Cheers,

David