[PATCH v3 7/7] mm: switch deferred split shrinker to list_lru

Johannes Weiner posted 7 patches 2 weeks, 4 days ago
[PATCH v3 7/7] mm: switch deferred split shrinker to list_lru
Posted by Johannes Weiner 2 weeks, 4 days ago
The deferred split queue handles cgroups in a suboptimal fashion. The
queue is per-NUMA node or per-cgroup, not the intersection. That means
on a cgrouped system, a node-restricted allocation entering reclaim
can end up splitting large pages on other nodes:

	alloc/unmap
	  deferred_split_folio()
	    list_add_tail(memcg->split_queue)
	    set_shrinker_bit(memcg, node, deferred_shrinker_id)

	for_each_zone_zonelist_nodemask(restricted_nodes)
	  mem_cgroup_iter()
	    shrink_slab(node, memcg)
	      shrink_slab_memcg(node, memcg)
	        if test_shrinker_bit(memcg, node, deferred_shrinker_id)
	          deferred_split_scan()
	            walks memcg->split_queue

The shrinker bit adds an imperfect guard rail. As soon as the cgroup
has a single large page on the node of interest, all large pages owned
by that memcg, including those on other nodes, will be split.

list_lru properly sets up per-node, per-cgroup lists. As a bonus, it
streamlines a lot of the list operations and reclaim walks. It's used
widely by other major shrinkers already. Convert the deferred split
queue as well.

The list_lru per-memcg heads are instantiated on demand when the first
object of interest is allocated for a cgroup, by calling
folio_memcg_list_lru_alloc(). Add calls to where splittable pages are
created: anon faults, swapin faults, khugepaged collapse.

These calls create all possible node heads for the cgroup at once, so
the migration code (between nodes) doesn't need any special care.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 include/linux/huge_mm.h    |   6 +-
 include/linux/memcontrol.h |   4 -
 include/linux/mmzone.h     |  12 --
 mm/huge_memory.c           | 342 ++++++++++++-------------------------
 mm/internal.h              |   2 +-
 mm/khugepaged.c            |   7 +
 mm/memcontrol.c            |  12 +-
 mm/memory.c                |  52 +++---
 mm/mm_init.c               |  15 --
 9 files changed, 151 insertions(+), 301 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index bd7f0e1d8094..8d801ed378db 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -414,10 +414,9 @@ static inline int split_huge_page(struct page *page)
 {
 	return split_huge_page_to_list_to_order(page, NULL, 0);
 }
+
+extern struct list_lru deferred_split_lru;
 void deferred_split_folio(struct folio *folio, bool partially_mapped);
-#ifdef CONFIG_MEMCG
-void reparent_deferred_split_queue(struct mem_cgroup *memcg);
-#endif
 
 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long address, bool freeze);
@@ -650,7 +649,6 @@ static inline int try_folio_split_to_order(struct folio *folio,
 }
 
 static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
-static inline void reparent_deferred_split_queue(struct mem_cgroup *memcg) {}
 #define split_huge_pmd(__vma, __pmd, __address)	\
 	do { } while (0)
 
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 086158969529..0782c72a1997 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -277,10 +277,6 @@ struct mem_cgroup {
 	struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT];
 #endif
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	struct deferred_split deferred_split_queue;
-#endif
-
 #ifdef CONFIG_LRU_GEN_WALKS_MMU
 	/* per-memcg mm_struct list */
 	struct lru_gen_mm_list mm_list;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 7bd0134c241c..232b7a71fd69 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1429,14 +1429,6 @@ struct zonelist {
  */
 extern struct page *mem_map;
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-struct deferred_split {
-	spinlock_t split_queue_lock;
-	struct list_head split_queue;
-	unsigned long split_queue_len;
-};
-#endif
-
 #ifdef CONFIG_MEMORY_FAILURE
 /*
  * Per NUMA node memory failure handling statistics.
@@ -1562,10 +1554,6 @@ typedef struct pglist_data {
 	unsigned long first_deferred_pfn;
 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	struct deferred_split deferred_split_queue;
-#endif
-
 #ifdef CONFIG_NUMA_BALANCING
 	/* start time in ms of current promote rate limit period */
 	unsigned int nbp_rl_start;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3fc02913b63e..e90d08db219d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -14,6 +14,7 @@
 #include <linux/mmu_notifier.h>
 #include <linux/rmap.h>
 #include <linux/swap.h>
+#include <linux/list_lru.h>
 #include <linux/shrinker.h>
 #include <linux/mm_inline.h>
 #include <linux/swapops.h>
@@ -67,6 +68,8 @@ unsigned long transparent_hugepage_flags __read_mostly =
 	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
 	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
 
+static struct lock_class_key deferred_split_key;
+struct list_lru deferred_split_lru;
 static struct shrinker *deferred_split_shrinker;
 static unsigned long deferred_split_count(struct shrinker *shrink,
 					  struct shrink_control *sc);
@@ -919,6 +922,13 @@ static int __init thp_shrinker_init(void)
 	if (!deferred_split_shrinker)
 		return -ENOMEM;
 
+	if (list_lru_init_memcg_key(&deferred_split_lru,
+				    deferred_split_shrinker,
+				    &deferred_split_key)) {
+		shrinker_free(deferred_split_shrinker);
+		return -ENOMEM;
+	}
+
 	deferred_split_shrinker->count_objects = deferred_split_count;
 	deferred_split_shrinker->scan_objects = deferred_split_scan;
 	shrinker_register(deferred_split_shrinker);
@@ -939,6 +949,7 @@ static int __init thp_shrinker_init(void)
 
 	huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero");
 	if (!huge_zero_folio_shrinker) {
+		list_lru_destroy(&deferred_split_lru);
 		shrinker_free(deferred_split_shrinker);
 		return -ENOMEM;
 	}
@@ -953,6 +964,7 @@ static int __init thp_shrinker_init(void)
 static void __init thp_shrinker_exit(void)
 {
 	shrinker_free(huge_zero_folio_shrinker);
+	list_lru_destroy(&deferred_split_lru);
 	shrinker_free(deferred_split_shrinker);
 }
 
@@ -1133,119 +1145,6 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
 	return pmd;
 }
 
-static struct deferred_split *split_queue_node(int nid)
-{
-	struct pglist_data *pgdata = NODE_DATA(nid);
-
-	return &pgdata->deferred_split_queue;
-}
-
-#ifdef CONFIG_MEMCG
-static inline
-struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
-					   struct deferred_split *queue)
-{
-	if (mem_cgroup_disabled())
-		return NULL;
-	if (split_queue_node(folio_nid(folio)) == queue)
-		return NULL;
-	return container_of(queue, struct mem_cgroup, deferred_split_queue);
-}
-
-static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg)
-{
-	return memcg ? &memcg->deferred_split_queue : split_queue_node(nid);
-}
-#else
-static inline
-struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
-					   struct deferred_split *queue)
-{
-	return NULL;
-}
-
-static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg)
-{
-	return split_queue_node(nid);
-}
-#endif
-
-static struct deferred_split *split_queue_lock(int nid, struct mem_cgroup *memcg)
-{
-	struct deferred_split *queue;
-
-retry:
-	queue = memcg_split_queue(nid, memcg);
-	spin_lock(&queue->split_queue_lock);
-	/*
-	 * There is a period between setting memcg to dying and reparenting
-	 * deferred split queue, and during this period the THPs in the deferred
-	 * split queue will be hidden from the shrinker side.
-	 */
-	if (unlikely(memcg_is_dying(memcg))) {
-		spin_unlock(&queue->split_queue_lock);
-		memcg = parent_mem_cgroup(memcg);
-		goto retry;
-	}
-
-	return queue;
-}
-
-static struct deferred_split *
-split_queue_lock_irqsave(int nid, struct mem_cgroup *memcg, unsigned long *flags)
-{
-	struct deferred_split *queue;
-
-retry:
-	queue = memcg_split_queue(nid, memcg);
-	spin_lock_irqsave(&queue->split_queue_lock, *flags);
-	if (unlikely(memcg_is_dying(memcg))) {
-		spin_unlock_irqrestore(&queue->split_queue_lock, *flags);
-		memcg = parent_mem_cgroup(memcg);
-		goto retry;
-	}
-
-	return queue;
-}
-
-static struct deferred_split *folio_split_queue_lock(struct folio *folio)
-{
-	struct deferred_split *queue;
-
-	rcu_read_lock();
-	queue = split_queue_lock(folio_nid(folio), folio_memcg(folio));
-	/*
-	 * The memcg destruction path is acquiring the split queue lock for
-	 * reparenting. Once you have it locked, it's safe to drop the rcu lock.
-	 */
-	rcu_read_unlock();
-
-	return queue;
-}
-
-static struct deferred_split *
-folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags)
-{
-	struct deferred_split *queue;
-
-	rcu_read_lock();
-	queue = split_queue_lock_irqsave(folio_nid(folio), folio_memcg(folio), flags);
-	rcu_read_unlock();
-
-	return queue;
-}
-
-static inline void split_queue_unlock(struct deferred_split *queue)
-{
-	spin_unlock(&queue->split_queue_lock);
-}
-
-static inline void split_queue_unlock_irqrestore(struct deferred_split *queue,
-						 unsigned long flags)
-{
-	spin_unlock_irqrestore(&queue->split_queue_lock, flags);
-}
-
 static inline bool is_transparent_hugepage(const struct folio *folio)
 {
 	if (!folio_test_large(folio))
@@ -1346,6 +1245,14 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
 		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
 		return NULL;
 	}
+
+	if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {
+		folio_put(folio);
+		count_vm_event(THP_FAULT_FALLBACK);
+		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
+		return NULL;
+	}
+
 	folio_throttle_swaprate(folio, gfp);
 
        /*
@@ -3854,34 +3761,34 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
 	struct folio *end_folio = folio_next(folio);
 	struct folio *new_folio, *next;
 	int old_order = folio_order(folio);
+	struct list_lru_one *l;
+	bool dequeue_deferred;
 	int ret = 0;
-	struct deferred_split *ds_queue;
 
 	VM_WARN_ON_ONCE(!mapping && end);
 	/* Prevent deferred_split_scan() touching ->_refcount */
-	ds_queue = folio_split_queue_lock(folio);
+	dequeue_deferred = folio_test_anon(folio) && old_order > 1;
+	if (dequeue_deferred) {
+		rcu_read_lock();
+		l = list_lru_lock(&deferred_split_lru,
+				  folio_nid(folio), folio_memcg(folio));
+	}
 	if (folio_ref_freeze(folio, folio_cache_ref_count(folio) + 1)) {
 		struct swap_cluster_info *ci = NULL;
 		struct lruvec *lruvec;
 
-		if (old_order > 1) {
-			if (!list_empty(&folio->_deferred_list)) {
-				ds_queue->split_queue_len--;
-				/*
-				 * Reinitialize page_deferred_list after removing the
-				 * page from the split_queue, otherwise a subsequent
-				 * split will see list corruption when checking the
-				 * page_deferred_list.
-				 */
-				list_del_init(&folio->_deferred_list);
-			}
+		if (dequeue_deferred) {
+			__list_lru_del(&deferred_split_lru, l,
+				       &folio->_deferred_list, folio_nid(folio));
 			if (folio_test_partially_mapped(folio)) {
 				folio_clear_partially_mapped(folio);
 				mod_mthp_stat(old_order,
 					MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
 			}
+			list_lru_unlock(l);
+			rcu_read_unlock();
 		}
-		split_queue_unlock(ds_queue);
+
 		if (mapping) {
 			int nr = folio_nr_pages(folio);
 
@@ -3982,7 +3889,10 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
 		if (ci)
 			swap_cluster_unlock(ci);
 	} else {
-		split_queue_unlock(ds_queue);
+		if (dequeue_deferred) {
+			list_lru_unlock(l);
+			rcu_read_unlock();
+		}
 		return -EAGAIN;
 	}
 
@@ -4349,33 +4259,35 @@ int split_folio_to_list(struct folio *folio, struct list_head *list)
  * queueing THP splits, and that list is (racily observed to be) non-empty.
  *
  * It is unsafe to call folio_unqueue_deferred_split() until folio refcount is
- * zero: because even when split_queue_lock is held, a non-empty _deferred_list
- * might be in use on deferred_split_scan()'s unlocked on-stack list.
+ * zero: because even when the list_lru lock is held, a non-empty
+ * _deferred_list might be in use on deferred_split_scan()'s unlocked
+ * on-stack list.
  *
- * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is
- * therefore important to unqueue deferred split before changing folio memcg.
+ * The list_lru sublist is determined by folio's memcg: it is therefore
+ * important to unqueue deferred split before changing folio memcg.
  */
 bool __folio_unqueue_deferred_split(struct folio *folio)
 {
-	struct deferred_split *ds_queue;
+	struct list_lru_one *l;
+	int nid = folio_nid(folio);
 	unsigned long flags;
 	bool unqueued = false;
 
 	WARN_ON_ONCE(folio_ref_count(folio));
 	WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg_charged(folio));
 
-	ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
-	if (!list_empty(&folio->_deferred_list)) {
-		ds_queue->split_queue_len--;
+	rcu_read_lock();
+	l = list_lru_lock_irqsave(&deferred_split_lru, nid, folio_memcg(folio), &flags);
+	if (__list_lru_del(&deferred_split_lru, l, &folio->_deferred_list, nid)) {
 		if (folio_test_partially_mapped(folio)) {
 			folio_clear_partially_mapped(folio);
 			mod_mthp_stat(folio_order(folio),
 				      MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
 		}
-		list_del_init(&folio->_deferred_list);
 		unqueued = true;
 	}
-	split_queue_unlock_irqrestore(ds_queue, flags);
+	list_lru_unlock_irqrestore(l, &flags);
+	rcu_read_unlock();
 
 	return unqueued;	/* useful for debug warnings */
 }
@@ -4383,7 +4295,9 @@ bool __folio_unqueue_deferred_split(struct folio *folio)
 /* partially_mapped=false won't clear PG_partially_mapped folio flag */
 void deferred_split_folio(struct folio *folio, bool partially_mapped)
 {
-	struct deferred_split *ds_queue;
+	struct list_lru_one *l;
+	int nid;
+	struct mem_cgroup *memcg;
 	unsigned long flags;
 
 	/*
@@ -4406,7 +4320,11 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
 	if (folio_test_swapcache(folio))
 		return;
 
-	ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
+	nid = folio_nid(folio);
+
+	rcu_read_lock();
+	memcg = folio_memcg(folio);
+	l = list_lru_lock_irqsave(&deferred_split_lru, nid, memcg, &flags);
 	if (partially_mapped) {
 		if (!folio_test_partially_mapped(folio)) {
 			folio_set_partially_mapped(folio);
@@ -4414,36 +4332,20 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
 				count_vm_event(THP_DEFERRED_SPLIT_PAGE);
 			count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
 			mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, 1);
-
 		}
 	} else {
 		/* partially mapped folios cannot become non-partially mapped */
 		VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
 	}
-	if (list_empty(&folio->_deferred_list)) {
-		struct mem_cgroup *memcg;
-
-		memcg = folio_split_queue_memcg(folio, ds_queue);
-		list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
-		ds_queue->split_queue_len++;
-		if (memcg)
-			set_shrinker_bit(memcg, folio_nid(folio),
-					 shrinker_id(deferred_split_shrinker));
-	}
-	split_queue_unlock_irqrestore(ds_queue, flags);
+	__list_lru_add(&deferred_split_lru, l, &folio->_deferred_list, nid, memcg);
+	list_lru_unlock_irqrestore(l, &flags);
+	rcu_read_unlock();
 }
 
 static unsigned long deferred_split_count(struct shrinker *shrink,
 		struct shrink_control *sc)
 {
-	struct pglist_data *pgdata = NODE_DATA(sc->nid);
-	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
-
-#ifdef CONFIG_MEMCG
-	if (sc->memcg)
-		ds_queue = &sc->memcg->deferred_split_queue;
-#endif
-	return READ_ONCE(ds_queue->split_queue_len);
+	return list_lru_shrink_count(&deferred_split_lru, sc);
 }
 
 static bool thp_underused(struct folio *folio)
@@ -4473,45 +4375,47 @@ static bool thp_underused(struct folio *folio)
 	return false;
 }
 
+static enum lru_status deferred_split_isolate(struct list_head *item,
+					      struct list_lru_one *lru,
+					      void *cb_arg)
+{
+	struct folio *folio = container_of(item, struct folio, _deferred_list);
+	struct list_head *freeable = cb_arg;
+
+	if (folio_try_get(folio)) {
+		list_lru_isolate_move(lru, item, freeable);
+		return LRU_REMOVED;
+	}
+
+	/* We lost race with folio_put() */
+	list_lru_isolate(lru, item);
+	if (folio_test_partially_mapped(folio)) {
+		folio_clear_partially_mapped(folio);
+		mod_mthp_stat(folio_order(folio),
+			      MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
+	}
+	return LRU_REMOVED;
+}
+
 static unsigned long deferred_split_scan(struct shrinker *shrink,
 		struct shrink_control *sc)
 {
-	struct deferred_split *ds_queue;
-	unsigned long flags;
+	LIST_HEAD(dispose);
 	struct folio *folio, *next;
-	int split = 0, i;
-	struct folio_batch fbatch;
+	int split = 0;
+	unsigned long isolated;
 
-	folio_batch_init(&fbatch);
+	isolated = list_lru_shrink_walk_irq(&deferred_split_lru, sc,
+					    deferred_split_isolate, &dispose);
 
-retry:
-	ds_queue = split_queue_lock_irqsave(sc->nid, sc->memcg, &flags);
-	/* Take pin on all head pages to avoid freeing them under us */
-	list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
-							_deferred_list) {
-		if (folio_try_get(folio)) {
-			folio_batch_add(&fbatch, folio);
-		} else if (folio_test_partially_mapped(folio)) {
-			/* We lost race with folio_put() */
-			folio_clear_partially_mapped(folio);
-			mod_mthp_stat(folio_order(folio),
-				      MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
-		}
-		list_del_init(&folio->_deferred_list);
-		ds_queue->split_queue_len--;
-		if (!--sc->nr_to_scan)
-			break;
-		if (!folio_batch_space(&fbatch))
-			break;
-	}
-	split_queue_unlock_irqrestore(ds_queue, flags);
-
-	for (i = 0; i < folio_batch_count(&fbatch); i++) {
+	list_for_each_entry_safe(folio, next, &dispose, _deferred_list) {
 		bool did_split = false;
 		bool underused = false;
-		struct deferred_split *fqueue;
+		struct list_lru_one *l;
+		unsigned long flags;
+
+		list_del_init(&folio->_deferred_list);
 
-		folio = fbatch.folios[i];
 		if (!folio_test_partially_mapped(folio)) {
 			/*
 			 * See try_to_map_unused_to_zeropage(): we cannot
@@ -4534,64 +4438,32 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
 		}
 		folio_unlock(folio);
 next:
-		if (did_split || !folio_test_partially_mapped(folio))
-			continue;
 		/*
 		 * Only add back to the queue if folio is partially mapped.
 		 * If thp_underused returns false, or if split_folio fails
 		 * in the case it was underused, then consider it used and
 		 * don't add it back to split_queue.
 		 */
-		fqueue = folio_split_queue_lock_irqsave(folio, &flags);
-		if (list_empty(&folio->_deferred_list)) {
-			list_add_tail(&folio->_deferred_list, &fqueue->split_queue);
-			fqueue->split_queue_len++;
+		if (!did_split && folio_test_partially_mapped(folio)) {
+			rcu_read_lock();
+			l = list_lru_lock_irqsave(&deferred_split_lru,
+						  folio_nid(folio),
+						  folio_memcg(folio),
+						  &flags);
+			__list_lru_add(&deferred_split_lru, l,
+				       &folio->_deferred_list,
+				       folio_nid(folio), folio_memcg(folio));
+			list_lru_unlock_irqrestore(l, &flags);
+			rcu_read_unlock();
 		}
-		split_queue_unlock_irqrestore(fqueue, flags);
-	}
-	folios_put(&fbatch);
-
-	if (sc->nr_to_scan && !list_empty(&ds_queue->split_queue)) {
-		cond_resched();
-		goto retry;
+		folio_put(folio);
 	}
 
-	/*
-	 * Stop shrinker if we didn't split any page, but the queue is empty.
-	 * This can happen if pages were freed under us.
-	 */
-	if (!split && list_empty(&ds_queue->split_queue))
+	if (!split && !isolated)
 		return SHRINK_STOP;
 	return split;
 }
 
-#ifdef CONFIG_MEMCG
-void reparent_deferred_split_queue(struct mem_cgroup *memcg)
-{
-	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
-	struct deferred_split *ds_queue = &memcg->deferred_split_queue;
-	struct deferred_split *parent_ds_queue = &parent->deferred_split_queue;
-	int nid;
-
-	spin_lock_irq(&ds_queue->split_queue_lock);
-	spin_lock_nested(&parent_ds_queue->split_queue_lock, SINGLE_DEPTH_NESTING);
-
-	if (!ds_queue->split_queue_len)
-		goto unlock;
-
-	list_splice_tail_init(&ds_queue->split_queue, &parent_ds_queue->split_queue);
-	parent_ds_queue->split_queue_len += ds_queue->split_queue_len;
-	ds_queue->split_queue_len = 0;
-
-	for_each_node(nid)
-		set_shrinker_bit(parent, nid, shrinker_id(deferred_split_shrinker));
-
-unlock:
-	spin_unlock(&parent_ds_queue->split_queue_lock);
-	spin_unlock_irq(&ds_queue->split_queue_lock);
-}
-#endif
-
 #ifdef CONFIG_DEBUG_FS
 static void split_huge_pages_all(void)
 {
diff --git a/mm/internal.h b/mm/internal.h
index f98f4746ac41..d8c737338df5 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -863,7 +863,7 @@ static inline bool folio_unqueue_deferred_split(struct folio *folio)
 	/*
 	 * At this point, there is no one trying to add the folio to
 	 * deferred_list. If folio is not in deferred_list, it's safe
-	 * to check without acquiring the split_queue_lock.
+	 * to check without acquiring the list_lru lock.
 	 */
 	if (data_race(list_empty(&folio->_deferred_list)))
 		return false;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 4b0e59c7c0e6..b2ac28ddd480 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1081,6 +1081,7 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
 	}
 
 	count_vm_event(THP_COLLAPSE_ALLOC);
+
 	if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
 		folio_put(folio);
 		*foliop = NULL;
@@ -1089,6 +1090,12 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
 
 	count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1);
 
+	if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {
+		folio_put(folio);
+		*foliop = NULL;
+		return SCAN_CGROUP_CHARGE_FAIL;
+	}
+
 	*foliop = folio;
 	return SCAN_SUCCEED;
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a47fb68dd65f..f381cb6bdff1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4015,11 +4015,6 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
 	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
 		memcg->cgwb_frn[i].done =
 			__WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
-#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
-	INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
-	memcg->deferred_split_queue.split_queue_len = 0;
 #endif
 	lru_gen_init_memcg(memcg);
 	return memcg;
@@ -4167,11 +4162,10 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 	zswap_memcg_offline_cleanup(memcg);
 
 	memcg_offline_kmem(memcg);
-	reparent_deferred_split_queue(memcg);
 	/*
-	 * The reparenting of objcg must be after the reparenting of the
-	 * list_lru and deferred_split_queue above, which ensures that they will
-	 * not mistakenly get the parent list_lru and deferred_split_queue.
+	 * The reparenting of objcg must be after the reparenting of
+	 * the list_lru in memcg_offline_kmem(), which ensures that
+	 * they will not mistakenly get the parent list_lru.
 	 */
 	memcg_reparent_objcgs(memcg);
 	reparent_shrinker_deferred(memcg);
diff --git a/mm/memory.c b/mm/memory.c
index 219b9bf6cae0..e68ceb4aa624 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4651,13 +4651,19 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
 	while (orders) {
 		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
 		folio = vma_alloc_folio(gfp, order, vma, addr);
-		if (folio) {
-			if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
-							    gfp, entry))
-				return folio;
+		if (!folio)
+			goto next;
+		if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, gfp, entry)) {
 			count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
 			folio_put(folio);
+			goto next;
 		}
+		if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {
+			folio_put(folio);
+			goto fallback;
+		}
+		return folio;
+next:
 		count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK);
 		order = next_order(&orders, order);
 	}
@@ -5169,24 +5175,28 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
 	while (orders) {
 		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
 		folio = vma_alloc_folio(gfp, order, vma, addr);
-		if (folio) {
-			if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
-				count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
-				folio_put(folio);
-				goto next;
-			}
-			folio_throttle_swaprate(folio, gfp);
-			/*
-			 * When a folio is not zeroed during allocation
-			 * (__GFP_ZERO not used) or user folios require special
-			 * handling, folio_zero_user() is used to make sure
-			 * that the page corresponding to the faulting address
-			 * will be hot in the cache after zeroing.
-			 */
-			if (user_alloc_needs_zeroing())
-				folio_zero_user(folio, vmf->address);
-			return folio;
+		if (!folio)
+			goto next;
+		if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
+			count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
+			folio_put(folio);
+			goto next;
 		}
+		if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {
+			folio_put(folio);
+			goto fallback;
+		}
+		folio_throttle_swaprate(folio, gfp);
+		/*
+		 * When a folio is not zeroed during allocation
+		 * (__GFP_ZERO not used) or user folios require special
+		 * handling, folio_zero_user() is used to make sure
+		 * that the page corresponding to the faulting address
+		 * will be hot in the cache after zeroing.
+		 */
+		if (user_alloc_needs_zeroing())
+			folio_zero_user(folio, vmf->address);
+		return folio;
 next:
 		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
 		order = next_order(&orders, order);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index cec7bb758bdd..f293a62e652a 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1388,19 +1388,6 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat,
 	pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
 }
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void pgdat_init_split_queue(struct pglist_data *pgdat)
-{
-	struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
-
-	spin_lock_init(&ds_queue->split_queue_lock);
-	INIT_LIST_HEAD(&ds_queue->split_queue);
-	ds_queue->split_queue_len = 0;
-}
-#else
-static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
-#endif
-
 #ifdef CONFIG_COMPACTION
 static void pgdat_init_kcompactd(struct pglist_data *pgdat)
 {
@@ -1416,8 +1403,6 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
 
 	pgdat_resize_init(pgdat);
 	pgdat_kswapd_lock_init(pgdat);
-
-	pgdat_init_split_queue(pgdat);
 	pgdat_init_kcompactd(pgdat);
 
 	init_waitqueue_head(&pgdat->kswapd_wait);
-- 
2.53.0
Re: [PATCH v3 7/7] mm: switch deferred split shrinker to list_lru - [s390] panic in __memcg_list_lru_alloc
Posted by Mikhail Zaslonko 1 week ago

On 18-Mar-26 20:53, Johannes Weiner wrote:
> The deferred split queue handles cgroups in a suboptimal fashion. 
> The queue is per-NUMA node or per-cgroup, not the intersection. That 
> means on a cgrouped system, a node-restricted allocation entering 
> reclaim can end up splitting large pages on other nodes:
> 
> alloc/unmap deferred_split_folio() list_add_tail(memcg-
> >split_queue) set_shrinker_bit(memcg, node, deferred_shrinker_id)
> 
> for_each_zone_zonelist_nodemask(restricted_nodes) mem_cgroup_iter() 
> shrink_slab(node, memcg) shrink_slab_memcg(node, memcg) if 
> test_shrinker_bit(memcg, node, deferred_shrinker_id) 
> deferred_split_scan() walks memcg->split_queue
> 
> The shrinker bit adds an imperfect guard rail. As soon as the
> cgroup has a single large page on the node of interest, all large
> pages owned by that memcg, including those on other nodes, will be
> split.
> 
> list_lru properly sets up per-node, per-cgroup lists. As a bonus,
> it streamlines a lot of the list operations and reclaim walks. It's 
> used widely by other major shrinkers already. Convert the deferred 
> split queue as well.
> 
> The list_lru per-memcg heads are instantiated on demand when the 
> first object of interest is allocated for a cgroup, by calling 
> folio_memcg_list_lru_alloc(). Add calls to where splittable pages 
> are created: anon faults, swapin faults, khugepaged collapse.
> 
> These calls create all possible node heads for the cgroup at once, 
> so the migration code (between nodes) doesn't need any special care.
> 
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org> --- include/ 
> linux/huge_mm.h    |   6 +- include/linux/memcontrol.h |   4 - 
> include/linux/mmzone.h     |  12 -- mm/huge_memory.c           | 342 
> ++++++++++++------------------------- mm/internal.h              | 2
> +- mm/khugepaged.c            |   7 + mm/memcontrol.c |  12 +- mm/
> memory.c                |  52 +++--- mm/ mm_init.c               |
> 15 -- 9 files changed, 151 insertions(+), 301 deletions(-)
> 

Hi,

with this series in linux-next (since next-20260324) I see a reproducible panic on s390 in the
dump kernel when running NVMe standalone dump (ngdump).
This only happens in the 'capture kernel', normal boot of the same kernel works fine.

[   14.350676] Unable to handle kernel pointer dereference in virtual kernel address space
[   14.350682] Failing address: 4000000000000000 TEID: 4000000000000803 ESOP-2 FSI
[   14.350686] Fault in home space mode while using kernel ASCE.
[   14.350689] AS:0000000002798007 R3:000000002d2c4007 S:000000002d2c3001 P:000000000000013d 
[   14.350730] Oops: 0038 ilc:3 [#1]SMP 
[   14.350735] Modules linked in: dm_service_time zfcp scsi_transport_fc uvdevice diag288_wdt nvme prng aes_s390 nvme_core des_s390 libdes zcrypt_cex4 dm_mirror dm_region_hash dm_log scsi_dh_rdac scsi_dh_emc scsi_dh_alua paes_s390 crypto_engine pkey_cca pkey_ep11 zcrypt rng_core pkey_pckmo pkey dm_multipath autofs4
[   14.350760] CPU: 0 UID: 0 PID: 32 Comm: khugepaged Not tainted 7.0.0-rc5-next-20260324 
[   14.350762] Hardware name: IBM 3931 A01 704 (LPAR)
[   14.350764] Krnl PSW : 0704d00180000000 000003ffe0443a82 (__memcg_list_lru_alloc+0x52/0x1d0)
[   14.350774]            R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:1 PM:0 RI:0 EA:3
[   14.350776] Krnl GPRS: 0000000000000402 00000000000bece0 0000000000000000 000003ffe1c17928
[   14.350778]            00000000001c24ca 0000000000000000 0000000000000000 000003ffe1c17948
[   14.350780]            0000000000000000 00000000000824c0 0000037200098000 4000000000000000
[   14.350782]            0000000000782400 0000000000000001 0000037fe00f39b8 0000037fe00f3918
[   14.350788] Krnl Code: 000003ffe0443a72: a7690000            lghi    %r6,0
[   14.350788]            000003ffe0443a76: e380f0a00004        lg      %r8,160(%r15)
[   14.350788]           *000003ffe0443a7c: e3b080b80004        lg      %r11,184(%r8)
[   14.350788]           >000003ffe0443a82: e330b9400012        lt      %r3,2368(%r11)
[   14.350788]            000003ffe0443a88: a7a40065            brc     10,000003ffe0443b52
[   14.350788]            000003ffe0443a8c: e3b0f0a00004        lg      %r11,160(%r15)
[   14.350788]            000003ffe0443a92: ec68006f007c        cgij    %r6,0,8,000003ffe0443b70
[   14.350788]            000003ffe0443a98: e300b9400014        lgf     %r0,2368(%r11)
[   14.350825] Call Trace:
[   14.350826]  [<000003ffe0443a82>] __memcg_list_lru_alloc+0x52/0x1d0 
[   14.350831]  [<000003ffe044529a>] folio_memcg_list_lru_alloc+0xba/0x150 
[   14.350834]  [<000003ffe04f279a>] alloc_charge_folio+0x18a/0x250 
[   14.350839]  [<000003ffe04f34dc>] collapse_huge_page+0x8c/0x890 
[   14.350841]  [<000003ffe04f4222>] collapse_scan_pmd+0x542/0x690 
[   14.350844]  [<000003ffe04f65b4>] collapse_single_pmd+0x144/0x240 
[   14.350847]  [<000003ffe04f69ce>] collapse_scan_mm_slot.constprop.0+0x31e/0x480 
[   14.350849]  [<000003ffe04f6d3c>] khugepaged+0x20c/0x210 
[   14.350852]  [<000003ffe019b0a8>] kthread+0x148/0x170 
[   14.350856]  [<000003ffe0119fec>] __ret_from_fork+0x3c/0x240 
[   14.350860]  [<000003ffe0ffa4b2>] ret_from_fork+0xa/0x30 
[   14.350865] Last Breaking-Event-Address:
[   14.350865]  [<000003ffe0445294>] folio_memcg_list_lru_alloc+0xb4/0x150
[   14.350870] Kernel panic - not syncing: Fatal exception: panic_on_oops


Environment:
Arch: s390x (IBM LPAR)
Kernel: next-20260324
Config: (can provide if needed)
Reproducible: always
Steps to Reproduce: 
  Install ngdump to an NVMe device partition via 'zipl -d' and initiate a dump (same issue with DASD ldipl-dump).

I have bisected to this specific commit. 
  good: 230bbdc110b3 ("mm: list_lru: introduce folio_memcg_list_lru_alloc()")
  bad:  b0f512f6e36c ("mm: switch deferred split shrinker to list_lru")
Reverting it on top of linux-next <next-20260327> restores normal Standalone Dump operation.
 
Let me know if I can provide any other data.

Thanks,
Mikhail
Re: [PATCH v3 7/7] mm: switch deferred split shrinker to list_lru - [s390] panic in __memcg_list_lru_alloc
Posted by Johannes Weiner 6 days, 20 hours ago
Hello Mikhail,

On Mon, Mar 30, 2026 at 06:37:01PM +0200, Mikhail Zaslonko wrote:
> with this series in linux-next (since next-20260324) I see a reproducible panic on s390 in the
> dump kernel when running NVMe standalone dump (ngdump).
> This only happens in the 'capture kernel', normal boot of the same kernel works fine.
> 
> [   14.350676] Unable to handle kernel pointer dereference in virtual kernel address space
> [   14.350682] Failing address: 4000000000000000 TEID: 4000000000000803 ESOP-2 FSI
> [   14.350686] Fault in home space mode while using kernel ASCE.
> [   14.350689] AS:0000000002798007 R3:000000002d2c4007 S:000000002d2c3001 P:000000000000013d 
> [   14.350730] Oops: 0038 ilc:3 [#1]SMP 
> [   14.350735] Modules linked in: dm_service_time zfcp scsi_transport_fc uvdevice diag288_wdt nvme prng aes_s390 nvme_core des_s390 libdes zcrypt_cex4 dm_mirror dm_region_hash dm_log scsi_dh_rdac scsi_dh_emc scsi_dh_alua paes_s390 crypto_engine pkey_cca pkey_ep11 zcrypt rng_core pkey_pckmo pkey dm_multipath autofs4
> [   14.350760] CPU: 0 UID: 0 PID: 32 Comm: khugepaged Not tainted 7.0.0-rc5-next-20260324 
> [   14.350762] Hardware name: IBM 3931 A01 704 (LPAR)
> [   14.350764] Krnl PSW : 0704d00180000000 000003ffe0443a82 (__memcg_list_lru_alloc+0x52/0x1d0)
> [   14.350774]            R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:1 PM:0 RI:0 EA:3
> [   14.350776] Krnl GPRS: 0000000000000402 00000000000bece0 0000000000000000 000003ffe1c17928
> [   14.350778]            00000000001c24ca 0000000000000000 0000000000000000 000003ffe1c17948
> [   14.350780]            0000000000000000 00000000000824c0 0000037200098000 4000000000000000
> [   14.350782]            0000000000782400 0000000000000001 0000037fe00f39b8 0000037fe00f3918
> [   14.350788] Krnl Code: 000003ffe0443a72: a7690000            lghi    %r6,0
> [   14.350788]            000003ffe0443a76: e380f0a00004        lg      %r8,160(%r15)
> [   14.350788]           *000003ffe0443a7c: e3b080b80004        lg      %r11,184(%r8)
> [   14.350788]           >000003ffe0443a82: e330b9400012        lt      %r3,2368(%r11)
> [   14.350788]            000003ffe0443a88: a7a40065            brc     10,000003ffe0443b52
> [   14.350788]            000003ffe0443a8c: e3b0f0a00004        lg      %r11,160(%r15)
> [   14.350788]            000003ffe0443a92: ec68006f007c        cgij    %r6,0,8,000003ffe0443b70
> [   14.350788]            000003ffe0443a98: e300b9400014        lgf     %r0,2368(%r11)
> [   14.350825] Call Trace:
> [   14.350826]  [<000003ffe0443a82>] __memcg_list_lru_alloc+0x52/0x1d0 
> [   14.350831]  [<000003ffe044529a>] folio_memcg_list_lru_alloc+0xba/0x150 
> [   14.350834]  [<000003ffe04f279a>] alloc_charge_folio+0x18a/0x250 
> [   14.350839]  [<000003ffe04f34dc>] collapse_huge_page+0x8c/0x890 
> [   14.350841]  [<000003ffe04f4222>] collapse_scan_pmd+0x542/0x690 
> [   14.350844]  [<000003ffe04f65b4>] collapse_single_pmd+0x144/0x240 
> [   14.350847]  [<000003ffe04f69ce>] collapse_scan_mm_slot.constprop.0+0x31e/0x480 
> [   14.350849]  [<000003ffe04f6d3c>] khugepaged+0x20c/0x210 
> [   14.350852]  [<000003ffe019b0a8>] kthread+0x148/0x170 
> [   14.350856]  [<000003ffe0119fec>] __ret_from_fork+0x3c/0x240 
> [   14.350860]  [<000003ffe0ffa4b2>] ret_from_fork+0xa/0x30 
> [   14.350865] Last Breaking-Event-Address:
> [   14.350865]  [<000003ffe0445294>] folio_memcg_list_lru_alloc+0xb4/0x150
> [   14.350870] Kernel panic - not syncing: Fatal exception: panic_on_oops

Thanks for the report. Would you be able to correlate the function
offset in __memcg_list_lru_alloc() to C code? It would be very useful
to understand *which* pointer or data structure it's tripping over.

thp_shrinker_init() is called before start_stop_khugepaged(), so I am
not sure how we could have invalid/uninitialized list_lru structures.

Could you extract the instruction stream sequence surrounding the
crash? I don't see a Code: line, but maybe you can correlate it with
objdump? And then correlate it with C code (e.g. make mm/list_lru.lst)

That would be super helpful. Sorry about the breakage.
Re: [PATCH v3 7/7] mm: switch deferred split shrinker to list_lru - [s390] panic in __memcg_list_lru_alloc
Posted by Johannes Weiner 6 days, 19 hours ago
On Mon, Mar 30, 2026 at 04:41:16PM -0400, Johannes Weiner wrote:
> Hello Mikhail,
> 
> On Mon, Mar 30, 2026 at 06:37:01PM +0200, Mikhail Zaslonko wrote:
> > with this series in linux-next (since next-20260324) I see a reproducible panic on s390 in the
> > dump kernel when running NVMe standalone dump (ngdump).
> > This only happens in the 'capture kernel', normal boot of the same kernel works fine.
> > 
> > [   14.350676] Unable to handle kernel pointer dereference in virtual kernel address space
> > [   14.350682] Failing address: 4000000000000000 TEID: 4000000000000803 ESOP-2 FSI
> > [   14.350686] Fault in home space mode while using kernel ASCE.
> > [   14.350689] AS:0000000002798007 R3:000000002d2c4007 S:000000002d2c3001 P:000000000000013d 
> > [   14.350730] Oops: 0038 ilc:3 [#1]SMP 
> > [   14.350735] Modules linked in: dm_service_time zfcp scsi_transport_fc uvdevice diag288_wdt nvme prng aes_s390 nvme_core des_s390 libdes zcrypt_cex4 dm_mirror dm_region_hash dm_log scsi_dh_rdac scsi_dh_emc scsi_dh_alua paes_s390 crypto_engine pkey_cca pkey_ep11 zcrypt rng_core pkey_pckmo pkey dm_multipath autofs4
> > [   14.350760] CPU: 0 UID: 0 PID: 32 Comm: khugepaged Not tainted 7.0.0-rc5-next-20260324 
> > [   14.350762] Hardware name: IBM 3931 A01 704 (LPAR)
> > [   14.350764] Krnl PSW : 0704d00180000000 000003ffe0443a82 (__memcg_list_lru_alloc+0x52/0x1d0)
> > [   14.350774]            R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:1 PM:0 RI:0 EA:3
> > [   14.350776] Krnl GPRS: 0000000000000402 00000000000bece0 0000000000000000 000003ffe1c17928
> > [   14.350778]            00000000001c24ca 0000000000000000 0000000000000000 000003ffe1c17948
> > [   14.350780]            0000000000000000 00000000000824c0 0000037200098000 4000000000000000
> > [   14.350782]            0000000000782400 0000000000000001 0000037fe00f39b8 0000037fe00f3918
> > [   14.350788] Krnl Code: 000003ffe0443a72: a7690000            lghi    %r6,0
> > [   14.350788]            000003ffe0443a76: e380f0a00004        lg      %r8,160(%r15)
> > [   14.350788]           *000003ffe0443a7c: e3b080b80004        lg      %r11,184(%r8)
> > [   14.350788]           >000003ffe0443a82: e330b9400012        lt      %r3,2368(%r11)
> > [   14.350788]            000003ffe0443a88: a7a40065            brc     10,000003ffe0443b52
> > [   14.350788]            000003ffe0443a8c: e3b0f0a00004        lg      %r11,160(%r15)
> > [   14.350788]            000003ffe0443a92: ec68006f007c        cgij    %r6,0,8,000003ffe0443b70
> > [   14.350788]            000003ffe0443a98: e300b9400014        lgf     %r0,2368(%r11)
> > [   14.350825] Call Trace:
> > [   14.350826]  [<000003ffe0443a82>] __memcg_list_lru_alloc+0x52/0x1d0 
> > [   14.350831]  [<000003ffe044529a>] folio_memcg_list_lru_alloc+0xba/0x150 
> > [   14.350834]  [<000003ffe04f279a>] alloc_charge_folio+0x18a/0x250 
> > [   14.350839]  [<000003ffe04f34dc>] collapse_huge_page+0x8c/0x890 
> > [   14.350841]  [<000003ffe04f4222>] collapse_scan_pmd+0x542/0x690 
> > [   14.350844]  [<000003ffe04f65b4>] collapse_single_pmd+0x144/0x240 
> > [   14.350847]  [<000003ffe04f69ce>] collapse_scan_mm_slot.constprop.0+0x31e/0x480 
> > [   14.350849]  [<000003ffe04f6d3c>] khugepaged+0x20c/0x210 
> > [   14.350852]  [<000003ffe019b0a8>] kthread+0x148/0x170 
> > [   14.350856]  [<000003ffe0119fec>] __ret_from_fork+0x3c/0x240 
> > [   14.350860]  [<000003ffe0ffa4b2>] ret_from_fork+0xa/0x30 
> > [   14.350865] Last Breaking-Event-Address:
> > [   14.350865]  [<000003ffe0445294>] folio_memcg_list_lru_alloc+0xb4/0x150
> > [   14.350870] Kernel panic - not syncing: Fatal exception: panic_on_oops

Can you verify whether the kdump kernel boots with
cgroup_disable=memory?

I think there is an issue with how we call __list_lru_init(). The
existing callsites had their own memcg_kmem_online() guards. But the
THP one does not, so we're creating a memcg-aware list_lru, but the
do-while hierarchy walk in __memcg_list_lru_alloc() runs into a NULL
memcg.

Can you try the below on top of that -next checkout?

diff --git a/mm/list_lru.c b/mm/list_lru.c
index 1ccdd45b1d14..7c7024e33653 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -637,7 +637,7 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct shrinker *shr
 	else
 		lru->shrinker_id = -1;
 
-	if (mem_cgroup_kmem_disabled())
+	if (mem_cgroup_disabled() || mem_cgroup_kmem_disabled())
 		memcg_aware = false;
 #endif
Re: [PATCH v3 7/7] mm: switch deferred split shrinker to list_lru - [s390] panic in __memcg_list_lru_alloc
Posted by Mikhail Zaslonko 6 days, 8 hours ago

On 30-Mar-26 22:56, Johannes Weiner wrote:
> On Mon, Mar 30, 2026 at 04:41:16PM -0400, Johannes Weiner wrote:
>> Hello Mikhail,
>>

> 
> Can you verify whether the kdump kernel boots with
> cgroup_disable=memory?

It was not a kdump, but an s390-specific dump tool. Yes, here is the cmdline (I forgot to include it):
kernel parmline...: 'reset_devices cgroup_disable=memory nokaslr numa=off irqpoll nr_cpus=1'
> 
> I think there is an issue with how we call __list_lru_init(). The
> existing callsites had their own memcg_kmem_online() guards. But the
> THP one does not, so we're creating a memcg-aware list_lru, but the
> do-while hierarchy walk in __memcg_list_lru_alloc() runs into a NULL
> memcg.
> 
> Can you try the below on top of that -next checkout?
> 
> diff --git a/mm/list_lru.c b/mm/list_lru.c
> index 1ccdd45b1d14..7c7024e33653 100644
> --- a/mm/list_lru.c
> +++ b/mm/list_lru.c
> @@ -637,7 +637,7 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct shrinker *shr
>  	else
>  		lru->shrinker_id = -1;
>  
> -	if (mem_cgroup_kmem_disabled())
> +	if (mem_cgroup_disabled() || mem_cgroup_kmem_disabled())
>  		memcg_aware = false;
>  #endif
> 

I confirm, this resolves the issue.

Thanks.
Re: [PATCH v3 7/7] mm: switch deferred split shrinker to list_lru - [s390] panic in __memcg_list_lru_alloc
Posted by Vasily Gorbik 6 days, 18 hours ago
On Mon, Mar 30, 2026 at 04:56:56PM -0400, Johannes Weiner wrote:
> On Mon, Mar 30, 2026 at 04:41:16PM -0400, Johannes Weiner wrote:
> > Hello Mikhail,
> > 
> > On Mon, Mar 30, 2026 at 06:37:01PM +0200, Mikhail Zaslonko wrote:
> > > with this series in linux-next (since next-20260324) I see a reproducible panic on s390 in the
> > > dump kernel when running NVMe standalone dump (ngdump).
> > > This only happens in the 'capture kernel', normal boot of the same kernel works fine.
> 
> Can you verify whether the kdump kernel boots with
> cgroup_disable=memory?

Yes, that is the only special thing about that dump kernel.
Sorry for not including the command line. x86 crashes more or less the
same way with cgroup_disable=memory. But while I was doing the repro on
x86, you already found the root cause.

> Can you try the below on top of that -next checkout?
> 
> diff --git a/mm/list_lru.c b/mm/list_lru.c
> index 1ccdd45b1d14..7c7024e33653 100644
> --- a/mm/list_lru.c
> +++ b/mm/list_lru.c
> @@ -637,7 +637,7 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct shrinker *shr
>  	else
>  		lru->shrinker_id = -1;
>  
> -	if (mem_cgroup_kmem_disabled())
> +	if (mem_cgroup_disabled() || mem_cgroup_kmem_disabled())
>  		memcg_aware = false;
>  #endif

Yes, that fixes it for me on s390 and x86. Thank you!
Re: [PATCH v3 7/7] mm: switch deferred split shrinker to list_lru - [s390] panic in __memcg_list_lru_alloc
Posted by Andrew Morton 6 days, 21 hours ago
On Mon, 30 Mar 2026 18:37:01 +0200 Mikhail Zaslonko <zaslonko@linux.ibm.com> wrote:

> with this series in linux-next (since next-20260324) I see a reproducible panic on s390 in the
> dump kernel when running NVMe standalone dump (ngdump).
> This only happens in the 'capture kernel', normal boot of the same kernel works fine.

Thanks.  It seems that hannes is working on a significant respin, so
I'll remove this version of the series from mm.git/linux-next.
Re: [PATCH v3 7/7] mm: switch deferred split shrinker to list_lru
Posted by Kairui Song 1 week, 3 days ago
On Thu, Mar 19, 2026 at 4:05 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> The deferred split queue handles cgroups in a suboptimal fashion. The
> queue is per-NUMA node or per-cgroup, not the intersection. That means
> on a cgrouped system, a node-restricted allocation entering reclaim
> can end up splitting large pages on other nodes:
>
>         alloc/unmap
>           deferred_split_folio()
>             list_add_tail(memcg->split_queue)
>             set_shrinker_bit(memcg, node, deferred_shrinker_id)
>
>         for_each_zone_zonelist_nodemask(restricted_nodes)
>           mem_cgroup_iter()
>             shrink_slab(node, memcg)
>               shrink_slab_memcg(node, memcg)
>                 if test_shrinker_bit(memcg, node, deferred_shrinker_id)
>                   deferred_split_scan()
>                     walks memcg->split_queue
>
> The shrinker bit adds an imperfect guard rail. As soon as the cgroup
> has a single large page on the node of interest, all large pages owned
> by that memcg, including those on other nodes, will be split.
>
> list_lru properly sets up per-node, per-cgroup lists. As a bonus, it
> streamlines a lot of the list operations and reclaim walks. It's used
> widely by other major shrinkers already. Convert the deferred split
> queue as well.
>
> The list_lru per-memcg heads are instantiated on demand when the first
> object of interest is allocated for a cgroup, by calling
> folio_memcg_list_lru_alloc(). Add calls to where splittable pages are
> created: anon faults, swapin faults, khugepaged collapse.
>
> These calls create all possible node heads for the cgroup at once, so
> the migration code (between nodes) doesn't need any special care.
>
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> ---
>  include/linux/huge_mm.h    |   6 +-
>  include/linux/memcontrol.h |   4 -
>  include/linux/mmzone.h     |  12 --
>  mm/huge_memory.c           | 342 ++++++++++++-------------------------
>  mm/internal.h              |   2 +-
>  mm/khugepaged.c            |   7 +
>  mm/memcontrol.c            |  12 +-
>  mm/memory.c                |  52 +++---
>  mm/mm_init.c               |  15 --
>  9 files changed, 151 insertions(+), 301 deletions(-)
>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index bd7f0e1d8094..8d801ed378db 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -414,10 +414,9 @@ static inline int split_huge_page(struct page *page)
>  {
>         return split_huge_page_to_list_to_order(page, NULL, 0);
>  }
> +
> +extern struct list_lru deferred_split_lru;
>  void deferred_split_folio(struct folio *folio, bool partially_mapped);
> -#ifdef CONFIG_MEMCG
> -void reparent_deferred_split_queue(struct mem_cgroup *memcg);
> -#endif
>
>  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
>                 unsigned long address, bool freeze);
> @@ -650,7 +649,6 @@ static inline int try_folio_split_to_order(struct folio *folio,
>  }
>
>  static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
> -static inline void reparent_deferred_split_queue(struct mem_cgroup *memcg) {}
>  #define split_huge_pmd(__vma, __pmd, __address)        \
>         do { } while (0)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 086158969529..0782c72a1997 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -277,10 +277,6 @@ struct mem_cgroup {
>         struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT];
>  #endif
>
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -       struct deferred_split deferred_split_queue;
> -#endif
> -
>  #ifdef CONFIG_LRU_GEN_WALKS_MMU
>         /* per-memcg mm_struct list */
>         struct lru_gen_mm_list mm_list;
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 7bd0134c241c..232b7a71fd69 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -1429,14 +1429,6 @@ struct zonelist {
>   */
>  extern struct page *mem_map;
>
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -struct deferred_split {
> -       spinlock_t split_queue_lock;
> -       struct list_head split_queue;
> -       unsigned long split_queue_len;
> -};
> -#endif
> -
>  #ifdef CONFIG_MEMORY_FAILURE
>  /*
>   * Per NUMA node memory failure handling statistics.
> @@ -1562,10 +1554,6 @@ typedef struct pglist_data {
>         unsigned long first_deferred_pfn;
>  #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
>
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -       struct deferred_split deferred_split_queue;
> -#endif
> -
>  #ifdef CONFIG_NUMA_BALANCING
>         /* start time in ms of current promote rate limit period */
>         unsigned int nbp_rl_start;
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 3fc02913b63e..e90d08db219d 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -14,6 +14,7 @@
>  #include <linux/mmu_notifier.h>
>  #include <linux/rmap.h>
>  #include <linux/swap.h>
> +#include <linux/list_lru.h>
>  #include <linux/shrinker.h>
>  #include <linux/mm_inline.h>
>  #include <linux/swapops.h>
> @@ -67,6 +68,8 @@ unsigned long transparent_hugepage_flags __read_mostly =
>         (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
>         (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
>
> +static struct lock_class_key deferred_split_key;
> +struct list_lru deferred_split_lru;
>  static struct shrinker *deferred_split_shrinker;
>  static unsigned long deferred_split_count(struct shrinker *shrink,
>                                           struct shrink_control *sc);
> @@ -919,6 +922,13 @@ static int __init thp_shrinker_init(void)
>         if (!deferred_split_shrinker)
>                 return -ENOMEM;
>
> +       if (list_lru_init_memcg_key(&deferred_split_lru,
> +                                   deferred_split_shrinker,
> +                                   &deferred_split_key)) {
> +               shrinker_free(deferred_split_shrinker);
> +               return -ENOMEM;
> +       }
> +
>         deferred_split_shrinker->count_objects = deferred_split_count;
>         deferred_split_shrinker->scan_objects = deferred_split_scan;
>         shrinker_register(deferred_split_shrinker);
> @@ -939,6 +949,7 @@ static int __init thp_shrinker_init(void)
>
>         huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero");
>         if (!huge_zero_folio_shrinker) {
> +               list_lru_destroy(&deferred_split_lru);
>                 shrinker_free(deferred_split_shrinker);
>                 return -ENOMEM;
>         }
> @@ -953,6 +964,7 @@ static int __init thp_shrinker_init(void)
>  static void __init thp_shrinker_exit(void)
>  {
>         shrinker_free(huge_zero_folio_shrinker);
> +       list_lru_destroy(&deferred_split_lru);
>         shrinker_free(deferred_split_shrinker);
>  }
>
> @@ -1133,119 +1145,6 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
>         return pmd;
>  }
>
> -static struct deferred_split *split_queue_node(int nid)
> -{
> -       struct pglist_data *pgdata = NODE_DATA(nid);
> -
> -       return &pgdata->deferred_split_queue;
> -}
> -
> -#ifdef CONFIG_MEMCG
> -static inline
> -struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
> -                                          struct deferred_split *queue)
> -{
> -       if (mem_cgroup_disabled())
> -               return NULL;
> -       if (split_queue_node(folio_nid(folio)) == queue)
> -               return NULL;
> -       return container_of(queue, struct mem_cgroup, deferred_split_queue);
> -}
> -
> -static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg)
> -{
> -       return memcg ? &memcg->deferred_split_queue : split_queue_node(nid);
> -}
> -#else
> -static inline
> -struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
> -                                          struct deferred_split *queue)
> -{
> -       return NULL;
> -}
> -
> -static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg)
> -{
> -       return split_queue_node(nid);
> -}
> -#endif
> -
> -static struct deferred_split *split_queue_lock(int nid, struct mem_cgroup *memcg)
> -{
> -       struct deferred_split *queue;
> -
> -retry:
> -       queue = memcg_split_queue(nid, memcg);
> -       spin_lock(&queue->split_queue_lock);
> -       /*
> -        * There is a period between setting memcg to dying and reparenting
> -        * deferred split queue, and during this period the THPs in the deferred
> -        * split queue will be hidden from the shrinker side.
> -        */
> -       if (unlikely(memcg_is_dying(memcg))) {
> -               spin_unlock(&queue->split_queue_lock);
> -               memcg = parent_mem_cgroup(memcg);
> -               goto retry;
> -       }
> -
> -       return queue;
> -}
> -
> -static struct deferred_split *
> -split_queue_lock_irqsave(int nid, struct mem_cgroup *memcg, unsigned long *flags)
> -{
> -       struct deferred_split *queue;
> -
> -retry:
> -       queue = memcg_split_queue(nid, memcg);
> -       spin_lock_irqsave(&queue->split_queue_lock, *flags);
> -       if (unlikely(memcg_is_dying(memcg))) {
> -               spin_unlock_irqrestore(&queue->split_queue_lock, *flags);
> -               memcg = parent_mem_cgroup(memcg);
> -               goto retry;
> -       }
> -
> -       return queue;
> -}
> -
> -static struct deferred_split *folio_split_queue_lock(struct folio *folio)
> -{
> -       struct deferred_split *queue;
> -
> -       rcu_read_lock();
> -       queue = split_queue_lock(folio_nid(folio), folio_memcg(folio));
> -       /*
> -        * The memcg destruction path is acquiring the split queue lock for
> -        * reparenting. Once you have it locked, it's safe to drop the rcu lock.
> -        */
> -       rcu_read_unlock();
> -
> -       return queue;
> -}
> -
> -static struct deferred_split *
> -folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags)
> -{
> -       struct deferred_split *queue;
> -
> -       rcu_read_lock();
> -       queue = split_queue_lock_irqsave(folio_nid(folio), folio_memcg(folio), flags);
> -       rcu_read_unlock();
> -
> -       return queue;
> -}
> -
> -static inline void split_queue_unlock(struct deferred_split *queue)
> -{
> -       spin_unlock(&queue->split_queue_lock);
> -}
> -
> -static inline void split_queue_unlock_irqrestore(struct deferred_split *queue,
> -                                                unsigned long flags)
> -{
> -       spin_unlock_irqrestore(&queue->split_queue_lock, flags);
> -}
> -
>  static inline bool is_transparent_hugepage(const struct folio *folio)
>  {
>         if (!folio_test_large(folio))
> @@ -1346,6 +1245,14 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
>                 count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>                 return NULL;
>         }
> +
> +       if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {
> +               folio_put(folio);
> +               count_vm_event(THP_FAULT_FALLBACK);
> +               count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
> +               return NULL;
> +       }
> +
>         folio_throttle_swaprate(folio, gfp);
>
>         /*
> @@ -3854,34 +3761,34 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
>         struct folio *end_folio = folio_next(folio);
>         struct folio *new_folio, *next;
>         int old_order = folio_order(folio);
> +       struct list_lru_one *l;
> +       bool dequeue_deferred;
>         int ret = 0;
> -       struct deferred_split *ds_queue;
>
>         VM_WARN_ON_ONCE(!mapping && end);
>         /* Prevent deferred_split_scan() touching ->_refcount */
> -       ds_queue = folio_split_queue_lock(folio);
> +       dequeue_deferred = folio_test_anon(folio) && old_order > 1;
> +       if (dequeue_deferred) {
> +               rcu_read_lock();
> +               l = list_lru_lock(&deferred_split_lru,
> +                                 folio_nid(folio), folio_memcg(folio));
> +       }
>         if (folio_ref_freeze(folio, folio_cache_ref_count(folio) + 1)) {
>                 struct swap_cluster_info *ci = NULL;
>                 struct lruvec *lruvec;
>
> -               if (old_order > 1) {
> -                       if (!list_empty(&folio->_deferred_list)) {
> -                               ds_queue->split_queue_len--;
> -                               /*
> -                                * Reinitialize page_deferred_list after removing the
> -                                * page from the split_queue, otherwise a subsequent
> -                                * split will see list corruption when checking the
> -                                * page_deferred_list.
> -                                */
> -                               list_del_init(&folio->_deferred_list);
> -                       }
> +               if (dequeue_deferred) {
> +                       __list_lru_del(&deferred_split_lru, l,
> +                                      &folio->_deferred_list, folio_nid(folio));
>                         if (folio_test_partially_mapped(folio)) {
>                                 folio_clear_partially_mapped(folio);
>                                 mod_mthp_stat(old_order,
>                                         MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
>                         }
> +                       list_lru_unlock(l);
> +                       rcu_read_unlock();
>                 }
> -               split_queue_unlock(ds_queue);
> +
>                 if (mapping) {
>                         int nr = folio_nr_pages(folio);
>
> @@ -3982,7 +3889,10 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
>                 if (ci)
>                         swap_cluster_unlock(ci);
>         } else {
> -               split_queue_unlock(ds_queue);
> +               if (dequeue_deferred) {
> +                       list_lru_unlock(l);
> +                       rcu_read_unlock();
> +               }
>                 return -EAGAIN;
>         }
>
> @@ -4349,33 +4259,35 @@ int split_folio_to_list(struct folio *folio, struct list_head *list)
>   * queueing THP splits, and that list is (racily observed to be) non-empty.
>   *
>   * It is unsafe to call folio_unqueue_deferred_split() until folio refcount is
> - * zero: because even when split_queue_lock is held, a non-empty _deferred_list
> - * might be in use on deferred_split_scan()'s unlocked on-stack list.
> + * zero: because even when the list_lru lock is held, a non-empty
> + * _deferred_list might be in use on deferred_split_scan()'s unlocked
> + * on-stack list.
>   *
> - * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is
> - * therefore important to unqueue deferred split before changing folio memcg.
> + * The list_lru sublist is determined by folio's memcg: it is therefore
> + * important to unqueue deferred split before changing folio memcg.
>   */
>  bool __folio_unqueue_deferred_split(struct folio *folio)
>  {
> -       struct deferred_split *ds_queue;
> +       struct list_lru_one *l;
> +       int nid = folio_nid(folio);
>         unsigned long flags;
>         bool unqueued = false;
>
>         WARN_ON_ONCE(folio_ref_count(folio));
>         WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg_charged(folio));
>
> -       ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
> -       if (!list_empty(&folio->_deferred_list)) {
> -               ds_queue->split_queue_len--;
> +       rcu_read_lock();
> +       l = list_lru_lock_irqsave(&deferred_split_lru, nid, folio_memcg(folio), &flags);
> +       if (__list_lru_del(&deferred_split_lru, l, &folio->_deferred_list, nid)) {
>                 if (folio_test_partially_mapped(folio)) {
>                         folio_clear_partially_mapped(folio);
>                         mod_mthp_stat(folio_order(folio),
>                                       MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
>                 }
> -               list_del_init(&folio->_deferred_list);
>                 unqueued = true;
>         }
> -       split_queue_unlock_irqrestore(ds_queue, flags);
> +       list_lru_unlock_irqrestore(l, &flags);
> +       rcu_read_unlock();
>
>         return unqueued;        /* useful for debug warnings */
>  }
> @@ -4383,7 +4295,9 @@ bool __folio_unqueue_deferred_split(struct folio *folio)
>  /* partially_mapped=false won't clear PG_partially_mapped folio flag */
>  void deferred_split_folio(struct folio *folio, bool partially_mapped)
>  {
> -       struct deferred_split *ds_queue;
> +       struct list_lru_one *l;
> +       int nid;
> +       struct mem_cgroup *memcg;
>         unsigned long flags;
>
>         /*
> @@ -4406,7 +4320,11 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
>         if (folio_test_swapcache(folio))
>                 return;
>
> -       ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
> +       nid = folio_nid(folio);
> +
> +       rcu_read_lock();
> +       memcg = folio_memcg(folio);
> +       l = list_lru_lock_irqsave(&deferred_split_lru, nid, memcg, &flags);
>         if (partially_mapped) {
>                 if (!folio_test_partially_mapped(folio)) {
>                         folio_set_partially_mapped(folio);
> @@ -4414,36 +4332,20 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
>                                 count_vm_event(THP_DEFERRED_SPLIT_PAGE);
>                         count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
>                         mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, 1);
> -
>                 }
>         } else {
>                 /* partially mapped folios cannot become non-partially mapped */
>                 VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
>         }
> -       if (list_empty(&folio->_deferred_list)) {
> -               struct mem_cgroup *memcg;
> -
> -               memcg = folio_split_queue_memcg(folio, ds_queue);
> -               list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
> -               ds_queue->split_queue_len++;
> -               if (memcg)
> -                       set_shrinker_bit(memcg, folio_nid(folio),
> -                                        shrinker_id(deferred_split_shrinker));
> -       }
> -       split_queue_unlock_irqrestore(ds_queue, flags);
> +       __list_lru_add(&deferred_split_lru, l, &folio->_deferred_list, nid, memcg);
> +       list_lru_unlock_irqrestore(l, &flags);
> +       rcu_read_unlock();
>  }
>
>  static unsigned long deferred_split_count(struct shrinker *shrink,
>                 struct shrink_control *sc)
>  {
> -       struct pglist_data *pgdata = NODE_DATA(sc->nid);
> -       struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
> -
> -#ifdef CONFIG_MEMCG
> -       if (sc->memcg)
> -               ds_queue = &sc->memcg->deferred_split_queue;
> -#endif
> -       return READ_ONCE(ds_queue->split_queue_len);
> +       return list_lru_shrink_count(&deferred_split_lru, sc);
>  }
>
>  static bool thp_underused(struct folio *folio)
> @@ -4473,45 +4375,47 @@ static bool thp_underused(struct folio *folio)
>         return false;
>  }
>
> +static enum lru_status deferred_split_isolate(struct list_head *item,
> +                                             struct list_lru_one *lru,
> +                                             void *cb_arg)
> +{
> +       struct folio *folio = container_of(item, struct folio, _deferred_list);
> +       struct list_head *freeable = cb_arg;
> +
> +       if (folio_try_get(folio)) {
> +               list_lru_isolate_move(lru, item, freeable);
> +               return LRU_REMOVED;
> +       }
> +
> +       /* We lost race with folio_put() */
> +       list_lru_isolate(lru, item);
> +       if (folio_test_partially_mapped(folio)) {
> +               folio_clear_partially_mapped(folio);
> +               mod_mthp_stat(folio_order(folio),
> +                             MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
> +       }
> +       return LRU_REMOVED;
> +}
> +
>  static unsigned long deferred_split_scan(struct shrinker *shrink,
>                 struct shrink_control *sc)
>  {
> -       struct deferred_split *ds_queue;
> -       unsigned long flags;
> +       LIST_HEAD(dispose);
>         struct folio *folio, *next;
> -       int split = 0, i;
> -       struct folio_batch fbatch;
> +       int split = 0;
> +       unsigned long isolated;
>
> -       folio_batch_init(&fbatch);
> +       isolated = list_lru_shrink_walk_irq(&deferred_split_lru, sc,
> +                                           deferred_split_isolate, &dispose);
>
> -retry:
> -       ds_queue = split_queue_lock_irqsave(sc->nid, sc->memcg, &flags);
> -       /* Take pin on all head pages to avoid freeing them under us */
> -       list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
> -                                                       _deferred_list) {
> -               if (folio_try_get(folio)) {
> -                       folio_batch_add(&fbatch, folio);
> -               } else if (folio_test_partially_mapped(folio)) {
> -                       /* We lost race with folio_put() */
> -                       folio_clear_partially_mapped(folio);
> -                       mod_mthp_stat(folio_order(folio),
> -                                     MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
> -               }
> -               list_del_init(&folio->_deferred_list);
> -               ds_queue->split_queue_len--;
> -               if (!--sc->nr_to_scan)
> -                       break;
> -               if (!folio_batch_space(&fbatch))
> -                       break;
> -       }
> -       split_queue_unlock_irqrestore(ds_queue, flags);
> -
> -       for (i = 0; i < folio_batch_count(&fbatch); i++) {
> +       list_for_each_entry_safe(folio, next, &dispose, _deferred_list) {
>                 bool did_split = false;
>                 bool underused = false;
> -               struct deferred_split *fqueue;
> +               struct list_lru_one *l;
> +               unsigned long flags;
> +
> +               list_del_init(&folio->_deferred_list);
>
> -               folio = fbatch.folios[i];
>                 if (!folio_test_partially_mapped(folio)) {
>                         /*
>                          * See try_to_map_unused_to_zeropage(): we cannot
> @@ -4534,64 +4438,32 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
>                 }
>                 folio_unlock(folio);
>  next:
> -               if (did_split || !folio_test_partially_mapped(folio))
> -                       continue;
>                 /*
>                  * Only add back to the queue if folio is partially mapped.
>                  * If thp_underused returns false, or if split_folio fails
>                  * in the case it was underused, then consider it used and
>                  * don't add it back to split_queue.
>                  */
> -               fqueue = folio_split_queue_lock_irqsave(folio, &flags);
> -               if (list_empty(&folio->_deferred_list)) {
> -                       list_add_tail(&folio->_deferred_list, &fqueue->split_queue);
> -                       fqueue->split_queue_len++;
> +               if (!did_split && folio_test_partially_mapped(folio)) {
> +                       rcu_read_lock();
> +                       l = list_lru_lock_irqsave(&deferred_split_lru,
> +                                                 folio_nid(folio),
> +                                                 folio_memcg(folio),
> +                                                 &flags);
> +                       __list_lru_add(&deferred_split_lru, l,
> +                                      &folio->_deferred_list,
> +                                      folio_nid(folio), folio_memcg(folio));
> +                       list_lru_unlock_irqrestore(l, &flags);
> +                       rcu_read_unlock();
>                 }
> -               split_queue_unlock_irqrestore(fqueue, flags);
> -       }
> -       folios_put(&fbatch);
> -
> -       if (sc->nr_to_scan && !list_empty(&ds_queue->split_queue)) {
> -               cond_resched();
> -               goto retry;
> +               folio_put(folio);
>         }
>
> -       /*
> -        * Stop shrinker if we didn't split any page, but the queue is empty.
> -        * This can happen if pages were freed under us.
> -        */
> -       if (!split && list_empty(&ds_queue->split_queue))
> +       if (!split && !isolated)
>                 return SHRINK_STOP;
>         return split;
>  }
>
> -#ifdef CONFIG_MEMCG
> -void reparent_deferred_split_queue(struct mem_cgroup *memcg)
> -{
> -       struct mem_cgroup *parent = parent_mem_cgroup(memcg);
> -       struct deferred_split *ds_queue = &memcg->deferred_split_queue;
> -       struct deferred_split *parent_ds_queue = &parent->deferred_split_queue;
> -       int nid;
> -
> -       spin_lock_irq(&ds_queue->split_queue_lock);
> -       spin_lock_nested(&parent_ds_queue->split_queue_lock, SINGLE_DEPTH_NESTING);
> -
> -       if (!ds_queue->split_queue_len)
> -               goto unlock;
> -
> -       list_splice_tail_init(&ds_queue->split_queue, &parent_ds_queue->split_queue);
> -       parent_ds_queue->split_queue_len += ds_queue->split_queue_len;
> -       ds_queue->split_queue_len = 0;
> -
> -       for_each_node(nid)
> -               set_shrinker_bit(parent, nid, shrinker_id(deferred_split_shrinker));
> -
> -unlock:
> -       spin_unlock(&parent_ds_queue->split_queue_lock);
> -       spin_unlock_irq(&ds_queue->split_queue_lock);
> -}
> -#endif
> -
>  #ifdef CONFIG_DEBUG_FS
>  static void split_huge_pages_all(void)
>  {
> diff --git a/mm/internal.h b/mm/internal.h
> index f98f4746ac41..d8c737338df5 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -863,7 +863,7 @@ static inline bool folio_unqueue_deferred_split(struct folio *folio)
>         /*
>          * At this point, there is no one trying to add the folio to
>          * deferred_list. If folio is not in deferred_list, it's safe
> -        * to check without acquiring the split_queue_lock.
> +        * to check without acquiring the list_lru lock.
>          */
>         if (data_race(list_empty(&folio->_deferred_list)))
>                 return false;
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 4b0e59c7c0e6..b2ac28ddd480 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1081,6 +1081,7 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
>         }
>
>         count_vm_event(THP_COLLAPSE_ALLOC);
> +
>         if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
>                 folio_put(folio);
>                 *foliop = NULL;
> @@ -1089,6 +1090,12 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
>
>         count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1);
>
> +       if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {
> +               folio_put(folio);
> +               *foliop = NULL;
> +               return SCAN_CGROUP_CHARGE_FAIL;
> +       }
> +
>         *foliop = folio;
>         return SCAN_SUCCEED;
>  }
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index a47fb68dd65f..f381cb6bdff1 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -4015,11 +4015,6 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
>         for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
>                 memcg->cgwb_frn[i].done =
>                         __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
> -#endif
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -       spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
> -       INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
> -       memcg->deferred_split_queue.split_queue_len = 0;
>  #endif
>         lru_gen_init_memcg(memcg);
>         return memcg;
> @@ -4167,11 +4162,10 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
>         zswap_memcg_offline_cleanup(memcg);
>
>         memcg_offline_kmem(memcg);
> -       reparent_deferred_split_queue(memcg);
>         /*
> -        * The reparenting of objcg must be after the reparenting of the
> -        * list_lru and deferred_split_queue above, which ensures that they will
> -        * not mistakenly get the parent list_lru and deferred_split_queue.
> +        * The reparenting of objcg must be after the reparenting of
> +        * the list_lru in memcg_offline_kmem(), which ensures that
> +        * they will not mistakenly get the parent list_lru.
>          */
>         memcg_reparent_objcgs(memcg);
>         reparent_shrinker_deferred(memcg);
> diff --git a/mm/memory.c b/mm/memory.c
> index 219b9bf6cae0..e68ceb4aa624 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4651,13 +4651,19 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
>         while (orders) {
>                 addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>                 folio = vma_alloc_folio(gfp, order, vma, addr);
> -               if (folio) {
> -                       if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
> -                                                           gfp, entry))
> -                               return folio;
> +               if (!folio)
> +                       goto next;
> +               if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, gfp, entry)) {
>                         count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
>                         folio_put(folio);
> +                       goto next;
>                 }
> +               if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {
> +                       folio_put(folio);
> +                       goto fallback;
> +               }

Hi Johannes,

Haven't checked every detail yet, but one question here, which might be
trivial: would it be better to fall back to the next lower order instead
of falling back to order 0 directly? Suppose this is a 2M allocation and
a 1M fallback is allowed: releasing that folio and falling back to 1M
would free 1M of memory, which should be enough for the list_lru
metadata to be allocated.
Re: [PATCH v3 7/7] mm: switch deferred split shrinker to list_lru
Posted by Johannes Weiner 1 week ago
On Fri, Mar 27, 2026 at 03:51:07PM +0800, Kairui Song wrote:
> On Thu, Mar 19, 2026 at 4:05 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > @@ -4651,13 +4651,19 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
> >         while (orders) {
> >                 addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
> >                 folio = vma_alloc_folio(gfp, order, vma, addr);
> > -               if (folio) {
> > -                       if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
> > -                                                           gfp, entry))
> > -                               return folio;
> > +               if (!folio)
> > +                       goto next;
> > +               if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, gfp, entry)) {
> >                         count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
> >                         folio_put(folio);
> > +                       goto next;
> >                 }
> > +               if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {
> > +                       folio_put(folio);
> > +                       goto fallback;
> > +               }
> 
> Hi Johannes,
> 
> Haven't checked every detail yet, but one question here, might be
> trivial, will it be better if we fallback to the next order instead of
> fallback to 0 order directly? Suppose this is a 2M allocation and 1M
> fallback is allowed, releasing that folio and fallback to 1M will free
> 1M memory which would be enough for the list lru metadata to be
> allocated.

I would be surprised if that mattered. If we can get a 2M folio but
fail a couple of small slab requests, there are probably such extreme
levels of concurrency and pressure on the freelists that the fault has
a good chance of failing altogether and OOMing.

And if it doesn't matter, then let's consider it from a code clarity
point of view. For folio allocation and charging, we reduce the size
to try again. But the list_lru allocation is always the same size - it
would look weird to just try again on failure. If we do so based on
the logic you lay out above, now it needs a comment too...
Re: [PATCH v3 7/7] mm: switch deferred split shrinker to list_lru
Posted by Lorenzo Stoakes (Oracle) 1 week, 6 days ago
On Wed, Mar 18, 2026 at 03:53:25PM -0400, Johannes Weiner wrote:
> The deferred split queue handles cgroups in a suboptimal fashion. The
> queue is per-NUMA node or per-cgroup, not the intersection. That means
> on a cgrouped system, a node-restricted allocation entering reclaim
> can end up splitting large pages on other nodes:
>
> 	alloc/unmap
> 	  deferred_split_folio()
> 	    list_add_tail(memcg->split_queue)
> 	    set_shrinker_bit(memcg, node, deferred_shrinker_id)

So here it's:

__do_huge_pmd_anonymous_page() / do_huge_zero_wp_pmd()
-> map_anon_folio_pmd_pf()
-> map_anon_folio_pmd_nopf()
-> deferred_split_folio()
-> set_shrinker_bit()

Yeah it makes sense to make the first bit succinct anyway :)

>
> 	for_each_zone_zonelist_nodemask(restricted_nodes)
> 	  mem_cgroup_iter()
> 	    shrink_slab(node, memcg)
> 	      shrink_slab_memcg(node, memcg)
> 	        if test_shrinker_bit(memcg, node, deferred_shrinker_id)

Hmm, there's no such function — this is kind of pseudocode, essentially?

Wouldn't it be clearer to reference:

shrink_slab_memcg() -> do_shrink_slab() -> shrinker->scan_objects ->
deferred_split_scan()?

Though I get that it adds verbosity :)

Sorry, I'm not (just) being pedantic here — this is also so I can understand it myself :)

> 	          deferred_split_scan()
> 	            walks memcg->split_queue

Ok so overall there's a per-memcg memcg->split_queue, but we set a bit in
memcg->nodeinfo[nid]->shrinker_info->unit[blah]->map for it, and when we
enter shrink_slab_memcg(), we figure out the shrink_control from the
for_each_set_bit() across memcg->...->unit->map?

> The shrinker bit adds an imperfect guard rail. As soon as the cgroup
> has a single large page on the node of interest, all large pages owned
> by that memcg, including those on other nodes, will be split.

So at this point, regardless of node, we are invoking deferred_split_scan()
and it's the same memcg->split_queue we are walking, regardless of node?

Do let me know if my understanding is correct here!

Hmm that does sound sub-optimal.

>
> list_lru properly sets up per-node, per-cgroup lists. As a bonus, it
> streamlines a lot of the list operations and reclaim walks. It's used
> widely by other major shrinkers already. Convert the deferred split
> queue as well.

It's odd that it wasn't used before?

>
> The list_lru per-memcg heads are instantiated on demand when the first
> object of interest is allocated for a cgroup, by calling
> folio_memcg_list_lru_alloc(). Add calls to where splittable pages are
> created: anon faults, swapin faults, khugepaged collapse.

OK that makes sense.

>
> These calls create all possible node heads for the cgroup at once, so
> the migration code (between nodes) doesn't need any special care.

Nice!

>
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>

Hmm you saved the big one for last :)

> ---
>  include/linux/huge_mm.h    |   6 +-
>  include/linux/memcontrol.h |   4 -
>  include/linux/mmzone.h     |  12 --
>  mm/huge_memory.c           | 342 ++++++++++++-------------------------
>  mm/internal.h              |   2 +-
>  mm/khugepaged.c            |   7 +
>  mm/memcontrol.c            |  12 +-
>  mm/memory.c                |  52 +++---
>  mm/mm_init.c               |  15 --
>  9 files changed, 151 insertions(+), 301 deletions(-)
>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index bd7f0e1d8094..8d801ed378db 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -414,10 +414,9 @@ static inline int split_huge_page(struct page *page)
>  {
>  	return split_huge_page_to_list_to_order(page, NULL, 0);
>  }
> +
> +extern struct list_lru deferred_split_lru;

It might be nice for the sake of avoiding a global to instead expose this
as a getter?

Or actually better, since every caller outside of huge_memory.c that
references this uses folio_memcg_list_lru_alloc(), do something like:

int folio_memcg_alloc_deferred(struct folio *folio, gfp_t gfp);

in mm/huge_memory.c:

/**
 * blah blah blah put on error blah
 */
int folio_memcg_alloc_deferred(struct folio *folio, gfp_t gfp)
{
	int err;

	err = folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp);
	if (err) {
		folio_put(folio);
		return err;
	}

	return 0;
}

And then the callers can just invoke this, and you can make
deferred_split_lru static in mm/huge_memory.c?


>  void deferred_split_folio(struct folio *folio, bool partially_mapped);
> -#ifdef CONFIG_MEMCG
> -void reparent_deferred_split_queue(struct mem_cgroup *memcg);
> -#endif
>
>  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
>  		unsigned long address, bool freeze);
> @@ -650,7 +649,6 @@ static inline int try_folio_split_to_order(struct folio *folio,
>  }
>
>  static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
> -static inline void reparent_deferred_split_queue(struct mem_cgroup *memcg) {}
>  #define split_huge_pmd(__vma, __pmd, __address)	\
>  	do { } while (0)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 086158969529..0782c72a1997 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -277,10 +277,6 @@ struct mem_cgroup {
>  	struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT];
>  #endif
>
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -	struct deferred_split deferred_split_queue;
> -#endif
> -
>  #ifdef CONFIG_LRU_GEN_WALKS_MMU
>  	/* per-memcg mm_struct list */
>  	struct lru_gen_mm_list mm_list;
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 7bd0134c241c..232b7a71fd69 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -1429,14 +1429,6 @@ struct zonelist {
>   */
>  extern struct page *mem_map;
>
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -struct deferred_split {
> -	spinlock_t split_queue_lock;
> -	struct list_head split_queue;
> -	unsigned long split_queue_len;
> -};
> -#endif
> -
>  #ifdef CONFIG_MEMORY_FAILURE
>  /*
>   * Per NUMA node memory failure handling statistics.
> @@ -1562,10 +1554,6 @@ typedef struct pglist_data {
>  	unsigned long first_deferred_pfn;
>  #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
>
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -	struct deferred_split deferred_split_queue;
> -#endif
> -
>  #ifdef CONFIG_NUMA_BALANCING
>  	/* start time in ms of current promote rate limit period */
>  	unsigned int nbp_rl_start;
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 3fc02913b63e..e90d08db219d 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -14,6 +14,7 @@
>  #include <linux/mmu_notifier.h>
>  #include <linux/rmap.h>
>  #include <linux/swap.h>
> +#include <linux/list_lru.h>
>  #include <linux/shrinker.h>
>  #include <linux/mm_inline.h>
>  #include <linux/swapops.h>
> @@ -67,6 +68,8 @@ unsigned long transparent_hugepage_flags __read_mostly =
>  	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
>  	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
>
> +static struct lock_class_key deferred_split_key;
> +struct list_lru deferred_split_lru;
>  static struct shrinker *deferred_split_shrinker;
>  static unsigned long deferred_split_count(struct shrinker *shrink,
>  					  struct shrink_control *sc);
> @@ -919,6 +922,13 @@ static int __init thp_shrinker_init(void)
>  	if (!deferred_split_shrinker)
>  		return -ENOMEM;
>
> +	if (list_lru_init_memcg_key(&deferred_split_lru,
> +				    deferred_split_shrinker,
> +				    &deferred_split_key)) {
> +		shrinker_free(deferred_split_shrinker);
> +		return -ENOMEM;
> +	}
> +

It's kind of out of scope for the series, but I hate that the huge zero
folio stuff has an early exit for persistent but if not, falls through to
trying to set that up - it'd be nice to split out the huge zero folio stuff
into another function.

But probably one for a follow up!

>  	deferred_split_shrinker->count_objects = deferred_split_count;
>  	deferred_split_shrinker->scan_objects = deferred_split_scan;
>  	shrinker_register(deferred_split_shrinker);
> @@ -939,6 +949,7 @@ static int __init thp_shrinker_init(void)
>
>  	huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero");
>  	if (!huge_zero_folio_shrinker) {
> +		list_lru_destroy(&deferred_split_lru);
>  		shrinker_free(deferred_split_shrinker);

Presumably no probably-impossible-in-reality race on somebody entering the
shrinker and referencing the deferred_split_lru before the shrinker is freed?

>  		return -ENOMEM;
>  	}
> @@ -953,6 +964,7 @@ static int __init thp_shrinker_init(void)
>  static void __init thp_shrinker_exit(void)
>  {
>  	shrinker_free(huge_zero_folio_shrinker);
> +	list_lru_destroy(&deferred_split_lru);

Same question above as to race/ordering.

>  	shrinker_free(deferred_split_shrinker);
>  }



>
> @@ -1133,119 +1145,6 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
>  	return pmd;
>  }
>
> -static struct deferred_split *split_queue_node(int nid)
> -{
> -	struct pglist_data *pgdata = NODE_DATA(nid);
> -
> -	return &pgdata->deferred_split_queue;
> -}
> -
> -#ifdef CONFIG_MEMCG
> -static inline
> -struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
> -					   struct deferred_split *queue)
> -{
> -	if (mem_cgroup_disabled())
> -		return NULL;
> -	if (split_queue_node(folio_nid(folio)) == queue)
> -		return NULL;
> -	return container_of(queue, struct mem_cgroup, deferred_split_queue);
> -}
> -
> -static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg)
> -{
> -	return memcg ? &memcg->deferred_split_queue : split_queue_node(nid);
> -}
> -#else
> -static inline
> -struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
> -					   struct deferred_split *queue)
> -{
> -	return NULL;
> -}
> -
> -static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg)
> -{
> -	return split_queue_node(nid);
> -}
> -#endif
> -
> -static struct deferred_split *split_queue_lock(int nid, struct mem_cgroup *memcg)
> -{
> -	struct deferred_split *queue;
> -
> -retry:
> -	queue = memcg_split_queue(nid, memcg);
> -	spin_lock(&queue->split_queue_lock);
> -	/*
> -	 * There is a period between setting memcg to dying and reparenting
> -	 * deferred split queue, and during this period the THPs in the deferred
> -	 * split queue will be hidden from the shrinker side.
> -	 */
> -	if (unlikely(memcg_is_dying(memcg))) {
> -		spin_unlock(&queue->split_queue_lock);
> -		memcg = parent_mem_cgroup(memcg);
> -		goto retry;
> -	}
> -
> -	return queue;
> -}
> -
> -static struct deferred_split *
> -split_queue_lock_irqsave(int nid, struct mem_cgroup *memcg, unsigned long *flags)
> -{
> -	struct deferred_split *queue;
> -
> -retry:
> -	queue = memcg_split_queue(nid, memcg);
> -	spin_lock_irqsave(&queue->split_queue_lock, *flags);
> -	if (unlikely(memcg_is_dying(memcg))) {
> -		spin_unlock_irqrestore(&queue->split_queue_lock, *flags);
> -		memcg = parent_mem_cgroup(memcg);
> -		goto retry;
> -	}
> -
> -	return queue;
> -}
> -
> -static struct deferred_split *folio_split_queue_lock(struct folio *folio)
> -{
> -	struct deferred_split *queue;
> -
> -	rcu_read_lock();
> -	queue = split_queue_lock(folio_nid(folio), folio_memcg(folio));
> -	/*
> -	 * The memcg destruction path is acquiring the split queue lock for
> -	 * reparenting. Once you have it locked, it's safe to drop the rcu lock.
> -	 */
> -	rcu_read_unlock();
> -
> -	return queue;
> -}
> -
> -static struct deferred_split *
> -folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags)
> -{
> -	struct deferred_split *queue;
> -
> -	rcu_read_lock();
> -	queue = split_queue_lock_irqsave(folio_nid(folio), folio_memcg(folio), flags);
> -	rcu_read_unlock();
> -
> -	return queue;
> -}
> -
> -static inline void split_queue_unlock(struct deferred_split *queue)
> -{
> -	spin_unlock(&queue->split_queue_lock);
> -}
> -
> -static inline void split_queue_unlock_irqrestore(struct deferred_split *queue,
> -						 unsigned long flags)
> -{
> -	spin_unlock_irqrestore(&queue->split_queue_lock, flags);
> -}
> -

Lots of red, love to see it :)

>  static inline bool is_transparent_hugepage(const struct folio *folio)
>  {
>  	if (!folio_test_large(folio))
> @@ -1346,6 +1245,14 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
>  		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>  		return NULL;
>  	}
> +
> +	if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {
> +		folio_put(folio);
> +		count_vm_event(THP_FAULT_FALLBACK);
> +		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
> +		return NULL;
> +	}
> +

As mentioned above, would be good to separate out into helper in mm/huge_memory.c.

>  	folio_throttle_swaprate(folio, gfp);
>
>         /*
> @@ -3854,34 +3761,34 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
>  	struct folio *end_folio = folio_next(folio);
>  	struct folio *new_folio, *next;
>  	int old_order = folio_order(folio);
> +	struct list_lru_one *l;

Nit, and maybe this is a convention, but I hate single-letter variable
names — 'lru' or something might be nicer?

> +	bool dequeue_deferred;
>  	int ret = 0;
> -	struct deferred_split *ds_queue;
>
>  	VM_WARN_ON_ONCE(!mapping && end);
>  	/* Prevent deferred_split_scan() touching ->_refcount */
> -	ds_queue = folio_split_queue_lock(folio);
> +	dequeue_deferred = folio_test_anon(folio) && old_order > 1;

Why anon? (This review is partly me learning about the shrinker, an area
I'm weak on :)

> +	if (dequeue_deferred) {
> +		rcu_read_lock();
> +		l = list_lru_lock(&deferred_split_lru,
> +				  folio_nid(folio), folio_memcg(folio));

Hmm, I don't adore this sort of almost-'hidden' RCU lock here, but this
function is pretty disgusting and needs serious work in general.

And any function that took the RCU lock and list_lru lock/did the unlock
equivalent would be equally horrible so yeah, I guess needs deferring to a
refactor.

OTOH, this could be a good excuse for us to pay down some technical debt
and split out for instance the folio_ref_freeze() bits?

Could we do something like:

	bool frozen;

	...

	dequeue_deferred = folio_test_anon(folio) && old_order > 1;
	frozen = folio_ref_freeze(folio, folio_cache_ref_count(folio) + 1);

	if (dequeue_deferred && frozen) {
		struct list_lru_one *lru;

		rcu_read_lock();
		lru = list_lru_lock(&deferred_split_lru,
				    folio_nid(folio), folio_memcg(folio));
		__list_lru_del(&deferred_split_lru, lru,
			       &folio->_deferred_list, folio_nid(folio));
		if (folio_test_partially_mapped(folio)) {
			folio_clear_partially_mapped(folio);
			mod_mthp_stat(old_order,
				MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
		}
		list_lru_unlock(lru);
		rcu_read_unlock();
	}

	if (!frozen)
		return -EAGAIN;

	... rest of logic now one indent level lower ...

Or maybe factor that out into a helper function or something?

static void execute_deferred_dequeue(...) { ... }

With this implemented either way you'd be able to get rid of the else block
too.

obviously only valid if you are able to do the freezing earlier?


> +	}
>  	if (folio_ref_freeze(folio, folio_cache_ref_count(folio) + 1)) {
>  		struct swap_cluster_info *ci = NULL;
>  		struct lruvec *lruvec;
>
> -		if (old_order > 1) {

Before was this also applicable to non-anon folios?

> -			if (!list_empty(&folio->_deferred_list)) {
> -				ds_queue->split_queue_len--;
> -				/*
> -				 * Reinitialize page_deferred_list after removing the
> -				 * page from the split_queue, otherwise a subsequent
> -				 * split will see list corruption when checking the
> -				 * page_deferred_list.
> -				 */
> -				list_del_init(&folio->_deferred_list);
> -			}
> +		if (dequeue_deferred) {
> +			__list_lru_del(&deferred_split_lru, l,
> +				       &folio->_deferred_list, folio_nid(folio));
>  			if (folio_test_partially_mapped(folio)) {
>  				folio_clear_partially_mapped(folio);
>  				mod_mthp_stat(old_order,
>  					MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
>  			}
> +			list_lru_unlock(l);
> +			rcu_read_unlock();
>  		}
> -		split_queue_unlock(ds_queue);
> +
>  		if (mapping) {
>  			int nr = folio_nr_pages(folio);
>
> @@ -3982,7 +3889,10 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
>  		if (ci)
>  			swap_cluster_unlock(ci);
>  	} else {
> -		split_queue_unlock(ds_queue);
> +		if (dequeue_deferred) {
> +			list_lru_unlock(l);
> +			rcu_read_unlock();
> +		}
>  		return -EAGAIN;
>  	}
>
> @@ -4349,33 +4259,35 @@ int split_folio_to_list(struct folio *folio, struct list_head *list)
>   * queueing THP splits, and that list is (racily observed to be) non-empty.
>   *
>   * It is unsafe to call folio_unqueue_deferred_split() until folio refcount is
> - * zero: because even when split_queue_lock is held, a non-empty _deferred_list
> - * might be in use on deferred_split_scan()'s unlocked on-stack list.
> + * zero: because even when the list_lru lock is held, a non-empty
> + * _deferred_list might be in use on deferred_split_scan()'s unlocked
> + * on-stack list.
>   *
> - * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is
> - * therefore important to unqueue deferred split before changing folio memcg.
> + * The list_lru sublist is determined by folio's memcg: it is therefore
> + * important to unqueue deferred split before changing folio memcg.
>   */
>  bool __folio_unqueue_deferred_split(struct folio *folio)
>  {
> -	struct deferred_split *ds_queue;
> +	struct list_lru_one *l;

Again nitty thing about single letter var name, OTOH if that's somehow the
convention maybe fine. 'l' is especially problematic as it looks like a 1
in many fonts :)

> +	int nid = folio_nid(folio);
>  	unsigned long flags;
>  	bool unqueued = false;
>
>  	WARN_ON_ONCE(folio_ref_count(folio));
>  	WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg_charged(folio));
>
> -	ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
> -	if (!list_empty(&folio->_deferred_list)) {
> -		ds_queue->split_queue_len--;
> +	rcu_read_lock();
> +	l = list_lru_lock_irqsave(&deferred_split_lru, nid, folio_memcg(folio), &flags);
> +	if (__list_lru_del(&deferred_split_lru, l, &folio->_deferred_list, nid)) {

Maybe worth factoring __list_lru_del() into something that explicitly
references &folio->_deferred_list rather than open coding it in both places?

>  		if (folio_test_partially_mapped(folio)) {
>  			folio_clear_partially_mapped(folio);
>  			mod_mthp_stat(folio_order(folio),
>  				      MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
>  		}
> -		list_del_init(&folio->_deferred_list);
>  		unqueued = true;
>  	}
> -	split_queue_unlock_irqrestore(ds_queue, flags);
> +	list_lru_unlock_irqrestore(l, &flags);
> +	rcu_read_unlock();
>
>  	return unqueued;	/* useful for debug warnings */
>  }
> @@ -4383,7 +4295,9 @@ bool __folio_unqueue_deferred_split(struct folio *folio)
>  /* partially_mapped=false won't clear PG_partially_mapped folio flag */
>  void deferred_split_folio(struct folio *folio, bool partially_mapped)
>  {
> -	struct deferred_split *ds_queue;
> +	struct list_lru_one *l;
> +	int nid;
> +	struct mem_cgroup *memcg;
>  	unsigned long flags;
>
>  	/*
> @@ -4406,7 +4320,11 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
>  	if (folio_test_swapcache(folio))
>  		return;
>
> -	ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
> +	nid = folio_nid(folio);
> +
> +	rcu_read_lock();
> +	memcg = folio_memcg(folio);
> +	l = list_lru_lock_irqsave(&deferred_split_lru, nid, memcg, &flags);

Do we really need to hold the lock over all the below, but hmm we can't do
an irqsave/restore list_lru_add(), and maybe not worth adding one either,
OK.

Just seems odd to have <lock> <__unlocked_add_variant> <unlock>,
instinctively feels like it should be just <locked_add_variant>.

>  	if (partially_mapped) {
>  		if (!folio_test_partially_mapped(folio)) {
>  			folio_set_partially_mapped(folio);
> @@ -4414,36 +4332,20 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
>  				count_vm_event(THP_DEFERRED_SPLIT_PAGE);
>  			count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
>  			mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, 1);
> -

Always nice to remove these annoying random newlines :)

>  		}
>  	} else {
>  		/* partially mapped folios cannot become non-partially mapped */
>  		VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
>  	}
> -	if (list_empty(&folio->_deferred_list)) {
> -		struct mem_cgroup *memcg;
> -
> -		memcg = folio_split_queue_memcg(folio, ds_queue);
> -		list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
> -		ds_queue->split_queue_len++;
> -		if (memcg)
> -			set_shrinker_bit(memcg, folio_nid(folio),
> -					 shrinker_id(deferred_split_shrinker));
> -	}
> -	split_queue_unlock_irqrestore(ds_queue, flags);
> +	__list_lru_add(&deferred_split_lru, l, &folio->_deferred_list, nid, memcg);
> +	list_lru_unlock_irqrestore(l, &flags);
> +	rcu_read_unlock();
>  }
>
>  static unsigned long deferred_split_count(struct shrinker *shrink,
>  		struct shrink_control *sc)
>  {
> -	struct pglist_data *pgdata = NODE_DATA(sc->nid);
> -	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
> -
> -#ifdef CONFIG_MEMCG
> -	if (sc->memcg)
> -		ds_queue = &sc->memcg->deferred_split_queue;
> -#endif
> -	return READ_ONCE(ds_queue->split_queue_len);
> +	return list_lru_shrink_count(&deferred_split_lru, sc);

That's nice :)

>  }
>
>  static bool thp_underused(struct folio *folio)
> @@ -4473,45 +4375,47 @@ static bool thp_underused(struct folio *folio)
>  	return false;
>  }
>
> +static enum lru_status deferred_split_isolate(struct list_head *item,
> +					      struct list_lru_one *lru,
> +					      void *cb_arg)
> +{
> +	struct folio *folio = container_of(item, struct folio, _deferred_list);
> +	struct list_head *freeable = cb_arg;
> +
> +	if (folio_try_get(folio)) {
> +		list_lru_isolate_move(lru, item, freeable);
> +		return LRU_REMOVED;
> +	}
> +
> +	/* We lost race with folio_put() */

Hmm, in the original code, this comment is associated with the partially
mapped logic, BUT this seems actually correct: folio_try_get(), since it
does folio_ref_add_unless_zero(), only fails if the folio lost the race
with folio_put().

So I think you're more correct here, right?

> +	list_lru_isolate(lru, item);
> +	if (folio_test_partially_mapped(folio)) {
> +		folio_clear_partially_mapped(folio);
> +		mod_mthp_stat(folio_order(folio),
> +			      MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
> +	}
> +	return LRU_REMOVED;
> +}
> +
>  static unsigned long deferred_split_scan(struct shrinker *shrink,
>  		struct shrink_control *sc)
>  {
> -	struct deferred_split *ds_queue;
> -	unsigned long flags;
> +	LIST_HEAD(dispose);
>  	struct folio *folio, *next;
> -	int split = 0, i;
> -	struct folio_batch fbatch;
> +	int split = 0;
> +	unsigned long isolated;
>
> -	folio_batch_init(&fbatch);
> +	isolated = list_lru_shrink_walk_irq(&deferred_split_lru, sc,
> +					    deferred_split_isolate, &dispose);
>
> -retry:
> -	ds_queue = split_queue_lock_irqsave(sc->nid, sc->memcg, &flags);
> -	/* Take pin on all head pages to avoid freeing them under us */
> -	list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
> -							_deferred_list) {
> -		if (folio_try_get(folio)) {
> -			folio_batch_add(&fbatch, folio);
> -		} else if (folio_test_partially_mapped(folio)) {
> -			/* We lost race with folio_put() */
> -			folio_clear_partially_mapped(folio);
> -			mod_mthp_stat(folio_order(folio),
> -				      MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
> -		}
> -		list_del_init(&folio->_deferred_list);
> -		ds_queue->split_queue_len--;
> -		if (!--sc->nr_to_scan)
> -			break;
> -		if (!folio_batch_space(&fbatch))
> -			break;
> -	}
> -	split_queue_unlock_irqrestore(ds_queue, flags);
> -
> -	for (i = 0; i < folio_batch_count(&fbatch); i++) {
> +	list_for_each_entry_safe(folio, next, &dispose, _deferred_list) {
>  		bool did_split = false;
>  		bool underused = false;
> -		struct deferred_split *fqueue;
> +		struct list_lru_one *l;
> +		unsigned long flags;
> +
> +		list_del_init(&folio->_deferred_list);
>
> -		folio = fbatch.folios[i];
>  		if (!folio_test_partially_mapped(folio)) {
>  			/*
>  			 * See try_to_map_unused_to_zeropage(): we cannot
> @@ -4534,64 +4438,32 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
>  		}
>  		folio_unlock(folio);
>  next:
> -		if (did_split || !folio_test_partially_mapped(folio))
> -			continue;
>  		/*
>  		 * Only add back to the queue if folio is partially mapped.
>  		 * If thp_underused returns false, or if split_folio fails
>  		 * in the case it was underused, then consider it used and
>  		 * don't add it back to split_queue.
>  		 */
> -		fqueue = folio_split_queue_lock_irqsave(folio, &flags);
> -		if (list_empty(&folio->_deferred_list)) {
> -			list_add_tail(&folio->_deferred_list, &fqueue->split_queue);
> -			fqueue->split_queue_len++;
> +		if (!did_split && folio_test_partially_mapped(folio)) {
> +			rcu_read_lock();
> +			l = list_lru_lock_irqsave(&deferred_split_lru,
> +						  folio_nid(folio),
> +						  folio_memcg(folio),
> +						  &flags);
> +			__list_lru_add(&deferred_split_lru, l,
> +				       &folio->_deferred_list,
> +				       folio_nid(folio), folio_memcg(folio));
> +			list_lru_unlock_irqrestore(l, &flags);

Hmm this does make me think it'd be nice to have a list_lru_add() variant
for irqsave/restore then, since it's a repeating pattern!

> +			rcu_read_unlock();
>  		}
> -		split_queue_unlock_irqrestore(fqueue, flags);
> -	}
> -	folios_put(&fbatch);
> -
> -	if (sc->nr_to_scan && !list_empty(&ds_queue->split_queue)) {
> -		cond_resched();
> -		goto retry;
> +		folio_put(folio);
>  	}
>
> -	/*
> -	 * Stop shrinker if we didn't split any page, but the queue is empty.
> -	 * This can happen if pages were freed under us.
> -	 */
> -	if (!split && list_empty(&ds_queue->split_queue))
> +	if (!split && !isolated)
>  		return SHRINK_STOP;
>  	return split;
>  }
>
> -#ifdef CONFIG_MEMCG
> -void reparent_deferred_split_queue(struct mem_cgroup *memcg)
> -{
> -	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
> -	struct deferred_split *ds_queue = &memcg->deferred_split_queue;
> -	struct deferred_split *parent_ds_queue = &parent->deferred_split_queue;
> -	int nid;
> -
> -	spin_lock_irq(&ds_queue->split_queue_lock);
> -	spin_lock_nested(&parent_ds_queue->split_queue_lock, SINGLE_DEPTH_NESTING);
> -
> -	if (!ds_queue->split_queue_len)
> -		goto unlock;
> -
> -	list_splice_tail_init(&ds_queue->split_queue, &parent_ds_queue->split_queue);
> -	parent_ds_queue->split_queue_len += ds_queue->split_queue_len;
> -	ds_queue->split_queue_len = 0;
> -
> -	for_each_node(nid)
> -		set_shrinker_bit(parent, nid, shrinker_id(deferred_split_shrinker));
> -
> -unlock:
> -	spin_unlock(&parent_ds_queue->split_queue_lock);
> -	spin_unlock_irq(&ds_queue->split_queue_lock);
> -}
> -#endif
> -
>  #ifdef CONFIG_DEBUG_FS
>  static void split_huge_pages_all(void)
>  {
> diff --git a/mm/internal.h b/mm/internal.h
> index f98f4746ac41..d8c737338df5 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -863,7 +863,7 @@ static inline bool folio_unqueue_deferred_split(struct folio *folio)
>  	/*
>  	 * At this point, there is no one trying to add the folio to
>  	 * deferred_list. If folio is not in deferred_list, it's safe
> -	 * to check without acquiring the split_queue_lock.
> +	 * to check without acquiring the list_lru lock.
>  	 */
>  	if (data_race(list_empty(&folio->_deferred_list)))
>  		return false;
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 4b0e59c7c0e6..b2ac28ddd480 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1081,6 +1081,7 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
>  	}
>
>  	count_vm_event(THP_COLLAPSE_ALLOC);
> +
>  	if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
>  		folio_put(folio);
>  		*foliop = NULL;
> @@ -1089,6 +1090,12 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
>
>  	count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1);

Do we want to put this stat counter underneath the below so it's not
incremented on fail?

>
> +	if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {
> +		folio_put(folio);
> +		*foliop = NULL;
> +		return SCAN_CGROUP_CHARGE_FAIL;

Do we not need to uncharge here?

> +	}
> +
>  	*foliop = folio;
>  	return SCAN_SUCCEED;
>  }
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index a47fb68dd65f..f381cb6bdff1 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -4015,11 +4015,6 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
>  	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
>  		memcg->cgwb_frn[i].done =
>  			__WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
> -#endif
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -	spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
> -	INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
> -	memcg->deferred_split_queue.split_queue_len = 0;
>  #endif
>  	lru_gen_init_memcg(memcg);
>  	return memcg;
> @@ -4167,11 +4162,10 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
>  	zswap_memcg_offline_cleanup(memcg);
>
>  	memcg_offline_kmem(memcg);
> -	reparent_deferred_split_queue(memcg);
>  	/*
> -	 * The reparenting of objcg must be after the reparenting of the
> -	 * list_lru and deferred_split_queue above, which ensures that they will
> -	 * not mistakenly get the parent list_lru and deferred_split_queue.
> +	 * The reparenting of objcg must be after the reparenting of
> +	 * the list_lru in memcg_offline_kmem(), which ensures that
> +	 * they will not mistakenly get the parent list_lru.
>  	 */
>  	memcg_reparent_objcgs(memcg);
>  	reparent_shrinker_deferred(memcg);
> diff --git a/mm/memory.c b/mm/memory.c
> index 219b9bf6cae0..e68ceb4aa624 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4651,13 +4651,19 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
>  	while (orders) {
>  		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>  		folio = vma_alloc_folio(gfp, order, vma, addr);
> -		if (folio) {
> -			if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
> -							    gfp, entry))
> -				return folio;
> +		if (!folio)
> +			goto next;
> +		if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, gfp, entry)) {
>  			count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
>  			folio_put(folio);
> +			goto next;
>  		}
> +		if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {

Do we need to uncharge here?

> +			folio_put(folio);
> +			goto fallback;
> +		}
> +		return folio;
> +next:
>  		count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK);
>  		order = next_order(&orders, order);
>  	}
> @@ -5169,24 +5175,28 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
>  	while (orders) {
>  		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>  		folio = vma_alloc_folio(gfp, order, vma, addr);
> -		if (folio) {
> -			if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
> -				count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> -				folio_put(folio);
> -				goto next;
> -			}
> -			folio_throttle_swaprate(folio, gfp);
> -			/*
> -			 * When a folio is not zeroed during allocation
> -			 * (__GFP_ZERO not used) or user folios require special
> -			 * handling, folio_zero_user() is used to make sure
> -			 * that the page corresponding to the faulting address
> -			 * will be hot in the cache after zeroing.
> -			 */
> -			if (user_alloc_needs_zeroing())
> -				folio_zero_user(folio, vmf->address);
> -			return folio;
> +		if (!folio)
> +			goto next;

This applies to the above equivalent refactorings, but maybe worth
separating out this:

if (folio) { ... big branch ... } ->
if (!folio) goto next; ... what was big branch ...

Refactoring into a separate patch? Makes it easier to see pertinent logic
changes + helps review/bisectability/fixes/etc. etc.


> +		if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
> +			count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> +			folio_put(folio);
> +			goto next;
>  		}
> +		if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {

Again, do we need to uncharge here?

> +			folio_put(folio);
> +			goto fallback;
> +		}
> +		folio_throttle_swaprate(folio, gfp);
> +		/*
> +		 * When a folio is not zeroed during allocation
> +		 * (__GFP_ZERO not used) or user folios require special
> +		 * handling, folio_zero_user() is used to make sure
> +		 * that the page corresponding to the faulting address
> +		 * will be hot in the cache after zeroing.
> +		 */
> +		if (user_alloc_needs_zeroing())
> +			folio_zero_user(folio, vmf->address);
> +		return folio;
>  next:
>  		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
>  		order = next_order(&orders, order);
> diff --git a/mm/mm_init.c b/mm/mm_init.c
> index cec7bb758bdd..f293a62e652a 100644
> --- a/mm/mm_init.c
> +++ b/mm/mm_init.c
> @@ -1388,19 +1388,6 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat,
>  	pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
>  }
>
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -static void pgdat_init_split_queue(struct pglist_data *pgdat)
> -{
> -	struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
> -
> -	spin_lock_init(&ds_queue->split_queue_lock);
> -	INIT_LIST_HEAD(&ds_queue->split_queue);
> -	ds_queue->split_queue_len = 0;
> -}
> -#else
> -static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
> -#endif
> -
>  #ifdef CONFIG_COMPACTION
>  static void pgdat_init_kcompactd(struct pglist_data *pgdat)
>  {
> @@ -1416,8 +1403,6 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
>
>  	pgdat_resize_init(pgdat);
>  	pgdat_kswapd_lock_init(pgdat);
> -
> -	pgdat_init_split_queue(pgdat);
>  	pgdat_init_kcompactd(pgdat);
>
>  	init_waitqueue_head(&pgdat->kswapd_wait);
> --
> 2.53.0
>

Overall a lovely amount of code deletion here :)

Thanks for doing this, Cheers Lorenzo
Re: [PATCH v3 7/7] mm: switch deferred split shrinker to list_lru
Posted by Johannes Weiner 1 week ago
Hey Lorenzo,

Thanks for taking an in-depth look at this!

Apologies for the late reply, I was traveling last week.

On Tue, Mar 24, 2026 at 01:48:06PM +0000, Lorenzo Stoakes (Oracle) wrote:
> On Wed, Mar 18, 2026 at 03:53:25PM -0400, Johannes Weiner wrote:
> > The deferred split queue handles cgroups in a suboptimal fashion. The
> > queue is per-NUMA node or per-cgroup, not the intersection. That means
> > on a cgrouped system, a node-restricted allocation entering reclaim
> > can end up splitting large pages on other nodes:
> >
> > 	alloc/unmap
> > 	  deferred_split_folio()
> > 	    list_add_tail(memcg->split_queue)
> > 	    set_shrinker_bit(memcg, node, deferred_shrinker_id)
> 
> So here it's:
> 
> __do_huge_pmd_anonymous_page() / do_huge_zero_wp_pmd()
> -> map_anon_folio_pmd_pf()
> -> map_anon_folio_pmd_nopf()
> -> deferred_split_folio()
> -> set_shrinker_bit()
> 
> Yeah it makes sense to make the first bit succinct anyway :)

Yep I was trying to keep the focus on those parts that interact with
the other stacks listed below. Not biblically accurate
deferred_split_folio() callsites ;)

> > 	for_each_zone_zonelist_nodemask(restricted_nodes)
> > 	  mem_cgroup_iter()
> > 	    shrink_slab(node, memcg)
> > 	      shrink_slab_memcg(node, memcg)
> > 	        if test_shrinker_bit(memcg, node, deferred_shrinker_id)
> 
> Hmm there's no such function, so this is kind of pseudocode, essentially?
> 
> Wouldn't it be clearer to reference:
> 
> shrink_slab_memcg() -> do_shrink_slab() -> shrinker->scan_objects ->
> deferred_split_scan()?
> 
> Though I get that it adds verbosity :)
> 
> Sorry not (just) being pedantic here, also so I can understand myself :)

Yes, it's pseudocode. Direct reclaim (shrink_zones) and kswapd
(wake_all_kswapds) do the "for all nodes" loops. Nested within them
are the loops that iterate the cgroup tree (shrink_node_memcgs,
shrink_many, depending on LRU vs MGLRU). And then nested within
*those* we have the shrink_slab() calls.

Again, I tried to keep the boilerplate low and to the point. The most
important thing is: 1) the shrinker bit gets set for each node, memcg
pair that has objects, 2) shrinker visits all nodes x memcgs with it
set, 3) but the shrinker scans happen on a per-memcg queue that is not node-aware.

Let me know if that works as is, or if you would like me to rephrase
that. I would like to keep it as simple as possible, but no simpler
than that ;)

> > 	          deferred_split_scan()
> > 	            walks memcg->split_queue
> 
> Ok so overall there's a per-memcg memcg->split_queue, but we set a bit in
> memcg->nodeinfo[nid]->shrinker_info->unit[blah]->map for it, and when we
> enter shrink_slab_memcg(), we figure out the shrink_control from the
> for_each_set_bit() across memcg->...->unit->map?
> 
> > The shrinker bit adds an imperfect guard rail. As soon as the cgroup
> > has a single large page on the node of interest, all large pages owned
> > by that memcg, including those on other nodes, will be split.
> 
> So at this point, regardless of node, we are invoking deferred_split_scan()
> and it's the same memcg->split_queue we are walking, regardless of node?
> 
> Do let me know if my understanding is correct here!
> 
> Hmm that does sound sub-optimal.

That is exactly what's happening. We have all this dance to be precise
about the shrinker bit, but the queue itself is not node-aware.

That makes for some pretty erratic behavior.

> > list_lru properly sets up per-node, per-cgroup lists. As a bonus, it
> > streamlines a lot of the list operations and reclaim walks. It's used
> > widely by other major shrinkers already. Convert the deferred split
> > queue as well.
> 
> It's odd that it wasn't used before?

The list_lru API was initially explicitly for *slab objects*, doing
virt_to_page() and mem_cgroup_from_slab_obj() on the passed in list
heads to derive the proper queue. It was only extended more recently
in 0a97c01cd20b ("list_lru: allow explicit memcg and NUMA node
selection") to allow passing in the queue routing information and
embedding the list heads in other things (such as folios).

The only missing piece now was the support for callside locking of the
list_lru structures.

> >  include/linux/huge_mm.h    |   6 +-
> >  include/linux/memcontrol.h |   4 -
> >  include/linux/mmzone.h     |  12 --
> >  mm/huge_memory.c           | 342 ++++++++++++-------------------------
> >  mm/internal.h              |   2 +-
> >  mm/khugepaged.c            |   7 +
> >  mm/memcontrol.c            |  12 +-
> >  mm/memory.c                |  52 +++---
> >  mm/mm_init.c               |  15 --
> >  9 files changed, 151 insertions(+), 301 deletions(-)
> >
> > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > index bd7f0e1d8094..8d801ed378db 100644
> > --- a/include/linux/huge_mm.h
> > +++ b/include/linux/huge_mm.h
> > @@ -414,10 +414,9 @@ static inline int split_huge_page(struct page *page)
> >  {
> >  	return split_huge_page_to_list_to_order(page, NULL, 0);
> >  }
> > +
> > +extern struct list_lru deferred_split_lru;
> 
> It might be nice for the sake of avoiding a global to instead expose this
> as a getter?
> 
> Or actually better, since every caller outside of huge_memory.c that
> references this uses folio_memcg_list_lru_alloc(), do something like:
> 
> int folio_memcg_alloc_deferred(struct folio *folio, gfp_t gfp);
> 
> in mm/huge_memory.c:
> 
> /**
>  * blah blah blah put on error blah
>  */
> int folio_memcg_alloc_deferred(struct folio *folio, gfp_t gfp)
> {
> 	int err;
> 
> 	err = folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfP);
> 	if (err) {
> 		folio_put(folio);
> 		return err;
> 	}
> 
> 	return 0;
> }
> 
> And then the callers can just invoke this, and you can make
> deferred_split_lru static in mm/huge_memory.c?

That sounds reasonable. Let me make this change.

> >  void deferred_split_folio(struct folio *folio, bool partially_mapped);
> > -#ifdef CONFIG_MEMCG
> > -void reparent_deferred_split_queue(struct mem_cgroup *memcg);
> > -#endif
> >
> >  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
> >  		unsigned long address, bool freeze);
> > @@ -650,7 +649,6 @@ static inline int try_folio_split_to_order(struct folio *folio,
> >  }
> >
> >  static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
> > -static inline void reparent_deferred_split_queue(struct mem_cgroup *memcg) {}
> >  #define split_huge_pmd(__vma, __pmd, __address)	\
> >  	do { } while (0)
> >
> > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> > index 086158969529..0782c72a1997 100644
> > --- a/include/linux/memcontrol.h
> > +++ b/include/linux/memcontrol.h
> > @@ -277,10 +277,6 @@ struct mem_cgroup {
> >  	struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT];
> >  #endif
> >
> > -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> > -	struct deferred_split deferred_split_queue;
> > -#endif
> > -
> >  #ifdef CONFIG_LRU_GEN_WALKS_MMU
> >  	/* per-memcg mm_struct list */
> >  	struct lru_gen_mm_list mm_list;
> > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> > index 7bd0134c241c..232b7a71fd69 100644
> > --- a/include/linux/mmzone.h
> > +++ b/include/linux/mmzone.h
> > @@ -1429,14 +1429,6 @@ struct zonelist {
> >   */
> >  extern struct page *mem_map;
> >
> > -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> > -struct deferred_split {
> > -	spinlock_t split_queue_lock;
> > -	struct list_head split_queue;
> > -	unsigned long split_queue_len;
> > -};
> > -#endif
> > -
> >  #ifdef CONFIG_MEMORY_FAILURE
> >  /*
> >   * Per NUMA node memory failure handling statistics.
> > @@ -1562,10 +1554,6 @@ typedef struct pglist_data {
> >  	unsigned long first_deferred_pfn;
> >  #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
> >
> > -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> > -	struct deferred_split deferred_split_queue;
> > -#endif
> > -
> >  #ifdef CONFIG_NUMA_BALANCING
> >  	/* start time in ms of current promote rate limit period */
> >  	unsigned int nbp_rl_start;
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 3fc02913b63e..e90d08db219d 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -14,6 +14,7 @@
> >  #include <linux/mmu_notifier.h>
> >  #include <linux/rmap.h>
> >  #include <linux/swap.h>
> > +#include <linux/list_lru.h>
> >  #include <linux/shrinker.h>
> >  #include <linux/mm_inline.h>
> >  #include <linux/swapops.h>
> > @@ -67,6 +68,8 @@ unsigned long transparent_hugepage_flags __read_mostly =
> >  	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
> >  	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
> >
> > +static struct lock_class_key deferred_split_key;
> > +struct list_lru deferred_split_lru;
> >  static struct shrinker *deferred_split_shrinker;
> >  static unsigned long deferred_split_count(struct shrinker *shrink,
> >  					  struct shrink_control *sc);
> > @@ -919,6 +922,13 @@ static int __init thp_shrinker_init(void)
> >  	if (!deferred_split_shrinker)
> >  		return -ENOMEM;
> >
> > +	if (list_lru_init_memcg_key(&deferred_split_lru,
> > +				    deferred_split_shrinker,
> > +				    &deferred_split_key)) {
> > +		shrinker_free(deferred_split_shrinker);
> > +		return -ENOMEM;
> > +	}
> > +
> 
> It's kind of out of scope for the series, but I hate that the huge zero
> folio stuff has an early exit for persistent but if not, falls through to
> trying to set that up - it'd be nice to split out the huge zero folio stuff
> into another function.
> 
> But probably one for a follow up!

It doesn't bother me personally, but I wouldn't object to it being
moved to its own function and have the caller do necessary cleanups.

But yeah sounds like an unrelated refactor :)

> >  	deferred_split_shrinker->count_objects = deferred_split_count;
> >  	deferred_split_shrinker->scan_objects = deferred_split_scan;
> >  	shrinker_register(deferred_split_shrinker);
> > @@ -939,6 +949,7 @@ static int __init thp_shrinker_init(void)
> >
> >  	huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero");
> >  	if (!huge_zero_folio_shrinker) {
> > +		list_lru_destroy(&deferred_split_lru);
> >  		shrinker_free(deferred_split_shrinker);
> 
> Presumably no probably-impossible-in-reality race on somebody entering the
> shrinker and referencing the deferred_split_lru before the shrinker is freed?

Ah right, I think for clarity it would indeed be better to destroy the
shrinker, then the queue. Let me re-order this one.

But yes, in practice, none of the above fails. If we have trouble
doing a couple of small kmallocs during a subsys_initcall(), that
machine is unlikely to finish booting, let alone allocate enough
memory to enter the THP shrinker.

> > @@ -953,6 +964,7 @@ static int __init thp_shrinker_init(void)
> >  static void __init thp_shrinker_exit(void)
> >  {
> >  	shrinker_free(huge_zero_folio_shrinker);
> > +	list_lru_destroy(&deferred_split_lru);
> 
> Same question above as to race/ordering.

... and this one as well.

> >  	shrinker_free(deferred_split_shrinker);
> >  }

> > @@ -3854,34 +3761,34 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
> >  	struct folio *end_folio = folio_next(folio);
> >  	struct folio *new_folio, *next;
> >  	int old_order = folio_order(folio);
> > +	struct list_lru_one *l;
> 
> Nit, and maybe this is a convention, but hate single letter variable names,
> 'lru' or something might be nicer?

Yeah I stuck with the list_lru internal naming, which uses `lru` for
the struct list_lru, and `l` for struct list_lru_one. I suppose that
was fine for the very domain-specific code and short functions in
there, but it's grating in large, general MM functions like these.

Since `lru` is taken, any preferences? llo?

> > +	bool dequeue_deferred;
> >  	int ret = 0;
> > -	struct deferred_split *ds_queue;
> >
> >  	VM_WARN_ON_ONCE(!mapping && end);
> >  	/* Prevent deferred_split_scan() touching ->_refcount */
> > -	ds_queue = folio_split_queue_lock(folio);
> > +	dequeue_deferred = folio_test_anon(folio) && old_order > 1;
> 
> Why anon? (This review is partly me learning about the shrinker, an area
> I'm weak on :)

This is the type of folios that can be queued through
deferred_split_folio(). The order check is inside that function
itself; and the function is only called for anon pages.

File pages are split differently, they don't use the deferred split
shrinker and we don't allocate list_lru heads for them. However, they
still come through this split path here, and so I need to gate whether
it's safe to do list_lru_lock(), __list_lru_del() etc.

> > +	if (dequeue_deferred) {
> > +		rcu_read_lock();
> > +		l = list_lru_lock(&deferred_split_lru,
> > +				  folio_nid(folio), folio_memcg(folio));
> 
> Hm don't adore this sort of almost 'hidden' RCU lock here, but this
> function is pretty disgusting and needs serious work in general.
> 
> And any function that took the RCU lock and list_lru lock/did the unlock
> equivalent would be equally horrible so yeah, I guess needs deferring to a
> refactor.
> 
> OTOH, this could be a good excuse for us to pay down some technical debt
> and split out for instance the folio_ref_freeze() bits?
> 
> Could we do something like:
> 
> 	bool frozen;
> 
> 	...
> 
> 	dequeue_deferred = folio_test_anon(folio) && old_order > 1;
> 	frozen = folio_ref_freeze(folio, folio_cache_ref_count(folio) + 1);
> 
> 	if (dequeue_deferred && frozen) {
> 		struct list_lru_one *lru;
> 
> 		rcu_read_lock();
> 		lru = list_lru_lock(&deferred_split_lru,
> 				    folio_nid(folio), folio_memcg(folio));
> 		__list_lru_del(&deferred_split_lru, lru,
> 			       &folio->_deferred_list, folio_nid(folio));
> 		if (folio_test_partially_mapped(folio)) {
> 			folio_clear_partially_mapped(folio);
> 			mod_mthp_stat(old_order,
> 				MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
> 		}
> 		list_lru_unlock(lru);
> 		rcu_read_unlock();
> 	}
> 
> 	if (!frozen)
> 		return -EAGAIN;
> 
> 	... rest of logic now one indent level lower ...
> 
> Or maybe factor that out into a helper function or something?
> 
> static void execute_deferred_dequeue(...) { ... }
> 
> With this implemented either way you'd be able to get rid of the else block
> too.
> 
> obviously only valid if you are able to do the freezing earlier?

The reason it locks the list_lru first is because there can be a race
between the shrinker and a synchronous split attempt.

This is what that comment above it is about:

	/* Prevent deferred_split_scan() touching ->_refcount */

If the shrinker fails to "get" the folio while holding the shrinker
lock, it thinks it beat the free path to the list_lru lock and will
feel responsible for cleaning up the deferred split state:

/* caller holds the list_lru lock already */
static enum lru_status deferred_split_isolate(struct list_head *item,
					      struct list_lru_one *lru,
					      void *cb_arg)
{
	struct folio *folio = container_of(item, struct folio, _deferred_list);
	struct list_head *freeable = cb_arg;

	if (folio_try_get(folio)) {
		list_lru_isolate_move(lru, item, freeable);
		return LRU_REMOVED;
	}

	/* We lost race with folio_put() */
	list_lru_isolate(lru, item);
	if (folio_test_partially_mapped(folio)) {
		folio_clear_partially_mapped(folio);
		mod_mthp_stat(folio_order(folio),
			      MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
	}
	return LRU_REMOVED;
}

So if the split path freezes before acquiring the list_lru lock, we
get them both stepping on the deferred split state.

This may or may not be safe given the current manifestation of
list_lru_del() and the folio_test_partially_mapped() tests. But it
feels pretty hairy to let them race and rely on individual tests into
what should be a coherent aggregate state.

> > +	}
> >  	if (folio_ref_freeze(folio, folio_cache_ref_count(folio) + 1)) {
> >  		struct swap_cluster_info *ci = NULL;
> >  		struct lruvec *lruvec;
> >
> > -		if (old_order > 1) {
> 
> Before was this also applicable to non-anon folios?

This branch, yes. But non-anon pages would then fail this:

> > -			if (!list_empty(&folio->_deferred_list)) {
> > -				ds_queue->split_queue_len--;
> > -				/*
> > -				 * Reinitialize page_deferred_list after removing the
> > -				 * page from the split_queue, otherwise a subsequent
> > -				 * split will see list corruption when checking the
> > -				 * page_deferred_list.
> > -				 */
> > -				list_del_init(&folio->_deferred_list);
> > -			}

..since they aren't ever added. This was okay because accessing that
list head is always safe. Now I need to be explicit to determine if
it's safe to call __list_lru_del() which does all these list_lru
references, which aren't allocated for file.

> > +		if (dequeue_deferred) {
> > +			__list_lru_del(&deferred_split_lru, l,
> > +				       &folio->_deferred_list, folio_nid(folio));
> >  			if (folio_test_partially_mapped(folio)) {
> >  				folio_clear_partially_mapped(folio);
> >  				mod_mthp_stat(old_order,
> >  					MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
> >  			}

And honestly, I think it's nicer to be explicit here about the
expectations of which pages get which treatment.

Filtering file pages on !list_empty() was not very obvious.

> > +	int nid = folio_nid(folio);
> >  	unsigned long flags;
> >  	bool unqueued = false;
> >
> >  	WARN_ON_ONCE(folio_ref_count(folio));
> >  	WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg_charged(folio));
> >
> > -	ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
> > -	if (!list_empty(&folio->_deferred_list)) {
> > -		ds_queue->split_queue_len--;
> > +	rcu_read_lock();
> > +	l = list_lru_lock_irqsave(&deferred_split_lru, nid, folio_memcg(folio), &flags);
> > +	if (__list_lru_del(&deferred_split_lru, l, &folio->_deferred_list, nid)) {
> 
> Maybe worth factoring __list_lru_del() into something that explicitly
> references &folio->_deferred_list rather than open codingin both places?

Hm, I wouldn't want to encode this into list_lru API, but we could do
a huge_memory.c-local helper?

folio_deferred_split_del(folio, l, nid)

> > @@ -4406,7 +4320,11 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
> >  	if (folio_test_swapcache(folio))
> >  		return;
> >
> > -	ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
> > +	nid = folio_nid(folio);
> > +
> > +	rcu_read_lock();
> > +	memcg = folio_memcg(folio);
> > +	l = list_lru_lock_irqsave(&deferred_split_lru, nid, memcg, &flags);
> 
> Do we really need to hold the lock over all the below, but hmm we can't do
> an irqsave/restore list_lru_add(), and maybe not worth adding one either,
> OK.
> 
> Just seems odd to have <lock> <__unlocked_add_variant> <unlock>,
> instinctively feels like it should be just <locked_add_variant>.

It protects that auxiliary folio_test_partially_mapped() state that
the shrinker also touches. LRU + that page flag + the counts is what
that lock protects.

> > @@ -4473,45 +4375,47 @@ static bool thp_underused(struct folio *folio)
> >  	return false;
> >  }
> >
> > +static enum lru_status deferred_split_isolate(struct list_head *item,
> > +					      struct list_lru_one *lru,
> > +					      void *cb_arg)
> > +{
> > +	struct folio *folio = container_of(item, struct folio, _deferred_list);
> > +	struct list_head *freeable = cb_arg;
> > +
> > +	if (folio_try_get(folio)) {
> > +		list_lru_isolate_move(lru, item, freeable);
> > +		return LRU_REMOVED;
> > +	}
> > +
> > +	/* We lost race with folio_put() */
> 
> Hmm, in the original code, this comment is associated with the partially
> mapped logic, BUT this seems actually correct, because folio_try_get()
> because it does folio_ref_add_unless_zero() only fails if the folio lost
> the race.
> 
> So I think you're more correct right?

I think the original placement was because that else if made it
awkward to place where it should be. But yes, the above is more
correct: the comment refers to what happens when the "get" fails.

> > -	ds_queue = split_queue_lock_irqsave(sc->nid, sc->memcg, &flags);
> > -	/* Take pin on all head pages to avoid freeing them under us */
> > -	list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
> > -							_deferred_list) {
> > -		if (folio_try_get(folio)) {
> > -			folio_batch_add(&fbatch, folio);
> > -		} else if (folio_test_partially_mapped(folio)) {
> > -			/* We lost race with folio_put() */
> > -			folio_clear_partially_mapped(folio);
> > -			mod_mthp_stat(folio_order(folio),
> > -				      MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
> > -		}

> > @@ -4534,64 +4438,32 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
> >  		}
> >  		folio_unlock(folio);
> >  next:
> > -		if (did_split || !folio_test_partially_mapped(folio))
> > -			continue;
> >  		/*
> >  		 * Only add back to the queue if folio is partially mapped.
> >  		 * If thp_underused returns false, or if split_folio fails
> >  		 * in the case it was underused, then consider it used and
> >  		 * don't add it back to split_queue.
> >  		 */
> > -		fqueue = folio_split_queue_lock_irqsave(folio, &flags);
> > -		if (list_empty(&folio->_deferred_list)) {
> > -			list_add_tail(&folio->_deferred_list, &fqueue->split_queue);
> > -			fqueue->split_queue_len++;
> > +		if (!did_split && folio_test_partially_mapped(folio)) {
> > +			rcu_read_lock();
> > +			l = list_lru_lock_irqsave(&deferred_split_lru,
> > +						  folio_nid(folio),
> > +						  folio_memcg(folio),
> > +						  &flags);
> > +			__list_lru_add(&deferred_split_lru, l,
> > +				       &folio->_deferred_list,
> > +				       folio_nid(folio), folio_memcg(folio));
> > +			list_lru_unlock_irqrestore(l, &flags);
> 
> Hmm this does make me think it'd be nice to have a list_lru_add() variant
> for irqsave/restore then, since it's a repeating pattern!

Yeah, this site calls for it the most :( I tried to balance callsite
prettiness with the need to extend the list_lru api; it's just one
caller. And the possible mutations and variants with these locks is
seemingly endless once you open that can of worms...

Case in point: this is process context and we could use
spin_lock_irq() here. I'm just using list_lru_lock_irqsave() because
that's the common variant used by the add and del paths already.

If I went with a helper, I could do list_lru_add_irq().

I think it would actually nicely mirror the list_lru_shrink_walk_irq()
a few lines up.

> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -1081,6 +1081,7 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
> >  	}
> >
> >  	count_vm_event(THP_COLLAPSE_ALLOC);
> > +
> >  	if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
> >  		folio_put(folio);
> >  		*foliop = NULL;
> > @@ -1089,6 +1090,12 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
> >
> >  	count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1);
> 
> Do we want to put this stat counter underneath the below so it's not
> incremented on fail?

We currently increment it also when the cgroup charge fails, which is
common. The list_lru allocation (a smaller kmalloc) in turn should
basically never fail - we just managed to get a PMD-sized folio.

Executive summary being: the way we bump them currently is weird. I
tried to stay out of it to keep this series on track ;)

> > +	if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {
> > +		folio_put(folio);
> > +		*foliop = NULL;
> > +		return SCAN_CGROUP_CHARGE_FAIL;
> 
> Do we not need to uncharge here?

folio_put() does that. We could do it for symmetry, but there is no
need. And the hard and fast rule is that folio_memcg() is immutable
until the put hits 0, so it would look weird/unexpected.

> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index a47fb68dd65f..f381cb6bdff1 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -4015,11 +4015,6 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
> >  	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
> >  		memcg->cgwb_frn[i].done =
> >  			__WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
> > -#endif
> > -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> > -	spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
> > -	INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
> > -	memcg->deferred_split_queue.split_queue_len = 0;
> >  #endif
> >  	lru_gen_init_memcg(memcg);
> >  	return memcg;
> > @@ -4167,11 +4162,10 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
> >  	zswap_memcg_offline_cleanup(memcg);
> >
> >  	memcg_offline_kmem(memcg);
> > -	reparent_deferred_split_queue(memcg);
> >  	/*
> > -	 * The reparenting of objcg must be after the reparenting of the
> > -	 * list_lru and deferred_split_queue above, which ensures that they will
> > -	 * not mistakenly get the parent list_lru and deferred_split_queue.
> > +	 * The reparenting of objcg must be after the reparenting of
> > +	 * the list_lru in memcg_offline_kmem(), which ensures that
> > +	 * they will not mistakenly get the parent list_lru.
> >  	 */
> >  	memcg_reparent_objcgs(memcg);
> >  	reparent_shrinker_deferred(memcg);
> > diff --git a/mm/memory.c b/mm/memory.c
> > index 219b9bf6cae0..e68ceb4aa624 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -4651,13 +4651,19 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
> >  	while (orders) {
> >  		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
> >  		folio = vma_alloc_folio(gfp, order, vma, addr);
> > -		if (folio) {
> > -			if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
> > -							    gfp, entry))
> > -				return folio;
> > +		if (!folio)
> > +			goto next;
> > +		if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, gfp, entry)) {
> >  			count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
> >  			folio_put(folio);
> > +			goto next;
> >  		}
> > +		if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {
> 
> Do we need to uncharge here?

Same here. It's folio state that the put cleans up.

> > +			folio_put(folio);
> > +			goto fallback;
> > +		}
> > +		return folio;
> > +next:
> >  		count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK);
> >  		order = next_order(&orders, order);
> >  	}
> > @@ -5169,24 +5175,28 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
> >  	while (orders) {
> >  		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
> >  		folio = vma_alloc_folio(gfp, order, vma, addr);
> > -		if (folio) {
> > -			if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
> > -				count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> > -				folio_put(folio);
> > -				goto next;
> > -			}
> > -			folio_throttle_swaprate(folio, gfp);
> > -			/*
> > -			 * When a folio is not zeroed during allocation
> > -			 * (__GFP_ZERO not used) or user folios require special
> > -			 * handling, folio_zero_user() is used to make sure
> > -			 * that the page corresponding to the faulting address
> > -			 * will be hot in the cache after zeroing.
> > -			 */
> > -			if (user_alloc_needs_zeroing())
> > -				folio_zero_user(folio, vmf->address);
> > -			return folio;
> > +		if (!folio)
> > +			goto next;
> 
> This applies to the above equivalent refactorings, but maybe worth
> separating out this:
> 
> if (folio) { ... big branch ... } ->
> if (!folio) goto next; ... what was big branch ...
> 
> Refactoring into a separate patch? Makes it easier to see pertinent logic
> changes + helps review/bisectability/fixes/etc. etc.

Will do.

> > +		if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
> > +			count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> > +			folio_put(folio);
> > +			goto next;
> >  		}
> > +		if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {
> 
> Again, do we need to uncharge here?

folio_put() does it :)

> > +			folio_put(folio);
> > +			goto fallback;
> > +		}
> > +		folio_throttle_swaprate(folio, gfp);
> > +		/*
> > +		 * When a folio is not zeroed during allocation
> > +		 * (__GFP_ZERO not used) or user folios require special
> > +		 * handling, folio_zero_user() is used to make sure
> > +		 * that the page corresponding to the faulting address
> > +		 * will be hot in the cache after zeroing.
> > +		 */
> > +		if (user_alloc_needs_zeroing())
> > +			folio_zero_user(folio, vmf->address);
> > +		return folio;
> >  next:
> >  		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
> >  		order = next_order(&orders, order);
> > diff --git a/mm/mm_init.c b/mm/mm_init.c
> > index cec7bb758bdd..f293a62e652a 100644
> > --- a/mm/mm_init.c
> > +++ b/mm/mm_init.c
> > @@ -1388,19 +1388,6 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat,
> >  	pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
> >  }
> >
> > -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> > -static void pgdat_init_split_queue(struct pglist_data *pgdat)
> > -{
> > -	struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
> > -
> > -	spin_lock_init(&ds_queue->split_queue_lock);
> > -	INIT_LIST_HEAD(&ds_queue->split_queue);
> > -	ds_queue->split_queue_len = 0;
> > -}
> > -#else
> > -static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
> > -#endif
> > -
> >  #ifdef CONFIG_COMPACTION
> >  static void pgdat_init_kcompactd(struct pglist_data *pgdat)
> >  {
> > @@ -1416,8 +1403,6 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
> >
> >  	pgdat_resize_init(pgdat);
> >  	pgdat_kswapd_lock_init(pgdat);
> > -
> > -	pgdat_init_split_queue(pgdat);
> >  	pgdat_init_kcompactd(pgdat);
> >
> >  	init_waitqueue_head(&pgdat->kswapd_wait);
> > --
> > 2.53.0
> >
> 
> Overall a lovely amount of code deletion here :)
> 
> Thanks for doing this, Cheers Lorenzo

Thanks for your feedback!
Re: [PATCH v3 7/7] mm: switch deferred split shrinker to list_lru
Posted by Lorenzo Stoakes (Oracle) 4 days, 23 hours ago
On Mon, Mar 30, 2026 at 12:40:22PM -0400, Johannes Weiner wrote:
> Hey Lorenzo,
>
> Thanks for taking an in-depth look at this!
>
> Apologies for the late reply, I was traveling last week.
>
> On Tue, Mar 24, 2026 at 01:48:06PM +0000, Lorenzo Stoakes (Oracle) wrote:
> > On Wed, Mar 18, 2026 at 03:53:25PM -0400, Johannes Weiner wrote:
> > > The deferred split queue handles cgroups in a suboptimal fashion. The
> > > queue is per-NUMA node or per-cgroup, not the intersection. That means
> > > on a cgrouped system, a node-restricted allocation entering reclaim
> > > can end up splitting large pages on other nodes:
> > >
> > > 	alloc/unmap
> > > 	  deferred_split_folio()
> > > 	    list_add_tail(memcg->split_queue)
> > > 	    set_shrinker_bit(memcg, node, deferred_shrinker_id)
> >
> > So here it's:
> >
> > __do_huge_pmd_anonymous_page() / do_huge_zero_wp_pmd()
> > -> map_anon_folio_pmd_pf()
> > -> map_anon_folio_pmd_nopf()
> > -> deferred_split_folio()
> > -> set_shrinker_bit()
> >
> > Yeah it makes sense to make the first bit succinct anyway :)
>
> Yep I was trying to keep the focus on those parts that interact with
> the other stacks listed below. Not biblically accurate
> deferred_split_folio() callsites ;)
>
> > > 	for_each_zone_zonelist_nodemask(restricted_nodes)
> > > 	  mem_cgroup_iter()
> > > 	    shrink_slab(node, memcg)
> > > 	      shrink_slab_memcg(node, memcg)
> > > 	        if test_shrinker_bit(memcg, node, deferred_shrinker_id)
> >
> > Hmm there's no such function - is this kind of pseudocode, essentially?
> >
> > Wouldn't it be clearer to reference:
> >
> > shrink_slab_memcg() -> do_shrink_slab() -> shrinker->scan_objects ->
> > deferred_split_scan()?
> >
> > Though I get that it adds verbosity :)
> >
> > Sorry not (just) being pedantic here, also so I can understand myself :)
>
> Yes, it's pseudocode. Direct reclaim (shrink_zones) and kswapd
> (wake_all_kswapds) do the "for all nodes" loops. Nested within them
> are the loops that iterate the cgroup tree (shrink_node_memcgs,
> shrink_many, depending on LRU vs MGLRU). And then nested within
> *those* we have the shrink_slab() calls.
>
> Again, I tried to keep the boiler plate low and to the point. The most
> important thing is: 1) the shrinker bit gets set for each node, memcg
> pair that has objects, 2) shrinker visits all nodes x memcgs with it
> set 3) but the shrinker scans happen on per-memcg queue, not node-aware.
>
> Let me know if that works as is, or if you would like me to rephrase
> that. I would like to keep it as simple as possible, but no simpler
> than that ;)

Yeah that's fine, maybe just add a little note to say 'simplified for clarity'
or something so very-literal nutters like me don't get confused ;)

>
> > > 	          deferred_split_scan()
> > > 	            walks memcg->split_queue
> >
> > Ok so overall there's a per-memcg memcg->split_queue, but we set a bit in
> > memcg->nodeinfo[nid]->shrinker_info->unit[blah]->map for it, and when we
> > enter shrink_slab_memcg(), we figure out the shrink_control from the
> > for_each_set_bit() across memcg->...->unit->map?
> >
> > > The shrinker bit adds an imperfect guard rail. As soon as the cgroup
> > > has a single large page on the node of interest, all large pages owned
> > > by that memcg, including those on other nodes, will be split.
> >
> > So at this point, regardless of node, we are invoking deferred_split_scan()
> > and it's the same memcg->split_queue we are walking, regardless of node?
> >
> > Do let me know if my understanding is correct here!
> >
> > Hmm that does sound sub-optimal.
>
> That is exactly what's happening. We have all this dance to be precise
> about the shrinker bit, but the queue itself is not node-aware.
>
> That makes for some pretty erratic behavior.

Yeah that sucks!

>
> > > list_lru properly sets up per-node, per-cgroup lists. As a bonus, it
> > > streamlines a lot of the list operations and reclaim walks. It's used
> > > widely by other major shrinkers already. Convert the deferred split
> > > queue as well.
> >
> > It's odd that it wasn't used before?
>
> The list_lru API was initially explicitly for *slab objects*, doing
> virt_to_page() and mem_cgroup_from_slab_obj() on the passed in list
> heads to derive the proper queue. It was only extended more recently
> in 0a97c01cd20b ("list_lru: allow explicit memcg and NUMA node
> selection") to allow passing in the queue routing information and
> embedding the list heads in other things (such as folios).
>
> The only missing piece now was the support for callside locking of the
> list_lru structures.

Ack

>
> > >  include/linux/huge_mm.h    |   6 +-
> > >  include/linux/memcontrol.h |   4 -
> > >  include/linux/mmzone.h     |  12 --
> > >  mm/huge_memory.c           | 342 ++++++++++++-------------------------
> > >  mm/internal.h              |   2 +-
> > >  mm/khugepaged.c            |   7 +
> > >  mm/memcontrol.c            |  12 +-
> > >  mm/memory.c                |  52 +++---
> > >  mm/mm_init.c               |  15 --
> > >  9 files changed, 151 insertions(+), 301 deletions(-)
> > >
> > > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > > index bd7f0e1d8094..8d801ed378db 100644
> > > --- a/include/linux/huge_mm.h
> > > +++ b/include/linux/huge_mm.h
> > > @@ -414,10 +414,9 @@ static inline int split_huge_page(struct page *page)
> > >  {
> > >  	return split_huge_page_to_list_to_order(page, NULL, 0);
> > >  }
> > > +
> > > +extern struct list_lru deferred_split_lru;
> >
> > It might be nice for the sake of avoiding a global to instead expose this
> > as a getter?
> >
> > Or actually better, since every caller outside of huge_memory.c that
> > references this uses folio_memcg_list_lru_alloc(), do something like:
> >
> > int folio_memcg_alloc_deferred(struct folio *folio, gfp_t gfp);
> >
> > in mm/huge_memory.c:
> >
> > /**
> >  * blah blah blah put on error blah
> >  */
> > int folio_memcg_alloc_deferred(struct folio *folio, gfp_t gfp)
> > {
> > 	int err;
> >
> > 	err = folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfP);
> > 	if (err) {
> > 		folio_put(folio);
> > 		return err;
> > 	}
> >
> > 	return 0;
> > }
> >
> > And then the callers can just invoke this, and you can make
> > deferred_split_lru static in mm/huge_memory.c?
>
> That sounds reasonable. Let me make this change.

Thanks!

>
> > >  void deferred_split_folio(struct folio *folio, bool partially_mapped);
> > > -#ifdef CONFIG_MEMCG
> > > -void reparent_deferred_split_queue(struct mem_cgroup *memcg);
> > > -#endif
> > >
> > >  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
> > >  		unsigned long address, bool freeze);
> > > @@ -650,7 +649,6 @@ static inline int try_folio_split_to_order(struct folio *folio,
> > >  }
> > >
> > >  static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
> > > -static inline void reparent_deferred_split_queue(struct mem_cgroup *memcg) {}
> > >  #define split_huge_pmd(__vma, __pmd, __address)	\
> > >  	do { } while (0)
> > >
> > > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> > > index 086158969529..0782c72a1997 100644
> > > --- a/include/linux/memcontrol.h
> > > +++ b/include/linux/memcontrol.h
> > > @@ -277,10 +277,6 @@ struct mem_cgroup {
> > >  	struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT];
> > >  #endif
> > >
> > > -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> > > -	struct deferred_split deferred_split_queue;
> > > -#endif
> > > -
> > >  #ifdef CONFIG_LRU_GEN_WALKS_MMU
> > >  	/* per-memcg mm_struct list */
> > >  	struct lru_gen_mm_list mm_list;
> > > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> > > index 7bd0134c241c..232b7a71fd69 100644
> > > --- a/include/linux/mmzone.h
> > > +++ b/include/linux/mmzone.h
> > > @@ -1429,14 +1429,6 @@ struct zonelist {
> > >   */
> > >  extern struct page *mem_map;
> > >
> > > -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> > > -struct deferred_split {
> > > -	spinlock_t split_queue_lock;
> > > -	struct list_head split_queue;
> > > -	unsigned long split_queue_len;
> > > -};
> > > -#endif
> > > -
> > >  #ifdef CONFIG_MEMORY_FAILURE
> > >  /*
> > >   * Per NUMA node memory failure handling statistics.
> > > @@ -1562,10 +1554,6 @@ typedef struct pglist_data {
> > >  	unsigned long first_deferred_pfn;
> > >  #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
> > >
> > > -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> > > -	struct deferred_split deferred_split_queue;
> > > -#endif
> > > -
> > >  #ifdef CONFIG_NUMA_BALANCING
> > >  	/* start time in ms of current promote rate limit period */
> > >  	unsigned int nbp_rl_start;
> > > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > > index 3fc02913b63e..e90d08db219d 100644
> > > --- a/mm/huge_memory.c
> > > +++ b/mm/huge_memory.c
> > > @@ -14,6 +14,7 @@
> > >  #include <linux/mmu_notifier.h>
> > >  #include <linux/rmap.h>
> > >  #include <linux/swap.h>
> > > +#include <linux/list_lru.h>
> > >  #include <linux/shrinker.h>
> > >  #include <linux/mm_inline.h>
> > >  #include <linux/swapops.h>
> > > @@ -67,6 +68,8 @@ unsigned long transparent_hugepage_flags __read_mostly =
> > >  	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
> > >  	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
> > >
> > > +static struct lock_class_key deferred_split_key;
> > > +struct list_lru deferred_split_lru;
> > >  static struct shrinker *deferred_split_shrinker;
> > >  static unsigned long deferred_split_count(struct shrinker *shrink,
> > >  					  struct shrink_control *sc);
> > > @@ -919,6 +922,13 @@ static int __init thp_shrinker_init(void)
> > >  	if (!deferred_split_shrinker)
> > >  		return -ENOMEM;
> > >
> > > +	if (list_lru_init_memcg_key(&deferred_split_lru,
> > > +				    deferred_split_shrinker,
> > > +				    &deferred_split_key)) {
> > > +		shrinker_free(deferred_split_shrinker);
> > > +		return -ENOMEM;
> > > +	}
> > > +
> >
> > It's kind of out of scope for the series, but I hate that the huge zero
> > folio stuff has an early exit for persistent but if not, falls through to
> > trying to set that up - it'd be nice to split out the huge zero folio stuff
> > into another function.
> >
> > But probably one for a follow up!
>
> It doesn't bother me personally, but I wouldn't object to it being
> moved to its own function and have the caller do necessary cleanups.
>
> But yeah sounds like an unrelated refactor :)

Yeah it is entirely unrelated for sure, can be an optional follow up!

>
> > >  	deferred_split_shrinker->count_objects = deferred_split_count;
> > >  	deferred_split_shrinker->scan_objects = deferred_split_scan;
> > >  	shrinker_register(deferred_split_shrinker);
> > > @@ -939,6 +949,7 @@ static int __init thp_shrinker_init(void)
> > >
> > >  	huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero");
> > >  	if (!huge_zero_folio_shrinker) {
> > > +		list_lru_destroy(&deferred_split_lru);
> > >  		shrinker_free(deferred_split_shrinker);
> >
> > Presumably no probably-impossible-in-reality race on somebody entering the
> > shrinker and referencing the deferred_split_lru before the shrinker is freed?
>
> Ah right, I think for clarity it would indeed be better to destroy the
> shrinker, then the queue. Let me re-order this one.
>
> But yes, in practice, none of the above fails. If we have trouble
> doing a couple of small kmallocs during a subsys_initcall(), that
> machine is unlikely to finish booting, let alone allocate enough
> memory to enter the THP shrinker.

Yeah I thought that might be the case, but seems more logical killing shrinker
first, thanks!

>
> > > @@ -953,6 +964,7 @@ static int __init thp_shrinker_init(void)
> > >  static void __init thp_shrinker_exit(void)
> > >  {
> > >  	shrinker_free(huge_zero_folio_shrinker);
> > > +	list_lru_destroy(&deferred_split_lru);
> >
> > Same question above as to race/ordering.
>
> ... and this one as well.
>
> > >  	shrinker_free(deferred_split_shrinker);
> > >  }
>
> > > @@ -3854,34 +3761,34 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
> > >  	struct folio *end_folio = folio_next(folio);
> > >  	struct folio *new_folio, *next;
> > >  	int old_order = folio_order(folio);
> > > +	struct list_lru_one *l;
> >
> > Nit, and maybe this is a convention, but hate single letter variable names,
> > 'lru' or something might be nicer?
>
> Yeah I stuck with the list_lru internal naming, which uses `lru` for
> the struct list_lru, and `l` for struct list_lru_one. I suppose that
> was fine for the very domain-specific code and short functions in
> there, but it's grating in large, general MM functions like these.
>
> Since `lru` is taken, any preferences? llo?

ljs? ;)

Could be list?

>
> > > +	bool dequeue_deferred;
> > >  	int ret = 0;
> > > -	struct deferred_split *ds_queue;
> > >
> > >  	VM_WARN_ON_ONCE(!mapping && end);
> > >  	/* Prevent deferred_split_scan() touching ->_refcount */
> > > -	ds_queue = folio_split_queue_lock(folio);
> > > +	dequeue_deferred = folio_test_anon(folio) && old_order > 1;
> >
> > Why anon? (This review is partly me learning about the shrinker, an area
> > I'm weak on :)
>
> This is the type of folios that can be queued through
> deferred_split_folio(). The order check is inside that function
> itself; and the function is only called for anon pages.
>
> File pages are split differently, they don't use the deferred split
> shrinker and we don't allocate list_lru heads for them. However, they
> still come through this split path here, and so I need to gate whether
> it's safe to do list_lru_lock(), __list_lru_del() etc.
>
> > > +	if (dequeue_deferred) {
> > > +		rcu_read_lock();
> > > +		l = list_lru_lock(&deferred_split_lru,
> > > +				  folio_nid(folio), folio_memcg(folio));
> >
> > Hm don't adore this sort of almost 'hidden' RCU lock here, but this
> > function is pretty disgusting and needs serious work in general.
> >
> > And any function that took the RCU lock and list_lru lock/did the unlock
> > equivalent would be equally horrible so yeah, I guess needs deferring to a
> > refactor.
> >
> > OTOH, this could be a good excuse for us to pay down some technical debt
> > and split out for instance the folio_ref_freeze() bits?
> >
> > Could we do something like:
> >
> > 	bool frozen;
> >
> > 	...
> >
> > 	dequeue_deferred = folio_test_anon(folio) && old_order > 1;
> > 	frozen = folio_ref_freeze(folio, folio_cache_ref_count(folio) + 1);
> >
> > 	if (dequeue_deferred && frozen) {
> > 		struct list_lru_one *lru;
> >
> > 		rcu_read_lock();
> > 		lru = list_lru_lock(&deferred_split_lru,
> > 				    folio_nid(folio), folio_memcg(folio));
> > 		__list_lru_del(&deferred_split_lru, lru,
> > 			       &folio->_deferred_list, folio_nid(folio));
> > 		if (folio_test_partially_mapped(folio)) {
> > 			folio_clear_partially_mapped(folio);
> > 			mod_mthp_stat(old_order,
> > 				MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
> > 		}
> > 		list_lru_unlock(lru);
> > 		rcu_read_unlock();
> > 	}
> >
> > 	if (!frozen)
> > 		return -EAGAIN;
> >
> > 	... rest of logic now one indent level lower ...
> >
> > Or maybe factor that out into a helper function or something?
> >
> > static void execute_deferred_dequeue(...) { ... }
> >
> > With this implemented either way you'd be able to get rid of the else block
> > too.
> >
> > obviously only valid if you are able to do the freezing earlier?
>
> The reason it locks the list_lru first is because there can be a race
> between the shrinker and a synchronous split attempt.

Ahh!

>
> This is what that comment above it is about:
>
> 	/* Prevent deferred_split_scan() touching ->_refcount */

I _may_ have glossed over this :)

>
> If the shrinker fails to "get" the folio while holding the shrinker
> lock, it thinks it beat the free path to the list_lru lock and will
> feel responsible for cleaning up the deferred split state:
>
> /* caller holds the list_lru lock already */
> static enum lru_status deferred_split_isolate(struct list_head *item,
> 					      struct list_lru_one *lru,
> 					      void *cb_arg)
> {
> 	struct folio *folio = container_of(item, struct folio, _deferred_list);
> 	struct list_head *freeable = cb_arg;
>
> 	if (folio_try_get(folio)) {
> 		list_lru_isolate_move(lru, item, freeable);
> 		return LRU_REMOVED;
> 	}
>
> 	/* We lost race with folio_put() */
> 	list_lru_isolate(lru, item);
> 	if (folio_test_partially_mapped(folio)) {
> 		folio_clear_partially_mapped(folio);
> 		mod_mthp_stat(folio_order(folio),
> 			      MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
> 	}
> 	return LRU_REMOVED;
> }
>
> So if the split path freezes before acquiring the list_lru lock, we
> get them both stepping on the deferred split state.

So we're sort of intentionally doing a lock inversion on the other path (give me
some rope here, I mean broadly speaking :) to avoid this?

>
> This may or may not be safe given the current manifestation of
> list_lru_del() and the folio_test_partially_mapped() tests. But it
> feels pretty hairy to let them race and rely on individual tests into
> what should be a coherent aggregate state.

Yeah sure best not.

But, and I _know_ it's nitty sorry, but maybe worth expanding that comment to
explain that e.g. 'we must take the folio lock prior to the list_lru lock to
avoid racing with deferred_split_scan() in accessing the folio reference count'
or similar?

>
> > > +	}
> > >  	if (folio_ref_freeze(folio, folio_cache_ref_count(folio) + 1)) {
> > >  		struct swap_cluster_info *ci = NULL;
> > >  		struct lruvec *lruvec;
> > >
> > > -		if (old_order > 1) {
> >
> > Before was this also applicable to non-anon folios?
>
> This branch, yes. But non-anon pages would then fail this:
>
> > > -			if (!list_empty(&folio->_deferred_list)) {
> > > -				ds_queue->split_queue_len--;
> > > -				/*
> > > -				 * Reinitialize page_deferred_list after removing the
> > > -				 * page from the split_queue, otherwise a subsequent
> > > -				 * split will see list corruption when checking the
> > > -				 * page_deferred_list.
> > > -				 */
> > > -				list_del_init(&folio->_deferred_list);
> > > -			}
>
> ..since they aren't ever added. This was okay because accessing that
> list head is always safe. Now I need to be explicit to determine if
> it's safe to call __list_lru_del() which does all these list_lru
> references, which aren't allocated for file.
>
> > > +		if (dequeue_deferred) {
> > > +			__list_lru_del(&deferred_split_lru, l,
> > > +				       &folio->_deferred_list, folio_nid(folio));
> > >  			if (folio_test_partially_mapped(folio)) {
> > >  				folio_clear_partially_mapped(folio);
> > >  				mod_mthp_stat(old_order,
> > >  					MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
> > >  			}
>
> And honestly, I think it's nicer to be explicit here about the
> expectations of which pages get which treatment.

Yeah for sure.

>
> Filtering file pages on !list_empty() was not very obvious.

Yes :)

>
> > > +	int nid = folio_nid(folio);
> > >  	unsigned long flags;
> > >  	bool unqueued = false;
> > >
> > >  	WARN_ON_ONCE(folio_ref_count(folio));
> > >  	WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg_charged(folio));
> > >
> > > -	ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
> > > -	if (!list_empty(&folio->_deferred_list)) {
> > > -		ds_queue->split_queue_len--;
> > > +	rcu_read_lock();
> > > +	l = list_lru_lock_irqsave(&deferred_split_lru, nid, folio_memcg(folio), &flags);
> > > +	if (__list_lru_del(&deferred_split_lru, l, &folio->_deferred_list, nid)) {
> >
> > Maybe worth factoring __list_lru_del() into something that explicitly
> > references &folio->_deferred_list rather than open codingin both places?
>
> Hm, I wouldn't want to encode this into list_lru API, but we could do
> a huge_memory.c-local helper?
>
> folio_deferred_split_del(folio, l, nid)

Well, I kind of hate how we're using the global deferred_split_lru all over the
place, so a helper would be preferable but one that also could be used for
khugepaged.c and memory.c also?

>
> > > @@ -4406,7 +4320,11 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
> > >  	if (folio_test_swapcache(folio))
> > >  		return;
> > >
> > > -	ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
> > > +	nid = folio_nid(folio);
> > > +
> > > +	rcu_read_lock();
> > > +	memcg = folio_memcg(folio);
> > > +	l = list_lru_lock_irqsave(&deferred_split_lru, nid, memcg, &flags);
> >
> > Do we really need to hold the lock over all the below, but hmm we can't do
> > an irqsave/restore list_lru_add(), and maybe not worth adding one either,
> > OK.
> >
> > Just seems odd to have <lock> <__unlocked_add_variant> <unlock>,
> > instinctively feels like it should be just <locked_add_variant>.
>
> It protects that auxiliary folio_test_partially_mapped() state that
> the shrinker also touches. LRU + that page flag + the counts is what
> that lock protects.

Ack thanks

>
> > > @@ -4473,45 +4375,47 @@ static bool thp_underused(struct folio *folio)
> > >  	return false;
> > >  }
> > >
> > > +static enum lru_status deferred_split_isolate(struct list_head *item,
> > > +					      struct list_lru_one *lru,
> > > +					      void *cb_arg)
> > > +{
> > > +	struct folio *folio = container_of(item, struct folio, _deferred_list);
> > > +	struct list_head *freeable = cb_arg;
> > > +
> > > +	if (folio_try_get(folio)) {
> > > +		list_lru_isolate_move(lru, item, freeable);
> > > +		return LRU_REMOVED;
> > > +	}
> > > +
> > > +	/* We lost race with folio_put() */
> >
> > Hmm, in the original code, this comment is associated with the partially
> > mapped logic, BUT this seems actually correct, because folio_try_get()
> > because it does folio_ref_add_unless_zero() only fails if the folio lost
> > the race.
> >
> > So I think you're more correct right?
>
> I think the original placement was because that else if made it
> awkward to place where it should be. But yes, the above is more
> correct: the comment refers to what happens when the "get" fails.

Right yeah

>
> > > -	ds_queue = split_queue_lock_irqsave(sc->nid, sc->memcg, &flags);
> > > -	/* Take pin on all head pages to avoid freeing them under us */
> > > -	list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
> > > -							_deferred_list) {
> > > -		if (folio_try_get(folio)) {
> > > -			folio_batch_add(&fbatch, folio);
> > > -		} else if (folio_test_partially_mapped(folio)) {
> > > -			/* We lost race with folio_put() */
> > > -			folio_clear_partially_mapped(folio);
> > > -			mod_mthp_stat(folio_order(folio),
> > > -				      MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
> > > -		}
>
> > > @@ -4534,64 +4438,32 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
> > >  		}
> > >  		folio_unlock(folio);
> > >  next:
> > > -		if (did_split || !folio_test_partially_mapped(folio))
> > > -			continue;
> > >  		/*
> > >  		 * Only add back to the queue if folio is partially mapped.
> > >  		 * If thp_underused returns false, or if split_folio fails
> > >  		 * in the case it was underused, then consider it used and
> > >  		 * don't add it back to split_queue.
> > >  		 */
> > > -		fqueue = folio_split_queue_lock_irqsave(folio, &flags);
> > > -		if (list_empty(&folio->_deferred_list)) {
> > > -			list_add_tail(&folio->_deferred_list, &fqueue->split_queue);
> > > -			fqueue->split_queue_len++;
> > > +		if (!did_split && folio_test_partially_mapped(folio)) {
> > > +			rcu_read_lock();
> > > +			l = list_lru_lock_irqsave(&deferred_split_lru,
> > > +						  folio_nid(folio),
> > > +						  folio_memcg(folio),
> > > +						  &flags);
> > > +			__list_lru_add(&deferred_split_lru, l,
> > > +				       &folio->_deferred_list,
> > > +				       folio_nid(folio), folio_memcg(folio));
> > > +			list_lru_unlock_irqrestore(l, &flags);
> >
> > Hmm this does make me think it'd be nice to have a list_lru_add() variant
> > for irqsave/restore then, since it's a repeating pattern!
>
> Yeah, this site calls for it the most :( I tried to balance callsite
> prettiness with the need to extend the list_lru api; it's just one
> caller. And the possible mutations and variants with these locks is
> seemingly endless once you open that can of worms...

True...

>
> Case in point: this is process context and we could use
> spin_lock_irq() here. I'm just using list_lru_lock_irqsave() because
> that's the common variant used by the add and del paths already.
>
> If I went with a helper, I could do list_lru_add_irq().
>
> I think it would actually nicely mirror the list_lru_shrink_walk_irq()
> a few lines up.

Yeah, I mean I'm pretty sure this repeats quite a few times so is worthy of a
helper.

>
> > > --- a/mm/khugepaged.c
> > > +++ b/mm/khugepaged.c
> > > @@ -1081,6 +1081,7 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
> > >  	}
> > >
> > >  	count_vm_event(THP_COLLAPSE_ALLOC);
> > > +
> > >  	if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
> > >  		folio_put(folio);
> > >  		*foliop = NULL;
> > > @@ -1089,6 +1090,12 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
> > >
> > >  	count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1);
> >
> > Do we want to put this stat counter underneath the below so it's not
> > incremented on fail?
>
> We currently increment it also when the cgroup charge fails, which is
> common. The list_lru allocation (a smaller kmalloc) in turn should
> basically never fail - we just managed to get a PMD-sized folio.
>
> Executive summary being: the way we bump them currently is weird. I
> tried to stay out of it to keep this series on track ;)

Ack ok fair enough :)

>
> > > +	if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {
> > > +		folio_put(folio);
> > > +		*foliop = NULL;
> > > +		return SCAN_CGROUP_CHARGE_FAIL;
> >
> > Do we not need to uncharge here?
>
> folio_put() does that. We could do it for symmetry, but there is no
> need. And the hard and fast rule is that folio_memcg() is immutable
> until the put hits 0, so it would look weird/unexpected.

So it does!

void __folio_put(struct folio *folio)
{
	...
	mem_cgroup_uncharge(folio);
	...
}

I didn't _think_ it would but I guess I didn't realise how much work
__folio_put() was doing :)

>
> > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > index a47fb68dd65f..f381cb6bdff1 100644
> > > --- a/mm/memcontrol.c
> > > +++ b/mm/memcontrol.c
> > > @@ -4015,11 +4015,6 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
> > >  	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
> > >  		memcg->cgwb_frn[i].done =
> > >  			__WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
> > > -#endif
> > > -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> > > -	spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
> > > -	INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
> > > -	memcg->deferred_split_queue.split_queue_len = 0;
> > >  #endif
> > >  	lru_gen_init_memcg(memcg);
> > >  	return memcg;
> > > @@ -4167,11 +4162,10 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
> > >  	zswap_memcg_offline_cleanup(memcg);
> > >
> > >  	memcg_offline_kmem(memcg);
> > > -	reparent_deferred_split_queue(memcg);
> > >  	/*
> > > -	 * The reparenting of objcg must be after the reparenting of the
> > > -	 * list_lru and deferred_split_queue above, which ensures that they will
> > > -	 * not mistakenly get the parent list_lru and deferred_split_queue.
> > > +	 * The reparenting of objcg must be after the reparenting of
> > > +	 * the list_lru in memcg_offline_kmem(), which ensures that
> > > +	 * they will not mistakenly get the parent list_lru.
> > >  	 */
> > >  	memcg_reparent_objcgs(memcg);
> > >  	reparent_shrinker_deferred(memcg);
> > > diff --git a/mm/memory.c b/mm/memory.c
> > > index 219b9bf6cae0..e68ceb4aa624 100644
> > > --- a/mm/memory.c
> > > +++ b/mm/memory.c
> > > @@ -4651,13 +4651,19 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
> > >  	while (orders) {
> > >  		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
> > >  		folio = vma_alloc_folio(gfp, order, vma, addr);
> > > -		if (folio) {
> > > -			if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
> > > -							    gfp, entry))
> > > -				return folio;
> > > +		if (!folio)
> > > +			goto next;
> > > +		if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, gfp, entry)) {
> > >  			count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
> > >  			folio_put(folio);
> > > +			goto next;
> > >  		}
> > > +		if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {
> >
> > Do we need to uncharge here?
>
> Same here. It's folio state that the put cleans up.

Ack, indeed!

>
> > > +			folio_put(folio);
> > > +			goto fallback;
> > > +		}
> > > +		return folio;
> > > +next:
> > >  		count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK);
> > >  		order = next_order(&orders, order);
> > >  	}
> > > @@ -5169,24 +5175,28 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
> > >  	while (orders) {
> > >  		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
> > >  		folio = vma_alloc_folio(gfp, order, vma, addr);
> > > -		if (folio) {
> > > -			if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
> > > -				count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> > > -				folio_put(folio);
> > > -				goto next;
> > > -			}
> > > -			folio_throttle_swaprate(folio, gfp);
> > > -			/*
> > > -			 * When a folio is not zeroed during allocation
> > > -			 * (__GFP_ZERO not used) or user folios require special
> > > -			 * handling, folio_zero_user() is used to make sure
> > > -			 * that the page corresponding to the faulting address
> > > -			 * will be hot in the cache after zeroing.
> > > -			 */
> > > -			if (user_alloc_needs_zeroing())
> > > -				folio_zero_user(folio, vmf->address);
> > > -			return folio;
> > > +		if (!folio)
> > > +			goto next;
> >
> > This applies to the above equivalent refactorings, but maybe worth
> > separating out this:
> >
> > if (folio) { ... big branch ... } ->
> > if (!folio) goto next; ... what was big branch ...
> >
> > Refactoring into a separate patch? Makes it easier to see pertinent logic
> > changes + helps review/bisectability/fixes/etc. etc.
>
> Will do.

Thanks!

>
> > > +		if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
> > > +			count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> > > +			folio_put(folio);
> > > +			goto next;
> > >  		}
> > > +		if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {
> >
> > Again, do we need to uncharge here?
>
> folio_put() does it :)

Yeah, I should have just checked __folio_put() :>)

<Insert quote about good programmers, lazy, hubris etc. here :P>

>
> > > +			folio_put(folio);
> > > +			goto fallback;
> > > +		}
> > > +		folio_throttle_swaprate(folio, gfp);
> > > +		/*
> > > +		 * When a folio is not zeroed during allocation
> > > +		 * (__GFP_ZERO not used) or user folios require special
> > > +		 * handling, folio_zero_user() is used to make sure
> > > +		 * that the page corresponding to the faulting address
> > > +		 * will be hot in the cache after zeroing.
> > > +		 */
> > > +		if (user_alloc_needs_zeroing())
> > > +			folio_zero_user(folio, vmf->address);
> > > +		return folio;
> > >  next:
> > >  		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
> > >  		order = next_order(&orders, order);
> > > diff --git a/mm/mm_init.c b/mm/mm_init.c
> > > index cec7bb758bdd..f293a62e652a 100644
> > > --- a/mm/mm_init.c
> > > +++ b/mm/mm_init.c
> > > @@ -1388,19 +1388,6 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat,
> > >  	pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
> > >  }
> > >
> > > -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> > > -static void pgdat_init_split_queue(struct pglist_data *pgdat)
> > > -{
> > > -	struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
> > > -
> > > -	spin_lock_init(&ds_queue->split_queue_lock);
> > > -	INIT_LIST_HEAD(&ds_queue->split_queue);
> > > -	ds_queue->split_queue_len = 0;
> > > -}
> > > -#else
> > > -static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
> > > -#endif
> > > -
> > >  #ifdef CONFIG_COMPACTION
> > >  static void pgdat_init_kcompactd(struct pglist_data *pgdat)
> > >  {
> > > @@ -1416,8 +1403,6 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
> > >
> > >  	pgdat_resize_init(pgdat);
> > >  	pgdat_kswapd_lock_init(pgdat);
> > > -
> > > -	pgdat_init_split_queue(pgdat);
> > >  	pgdat_init_kcompactd(pgdat);
> > >
> > >  	init_waitqueue_head(&pgdat->kswapd_wait);
> > > --
> > > 2.53.0
> > >
> >
> > Overall a lovely amount of code deletion here :)
> >
> > Thanks for doing this, Cheers Lorenzo
>
> Thanks for your feedback!

No worries, thanks for the series :)

Cheers, Lorenzo
Re: [PATCH v3 7/7] mm: switch deferred split shrinker to list_lru
Posted by Shakeel Butt 2 weeks, 4 days ago
On Wed, Mar 18, 2026 at 03:53:25PM -0400, Johannes Weiner wrote:
> The deferred split queue handles cgroups in a suboptimal fashion. The
> queue is per-NUMA node or per-cgroup, not the intersection. That means
> on a cgrouped system, a node-restricted allocation entering reclaim
> can end up splitting large pages on other nodes:
> 
> 	alloc/unmap
> 	  deferred_split_folio()
> 	    list_add_tail(memcg->split_queue)
> 	    set_shrinker_bit(memcg, node, deferred_shrinker_id)
> 
> 	for_each_zone_zonelist_nodemask(restricted_nodes)
> 	  mem_cgroup_iter()
> 	    shrink_slab(node, memcg)
> 	      shrink_slab_memcg(node, memcg)
> 	        if test_shrinker_bit(memcg, node, deferred_shrinker_id)
> 	          deferred_split_scan()
> 	            walks memcg->split_queue
> 
> The shrinker bit adds an imperfect guard rail. As soon as the cgroup
> has a single large page on the node of interest, all large pages owned
> by that memcg, including those on other nodes, will be split.
> 
> list_lru properly sets up per-node, per-cgroup lists. As a bonus, it
> streamlines a lot of the list operations and reclaim walks. It's used
> widely by other major shrinkers already. Convert the deferred split
> queue as well.
> 
> The list_lru per-memcg heads are instantiated on demand when the first
> object of interest is allocated for a cgroup, by calling
> folio_memcg_list_lru_alloc(). Add calls to where splittable pages are
> created: anon faults, swapin faults, khugepaged collapse.
> 
> These calls create all possible node heads for the cgroup at once, so
> the migration code (between nodes) doesn't need any special care.
> 
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>

Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Re: [PATCH v3 7/7] mm: switch deferred split shrinker to list_lru
Posted by David Hildenbrand (Arm) 2 weeks, 4 days ago
On 3/18/26 20:53, Johannes Weiner wrote:
> The deferred split queue handles cgroups in a suboptimal fashion. The
> queue is per-NUMA node or per-cgroup, not the intersection. That means
> on a cgrouped system, a node-restricted allocation entering reclaim
> can end up splitting large pages on other nodes:
> 
> 	alloc/unmap
> 	  deferred_split_folio()
> 	    list_add_tail(memcg->split_queue)
> 	    set_shrinker_bit(memcg, node, deferred_shrinker_id)
> 
> 	for_each_zone_zonelist_nodemask(restricted_nodes)
> 	  mem_cgroup_iter()
> 	    shrink_slab(node, memcg)
> 	      shrink_slab_memcg(node, memcg)
> 	        if test_shrinker_bit(memcg, node, deferred_shrinker_id)
> 	          deferred_split_scan()
> 	            walks memcg->split_queue
> 
> The shrinker bit adds an imperfect guard rail. As soon as the cgroup
> has a single large page on the node of interest, all large pages owned
> by that memcg, including those on other nodes, will be split.
> 
> list_lru properly sets up per-node, per-cgroup lists. As a bonus, it
> streamlines a lot of the list operations and reclaim walks. It's used
> widely by other major shrinkers already. Convert the deferred split
> queue as well.
> 
> The list_lru per-memcg heads are instantiated on demand when the first
> object of interest is allocated for a cgroup, by calling
> folio_memcg_list_lru_alloc(). Add calls to where splittable pages are
> created: anon faults, swapin faults, khugepaged collapse.
> 
> These calls create all possible node heads for the cgroup at once, so
> the migration code (between nodes) doesn't need any special care.
> 
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> ---

Was just hitting sent on a reply on v2 when I spotted this in my inbox.

-- 
Cheers,

David