From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id AE4733EFD15
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:50 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289256; cv=none;
 b=goZWv+oe31vdHu1UlA+6HCAXfI24ozROO0USXA+0YgzQmGID71n8Q4fCC3OEYWVCswYQj/3zWalfAmTbJ+T+kjuAl1hvR+EiuC4pazfDFQFf90XPazDfflizBROX5niz9HDI1MjWjk0ZI+ycuQkOfGxw3fi7RZQ9yJZ3Fp+6sp4=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289256; c=relaxed/simple;
	bh=BfQ26ULt2PTjqCU+ztURTv+VepPG+2f5LE/MdoYaXEA=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=I0dIPUApbAeY17X1nSUtQhplwUuRErq5zQ7RzMjBtMaZpVbMYx4NnV3mSPW4AS9+kVR0DZ+KkLcU2bE7oL+MSiWj+i6gsSIkhUnotIUH/MxR+6CoL7u9Rgjv1P0UhgvoubC/a6jM/S7D25GVtbdjXQTZf4asp2HjFgeB/ioCHM0=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=jTECLLiP; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="jTECLLiP"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:MIME-Version:References:In-Reply-To:
	Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-Type:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=0VRttkw1lb8yL2Eo96Ndyc3llIaCXL78G0HM2bDGbJI=; b=jTECLLiPMmNc8gdxA7G0IoHJna
	TOOys13yPQIA8pYEkJg7CokWyQsLyjNLCiSIecDg7uWw81/e2YB+frRWlnxBNh8R+9QkAba5Vgl/j
	dWKf7Suduc1mR4rVOVhq80Zmq0+nGGR/dZKDARObghWBMUi+htFVK4Clgf9KFvkgjaIOHX7caedjJ
	Qx9sl5Fu+jWGJYI9+1RxSsrLI2/Pub5njPgA67xKUhbkrVrT1M/Vmquz7WZp97zLFJ3U0zkHjugM3
	JYTTERJHd0NrfQdS8XsPqFGnyqCP6zPU9TB/ux9/rCNXQkzX9SmWY+hBtMS/wPQSHLH4p8B05e17M
	pt4eGeOg==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPM-0000000024Q-073F;
	Wed, 20 May 2026 11:00:28 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Johannes Weiner <hnaz@cmpxchg.org>,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 01/40] mm: page_alloc: replace pageblock_flags bitmap with
 struct pageblock_data
Date: Wed, 20 May 2026 10:59:07 -0400
Message-ID: <20260520150018.2491267-2-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

From: Johannes Weiner <hannes@cmpxchg.org>

Replace the packed pageblock_flags bitmap with a per-pageblock struct
containing its own flags word. This changes the storage from
NR_PAGEBLOCK_BITS bits per pageblock packed into shared unsigned longs,
to a dedicated unsigned long per pageblock.

The free path looks up migratetype (from pageblock flags) immediately
followed by looking up pageblock ownership. Colocating them in a struct
means this hot path touches one cache line instead of two.

The per-pageblock struct also eliminates all the bit-packing indexing
(pfn_to_bitidx, word selection, intra-word shifts), simplifying the
accessor code.

Memory overhead: 8 bytes per pageblock (one unsigned long). With 2MB
pageblocks on x86_64, that's 4KB per GB -- up from ~0.5-1 bytes per
pageblock with the packed bitmap, but still negligible in absolute terms.

No functional change.

Signed-off-by: Johannes Weiner <hnaz@cmpxchg.org>
Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 include/linux/mmzone.h | 15 ++++----
 mm/internal.h          | 17 +++++++++
 mm/mm_init.c           | 25 +++++--------
 mm/page_alloc.c        | 84 +++++++-----------------------------------
 mm/sparse.c            |  3 +-
 5 files changed, 50 insertions(+), 94 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9adb2ad21da5..935ddc78f636 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1004,7 +1004,7 @@ struct zone {
 	 * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
 	 * In SPARSEMEM, this map is stored in struct mem_section
 	 */
-	unsigned long		*pageblock_flags;
+	struct pageblock_data	*pageblock_data;
 #endif /* CONFIG_SPARSEMEM */
=20
 	/* zone_start_pfn =3D=3D zone_start_paddr >> PAGE_SHIFT */
@@ -1957,9 +1957,6 @@ static inline bool movable_only_nodes(nodemask_t *nod=
es)
 #define PAGES_PER_SECTION       (1UL << PFN_SECTION_SHIFT)
 #define PAGE_SECTION_MASK	(~(PAGES_PER_SECTION-1))
=20
-#define SECTION_BLOCKFLAGS_BITS \
-	((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS)
-
 #if (MAX_PAGE_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS
 #error Allocator MAX_PAGE_ORDER exceeds SECTION_SIZE
 #endif
@@ -1992,13 +1989,17 @@ static inline unsigned long section_nr_to_pfn(unsig=
ned long sec)
 #define SUBSECTION_ALIGN_UP(pfn) ALIGN((pfn), PAGES_PER_SUBSECTION)
 #define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK)
=20
+struct pageblock_data {
+	unsigned long flags;
+};
+
 struct mem_section_usage {
 	struct rcu_head rcu;
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 	DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
 #endif
 	/* See declaration of similar field in struct zone */
-	unsigned long pageblock_flags[0];
+	struct pageblock_data pageblock_data[];
 };
=20
 struct page;
@@ -2049,9 +2050,9 @@ extern struct mem_section **mem_section;
 extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT];
 #endif
=20
-static inline unsigned long *section_to_usemap(struct mem_section *ms)
+static inline struct pageblock_data *section_to_usemap(struct mem_section =
*ms)
 {
-	return ms->usage->pageblock_flags;
+	return ms->usage->pageblock_data;
 }
=20
 static inline struct mem_section *__nr_to_section(unsigned long nr)
diff --git a/mm/internal.h b/mm/internal.h
index 5a2ddcf68e0b..c8404cb00b08 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -808,6 +808,23 @@ static inline struct page *find_buddy_page_pfn(struct =
page *page,
 	return NULL;
 }
=20
+static inline struct pageblock_data *pfn_to_pageblock(const struct page *p=
age,
+						      unsigned long pfn)
+{
+#ifdef CONFIG_SPARSEMEM
+	struct mem_section *ms =3D __pfn_to_section(pfn);
+	unsigned long idx =3D (pfn & (PAGES_PER_SECTION - 1)) >> pageblock_order;
+
+	return &section_to_usemap(ms)[idx];
+#else
+	struct zone *zone =3D page_zone(page);
+	unsigned long idx;
+
+	idx =3D (pfn - pageblock_start_pfn(zone->zone_start_pfn)) >> pageblock_or=
der;
+	return &zone->pageblock_data[idx];
+#endif
+}
+
 extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
 				unsigned long end_pfn, struct zone *zone);
=20
diff --git a/mm/mm_init.c b/mm/mm_init.c
index f9f8e1af921c..1bc909da9c13 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1453,36 +1453,31 @@ void __meminit init_currently_empty_zone(struct zon=
e *zone,
=20
 #ifndef CONFIG_SPARSEMEM
 /*
- * Calculate the size of the zone->pageblock_flags rounded to an unsigned =
long
- * Start by making sure zonesize is a multiple of pageblock_order by round=
ing
- * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
- * round what is now in bits to nearest long in bits, then return it in
- * bytes.
+ * Calculate the size of the zone->pageblock_data array.
+ * Round up the zone size to a pageblock boundary to get the
+ * number of pageblocks, then multiply by the struct size.
  */
 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsi=
gned long zonesize)
 {
-	unsigned long usemapsize;
+	unsigned long nr_pageblocks;
=20
 	zonesize +=3D zone_start_pfn & (pageblock_nr_pages-1);
-	usemapsize =3D round_up(zonesize, pageblock_nr_pages);
-	usemapsize =3D usemapsize >> pageblock_order;
-	usemapsize *=3D NR_PAGEBLOCK_BITS;
-	usemapsize =3D round_up(usemapsize, BITS_PER_LONG);
+	nr_pageblocks =3D round_up(zonesize, pageblock_nr_pages) >> pageblock_ord=
er;
=20
-	return usemapsize / BITS_PER_BYTE;
+	return nr_pageblocks * sizeof(struct pageblock_data);
 }
=20
 static void __ref setup_usemap(struct zone *zone)
 {
 	unsigned long usemapsize =3D usemap_size(zone->zone_start_pfn,
 					       zone->spanned_pages);
-	zone->pageblock_flags =3D NULL;
+	zone->pageblock_data =3D NULL;
 	if (usemapsize) {
-		zone->pageblock_flags =3D
+		zone->pageblock_data =3D
 			memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
 					    zone_to_nid(zone));
-		if (!zone->pageblock_flags)
-			panic("Failed to allocate %ld bytes for zone %s pageblock flags on node=
 %d\n",
+		if (!zone->pageblock_data)
+			panic("Failed to allocate %ld bytes for zone %s pageblock data on node =
%d\n",
 			      usemapsize, zone->name, zone_to_nid(zone));
 	}
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 227d58dc3de6..fcff0083d5d4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -315,52 +315,18 @@ static inline bool _deferred_grow_zone(struct zone *z=
one, unsigned int order)
 }
 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
=20
-/* Return a pointer to the bitmap storing bits affecting a block of pages =
*/
-static inline unsigned long *get_pageblock_bitmap(const struct page *page,
-							unsigned long pfn)
-{
-#ifdef CONFIG_SPARSEMEM
-	return section_to_usemap(__pfn_to_section(pfn));
-#else
-	return page_zone(page)->pageblock_flags;
-#endif /* CONFIG_SPARSEMEM */
-}
-
-static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn)
-{
-#ifdef CONFIG_SPARSEMEM
-	pfn &=3D (PAGES_PER_SECTION-1);
-#else
-	pfn =3D pfn - pageblock_start_pfn(page_zone(page)->zone_start_pfn);
-#endif /* CONFIG_SPARSEMEM */
-	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
-}
-
 static __always_inline bool is_standalone_pb_bit(enum pageblock_bits pb_bi=
t)
 {
 	return pb_bit >=3D PB_compact_skip && pb_bit < __NR_PAGEBLOCK_BITS;
 }
=20
-static __always_inline void
-get_pfnblock_bitmap_bitidx(const struct page *page, unsigned long pfn,
-			   unsigned long **bitmap_word, unsigned long *bitidx)
+static __always_inline unsigned long *
+get_pfnblock_flags_word(const struct page *page, unsigned long pfn)
 {
-	unsigned long *bitmap;
-	unsigned long word_bitidx;
-
-#ifdef CONFIG_MEMORY_ISOLATION
-	BUILD_BUG_ON(NR_PAGEBLOCK_BITS !=3D 8);
-#else
-	BUILD_BUG_ON(NR_PAGEBLOCK_BITS !=3D 4);
-#endif
 	BUILD_BUG_ON(__MIGRATE_TYPE_END > MIGRATETYPE_MASK);
 	VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
=20
-	bitmap =3D get_pageblock_bitmap(page, pfn);
-	*bitidx =3D pfn_to_bitidx(page, pfn);
-	word_bitidx =3D *bitidx / BITS_PER_LONG;
-	*bitidx &=3D (BITS_PER_LONG - 1);
-	*bitmap_word =3D &bitmap[word_bitidx];
+	return &pfn_to_pageblock(page, pfn)->flags;
 }
=20
=20
@@ -377,18 +343,14 @@ static unsigned long __get_pfnblock_flags_mask(const =
struct page *page,
 					       unsigned long pfn,
 					       unsigned long mask)
 {
-	unsigned long *bitmap_word;
-	unsigned long bitidx;
-	unsigned long word;
+	unsigned long *flags_word =3D get_pfnblock_flags_word(page, pfn);
=20
-	get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
 	/*
 	 * This races, without locks, with set_pfnblock_migratetype(). Ensure
 	 * a consistent read of the memory array, so that results, even though
 	 * racy, are not corrupted.
 	 */
-	word =3D READ_ONCE(*bitmap_word);
-	return (word >> bitidx) & mask;
+	return READ_ONCE(*flags_word) & mask;
 }
=20
 /**
@@ -402,15 +364,10 @@ static unsigned long __get_pfnblock_flags_mask(const =
struct page *page,
 bool get_pfnblock_bit(const struct page *page, unsigned long pfn,
 		      enum pageblock_bits pb_bit)
 {
-	unsigned long *bitmap_word;
-	unsigned long bitidx;
-
 	if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit)))
 		return false;
=20
-	get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
-
-	return test_bit(bitidx + pb_bit, bitmap_word);
+	return test_bit(pb_bit, get_pfnblock_flags_word(page, pfn));
 }
=20
 /**
@@ -449,18 +406,13 @@ get_pfnblock_migratetype(const struct page *page, uns=
igned long pfn)
 static void __set_pfnblock_flags_mask(struct page *page, unsigned long pfn,
 				      unsigned long flags, unsigned long mask)
 {
-	unsigned long *bitmap_word;
-	unsigned long bitidx;
-	unsigned long word;
-
-	get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
+	unsigned long *flags_word =3D get_pfnblock_flags_word(page, pfn);
+	unsigned long word, new_word;
=20
-	mask <<=3D bitidx;
-	flags <<=3D bitidx;
-
-	word =3D READ_ONCE(*bitmap_word);
+	word =3D READ_ONCE(*flags_word);
 	do {
-	} while (!try_cmpxchg(bitmap_word, &word, (word & ~mask) | flags));
+		new_word =3D (word & ~mask) | flags;
+	} while (!try_cmpxchg(flags_word, &word, new_word));
 }
=20
 /**
@@ -472,15 +424,10 @@ static void __set_pfnblock_flags_mask(struct page *pa=
ge, unsigned long pfn,
 void set_pfnblock_bit(const struct page *page, unsigned long pfn,
 		      enum pageblock_bits pb_bit)
 {
-	unsigned long *bitmap_word;
-	unsigned long bitidx;
-
 	if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit)))
 		return;
=20
-	get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
-
-	set_bit(bitidx + pb_bit, bitmap_word);
+	set_bit(pb_bit, get_pfnblock_flags_word(page, pfn));
 }
=20
 /**
@@ -492,15 +439,10 @@ void set_pfnblock_bit(const struct page *page, unsign=
ed long pfn,
 void clear_pfnblock_bit(const struct page *page, unsigned long pfn,
 			enum pageblock_bits pb_bit)
 {
-	unsigned long *bitmap_word;
-	unsigned long bitidx;
-
 	if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit)))
 		return;
=20
-	get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
-
-	clear_bit(bitidx + pb_bit, bitmap_word);
+	clear_bit(pb_bit, get_pfnblock_flags_word(page, pfn));
 }
=20
 /**
diff --git a/mm/sparse.c b/mm/sparse.c
index effdac6b0ab1..f77d6d9fa62f 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -216,7 +216,8 @@ static void __init memblocks_present(void)
=20
 static unsigned long usemap_size(void)
 {
-	return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long);
+	return (1UL << (PFN_SECTION_SHIFT - pageblock_order)) *
+		sizeof(struct pageblock_data);
 }
=20
 size_t mem_section_usage_size(void)
--=20
2.54.0
From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id BD3F13F1677
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:51 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289262; cv=none;
 b=qrrxK+EEf33JGcIUVresIufX2/qLKNPxfTnHXRsxwUPOjggU8qhOMtekey0XMutO2hjcdNQSiihiMwSAB7pt66deETsBR2/OQvM3D3a0DYZU/KF9bPEX2rnli7ESzuTu3GMTt79bT/BRiCcD+PnAkbIWPFBKl8SAtCN8tfNVEl8=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289262; c=relaxed/simple;
	bh=phux0aDfwXk16lJ8jUgMbUQUJX0vnmn1LlAMhc5bVhg=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=iE5Z1xHAEjlnLdg/978V1G/KNpLfjlUsznskd3feN18rGlsZ1ICfV4dN7fUvkSdMQ8XHrrRHYPq5GwvtE7JCPmBQMzzCKg0SLcDMgD7bnS26CiesgvZ4isWTUZyCKmGqjXouTlk3clzlRddLFf8Fkvx6fKrv7/brSs5WIYi0Dmc=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=VqGgf3CY; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="VqGgf3CY"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:MIME-Version:References:In-Reply-To:
	Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-Type:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=AJCxp07sL7BPWTjtROCBl4DjRFWKw8FYiTrUwOD4HTE=; b=VqGgf3CYnn1QkSOi48utoihLVp
	2Nt49IpyBwvK7MZ9NrmXi5+v98uTWMgl+Fx1mWYEToCnWViHIOLWgpDIp0vD57uWY+IqpVWGIYBRp
	9gOq1dun1P6u0nygaz2bQdnPavZxCHzklGabauxFIqm+N5YkZt9Cv+GMMalLk6oK16HDSMAcGNsX9
	yy5pjq5mBjWHe2gXDX7eTgqVBO0/mo8egdHPQHaUwbrn2IaUCGsoaEoh8Gj3KEuZJ+B8IphUZk+2t
	btaCTXumll7bBJ9VKR01cogQI5QapqgGPL5F07qDN68ncHTEBcudQv0THwfZmGRBne+rGb85W6vJV
	0hDt/yPw==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPM-0000000024Q-0Dyy;
	Wed, 20 May 2026 11:00:28 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Johannes Weiner <hnaz@cmpxchg.org>,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 02/40] mm: page_alloc: per-cpu pageblock buddy allocator
Date: Wed, 20 May 2026 10:59:08 -0400
Message-ID: <20260520150018.2491267-3-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

From: Johannes Weiner <hannes@cmpxchg.org>

On large machines, zone->lock is a scaling bottleneck for page
allocation. Two common patterns drive contention:

1. Affinity violations: pages are allocated on one CPU but freed on
another (jemalloc, exit, reclaim). The freeing CPU's PCP drains to
zone buddy, and the allocating CPU refills from zone buddy -- both
under zone->lock, defeating PCP batching entirely.

2. Concurrent exits: processes tearing down large address spaces
simultaneously overwhelm per-CPU PCP capacity, serializing on
zone->lock for overflow.

Solution

Extend the PCP to operate on whole pageblocks with ownership tracking.

Each CPU claims pageblocks from the zone buddy and splits them
locally. Pages are tagged with their owning CPU, so frees route back
to the owner's PCP regardless of which CPU frees. This eliminates
affinity violations: the owner CPU's PCP absorbs both allocations and
frees for its blocks without touching zone->lock.

It also shortens zone->lock hold time during drain and refill
cycles. Whole blocks are acquired under zone->lock and then split
outside of it. Affinity routing to the owning PCP on free enables
buddy merging outside the zone->lock as well; a bottom-up merge pass
runs under pcp->lock on drain, freeing larger chunks under zone->lock.

PCP refill uses a four-phase approach:

Phase 0: recover owned fragments previously drained to zone buddy.
Phase 1: claim whole pageblocks from zone buddy.
Phase 2: grab sub-pageblock chunks without migratetype stealing.
Phase 3: traditional __rmqueue() with migratetype fallback.

Phase 0/1 pages are owned and marked PagePCPBuddy, making them
eligible for PCP-level merging. Phase 2/3 pages are cached on PCP for
batching only -- no ownership, no merging. However, Phase 2 still
benefits from chunky zone transactions: it pulls higher-order entries
from zone free lists under zone->lock and splits them on the PCP
outside of it, rather than acquiring zone->lock per page.

When PCP batch sizes are small (small machines with few CPUs) or the
zone is fragmented and no whole pageblocks are available, refill falls
through to Phase 2/3 naturally. The allocator degrades gracefully to
the original page-at-a-time behavior.

When owned blocks accumulate long-lived allocations (e.g. a mix of
anonymous and file cache pages), partial block drains send the free
fragments to zone buddy and remember the block, so Phase 0 can recover
them on the next refill. This allows the allocator to pack new
allocations next to existing ones in already-committed blocks rather
than consuming fresh pageblocks, keeping fragmentation contained.

Data structures:

- per_cpu_pages: +owned_blocks list head, +PCPF_CPU_DEAD flag to gate
  enqueuing on offline CPUs.
- pageblock_data: +cpu (owner), +block_pfn, +cpu_node (recovery list
  linkage). 40 bytes per pageblock on 64-bit (including padding after
  the int), ~20KB per GB with 2MB pageblocks.
- PagePCPBuddy page type marks pages eligible for PCP-level merging.

[riel@surriel.com: fix ownership clearing on direct block frees]

Note (checkpatch-rationale): the two VM_BUG_ON(order >
PAGE_BLOCK_MAX_ORDER) assertions in order_to_pindex() and
pindex_to_order() are intentional and should remain BUG()-class
rather than WARN_ON_ONCE(). Their result is used as an array index
into the per-CPU pageblock free lists; a violation means the caller
passed an invalid order and the index would corrupt adjacent state.
Both are VM_BUG_ON (compiled out unless CONFIG_DEBUG_VM=3Dy) and
follow the established mm/page_alloc.c convention for invariants on
the buddy index domain (see neighbouring VM_BUG_ON(order >
PAGE_ALLOC_COSTLY_ORDER) call sites this patch replaces).

Signed-off-by: Johannes Weiner <hnaz@cmpxchg.org>
Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 include/linux/mmzone.h     |  23 +-
 include/linux/page-flags.h |   9 +
 mm/debug.c                 |   1 +
 mm/page_alloc.c            | 668 ++++++++++++++++++++++++++++++-------
 4 files changed, 576 insertions(+), 125 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 935ddc78f636..f0eb16390906 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -802,17 +802,10 @@ enum zone_watermarks {
 };
=20
 /*
- * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. Two additional li=
sts
- * are added for THP. One PCP list is used by GPF_MOVABLE, and the other P=
CP list
- * is used by GFP_UNMOVABLE and GFP_RECLAIMABLE.
+ * One per migratetype for page orders up to and including PAGE_BLOCK_MAX_=
ORDER.
  */
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define NR_PCP_THP 2
-#else
-#define NR_PCP_THP 0
-#endif
-#define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER=
 + 1))
-#define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP)
+#define NR_PCP_ORDERS (PAGE_BLOCK_MAX_ORDER + 1)
+#define NR_PCP_LISTS (MIGRATE_PCPTYPES * NR_PCP_ORDERS)
=20
 /*
  * Flags used in pcp->flags field.
@@ -825,9 +818,13 @@ enum zone_watermarks {
  * draining PCP for consecutive high-order pages freeing without
  * allocation if data cache slice of CPU is large enough.  To reduce
  * zone lock contention and keep cache-hot pages reusing.
+ *
+ * PCPF_CPU_DEAD: CPU is offline.  Don't enqueue freed pages; fall
+ * back to zone buddy instead.
  */
 #define	PCPF_PREV_FREE_HIGH_ORDER	BIT(0)
 #define	PCPF_FREE_HIGH_BATCH		BIT(1)
+#define	PCPF_CPU_DEAD			BIT(2)
=20
 struct per_cpu_pages {
 	spinlock_t lock;	/* Protects lists field */
@@ -843,6 +840,9 @@ struct per_cpu_pages {
 #endif
 	short free_count;	/* consecutive free count */
=20
+	/* Pageblocks owned by this CPU, for fragment recovery */
+	struct list_head owned_blocks;
+
 	/* Lists of pages, one per migrate type stored on the pcp-lists */
 	struct list_head lists[NR_PCP_LISTS];
 } ____cacheline_aligned_in_smp;
@@ -1991,6 +1991,9 @@ static inline unsigned long section_nr_to_pfn(unsigne=
d long sec)
=20
 struct pageblock_data {
 	unsigned long flags;
+	int cpu;			/* PCP ownership: owning cpu + 1, or 0 for zone-owned */
+	unsigned long block_pfn;	/* first PFN of pageblock */
+	struct list_head cpu_node;	/* per-CPU owned-blocks list */
 };
=20
 struct mem_section_usage {
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 0e03d816e8b9..96b4735f50a0 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -924,6 +924,7 @@ enum pagetype {
 	PGTY_unaccepted		=3D 0xf7,
 	PGTY_large_kmalloc	=3D 0xf8,
 	PGTY_netpp		=3D 0xf9,
+	PGTY_pcp_buddy		=3D 0xfa,
=20
 	PGTY_mapcount_underflow =3D 0xff
 };
@@ -992,6 +993,14 @@ static __always_inline void __ClearPage##uname(struct =
page *page)	\
  */
 PAGE_TYPE_OPS(Buddy, buddy, buddy)
=20
+/*
+ * PagePCPBuddy() indicates that the page is free and in a per-cpu
+ * buddy allocator (see mm/page_alloc.c). Unlike PageBuddy() pages,
+ * these are not on zone free lists and must not be isolated by
+ * compaction or other zone-level code.
+ */
+PAGE_TYPE_OPS(PCPBuddy, pcp_buddy, pcp_buddy)
+
 /*
  * PageOffline() indicates that the page is logically offline although the
  * containing section is online. (e.g. inflated in a balloon driver or
diff --git a/mm/debug.c b/mm/debug.c
index 77fa8fe1d641..d4542d5d202b 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -56,6 +56,7 @@ static const char *page_type_names[] =3D {
 	DEF_PAGETYPE_NAME(table),
 	DEF_PAGETYPE_NAME(buddy),
 	DEF_PAGETYPE_NAME(unaccepted),
+	DEF_PAGETYPE_NAME(pcp_buddy),
 };
=20
 static const char *page_type_name(unsigned int page_type)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fcff0083d5d4..a3448a97bab2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -370,6 +370,22 @@ bool get_pfnblock_bit(const struct page *page, unsigne=
d long pfn,
 	return test_bit(pb_bit, get_pfnblock_flags_word(page, pfn));
 }
=20
+/*
+ * Extract migratetype from a pageblock_data pointer. Callers that
+ * already have the pbd can avoid a redundant pfn_to_pageblock().
+ */
+static __always_inline enum migratetype
+pbd_migratetype(const struct pageblock_data *pbd)
+{
+	unsigned long flags =3D READ_ONCE(pbd->flags) & MIGRATETYPE_AND_ISO_MASK;
+
+#ifdef CONFIG_MEMORY_ISOLATION
+	if (flags & BIT(PB_migrate_isolate))
+		return MIGRATE_ISOLATE;
+#endif
+	return flags & MIGRATETYPE_MASK;
+}
+
 /**
  * get_pfnblock_migratetype - Return the migratetype of a pageblock
  * @page: The page within the block of interest
@@ -383,16 +399,7 @@ bool get_pfnblock_bit(const struct page *page, unsigne=
d long pfn,
 __always_inline enum migratetype
 get_pfnblock_migratetype(const struct page *page, unsigned long pfn)
 {
-	unsigned long mask =3D MIGRATETYPE_AND_ISO_MASK;
-	unsigned long flags;
-
-	flags =3D __get_pfnblock_flags_mask(page, pfn, mask);
-
-#ifdef CONFIG_MEMORY_ISOLATION
-	if (flags & BIT(PB_migrate_isolate))
-		return MIGRATE_ISOLATE;
-#endif
-	return flags & MIGRATETYPE_MASK;
+	return pbd_migratetype(pfn_to_pageblock(page, pfn));
 }
=20
 /**
@@ -476,6 +483,8 @@ void __meminit init_pageblock_migratetype(struct page *=
page,
 					  enum migratetype migratetype,
 					  bool isolate)
 {
+	unsigned long pfn =3D page_to_pfn(page);
+	struct pageblock_data *pbd;
 	unsigned long flags;
=20
 	if (unlikely(page_group_by_mobility_disabled &&
@@ -494,8 +503,11 @@ void __meminit init_pageblock_migratetype(struct page =
*page,
 	if (isolate)
 		flags |=3D BIT(PB_migrate_isolate);
 #endif
-	__set_pfnblock_flags_mask(page, page_to_pfn(page), flags,
-				  MIGRATETYPE_AND_ISO_MASK);
+	__set_pfnblock_flags_mask(page, pfn, flags, MIGRATETYPE_AND_ISO_MASK);
+
+	pbd =3D pfn_to_pageblock(page, pfn);
+	pbd->block_pfn =3D pfn;
+	INIT_LIST_HEAD(&pbd->cpu_node);
 }
=20
 #ifdef CONFIG_DEBUG_VM
@@ -581,19 +593,7 @@ static void bad_page(struct page *page, const char *re=
ason)
=20
 static inline unsigned int order_to_pindex(int migratetype, int order)
 {
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	bool movable;
-	if (order > PAGE_ALLOC_COSTLY_ORDER) {
-		VM_BUG_ON(!is_pmd_order(order));
-
-		movable =3D migratetype =3D=3D MIGRATE_MOVABLE;
-
-		return NR_LOWORDER_PCP_LISTS + movable;
-	}
-#else
-	VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
-#endif
+	VM_BUG_ON(order > PAGE_BLOCK_MAX_ORDER);
=20
 	return (MIGRATE_PCPTYPES * order) + migratetype;
 }
@@ -602,25 +602,14 @@ static inline int pindex_to_order(unsigned int pindex)
 {
 	int order =3D pindex / MIGRATE_PCPTYPES;
=20
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	if (pindex >=3D NR_LOWORDER_PCP_LISTS)
-		order =3D HPAGE_PMD_ORDER;
-#else
-	VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
-#endif
+	VM_BUG_ON(order > PAGE_BLOCK_MAX_ORDER);
=20
 	return order;
 }
=20
 static inline bool pcp_allowed_order(unsigned int order)
 {
-	if (order <=3D PAGE_ALLOC_COSTLY_ORDER)
-		return true;
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	if (is_pmd_order(order))
-		return true;
-#endif
-	return false;
+	return order <=3D pageblock_order;
 }
=20
 /*
@@ -653,6 +642,91 @@ static inline void set_buddy_order(struct page *page, =
unsigned int order)
 	__SetPageBuddy(page);
 }
=20
+/*
+ * PCP pageblock ownership tracking.
+ *
+ * Ownership rules:
+ * - Whole pageblocks acquired by rmqueue_bulk() Phase 1 are owned, meaning
+ *   all frees will be routed to that PCP.
+ * - Draining a whole pageblock back to the zone clears PCP ownership.
+ * - Draining a partial block (due to PCP thresholds or memory pressure) p=
uts
+ *   the block on the pcp->owned_blocks list. A later refill will attempt =
to
+ *   recover it in Phase 0.
+ * - Whole pageblocks can assemble on the zone buddy due to PCP bypasses,
+ *   e.g. during lock contention. __free_one_page() clears stale ownership.
+ * - Phases 2/3 refill with fragments for pure caching - if there are not
+ *   enough blocks or pcp->high restrictions. They do not participate
+ *   in ownership, affinity enforcement, or on-PCP merging.
+ *
+ * PagePCPBuddy means "mergeable buddy on home PCP":
+ * - Set when Phase 0/1 restore or acquire whole pageblocks.
+ * - Propagated to split remainders in pcp_rmqueue_smallest().
+ * - Set on freed pages from owned blocks routed to the owner PCP.
+ * - NOT set for Phase 2/3 fragments or zone-owned frees.
+ * - The merge pass in free_pcppages_bulk() only processes
+ *   PagePCPBuddy pages, ensuring it never touches pages on
+ *   another CPU's PCP list.
+ *
+ * We store the owning CPU + 1, so the default value of 0 in
+ * pageblock_data->cpu means no owner / zone owner (and not CPU 0).
+ */
+
+static inline void clear_pcpblock_owner(struct page *page)
+{
+	unsigned long pfn =3D page_to_pfn(page);
+	struct pageblock_data *pbd =3D pfn_to_pageblock(page, pfn);
+
+	pbd->cpu =3D 0;
+	list_del_init(&pbd->cpu_node);
+}
+
+static inline void set_pcpblock_owner(struct page *page, int cpu)
+{
+	pfn_to_pageblock(page, page_to_pfn(page))->cpu =3D cpu + 1;
+}
+
+static inline int get_pcpblock_owner(struct page *page)
+{
+	return pfn_to_pageblock(page, page_to_pfn(page))->cpu - 1;
+}
+
+static inline void set_pcp_order(struct page *page, unsigned int order)
+{
+	set_page_private(page, order);
+}
+
+static inline unsigned int pcp_buddy_order(struct page *page)
+{
+	return page_private(page);
+}
+
+static void pcp_enqueue(struct per_cpu_pages *pcp, struct page *page,
+			int migratetype, unsigned int order)
+{
+	set_pcp_order(page, order);
+	list_add(&page->pcp_list,
+		 &pcp->lists[order_to_pindex(migratetype, order)]);
+	pcp->count +=3D 1 << order;
+}
+
+static void pcp_enqueue_tail(struct per_cpu_pages *pcp, struct page *page,
+			     int migratetype, unsigned int order)
+{
+	set_pcp_order(page, order);
+	list_add_tail(&page->pcp_list,
+		      &pcp->lists[order_to_pindex(migratetype, order)]);
+	pcp->count +=3D 1 << order;
+}
+
+static void pcp_dequeue(struct per_cpu_pages *pcp, struct page *page,
+			unsigned int order)
+{
+	list_del(&page->pcp_list);
+	__ClearPagePCPBuddy(page);
+	set_page_private(page, 0);
+	pcp->count -=3D 1 << order;
+}
+
 #ifdef CONFIG_COMPACTION
 static inline struct capture_control *task_capc(struct zone *zone)
 {
@@ -893,6 +967,21 @@ static inline void __free_one_page(struct page *page,
=20
 	account_freepages(zone, 1 << order, migratetype);
=20
+	/*
+	 * For whole blocks, ownership returns to the zone. There are
+	 * no more outstanding frees to route through that CPU's PCP,
+	 * and we don't want to confuse any future users of the pages
+	 * in this block. E.g. rmqueue_buddy().
+	 *
+	 * Check here if a whole block came in directly: pre-merged in
+	 * the PCP, or PCP contended and bypassed.
+	 *
+	 * There is another check in the loop below if a block merges
+	 * up with pages already on the zone buddy.
+	 */
+	if (order =3D=3D pageblock_order)
+		clear_pcpblock_owner(page);
+
 	while (order < MAX_PAGE_ORDER) {
 		int buddy_mt =3D migratetype;
=20
@@ -942,6 +1031,10 @@ static inline void __free_one_page(struct page *page,
 		page =3D page + (combined_pfn - pfn);
 		pfn =3D combined_pfn;
 		order++;
+
+		/* Clear owner also when we merge up. See above */
+		if (order =3D=3D pageblock_order)
+			clear_pcpblock_owner(page);
 	}
=20
 done_merging:
@@ -1390,17 +1483,24 @@ bool free_pages_prepare(struct page *page, unsigned=
 int order)
 }
=20
 /*
- * Frees a number of pages from the PCP lists
- * Assumes all pages on list are in same zone.
- * count is the number of pages to free.
+ * Free PCP pages to zone buddy. First does a bottom-up merge pass
+ * over PagePCPBuddy entries under pcp->lock only (already held by
+ * caller). Only pages marked PagePCPBuddy (owned-block pages on
+ * their home PCP) participate in merging; non-owned pages (Phase
+ * 2/3 fragments) are skipped and drain individually.
+ *
+ * Then drains pages to zone under zone->lock, starting with
+ * fully-merged pageblocks via round-robin. When those are exhausted,
+ * falls through to smaller orders. Draining a pageblock-order page
+ * disowns the block.
  */
 static void free_pcppages_bulk(struct zone *zone, int count,
-					struct per_cpu_pages *pcp,
-					int pindex)
+				struct per_cpu_pages *pcp)
 {
 	unsigned long flags;
 	unsigned int order;
 	struct page *page;
+	int mt, pindex;
=20
 	/*
 	 * Ensure proper count is passed which otherwise would stuck in the
@@ -1408,8 +1508,45 @@ static void free_pcppages_bulk(struct zone *zone, in=
t count,
 	 */
 	count =3D min(pcp->count, count);
=20
-	/* Ensure requested pindex is drained first. */
-	pindex =3D pindex - 1;
+	/* PCP merge pass */
+	for (order =3D 0; order < pageblock_order; order++) {
+		for (mt =3D 0; mt < MIGRATE_PCPTYPES; mt++) {
+			struct list_head *list;
+			struct page *page, *tmp;
+
+			list =3D &pcp->lists[order_to_pindex(mt, order)];
+			list_for_each_entry_safe(page, tmp, list, pcp_list) {
+				unsigned long pfn =3D page_to_pfn(page);
+				unsigned long buddy_pfn =3D __find_buddy_pfn(pfn, order);
+				struct page *buddy =3D page + (buddy_pfn - pfn);
+				unsigned long combined_pfn;
+				struct page *combined;
+
+				if (!PagePCPBuddy(page))
+					continue;
+				if (!PagePCPBuddy(buddy))
+					continue;
+				if (pcp_buddy_order(buddy) !=3D order)
+					continue;
+
+				/* Don't corrupt the safe iterator! */
+				if (buddy =3D=3D tmp)
+					tmp =3D list_next_entry(tmp, pcp_list);
+
+				pcp_dequeue(pcp, page, order);
+				pcp_dequeue(pcp, buddy, order);
+
+				combined_pfn =3D buddy_pfn & pfn;
+				combined =3D page + (combined_pfn - pfn);
+
+				__SetPagePCPBuddy(combined);
+				pcp_enqueue_tail(pcp, combined, mt, order + 1);
+			}
+		}
+	}
+
+	/* Ensure pageblock orders are drained first. */
+	pindex =3D order_to_pindex(0, pageblock_order) - 1;
=20
 	spin_lock_irqsave(&zone->lock, flags);
=20
@@ -1427,19 +1564,31 @@ static void free_pcppages_bulk(struct zone *zone, i=
nt count,
 		order =3D pindex_to_order(pindex);
 		nr_pages =3D 1 << order;
 		do {
+			fpi_t fpi =3D FPI_NONE;
 			unsigned long pfn;
-			int mt;
=20
 			page =3D list_last_entry(list, struct page, pcp_list);
 			pfn =3D page_to_pfn(page);
 			mt =3D get_pfnblock_migratetype(page, pfn);
=20
-			/* must delete to avoid corrupting pcp list */
-			list_del(&page->pcp_list);
+			/*
+			 * Owned fragment going to zone buddy: queue
+			 * block for recovery during the next refill,
+			 * and keep it away from other CPUs (tail).
+			 */
+			if (PagePCPBuddy(page) && order < pageblock_order) {
+				struct pageblock_data *pbd;
+
+				pbd =3D pfn_to_pageblock(page, pfn);
+				if (list_empty(&pbd->cpu_node))
+					list_add(&pbd->cpu_node, &pcp->owned_blocks);
+				fpi =3D FPI_TO_TAIL;
+			}
+
+			pcp_dequeue(pcp, page, order);
 			count -=3D nr_pages;
-			pcp->count -=3D nr_pages;
=20
-			__free_one_page(page, pfn, zone, order, mt, FPI_NONE);
+			__free_one_page(page, pfn, zone, order, mt, fpi);
 			trace_mm_page_pcpu_drain(page, order, mt);
 		} while (count > 0 && !list_empty(list));
 	}
@@ -1447,6 +1596,45 @@ static void free_pcppages_bulk(struct zone *zone, in=
t count,
 	spin_unlock_irqrestore(&zone->lock, flags);
 }
=20
+/*
+ * Search PCP free lists for a page of at least the requested order.
+ * If found at a higher order, split and place remainders on PCP lists.
+ * Returns NULL if nothing available on the PCP.
+ */
+static struct page *pcp_rmqueue_smallest(struct per_cpu_pages *pcp,
+					 int migratetype, unsigned int order)
+{
+	unsigned int high;
+
+	for (high =3D order; high <=3D pageblock_order; high++) {
+		struct list_head *list;
+		unsigned long size;
+		struct page *page;
+		bool owned;
+
+		list =3D &pcp->lists[order_to_pindex(migratetype, high)];
+		if (list_empty(list))
+			continue;
+
+		page =3D list_first_entry(list, struct page, pcp_list);
+		/* Save before pcp_dequeue() clears it */
+		owned =3D PagePCPBuddy(page);
+		pcp_dequeue(pcp, page, high);
+
+		size =3D 1 << high;
+		while (high > order) {
+			high--;
+			size >>=3D 1;
+			if (owned)
+				__SetPagePCPBuddy(&page[size]);
+			pcp_enqueue(pcp, &page[size], migratetype, high);
+		}
+
+		return page;
+	}
+	return NULL;
+}
+
 /* Split a multi-block free page into its individual pageblocks. */
 static void split_large_buddy(struct zone *zone, struct page *page,
 			      unsigned long pfn, int order, fpi_t fpi)
@@ -1456,6 +1644,7 @@ static void split_large_buddy(struct zone *zone, stru=
ct page *page,
 	VM_WARN_ON_ONCE(!IS_ALIGNED(pfn, 1 << order));
 	/* Caller removed page from freelist, buddy info cleared! */
 	VM_WARN_ON_ONCE(PageBuddy(page));
+	VM_WARN_ON_ONCE(PagePCPBuddy(page));
=20
 	if (order > pageblock_order)
 		order =3D pageblock_order;
@@ -2451,28 +2640,162 @@ __rmqueue(struct zone *zone, unsigned int order, i=
nt migratetype,
 }
=20
 /*
- * Obtain a specified number of elements from the buddy allocator, all und=
er
- * a single hold of the lock, for efficiency.  Add them to the supplied li=
st.
- * Returns the number of new pages which were placed at *list.
+ * Obtain a specified number of elements from the buddy allocator, all
+ * under a single hold of the lock, for efficiency.  Add them to the
+ * freelist of @pcp.
+ *
+ * When @pcp is non-NULL and @count > 1 (normal pageset), uses a four-phase
+ * approach:
+ *   Phase 0: Recover previously owned, partially drained blocks.
+ *   Phase 1: Acquire whole pageblocks, claim ownership, set PagePCPBuddy.
+ *            These pages are eligible for PCP-level buddy merging.
+ *   Phase 2: Grab sub-pageblock fragments of the same migratetype.
+ *   Phase 3: Fall back to __rmqueue() with migratetype fallback.
+ *   Phase 2/3 pages are cached for batching only -- no ownership claim,
+ *   no PagePCPBuddy, no PCP-level merging.
+ *
+ * When @pcp is NULL or @count <=3D 1 (boot pageset), acquires individual
+ * pages of the requested order directly.
+ *
+ * Returns %true if at least some pages were acquired.
  */
-static int rmqueue_bulk(struct zone *zone, unsigned int order,
-			unsigned long count, struct list_head *list,
-			int migratetype, unsigned int alloc_flags)
+static bool rmqueue_bulk(struct zone *zone, unsigned int order,
+			 unsigned long count,
+			 int migratetype, unsigned int alloc_flags,
+			 struct per_cpu_pages *pcp)
 {
+	unsigned long pages_needed =3D count << order;
 	enum rmqueue_mode rmqm =3D RMQUEUE_NORMAL;
+	struct pageblock_data *pbd, *tmp;
+	int cpu =3D smp_processor_id();
+	unsigned long refilled =3D 0;
 	unsigned long flags;
-	int i;
+	int o;
=20
 	if (unlikely(alloc_flags & ALLOC_TRYLOCK)) {
 		if (!spin_trylock_irqsave(&zone->lock, flags))
-			return 0;
+			return false;
 	} else {
 		spin_lock_irqsave(&zone->lock, flags);
 	}
-	for (i =3D 0; i < count; ++i) {
+
+	if (!pcp || count <=3D 1)
+		goto phase3;
+
+	/*
+	 * Phase 0: Recover fragments from owned blocks.
+	 *
+	 * The owned_blocks list tracks blocks that have fragments
+	 * sitting in zone buddy (put there by drains). Pull matching
+	 * fragments back to PCP with PagePCPBuddy so they participate
+	 * in merging, instead of claiming fresh blocks and spreading
+	 * fragmentation further.
+	 *
+	 * Only recover blocks matching the requested migratetype.
+	 * After recovery, remove the block from the list -- the drain
+	 * path re-adds it if new fragments arrive.
+	 */
+	list_for_each_entry_safe(pbd, tmp, &pcp->owned_blocks, cpu_node) {
+		unsigned long base_pfn, pfn;
+		int block_mt;
+
+		base_pfn =3D pbd->block_pfn;
+		block_mt =3D pbd_migratetype(pbd);
+		if (block_mt !=3D migratetype)
+			continue;
+
+		for (pfn =3D base_pfn; pfn < base_pfn + pageblock_nr_pages;) {
+			struct page *page =3D pfn_to_page(pfn);
+
+			if (!PageBuddy(page)) {
+				pfn++;
+				continue;
+			}
+
+			o =3D buddy_order(page);
+			del_page_from_free_list(page, zone, o, block_mt);
+			__SetPagePCPBuddy(page);
+			pcp_enqueue_tail(pcp, page, block_mt, o);
+			refilled +=3D 1 << o;
+			pfn +=3D 1 << o;
+		}
+
+		list_del_init(&pbd->cpu_node);
+
+		if (refilled >=3D pages_needed)
+			goto out;
+	}
+
+	/*
+	 * Phase 1: Try whole pageblocks. Fast path for unfragmented
+	 * zones. Claim ownership and set PagePCPBuddy so these pages
+	 * are eligible for PCP-level merging.
+	 *
+	 * Only grab blocks that fit within the refill budget. On
+	 * small zones, pages_needed can be less than a whole
+	 * pageblock; skip to smaller blocks or individual pages to
+	 * avoid overshooting the PCP high watermark.
+	 */
+	while (refilled + pageblock_nr_pages <=3D pages_needed) {
+		struct page *page;
+
+		page =3D __rmqueue(zone, pageblock_order,
+				 migratetype, alloc_flags, &rmqm);
+		if (!page)
+			break;
+
+		set_pcpblock_owner(page, cpu);
+		__SetPagePCPBuddy(page);
+		pcp_enqueue_tail(pcp, page, migratetype, pageblock_order);
+		refilled +=3D 1 << pageblock_order;
+	}
+	if (refilled >=3D pages_needed)
+		goto out;
+
+	/*
+	 * Phase 2: Zone too fragmented for whole pageblocks.
+	 * Sweep zone free lists top-down for same-migratetype
+	 * chunks. Avoids cross-type stealing and keeps PCP
+	 * functional under fragmentation.
+	 *
+	 * No ownership claim or PagePCPBuddy - these are
+	 * sub-pageblock fragments cached for batching only.
+	 *
+	 * Stop above the requested order -- at that point,
+	 * phase 3's __rmqueue() does the same lookup but with
+	 * migratetype fallback.
+	 */
+	for (o =3D pageblock_order - 1;
+	     o > (int)order && refilled < pages_needed; o--) {
+		struct free_area *area =3D &zone->free_area[o];
+		struct page *page;
+
+		while (refilled + (1 << o) <=3D pages_needed) {
+			page =3D get_page_from_free_area(area, migratetype);
+			if (!page)
+				break;
+
+			del_page_from_free_list(page, zone, o, migratetype);
+			pcp_enqueue_tail(pcp, page, migratetype, o);
+			refilled +=3D 1 << o;
+		}
+	}
+
+	/*
+	 * Phase 3: Last resort. Use __rmqueue() which does
+	 * migratetype fallback. Cache the pages on PCP to still
+	 * amortize future zone lock acquisitions.
+	 *
+	 * No ownership claim or PagePCPBuddy - these fragments
+	 * drain individually to zone buddy.
+	 *
+	 * Boot pagesets (count <=3D 1) jump here directly.
+	 */
+phase3:
+	while (refilled < pages_needed) {
 		struct page *page =3D __rmqueue(zone, order, migratetype,
 					      alloc_flags, &rmqm);
-		if (unlikely(page =3D=3D NULL))
+		if (!page)
 			break;
=20
 		/*
@@ -2485,11 +2808,13 @@ static int rmqueue_bulk(struct zone *zone, unsigned=
 int order,
 		 * for IO devices that can merge IO requests if the physical
 		 * pages are ordered properly.
 		 */
-		list_add_tail(&page->pcp_list, list);
+		pcp_enqueue_tail(pcp, page, migratetype, order);
+		refilled +=3D 1 << order;
 	}
-	spin_unlock_irqrestore(&zone->lock, flags);
=20
-	return i;
+out:
+	spin_unlock_irqrestore(&zone->lock, flags);
+	return refilled;
 }
=20
 /*
@@ -2519,7 +2844,7 @@ bool decay_pcp_high(struct zone *zone, struct per_cpu=
_pages *pcp)
 	while (to_drain > 0) {
 		to_drain_batched =3D min(to_drain, batch);
 		pcp_spin_lock_nopin(pcp);
-		free_pcppages_bulk(zone, to_drain_batched, pcp, 0);
+		free_pcppages_bulk(zone, to_drain_batched, pcp);
 		pcp_spin_unlock_nopin(pcp);
 		todo =3D true;
=20
@@ -2543,7 +2868,7 @@ void drain_zone_pages(struct zone *zone, struct per_c=
pu_pages *pcp)
 	to_drain =3D min(pcp->count, batch);
 	if (to_drain > 0) {
 		pcp_spin_lock_nopin(pcp);
-		free_pcppages_bulk(zone, to_drain, pcp, 0);
+		free_pcppages_bulk(zone, to_drain, pcp);
 		pcp_spin_unlock_nopin(pcp);
 	}
 }
@@ -2564,7 +2889,7 @@ static void drain_pages_zone(unsigned int cpu, struct=
 zone *zone)
 			int to_drain =3D min(count,
 				pcp->batch << CONFIG_PCP_BATCH_SCALE_MAX);
=20
-			free_pcppages_bulk(zone, to_drain, pcp, 0);
+			free_pcppages_bulk(zone, to_drain, pcp);
 			count -=3D to_drain;
 		}
 		pcp_spin_unlock_nopin(pcp);
@@ -2758,19 +3083,16 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, s=
truct zone *zone,
 }
=20
 /*
- * Tune pcp alloc factor and adjust count & free_count. Free pages to brin=
g the
- * pcp's watermarks below high.
- *
- * May return a freed pcp, if during page freeing the pcp spinlock cannot =
be
- * reacquired. Return true if pcp is locked, false otherwise.
+ * Free a page to the PCP and flush excess pages if necessary.
+ * Works for both local and remote PCP - caller handles locking.
+ * @owned: page is from a PCP-owned block (eligible for merging).
  */
 static bool free_frozen_page_commit(struct zone *zone,
 		struct per_cpu_pages *pcp, struct page *page, int migratetype,
-		unsigned int order, fpi_t fpi_flags)
+		unsigned int order, fpi_t fpi_flags, bool owned)
 {
 	int high, batch;
 	int to_free, to_free_batched;
-	int pindex;
 	int cpu =3D smp_processor_id();
 	int ret =3D true;
 	bool free_high =3D false;
@@ -2782,9 +3104,15 @@ static bool free_frozen_page_commit(struct zone *zon=
e,
 	 */
 	pcp->alloc_factor >>=3D 1;
 	__count_vm_events(PGFREE, 1 << order);
-	pindex =3D order_to_pindex(migratetype, order);
-	list_add(&page->pcp_list, &pcp->lists[pindex]);
-	pcp->count +=3D 1 << order;
+	/*
+	 * Only set PagePCPBuddy for pages from owned blocks -- those
+	 * are on their home PCP and eligible for buddy merging.
+	 * Zone-owned pages are cached on the local PCP for batching
+	 * only; the merge pass skips them harmlessly.
+	 */
+	if (owned)
+		__SetPagePCPBuddy(page);
+	pcp_enqueue(pcp, page, migratetype, order);
=20
 	batch =3D READ_ONCE(pcp->batch);
 	/*
@@ -2820,7 +3148,13 @@ static bool free_frozen_page_commit(struct zone *zon=
e,
 	to_free =3D nr_pcp_free(pcp, batch, high, free_high);
 	while (to_free > 0 && pcp->count > 0) {
 		to_free_batched =3D min(to_free, batch);
-		free_pcppages_bulk(zone, to_free_batched, pcp, pindex);
+		/*
+		 * No pindex hint is passed: free_pcppages_bulk() runs its
+		 * merge pass over all sub-pageblock PCP lists and then
+		 * drains starting with the pageblock-order lists, so there
+		 * is no single list to prioritize here.
+		 */
+		free_pcppages_bulk(zone, to_free_batched, pcp);
 		to_free -=3D to_free_batched;
=20
 		if (to_free =3D=3D 0 || pcp->count =3D=3D 0)
@@ -2863,6 +3197,7 @@ static bool free_frozen_page_commit(struct zone *zone,
 		    next_memory_node(pgdat->node_id) < MAX_NUMNODES)
 			kswapd_clear_hopeless(pgdat, KSWAPD_CLEAR_HOPELESS_PCP);
 	}
+
 	return ret;
 }
=20
@@ -2873,9 +3208,11 @@ static void __free_frozen_pages(struct page *page, u=
nsigned int order,
 				fpi_t fpi_flags)
 {
 	struct per_cpu_pages *pcp;
+	struct pageblock_data *pbd;
 	struct zone *zone;
 	unsigned long pfn =3D page_to_pfn(page);
 	int migratetype;
+	int owner_cpu, cache_cpu;
=20
 	if (!pcp_allowed_order(order)) {
 		__free_pages_ok(page, order, fpi_flags);
@@ -2893,7 +3230,8 @@ static void __free_frozen_pages(struct page *page, un=
signed int order,
 	 * excessively into the page allocator
 	 */
 	zone =3D page_zone(page);
-	migratetype =3D get_pfnblock_migratetype(page, pfn);
+	pbd =3D pfn_to_pageblock(page, pfn);
+	migratetype =3D pbd_migratetype(pbd);
 	if (unlikely(migratetype >=3D MIGRATE_PCPTYPES)) {
 		if (unlikely(is_migrate_isolate(migratetype))) {
 			free_one_page(zone, page, pfn, order, fpi_flags);
@@ -2907,15 +3245,45 @@ static void __free_frozen_pages(struct page *page, =
unsigned int order,
 		add_page_to_zone_llist(zone, page, order);
 		return;
 	}
-	pcp =3D pcp_spin_trylock(zone->per_cpu_pageset);
-	if (pcp) {
-		if (!free_frozen_page_commit(zone, pcp, page, migratetype,
-						order, fpi_flags))
+
+	/*
+	 * Route page to the owning CPU's PCP for merging, or to
+	 * the local PCP for batching (zone-owned pages). Zone-owned
+	 * pages are cached without PagePCPBuddy -- the merge pass
+	 * skips them, so they're inert on any PCP list and drain
+	 * individually to zone buddy.
+	 *
+	 * Ownership is stable here: it can only change when the
+	 * pageblock is complete -- either fully free in zone buddy
+	 * (Phase 1 claims) or fully merged on PCP (drain disowns).
+	 * Since we hold this page, neither can happen.
+	 */
+	owner_cpu =3D pbd->cpu - 1;
+	cache_cpu =3D owner_cpu;
+	if (cache_cpu < 0)
+		cache_cpu =3D raw_smp_processor_id();
+
+	pcp =3D per_cpu_ptr(zone->per_cpu_pageset, cache_cpu);
+	if (unlikely(fpi_flags & FPI_TRYLOCK) || !in_task()) {
+		if (!spin_trylock(&pcp->lock)) {
+			free_one_page(zone, page, pfn, order, fpi_flags);
 			return;
-		pcp_spin_unlock(pcp);
+		}
 	} else {
+		spin_lock(&pcp->lock);
+	}
+
+	if (unlikely(pcp->flags & PCPF_CPU_DEAD)) {
+		spin_unlock(&pcp->lock);
 		free_one_page(zone, page, pfn, order, fpi_flags);
+		return;
 	}
+
+	if (free_frozen_page_commit(zone, pcp, page, migratetype, order,
+				    fpi_flags, cache_cpu =3D=3D owner_cpu))
+		spin_unlock(&pcp->lock);
+	/* If commit returned false, pcp was already unlocked (migration or
+	 * trylock failure inside the batched-free loop). */
 }
=20
 void free_frozen_pages(struct page *page, unsigned int order)
@@ -2935,6 +3303,7 @@ void free_unref_folios(struct folio_batch *folios)
 {
 	struct per_cpu_pages *pcp =3D NULL;
 	struct zone *locked_zone =3D NULL;
+	int locked_cpu =3D -1;
 	int i, j;
=20
 	/* Prepare folios for freeing */
@@ -2966,17 +3335,29 @@ void free_unref_folios(struct folio_batch *folios)
 		struct zone *zone =3D folio_zone(folio);
 		unsigned long pfn =3D folio_pfn(folio);
 		unsigned int order =3D (unsigned long)folio->private;
+		struct pageblock_data *pbd;
 		int migratetype;
+		int owner_cpu, cache_cpu;
=20
 		folio->private =3D NULL;
-		migratetype =3D get_pfnblock_migratetype(&folio->page, pfn);
+		pbd =3D pfn_to_pageblock(&folio->page, pfn);
+		migratetype =3D pbd_migratetype(pbd);
+		owner_cpu =3D pbd->cpu - 1;
+		cache_cpu =3D owner_cpu;
+		if (cache_cpu < 0)
+			cache_cpu =3D raw_smp_processor_id();
=20
-		/* Different zone requires a different pcp lock */
+		/*
+		 * Re-lock needed if zone changed, page is isolate,
+		 * or target CPU changed.
+		 */
 		if (zone !=3D locked_zone ||
-		    is_migrate_isolate(migratetype)) {
+		    is_migrate_isolate(migratetype) ||
+		    cache_cpu !=3D locked_cpu) {
 			if (pcp) {
-				pcp_spin_unlock(pcp);
+				spin_unlock(&pcp->lock);
 				locked_zone =3D NULL;
+				locked_cpu =3D -1;
 				pcp =3D NULL;
 			}
=20
@@ -2990,17 +3371,34 @@ void free_unref_folios(struct folio_batch *folios)
 				continue;
 			}
=20
+			pcp =3D per_cpu_ptr(zone->per_cpu_pageset,
+					  cache_cpu);
 			/*
-			 * trylock is necessary as folios may be getting freed
-			 * from IRQ or SoftIRQ context after an IO completion.
+			 * Use trylock when not in task context (IRQ,
+			 * softirq) to avoid spinning with IRQs
+			 * disabled. In task context, spin -- brief
+			 * contention on a per-CPU lock beats the
+			 * unbatched zone->lock fallback.
 			 */
-			pcp =3D pcp_spin_trylock(zone->per_cpu_pageset);
-			if (unlikely(!pcp)) {
+			if (!in_task()) {
+				if (unlikely(!spin_trylock(&pcp->lock))) {
+					pcp =3D NULL;
+					free_one_page(zone, &folio->page, pfn,
+						      order, FPI_NONE);
+					continue;
+				}
+			} else {
+				spin_lock(&pcp->lock);
+			}
+			if (unlikely(pcp->flags & PCPF_CPU_DEAD)) {
+				spin_unlock(&pcp->lock);
+				pcp =3D NULL;
 				free_one_page(zone, &folio->page, pfn,
 					      order, FPI_NONE);
 				continue;
 			}
 			locked_zone =3D zone;
+			locked_cpu =3D cache_cpu;
 		}
=20
 		/*
@@ -3012,14 +3410,16 @@ void free_unref_folios(struct folio_batch *folios)
=20
 		trace_mm_page_free_batched(&folio->page);
 		if (!free_frozen_page_commit(zone, pcp, &folio->page,
-				migratetype, order, FPI_NONE)) {
+				migratetype, order, FPI_NONE,
+				cache_cpu =3D=3D owner_cpu)) {
 			pcp =3D NULL;
 			locked_zone =3D NULL;
+			locked_cpu =3D -1;
 		}
 	}
=20
 	if (pcp)
-		pcp_spin_unlock(pcp);
+		spin_unlock(&pcp->lock);
 	folio_batch_reinit(folios);
 }
=20
@@ -3249,15 +3649,15 @@ static inline
 struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
 			int migratetype,
 			unsigned int alloc_flags,
-			struct per_cpu_pages *pcp,
-			struct list_head *list)
+			struct per_cpu_pages *pcp)
 {
 	struct page *page;
=20
 	do {
-		if (list_empty(list)) {
+		/* Try to find/split from existing PCP stock */
+		page =3D pcp_rmqueue_smallest(pcp, migratetype, order);
+		if (!page) {
 			int batch =3D nr_pcp_alloc(pcp, zone, order);
-			int alloced;
=20
 			/*
 			 * Don't refill the list for a higher order atomic
@@ -3273,18 +3673,14 @@ struct page *__rmqueue_pcplist(struct zone *zone, u=
nsigned int order,
 			if (alloc_flags & ALLOC_HIGHATOMIC)
 				return NULL;
=20
-			alloced =3D rmqueue_bulk(zone, order,
-					batch, list,
-					migratetype, alloc_flags);
+			if (!rmqueue_bulk(zone, order, batch, migratetype,
+					  alloc_flags, pcp))
+				return NULL;
=20
-			pcp->count +=3D alloced << order;
-			if (unlikely(list_empty(list)))
+			page =3D pcp_rmqueue_smallest(pcp, migratetype, order);
+			if (unlikely(!page))
 				return NULL;
 		}
-
-		page =3D list_first_entry(list, struct page, pcp_list);
-		list_del(&page->pcp_list);
-		pcp->count -=3D 1 << order;
 	} while (check_new_pages(page, order));
=20
 	return page;
@@ -3296,7 +3692,6 @@ static struct page *rmqueue_pcplist(struct zone *pref=
erred_zone,
 			int migratetype, unsigned int alloc_flags)
 {
 	struct per_cpu_pages *pcp;
-	struct list_head *list;
 	struct page *page;
=20
 	/* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
@@ -3310,8 +3705,7 @@ static struct page *rmqueue_pcplist(struct zone *pref=
erred_zone,
 	 * frees.
 	 */
 	pcp->free_count >>=3D 1;
-	list =3D &pcp->lists[order_to_pindex(migratetype, order)];
-	page =3D __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, li=
st);
+	page =3D __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp);
 	pcp_spin_unlock(pcp);
 	if (page) {
 		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
@@ -4989,7 +5383,6 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int =
preferred_nid,
 	struct zone *zone;
 	struct zoneref *z;
 	struct per_cpu_pages *pcp;
-	struct list_head *pcp_list;
 	struct alloc_context ac;
 	gfp_t alloc_gfp;
 	unsigned int alloc_flags =3D ALLOC_WMARK_LOW;
@@ -5084,7 +5477,6 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int =
preferred_nid,
 		goto failed;
=20
 	/* Attempt the batch allocation */
-	pcp_list =3D &pcp->lists[order_to_pindex(ac.migratetype, 0)];
 	while (nr_populated < nr_pages) {
=20
 		/* Skip existing pages */
@@ -5093,8 +5485,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int =
preferred_nid,
 			continue;
 		}
=20
-		page =3D __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags,
-								pcp, pcp_list);
+		page =3D __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags, pcp);
 		if (unlikely(!page)) {
 			/* Try and allocate at least one page */
 			if (!nr_account) {
@@ -5969,6 +6360,7 @@ static void per_cpu_pages_init(struct per_cpu_pages *=
pcp, struct per_cpu_zonesta
 	spin_lock_init(&pcp->lock);
 	for (pindex =3D 0; pindex < NR_PCP_LISTS; pindex++)
 		INIT_LIST_HEAD(&pcp->lists[pindex]);
+	INIT_LIST_HEAD(&pcp->owned_blocks);
=20
 	/*
 	 * Set batch and high values safe for a boot pageset. A true percpu
@@ -6167,7 +6559,45 @@ static int page_alloc_cpu_dead(unsigned int cpu)
=20
 	lru_add_drain_cpu(cpu);
 	mlock_drain_remote(cpu);
-	drain_pages(cpu);
+
+	/*
+	 * Mark the dead CPU's PCPs so concurrent frees don't
+	 * enqueue pages on them after the drain. Set the flag
+	 * under pcp->lock to serialize with trylock in the free
+	 * path. Stale ownership entries in pageblock_data are
+	 * harmless: frees check PCPF_CPU_DEAD and fall back to zone,
+	 * and rmqueue_bulk will reclaim the blocks for live CPUs.
+	 */
+	for_each_populated_zone(zone) {
+		unsigned long zflags;
+		struct per_cpu_pages *pcp;
+
+		pcp =3D per_cpu_ptr(zone->per_cpu_pageset, cpu);
+
+		pcp_spin_lock_nopin(pcp);
+		pcp->flags |=3D PCPF_CPU_DEAD;
+		pcp_spin_unlock_nopin(pcp);
+
+		drain_pages_zone(cpu, zone);
+
+		/*
+		 * Drain released all pages. Reinitialize the
+		 * owned-blocks list -- any remaining entries are
+		 * stale (fragments that merged in zone buddy and
+		 * cleared ownership, but weren't removed from
+		 * the list because __free_one_page doesn't hold
+		 * pcp->lock).
+		 *
+		 * Hold zone lock to prevent racing with other
+		 * CPUs doing list_del_init on stale entries
+		 * from this list during their Phase 1.
+		 */
+		pcp_spin_lock_nopin(pcp);
+		spin_lock_irqsave(&zone->lock, zflags);
+		INIT_LIST_HEAD(&pcp->owned_blocks);
+		spin_unlock_irqrestore(&zone->lock, zflags);
+		pcp_spin_unlock_nopin(pcp);
+	}
=20
 	/*
 	 * Spill the event counters of the dead processor
@@ -6196,8 +6626,16 @@ static int page_alloc_cpu_online(unsigned int cpu)
 {
 	struct zone *zone;
=20
-	for_each_populated_zone(zone)
+	for_each_populated_zone(zone) {
+		struct per_cpu_pages *pcp;
+
+		pcp =3D per_cpu_ptr(zone->per_cpu_pageset, cpu);
+		pcp_spin_lock_nopin(pcp);
+		pcp->flags &=3D ~PCPF_CPU_DEAD;
+		pcp_spin_unlock_nopin(pcp);
+
 		zone_pcp_update(zone, 1);
+	}
 	return 0;
 }
=20
--=20
2.54.0
From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id CA92D3EFFA0
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:38 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289243; cv=none;
 b=P8YEbTBRs04MspHE38KrZMYeS/zQArnBXh0EMXBOiY6RsucAGq+JJpfbRQ9rDlXy608PtZ8bzYxYdQDbq544dx0DrC86Z33wnxh6ZI6xX+52YiCjdjNDzT+Ufyt1yzy0zZqW8eEEFQW/qF8j5sGyFV2sWF6e+gZejlgHbwbbWLE=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289243; c=relaxed/simple;
	bh=m5MKjlDPojW5mMuVNS6/udSvB32X7mZ5yMNE2+YRDyo=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=tQu9XfwxEOn1RaLEoLiOb2ZhDvzm64RDumI05n0GQMnXkd9nAOvSBdI5ixftc0Z+t7YTZlJy+UOlMGVp1Ze3vI3YAuRb4jUMXOUv0bjXPjUikHXb7S9Ceen+1JPYJ0PNpxOOSYVdXlSfjEIER3zS0wZqJM2ZUpuXjbNjFZJh56s=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=JljjMHsj; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="JljjMHsj"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:MIME-Version:References:In-Reply-To:
	Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-Type:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=Z/qj+KWI7h9JqJhNq4BxI6ZWMV4VhY1/AXzq1+33pR8=; b=JljjMHsjzfjFl41mysnmQ4YPui
	WwF5xRbeUYxYn0fHLSJFHja4dAlFQORwSTO+1qHgIyhDk437Q3g6Hi2uJHpt7EYp+NJ35oBzoqOhj
	tERvokdbIWMRRY+/E0sTto3XzjcLSD2BVoqB+15Ov3T1dyvBeWfbw779ICcOquCw3HsURR3NUaij4
	E7HJy9jGspfLZILp8cBANa55xskwfSvMmBsXpFKc02shXc7tRdFQB19tYd/GCddrl4eWosWgH1w7X
	DdxkoaTD3e2KdWkVehdurLQegmAegjepGWrnRFSuxg+x7Vhsl1joZW6dczIr6qG/Hk0RIHX4AlrJi
	8WB5OKfw==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPM-0000000024Q-0Moh;
	Wed, 20 May 2026 11:00:28 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 03/40] mm: page_alloc: split-path PCP free with
 local-trylock + remote-llist
Date: Wed, 20 May 2026 10:59:09 -0400
Message-ID: <20260520150018.2491267-4-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

The page allocator's PCP free path needs lock-inversion protection
against zone->lock.  The natural form -- always take pcp->lock with
spin_lock -- can deadlock because callers may hold locks (e.g.
xa_lock via slab/stack_depot) that are also taken in hardirq context,
and pcp->lock is acquired with IRQs enabled on the allocation side.

A coarse fix is to use spin_trylock and fall back to free_one_page()
(direct zone-buddy free) on contention.  That removes the inversion
risk but defeats the per-CPU pageset benefits on a busy multi-CPU
system: many frees take the slow zone->lock path, and the per-CPU
pcp->count visible to allocators understates real free-page
availability for the remote CPU's pageset.

Replace the trylock-fallback with a per-CPU remote free list (llist)
consumed by the owning CPU.  Local frees still use the trylock path;
remote frees push onto the target's lockless llist; the owning CPU
absorbs the queued pages back onto its PCP buddy lists at the next
opportunity.  Result: zero lock-inversion risk, no zone->lock
fallback storm, and remote frees become near-free at the freer's
side.

Mechanics:

  - per_cpu_pages gains struct llist_head free_llist.
  - absorb_remote_frees(pcp) drains the llist into the local PCP buddy
    lists. Called from pcp_rmqueue_smallest(), free_pcppages_bulk(),
    and drain_pages_zone().
  - __free_frozen_pages and free_unref_folios are split into a local
    path (spin_trylock on pcp->lock; on success enqueue locally) and
    a remote path (llist_add to the target CPU's free_llist).
  - The local-side spin_trylock no longer takes irqsave: lockdep
    analysis showed no IRQ-context caller of the local PCP free path
    that is also a holder of pcp->lock; the remote-from-IRQ case
    routes through llist_add (NMI-safe).
  - CPU hotplug: page_alloc_cpu_dead drains the dead CPU's PCP via the
    existing drain_pages_zone (which now also drains the llist via
    absorb_remote_frees). For the narrow race where a remote freer
    raced with PCPF_CPU_DEAD being set and pushed onto the dead PCP's
    llist after the drain, page_alloc_cpu_online absorbs any stranded
    pages.
  - page_alloc_cpu_dead detaches every entry from owned_blocks via
    list_del_init before reinitializing the list head.  A simpler
    INIT_LIST_HEAD-only form leaves owned PB entries with stale
    ->prev/->next pointing at the dead head -- they get list_del()'d
    later by clear_pcpblock_owner() under zone->lock, corrupting
    whatever now happens to be at the dead head address.  A
    stress-test reproducer surfaced this as a list_del prev->next =3D=3D
    prev WARN.

QEMU stress (234K worker iters + 5 hotplug cycles + 30 hugepages):
zero WARN/BUG.  Bare-metal test machine ran for ~14 hours under
production-style load with no list_del corruption, no WARN, no panic.

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 include/linux/mmzone.h |   9 ++
 mm/page_alloc.c        | 249 ++++++++++++++++++++++++++++++-----------
 2 files changed, 193 insertions(+), 65 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index f0eb16390906..732e4dd181b9 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -843,6 +843,15 @@ struct per_cpu_pages {
 	/* Pageblocks owned by this CPU, for fragment recovery */
 	struct list_head owned_blocks;
=20
+	/*
+	 * Pages remotely freed by other CPUs into pageblocks owned by
+	 * this CPU. Lock-free push by remote freers via llist_add(); the
+	 * owning CPU drains and merges them into its PCP buddy lists at
+	 * convenient moments (start of pcp_rmqueue_smallest, drain
+	 * paths) under pcp->lock.
+	 */
+	struct llist_head free_llist;
+
 	/* Lists of pages, one per migrate type stored on the pcp-lists */
 	struct list_head lists[NR_PCP_LISTS];
 } ____cacheline_aligned_in_smp;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a3448a97bab2..47d314e77151 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1482,6 +1482,8 @@ bool free_pages_prepare(struct page *page, unsigned i=
nt order)
 	return __free_pages_prepare(page, order, FPI_NONE);
 }
=20
+static void absorb_remote_frees(struct per_cpu_pages *pcp);
+
 /*
  * Free PCP pages to zone buddy. First does a bottom-up merge pass
  * over PagePCPBuddy entries under pcp->lock only (already held by
@@ -1502,6 +1504,13 @@ static void free_pcppages_bulk(struct zone *zone, in=
t count,
 	struct page *page;
 	int mt, pindex;
=20
+	/*
+	 * Pull in any pages remotely freed to our pageblocks before the
+	 * merge pass -- they participate in merging just like locally
+	 * freed pages.
+	 */
+	absorb_remote_frees(pcp);
+
 	/*
 	 * Ensure proper count is passed which otherwise would stuck in the
 	 * below while (list_empty(list)) loop.
@@ -1596,6 +1605,45 @@ static void free_pcppages_bulk(struct zone *zone, in=
t count,
 	spin_unlock_irqrestore(&zone->lock, flags);
 }
=20
+/*
+ * Absorb pages remotely freed into this CPU's pageblocks. Remote freers
+ * push pages onto pcp->free_llist lock-free (no remote PCP lock taken);
+ * the owning CPU pulls them onto its PCP buddy lists here, where they
+ * become eligible for normal merging on the next free_pcppages_bulk()
+ * pass.
+ *
+ * Called with pcp->lock held. Must be cheap on the empty path; the
+ * llist_empty() check is the fast-path bail-out.
+ */
+static void absorb_remote_frees(struct per_cpu_pages *pcp)
+{
+	struct llist_node *node;
+	struct page *p, *tmp;
+	int absorbed =3D 0;
+
+	if (likely(llist_empty(&pcp->free_llist)))
+		return;
+
+	node =3D llist_del_all(&pcp->free_llist);
+	llist_for_each_entry_safe(p, tmp, node, pcp_llist) {
+		unsigned long pfn =3D page_to_pfn(p);
+		unsigned int order =3D pcp_buddy_order(p);
+		int mt =3D pbd_migratetype(pfn_to_pageblock(p, pfn));
+
+		if (unlikely(mt >=3D MIGRATE_PCPTYPES))
+			mt =3D MIGRATE_MOVABLE;
+
+		/*
+		 * Pages on the llist came from pageblocks owned by this CPU
+		 * (that's how the freer picked our llist), so they are
+		 * eligible for PCP-buddy merging.
+		 */
+		__SetPagePCPBuddy(p);
+		pcp_enqueue(pcp, p, mt, order);
+		absorbed +=3D 1 << order;
+	}
+}
+
 /*
  * Search PCP free lists for a page of at least the requested order.
  * If found at a higher order, split and place remainders on PCP lists.
@@ -1606,6 +1654,8 @@ static struct page *pcp_rmqueue_smallest(struct per_c=
pu_pages *pcp,
 {
 	unsigned int high;
=20
+	absorb_remote_frees(pcp);
+
 	for (high =3D order; high <=3D pageblock_order; high++) {
 		struct list_head *list;
 		unsigned long size;
@@ -2884,6 +2934,7 @@ static void drain_pages_zone(unsigned int cpu, struct=
 zone *zone)
=20
 	do {
 		pcp_spin_lock_nopin(pcp);
+		absorb_remote_frees(pcp);
 		count =3D pcp->count;
 		if (count) {
 			int to_drain =3D min(count,
@@ -3247,11 +3298,22 @@ static void __free_frozen_pages(struct page *page, =
unsigned int order,
 	}
=20
 	/*
-	 * Route page to the owning CPU's PCP for merging, or to
-	 * the local PCP for batching (zone-owned pages). Zone-owned
-	 * pages are cached without PagePCPBuddy -- the merge pass
-	 * skips them, so they're inert on any PCP list and drain
-	 * individually to zone buddy.
+	 * Route the page based on pageblock ownership:
+	 *
+	 *  - owner_cpu =3D=3D this CPU (or no owner): take the local PCP
+	 *    lock with spin_trylock and enqueue normally. The trylock
+	 *    fails only on rare local self re-entry (IRQ/NMI fires
+	 *    while the interrupted task already holds the lock) or
+	 *    while a remote drain is active; either way, fall back to
+	 *    free_one_page (or the zone-llist for FPI_TRYLOCK). No
+	 *    irqsave: the trylock cannot block on self, and remote
+	 *    CPUs never take this pcp->lock (they go via free_llist),
+	 *    so an interruption cannot deadlock against another freer.
+	 *
+	 *  - owner_cpu !=3D this CPU: lock-free push onto the owner's
+	 *    free_llist. The owner absorbs the page into its PCP buddy
+	 *    lists at its next alloc/drain. No remote PCP lock taken,
+	 *    so no cross-CPU contention.
 	 *
 	 * Ownership is stable here: it can only change when the
 	 * pageblock is complete -- either fully free in zone buddy
@@ -3259,31 +3321,46 @@ static void __free_frozen_pages(struct page *page, =
unsigned int order,
 	 * Since we hold this page, neither can happen.
 	 */
 	owner_cpu =3D pbd->cpu - 1;
-	cache_cpu =3D owner_cpu;
-	if (cache_cpu < 0)
-		cache_cpu =3D raw_smp_processor_id();
+	cache_cpu =3D raw_smp_processor_id();
+
+	if (owner_cpu < 0 || owner_cpu =3D=3D cache_cpu) {
+		pcp =3D per_cpu_ptr(zone->per_cpu_pageset, cache_cpu);
=20
-	pcp =3D per_cpu_ptr(zone->per_cpu_pageset, cache_cpu);
-	if (unlikely(fpi_flags & FPI_TRYLOCK) || !in_task()) {
 		if (!spin_trylock(&pcp->lock)) {
+			if (fpi_flags & FPI_TRYLOCK)
+				add_page_to_zone_llist(zone, page, order);
+			else
+				free_one_page(zone, page, pfn, order, fpi_flags);
+			return;
+		}
+
+		if (unlikely(pcp->flags & PCPF_CPU_DEAD)) {
+			spin_unlock(&pcp->lock);
 			free_one_page(zone, page, pfn, order, fpi_flags);
 			return;
 		}
-	} else {
-		spin_lock(&pcp->lock);
+
+		if (free_frozen_page_commit(zone, pcp, page, migratetype,
+					    order, fpi_flags,
+					    owner_cpu =3D=3D cache_cpu))
+			spin_unlock(&pcp->lock);
+		/* If commit returned false, pcp was already unlocked
+		 * (migration or trylock failure inside the batched-free
+		 * loop). */
+		return;
 	}
=20
-	if (unlikely(pcp->flags & PCPF_CPU_DEAD)) {
-		spin_unlock(&pcp->lock);
+	/* Remote owner: lock-free llist hand-off. */
+	pcp =3D per_cpu_ptr(zone->per_cpu_pageset, owner_cpu);
+
+	if (unlikely(READ_ONCE(pcp->flags) & PCPF_CPU_DEAD)) {
 		free_one_page(zone, page, pfn, order, fpi_flags);
 		return;
 	}
=20
-	if (free_frozen_page_commit(zone, pcp, page, migratetype, order,
-				    fpi_flags, cache_cpu =3D=3D owner_cpu))
-		spin_unlock(&pcp->lock);
-	/* If commit returned false, pcp was already unlocked (migration or
-	 * trylock failure inside the batched-free loop). */
+	set_pcp_order(page, order);
+	llist_add(&page->pcp_llist, &pcp->free_llist);
+	__count_vm_events(PGFREE, 1 << order);
 }
=20
 void free_frozen_pages(struct page *page, unsigned int order)
@@ -3335,60 +3412,78 @@ void free_unref_folios(struct folio_batch *folios)
 		struct zone *zone =3D folio_zone(folio);
 		unsigned long pfn =3D folio_pfn(folio);
 		unsigned int order =3D (unsigned long)folio->private;
+		struct per_cpu_pages *remote_pcp;
 		struct pageblock_data *pbd;
 		int migratetype;
-		int owner_cpu, cache_cpu;
+		int owner_cpu;
=20
 		folio->private =3D NULL;
 		pbd =3D pfn_to_pageblock(&folio->page, pfn);
 		migratetype =3D pbd_migratetype(pbd);
 		owner_cpu =3D pbd->cpu - 1;
-		cache_cpu =3D owner_cpu;
-		if (cache_cpu < 0)
-			cache_cpu =3D raw_smp_processor_id();
=20
-		/*
-		 * Re-lock needed if zone changed, page is isolate,
-		 * or target CPU changed.
-		 */
-		if (zone !=3D locked_zone ||
-		    is_migrate_isolate(migratetype) ||
-		    cache_cpu !=3D locked_cpu) {
+		/* Isolated pages always go directly to the zone buddy. */
+		if (unlikely(is_migrate_isolate(migratetype))) {
 			if (pcp) {
 				spin_unlock(&pcp->lock);
+				pcp =3D NULL;
 				locked_zone =3D NULL;
 				locked_cpu =3D -1;
-				pcp =3D NULL;
 			}
+			free_one_page(zone, &folio->page, pfn,
+				      order, FPI_NONE);
+			continue;
+		}
=20
-			/*
-			 * Free isolated pages directly to the
-			 * allocator, see comment in free_frozen_pages.
-			 */
-			if (is_migrate_isolate(migratetype)) {
+		if (locked_cpu < 0)
+			locked_cpu =3D raw_smp_processor_id();
+
+		/*
+		 * Remote owner: lock-free push onto the owner's free_llist.
+		 * Drop any local PCP lock first; the remote llist needs no
+		 * lock and the next folio may belong to a different owner.
+		 */
+		if (owner_cpu >=3D 0 && owner_cpu !=3D locked_cpu) {
+			if (pcp) {
+				spin_unlock(&pcp->lock);
+				pcp =3D NULL;
+				locked_zone =3D NULL;
+			}
+			remote_pcp =3D per_cpu_ptr(zone->per_cpu_pageset,
+						 owner_cpu);
+			if (unlikely(READ_ONCE(remote_pcp->flags) &
+				     PCPF_CPU_DEAD)) {
 				free_one_page(zone, &folio->page, pfn,
 					      order, FPI_NONE);
 				continue;
 			}
+			set_pcp_order(&folio->page, order);
+			llist_add(&folio->page.pcp_llist,
+				  &remote_pcp->free_llist);
+			__count_vm_events(PGFREE, 1 << order);
+			trace_mm_page_free_batched(&folio->page);
+			continue;
+		}
=20
-			pcp =3D per_cpu_ptr(zone->per_cpu_pageset,
-					  cache_cpu);
-			/*
-			 * Use trylock when not in task context (IRQ,
-			 * softirq) to avoid spinning with IRQs
-			 * disabled. In task context, spin -- brief
-			 * contention on a per-CPU lock beats the
-			 * unbatched zone->lock fallback.
-			 */
-			if (!in_task()) {
-				if (unlikely(!spin_trylock(&pcp->lock))) {
-					pcp =3D NULL;
-					free_one_page(zone, &folio->page, pfn,
-						      order, FPI_NONE);
-					continue;
-				}
-			} else {
-				spin_lock(&pcp->lock);
+		/*
+		 * Local owner (or unowned): take the local PCP lock with
+		 * spin_trylock. On failure (rare local re-entry or a remote
+		 * drain in progress) fall back to the zone buddy. No
+		 * irqsave -- trylock cannot block on self, and remote
+		 * CPUs never take this pcp->lock (they go via free_llist).
+		 */
+		if (zone !=3D locked_zone) {
+			if (pcp) {
+				spin_unlock(&pcp->lock);
+				pcp =3D NULL;
+				locked_zone =3D NULL;
+			}
+			pcp =3D per_cpu_ptr(zone->per_cpu_pageset, locked_cpu);
+			if (!spin_trylock(&pcp->lock)) {
+				pcp =3D NULL;
+				free_one_page(zone, &folio->page, pfn,
+					      order, FPI_NONE);
+				continue;
 			}
 			if (unlikely(pcp->flags & PCPF_CPU_DEAD)) {
 				spin_unlock(&pcp->lock);
@@ -3398,7 +3493,6 @@ void free_unref_folios(struct folio_batch *folios)
 				continue;
 			}
 			locked_zone =3D zone;
-			locked_cpu =3D cache_cpu;
 		}
=20
 		/*
@@ -3411,7 +3505,7 @@ void free_unref_folios(struct folio_batch *folios)
 		trace_mm_page_free_batched(&folio->page);
 		if (!free_frozen_page_commit(zone, pcp, &folio->page,
 				migratetype, order, FPI_NONE,
-				cache_cpu =3D=3D owner_cpu)) {
+				owner_cpu =3D=3D locked_cpu)) {
 			pcp =3D NULL;
 			locked_zone =3D NULL;
 			locked_cpu =3D -1;
@@ -6361,6 +6455,7 @@ static void per_cpu_pages_init(struct per_cpu_pages *=
pcp, struct per_cpu_zonesta
 	for (pindex =3D 0; pindex < NR_PCP_LISTS; pindex++)
 		INIT_LIST_HEAD(&pcp->lists[pindex]);
 	INIT_LIST_HEAD(&pcp->owned_blocks);
+	init_llist_head(&pcp->free_llist);
=20
 	/*
 	 * Set batch and high values safe for a boot pageset. A true percpu
@@ -6581,19 +6676,38 @@ static int page_alloc_cpu_dead(unsigned int cpu)
 		drain_pages_zone(cpu, zone);
=20
 		/*
-		 * Drain released all pages. Reinitialize the
-		 * owned-blocks list -- any remaining entries are
-		 * stale (fragments that merged in zone buddy and
-		 * cleared ownership, but weren't removed from
-		 * the list because __free_one_page doesn't hold
-		 * pcp->lock).
+		 * drain_pages_zone iterates absorb_remote_frees +
+		 * free_pcppages_bulk until both pcp->count and the
+		 * remote-free llist are empty. A remote freer that
+		 * read PCPF_CPU_DEAD as clear *before* the flag was set
+		 * above and does llist_add *after* the drain exits will
+		 * leave a few pages on the dead PCP's free_llist; they
+		 * are harmless and absorbed when the CPU comes back
+		 * online (any first alloc/free runs absorb_remote_frees).
 		 *
-		 * Hold zone lock to prevent racing with other
-		 * CPUs doing list_del_init on stale entries
-		 * from this list during their Phase 1.
+		 * Drain released all pages. Tear down the owned-blocks
+		 * list cleanly: walk each entry and list_del_init() it
+		 * before INIT_LIST_HEAD on the head. INIT_LIST_HEAD
+		 * alone would leave stale entries with prev/next
+		 * pointing at the (now self-pointing) head, so a future
+		 * clear_pcpblock_owner -> list_del_init on a stale
+		 * pbd->cpu_node would corrupt the list head it walks
+		 * back through. Detaching each entry first makes the
+		 * subsequent list_del_init a safe self-loop no-op.
+		 *
+		 * Hold zone lock to serialize with concurrent Phase 0
+		 * iteration on this same list from other CPUs (which
+		 * also hold zone->lock).
 		 */
 		pcp_spin_lock_nopin(pcp);
 		spin_lock_irqsave(&zone->lock, zflags);
+		while (!list_empty(&pcp->owned_blocks)) {
+			struct pageblock_data *pbd =3D
+				list_first_entry(&pcp->owned_blocks,
+						 struct pageblock_data,
+						 cpu_node);
+			list_del_init(&pbd->cpu_node);
+		}
 		INIT_LIST_HEAD(&pcp->owned_blocks);
 		spin_unlock_irqrestore(&zone->lock, zflags);
 		pcp_spin_unlock_nopin(pcp);
@@ -6632,6 +6746,11 @@ static int page_alloc_cpu_online(unsigned int cpu)
 		pcp =3D per_cpu_ptr(zone->per_cpu_pageset, cpu);
 		pcp_spin_lock_nopin(pcp);
 		pcp->flags &=3D ~PCPF_CPU_DEAD;
+		/*
+		 * Pull in any pages that landed on the free_llist while
+		 * the CPU was down (rare race in page_alloc_cpu_dead).
+		 */
+		absorb_remote_frees(pcp);
 		pcp_spin_unlock_nopin(pcp);
=20
 		zone_pcp_update(zone, 1);
--=20
2.54.0
From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id C4FF83EEAFF
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:37 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289241; cv=none;
 b=h9XUFQV01zmOpe2Kp1sCM8LchPLd47xps+/CFkEs5/Qp115uiRDINi90SR6wnlvx27hpKhqR/6n33pdnvZ6kUMGeXMm2S/eeLPGViD2U6Z43mKVz7bEVt9/ZG7Fowp09tosEjkkezvtSsVlGIGd0CszYwdRHyUHtdq2tDaVgwSk=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289241; c=relaxed/simple;
	bh=UwwuIzTzhgzjMlX2GxOnbYg2TVk0j9izsbPvHwhUwbs=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=FY1xn2rZwHW3To/JuhoCnB3NnEDvcb1m56/jgP9JVzBoxvvryC/uOaLpV6CxemcXkUFjaHcnBV28QyQPOVD5Dek0H7HYd3lhmOLQS/9Wzs6y8me0jPbN6Hlt9c/q3gXEbeyRzy0WHNgXItlRbekDvTnNy+p0lLCqtwBb9orTdWE=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=hvoqZOeC; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="hvoqZOeC"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:MIME-Version:References:In-Reply-To:
	Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-Type:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=1c9ByeJjCfG0gYJvGtiXQ9M/uTq5YaBvLV18O1MJ0UU=; b=hvoqZOeCLBvtxCldkLZmtJCI3C
	vkbEdJDYrqh8kpeznCytOLOHIZOliqZlJyNZbgbxYe91iqQkd2biXISW8EiEczS2hLPuiRbopm408
	cEYPlcyvs/1U0hI+iKCAPqZhgvTI2uNjuy0kE3004UXGzZAYuVE0VmGf6aRFD1/lpfRuMIKng/FKK
	hWwPc1X3+5iC5BNoE4Xg9gPCUxhdvrMe+LiuifKYFS4pf+ePmN+Q85nGCEnQpzm3gZRtlRQ/FYzrF
	Nha6i8e55ePs8+CWc2bnIoi7AcPruD0KOBeJL4Xzi0zb2Ps7NOkw3x9qsIMrLvw5awqMD1QlOrJf5
	I0DCVcYg==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPM-0000000024Q-0T4O;
	Wed, 20 May 2026 11:00:28 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 04/40] mm: mm_init: fix zone assignment for pages in
 unavailable ranges
Date: Wed, 20 May 2026 10:59:10 -0400
Message-ID: <20260520150018.2491267-5-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

init_unavailable_range() initializes struct pages for memory holes between
memblock regions.  It receives a zone ID from its caller, but that zone ID
is simply the last zone processed in memmap_init()'s iteration -- it does
not necessarily match the zone that actually spans each PFN in the hole.

When an unavailable range straddles a zone boundary (e.g. a hole between
DMA32 and Normal), all pages in the hole get tagged with the wrong zone in
page->flags.  Any later page_zone() call on such a page returns the wrong
zone, which can cause accounting confusion or crashes when code assumes the
returned zone is valid for that page.

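Fix by checking, for each PFN in the hole, whether the current zone
spans it and, if not, searching the node's zones for one that does.
This is init-only code that runs once at boot, and the search is only
repeated when a PFN falls outside the zone last used, so the cost is
negligible.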

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 mm/mm_init.c | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/mm/mm_init.c b/mm/mm_init.c
index 1bc909da9c13..47a222e49fc9 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -834,9 +834,27 @@ static void __init init_unavailable_range(unsigned lon=
g spfn,
 {
 	unsigned long pfn;
 	u64 pgcnt =3D 0;
+	pg_data_t *pgdat =3D NODE_DATA(node);
+	int zid =3D zone;
=20
 	for_each_valid_pfn(pfn, spfn, epfn) {
-		__init_single_page(pfn_to_page(pfn), pfn, zone, node);
+		/*
+		 * The caller's zone may not match the PFN when unavailable
+		 * ranges straddle zone boundaries.  Look up the correct zone
+		 * so page->flags encodes the right zone for page_zone().
+		 */
+		if (!zone_spans_pfn(&pgdat->node_zones[zid], pfn)) {
+			int z;
+
+			for (z =3D 0; z < MAX_NR_ZONES; z++) {
+				if (zone_spans_pfn(&pgdat->node_zones[z], pfn)) {
+					zid =3D z;
+					break;
+				}
+			}
+		}
+
+		__init_single_page(pfn_to_page(pfn), pfn, zid, node);
 		__SetPageReserved(pfn_to_page(pfn));
 		pgcnt++;
 	}
--=20
2.54.0
From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id C50DA3EF66D
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:38 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289247; cv=none;
 b=MsMGVKAw3EI7ZVhdgzm+W8Ch1ZvBSDKlZTE/uKi0Fw15iepvH3hXrUVnx1UeVo9fT6g3hQwEFABox9lWw09mDLbz4IlbHfRIAamFxXK4Oh1Otlv7dy/NA9gtmTpq+q7QUj/pDRWjIv1f6xVQazMPW8dMQoGO4oC5zysLiPkdyxA=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289247; c=relaxed/simple;
	bh=dT91fCD46jHk/D4aoLw96tr/I0JsH4B77kBlTMdAzqY=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=WxYQtN9ZdrFX+Ac9z/Ig+CuabgqupvcVIINIulpgyvVoPeZDJlQ9R8WK1EuvPoyc3eepa3CKkYv8j6jNSQwxvMZVGTk0tND5UJb+0MZlTA0W1LBAdsjaXeOMgc6EfCL1h7PKn5+x+xNGEYNh1xe0GpTY3djJQthtOKikHv5Prk8=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=VUMd8V68; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="VUMd8V68"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:MIME-Version:References:In-Reply-To:
	Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-Type:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=KnbjIHq5zc+FyT7SFjp3JWUrqZg3k8IbbEuFyeESKjg=; b=VUMd8V68HOoJFocIxrt6TVgYSL
	iMa9qJyEBDFu47Cf7UrNAam9P1eAFW0A/O5Ii4h++vjqKS/huRwYxPgSGxdvsgaYo5SgpvCqEwO1M
	peHVAy2BKixoXayXieZMo31UQNX+9l5U9U6lJ+DCQTqbXaZmjnh+DfzLNjayvRWrF72JeYpHetOwI
	a5BdvuLuiCUZ/JW2eObRBWKYEWdL8+2lwnnDI+VHeAI8qc6ElV8txlEXF9r+QGOcxT3yRSjhZt5sR
	yEv2mewCpFkuwuq+EmHni5qdfZg8Pv1Low/P2wQz4MvyDLG+KbayvuOUj989cRht84WnhoNC9yoOy
	LO/6CEoA==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPM-0000000024Q-0YZP;
	Wed, 20 May 2026 11:00:28 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 05/40] mm: page_alloc: remove watermark boost mechanism
Date: Wed, 20 May 2026 10:59:11 -0400
Message-ID: <20260520150018.2491267-6-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

watermark_boost was introduced to react to fragmentation events at the
pageblock granularity: a sub-pageblock cross-type fallback would raise
the zone watermark and wake kswapd, on the theory that reclaiming some
order-0 pages would reduce future fallbacks.

With superpageblocks, anti-fragmentation is enforced at 1 GiB SPB
granularity, and the meaningful signals (CLEAN->TAINT events, empty SPB
count) live there.  Sub-pageblock fallbacks inside an already-tainted
SPB do not change the fragmentation picture, and order-0 reclaim does
not unmix a pageblock or surface a fresh clean SPB.

Worse, the boost is applied in try_to_claim_block() before it is known
whether the claim will succeed.  When option 1 (no UNMOVABLE/RECLAIMABLE
pageblock mixing) rejects a cross-type relabel, the boost has already
been applied, and the next rmqueue() wakes kswapd to reclaim until free
memory reaches high+boost -- even when free pages are already tens of
times the high watermark.  Real workloads showed bursts of more than
150 wakeup_kswapd() calls per minute, all order-0, with stack traces
consistently coming from the ZONE_BOOSTED_WATERMARK test-and-clear at
the end of rmqueue().  Free memory at the time was 38x the high
watermark.

Drop the mechanism entirely:

  - boost_watermark() and its callsite in try_to_claim_block()
  - the ZONE_BOOSTED_WATERMARK flag and its set/clear in rmqueue()
  - zone->watermark_boost and the boost addend in wmark_pages()
  - the __GFP_HIGH boost-bypass path in zone_watermark_fast()
  - the watermark_boost_factor sysctl
  - boost-aware logic in balance_pgdat() (nr_boost_reclaim,
    zone_boosts[], pgdat_watermark_boosted, the boost-restart goto,
    no-writeback for boost reclaim, the boost-only kcompactd wakeup)

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 Documentation/admin-guide/sysctl/vm.rst |  21 -----
 Documentation/mm/physical_memory.rst    |  13 +--
 include/linux/mmzone.h                  |   6 +-
 mm/page_alloc.c                         |  82 +----------------
 mm/show_mem.c                           |   2 -
 mm/vmscan.c                             | 115 ++----------------------
 mm/vmstat.c                             |   2 -
 7 files changed, 14 insertions(+), 227 deletions(-)

diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-=
guide/sysctl/vm.rst
index 97e12359775c..3ddc6115c89a 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -76,7 +76,6 @@ files can be found in mm/swap.c.
 - user_reserve_kbytes
 - vfs_cache_pressure
 - vfs_cache_pressure_denom
-- watermark_boost_factor
 - watermark_scale_factor
 - zone_reclaim_mode
=20
@@ -1073,26 +1072,6 @@ vfs_cache_pressure_denom
 Defaults to 100 (minimum allowed value). Requires corresponding
 vfs_cache_pressure setting to take effect.
=20
-watermark_boost_factor
-=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
-
-This factor controls the level of reclaim when memory is being fragmented.
-It defines the percentage of the high watermark of a zone that will be
-reclaimed if pages of different mobility are being mixed within pageblocks.
-The intent is that compaction has less work to do in the future and to
-increase the success rate of future high-order allocations such as SLUB
-allocations, THP and hugetlbfs pages.
-
-To make it sensible with respect to the watermark_scale_factor
-parameter, the unit is in fractions of 10,000. The default value of
-15,000 means that up to 150% of the high watermark will be reclaimed in the
-event of a pageblock being mixed due to fragmentation. The level of reclaim
-is determined by the number of fragmentation events that occurred in the
-recent past. If this value is smaller than a pageblock then a pageblocks
-worth of pages will be reclaimed (e.g.  2MB on 64-bit x86). A boost factor
-of 0 will disable the feature.
-
-
 watermark_scale_factor
 =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
=20
diff --git a/Documentation/mm/physical_memory.rst b/Documentation/mm/physic=
al_memory.rst
index b76183545e5b..c4968db6e77c 100644
--- a/Documentation/mm/physical_memory.rst
+++ b/Documentation/mm/physical_memory.rst
@@ -394,11 +394,6 @@ General
   to the distance between two watermarks. The distance itself is calculated
   taking ``vm.watermark_scale_factor`` sysctl into account.
=20
-``watermark_boost``
-  The number of pages which are used to boost watermarks to increase recla=
im
-  pressure to reduce the likelihood of future fallbacks and wake kswapd now
-  as the node may be balanced overall and kswapd will not wake naturally.
-
 ``nr_reserved_highatomic``
   The number of pages which are reserved for high-order atomic allocations.
=20
@@ -527,11 +522,9 @@ General
   Defined only when ``CONFIG_UNACCEPTED_MEMORY`` is enabled.
=20
 ``flags``
-  The zone flags. The least three bits are used and defined by
-  ``enum zone_flags``. ``ZONE_BOOSTED_WATERMARK`` (bit 0): zone recently b=
oosted
-  watermarks. Cleared when kswapd is woken. ``ZONE_RECLAIM_ACTIVE`` (bit 1=
):
-  kswapd may be scanning the zone. ``ZONE_BELOW_HIGH`` (bit 2): zone is be=
low
-  high watermark.
+  The zone flags. The bits are defined by ``enum zone_flags``.
+  ``ZONE_RECLAIM_ACTIVE`` (bit 0): kswapd may be scanning the zone.
+  ``ZONE_BELOW_HIGH`` (bit 1): zone is below high watermark.
=20
 ``lock``
   The main lock that protects the internal data structures of the page all=
ocator
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 732e4dd181b9..13e29b2ebb86 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -978,7 +978,6 @@ struct zone {
=20
 	/* zone watermarks, access with *_wmark_pages(zone) macros */
 	unsigned long _watermark[NR_WMARK];
-	unsigned long watermark_boost;
=20
 	unsigned long nr_reserved_highatomic;
 	unsigned long nr_free_highatomic;
@@ -1167,9 +1166,6 @@ enum pgdat_flags {
 };
=20
 enum zone_flags {
-	ZONE_BOOSTED_WATERMARK,		/* zone recently boosted watermarks.
-					 * Cleared when kswapd is woken.
-					 */
 	ZONE_RECLAIM_ACTIVE,		/* kswapd may be scanning the zone. */
 	ZONE_BELOW_HIGH,		/* zone is below high watermark. */
 };
@@ -1177,7 +1173,7 @@ enum zone_flags {
 static inline unsigned long wmark_pages(const struct zone *z,
 					enum zone_watermarks w)
 {
-	return z->_watermark[w] + z->watermark_boost;
+	return z->_watermark[w];
 }
=20
 static inline unsigned long min_wmark_pages(const struct zone *z)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 47d314e77151..6e01e58aca54 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -267,7 +267,6 @@ const char * const migratetype_names[MIGRATE_TYPES] =3D=
 {
=20
 int min_free_kbytes =3D 1024;
 int user_min_free_kbytes =3D -1;
-static int watermark_boost_factor __read_mostly =3D 15000;
 static int watermark_scale_factor =3D 10;
 int defrag_mode;
=20
@@ -2340,43 +2339,6 @@ bool pageblock_unisolate_and_move_free_pages(struct =
zone *zone, struct page *pag
=20
 #endif /* CONFIG_MEMORY_ISOLATION */
=20
-static inline bool boost_watermark(struct zone *zone)
-{
-	unsigned long max_boost;
-
-	if (!watermark_boost_factor)
-		return false;
-	/*
-	 * Don't bother in zones that are unlikely to produce results.
-	 * On small machines, including kdump capture kernels running
-	 * in a small area, boosting the watermark can cause an out of
-	 * memory situation immediately.
-	 */
-	if ((pageblock_nr_pages * 4) > zone_managed_pages(zone))
-		return false;
-
-	max_boost =3D mult_frac(zone->_watermark[WMARK_HIGH],
-			watermark_boost_factor, 10000);
-
-	/*
-	 * high watermark may be uninitialised if fragmentation occurs
-	 * very early in boot so do not boost. We do not fall
-	 * through and boost by pageblock_nr_pages as failing
-	 * allocations that early means that reclaim is not going
-	 * to help and it may even be impossible to reclaim the
-	 * boosted watermark resulting in a hang.
-	 */
-	if (!max_boost)
-		return false;
-
-	max_boost =3D max(pageblock_nr_pages, max_boost);
-
-	zone->watermark_boost =3D min(zone->watermark_boost + pageblock_nr_pages,
-		max_boost);
-
-	return true;
-}
-
 /*
  * When we are falling back to another migratetype during allocation, shou=
ld we
  * try to claim an entire block to satisfy further allocations, instead of
@@ -2477,14 +2439,6 @@ try_to_claim_block(struct zone *zone, struct page *p=
age,
 		return page;
 	}
=20
-	/*
-	 * Boost watermarks to increase reclaim pressure to reduce the
-	 * likelihood of future fallbacks. Wake kswapd now as the node
-	 * may be balanced overall and kswapd will not wake naturally.
-	 */
-	if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD))
-		set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
-
 	/* moving whole block can fail due to zone boundary conditions */
 	if (!prep_move_freepages_block(zone, page, &start_pfn, &free_pages,
 				       &movable_pages))
@@ -3839,13 +3793,6 @@ struct page *rmqueue(struct zone *preferred_zone,
 							migratetype);
=20
 out:
-	/* Separate test+clear to avoid unnecessary atomics */
-	if ((alloc_flags & ALLOC_KSWAPD) &&
-	    unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) {
-		clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
-		wakeup_kswapd(zone, 0, 0, zone_idx(zone));
-	}
-
 	VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
 	return page;
 }
@@ -4123,24 +4070,8 @@ static inline bool zone_watermark_fast(struct zone *=
z, unsigned int order,
 			return true;
 	}
=20
-	if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
-					free_pages))
-		return true;
-
-	/*
-	 * Ignore watermark boosting for __GFP_HIGH order-0 allocations
-	 * when checking the min watermark. The min watermark is the
-	 * point where boosting is ignored so that kswapd is woken up
-	 * when below the low watermark.
-	 */
-	if (unlikely(!order && (alloc_flags & ALLOC_MIN_RESERVE) && z->watermark_=
boost
-		&& ((alloc_flags & ALLOC_WMARK_MASK) =3D=3D WMARK_MIN))) {
-		mark =3D z->_watermark[WMARK_MIN];
-		return __zone_watermark_ok(z, order, mark, highest_zoneidx,
-					alloc_flags, free_pages);
-	}
-
-	return false;
+	return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
+					free_pages);
 }
=20
 #ifdef CONFIG_NUMA
@@ -6919,7 +6850,6 @@ static void __setup_per_zone_wmarks(void)
 			    mult_frac(zone_managed_pages(zone),
 				      watermark_scale_factor, 10000));
=20
-		zone->watermark_boost =3D 0;
 		zone->_watermark[WMARK_LOW]  =3D min_wmark_pages(zone) + tmp;
 		zone->_watermark[WMARK_HIGH] =3D low_wmark_pages(zone) + tmp;
 		zone->_watermark[WMARK_PROMO] =3D high_wmark_pages(zone) + tmp;
@@ -7187,14 +7117,6 @@ static const struct ctl_table page_alloc_sysctl_tabl=
e[] =3D {
 		.proc_handler	=3D min_free_kbytes_sysctl_handler,
 		.extra1		=3D SYSCTL_ZERO,
 	},
-	{
-		.procname	=3D "watermark_boost_factor",
-		.data		=3D &watermark_boost_factor,
-		.maxlen		=3D sizeof(watermark_boost_factor),
-		.mode		=3D 0644,
-		.proc_handler	=3D proc_dointvec_minmax,
-		.extra1		=3D SYSCTL_ZERO,
-	},
 	{
 		.procname	=3D "watermark_scale_factor",
 		.data		=3D &watermark_scale_factor,
diff --git a/mm/show_mem.c b/mm/show_mem.c
index 43aca5a2ac99..d08f1263480a 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -302,7 +302,6 @@ static void show_free_areas(unsigned int filter, nodema=
sk_t *nodemask, int max_z
 		printk(KERN_CONT
 			"%s"
 			" free:%lukB"
-			" boost:%lukB"
 			" min:%lukB"
 			" low:%lukB"
 			" high:%lukB"
@@ -325,7 +324,6 @@ static void show_free_areas(unsigned int filter, nodema=
sk_t *nodemask, int max_z
 			"\n",
 			zone->name,
 			K(zone_page_state(zone, NR_FREE_PAGES)),
-			K(zone->watermark_boost),
 			K(min_wmark_pages(zone)),
 			K(low_wmark_pages(zone)),
 			K(high_wmark_pages(zone)),
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bd1b1aa12581..461e70f9c9f0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -6883,30 +6883,6 @@ static void kswapd_age_node(struct pglist_data *pgda=
t, struct scan_control *sc)
 	} while (memcg);
 }
=20
-static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx)
-{
-	int i;
-	struct zone *zone;
-
-	/*
-	 * Check for watermark boosts top-down as the higher zones
-	 * are more likely to be boosted. Both watermarks and boosts
-	 * should not be checked at the same time as reclaim would
-	 * start prematurely when there is no boosting and a lower
-	 * zone is balanced.
-	 */
-	for (i =3D highest_zoneidx; i >=3D 0; i--) {
-		zone =3D pgdat->node_zones + i;
-		if (!managed_zone(zone))
-			continue;
-
-		if (zone->watermark_boost)
-			return true;
-	}
-
-	return false;
-}
-
 /*
  * Returns true if there is an eligible zone balanced for the request order
  * and highest_zoneidx
@@ -7111,14 +7087,13 @@ static int balance_pgdat(pg_data_t *pgdat, int orde=
r, int highest_zoneidx)
 	unsigned long nr_soft_reclaimed;
 	unsigned long nr_soft_scanned;
 	unsigned long pflags;
-	unsigned long nr_boost_reclaim;
-	unsigned long zone_boosts[MAX_NR_ZONES] =3D { 0, };
-	bool boosted;
 	struct zone *zone;
 	struct scan_control sc =3D {
 		.gfp_mask =3D GFP_KERNEL,
 		.order =3D order,
 		.may_unmap =3D 1,
+		.may_writepage =3D 1,
+		.may_swap =3D 1,
 	};
=20
 	set_task_reclaim_state(current, &sc.reclaim_state);
@@ -7127,18 +7102,6 @@ static int balance_pgdat(pg_data_t *pgdat, int order=
, int highest_zoneidx)
=20
 	count_vm_event(PAGEOUTRUN);
=20
-	/*
-	 * Account for the reclaim boost. Note that the zone boost is left in
-	 * place so that parallel allocations that are near the watermark will
-	 * stall or direct reclaim until kswapd is finished.
-	 */
-	nr_boost_reclaim =3D 0;
-	for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
-		nr_boost_reclaim +=3D zone->watermark_boost;
-		zone_boosts[i] =3D zone->watermark_boost;
-	}
-	boosted =3D nr_boost_reclaim;
-
 restart:
 	set_reclaim_active(pgdat, highest_zoneidx);
 	sc.priority =3D DEF_PRIORITY;
@@ -7173,39 +7136,14 @@ static int balance_pgdat(pg_data_t *pgdat, int orde=
r, int highest_zoneidx)
 		}
=20
 		/*
-		 * If the pgdat is imbalanced then ignore boosting and preserve
-		 * the watermarks for a later time and restart. Note that the
-		 * zone watermarks will be still reset at the end of balancing
-		 * on the grounds that the normal reclaim should be enough to
-		 * re-evaluate if boosting is required when kswapd next wakes.
+		 * If an eligible zone is balanced, no work to do. Note that
+		 * sc.reclaim_idx is not used as buffer_heads_over_limit may
+		 * have adjusted it.
 		 */
 		balanced =3D pgdat_balanced(pgdat, sc.order, highest_zoneidx);
-		if (!balanced && nr_boost_reclaim) {
-			nr_boost_reclaim =3D 0;
-			goto restart;
-		}
-
-		/*
-		 * If boosting is not active then only reclaim if there are no
-		 * eligible zones. Note that sc.reclaim_idx is not used as
-		 * buffer_heads_over_limit may have adjusted it.
-		 */
-		if (!nr_boost_reclaim && balanced)
+		if (balanced)
 			goto out;
=20
-		/* Limit the priority of boosting to avoid reclaim writeback */
-		if (nr_boost_reclaim && sc.priority =3D=3D DEF_PRIORITY - 2)
-			raise_priority =3D false;
-
-		/*
-		 * Do not writeback or swap pages for boosted reclaim. The
-		 * intent is to relieve pressure not issue sub-optimal IO
-		 * from reclaim context. If no pages are reclaimed, the
-		 * reclaim will be aborted.
-		 */
-		sc.may_writepage =3D !nr_boost_reclaim;
-		sc.may_swap =3D !nr_boost_reclaim;
-
 		/*
 		 * Do some background aging, to give pages a chance to be
 		 * referenced before reclaiming. All pages are rotated
@@ -7249,15 +7187,6 @@ static int balance_pgdat(pg_data_t *pgdat, int order=
, int highest_zoneidx)
 		 * progress in reclaiming pages
 		 */
 		nr_reclaimed =3D sc.nr_reclaimed - nr_reclaimed;
-		nr_boost_reclaim -=3D min(nr_boost_reclaim, nr_reclaimed);
-
-		/*
-		 * If reclaim made no progress for a boost, stop reclaim as
-		 * IO cannot be queued and it could be an infinite loop in
-		 * extreme circumstances.
-		 */
-		if (nr_boost_reclaim && !nr_reclaimed)
-			break;
=20
 		if (raise_priority || !nr_reclaimed)
 			sc.priority--;
@@ -7273,12 +7202,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order=
, int highest_zoneidx)
 		goto restart;
 	}
=20
-	/*
-	 * If the reclaim was boosted, we might still be far from the
-	 * watermark_high at this point. We need to avoid increasing the
-	 * failure count to prevent the kswapd thread from stopping.
-	 */
-	if (!sc.nr_reclaimed && !boosted) {
+	if (!sc.nr_reclaimed) {
 		int fail_cnt =3D atomic_inc_return(&pgdat->kswapd_failures);
 		/* kswapd context, low overhead to trace every failure */
 		trace_mm_vmscan_kswapd_reclaim_fail(pgdat->node_id, fail_cnt);
@@ -7287,28 +7211,6 @@ static int balance_pgdat(pg_data_t *pgdat, int order=
, int highest_zoneidx)
 out:
 	clear_reclaim_active(pgdat, highest_zoneidx);
=20
-	/* If reclaim was boosted, account for the reclaim done in this pass */
-	if (boosted) {
-		unsigned long flags;
-
-		for (i =3D 0; i <=3D highest_zoneidx; i++) {
-			if (!zone_boosts[i])
-				continue;
-
-			/* Increments are under the zone lock */
-			zone =3D pgdat->node_zones + i;
-			spin_lock_irqsave(&zone->lock, flags);
-			zone->watermark_boost -=3D min(zone->watermark_boost, zone_boosts[i]);
-			spin_unlock_irqrestore(&zone->lock, flags);
-		}
-
-		/*
-		 * As there is now likely space, wakeup kcompact to defragment
-		 * pageblocks.
-		 */
-		wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx);
-	}
-
 	snapshot_refaults(NULL, pgdat);
 	__fs_reclaim_release(_THIS_IP_);
 	psi_memstall_leave(&pflags);
@@ -7542,8 +7444,7 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags=
, int order,
=20
 	/* Hopeless node, leave it to direct reclaim if possible */
 	if (kswapd_test_hopeless(pgdat) ||
-	    (pgdat_balanced(pgdat, order, highest_zoneidx) &&
-	     !pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
+	    pgdat_balanced(pgdat, order, highest_zoneidx)) {
 		/*
 		 * There may be plenty of free memory available, but it's too
 		 * fragmented for high-order allocations.  Wake up kcompactd
diff --git a/mm/vmstat.c b/mm/vmstat.c
index f534972f517d..7b48b84287a7 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1769,7 +1769,6 @@ static void zoneinfo_show_print(struct seq_file *m, p=
g_data_t *pgdat,
 	}
 	seq_printf(m,
 		   "\n  pages free     %lu"
-		   "\n        boost    %lu"
 		   "\n        min      %lu"
 		   "\n        low      %lu"
 		   "\n        high     %lu"
@@ -1779,7 +1778,6 @@ static void zoneinfo_show_print(struct seq_file *m, p=
g_data_t *pgdat,
 		   "\n        managed  %lu"
 		   "\n        cma      %lu",
 		   zone_page_state(zone, NR_FREE_PAGES),
-		   zone->watermark_boost,
 		   min_wmark_pages(zone),
 		   low_wmark_pages(zone),
 		   high_wmark_pages(zone),
--=20
2.54.0
From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id C524D3EFFB0
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:37 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289241; cv=none;
 b=rrZ+SYnYe0t8kwI8agqq+AOL5DjbYCx0VSB96V13Y0XyVhNuyg3Q4xMA7mXDcwJ+AOLBfVte0DpbomOCoRKmCbuh/jLvxp9UeXylBAmECs2NMctV75L62djfWykCzqv+Asz3sCYzPmcMbSUpi6MK17nmkoC2FHC3elRobdPk3PY=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289241; c=relaxed/simple;
	bh=sBEm+poRz94wfLZkYjzHBdXcOQ8y2UxtTeQy4usdan8=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=ZDxI9uahbOzBqGcn78wRsN1oY0EHisLSOW4ibXhRYgkZfRmUTzMukTplSz7fnJJAsbNZX7/iX9J+hWpOxc7DQ3+lXi7O1EvknwS66kIlISRIx/2Upmwh+NQukq0WrUhvg9Wt0M7XPyHv2yQYqjho7ohAaDhUOhVTZpNcOAPpiJQ=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=Y76r9bMo; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="Y76r9bMo"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:MIME-Version:References:In-Reply-To:
	Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-Type:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=1wlnd9k10K1fagsrmIWt7MyygubkQyWXShAgLjFuCo0=; b=Y76r9bMom3LGgcPyxMuHW2xdIx
	x55SShjWwJ1GYSQxZFZr3wSAdPsmrlvjg5qFZj7P0kxyzoeL4Cv73Cpdr2O7+r/SKCX0r7EPTG4g4
	cIU5SUqHMGYF7o8eSvC8KhFisYPW8FFyN5lVNhTw2guUPV+KvlB7IDxip51mKezOn2Ps8uJTYkFoV
	zjIenPQJkniML5TeaI31AJvwrSDZ4UQ01FQjyO2lwK3WMpyZUrAZwWuuauO1reA386veoVUYmQMOC
	wt1GITzoFwJHM/KOlXxEQcUfPQrvbd2jMy3GnZMbJXRzL24e4RbSPJCUG3ySPoblhJoronfMo9mRu
	FsDmPlqw==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPM-0000000024Q-0jGZ;
	Wed, 20 May 2026 11:00:28 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 06/40] mm: page_alloc: async evacuation of stolen movable
 pageblocks
Date: Wed, 20 May 2026 10:59:12 -0400
Message-ID: <20260520150018.2491267-7-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

When the page allocator steals a movable pageblock for unmovable or
reclaimable allocations (via try_to_claim_block), the remaining movable
pages in that block can prevent future unmovable/reclaimable allocations
from being concentrated in fewer pageblocks, leading to long-term memory
fragmentation.

Add a lightweight asynchronous evacuation mechanism: when a movable
pageblock is claimed for unmovable/reclaimable use, queue a work item to
migrate the remaining movable pages out. This allows future
unmovable/reclaimable allocations to be satisfied from the now-evacuated
block, keeping those allocation types concentrated and reducing
fragmentation.

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 include/linux/mmzone.h |   4 +
 mm/page_alloc.c        | 223 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 227 insertions(+)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 13e29b2ebb86..90498bbbf60b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -22,6 +22,7 @@
 #include <linux/mm_types.h>
 #include <linux/page-flags.h>
 #include <linux/local_lock.h>
+#include <linux/irq_work_types.h>
 #include <linux/zswap.h>
 #include <linux/sizes.h>
 #include <asm/page.h>
@@ -1540,6 +1541,9 @@ typedef struct pglist_data {
 	wait_queue_head_t kcompactd_wait;
 	struct task_struct *kcompactd;
 	bool proactive_compact_trigger;
+	struct workqueue_struct *evacuate_wq;
+	struct llist_head evacuate_pending;
+	struct irq_work evacuate_irq_work;
 #endif
 	/*
 	 * This is a per-node reserve of pages that are not available
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6e01e58aca54..0f3d734bd296 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -18,6 +18,7 @@
 #include <linux/mm.h>
 #include <linux/highmem.h>
 #include <linux/interrupt.h>
+#include <linux/irq_work.h>
 #include <linux/jiffies.h>
 #include <linux/compiler.h>
 #include <linux/kernel.h>
@@ -51,6 +52,7 @@
 #include <linux/lockdep.h>
 #include <linux/psi.h>
 #include <linux/khugepaged.h>
+#include <linux/workqueue.h>
 #include <linux/delayacct.h>
 #include <linux/cacheinfo.h>
 #include <linux/pgalloc_tag.h>
@@ -59,6 +61,10 @@
 #include "shuffle.h"
 #include "page_reporting.h"
=20
+#ifdef CONFIG_COMPACTION
+static void queue_pageblock_evacuate(struct zone *zone, unsigned long pfn);
+#endif
+
 /* Free Page Internal flags: for internal, non-pcp variants of free_pages(=
). */
 typedef int __bitwise fpi_t;
=20
@@ -2428,6 +2434,13 @@ try_to_claim_block(struct zone *zone, struct page *p=
age,
 	int free_pages, movable_pages, alike_pages;
 	unsigned long start_pfn;
=20
+	/*
+	 * Don't steal from pageblocks that are isolated for
+	 * evacuation -- that would undo the work in progress.
+	 */
+	if (get_pageblock_isolate(page))
+		return NULL;
+
 	/* Take ownership for orders >=3D pageblock_order */
 	if (current_order >=3D pageblock_order) {
 		unsigned int nr_added;
@@ -2473,6 +2486,18 @@ try_to_claim_block(struct zone *zone, struct page *p=
age,
 			page_group_by_mobility_disabled) {
 		__move_freepages_block(zone, start_pfn, block_type, start_type);
 		set_pageblock_migratetype(pfn_to_page(start_pfn), start_type);
+#ifdef CONFIG_COMPACTION
+		/*
+		 * A movable pageblock was just claimed for unmovable or
+		 * reclaimable use. Queue async evacuation of the remaining
+		 * movable pages so future unmovable/reclaimable allocations
+		 * can stay concentrated in fewer pageblocks.
+		 */
+		if (block_type =3D=3D MIGRATE_MOVABLE &&
+		    (start_type =3D=3D MIGRATE_UNMOVABLE ||
+		     start_type =3D=3D MIGRATE_RECLAIMABLE))
+			queue_pageblock_evacuate(zone, start_pfn);
+#endif
 		return __rmqueue_smallest(zone, order, start_type);
 	}
=20
@@ -7184,6 +7209,204 @@ void __init page_alloc_sysctl_init(void)
 	register_sysctl_init("vm", page_alloc_sysctl_table);
 }
=20
+#ifdef CONFIG_COMPACTION
+/*
+ * Pageblock evacuation: asynchronously migrate movable pages out of
+ * pageblocks that were stolen for unmovable/reclaimable allocations.
+ * This keeps unmovable/reclaimable allocations concentrated in fewer
+ * pageblocks, reducing long-term fragmentation.
+ *
+ * Uses a global pool of 64 pre-allocated work items (~3.5KB total)
+ * and a per-pgdat workqueue to keep migration node-local.
+ */
+
+struct evacuate_item {
+	struct work_struct	work;
+	struct zone		*zone;
+	unsigned long		start_pfn;
+	struct llist_node	free_node;
+};
+
+#define NR_EVACUATE_ITEMS	64
+static struct evacuate_item evacuate_pool[NR_EVACUATE_ITEMS];
+static struct llist_head evacuate_freelist;
+
+static struct evacuate_item *evacuate_item_alloc(void)
+{
+	struct llist_node *node;
+
+	node =3D llist_del_first(&evacuate_freelist);
+	if (!node)
+		return NULL;
+	return container_of(node, struct evacuate_item, free_node);
+}
+
+static void evacuate_item_free(struct evacuate_item *item)
+{
+	llist_add(&item->free_node, &evacuate_freelist);
+}
+
+static void evacuate_pageblock(struct zone *zone, unsigned long start_pfn)
+{
+	unsigned long end_pfn =3D start_pfn + pageblock_nr_pages;
+	unsigned long pfn =3D start_pfn;
+	int nr_reclaimed;
+	int ret =3D 0;
+	struct compact_control cc =3D {
+		.nr_migratepages =3D 0,
+		.order =3D -1,
+		.zone =3D zone,
+		.mode =3D MIGRATE_ASYNC,
+		.gfp_mask =3D GFP_HIGHUSER_MOVABLE,
+	};
+	struct migration_target_control mtc =3D {
+		.nid =3D zone_to_nid(zone),
+		.gfp_mask =3D GFP_HIGHUSER_MOVABLE,
+	};
+
+	/* Verify this pageblock is still worth evacuating */
+	if (get_pageblock_migratetype(pfn_to_page(start_pfn)) =3D=3D MIGRATE_MOVA=
BLE)
+		return;
+
+	INIT_LIST_HEAD(&cc.migratepages);
+
+	/*
+	 * Loop through the entire pageblock, isolating and migrating
+	 * in batches. isolate_migratepages_range stops at
+	 * COMPACT_CLUSTER_MAX, so we must loop to cover the full block.
+	 */
+	while (pfn < end_pfn || !list_empty(&cc.migratepages)) {
+		if (list_empty(&cc.migratepages)) {
+			cc.nr_migratepages =3D 0;
+			cc.migrate_pfn =3D pfn;
+			ret =3D isolate_migratepages_range(&cc, pfn, end_pfn);
+			if (ret && ret !=3D -EAGAIN)
+				break;
+			pfn =3D cc.migrate_pfn;
+			if (list_empty(&cc.migratepages))
+				break;
+		}
+
+		nr_reclaimed =3D reclaim_clean_pages_from_list(zone,
+							&cc.migratepages);
+		cc.nr_migratepages -=3D nr_reclaimed;
+
+		if (!list_empty(&cc.migratepages)) {
+			ret =3D migrate_pages(&cc.migratepages,
+					    alloc_migration_target, NULL,
+					    (unsigned long)&mtc, cc.mode,
+					    MR_COMPACTION, NULL);
+			if (ret) {
+				putback_movable_pages(&cc.migratepages);
+				break;
+			}
+		}
+
+		cond_resched();
+	}
+
+	if (!list_empty(&cc.migratepages))
+		putback_movable_pages(&cc.migratepages);
+}
+
+static void evacuate_work_fn(struct work_struct *work)
+{
+	struct evacuate_item *item =3D container_of(work, struct evacuate_item,
+						  work);
+	evacuate_pageblock(item->zone, item->start_pfn);
+	evacuate_item_free(item);
+}
+
+/**
+ * evacuate_irq_work_fn - IRQ work callback to drain pending evacuations
+ * @work: the irq_work embedded in pg_data_t
+ *
+ * queue_work() can deadlock when called from inside the page allocator
+ * because it may try to allocate memory with locks already held.
+ * Use irq_work to defer the queue_work() calls to a safe context.
+ */
+static void evacuate_irq_work_fn(struct irq_work *work)
+{
+	pg_data_t *pgdat =3D container_of(work, pg_data_t,
+					evacuate_irq_work);
+	struct llist_node *pending;
+	struct evacuate_item *item, *next;
+
+	if (!pgdat->evacuate_wq)
+		return;
+
+	/*
+	 * Collect all pending items first, then queue them.  Use _safe
+	 * because evacuate_work_fn() may run immediately on another
+	 * CPU and free the item before we follow the next pointer.
+	 */
+	pending =3D llist_del_all(&pgdat->evacuate_pending);
+	llist_for_each_entry_safe(item, next, pending, free_node) {
+		INIT_WORK(&item->work, evacuate_work_fn);
+		queue_work(pgdat->evacuate_wq, &item->work);
+	}
+}
+
+/**
+ * queue_pageblock_evacuate - schedule async evacuation of movable pages
+ * @zone: the zone containing the pageblock
+ * @pfn: start PFN of the pageblock (must be pageblock-aligned)
+ *
+ * Called from the page allocator when a movable pageblock is claimed
+ * for unmovable or reclaimable allocations. Queues the pageblock for
+ * background migration of its remaining movable pages. Uses irq_work
+ * to defer the actual queue_work() call outside the allocator's lock
+ * context.
+ */
+static void queue_pageblock_evacuate(struct zone *zone, unsigned long pfn)
+{
+	struct evacuate_item *item;
+	pg_data_t *pgdat =3D zone->zone_pgdat;
+
+	if (!pgdat->evacuate_irq_work.func)
+		return;
+
+	item =3D evacuate_item_alloc();
+	if (!item)
+		return;
+
+	item->zone =3D zone;
+	item->start_pfn =3D pfn;
+	llist_add(&item->free_node, &pgdat->evacuate_pending);
+	irq_work_queue(&pgdat->evacuate_irq_work);
+}
+
+static int __init pageblock_evacuate_init(void)
+{
+	int nid, i;
+
+	/* Initialize the global freelist of work items */
+	init_llist_head(&evacuate_freelist);
+	for (i =3D 0; i < NR_EVACUATE_ITEMS; i++)
+		llist_add(&evacuate_pool[i].free_node, &evacuate_freelist);
+
+	/* Create a per-pgdat workqueue */
+	for_each_online_node(nid) {
+		pg_data_t *pgdat =3D NODE_DATA(nid);
+		char name[32];
+
+		snprintf(name, sizeof(name), "kevacuate/%d", nid);
+		pgdat->evacuate_wq =3D alloc_workqueue(name, WQ_MEM_RECLAIM, 1);
+		if (!pgdat->evacuate_wq) {
+			pr_warn("Failed to create evacuate workqueue for node %d\n", nid);
+			continue;
+		}
+
+		init_llist_head(&pgdat->evacuate_pending);
+		init_irq_work(&pgdat->evacuate_irq_work,
+			      evacuate_irq_work_fn);
+	}
+
+	return 0;
+}
+late_initcall(pageblock_evacuate_init);
+#endif /* CONFIG_COMPACTION */
+
 #ifdef CONFIG_CONTIG_ALLOC
 /* Usage: See admin-guide/dynamic-debug-howto.rst */
 static void alloc_contig_dump_pages(struct list_head *page_list)
--=20
2.54.0
From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id C4FB63EEAC8
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:38 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289242; cv=none;
 b=u1BtdVSUiYsDqp+pNmCRxblADNzr/RgiqJdOLXE62M8iqqurzFNqAYCmWZJplyZGdQkwQT0zFOv+VLuijBSV2XgLEMD/E+ZCnk0WBBnH6+f89qde6RQ1+/bGEvmlnaMV/QE9FI/DOMCxTp4QSfEwMv2xeLsv6PGvE8eWEsAbF5Q=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289242; c=relaxed/simple;
	bh=fFke10xE2tSJsDIARZQOC4Ed/anZ3wGcEoWILS6cPA0=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=UitbER0DZQxW/pb5/GfzKHyrUA8FF010SijSzeXJXVR3jhkvD7r8Vn54t1l+tYpC9rwx6krpzJ93TtybAoqciZWxsq0zJevSVjcLOWiYc0BQ9qf5hyiAASm55HNdkELwMQl+QpkphTJLNGqd1cJQS5auXbt293Z3VZOfOFBnWMI=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=niBp0Ap3; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="niBp0Ap3"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:MIME-Version:References:In-Reply-To:
	Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-Type:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=6mviOc8XFuigX2Lo+UT1CyWuK4MX908sRC8mjB+IJ94=; b=niBp0Ap3VdEs+oGl1vCF0I66ew
	mT4tYL7RMYFEM4uy/y0rQNS5/XvSDG3ycZAikG+yyvMbAvgrYYH/rOazynbzIy19o1ZbwuW1L0g3g
	T2m7K84QXR+B7nDkNxyNpYiKmKEoNN6NZVWQMM3kDi4u+t2HhschXvC6waU1Xkv7hiwsYTyvqu33l
	Cc4KGbdmUrNpHDwJZWIi02On2IIUTXuBRQjbS8FYmfeekLBfB/EuMVy48LiVTa4pg8Kt3xF6z27Oo
	bBhYa7CeOhXMsFFstQpyOZx6sxqqENiUgQU1KeCHVKDQe+ZPFbK7BidG1BACKxTxZczhl8yvhj8W8
	eHYQGP4g==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPM-0000000024Q-0rHP;
	Wed, 20 May 2026 11:00:28 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 07/40] mm: page_alloc: track actual page contents in
 pageblock flags
Date: Wed, 20 May 2026 10:59:13 -0400
Message-ID: <20260520150018.2491267-8-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

Extend pageblock_data flags with PB_has_unmovable, PB_has_reclaimable, and
PB_has_movable bits to track the actual types of pages allocated within a
pageblock, independent of its intended migratetype.

The flags are set at steal time in try_to_claim_block(), avoiding overhead
on every allocation in __rmqueue_smallest():

1. Allocation / steal time: when try_to_claim_block() claims a pageblock,
set the PB_has_* flag corresponding to the allocation's migratetype. If
unmovable or reclaimable pages are being placed into a pageblock that
already has PB_has_movable set, queue async evacuation of the remaining
movable pages.

2. Full pageblock free: when buddy merging reconstructs a complete
pageblock in __free_one_page(), or a whole pageblock is freed directly,
clear all PB_has_* flags since the block is now empty.

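3. Migration scan: when isolate_migratepages_block() completes a full
pageblock scan in which no movable pages were seen at all (none isolated
and none skipped), clear PB_has_movable. If movable pages were seen but
could not be isolated, the flag stays set so a later attempt can retry.
This consolidates the clearing for all callers: evacuate_pageblock(),
compaction, and alloc_contig_range().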

This provides the foundation for superpageblock-level steering decisions:
knowing which pageblocks actually contain unmovable/reclaimable pages
allows directing future allocations to already-tainted regions, keeping
clean regions available for large contiguous allocations.

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 include/linux/pageblock-flags.h |  9 ++++
 mm/compaction.c                 | 17 ++++++
 mm/page_alloc.c                 | 93 +++++++++++++++++++++++++--------
 3 files changed, 98 insertions(+), 21 deletions(-)

diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flag=
s.h
index e046278a01fa..21bfcdf80b2e 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -20,6 +20,15 @@ enum pageblock_bits {
 	PB_migrate_2,
 	PB_compact_skip,/* If set the block is skipped by compaction */
=20
+	/*
+	 * Track actual page contents independent of the intended migratetype.
+	 * Set when try_to_claim_block() claims the block; cleared on full
+	 * pageblock free or, for PB_has_movable, when a scan finds none left.
+	 */
+	PB_has_unmovable,
+	PB_has_reclaimable,
+	PB_has_movable,
+
 #ifdef CONFIG_MEMORY_ISOLATION
 	/*
 	 * Pageblock isolation is represented with a separate bit, so that
diff --git a/mm/compaction.c b/mm/compaction.c
index 3648ce22c807..e8ca651e2b07 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -867,6 +867,7 @@ isolate_migratepages_block(struct compact_control *cc, =
unsigned long low_pfn,
 	bool skip_on_failure =3D false;
 	unsigned long next_skip_pfn =3D 0;
 	bool skip_updated =3D false;
+	bool movable_skipped =3D false;
 	int ret =3D 0;
=20
 	cc->migrate_pfn =3D low_pfn;
@@ -1079,6 +1080,7 @@ isolate_migratepages_block(struct compact_control *cc=
, unsigned long low_pfn,
 					folio =3D page_folio(page);
 					goto isolate_success;
 				}
+				movable_skipped =3D true;
 			}
=20
 			goto isolate_fail;
@@ -1246,6 +1248,7 @@ isolate_migratepages_block(struct compact_control *cc=
, unsigned long low_pfn,
 			lruvec_unlock_irqrestore(locked, flags);
 			locked =3D NULL;
 		}
+		movable_skipped =3D true;
 		folio_put(folio);
=20
 isolate_fail:
@@ -1309,6 +1312,20 @@ isolate_migratepages_block(struct compact_control *c=
c, unsigned long low_pfn,
 		if (!cc->no_set_skip_hint && valid_page && !skip_updated)
 			set_pageblock_skip(valid_page);
 		update_cached_migrate(cc, low_pfn);
+
+		/*
+		 * Full pageblock scanned with no movable pages isolated.
+		 * Only clear PB_has_movable if no movable pages were
+		 * seen at all. If movable pages exist but could not be
+		 * isolated (pinned, writeback, dirty, etc.), leave the
+		 * flag set so a future migration attempt can try again.
+		 */
+		if (!nr_isolated && !movable_skipped && valid_page &&
+		    get_pfnblock_bit(valid_page, pageblock_start_pfn(start_pfn),
+				     PB_has_movable))
+			clear_pfnblock_bit(valid_page,
+					   pageblock_start_pfn(start_pfn),
+					   PB_has_movable);
 	}
=20
 	trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0f3d734bd296..23108cdcbbec 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -928,6 +928,30 @@ static void change_pageblock_range(struct page *pagebl=
ock_page,
 	}
 }
=20
+/*
+ * mark_pageblock_free - handle a pageblock becoming fully free
+ * @page: page at the start of the pageblock
+ * @pfn: page frame number
+ *
+ * Clear stale PCP ownership and actual-contents tracking flags when
+ * buddy merging reconstructs a full pageblock or a whole pageblock is
+ * freed directly. No PCP can still hold pages from this block (otherwise
+ * the buddy merge couldn't have completed), so the ownership entry would
+ * just cause misrouted frees.
+ */
+static void mark_pageblock_free(struct page *page, unsigned long pfn)
+{
+	clear_pcpblock_owner(page);
+
+	/*
+	 * The entire block is now free -- clear actual-contents tracking
+	 * flags since no allocated pages remain.
+	 */
+	clear_pfnblock_bit(page, pfn, PB_has_unmovable);
+	clear_pfnblock_bit(page, pfn, PB_has_reclaimable);
+	clear_pfnblock_bit(page, pfn, PB_has_movable);
+}
+
 /*
  * Freeing function for a buddy system allocator.
  *
@@ -973,19 +997,14 @@ static inline void __free_one_page(struct page *page,
 	account_freepages(zone, 1 << order, migratetype);
=20
 	/*
-	 * For whole blocks, ownership returns to the zone. There are
-	 * no more outstanding frees to route through that CPU's PCP,
-	 * and we don't want to confuse any future users of the pages
-	 * in this block. E.g. rmqueue_buddy().
-	 *
-	 * Check here if a whole block came in directly: pre-merged in
-	 * the PCP, or PCP contended and bypassed.
-	 *
-	 * There is another check in the loop below if a block merges
-	 * up with pages already on the zone buddy.
+	 * When freeing a whole pageblock, clear stale PCP ownership
+	 * and actual-contents tracking flags up front.  The in-loop
+	 * check only fires when sub-pageblock pages merge *up to*
+	 * pageblock_order, not when entering at pageblock_order
+	 * directly.
 	 */
 	if (order =3D=3D pageblock_order)
-		clear_pcpblock_owner(page);
+		mark_pageblock_free(page, pfn);
=20
 	while (order < MAX_PAGE_ORDER) {
 		int buddy_mt =3D migratetype;
@@ -1037,9 +1056,13 @@ static inline void __free_one_page(struct page *page,
 		pfn =3D combined_pfn;
 		order++;
=20
-		/* Clear owner also when we merge up. See above */
+		/*
+		 * If merging has reconstructed a full pageblock,
+		 * clear any stale PCP ownership and actual-contents
+		 * tracking flags.
+		 */
 		if (order =3D=3D pageblock_order)
-			clear_pcpblock_owner(page);
+			mark_pageblock_free(page, pfn);
 	}
=20
 done_merging:
@@ -2433,6 +2456,9 @@ try_to_claim_block(struct zone *zone, struct page *pa=
ge,
 {
 	int free_pages, movable_pages, alike_pages;
 	unsigned long start_pfn;
+#ifdef CONFIG_COMPACTION
+	struct page *start_page;
+#endif
=20
 	/*
 	 * Don't steal from pageblocks that are isolated for
@@ -2488,15 +2514,29 @@ try_to_claim_block(struct zone *zone, struct page *=
page,
 		set_pageblock_migratetype(pfn_to_page(start_pfn), start_type);
 #ifdef CONFIG_COMPACTION
 		/*
-		 * A movable pageblock was just claimed for unmovable or
-		 * reclaimable use. Queue async evacuation of the remaining
-		 * movable pages so future unmovable/reclaimable allocations
-		 * can stay concentrated in fewer pageblocks.
+		 * Track actual page contents in pageblock flags.
+		 * Mark the pageblock with the type being allocated, and
+		 * if unmovable/reclaimable pages are being placed into a
+		 * pageblock that already has movable pages, queue async
+		 * evacuation of the movable pages.
 		 */
-		if (block_type =3D=3D MIGRATE_MOVABLE &&
-		    (start_type =3D=3D MIGRATE_UNMOVABLE ||
-		     start_type =3D=3D MIGRATE_RECLAIMABLE))
-			queue_pageblock_evacuate(zone, start_pfn);
+		start_page =3D pfn_to_page(start_pfn);
+		if (start_type =3D=3D MIGRATE_UNMOVABLE) {
+			set_pfnblock_bit(start_page, start_pfn,
+					 PB_has_unmovable);
+			if (get_pfnblock_bit(start_page, start_pfn,
+					     PB_has_movable))
+				queue_pageblock_evacuate(zone, start_pfn);
+		} else if (start_type =3D=3D MIGRATE_RECLAIMABLE) {
+			set_pfnblock_bit(start_page, start_pfn,
+					 PB_has_reclaimable);
+			if (get_pfnblock_bit(start_page, start_pfn,
+					     PB_has_movable))
+				queue_pageblock_evacuate(zone, start_pfn);
+		} else if (start_type =3D=3D MIGRATE_MOVABLE) {
+			set_pfnblock_bit(start_page, start_pfn,
+					 PB_has_movable);
+		}
 #endif
 		return __rmqueue_smallest(zone, order, start_type);
 	}
@@ -7307,6 +7347,17 @@ static void evacuate_pageblock(struct zone *zone, un=
signed long start_pfn)
=20
 	if (!list_empty(&cc.migratepages))
 		putback_movable_pages(&cc.migratepages);
+
+	/*
+	 * Re-scan to let isolate_migratepages_block clear PB_has_movable
+	 * if no movable pages remain after evacuation.
+	 */
+	cc.migrate_pfn =3D start_pfn;
+	cc.nr_migratepages =3D 0;
+	INIT_LIST_HEAD(&cc.migratepages);
+	isolate_migratepages_range(&cc, start_pfn, end_pfn);
+	if (!list_empty(&cc.migratepages))
+		putback_movable_pages(&cc.migratepages);
 }
=20
 static void evacuate_work_fn(struct work_struct *work)
--=20
2.54.0
From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id D75F63EFFD8
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:38 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289249; cv=none;
 b=ut4vEC0yRKjpQ39vruuJlO58ZMAg+SOLkRHNM3+YIEWpD791bPTGL+0xwLPXSsgxGeBC194LPzSYHOwNIsblVeoTS2aUu5ijBuGzKYu9n5QcsyzILXcubqufukm44G6h0ZEBXRIwCa1asHEWEHQQY2PfPPIHFgEjMuMqnJJrFIQ=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289249; c=relaxed/simple;
	bh=V4L4I1nCDaPWp52CNhfDpWydRxTTtpU+CJva6dzWuYo=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version:Content-Type;
 b=eqVQvion9G3Qp6ZO94DW/nCBm4Rgas+o2lAAX3mlUcbpPzL2624WAc+GhF1xlhUp5wuKrJu2lL3VTIYcBjH1ND3hQUzDVAsZIsvPtYw6zgzRzG6III6UNinWebhqO/KqKzHV4Fasq8S3PHj38pHwB+yTehQKzy3v//mxewlvXCk=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=CkS1ZzHB; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="CkS1ZzHB"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:Content-Type:MIME-Version:References:
	In-Reply-To:Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=4Bzuf5BwIhqsSiCqnGbY9Gcb4BHhoBzW49U2aIagYW8=; b=CkS1ZzHB+SVk/g4C3fgrjki9LM
	3wC4dwpHnuXCizhek6TI0mRnX1ViB0uIDK0LboknXITq5LdMskd5WDgqC3lVb9Dul9OSvNXvL1Imk
	dPo4XGBb3fMFA2npkznCZJ3+wYkSDa6AVMHdQFj0neXry8asN2P+NGMkpREVibLkv51HKg1Xcxdev
	pmlrz4Vom3kVeYeeLd4wt3EI/feN6uFQ14Xkf0MtyrMZNJz0eJg/wtI7khTMKDUaijeFlSf9ccWBA
	lN1OIx91/TyzUfvtEt7NFdfw62cH/+dBA+RJhrK7Psdmeo8KokiDUaFOJoMtwQswOhu8xQ6agtS9v
	crEimvVA==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPM-0000000024Q-0xr4;
	Wed, 20 May 2026 11:00:28 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 08/40] mm: page_alloc: superpageblock metadata for 1GB
 anti-fragmentation
Date: Wed, 20 May 2026 10:59:14 -0400
Message-ID: <20260520150018.2491267-9-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: quoted-printable

Introduce a 1GB (PUD-sized) "superpageblock" data structure to track
pageblock composition at a coarser granularity, enabling future steering of
unmovable/reclaimable allocations into already-tainted superpageblocks and
preserving clean superpageblocks for 1GB hugepage allocation.

Each superpageblock groups SUPERPAGEBLOCK_NR_PAGEBLOCKS pageblocks (512 on
x86_64 with 2MB pageblocks) and maintains:
- Counts of pageblocks by migratetype (nr_free, nr_unmovable,
  nr_reclaimable, nr_movable, nr_reserved)
- A list_head for future organization by fullness category
- Identity (start_pfn, zone pointer)

Superpageblock counters are maintained by hooking into
init_pageblock_migratetype(). Memory holes and firmware-reserved regions
are tracked as reserved pageblocks by initializing all slots as reserved
during setup and decrementing as init_pageblock_migratetype() claims them.

The superpageblock array is allocated per-zone during boot via memblock. At
~48 bytes per superpageblock (~12KB for a 256GB system), the overhead is
negligible.

This is pure bookkeeping with no allocation behavior change.

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 include/linux/mmzone.h | 57 ++++++++++++++++++++++++++
 mm/mm_init.c           | 90 ++++++++++++++++++++++++++++++++++++++++++
 mm/page_alloc.c        | 65 ++++++++++++++++++++++++++++++
 3 files changed, 212 insertions(+)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 90498bbbf60b..e3eac971a76a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -974,6 +974,43 @@ enum zone_type {
=20
 #define ASYNC_AND_SYNC 2
=20
+/*
+ * Superpageblock: 1GB (PUD-sized) region for anti-fragmentation tracking.
+ *
+ * Groups pageblocks to steer unmovable/reclaimable allocations into
+ * already-tainted superpageblocks, preserving clean superpageblocks for 1=
GB
+ * hugepage allocation.
+ *
+ * SUPERPAGEBLOCK_ORDER derived from PUD geometry:
+ *   x86_64: PUD_SHIFT=3D30, PAGE_SHIFT=3D12 =E2=86=92 order 18 =E2=86=92 =
1GB
+ *   Each superpageblock contains SUPERPAGEBLOCK_NR_PAGEBLOCKS pageblocks
+ *   (512 on x86_64 with 2MB pageblocks).
+ */
+#define SUPERPAGEBLOCK_ORDER	(PUD_SHIFT - PAGE_SHIFT)
+#define SUPERPAGEBLOCK_NR_PAGES	(1UL << SUPERPAGEBLOCK_ORDER)
+
+/*
+ * SUPERPAGEBLOCK_NR_PAGEBLOCKS depends on pageblock_order which may be
+ * variable (CONFIG_HUGETLB_PAGE_SIZE_VARIABLE).
+ */
+#define SUPERPAGEBLOCK_NR_PAGEBLOCKS (1UL << (SUPERPAGEBLOCK_ORDER - pageb=
lock_order))
+
+struct superpageblock {
+	/* Pageblock counts by current migratetype */
+	u16			nr_free;
+	u16			nr_unmovable;
+	u16			nr_reclaimable;
+	u16			nr_movable;
+	u16			nr_reserved;	/* holes, firmware, etc. */
+
+	/* For organizing superpageblocks by fullness category */
+	struct list_head	list;
+
+	/* Identity */
+	unsigned long		start_pfn;
+	struct zone		*zone;
+};
+
 struct zone {
 	/* Read-mostly fields */
=20
@@ -1016,6 +1053,11 @@ struct zone {
 	struct pageblock_data	*pageblock_data;
 #endif /* CONFIG_SPARSEMEM */
=20
+	/* Superpageblock array for 1GB anti-fragmentation tracking */
+	struct superpageblock	*superpageblocks;
+	unsigned long		nr_superpageblocks;
+	unsigned long		superpageblock_base_pfn; /* 1GB-aligned base */
+
 	/* zone_start_pfn =3D=3D zone_start_paddr >> PAGE_SHIFT */
 	unsigned long		zone_start_pfn;
=20
@@ -1159,6 +1201,21 @@ struct zone {
 #endif
 } ____cacheline_internodealigned_in_smp;
=20
+static inline struct superpageblock *pfn_to_superpageblock(struct zone *zo=
ne,
+						   unsigned long pfn)
+{
+	unsigned long idx;
+
+	if (!zone->superpageblocks)
+		return NULL;
+
+	idx =3D (pfn - zone->superpageblock_base_pfn) >> SUPERPAGEBLOCK_ORDER;
+	if (idx >=3D zone->nr_superpageblocks)
+		return NULL;
+
+	return &zone->superpageblocks[idx];
+}
+
 enum pgdat_flags {
 	PGDAT_WRITEBACK,		/* reclaim scanning has recently found
 					 * many pages under writeback
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 47a222e49fc9..de02a6087c21 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1503,6 +1503,95 @@ static void __ref setup_usemap(struct zone *zone)
 static inline void setup_usemap(struct zone *zone) {}
 #endif /* CONFIG_SPARSEMEM */
=20
+/**
+ * init_one_superpageblock - initialize a single superpageblock
+ * @sb: superpageblock to initialize
+ * @zone: owning zone
+ * @start_pfn: start PFN for this superpageblock
+ * @zone_start: zone start PFN (for clipping)
+ * @zone_end: zone end PFN (for clipping)
+ *
+ * Zero counters, compute the zone-clipped pageblock count.
+ * Used by both boot-time setup and memory hotplug resize.
+ */
+static void __meminit init_one_superpageblock(struct superpageblock *sb,
+					      struct zone *zone,
+					      unsigned long start_pfn,
+					      unsigned long zone_start,
+					      unsigned long zone_end)
+{
+	unsigned long sb_end =3D start_pfn + SUPERPAGEBLOCK_NR_PAGES;
+	unsigned long pb_start =3D max(start_pfn, zone_start);
+	unsigned long pb_end =3D min(sb_end, zone_end);
+	u16 actual_pbs;
+
+	sb->nr_unmovable =3D 0;
+	sb->nr_reclaimable =3D 0;
+	sb->nr_movable =3D 0;
+	sb->nr_free =3D 0;
+	INIT_LIST_HEAD(&sb->list);
+	sb->start_pfn =3D start_pfn;
+	sb->zone =3D zone;
+
+	/*
+	 * Start with all pageblock slots as reserved.
+	 * init_pageblock_migratetype() will decrement nr_reserved and
+	 * increment the appropriate counter for each real pageblock.
+	 * Holes and firmware-reserved regions stay counted as reserved.
+	 *
+	 * Only count pageblocks that fall within the zone's span.
+	 * The first and last superpageblocks may extend beyond the
+	 * zone boundaries.  Use round-up division because a partial
+	 * pageblock at the zone boundary still gets initialized by
+	 * init_pageblock_migratetype().
+	 */
+	actual_pbs =3D (pb_end > pb_start) ?
+		     ((pb_end - pb_start + pageblock_nr_pages - 1) >>
+		      pageblock_order) : 0;
+	sb->nr_reserved =3D actual_pbs;
+}
+
+static void __init setup_superpageblocks(struct zone *zone)
+{
+	unsigned long zone_start =3D zone->zone_start_pfn;
+	unsigned long zone_end =3D zone_start + zone->spanned_pages;
+	unsigned long sb_base, nr_superpageblocks;
+	size_t alloc_size;
+	unsigned long i;
+
+	zone->superpageblocks =3D NULL;
+	zone->nr_superpageblocks =3D 0;
+	zone->superpageblock_base_pfn =3D 0;
+
+	if (!zone->spanned_pages)
+		return;
+
+	/*
+	 * Superpageblocks must be 1GB (PUD) aligned. Align the base down
+	 * and the end up to cover all 1GB regions the zone spans.
+	 */
+	sb_base =3D ALIGN_DOWN(zone_start, SUPERPAGEBLOCK_NR_PAGES);
+	nr_superpageblocks =3D (ALIGN(zone_end, SUPERPAGEBLOCK_NR_PAGES) - sb_bas=
e) >>
+			 SUPERPAGEBLOCK_ORDER;
+
+	alloc_size =3D nr_superpageblocks * sizeof(struct superpageblock);
+	zone->superpageblocks =3D memblock_alloc_node(alloc_size, SMP_CACHE_BYTES,
+						zone_to_nid(zone));
+	if (!zone->superpageblocks) {
+		pr_warn("Failed to allocate %zu bytes for zone %s superpageblocks\n",
+			alloc_size, zone->name);
+		return;
+	}
+
+	zone->nr_superpageblocks =3D nr_superpageblocks;
+	zone->superpageblock_base_pfn =3D sb_base;
+
+	for (i =3D 0; i < nr_superpageblocks; i++)
+		init_one_superpageblock(&zone->superpageblocks[i], zone,
+					sb_base + (i << SUPERPAGEBLOCK_ORDER),
+					zone_start, zone_end);
+}
+
 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
=20
 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
@@ -1611,6 +1700,7 @@ static void __init free_area_init_core(struct pglist_=
data *pgdat)
 			continue;
=20
 		setup_usemap(zone);
+		setup_superpageblocks(zone);
 		init_currently_empty_zone(zone, zone->zone_start_pfn, size);
 	}
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 23108cdcbbec..b9b7d54a869c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -457,6 +457,62 @@ void clear_pfnblock_bit(const struct page *page, unsig=
ned long pfn,
 	clear_bit(pb_bit, get_pfnblock_flags_word(page, pfn));
 }
=20
+/*
+ * Map migratetype to PB_has_* bit index. Returns -1 for types that
+ * don't have a tracking bit (e.g. MIGRATE_ISOLATE).
+ */
+static inline int migratetype_to_has_bit(int migratetype)
+{
+	switch (migratetype) {
+	case MIGRATE_UNMOVABLE:
+	case MIGRATE_HIGHATOMIC:
+		return PB_has_unmovable;
+	case MIGRATE_RECLAIMABLE:
+		return PB_has_reclaimable;
+	case MIGRATE_MOVABLE:
+#ifdef CONFIG_CMA
+	case MIGRATE_CMA:
+#endif
+		return PB_has_movable;
+	default:
+		return -1;
+	}
+}
+
+/*
+ * __spb_set_has_type - set PB_has_* and increment type counter
+ *
+ * Idempotent: only increments the counter on the 0=E2=86=921 bit transiti=
on.
+ */
+static void __spb_set_has_type(struct page *page, int migratetype)
+{
+	unsigned long pfn =3D page_to_pfn(page);
+	struct superpageblock *sb =3D pfn_to_superpageblock(page_zone(page), pfn);
+	int bit;
+
+	if (!sb)
+		return;
+
+	bit =3D migratetype_to_has_bit(migratetype);
+	if (bit < 0)
+		return;
+
+	if (!get_pfnblock_bit(page, pfn, bit)) {
+		set_pfnblock_bit(page, pfn, bit);
+		switch (bit) {
+		case PB_has_unmovable:
+			sb->nr_unmovable++;
+			break;
+		case PB_has_reclaimable:
+			sb->nr_reclaimable++;
+			break;
+		case PB_has_movable:
+			sb->nr_movable++;
+			break;
+		}
+	}
+}
+
 /**
  * set_pageblock_migratetype - Set the migratetype of a pageblock
  * @page: The page within the block of interest
@@ -490,6 +546,7 @@ void __meminit init_pageblock_migratetype(struct page *=
page,
 {
 	unsigned long pfn =3D page_to_pfn(page);
 	struct pageblock_data *pbd;
+	struct superpageblock *sb;
 	unsigned long flags;
=20
 	if (unlikely(page_group_by_mobility_disabled &&
@@ -513,6 +570,14 @@ void __meminit init_pageblock_migratetype(struct page =
*page,
 	pbd =3D pfn_to_pageblock(page, pfn);
 	pbd->block_pfn =3D pfn;
 	INIT_LIST_HEAD(&pbd->cpu_node);
+
+	/* Transition from reserved (boot default) to initial migratetype */
+	sb =3D pfn_to_superpageblock(page_zone(page), pfn);
+	if (sb) {
+		if (sb->nr_reserved)
+			sb->nr_reserved--;
+		__spb_set_has_type(page, migratetype);
+	}
 }
=20
 #ifdef CONFIG_DEBUG_VM
--=20
2.54.0

From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id C35283EFD0A
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:37 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289241; cv=none;
 b=XvVoumGPtpkxe1Aho/YVDimj/TxH1LqA0kCLJyTWO7v4m0AhDbLYaKEL64jUiZkhYOWzm8czTlxkkqQSHwc41Ap0UWxBur3+aguQfM1MXmKBhebbKFCL7UQ4cDeDa6HlUcxH1XCkUc6ajmnoyi1HEA2mMAfDW1y0SWmhurQeODU=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289241; c=relaxed/simple;
	bh=KueegnwQ/eM2jDCy8aekKSZqGme4HtkuQtQYLad7fbo=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=GCwu2mxpnoXmppeMvttswHwXEq8peFiDu8zmBwsEXpeXEZVsA1Ymkfb5+g7HhmyTby3SRE71nmV02mdsGtFvwM9hn/agC876qOOu05ZPvSBruh3HmKNlGnfD+4PxevkCpmUypsF+If+Rv92IOkv/giKua6j7Q8SUYR9xVzOF6bE=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=lLaK9KSw; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="lLaK9KSw"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:MIME-Version:References:In-Reply-To:
	Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-Type:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=Nz5aqOVAk4HZQUBK+B668PusNEBvbV/K+dHsAIcLF6E=; b=lLaK9KSwr9B0+Ln0fPn71YTk0v
	7nWKTiBemGexrkmtUwG361FY6TI4Kt9zoF2A83StwBtG+Nn/4kPLM5oOV7MMawHneyz+RwW/a+7gL
	P4o2wlFqorkDkfbLr7y5pkHBBxU7KCr6lbnLTXfYUvfUVgehWqq1MfkXwfu3mn1yoFHqjNpc8YIuk
	XJ6+y0FNEe3uj84h0D6BkFhWHUDKsJ6y9OVDiWREOrAfmnBEXf3A2aavc/vASO7oYxDINOsVYEfWb
	2nsp+70I+nVGDXwuoapDn86AE9fCflAnsBh6NXa8+f6Z5kVhd8ge6M101OQhVyI65szdI4Fj8iD+b
	gPerttPw==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPM-0000000024Q-15DG;
	Wed, 20 May 2026 11:00:28 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 09/40] mm: page_alloc: support superpageblock resize for
 memory hotplug
Date: Wed, 20 May 2026 10:59:15 -0400
Message-ID: <20260520150018.2491267-10-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

setup_superpageblocks() is __init-only and uses memblock_alloc_node(), so
hotplugged memory that extends a zone's span has no superpageblock
coverage.  Pages in those regions would bypass superpageblock steering
entirely.

Add resize_zone_superpageblocks() which is called from
move_pfn_range_to_zone() after the zone span has been updated. It allocates
a new superpageblock array with kvmalloc_node() covering the full zone
span, copies existing superpageblocks (fixing up list head pointers), and
initializes new superpageblocks for the added range.

Use round-up division for partial pageblock counting to match
init_one_superpageblock().

ZONE_DEVICE is excluded since device pages should not participate in
anti-fragmentation steering.

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 include/linux/mmzone.h |   1 +
 mm/internal.h          |   4 ++
 mm/memory_hotplug.c    |   4 ++
 mm/mm_init.c           | 138 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 147 insertions(+)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e3eac971a76a..19190328e0c7 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1057,6 +1057,7 @@ struct zone {
 	struct superpageblock	*superpageblocks;
 	unsigned long		nr_superpageblocks;
 	unsigned long		superpageblock_base_pfn; /* 1GB-aligned base */
+	bool			spb_kvmalloced; /* true if from kvmalloc (hotplug) */
=20
 	/* zone_start_pfn =3D=3D zone_start_paddr >> PAGE_SHIFT */
 	unsigned long		zone_start_pfn;
diff --git a/mm/internal.h b/mm/internal.h
index c8404cb00b08..6a089bc4aa09 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1101,6 +1101,10 @@ void init_cma_reserved_pageblock(struct page *page);
=20
 #endif /* CONFIG_COMPACTION || CONFIG_CMA */
=20
+#ifdef CONFIG_MEMORY_HOTPLUG
+void resize_zone_superpageblocks(struct zone *zone);
+#endif
+
 struct cma;
=20
 #ifdef CONFIG_CMA
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 2a943ec57c85..b7c30dfdce8e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -752,6 +752,10 @@ void move_pfn_range_to_zone(struct zone *zone, unsigne=
d long start_pfn,
 	resize_zone_range(zone, start_pfn, nr_pages);
 	resize_pgdat_range(pgdat, start_pfn, nr_pages);
=20
+	/* Grow superpageblock array to cover the new zone span */
+	if (!zone_is_zone_device(zone))
+		resize_zone_superpageblocks(zone);
+
 	/*
 	 * Subsection population requires care in pfn_to_online_page().
 	 * Set the taint to enable the slow path detection of
diff --git a/mm/mm_init.c b/mm/mm_init.c
index de02a6087c21..ad1cbc2b4498 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1592,6 +1592,144 @@ static void __init setup_superpageblocks(struct zon=
e *zone)
 					zone_start, zone_end);
 }
=20
+#ifdef CONFIG_MEMORY_HOTPLUG
+/**
+ * resize_zone_superpageblocks - grow superpageblock array for memory hotp=
lug
+ * @zone: zone whose span has been extended by hotplug
+ *
+ * Called from move_pfn_range_to_zone() after resize_zone_range() has
+ * updated the zone's span.  Allocates a new superpageblock array covering
+ * the full zone span, copies existing superpageblocks (fixing up list hea=
ds),
+ * and initializes new superpageblocks for the added range.
+ *
+ * Must be called under mem_hotplug_lock (write).  No concurrent
+ * allocations can occur since the hotplugged pages are not yet online.
+ */
+void __meminit resize_zone_superpageblocks(struct zone *zone)
+{
+	unsigned long zone_start =3D zone->zone_start_pfn;
+	unsigned long zone_end =3D zone_start + zone->spanned_pages;
+	unsigned long new_sb_base, new_nr_sbs;
+	unsigned long old_offset;
+	struct superpageblock *old_sbs;
+	struct superpageblock *new_sbs;
+	bool old_kvmalloced;
+	size_t alloc_size;
+	unsigned long i;
+	int nid =3D zone_to_nid(zone);
+
+	if (!zone->spanned_pages)
+		return;
+
+	new_sb_base =3D ALIGN_DOWN(zone_start, SUPERPAGEBLOCK_NR_PAGES);
+	new_nr_sbs =3D (ALIGN(zone_end, SUPERPAGEBLOCK_NR_PAGES) - new_sb_base) >>
+		     SUPERPAGEBLOCK_ORDER;
+
+	/* Already covered? */
+	if (zone->superpageblocks &&
+	    new_sb_base =3D=3D zone->superpageblock_base_pfn &&
+	    new_nr_sbs =3D=3D zone->nr_superpageblocks)
+		return;
+
+	alloc_size =3D new_nr_sbs * sizeof(struct superpageblock);
+	new_sbs =3D kvmalloc_node(alloc_size, GFP_KERNEL | __GFP_ZERO, nid);
+	if (!new_sbs) {
+		pr_warn("Failed to allocate %zu bytes for zone %s superpageblocks\n",
+			alloc_size, zone->name);
+		return;
+	}
+
+	/*
+	 * Copy existing superpageblocks to their new position.
+	 * The old array covers [old_base, old_base + old_nr * SB_SIZE).
+	 * The new array covers [new_base, new_base + new_nr * SB_SIZE).
+	 * old_base >=3D new_base always (zone can only grow).
+	 */
+	if (zone->superpageblocks) {
+		old_offset =3D (zone->superpageblock_base_pfn - new_sb_base) >>
+			     SUPERPAGEBLOCK_ORDER;
+		memcpy(&new_sbs[old_offset], zone->superpageblocks,
+		       zone->nr_superpageblocks * sizeof(struct superpageblock));
+
+		/*
+		 * Fix up list_head pointers that were self-referencing
+		 * (empty lists) or pointing into the old array.
+		 */
+		for (i =3D old_offset; i < old_offset + zone->nr_superpageblocks; i++) {
+			struct superpageblock *sb =3D &new_sbs[i];
+
+			if (list_empty(&sb->list))
+				INIT_LIST_HEAD(&sb->list);
+			else
+				list_replace(&zone->superpageblocks[i - old_offset].list,
+					     &sb->list);
+		}
+	}
+
+	/* Initialize new superpageblocks (slots not covered by old array) */
+	for (i =3D 0; i < new_nr_sbs; i++) {
+		struct superpageblock *sb =3D &new_sbs[i];
+		bool is_old =3D false;
+
+		if (zone->superpageblocks) {
+			old_offset =3D (zone->superpageblock_base_pfn - new_sb_base) >>
+				     SUPERPAGEBLOCK_ORDER;
+			if (i >=3D old_offset &&
+			    i < old_offset + zone->nr_superpageblocks)
+				is_old =3D true;
+		}
+
+		if (is_old)
+			continue;
+
+		init_one_superpageblock(sb, zone,
+					new_sb_base + (i << SUPERPAGEBLOCK_ORDER),
+					zone_start, zone_end);
+	}
+
+	/*
+	 * Update existing superpageblocks whose nr_reserved may have
+	 * increased due to the zone span growing into them.
+	 */
+	if (zone->superpageblocks) {
+		old_offset =3D (zone->superpageblock_base_pfn - new_sb_base) >>
+			     SUPERPAGEBLOCK_ORDER;
+		for (i =3D old_offset; i < old_offset + zone->nr_superpageblocks; i++) {
+			struct superpageblock *sb =3D &new_sbs[i];
+			unsigned long sb_start =3D sb->start_pfn;
+			unsigned long sb_end =3D sb_start + SUPERPAGEBLOCK_NR_PAGES;
+			unsigned long pb_start =3D max(sb_start, zone_start);
+			unsigned long pb_end =3D min(sb_end, zone_end);
+			u16 new_pbs =3D (pb_end > pb_start) ?
+				((pb_end - pb_start + pageblock_nr_pages - 1) >>
+				 pageblock_order) : 0;
+			u16 old_pbs =3D sb->nr_free + sb->nr_unmovable +
+				sb->nr_reclaimable + sb->nr_movable +
+				sb->nr_reserved;
+
+			if (new_pbs > old_pbs)
+				sb->nr_reserved +=3D new_pbs - old_pbs;
+		}
+	}
+
+	/* Swap in the new array */
+	old_sbs =3D zone->superpageblocks;
+	old_kvmalloced =3D zone->spb_kvmalloced;
+	zone->superpageblocks =3D new_sbs;
+	zone->nr_superpageblocks =3D new_nr_sbs;
+	zone->superpageblock_base_pfn =3D new_sb_base;
+	zone->spb_kvmalloced =3D true;
+
+	/*
+	 * The boot-time array was allocated with memblock_alloc, which
+	 * is not individually freeable after boot.  Only kvfree arrays
+	 * from previous hotplug resizes.
+	 */
+	if (old_sbs && old_kvmalloced)
+		kvfree(old_sbs);
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
=20
 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
--=20
2.54.0
From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id CFFA63EFFC0
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:38 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289244; cv=none;
 b=KMh9kFTGpHSrv0Htl+Oq3aeGuqRLYiMpKERvrKPv2z4Nhm0WLo7s3VRVTHgF0oasj/t2rUUf9eaJItLRF1Lax6tq0Bhn0zlBVT3tKQE5h0OlC+YJFkjfurw/8QorkBpXxGyhjsnMtMXvEYZFsV+3twEQm44mydHY/ElKQfXC0UQ=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289244; c=relaxed/simple;
	bh=lndzxmbqIBNfU3f60HRfNe+gz75tSk+qDH1rzHb8zI8=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version:Content-Type;
 b=hPdQLRZj6juf8R2hJbGHm0HQcgRxBEOFmZf/opS2EG5rGpIAyEDdlc61brCT2KhG5+09XKJgqIWVi5FwGJZS/GBJQTdI7i3OGsO6sKrZaxWQhgKOl5cqSFo41q8BFnbe/LDkljP8Okeol5Ynh7tCnvIUBVv+kR9phjWOPQcW3JE=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=YrDS4Miz; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="YrDS4Miz"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:Content-Type:MIME-Version:References:
	In-Reply-To:Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=5m2f0p/Td6puNx2BkVZxtqaOk4oJNa3hM0pY4XXb+2M=; b=YrDS4MizLcd1AKObSmKUj0wIZV
	Mt0diwbj8g69QzTInSn+7MDeGnvJrjwlAfhjfF4HWIMJRI//rXLT9pZvL+e/BrUWxvOfhXOKfakfC
	V07TdjA7chkY4auPgDRDi7E4R1eSpfMKuYUg6HMyLvaT4pHH/7TLjqZecABoLVkmqCSuiY26AP7zh
	2WFbHY7MUpUnYKmGtELAi4rJD6eAVu4/6apJ0giuG36/8TViX6nKvwyTCumFAum0Eh218AO0rw3/c
	Fftvu15rP5YcP6chsgnexWn+GFvMuoHsmGUg62TSPs61DIux1CrtsxXqoj/Qnm/wXchpyrCTIKPDC
	ovIqHNxA==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPM-0000000024Q-1BZs;
	Wed, 20 May 2026 11:00:28 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 10/40] mm: page_alloc: add superpageblock fullness lists
 for allocation steering
Date: Wed, 20 May 2026 10:59:16 -0400
Message-ID: <20260520150018.2491267-11-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: quoted-printable

Organize superpageblocks into bucketed lists by fullness level and taint
status to enable efficient allocation steering without sorting.

Five fullness buckets (FULL, 75%, 50%, 25%, ALMOST_EMPTY) track what
fraction of a superpageblock's pageblocks are in use. Two categories (CLEAN
vs TAINTED) distinguish superpageblocks that contain only free and movable
pageblocks from those contaminated with unmovable, reclaimable, or reserved
pageblocks. A separate spb_empty list tracks completely free
superpageblocks.

Track fully-free pageblocks with a PB_all_free pageblock flag. When buddy
coalescing reconstructs a full pageblock, increment nr_free. Type counters
are driven by PB_has_* bit transitions, not by migratetype label changes.

For tainted superpageblocks, fullness is based on unmovable + reclaimable
pageblock counts rather than total usage, correctly reflecting how full
they are with the content types we're trying to concentrate.

Add a debugfs interface at /sys/kernel/debug/superpageblocks.

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 include/linux/mmzone.h          |  22 +++
 include/linux/pageblock-flags.h |   1 +
 mm/mm_init.c                    |  26 ++-
 mm/page_alloc.c                 | 295 +++++++++++++++++++++++++++++++-
 4 files changed, 339 insertions(+), 5 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 19190328e0c7..b8ada3d13a34 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -995,6 +995,23 @@ enum zone_type {
  */
 #define SUPERPAGEBLOCK_NR_PAGEBLOCKS (1UL << (SUPERPAGEBLOCK_ORDER - pageb=
lock_order))
=20
+/* Superpageblock fullness buckets (by % of pageblocks in use) */
+enum sb_fullness {
+	SB_FULL,		/* 100% full, 0 free pageblocks */
+	SB_FULL_75,		/* 75-99% full */
+	SB_FULL_50,		/* 50-74% full */
+	SB_FULL_25,		/* 25-49% full */
+	SB_ALMOST_EMPTY,	/* 1-24% full */
+	__NR_SB_FULLNESS,
+};
+
+/* Superpageblock taint categories */
+enum sb_category {
+	SB_CLEAN,		/* only free + movable pageblocks */
+	SB_TAINTED,		/* has unmovable/reclaimable/reserved */
+	__NR_SB_CATEGORIES,
+};
+
 struct superpageblock {
 	/* Pageblock counts by current migratetype */
 	u16			nr_free;
@@ -1002,6 +1019,7 @@ struct superpageblock {
 	u16			nr_reclaimable;
 	u16			nr_movable;
 	u16			nr_reserved;	/* holes, firmware, etc. */
+	u16			total_pageblocks; /* zone-clipped total */
=20
 	/* For organizing superpageblocks by fullness category */
 	struct list_head	list;
@@ -1059,6 +1077,10 @@ struct zone {
 	unsigned long		superpageblock_base_pfn; /* 1GB-aligned base */
 	bool			spb_kvmalloced; /* true if from kvmalloc (hotplug) */
=20
+	/* Superpageblock fullness lists for allocation steering */
+	struct list_head	spb_empty;	/* completely free superpageblocks */
+	struct list_head	spb_lists[__NR_SB_CATEGORIES][__NR_SB_FULLNESS];
+
 	/* zone_start_pfn =3D=3D zone_start_paddr >> PAGE_SHIFT */
 	unsigned long		zone_start_pfn;
=20
diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flag=
s.h
index 21bfcdf80b2e..4dce39d054a9 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -28,6 +28,7 @@ enum pageblock_bits {
 	PB_has_unmovable,
 	PB_has_reclaimable,
 	PB_has_movable,
+	PB_all_free,	/* All pages in pageblock are free in buddy */
=20
 #ifdef CONFIG_MEMORY_ISOLATION
 	/*
diff --git a/mm/mm_init.c b/mm/mm_init.c
index ad1cbc2b4498..2dc73d8a8d6c 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1548,7 +1548,17 @@ static void __meminit init_one_superpageblock(struct=
 superpageblock *sb,
 	actual_pbs =3D (pb_end > pb_start) ?
 		     ((pb_end - pb_start + pageblock_nr_pages - 1) >>
 		      pageblock_order) : 0;
+	sb->total_pageblocks =3D actual_pbs;
 	sb->nr_reserved =3D actual_pbs;
+	if (actual_pbs) {
+		/*
+		 * All superpageblocks start as reserved (tainted+full).
+		 * They move to the correct category when the pages
+		 * inside are freed during boot.
+		 */
+		list_add_tail(&sb->list,
+			      &zone->spb_lists[SB_TAINTED][SB_FULL]);
+	}
 }
=20
 static void __init setup_superpageblocks(struct zone *zone)
@@ -1558,11 +1568,18 @@ static void __init setup_superpageblocks(struct zon=
e *zone)
 	unsigned long sb_base, nr_superpageblocks;
 	size_t alloc_size;
 	unsigned long i;
+	int cat, full;
=20
 	zone->superpageblocks =3D NULL;
 	zone->nr_superpageblocks =3D 0;
 	zone->superpageblock_base_pfn =3D 0;
=20
+	/* Fullness lists steer allocations to preferred superpageblocks */
+	INIT_LIST_HEAD(&zone->spb_empty);
+	for (cat =3D 0; cat < __NR_SB_CATEGORIES; cat++)
+		for (full =3D 0; full < __NR_SB_FULLNESS; full++)
+			INIT_LIST_HEAD(&zone->spb_lists[cat][full]);
+
 	if (!zone->spanned_pages)
 		return;
=20
@@ -1688,8 +1705,9 @@ void __meminit resize_zone_superpageblocks(struct zon=
e *zone)
 	}
=20
 	/*
-	 * Update existing superpageblocks whose nr_reserved may have
-	 * increased due to the zone span growing into them.
+	 * Update existing superpageblocks whose nr_reserved and
+	 * total_pageblocks may have increased due to the zone
+	 * span growing into them.
 	 */
 	if (zone->superpageblocks) {
 		old_offset =3D (zone->superpageblock_base_pfn - new_sb_base) >>
@@ -1707,8 +1725,10 @@ void __meminit resize_zone_superpageblocks(struct zo=
ne *zone)
 				sb->nr_reclaimable + sb->nr_movable +
 				sb->nr_reserved;
=20
-			if (new_pbs > old_pbs)
+			if (new_pbs > old_pbs) {
 				sb->nr_reserved +=3D new_pbs - old_pbs;
+				sb->total_pageblocks =3D new_pbs;
+			}
 		}
 	}
=20
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b9b7d54a869c..c0f86a30b5c7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -56,6 +56,8 @@
 #include <linux/delayacct.h>
 #include <linux/cacheinfo.h>
 #include <linux/pgalloc_tag.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
 #include <asm/div64.h>
 #include "internal.h"
 #include "shuffle.h"
@@ -514,7 +516,157 @@ static void __spb_set_has_type(struct page *page, int=
 migratetype)
 }
=20
 /**
- * set_pageblock_migratetype - Set the migratetype of a pageblock
+ * spb_get_category - Determine if a superpageblock is clean or tainted
+ * @sb: superpageblock to classify
+ *
+ * A superpageblock is clean if it contains only free and movable pagebloc=
ks.
+ * Any unmovable, reclaimable, or reserved pageblocks make it tainted.
+ * Reserved pageblocks (memory holes) taint the superpageblock because it
+ * can never be used for 1GB hugepages, making it a better home for
+ * unmovable/reclaimable allocations.
+ */
+static inline enum sb_category spb_get_category(struct superpageblock *sb)
+{
+	if (sb->nr_unmovable || sb->nr_reclaimable || sb->nr_reserved)
+		return SB_TAINTED;
+	return SB_CLEAN;
+}
+
+/**
+ * sb_get_fullness - Determine the fullness bucket for a superpageblock
+ * @sb: superpageblock to classify
+ * @cat: the category (CLEAN or TAINTED) of this superpageblock
+ *
+ * For clean SPBs, fullness is based on total usage (total - nr_free).
+ * For tainted SPBs, fullness is based only on unmovable + reclaimable
+ * pageblocks, since those are what we're trying to concentrate.
+ * Uses SUPERPAGEBLOCK_NR_PAGEBLOCKS as divisor so that partial
+ * superpageblocks at zone boundaries are preferred over whole ones.
+ */
+static inline enum sb_fullness sb_get_fullness(struct superpageblock *sb,
+					       enum sb_category cat)
+{
+	unsigned int used, total =3D sb->total_pageblocks;
+	unsigned int quarter =3D SUPERPAGEBLOCK_NR_PAGEBLOCKS / 4;
+
+	if (!total)
+		return SB_FULL;
+
+	if (cat =3D=3D SB_TAINTED)
+		used =3D sb->nr_unmovable + sb->nr_reclaimable;
+	else
+		used =3D total - sb->nr_free;
+
+	if (used >=3D total)
+		return SB_FULL;
+
+	if (used >=3D 3 * quarter)
+		return SB_FULL_75;
+	if (used >=3D 2 * quarter)
+		return SB_FULL_50;
+	if (used >=3D quarter)
+		return SB_FULL_25;
+	return SB_ALMOST_EMPTY;
+}
+
+/**
+ * spb_update_list - Move a superpageblock to the correct fullness list
+ * @sb: superpageblock to reclassify
+ *
+ * Called after counters change. Removes from current list (if any)
+ * and adds to the appropriate list based on current fullness and
+ * taint status.
+ */
+static void spb_update_list(struct superpageblock *sb)
+{
+	struct zone *zone =3D sb->zone;
+	enum sb_category cat;
+	enum sb_fullness full;
+
+	list_del_init(&sb->list);
+
+	if (sb->nr_free =3D=3D SUPERPAGEBLOCK_NR_PAGEBLOCKS) {
+		list_add_tail(&sb->list, &zone->spb_empty);
+		return;
+	}
+
+	cat =3D spb_get_category(sb);
+	full =3D sb_get_fullness(sb, cat);
+	list_add_tail(&sb->list, &zone->spb_lists[cat][full]);
+}
+
+/**
+ * superpageblock_pb_now_free - A pageblock just became fully free in buddy
+ * @page: page in the pageblock
+ *
+ * When buddy coalescing reconstructs a complete pageblock-order page,
+ * increment nr_free. Type counters are handled separately in
+ * mark_pageblock_free().
+ */
+static void superpageblock_pb_now_free(struct page *page)
+{
+	unsigned long pfn =3D page_to_pfn(page);
+	struct superpageblock *sb =3D pfn_to_superpageblock(page_zone(page), pfn);
+
+	if (!sb)
+		return;
+
+	sb->nr_free++;
+
+	spb_update_list(sb);
+}
+
+/**
+ * superpageblock_pb_now_used - A fully-free pageblock just got its first =
allocation
+ * @page: page in the pageblock
+ *
+ * When allocating from an order >=3D pageblock_order free page, decrement
+ * nr_free. Type counters are handled separately by __spb_set_has_type()
+ * at allocation time.
+ */
+static void superpageblock_pb_now_used(struct page *page)
+{
+	unsigned long pfn =3D page_to_pfn(page);
+	struct superpageblock *sb =3D pfn_to_superpageblock(page_zone(page), pfn);
+
+	if (!sb)
+		return;
+
+	if (sb->nr_free)
+		sb->nr_free--;
+
+	spb_update_list(sb);
+}
+
+/**
+ * superpageblock_range_now_used - Mark a multi-pageblock free range as no=
 longer free
+ * @page: first page of the range (must be pageblock-aligned)
+ * @order: order of the range (must be >=3D pageblock_order)
+ *
+ * When a free page of order >=3D pageblock_order is removed from buddy ou=
tside
+ * the normal allocation path (e.g. __isolate_free_page, memory hotplug,
+ * HW poison takeoff), every constituent pageblock leaves its PB_all_free
+ * state. Walk the range, clear PB_all_free, and decrement nr_free for each
+ * affected pageblock. PB_has_* bits are not touched: the pages are not be=
ing
+ * allocated to a specific migratetype here. They will be re-established by
+ * mark_pageblock_free() if the pages later return to buddy and coalesce.
+ */
+static void superpageblock_range_now_used(struct page *page, unsigned int =
order)
+{
+	unsigned long pfn =3D page_to_pfn(page);
+	unsigned long end_pfn =3D pfn + (1UL << order);
+
+	for (; pfn < end_pfn; pfn +=3D pageblock_nr_pages) {
+		struct page *pb_page =3D pfn_to_page(pfn);
+
+		if (get_pfnblock_bit(pb_page, pfn, PB_all_free)) {
+			clear_pfnblock_bit(pb_page, pfn, PB_all_free);
+			superpageblock_pb_now_used(pb_page);
+		}
+	}
+}
+
+/** set_pageblock_migratetype - Set the migratetype of a pageblock
  * @page: The page within the block of interest
  * @migratetype: migratetype to set
  */
@@ -577,6 +729,7 @@ void __meminit init_pageblock_migratetype(struct page *=
page,
 		if (sb->nr_reserved)
 			sb->nr_reserved--;
 		__spb_set_has_type(page, migratetype);
+		spb_update_list(sb);
 	}
 }
=20
@@ -1015,6 +1168,11 @@ static void mark_pageblock_free(struct page *page, u=
nsigned long pfn)
 	clear_pfnblock_bit(page, pfn, PB_has_unmovable);
 	clear_pfnblock_bit(page, pfn, PB_has_reclaimable);
 	clear_pfnblock_bit(page, pfn, PB_has_movable);
+
+	if (!get_pfnblock_bit(page, pfn, PB_all_free)) {
+		set_pfnblock_bit(page, pfn, PB_all_free);
+		superpageblock_pb_now_free(page);
+	}
 }
=20
 /*
@@ -1063,7 +1221,8 @@ static inline void __free_one_page(struct page *page,
=20
 	/*
 	 * When freeing a whole pageblock, clear stale PCP ownership
-	 * and actual-contents tracking flags up front.  The in-loop
+	 * and actual-contents tracking flags up front, and mark it
+	 * as fully free for superpageblock accounting.  The in-loop
 	 * check only fires when sub-pageblock pages merge *up to*
 	 * pageblock_order, not when entering at pageblock_order
 	 * directly.
@@ -2006,6 +2165,20 @@ static __always_inline void page_del_and_expand(stru=
ct zone *zone,
 {
 	int nr_pages =3D 1 << high;
=20
+	/*
+	 * If we're splitting a page that spans at least a full pageblock,
+	 * the allocated pageblock transitions from fully-free to in-use.
+	 * Clear PB_all_free and update superpageblock accounting.
+	 */
+	if (high >=3D pageblock_order) {
+		unsigned long pfn =3D page_to_pfn(page);
+
+		if (get_pfnblock_bit(page, pfn, PB_all_free)) {
+			clear_pfnblock_bit(page, pfn, PB_all_free);
+			superpageblock_pb_now_used(page);
+		}
+	}
+
 	__del_page_from_free_list(page, zone, high, migratetype);
 	nr_pages -=3D expand(zone, page, low, high, migratetype);
 	account_freepages(zone, -nr_pages, migratetype);
@@ -2535,6 +2708,25 @@ try_to_claim_block(struct zone *zone, struct page *p=
age,
 	/* Take ownership for orders >=3D pageblock_order */
 	if (current_order >=3D pageblock_order) {
 		unsigned int nr_added;
+		unsigned long pb_pfn;
+
+		/*
+		 * Clear PB_all_free for pageblocks being claimed.
+		 * This path bypasses page_del_and_expand(), so we
+		 * must handle the free=E2=86=92used transition here.
+		 * Use block_type (the original migratetype) because
+		 * that's what was decremented when PB_all_free was set.
+		 */
+		for (pb_pfn =3D page_to_pfn(page);
+		     pb_pfn < page_to_pfn(page) + (1 << current_order);
+		     pb_pfn +=3D pageblock_nr_pages) {
+			struct page *pb_page =3D pfn_to_page(pb_pfn);
+
+			if (get_pfnblock_bit(pb_page, pb_pfn, PB_all_free)) {
+				clear_pfnblock_bit(pb_page, pb_pfn, PB_all_free);
+				superpageblock_pb_now_used(pb_page);
+			}
+		}
=20
 		del_page_from_free_list(page, zone, current_order, block_type);
 		change_pageblock_range(page, current_order, start_type);
@@ -3651,6 +3843,14 @@ int __isolate_free_page(struct page *page, unsigned =
int order)
=20
 	del_page_from_free_list(page, zone, order, mt);
=20
+	/*
+	 * The free page is leaving buddy. For order >=3D pageblock_order, every
+	 * constituent pageblock had PB_all_free set; clear those bits and
+	 * decrement nr_free so the SPB pageblock-level counter stays in sync.
+	 */
+	if (order >=3D pageblock_order)
+		superpageblock_range_now_used(page, order);
+
 	/*
 	 * Set the pageblock if the isolated page is at least half of a
 	 * pageblock
@@ -8163,6 +8363,8 @@ unsigned long __offline_isolated_pages(unsigned long =
start_pfn,
 		BUG_ON(!PageBuddy(page));
 		VM_WARN_ON(get_pageblock_migratetype(page) !=3D MIGRATE_ISOLATE);
 		order =3D buddy_order(page);
+		if (order >=3D pageblock_order)
+			superpageblock_range_now_used(page, order);
 		del_page_from_free_list(page, zone, order, MIGRATE_ISOLATE);
 		pfn +=3D (1 << order);
 	}
@@ -8254,6 +8456,25 @@ bool take_page_off_buddy(struct page *page)
=20
 			del_page_from_free_list(page_head, zone, page_order,
 						migratetype);
+			/*
+			 * break_down_buddy_pages() re-adds every non-target
+			 * pageblock to buddy at order >=3D pageblock_order, so
+			 * those keep their PB_all_free state. Only the target's
+			 * pageblock loses its fully-free status -- clear that
+			 * one bit and decrement the SPB nr_free counter.
+			 */
+			if (page_order >=3D pageblock_order) {
+				unsigned long pfn_pb =3D ALIGN_DOWN(pfn,
+							pageblock_nr_pages);
+				struct page *pb_page =3D pfn_to_page(pfn_pb);
+
+				if (get_pfnblock_bit(pb_page, pfn_pb,
+						     PB_all_free)) {
+					clear_pfnblock_bit(pb_page, pfn_pb,
+							   PB_all_free);
+					superpageblock_pb_now_used(pb_page);
+				}
+			}
 			break_down_buddy_pages(zone, page_head, page, 0,
 						page_order, migratetype);
 			SetPageHWPoisonTakenOff(page);
@@ -8558,3 +8779,73 @@ struct page *alloc_pages_nolock_noprof(gfp_t gfp_fla=
gs, int nid, unsigned int or
 	return page;
 }
 EXPORT_SYMBOL_GPL(alloc_pages_nolock_noprof);
+
+#ifdef CONFIG_DEBUG_FS
+static const char * const sb_fullness_names[] =3D {
+	"full", "75pct", "50pct", "25pct", "almost_empty"
+};
+
+static const char * const sb_category_names[] =3D {
+	"clean", "tainted"
+};
+
+static int superpageblock_debugfs_show(struct seq_file *m, void *v)
+{
+	struct zone *zone;
+	int cat, full;
+
+	for_each_populated_zone(zone) {
+		unsigned long i;
+		int empty_count =3D 0;
+		struct superpageblock *sb;
+
+		if (!zone->superpageblocks)
+			continue;
+
+		seq_printf(m, "Node %d, zone %8s: %lu superpageblocks, base_pfn=3D0x%lx\=
n",
+			   zone->zone_pgdat->node_id, zone->name,
+			   zone->nr_superpageblocks, zone->superpageblock_base_pfn);
+
+		list_for_each_entry(sb, &zone->spb_empty, list)
+			empty_count++;
+		if (empty_count)
+			seq_printf(m, "  empty: %d\n", empty_count);
+
+		for (cat =3D 0; cat < __NR_SB_CATEGORIES; cat++) {
+			for (full =3D 0; full < __NR_SB_FULLNESS; full++) {
+				int count =3D 0;
+
+				list_for_each_entry(sb,
+					&zone->spb_lists[cat][full], list)
+					count++;
+				if (count)
+					seq_printf(m, "  %s/%s: %d\n",
+						   sb_category_names[cat],
+						   sb_fullness_names[full],
+						   count);
+			}
+		}
+
+		/* Per-superpageblock detail */
+		for (i =3D 0; i < zone->nr_superpageblocks; i++) {
+			sb =3D &zone->superpageblocks[i];
+			seq_printf(m, "  sb[%lu] pfn=3D0x%lx: unmov=3D%u recl=3D%u mov=3D%u rsv=
=3D%u free=3D%u total=3D%u\n",
+				   i, sb->start_pfn,
+				   sb->nr_unmovable, sb->nr_reclaimable,
+				   sb->nr_movable, sb->nr_reserved,
+				   sb->nr_free, sb->total_pageblocks);
+		}
+	}
+	return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(superpageblock_debugfs);
+
+static int __init superpageblock_debugfs_init(void)
+{
+	debugfs_create_file("superpageblocks", 0444, NULL, NULL,
+			    &superpageblock_debugfs_fops);
+	return 0;
+}
+late_initcall(superpageblock_debugfs_init);
+#endif /* CONFIG_DEBUG_FS */
--=20
2.54.0

From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id C3A2D3EFD11
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:37 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289241; cv=none;
 b=VrQxbgd0MFOOSHMsHS8ro3+ZXKNW2h4BDmjJgoa7yKe5C1TM+Z+99f/l3MotJS/h9t64eIgdJ8JCQoz5j9dmvqyuOZ4ij55HkruwFGQ2JKjWIwb5B0y3er/EFE1Sm7i53mUmzdlJhlq1NjfS6rqOdstzZ3Ajm1C8DuSFE/utoHY=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289241; c=relaxed/simple;
	bh=dXmkUTLt0Nzt9L4v6RayGNziWoTQ9NZbWHMX6x7o2I8=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=kx8COVR2tYE6EJZyJCP+/FqFB16l9n9re/kYpdMdhYsMdotBIHjgzXygfxvL4HSKYOf5klhzPn1g7m7uxyZroh99LpQIWJu0xEkouLRnMdZdT6eFxSPeJZWc0L4xaN1j9K1hm8YCohUKhEFBI1wkGD8HVTlp2u45kxFYrlcjzWw=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=LWI2bTfW; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="LWI2bTfW"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:MIME-Version:References:In-Reply-To:
	Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-Type:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=X0aGX80HuR6xHObr34naZL4Dvo5p+JAOapg4ifN1i7w=; b=LWI2bTfW2IQSaDXqwrxhjngvUI
	vylm668SB0qOQj4/JK5yA+qFKLWgd69FPYkBloCaocxUew0TM2U2XfBc2R6KuFXxL2DRG2H4ffH3k
	jC36QHrhY0O3d87ncBpmUmuYFymOr3LZ/bUFvWWRcVF3FY0Wbmx5xRx8aOsy7OjVdAmtmIIWjd4jy
	oipwlRliXxeiUfz6xxD7qHV3Igfjqkg5JuWAzggmdw7+1njxpVo+gchiiiSCRqDXEgKa4ihuDnOK2
	V+2+chv5HwB2aJu5uB61Mv7RLrX0UB1/fYD7uEVwuAiEAZG0bPzr8te7cF6fJBwmv8JVRgyqo1f+D
	ctIEdSgg==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPM-0000000024Q-1Lx0;
	Wed, 20 May 2026 11:00:28 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 11/40] mm: page_alloc: steer pageblock stealing to tainted
 superpageblocks
Date: Wed, 20 May 2026 10:59:17 -0400
Message-ID: <20260520150018.2491267-12-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

When the allocator needs to steal a movable pageblock for unmovable or
reclaimable allocations, prefer pages from already-tainted superpageblocks.
This concentrates contamination in superpageblocks that are already impure,
preserving clean superpageblocks for future 1GB hugepage allocations.

In __rmqueue_claim, after finding a candidate page on the free list, check
if it belongs to a clean superpageblock. If so, do a bounded scan
(SPB_SCAN_LIMIT=3D8) of the same free list looking for a page from a
tainted superpageblock instead. This is a best-effort optimization:
if no tainted alternative is found, the original page is used.

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 mm/page_alloc.c | 100 ++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 79 insertions(+), 21 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c0f86a30b5c7..a17c4cd9a788 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2327,6 +2327,9 @@ static void prep_new_page(struct page *page, unsigned=
 int order, gfp_t gfp_flags
 		clear_page_pfmemalloc(page);
 }
=20
+/* Bounded scan limit when searching free lists for tainted superpageblock=
 pages */
+#define SPB_SCAN_LIMIT 8
+
 /*
  * Go through the free lists for the given migratetype and remove
  * the smallest available page from the freelists
@@ -2694,6 +2697,7 @@ try_to_claim_block(struct zone *zone, struct page *pa=
ge,
 {
 	int free_pages, movable_pages, alike_pages;
 	unsigned long start_pfn;
+	struct superpageblock *sb;
 #ifdef CONFIG_COMPACTION
 	struct page *start_page;
 #endif
@@ -2726,7 +2730,12 @@ try_to_claim_block(struct zone *zone, struct page *p=
age,
 				clear_pfnblock_bit(pb_page, pb_pfn, PB_all_free);
 				superpageblock_pb_now_used(pb_page);
 			}
+			__spb_set_has_type(pb_page, start_type);
 		}
+		/* Single list update after all pageblocks processed */
+		sb =3D pfn_to_superpageblock(zone, page_to_pfn(page));
+		if (sb)
+			spb_update_list(sb);
=20
 		del_page_from_free_list(page, zone, current_order, block_type);
 		change_pageblock_range(page, current_order, start_type);
@@ -2771,29 +2780,24 @@ try_to_claim_block(struct zone *zone, struct page *=
page,
 		set_pageblock_migratetype(pfn_to_page(start_pfn), start_type);
 #ifdef CONFIG_COMPACTION
 		/*
-		 * Track actual page contents in pageblock flags.
-		 * Mark the pageblock with the type being allocated, and
-		 * if unmovable/reclaimable pages are being placed into a
-		 * pageblock that already has movable pages, queue async
-		 * evacuation of the movable pages.
+		 * Track actual page contents in pageblock flags and
+		 * update superpageblock counters so the SPB moves to
+		 * the correct fullness list for steering.
 		 */
 		start_page =3D pfn_to_page(start_pfn);
-		if (start_type =3D=3D MIGRATE_UNMOVABLE) {
-			set_pfnblock_bit(start_page, start_pfn,
-					 PB_has_unmovable);
-			if (get_pfnblock_bit(start_page, start_pfn,
-					     PB_has_movable))
-				queue_pageblock_evacuate(zone, start_pfn);
-		} else if (start_type =3D=3D MIGRATE_RECLAIMABLE) {
-			set_pfnblock_bit(start_page, start_pfn,
-					 PB_has_reclaimable);
-			if (get_pfnblock_bit(start_page, start_pfn,
-					     PB_has_movable))
-				queue_pageblock_evacuate(zone, start_pfn);
-		} else if (start_type =3D=3D MIGRATE_MOVABLE) {
-			set_pfnblock_bit(start_page, start_pfn,
-					 PB_has_movable);
-		}
+		__spb_set_has_type(start_page, start_type);
+		if (block_type !=3D start_type)
+			__spb_set_has_type(start_page, block_type);
+
+		sb =3D pfn_to_superpageblock(zone, start_pfn);
+		if (sb)
+			spb_update_list(sb);
+
+		if ((start_type =3D=3D MIGRATE_UNMOVABLE ||
+		     start_type =3D=3D MIGRATE_RECLAIMABLE) &&
+		    get_pfnblock_bit(start_page, start_pfn,
+				     PB_has_movable))
+			queue_pageblock_evacuate(zone, start_pfn);
 #endif
 		return __rmqueue_smallest(zone, order, start_type);
 	}
@@ -2847,6 +2851,38 @@ __rmqueue_claim(struct zone *zone, int order, int st=
art_migratetype,
 			break;
=20
 		page =3D get_page_from_free_area(area, fallback_mt);
+
+		/*
+		 * For unmovable/reclaimable stealing, prefer pages from
+		 * tainted superpageblocks (already contaminated) to keep clean
+		 * superpageblocks clean for future 1GB allocations.
+		 */
+		if (start_migratetype !=3D MIGRATE_MOVABLE &&
+		    zone->superpageblocks && page) {
+			struct superpageblock *sb;
+			struct page *alt;
+			int scanned =3D 0;
+
+			sb =3D pfn_to_superpageblock(zone, page_to_pfn(page));
+			if (sb && spb_get_category(sb) =3D=3D SB_CLEAN) {
+				list_for_each_entry(alt,
+						    &area->free_list[fallback_mt],
+						    buddy_list) {
+					struct superpageblock *asb;
+
+					if (++scanned > SPB_SCAN_LIMIT)
+						break;
+					asb =3D pfn_to_superpageblock(zone,
+							page_to_pfn(alt));
+					if (asb && spb_get_category(asb) =3D=3D
+					    SB_TAINTED) {
+						page =3D alt;
+						break;
+					}
+				}
+			}
+		}
+
 		page =3D try_to_claim_block(zone, page, current_order, order,
 					  start_migratetype, fallback_mt,
 					  alloc_flags);
@@ -2867,6 +2903,7 @@ __rmqueue_claim(struct zone *zone, int order, int sta=
rt_migratetype,
 static __always_inline struct page *
 __rmqueue_steal(struct zone *zone, int order, int start_migratetype)
 {
+	struct superpageblock *sb;
 	struct free_area *area;
 	int current_order;
 	struct page *page;
@@ -2881,6 +2918,27 @@ __rmqueue_steal(struct zone *zone, int order, int st=
art_migratetype)
=20
 		page =3D get_page_from_free_area(area, fallback_mt);
 		page_del_and_expand(zone, page, order, current_order, fallback_mt);
+
+		/*
+		 * page_del_and_expand recorded PB_has_<fallback_mt> for the
+		 * source free list type. Also record the actual allocation
+		 * type so evacuation and defrag can find these pages.
+		 *
+		 * For example, a MOVABLE allocation stealing from an
+		 * UNMOVABLE free list must set PB_has_movable so the
+		 * pageblock is visible to evacuate_pageblock() and
+		 * spb_defrag_tainted(). __spb_set_has_type is idempotent:
+		 * it only increments the SPB counter on the 0->1 bit
+		 * transition.
+		 */
+		if (fallback_mt !=3D start_migratetype) {
+			__spb_set_has_type(page, start_migratetype);
+			sb =3D pfn_to_superpageblock(zone,
+						   page_to_pfn(page));
+			if (sb)
+				spb_update_list(sb);
+		}
+
 		trace_mm_page_alloc_extfrag(page, order, current_order,
 					    start_migratetype, fallback_mt);
 		return page;
--=20
2.54.0
From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id DF4FB3F164C
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:42 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289248; cv=none;
 b=HbCLWcgY1MkM/KGxhUb84C7vuI09Azu7p/Rdz4sOek2XPKGoXXU2xX9809Mt2w6Tl3QmHCWfv24FqeI0bmb2EgqXul8alBKycUss7Ete6ciVb0xywPmM7uAAOlbkQfePRGACCT6Lni1JZKaR1p/ZmZ0e1t0wQVHSwcd1OMUPICs=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289248; c=relaxed/simple;
	bh=YUopzE3EAetweTycTRh0eVwnKFhiZWlO4gUWGq8cU1s=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=K8OXBl9fRCtQkUAb7lNDVSyEjqHDLQLKoRFzTBLk7rxw9f3RxbgOIEE5S8n/Fx5R/QE4LRBAWEtPWX1S4Wfsp4mszWAT6tNxY/ZNHj2AGDECO0SuEDFE2YgjZjW1BZGxKdYoI0kBLp5v3oKpG2PJ6DUD9PsD3Ad13nvrloKoXX0=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=Q3g3M3cu; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="Q3g3M3cu"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:MIME-Version:References:In-Reply-To:
	Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-Type:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=2mObITUgdhBLk6YJLrlPC+4yDufeWuD49GNrXgWFp2A=; b=Q3g3M3cuLNQOnlx/T4T8wwyvb2
	ZzVgqv5JOIILXSp/eKFOk/t+n2LV8HCzoWneM6SV9jUlF0K+bBTJ5zGPleK3hxVrNP02djh7mObXQ
	hNRI2ePsozXIiHTc5/U9qbJYTGUbzXDTy8sN5U9GxypamHGjPRr8aRdnTVHyl1G0Zl9YIGMZJ5Gn0
	/ve03MW0czNeY8ykz+i1EnkHCiJ4zARSwHKQc306mImNoeFjgEiRYfxvTZbUZ26MLW25bQ9sLTqg7
	abfErKa3EE1LXcOCy5V98g0+bLMo2RhRGnuhPu8I1CHUxJTnFS4xD1GPVLp96gIWOQ6Puwd3bWO0x
	DGLKPhJw==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPM-0000000024Q-1RlE;
	Wed, 20 May 2026 11:00:28 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 12/40] mm: page_alloc: steer movable allocations to
 fullest clean superpageblocks
Date: Wed, 20 May 2026 10:59:18 -0400
Message-ID: <20260520150018.2491267-13-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

When refilling PCP with whole pageblocks for movable allocations, prefer
pageblocks from the fullest clean (only free + movable) superpageblock.
This packs movable allocations into already-partial superpageblocks,
preserving empty superpageblocks for potential 1GB hugepage allocation.

Add sb_preferred_for_movable() which walks the clean superpageblock lists
from SB_FULL_75 toward SB_ALMOST_EMPTY (SB_FULL is skipped, as it has no
free pageblocks) to find the fullest clean superpageblock with available
free pageblocks. Add __rmqueue_from_sb() which scans the buddy free list
for a page within a specific superpageblock's PFN range, with a bounded
scan limit (8 entries) to avoid excessive latency.

Hook into rmqueue_bulk() phase 1 (whole pageblock grab for PCP refill) to
try the preferred superpageblock before falling back to the normal
__rmqueue() path. This is the primary steering point for movable
allocations without per-superpageblock free lists.

Also fix an ALLOC_NOFRAGMENT propagation oversight in
alloc_pages_bulk_noprof(): the bulk allocator's preferred_zoneref is
computed locally, so it must also call alloc_flags_nofragment() to match
the protection that the single-page fastpath gets via
prepare_alloc_pages(). Without this, bulk folio refills for the page
cache could taint clean SPBs that the single-page path would have left
alone.

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 mm/page_alloc.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 86 insertions(+), 3 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a17c4cd9a788..9dc65bf93e71 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2330,6 +2330,73 @@ static void prep_new_page(struct page *page, unsigne=
d int order, gfp_t gfp_flags
 /* Bounded scan limit when searching free lists for tainted superpageblock=
 pages */
 #define SPB_SCAN_LIMIT 8
=20
+/**
+ * sb_preferred_for_movable - Find the fullest clean superpageblock for mo=
vable
+ * @zone: zone to search
+ *
+ * Walk spb_lists[CLEAN] from nearly full toward emptiest -- pack movable
+ * allocations into already-partial superpageblocks before starting new on=
es.
+ * Skip SB_FULL since those have no free pageblocks.
+ * Returns NULL if no suitable superpageblock found.
+ */
+static struct superpageblock *sb_preferred_for_movable(struct zone *zone)
+{
+	int full;
+	struct superpageblock *sb;
+
+	for (full =3D SB_FULL_75; full < __NR_SB_FULLNESS; full++) {
+		list_for_each_entry(sb, &zone->spb_lists[SB_CLEAN][full], list) {
+			if (sb->nr_free)
+				return sb;
+		}
+	}
+	/* Fall back to empty superpageblocks -- no clean partials available */
+	return NULL;
+}
+
+/**
+ * __rmqueue_from_sb - Try to allocate a page from a specific superpageblo=
ck
+ * @zone: zone to allocate from
+ * @order: allocation order
+ * @migratetype: type to allocate
+ * @sb: preferred superpageblock
+ *
+ * Scan the free list at the given order for a page within the superpagebl=
ock's
+ * PFN range. Bounded scan to avoid excessive latency. Returns NULL if
+ * no suitable page found.
+ */
+static struct page *__rmqueue_from_sb(struct zone *zone, unsigned int orde=
r,
+				      int migratetype, struct superpageblock *sb)
+{
+	unsigned int current_order;
+	unsigned long sb_start =3D sb->start_pfn;
+	unsigned long sb_end =3D sb_start + (1UL << SUPERPAGEBLOCK_ORDER);
+	struct free_area *area;
+	struct page *page;
+	int scanned;
+
+	for (current_order =3D order; current_order < NR_PAGE_ORDERS;
+	     ++current_order) {
+		area =3D &zone->free_area[current_order];
+		scanned =3D 0;
+
+		list_for_each_entry(page, &area->free_list[migratetype],
+				    buddy_list) {
+			unsigned long pfn =3D page_to_pfn(page);
+
+			if (pfn >=3D sb_start && pfn < sb_end) {
+				page_del_and_expand(zone, page, order,
+						    current_order,
+						    migratetype);
+				return page;
+			}
+			if (++scanned >=3D SPB_SCAN_LIMIT)
+				break;
+		}
+	}
+	return NULL;
+}
+
 /*
  * Go through the free lists for the given migratetype and remove
  * the smallest available page from the freelists
@@ -3119,12 +3186,26 @@ static bool rmqueue_bulk(struct zone *zone, unsigne=
d int order,
 	 * small zones, pages_needed can be less than a whole
 	 * pageblock; skip to smaller blocks or individual pages to
 	 * avoid overshooting the PCP high watermark.
+	 *
+	 * For movable allocations, prefer pageblocks from the
+	 * fullest clean superpageblock to pack allocations and
+	 * preserve empty superpageblocks for 1GB hugepages.
 	 */
 	while (refilled + pageblock_nr_pages <=3D pages_needed) {
-		struct page *page;
+		struct page *page =3D NULL;
=20
-		page =3D __rmqueue(zone, pageblock_order,
-				 migratetype, alloc_flags, &rmqm);
+		if (migratetype =3D=3D MIGRATE_MOVABLE) {
+			struct superpageblock *sb;
+
+			sb =3D sb_preferred_for_movable(zone);
+			if (sb)
+				page =3D __rmqueue_from_sb(zone, pageblock_order,
+							 migratetype, sb);
+		}
+		if (!page)
+			page =3D __rmqueue(zone, pageblock_order,
+					 migratetype,
+					 alloc_flags, &rmqm);
 		if (!page)
 			break;
=20
@@ -5843,6 +5924,8 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int =
preferred_nid,
 		goto out;
 	gfp =3D alloc_gfp;
=20
+	alloc_flags |=3D alloc_flags_nofragment(zonelist_zone(ac.preferred_zonere=
f), gfp);
+
 	/* Find an allowed local zone that meets the low watermark. */
 	z =3D ac.preferred_zoneref;
 	for_next_zone_zonelist_nodemask(zone, z, ac.highest_zoneidx, ac.nodemask)=
 {
--=20
2.54.0
From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 55CC13F4DFC
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:54 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289260; cv=none;
 b=XpySJZPcBm72P0M2fc5Kk6RJLYNLT2oePnc41wuFFMlHKknFmR5kUW4kGeGTvcmUR4doIWWTL5MRp+56jZ0a3A9o5OzA2aP+VPUyemk0oZMGPJlRWSeTQeH8d131ol8aDMp2X290v8RyxBLZf9gVIJwcA/9KqC5ME1deAhsPQ2k=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289260; c=relaxed/simple;
	bh=Xj05axZhYV43D169CjNm9cnx8/uJVX6muICix8QKwTw=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version:Content-Type;
 b=ZGjEmcHjV1Y5JuHBtZ+/hyTlVeakVdEA/sB2xB7E4RZXB4APD2XDmqcyINrasmEyu9AoWLkidgi7Zd1lA7uIwUzi460rwj3NE59sE15SRVp/Lk4+Dvz/SyBmn05I5sWA7VV5WsqD2ICDqprz6oXOL/TD+c22W6zo/dWHG3Ng3ck=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=daREX2O7; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="daREX2O7"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:Content-Type:MIME-Version:References:
	In-Reply-To:Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=hC1C9EftDdYJ0I9PlrjUJ9mOTwszjXtUqK9pcRRvRfI=; b=daREX2O7uq6uprF4Y/3TP/EAyE
	og/GcXVHMlrQDaIh32Df3Fo/KfstIHCi7DICOvLEtNk5DBqHosBdHHuEHiaqAjQkS7IRpEY678lmw
	vZIFGM45rQFlugo4EgGpfOL8/bWpXCQWWGaiRvcSpr4V18e71SdTQTw7nPpnf79AoSDz3JaapTtQS
	VuG3KQqPSalACFp/KcVMdGQvXG6NW9DsJdqzb2VplxFlfHrcnxxbzRH/Caz3/I3z8ugkbJoUL1CyF
	Sk9OiPA22JZwUYXqbV6BIseTEIgUFoMbQt6OjfVmuIAPAcXPAXzhMpsKabW3Bo3SkWBdjg7Ua7hEz
	FqhAiC1w==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPM-0000000024Q-1XXJ;
	Wed, 20 May 2026 11:00:28 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 13/40] mm: page_alloc: extract claim_whole_block from
 try_to_claim_block
Date: Wed, 20 May 2026 10:59:19 -0400
Message-ID: <20260520150018.2491267-14-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: quoted-printable

Extract the whole-pageblock claiming logic from try_to_claim_block()
into a standalone claim_whole_block() function. This handles the
PB_all_free =E2=86=92 used transition, pageblock migratetype change, and
block splitting for orders >=3D pageblock_order.

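Pure refactoring, no functional change intended. The only reordering is
that spb_update_list() now runs after the block has been removed from
the free list and expanded, rather than before. Prepares for reuse of
this logic in the per-superpageblock free lists patch.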

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 mm/page_alloc.c | 89 ++++++++++++++++++++++++++++++-------------------
 1 file changed, 55 insertions(+), 34 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9dc65bf93e71..1b619304864a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2750,6 +2750,57 @@ int find_suitable_fallback(struct free_area *area, u=
nsigned int order,
 	return -1;
 }
=20
+/*
+ * claim_whole_block - claim a free block (>=3D pageblock_order) for a new=
 type
+ * @zone: zone containing the page
+ * @page: free page to claim
+ * @current_order: order of the free page
+ * @order: requested allocation order
+ * @new_type: migratetype to assign
+ * @old_type: current migratetype of the block (for free list removal)
+ *
+ * Handle the PB_all_free =E2=86=92 used transition, change the pageblock
+ * migratetype, split the block down to @order, and return the page.
+ */
+static struct page *
+claim_whole_block(struct zone *zone, struct page *page,
+		  int current_order, int order, int new_type, int old_type)
+{
+	struct superpageblock *sb;
+	unsigned int nr_added;
+	unsigned long pb_pfn;
+
+	VM_WARN_ON_ONCE(current_order < order);
+
+	/*
+	 * Clear PB_all_free for pageblocks being claimed.
+	 * This path bypasses page_del_and_expand(), so we
+	 * must handle the free=E2=86=92used transition here.
+	 */
+	for (pb_pfn =3D page_to_pfn(page);
+	     pb_pfn < page_to_pfn(page) + (1 << current_order);
+	     pb_pfn +=3D pageblock_nr_pages) {
+		struct page *pb_page =3D pfn_to_page(pb_pfn);
+
+		if (get_pfnblock_bit(pb_page, pb_pfn, PB_all_free)) {
+			clear_pfnblock_bit(pb_page, pb_pfn, PB_all_free);
+			superpageblock_pb_now_used(pb_page);
+		}
+		__spb_set_has_type(pb_page, new_type);
+	}
+
+	del_page_from_free_list(page, zone, current_order, old_type);
+	change_pageblock_range(page, current_order, new_type);
+	nr_added =3D expand(zone, page, order, current_order, new_type);
+	account_freepages(zone, nr_added, new_type);
+
+	/* Single list update after all pageblocks processed */
+	sb =3D pfn_to_superpageblock(zone, page_to_pfn(page));
+	if (sb)
+		spb_update_list(sb);
+	return page;
+}
+
 /*
  * This function implements actual block claiming behaviour. If order is l=
arge
  * enough, we can claim the whole pageblock for the requested migratetype.=
 If
@@ -2764,9 +2815,9 @@ try_to_claim_block(struct zone *zone, struct page *pa=
ge,
 {
 	int free_pages, movable_pages, alike_pages;
 	unsigned long start_pfn;
-	struct superpageblock *sb;
 #ifdef CONFIG_COMPACTION
 	struct page *start_page;
+	struct superpageblock *sb;
 #endif
=20
 	/*
@@ -2777,39 +2828,9 @@ try_to_claim_block(struct zone *zone, struct page *p=
age,
 		return NULL;
=20
 	/* Take ownership for orders >=3D pageblock_order */
-	if (current_order >=3D pageblock_order) {
-		unsigned int nr_added;
-		unsigned long pb_pfn;
-
-		/*
-		 * Clear PB_all_free for pageblocks being claimed.
-		 * This path bypasses page_del_and_expand(), so we
-		 * must handle the free=E2=86=92used transition here.
-		 * Use block_type (the original migratetype) because
-		 * that's what was decremented when PB_all_free was set.
-		 */
-		for (pb_pfn =3D page_to_pfn(page);
-		     pb_pfn < page_to_pfn(page) + (1 << current_order);
-		     pb_pfn +=3D pageblock_nr_pages) {
-			struct page *pb_page =3D pfn_to_page(pb_pfn);
-
-			if (get_pfnblock_bit(pb_page, pb_pfn, PB_all_free)) {
-				clear_pfnblock_bit(pb_page, pb_pfn, PB_all_free);
-				superpageblock_pb_now_used(pb_page);
-			}
-			__spb_set_has_type(pb_page, start_type);
-		}
-		/* Single list update after all pageblocks processed */
-		sb =3D pfn_to_superpageblock(zone, page_to_pfn(page));
-		if (sb)
-			spb_update_list(sb);
-
-		del_page_from_free_list(page, zone, current_order, block_type);
-		change_pageblock_range(page, current_order, start_type);
-		nr_added =3D expand(zone, page, order, current_order, start_type);
-		account_freepages(zone, nr_added, start_type);
-		return page;
-	}
+	if (current_order >=3D pageblock_order)
+		return claim_whole_block(zone, page, current_order, order,
+					start_type, block_type);
=20
 	/* moving whole block can fail due to zone boundary conditions */
 	if (!prep_move_freepages_block(zone, page, &start_pfn, &free_pages,
--=20
2.54.0

From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id D23C13F5BF7
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:55 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289267; cv=none;
 b=U37uIAOpuepjY11SjSlfYRlKnF2TmHAEK4Yv7SebYfNkWzxzcjRRBuOsoUBly7uOo6Rbi0GUQ2eEVpwMC6nlru/V6fBx3zPKxr8LmklkghxX9lFhiKyaIu9Ar2EiGqz95vXJExMr92oKWDPNVd1Z30JVHrvW1Vt6wzCN8jK4v/k=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289267; c=relaxed/simple;
	bh=XLT3JnHcmbIQgKie+rKFcC8MV1+vGWkXAgnifagfK7s=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version:Content-Type;
 b=JAvsRjEULywlHOOuZsLOgdcsLnA9tuoczoon4iZpIs4AZfqrQ9sgYaSuDFbQnfx4kHw09FWoSx0dtYiiH/fkIX72lnyp5HdMk9/0+RKtHiOLihbW9vxk5o5A7omb2Vdz3fy283XZjoem+1sAYIZAHMTRGXPjIvREFVWF/rW8DPA=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=F6LDqZW9; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="F6LDqZW9"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:Content-Type:MIME-Version:References:
	In-Reply-To:Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=lHYjlMPvi7UC1C03e8wV8+JVskmYtHQ3HvKwnbe2S4c=; b=F6LDqZW9WKo8AuZ0Wx1FP19NLq
	Q56bn3lHB1LJE/3kmY0eaKmrmA8d1PBmDGu7mfSPgHDDg5mQsoIKGz2Tod2Aek0vig4SKrPfHNibb
	SQ6/kumer9nIezo9qpmY30bb3th1p9L6kw00mjEJJFVVMzPCAofmL+EYHFwifyUTTO+3txRWAOoOb
	/iOan3D04hh6ZxHoGytO+sIcm0YsHhWTSuPIcsLV4tIvRc/4IXyTkwcp8qAkGnPMNwq6xSmsUoo4X
	C8A+0w99CMfHdOGFmO5hwiLUmFu6slzSyg4Y6hIKXeXAHkGMI4irR+QrNGXIGZZXR6yxT08j7EAG0
	NqkKLCzQ==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPM-0000000024Q-1eWy;
	Wed, 20 May 2026 11:00:28 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 14/40] mm: page_alloc: add per-superpageblock free lists
Date: Wed, 20 May 2026 10:59:20 -0400
Message-ID: <20260520150018.2491267-15-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: quoted-printable

Per-superpageblock free lists keep allocation steering effective at every
order: all pages belonging to a superpageblock are tracked on its own
free_area[NR_PAGE_ORDERS], not on the zone-level free_area.  This lets
__rmqueue_smallest target a specific SPB by category/fullness without
walking the whole zone.

Sub-pageblock-order frees route to the containing SPB's free list via
__free_one_page; whole-pageblock and higher orders likewise.  PCP refill,
buddy coalescing, and migratetype steering all consult the per-SPB
free_area.

Memory-hotplug correctness.  Once the resize loop in
resize_zone_superpageblocks() may be invoked on a previously-empty zone
(memoryless NUMA node receiving its first online memory, CXL hot-add
into a zone with no prior pages), two latent bugs surface:

  - The SPB list heads (zone->spb_empty and the spb_lists[cat][full]
    matrix) are initialized only by setup_superpageblocks(), which is
    __init and runs only at boot.  Hot-add into a previously-empty zone
    invokes init_one_superpageblock() with zero-initialized list_heads,
    and the inlined list_add_tail() NULL-derefs walking ->next->prev.
    Factor list-head init out of setup_superpageblocks() into
    init_zone_spb_lists(), call it from resize_zone_superpageblocks()
    on the first-time path (zone->superpageblocks =3D=3D NULL); subsequent
    resizes skip it.

  - The resize loop copies struct superpageblock entries to a newly
    kvmalloc()'d array but does not fix up the embedded
    free_area[order].free_list[mt] list_heads.  Pages on those lists
    have buddy_list.prev/next pointing into the *old* array's list
    heads, so as soon as the swap takes effect, __rmqueue_smallest
    walks pointers into freed memory.  Extend the per-SPB list_replace
    pass to walk all NR_PAGE_ORDERS * MIGRATE_TYPES free lists too.

The same critical section that copies struct contents and fixes up
list heads must run under zone->lock to prevent a concurrent allocator
from observing partial state; take the lock around the
copy+fixup+swap.

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 include/linux/mmzone.h |  10 +
 mm/compaction.c        |  36 +-
 mm/internal.h          |  10 +
 mm/mm_init.c           | 146 +++++--
 mm/page_alloc.c        | 853 ++++++++++++++++++++++++++++++++---------
 mm/vmstat.c            |  66 ++--
 6 files changed, 883 insertions(+), 238 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b8ada3d13a34..85846bb041a8 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1021,9 +1021,19 @@ struct superpageblock {
 	u16			nr_reserved;	/* holes, firmware, etc. */
 	u16			total_pageblocks; /* zone-clipped total */
=20
+	/* Total free pages across all per-superpageblock free lists */
+	unsigned long		nr_free_pages;
+
 	/* For organizing superpageblocks by fullness category */
 	struct list_head	list;
=20
+	/*
+	 * Per-superpageblock free lists for all buddy orders.
+	 * All pages belonging to this superpageblock are tracked here,
+	 * keeping allocation steering effective at every order.
+	 */
+	struct free_area	free_area[NR_PAGE_ORDERS];
+
 	/* Identity */
 	unsigned long		start_pfn;
 	struct zone		*zone;
diff --git a/mm/compaction.c b/mm/compaction.c
index e8ca651e2b07..6d2aefdbc0c8 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -979,6 +979,12 @@ isolate_migratepages_block(struct compact_control *cc,=
 unsigned long low_pfn,
 					low_pfn +=3D (1UL << order) - 1;
 					nr_scanned +=3D (1UL << order) - 1;
 				}
+				/*
+				 * Skipped a movable page; clearing
+				 * PB_has_movable here would orphan SPB type
+				 * counters (debugfs invariant 1).
+				 */
+				movable_skipped =3D true;
 				goto isolate_fail;
 			}
 			/* for alloc_contig case */
@@ -1058,6 +1064,12 @@ isolate_migratepages_block(struct compact_control *c=
c, unsigned long low_pfn,
 					low_pfn +=3D (1UL << order) - 1;
 					nr_scanned +=3D (1UL << order) - 1;
 				}
+				/*
+				 * Skipped a movable compound page; clearing
+				 * PB_has_movable here would orphan SPB type
+				 * counters (debugfs invariant 1).
+				 */
+				movable_skipped =3D true;
 				goto isolate_fail;
 			}
 		}
@@ -1083,6 +1095,12 @@ isolate_migratepages_block(struct compact_control *c=
c, unsigned long low_pfn,
 				movable_skipped =3D true;
 			}
=20
+			/*
+			 * Non-LRU non-movable_ops page: still occupies the
+			 * pageblock, so clearing PB_has_movable here would
+			 * orphan SPB type counters (debugfs invariant 1).
+			 */
+			movable_skipped =3D true;
 			goto isolate_fail;
 		}
=20
@@ -1320,12 +1338,9 @@ isolate_migratepages_block(struct compact_control *c=
c, unsigned long low_pfn,
 		 * isolated (pinned, writeback, dirty, etc.), leave the
 		 * flag set so a future migration attempt can try again.
 		 */
-		if (!nr_isolated && !movable_skipped && valid_page &&
-		    get_pfnblock_bit(valid_page, pageblock_start_pfn(start_pfn),
-				     PB_has_movable))
-			clear_pfnblock_bit(valid_page,
-					   pageblock_start_pfn(start_pfn),
-					   PB_has_movable);
+		if (!nr_isolated && !movable_skipped && valid_page)
+			superpageblock_clear_has_movable(cc->zone,
+							valid_page);
 	}
=20
 	trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
@@ -1873,6 +1888,15 @@ static struct folio *compaction_alloc_noprof(struct =
folio *src, unsigned long da
 		prep_compound_page(&dst->page, order);
 	cc->nr_freepages -=3D 1 << order;
 	cc->nr_migratepages -=3D 1 << order;
+
+	/*
+	 * Compaction isolates free pages via __isolate_free_page, which
+	 * bypasses page_del_and_expand and its PB_has_* tracking.  The
+	 * destination will hold movable pages after migration, so mark
+	 * PB_has_movable on the destination pageblock now.
+	 */
+	superpageblock_set_has_movable(cc->zone, &dst->page);
+
 	return page_rmappable_folio(&dst->page);
 }
=20
diff --git a/mm/internal.h b/mm/internal.h
index 6a089bc4aa09..7091dc557f1f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1101,6 +1101,16 @@ void init_cma_reserved_pageblock(struct page *page);
=20
 #endif /* CONFIG_COMPACTION || CONFIG_CMA */
=20
+#ifdef CONFIG_COMPACTION
+void superpageblock_clear_has_movable(struct zone *zone, struct page *page=
);
+void superpageblock_set_has_movable(struct zone *zone, struct page *page);
+#else
+static inline void superpageblock_clear_has_movable(struct zone *zone,
+						    struct page *page) {}
+static inline void superpageblock_set_has_movable(struct zone *zone,
+						  struct page *page) {}
+#endif
+
 #ifdef CONFIG_MEMORY_HOTPLUG
 void resize_zone_superpageblocks(struct zone *zone);
 #endif
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 2dc73d8a8d6c..92e5f396cbd7 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1523,16 +1523,27 @@ static void __meminit init_one_superpageblock(struc=
t superpageblock *sb,
 	unsigned long sb_end =3D start_pfn + SUPERPAGEBLOCK_NR_PAGES;
 	unsigned long pb_start =3D max(start_pfn, zone_start);
 	unsigned long pb_end =3D min(sb_end, zone_end);
+	int order, t;
 	u16 actual_pbs;
=20
 	sb->nr_unmovable =3D 0;
 	sb->nr_reclaimable =3D 0;
 	sb->nr_movable =3D 0;
 	sb->nr_free =3D 0;
+	sb->nr_free_pages =3D 0;
 	INIT_LIST_HEAD(&sb->list);
 	sb->start_pfn =3D start_pfn;
 	sb->zone =3D zone;
=20
+	/* Initialize per-superpageblock free areas */
+	for (order =3D 0; order < NR_PAGE_ORDERS; order++) {
+		struct free_area *area =3D &sb->free_area[order];
+
+		for (t =3D 0; t < MIGRATE_TYPES; t++)
+			INIT_LIST_HEAD(&area->free_list[t]);
+		area->nr_free =3D 0;
+	}
+
 	/*
 	 * Start with all pageblock slots as reserved.
 	 * init_pageblock_migratetype() will decrement nr_reserved and
@@ -1561,6 +1572,22 @@ static void __meminit init_one_superpageblock(struct=
 superpageblock *sb,
 	}
 }
=20
+/*
+ * Initialize the per-zone SPB list heads. Called from boot
+ * (setup_superpageblocks) and from memory hotplug
+ * (resize_zone_superpageblocks) the first time SPBs are set up
+ * for a zone.
+ */
+static void __meminit init_zone_spb_lists(struct zone *zone)
+{
+	int cat, full;
+
+	INIT_LIST_HEAD(&zone->spb_empty);
+	for (cat =3D 0; cat < __NR_SB_CATEGORIES; cat++)
+		for (full =3D 0; full < __NR_SB_FULLNESS; full++)
+			INIT_LIST_HEAD(&zone->spb_lists[cat][full]);
+}
+
 static void __init setup_superpageblocks(struct zone *zone)
 {
 	unsigned long zone_start =3D zone->zone_start_pfn;
@@ -1568,17 +1595,22 @@ static void __init setup_superpageblocks(struct zon=
e *zone)
 	unsigned long sb_base, nr_superpageblocks;
 	size_t alloc_size;
 	unsigned long i;
-	int cat, full;
=20
 	zone->superpageblocks =3D NULL;
 	zone->nr_superpageblocks =3D 0;
 	zone->superpageblock_base_pfn =3D 0;
=20
 	/* Fullness lists steer allocations to preferred superpageblocks */
-	INIT_LIST_HEAD(&zone->spb_empty);
-	for (cat =3D 0; cat < __NR_SB_CATEGORIES; cat++)
-		for (full =3D 0; full < __NR_SB_FULLNESS; full++)
-			INIT_LIST_HEAD(&zone->spb_lists[cat][full]);
+	init_zone_spb_lists(zone);
+
+	/*
+	 * Warn if pages have already been freed into this zone's
+	 * free_area before superpageblocks are set up -- those pages
+	 * would become stranded because __rmqueue_smallest only
+	 * searches per-superpageblock free lists.
+	 */
+	for (i =3D 0; i < NR_PAGE_ORDERS; i++)
+		WARN_ON_ONCE(zone->free_area[i].nr_free);
=20
 	if (!zone->spanned_pages)
 		return;
@@ -1619,8 +1651,10 @@ static void __init setup_superpageblocks(struct zone=
 *zone)
  * the full zone span, copies existing superpageblocks (fixing up list hea=
ds),
  * and initializes new superpageblocks for the added range.
  *
- * Must be called under mem_hotplug_lock (write).  No concurrent
- * allocations can occur since the hotplugged pages are not yet online.
+ * Must be called under mem_hotplug_lock (write).  The hot-added pages
+ * themselves are not yet online, but allocations on previously-online
+ * pages within the same zone can still race the superpageblock-array
+ * swap; the function takes zone->lock for that critical section.
  */
 void __meminit resize_zone_superpageblocks(struct zone *zone)
 {
@@ -1634,6 +1668,7 @@ void __meminit resize_zone_superpageblocks(struct zon=
e *zone)
 	size_t alloc_size;
 	unsigned long i;
 	int nid =3D zone_to_nid(zone);
+	unsigned long flags;
=20
 	if (!zone->spanned_pages)
 		return;
@@ -1648,6 +1683,18 @@ void __meminit resize_zone_superpageblocks(struct zo=
ne *zone)
 	    new_nr_sbs =3D=3D zone->nr_superpageblocks)
 		return;
=20
+	/*
+	 * First time superpageblocks are being set up for this zone
+	 * (memory hot-added to a previously-empty zone, e.g. CXL bringing
+	 * a memoryless node online): the SPB fullness/category list heads
+	 * are still zero-initialized from the zone struct allocation.
+	 * setup_superpageblocks() runs only at boot via __init, so do that
+	 * piece of init here for the hotplug path. Subsequent calls for
+	 * the same zone will skip this -- superpageblocks is non-NULL.
+	 */
+	if (!zone->superpageblocks)
+		init_zone_spb_lists(zone);
+
 	alloc_size =3D new_nr_sbs * sizeof(struct superpageblock);
 	new_sbs =3D kvmalloc_node(alloc_size, GFP_KERNEL | __GFP_ZERO, nid);
 	if (!new_sbs) {
@@ -1656,6 +1703,37 @@ void __meminit resize_zone_superpageblocks(struct zo=
ne *zone)
 		return;
 	}
=20
+	/* Initialize new superpageblocks (not from old array) first, outside loc=
k */
+	if (zone->superpageblocks) {
+		old_offset =3D (zone->superpageblock_base_pfn - new_sb_base) >>
+			     SUPERPAGEBLOCK_ORDER;
+	} else {
+		old_offset =3D 0;
+	}
+
+	for (i =3D 0; i < new_nr_sbs; i++) {
+		struct superpageblock *sb =3D &new_sbs[i];
+		bool is_old =3D false;
+
+		if (zone->superpageblocks &&
+		    i >=3D old_offset &&
+		    i < old_offset + zone->nr_superpageblocks)
+			is_old =3D true;
+
+		if (is_old)
+			continue;
+
+		init_one_superpageblock(sb, zone,
+					new_sb_base + (i << SUPERPAGEBLOCK_ORDER),
+					zone_start, zone_end);
+	}
+
+	/*
+	 * Take zone->lock for the copy+fixup+swap to prevent concurrent
+	 * allocations from traversing free lists while we relocate them.
+	 */
+	spin_lock_irqsave(&zone->lock, flags);
+
 	/*
 	 * Copy existing superpageblocks to their new position.
 	 * The old array covers [old_base, old_base + old_nr * SB_SIZE).
@@ -1669,39 +1747,39 @@ void __meminit resize_zone_superpageblocks(struct z=
one *zone)
 		       zone->nr_superpageblocks * sizeof(struct superpageblock));
=20
 		/*
-		 * Fix up list_head pointers that were self-referencing
-		 * (empty lists) or pointing into the old array.
+		 * Fix up all list_head pointers: both the SPB category list
+		 * and every free_area[order].free_list[migratetype]. Pages on
+		 * buddy free lists have buddy_list.prev/next pointing at the
+		 * old array's list heads -- those must be updated to point at
+		 * the new array.
 		 */
 		for (i =3D old_offset; i < old_offset + zone->nr_superpageblocks; i++) {
 			struct superpageblock *sb =3D &new_sbs[i];
+			struct superpageblock *old_sb =3D
+				&zone->superpageblocks[i - old_offset];
+			int order, mt;
=20
-			if (list_empty(&sb->list))
+			/* Fix up sb->list (zone category/fullness list) */
+			if (list_empty(&old_sb->list))
 				INIT_LIST_HEAD(&sb->list);
 			else
-				list_replace(&zone->superpageblocks[i - old_offset].list,
-					     &sb->list);
-		}
-	}
-
-	/* Initialize new superpageblocks (slots not covered by old array) */
-	for (i =3D 0; i < new_nr_sbs; i++) {
-		struct superpageblock *sb =3D &new_sbs[i];
-		bool is_old =3D false;
-
-		if (zone->superpageblocks) {
-			old_offset =3D (zone->superpageblock_base_pfn - new_sb_base) >>
-				     SUPERPAGEBLOCK_ORDER;
-			if (i >=3D old_offset &&
-			    i < old_offset + zone->nr_superpageblocks)
-				is_old =3D true;
+				list_replace(&old_sb->list, &sb->list);
+
+			/* Fix up all free_area list heads */
+			for (order =3D 0; order < NR_PAGE_ORDERS; order++) {
+				for (mt =3D 0; mt < MIGRATE_TYPES; mt++) {
+					struct list_head *old_list =3D
+						&old_sb->free_area[order].free_list[mt];
+					struct list_head *new_list =3D
+						&sb->free_area[order].free_list[mt];
+
+					if (list_empty(old_list))
+						INIT_LIST_HEAD(new_list);
+					else
+						list_replace(old_list, new_list);
+				}
+			}
 		}
-
-		if (is_old)
-			continue;
-
-		init_one_superpageblock(sb, zone,
-					new_sb_base + (i << SUPERPAGEBLOCK_ORDER),
-					zone_start, zone_end);
 	}
=20
 	/*
@@ -1740,6 +1818,8 @@ void __meminit resize_zone_superpageblocks(struct zon=
e *zone)
 	zone->superpageblock_base_pfn =3D new_sb_base;
 	zone->spb_kvmalloced =3D true;
=20
+	spin_unlock_irqrestore(&zone->lock, flags);
+
 	/*
 	 * The boot-time array was allocated with memblock_alloc, which
 	 * is not individually freeable after boot.  Only kvfree arrays
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1b619304864a..b9c957fb4783 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -515,6 +515,140 @@ static void __spb_set_has_type(struct page *page, int=
 migratetype)
 	}
 }
=20
+/*
+ * __spb_clear_has_type - clear PB_has_* and decrement type counter
+ *
+ * Idempotent: only decrements the counter on the 1=E2=86=920 bit transiti=
on.
+ */
+static void __spb_clear_has_type(struct page *page, int migratetype)
+{
+	unsigned long pfn =3D page_to_pfn(page);
+	struct superpageblock *sb =3D pfn_to_superpageblock(page_zone(page), pfn);
+	int bit;
+
+	if (!sb)
+		return;
+
+	bit =3D migratetype_to_has_bit(migratetype);
+	if (bit < 0)
+		return;
+
+	if (get_pfnblock_bit(page, pfn, bit)) {
+		clear_pfnblock_bit(page, pfn, bit);
+		switch (bit) {
+		case PB_has_unmovable:
+			if (sb->nr_unmovable)
+				sb->nr_unmovable--;
+			break;
+		case PB_has_reclaimable:
+			if (sb->nr_reclaimable)
+				sb->nr_reclaimable--;
+			break;
+		case PB_has_movable:
+			if (sb->nr_movable)
+				sb->nr_movable--;
+			break;
+		}
+	}
+}
+
+#ifdef CONFIG_COMPACTION
+/*
+ * spb_pageblock_has_free_movable_fragments - find free MOVABLE fragments
+ * @zone: zone containing @page
+ * @page: any page within the target pageblock
+ *
+ * Returns true if the SPB containing @page has any free MOVABLE pages on =
its
+ * per-order free lists at orders below pageblock_order whose PFN falls wi=
thin
+ * the target pageblock. The compaction migrate scanner only sees in-use p=
ages,
+ * so a pageblock can look "empty of movable" to the scanner while the SPB
+ * still owns small-order MOVABLE fragments inside it. Clearing PB_has_mov=
able
+ * in that case would orphan those fragments from the SPB type accounting =
and
+ * trigger debugfs invariant 1 (sum_types undercount).
+ *
+ * Returns false (no fragments found) when the SPB lookup fails, which
+ * preserves the legacy clear-on-empty behavior for edge cases.
+ *
+ * Caller must hold zone->lock.
+ */
+static bool spb_pageblock_has_free_movable_fragments(struct zone *zone,
+						     struct page *page)
+{
+	unsigned long pfn =3D page_to_pfn(page);
+	unsigned long pb_start =3D pageblock_start_pfn(pfn);
+	unsigned long pb_end =3D pb_start + pageblock_nr_pages;
+	unsigned long frag_pfn;
+	struct superpageblock *sb;
+	struct list_head *list;
+	struct page *frag;
+	unsigned int order;
+
+	sb =3D pfn_to_superpageblock(zone, pfn);
+	if (!sb)
+		return false;
+
+	for (order =3D 0; order < pageblock_order; order++) {
+		list =3D &sb->free_area[order].free_list[MIGRATE_MOVABLE];
+		list_for_each_entry(frag, list, buddy_list) {
+			frag_pfn =3D page_to_pfn(frag);
+			if (frag_pfn >=3D pb_start && frag_pfn < pb_end)
+				return true;
+		}
+	}
+
+	return false;
+}
+
+/**
+ * superpageblock_clear_has_movable - clear PB_has_movable with SPB counte=
r update
+ * @zone: zone containing the page
+ * @page: page within the pageblock
+ *
+ * Called from compaction when a full pageblock scan determines no movable
+ * pages remain. Clears PB_has_movable and decrements the superpageblock's
+ * nr_movable counter atomically (under zone->lock). Without this,
+ * clearing PB_has_movable directly via clear_pfnblock_bit() would leave
+ * the SPB counter stale, causing nr_movable to grow unbounded as
+ * subsequent movable allocations re-set the bit and re-increment.
+ *
+ * The migrate scanner only inspects in-use pages, so it is blind to MOVAB=
LE
+ * fragments below pageblock_order sitting on the SPB free lists. Probe th=
ose
+ * lists first; if any fragment of @page's pageblock is still tracked by t=
he
+ * SPB, leave PB_has_movable set so the SPB type accounting stays consiste=
nt
+ * (debugfs invariant 1: unmov + recl + mov + free >=3D total - rsv).
+ */
+void superpageblock_clear_has_movable(struct zone *zone, struct page *page)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&zone->lock, flags);
+	if (!spb_pageblock_has_free_movable_fragments(zone, page))
+		__spb_clear_has_type(page, MIGRATE_MOVABLE);
+	spin_unlock_irqrestore(&zone->lock, flags);
+}
+
+/**
+ * superpageblock_set_has_movable - set PB_has_movable with SPB counter up=
date
+ * @zone: zone containing the page
+ * @page: page within the pageblock
+ *
+ * Called from compaction when a movable page is migrated into a pageblock.
+ * Compaction bypasses page_del_and_expand (which normally sets PB_has_*)
+ * by using __isolate_free_page + direct migration, so PB_has_movable must
+ * be set explicitly for the destination pageblock.
+ *
+ * Idempotent: only increments the counter on the 0=E2=86=921 bit transiti=
on.
+ */
+void superpageblock_set_has_movable(struct zone *zone, struct page *page)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&zone->lock, flags);
+	__spb_set_has_type(page, MIGRATE_MOVABLE);
+	spin_unlock_irqrestore(&zone->lock, flags);
+}
+#endif /* CONFIG_COMPACTION */
+
 /**
  * spb_get_category - Determine if a superpageblock is clean or tainted
  * @sb: superpageblock to classify
@@ -585,7 +719,7 @@ static void spb_update_list(struct superpageblock *sb)
=20
 	list_del_init(&sb->list);
=20
-	if (sb->nr_free =3D=3D SUPERPAGEBLOCK_NR_PAGEBLOCKS) {
+	if (sb->nr_free =3D=3D sb->total_pageblocks) {
 		list_add_tail(&sb->list, &zone->spb_empty);
 		return;
 	}
@@ -1023,12 +1157,41 @@ static inline void account_freepages(struct zone *z=
one, int nr_pages,
 			   zone->nr_free_highatomic + nr_pages);
 }
=20
+/**
+ * pfn_sb_free_area - Get the correct free_area for a page at given order
+ * @zone: the zone
+ * @pfn: page frame number
+ * @order: buddy order
+ * @sbp: if non-NULL, set to the superpageblock found, or NULL
+ *
+ * Returns the per-superpageblock free_area if @pfn belongs to a valid
+ * superpageblock, else the zone free_area (superpageblock setup failed).
+ */
+static inline struct free_area *pfn_sb_free_area(struct zone *zone,
+						 unsigned long pfn,
+						 unsigned int order,
+						 struct superpageblock **sbp)
+{
+	struct superpageblock *sb =3D pfn_to_superpageblock(zone, pfn);
+
+	if (sb) {
+		if (sbp)
+			*sbp =3D sb;
+		return &sb->free_area[order];
+	}
+	if (sbp)
+		*sbp =3D NULL;
+	return &zone->free_area[order];
+}
+
 /* Used for pages not on another list */
 static inline void __add_to_free_list(struct page *page, struct zone *zone,
 				      unsigned int order, int migratetype,
 				      bool tail)
 {
-	struct free_area *area =3D &zone->free_area[order];
+	unsigned long pfn =3D page_to_pfn(page);
+	struct superpageblock *sb;
+	struct free_area *area =3D pfn_sb_free_area(zone, pfn, order, &sb);
 	int nr_pages =3D 1 << order;
=20
 	VM_WARN_ONCE(get_pageblock_migratetype(page) !=3D migratetype,
@@ -1041,6 +1204,13 @@ static inline void __add_to_free_list(struct page *p=
age, struct zone *zone,
 		list_add(&page->buddy_list, &area->free_list[migratetype]);
 	area->nr_free++;
=20
+	if (sb) {
+		/* Keep zone-level nr_free accurate for watermark checks */
+		zone->free_area[order].nr_free++;
+		/* Track total free pages per superpageblock */
+		sb->nr_free_pages +=3D nr_pages;
+	}
+
 	if (order >=3D pageblock_order && !is_migrate_isolate(migratetype))
 		__mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, nr_pages);
 }
@@ -1053,7 +1223,8 @@ static inline void __add_to_free_list(struct page *pa=
ge, struct zone *zone,
 static inline void move_to_free_list(struct page *page, struct zone *zone,
 				     unsigned int order, int old_mt, int new_mt)
 {
-	struct free_area *area =3D &zone->free_area[order];
+	unsigned long pfn =3D page_to_pfn(page);
+	struct free_area *area =3D pfn_sb_free_area(zone, pfn, order, NULL);
 	int nr_pages =3D 1 << order;
=20
 	/* Free page moving can fail, so it happens before the type update */
@@ -1077,6 +1248,9 @@ static inline void move_to_free_list(struct page *pag=
e, struct zone *zone,
 static inline void __del_page_from_free_list(struct page *page, struct zon=
e *zone,
 					     unsigned int order, int migratetype)
 {
+	unsigned long pfn =3D page_to_pfn(page);
+	struct superpageblock *sb;
+	struct free_area *area =3D pfn_sb_free_area(zone, pfn, order, &sb);
 	int nr_pages =3D 1 << order;
=20
         VM_WARN_ONCE(get_pageblock_migratetype(page) !=3D migratetype,
@@ -1090,7 +1264,14 @@ static inline void __del_page_from_free_list(struct =
page *page, struct zone *zon
 	list_del(&page->buddy_list);
 	__ClearPageBuddy(page);
 	set_page_private(page, 0);
-	zone->free_area[order].nr_free--;
+	area->nr_free--;
+
+	if (sb) {
+		/* Keep zone-level nr_free accurate for watermark checks */
+		zone->free_area[order].nr_free--;
+		/* Track total free pages per superpageblock */
+		sb->nr_free_pages -=3D nr_pages;
+	}
=20
 	if (order >=3D pageblock_order && !is_migrate_isolate(migratetype))
 		__mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, -nr_pages);
@@ -1146,33 +1327,44 @@ static void change_pageblock_range(struct page *pag=
eblock_page,
 	}
 }
=20
-/*
+/**
  * mark_pageblock_free - handle a pageblock becoming fully free
  * @page: page at the start of the pageblock
  * @pfn: page frame number
+ * @migratetype: pointer to the caller's migratetype variable (may be upda=
ted)
  *
- * Clear stale PCP ownership and actual-contents tracking flags when
- * buddy merging reconstructs a full pageblock or a whole pageblock is
- * freed directly. No PCP can still hold pages from this block (otherwise
- * the buddy merge couldn't have completed), so the ownership entry would
- * just cause misrouted frees.
+ * Clear stale PCP ownership and actual-contents tracking flags, mark the
+ * pageblock as fully free for superpageblock accounting, and reset the
+ * migratetype to MOVABLE so the page lands on free_list[MIGRATE_MOVABLE].
+ * Non-movable allocations must go through RMQUEUE_CLAIM to reuse it,
+ * which properly handles PB_all_free and superpageblock accounting.
  */
-static void mark_pageblock_free(struct page *page, unsigned long pfn)
+static void mark_pageblock_free(struct page *page, unsigned long pfn,
+				int *migratetype)
 {
 	clear_pcpblock_owner(page);
=20
 	/*
-	 * The entire block is now free -- clear actual-contents tracking
-	 * flags since no allocated pages remain.
+	 * Clear PB_has_* bits and decrement corresponding SPB type
+	 * counters. Use __spb_clear_has_type (no list update) to avoid
+	 * bouncing the SPB between lists; pb_now_free's spb_update_list
+	 * handles the final reclassification.
 	 */
-	clear_pfnblock_bit(page, pfn, PB_has_unmovable);
-	clear_pfnblock_bit(page, pfn, PB_has_reclaimable);
-	clear_pfnblock_bit(page, pfn, PB_has_movable);
+	__spb_clear_has_type(page, MIGRATE_UNMOVABLE);
+	__spb_clear_has_type(page, MIGRATE_RECLAIMABLE);
+	__spb_clear_has_type(page, MIGRATE_MOVABLE);
=20
 	if (!get_pfnblock_bit(page, pfn, PB_all_free)) {
 		set_pfnblock_bit(page, pfn, PB_all_free);
 		superpageblock_pb_now_free(page);
 	}
+
+	if (*migratetype =3D=3D MIGRATE_UNMOVABLE ||
+	    *migratetype =3D=3D MIGRATE_RECLAIMABLE ||
+	    *migratetype =3D=3D MIGRATE_HIGHATOMIC) {
+		set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+		*migratetype =3D MIGRATE_MOVABLE;
+	}
 }
=20
 /*
@@ -1205,6 +1397,7 @@ static inline void __free_one_page(struct page *page,
 		int migratetype, fpi_t fpi_flags)
 {
 	struct capture_control *capc =3D task_capc(zone);
+	unsigned int orig_order =3D order;
 	unsigned long buddy_pfn =3D 0;
 	unsigned long combined_pfn;
 	struct page *buddy;
@@ -1217,18 +1410,31 @@ static inline void __free_one_page(struct page *pag=
e,
 	VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
 	VM_BUG_ON_PAGE(bad_range(zone, page), page);
=20
-	account_freepages(zone, 1 << order, migratetype);
+	if (order >=3D pageblock_order) {
+		int i, nr_pbs =3D 1 << (order - pageblock_order);
=20
-	/*
-	 * When freeing a whole pageblock, clear stale PCP ownership
-	 * and actual-contents tracking flags up front, and mark it
-	 * as fully free for superpageblock accounting.  The in-loop
-	 * check only fires when sub-pageblock pages merge *up to*
-	 * pageblock_order, not when entering at pageblock_order
-	 * directly.
-	 */
-	if (order =3D=3D pageblock_order)
-		mark_pageblock_free(page, pfn);
+		for (i =3D 0; i < nr_pbs; i++) {
+			int pb_mt =3D get_pfnblock_migratetype(
+					page + i * pageblock_nr_pages,
+					pfn + i * pageblock_nr_pages);
+			mark_pageblock_free(page + i * pageblock_nr_pages,
+					    pfn + i * pageblock_nr_pages,
+					    &pb_mt);
+		}
+		/*
+		 * After mark_pageblock_free, non-CMA sub-pageblocks are
+		 * MOVABLE. CMA pageblocks retain their CMA type so pages
+		 * land on the correct free list for CMA allocations.
+		 * ISOLATE pageblocks must stay ISOLATE so that
+		 * account_freepages() correctly skips them -- otherwise
+		 * NR_FREE_PAGES gets incremented for isolated pages.
+		 */
+		if (!is_migrate_cma(migratetype) &&
+		    !is_migrate_isolate(migratetype))
+			migratetype =3D MIGRATE_MOVABLE;
+	}
+
+	account_freepages(zone, 1 << order, migratetype);
=20
 	while (order < MAX_PAGE_ORDER) {
 		int buddy_mt =3D migratetype;
@@ -1285,8 +1491,29 @@ static inline void __free_one_page(struct page *page,
 		 * clear any stale PCP ownership and actual-contents
 		 * tracking flags.
 		 */
-		if (order =3D=3D pageblock_order)
-			mark_pageblock_free(page, pfn);
+		if (order =3D=3D pageblock_order) {
+			int old_mt =3D migratetype;
+
+			mark_pageblock_free(page, pfn, &migratetype);
+			/*
+			 * mark_pageblock_free may convert migratetype to
+			 * MOVABLE. Transfer the accounting done earlier so
+			 * nr_free_highatomic doesn't leak.
+			 *
+			 * We transfer 1 << orig_order pages -- the amount
+			 * credited by this __free_one_page call. Buddies
+			 * consumed during merging may also have HIGHATOMIC
+			 * credits from their own frees; those are not tracked
+			 * here. In practice HIGHATOMIC reserves are small and
+			 * short-lived, so any residual drift is minor.
+			 */
+			if (old_mt !=3D migratetype) {
+				account_freepages(zone, -(1 << orig_order),
+						  old_mt);
+				account_freepages(zone, 1 << orig_order,
+						  migratetype);
+			}
+		}
 	}
=20
 done_merging:
@@ -2163,20 +2390,42 @@ static __always_inline void page_del_and_expand(str=
uct zone *zone,
 						struct page *page, int low,
 						int high, int migratetype)
 {
+	struct superpageblock *sb;
 	int nr_pages =3D 1 << high;
=20
 	/*
 	 * If we're splitting a page that spans at least a full pageblock,
-	 * the allocated pageblock transitions from fully-free to in-use.
-	 * Clear PB_all_free and update superpageblock accounting.
+	 * each constituent pageblock transitions from fully-free to in-use.
+	 * Clear PB_all_free and update superpageblock accounting for ALL
+	 * pageblocks in the range, not just the first one.
 	 */
 	if (high >=3D pageblock_order) {
 		unsigned long pfn =3D page_to_pfn(page);
+		unsigned long end_pfn =3D pfn + (1 << high);
=20
-		if (get_pfnblock_bit(page, pfn, PB_all_free)) {
-			clear_pfnblock_bit(page, pfn, PB_all_free);
-			superpageblock_pb_now_used(page);
+		for (; pfn < end_pfn; pfn +=3D pageblock_nr_pages) {
+			struct page *pb_page =3D pfn_to_page(pfn);
+
+			if (get_pfnblock_bit(pb_page, pfn, PB_all_free)) {
+				clear_pfnblock_bit(pb_page, pfn, PB_all_free);
+				superpageblock_pb_now_used(pb_page);
+			}
+			__spb_set_has_type(pb_page, migratetype);
 		}
+		/* Single list update after all pageblocks processed */
+		sb =3D pfn_to_superpageblock(zone, page_to_pfn(page));
+		if (sb)
+			spb_update_list(sb);
+	} else {
+		/*
+		 * Sub-pageblock allocation: set PB_has_<migratetype> for
+		 * the containing pageblock. Idempotent: only increments
+		 * the counter on the first allocation of this type.
+		 */
+		__spb_set_has_type(page, migratetype);
+		sb =3D pfn_to_superpageblock(zone, page_to_pfn(page));
+		if (sb)
+			spb_update_list(sb);
 	}
=20
 	__del_page_from_free_list(page, zone, high, migratetype);
@@ -2330,6 +2579,15 @@ static void prep_new_page(struct page *page, unsigne=
d int order, gfp_t gfp_flags
 /* Bounded scan limit when searching free lists for tainted superpageblock=
 pages */
 #define SPB_SCAN_LIMIT 8
=20
+/*
+ * Reserve free pageblocks in tainted superpageblocks for unmovable/reclai=
mable
+ * allocations.  Movable allocations skip tainted superpageblocks that have
+ * this many or fewer free pageblocks, ensuring that unmovable claims
+ * always find room in existing tainted superpageblocks instead of spilling
+ * into clean ones.
+ */
+#define SPB_TAINTED_RESERVE	4
+
 /**
  * sb_preferred_for_movable - Find the fullest clean superpageblock for mo=
vable
  * @zone: zone to search
@@ -2369,38 +2627,38 @@ static struct page *__rmqueue_from_sb(struct zone *=
zone, unsigned int order,
 				      int migratetype, struct superpageblock *sb)
 {
 	unsigned int current_order;
-	unsigned long sb_start =3D sb->start_pfn;
-	unsigned long sb_end =3D sb_start + (1UL << SUPERPAGEBLOCK_ORDER);
 	struct free_area *area;
 	struct page *page;
-	int scanned;
=20
-	for (current_order =3D order; current_order < NR_PAGE_ORDERS;
+	/*
+	 * Search the superpageblock's own free lists for all orders.
+	 */
+	for (current_order =3D order;
+	     current_order < NR_PAGE_ORDERS;
 	     ++current_order) {
-		area =3D &zone->free_area[current_order];
-		scanned =3D 0;
-
-		list_for_each_entry(page, &area->free_list[migratetype],
-				    buddy_list) {
-			unsigned long pfn =3D page_to_pfn(page);
+		area =3D &sb->free_area[current_order];
+		page =3D get_page_from_free_area(area, migratetype);
+		if (!page)
+			continue;
=20
-			if (pfn >=3D sb_start && pfn < sb_end) {
-				page_del_and_expand(zone, page, order,
-						    current_order,
-						    migratetype);
-				return page;
-			}
-			if (++scanned >=3D SPB_SCAN_LIMIT)
-				break;
-		}
+		page_del_and_expand(zone, page, order, current_order,
+				    migratetype);
+		return page;
 	}
+
 	return NULL;
 }
=20
 /*
  * Go through the free lists for the given migratetype and remove
- * the smallest available page from the freelists
+ * the smallest available page from the freelists.
+ *
+ * When superpageblocks are enabled, search per-superpageblock free lists =
first,
+ * falling back to zone free lists for pages not in any superpageblock.
  */
+static struct page *claim_whole_block(struct zone *zone, struct page *page,
+		  int current_order, int order, int new_type, int old_type);
+
 static __always_inline
 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
 						int migratetype)
@@ -2408,14 +2666,179 @@ struct page *__rmqueue_smallest(struct zone *zone,=
 unsigned int order,
 	unsigned int current_order;
 	struct free_area *area;
 	struct page *page;
+	int full;
+	struct superpageblock *sb;
+	/*
+	 * Category search order: 2 passes.
+	 * Movable: clean first, then tainted (pack into clean SBs).
+	 * Others: tainted first, then clean (concentrate in tainted SBs).
+	 */
+	static const enum sb_category cat_order[2][2] =3D {
+		[0] =3D { SB_TAINTED, SB_CLEAN },  /* unmovable/reclaimable */
+		[1] =3D { SB_CLEAN, SB_TAINTED },  /* movable */
+	};
+	int movable =3D (migratetype =3D=3D MIGRATE_MOVABLE) ? 1 : 0;
=20
-	/* Find a page of the appropriate size in the preferred list */
-	for (current_order =3D order; current_order < NR_PAGE_ORDERS; ++current_o=
rder) {
+	/*
+	 * Search per-superpageblock free lists for pages of the requested
+	 * migratetype, walking superpageblocks from fullest to emptiest
+	 * to pack allocations.
+	 *
+	 * For unmovable/reclaimable, prefer tainted superpageblocks to
+	 * concentrate non-movable allocations into fewer superpageblocks.
+	 * For movable, prefer clean superpageblocks to keep them homogeneous.
+	 *
+	 * Search empty superpageblocks between the preferred and fallback
+	 * category passes to avoid movable allocations consuming free
+	 * pageblocks in tainted superpageblocks (which unmovable needs for
+	 * future CLAIMs), and vice versa.
+	 */
+	for (full =3D SB_FULL; full < __NR_SB_FULLNESS; full++) {
+		enum sb_category cat =3D cat_order[movable][0];
+
+		list_for_each_entry(sb,
+			&zone->spb_lists[cat][full], list) {
+			if (!sb->nr_free_pages)
+				continue;
+			for (current_order =3D order;
+			     current_order < NR_PAGE_ORDERS;
+			     ++current_order) {
+				area =3D &sb->free_area[current_order];
+				page =3D get_page_from_free_area(
+					area, migratetype);
+				if (!page)
+					continue;
+				page_del_and_expand(zone, page,
+					order, current_order,
+					migratetype);
+				trace_mm_page_alloc_zone_locked(
+					page, order, migratetype,
+					pcp_allowed_order(order) &&
+					migratetype < MIGRATE_PCPTYPES);
+				return page;
+			}
+		}
+	}
+
+	/*
+	 * For non-movable allocations, try to reclaim free pageblocks
+	 * from tainted superpageblocks before looking at empty or clean
+	 * ones. Free pageblocks in tainted SBs have pages on the MOVABLE
+	 * free list (reset by mark_pageblock_free), so the search above
+	 * misses them. Claim them inline to keep non-movable allocations
+	 * concentrated in already-tainted superpageblocks.
+	 */
+	if (!movable && !is_migrate_cma(migratetype)) {
+		for (full =3D SB_FULL; full < __NR_SB_FULLNESS; full++) {
+			list_for_each_entry(sb,
+				&zone->spb_lists[SB_TAINTED][full], list) {
+				if (!sb->nr_free)
+					continue;
+				for (current_order =3D max_t(unsigned int,
+						order, pageblock_order);
+				     current_order < NR_PAGE_ORDERS;
+				     ++current_order) {
+					area =3D &sb->free_area[current_order];
+					page =3D get_page_from_free_area(
+						area, MIGRATE_MOVABLE);
+					if (!page)
+						continue;
+					if (get_pageblock_isolate(page))
+						continue;
+					if (is_migrate_cma(
+					    get_pageblock_migratetype(page)))
+						continue;
+					page =3D claim_whole_block(zone, page,
+						current_order, order,
+						migratetype, MIGRATE_MOVABLE);
+					trace_mm_page_alloc_zone_locked(
+						page, order, migratetype,
+						pcp_allowed_order(order) &&
+						migratetype < MIGRATE_PCPTYPES);
+					return page;
+				}
+			}
+		}
+	}
+
+	/* Empty superpageblocks: try before falling back to non-preferred catego=
ry */
+	list_for_each_entry(sb, &zone->spb_empty, list) {
+		if (!sb->nr_free_pages)
+			continue;
+		for (current_order =3D max(order, pageblock_order);
+		     current_order < NR_PAGE_ORDERS;
+		     ++current_order) {
+			area =3D &sb->free_area[current_order];
+			page =3D get_page_from_free_area(area, migratetype);
+			if (!page)
+				continue;
+			page_del_and_expand(zone, page, order,
+				current_order, migratetype);
+			trace_mm_page_alloc_zone_locked(page, order,
+				migratetype,
+				pcp_allowed_order(order) &&
+				migratetype < MIGRATE_PCPTYPES);
+			return page;
+		}
+	}
+
+	/*
+	 * Pass 4: movable allocations fall back to tainted SPBs.
+	 * Non-movable allocations must NOT search clean SPBs here;
+	 * stale migratetype labels create phantom non-movable free
+	 * pages in clean SPBs that would cause unnecessary tainting.
+	 * Let __rmqueue_claim and __rmqueue_steal handle non-movable
+	 * fallback with proper ALLOC_NOFRAGMENT protection.
+	 */
+	if (movable) {
+		for (full =3D SB_FULL; full < __NR_SB_FULLNESS; full++) {
+			enum sb_category cat =3D cat_order[movable][1];
+
+			list_for_each_entry(sb,
+				&zone->spb_lists[cat][full], list) {
+				if (!sb->nr_free_pages)
+					continue;
+				/*
+				 * Movable falling back to tainted: skip SBs
+				 * with few free pageblocks to reserve space
+				 * for future unmovable/reclaimable claims.
+				 */
+				if (sb->nr_free <=3D SPB_TAINTED_RESERVE)
+					continue;
+				for (current_order =3D order;
+				     current_order < NR_PAGE_ORDERS;
+				     ++current_order) {
+					area =3D &sb->free_area[current_order];
+					page =3D get_page_from_free_area(
+						area, migratetype);
+					if (!page)
+						continue;
+					page_del_and_expand(zone, page,
+						order, current_order,
+						migratetype);
+					trace_mm_page_alloc_zone_locked(
+						page, order, migratetype,
+						pcp_allowed_order(order) &&
+						migratetype < MIGRATE_PCPTYPES);
+					return page;
+				}
+			}
+		}
+	}
+
+	/*
+	 * Zone free lists: all pages should be on superpageblock lists.
+	 * Finding a page here means zone hotplug added memory without
+	 * setting up superpageblocks for the new range.
+	 */
+	for (current_order =3D order;
+	     current_order < NR_PAGE_ORDERS; ++current_order) {
 		area =3D &(zone->free_area[current_order]);
 		page =3D get_page_from_free_area(area, migratetype);
 		if (!page)
 			continue;
=20
+		WARN_ON_ONCE(zone->superpageblocks);
 		page_del_and_expand(zone, page, order, current_order,
 				    migratetype);
 		trace_mm_page_alloc_zone_locked(page, order, migratetype,
@@ -2761,6 +3184,8 @@ int find_suitable_fallback(struct free_area *area, un=
signed int order,
  *
  * Handle the PB_all_free =E2=86=92 used transition, change the pageblock
  * migratetype, split the block down to @order, and return the page.
+ * Used by both the claim fallback path and __rmqueue_smallest when
+ * reclaiming free pageblocks from tainted superpageblocks.
  */
 static struct page *
 claim_whole_block(struct zone *zone, struct page *page,
@@ -2772,11 +3197,6 @@ claim_whole_block(struct zone *zone, struct page *pa=
ge,
=20
 	VM_WARN_ON_ONCE(current_order < order);
=20
-	/*
-	 * Clear PB_all_free for pageblocks being claimed.
-	 * This path bypasses page_del_and_expand(), so we
-	 * must handle the free=E2=86=92used transition here.
-	 */
 	for (pb_pfn =3D page_to_pfn(page);
 	     pb_pfn < page_to_pfn(page) + (1 << current_order);
 	     pb_pfn +=3D pageblock_nr_pages) {
@@ -2827,6 +3247,16 @@ try_to_claim_block(struct zone *zone, struct page *p=
age,
 	if (get_pageblock_isolate(page))
 		return NULL;
=20
+	/*
+	 * Never steal from CMA pageblocks.  CMA pages freed through
+	 * PCP may land on the MOVABLE free list (PCP caches the
+	 * allocation-time migratetype), making them visible to the
+	 * fallback search.  Stealing would corrupt CMA by changing
+	 * the pageblock type away from MIGRATE_CMA.
+	 */
+	if (is_migrate_cma(get_pageblock_migratetype(page)))
+		return NULL;
+
 	/* Take ownership for orders >=3D pageblock_order */
 	if (current_order >=3D pageblock_order)
 		return claim_whole_block(zone, page, current_order, order,
@@ -2893,10 +3323,134 @@ try_to_claim_block(struct zone *zone, struct page =
*page,
 	return NULL;
 }
=20
+/*
+ * Search per-superpageblock free lists for a page of a fallback migratety=
pe.
+ * Sub-pageblock-order free pages live on superpageblock free lists, not z=
one
+ * free lists, so __rmqueue_claim and __rmqueue_steal need this helper to
+ * find fallback pages at those orders.
+ *
+ * For unmovable/reclaimable allocations, prefer tainted superpageblocks to
+ * keep clean ones clean for future large contiguous allocations.
+ * For movable allocations, prefer clean superpageblocks to keep movable
+ * pages consolidated and superpageblocks homogeneous.
+ *
+ * @search_cats: bitmask controlling which categories to search.
+ *   bit 0: search the preferred category (tainted for unmov, clean for mo=
v)
+ *   bit 1: search empty superpageblocks
+ *   bit 2: search the fallback category (clean for unmov, tainted for mov)
+ * All bits set (SB_SEARCH_ALL, 0x7) searches all three in that order.
+ */
+#define SB_SEARCH_PREFERRED	(1 << 0)
+#define SB_SEARCH_EMPTY		(1 << 1)
+#define SB_SEARCH_FALLBACK	(1 << 2)
+#define SB_SEARCH_ALL		(SB_SEARCH_PREFERRED | SB_SEARCH_EMPTY | SB_SEARCH_=
FALLBACK)
+
+static struct page *
+__rmqueue_sb_find_fallback(struct zone *zone, unsigned int order,
+			   int start_migratetype, int *fallback_mt,
+			   unsigned int search_cats)
+{
+	int full, i;
+	struct superpageblock *sb;
+	/*
+	 * Category search order: 2 passes.
+	 * Movable: clean, tainted.  Others: tainted, clean.
+	 */
+	static const enum sb_category cat_order[2][2] =3D {
+		[0] =3D { SB_TAINTED, SB_CLEAN },  /* unmovable/reclaimable */
+		[1] =3D { SB_CLEAN, SB_TAINTED },   /* movable */
+	};
+	int movable =3D (start_migratetype =3D=3D MIGRATE_MOVABLE) ? 1 : 0;
+
+	/* Pass 0: preferred category */
+	if (search_cats & SB_SEARCH_PREFERRED) {
+		enum sb_category cat =3D cat_order[movable][0];
+
+		for (full =3D SB_FULL; full < __NR_SB_FULLNESS; full++) {
+			list_for_each_entry(sb,
+					    &zone->spb_lists[cat][full], list) {
+				struct free_area *area =3D
+					&sb->free_area[order];
+
+				if (movable && cat =3D=3D SB_TAINTED &&
+				    sb->nr_free <=3D SPB_TAINTED_RESERVE)
+					continue;
+
+				for (i =3D 0; i < MIGRATE_PCPTYPES - 1; i++) {
+					int fmt =3D fallbacks[start_migratetype][i];
+					struct page *page;
+
+					page =3D get_page_from_free_area(area,
+								       fmt);
+					if (page) {
+						*fallback_mt =3D fmt;
+						return page;
+					}
+				}
+			}
+		}
+	}
+
+	/* Empty superpageblocks: between preferred and fallback */
+	if (search_cats & SB_SEARCH_EMPTY) {
+		list_for_each_entry(sb, &zone->spb_empty, list) {
+			struct free_area *area =3D
+				&sb->free_area[order];
+
+			for (i =3D 0; i < MIGRATE_PCPTYPES - 1; i++) {
+				int fmt =3D fallbacks[start_migratetype][i];
+				struct page *page;
+
+				page =3D get_page_from_free_area(area,
+							       fmt);
+				if (page) {
+					*fallback_mt =3D fmt;
+					return page;
+				}
+			}
+		}
+	}
+
+	/* Pass 1: fallback category */
+	if (search_cats & SB_SEARCH_FALLBACK) {
+		enum sb_category cat =3D cat_order[movable][1];
+
+		for (full =3D SB_FULL; full < __NR_SB_FULLNESS; full++) {
+			list_for_each_entry(sb,
+					    &zone->spb_lists[cat][full], list) {
+				struct free_area *area =3D
+					&sb->free_area[order];
+
+				if (movable && cat =3D=3D SB_TAINTED &&
+				    sb->nr_free <=3D SPB_TAINTED_RESERVE)
+					continue;
+
+				for (i =3D 0; i < MIGRATE_PCPTYPES - 1; i++) {
+					int fmt =3D fallbacks[start_migratetype][i];
+					struct page *page;
+
+					page =3D get_page_from_free_area(area,
+								       fmt);
+					if (page) {
+						*fallback_mt =3D fmt;
+						return page;
+					}
+				}
+			}
+		}
+	}
+
+	return NULL;
+}
+
 /*
  * Try to allocate from some fallback migratetype by claiming the entire b=
lock,
  * i.e. converting it to the allocation's start migratetype.
  *
+ * Search by category first, then by order within each category, to avoid
+ * claiming clean/empty superpageblocks when tainted ones still have space
+ * at smaller orders.
+ *
  * The use of signed ints for order and current_order is a deliberate
  * deviation from the rest of this file, to make the for loop
  * condition simpler.
@@ -2905,11 +3459,16 @@ static __always_inline struct page *
 __rmqueue_claim(struct zone *zone, int order, int start_migratetype,
 						unsigned int alloc_flags)
 {
-	struct free_area *area;
 	int current_order;
 	int min_order =3D order;
 	struct page *page;
 	int fallback_mt;
+	static const unsigned int cat_search[] =3D {
+		SB_SEARCH_PREFERRED,
+		SB_SEARCH_EMPTY,
+		SB_SEARCH_FALLBACK,
+	};
+	int c;
=20
 	/*
 	 * Do not steal pages from freelists belonging to other pageblocks
@@ -2920,65 +3479,34 @@ __rmqueue_claim(struct zone *zone, int order, int s=
tart_migratetype,
 		min_order =3D pageblock_order;
=20
 	/*
-	 * Find the largest available free page in the other list. This roughly
-	 * approximates finding the pageblock with the most free pages, which
-	 * would be too costly to do exactly.
+	 * Find the largest available free page in a fallback migratetype.
+	 * Search each superpageblock category across all orders before
+	 * moving to the next category, so that smaller blocks in tainted
+	 * superpageblocks are preferred over larger blocks in empty/clean
+	 * ones.
 	 */
-	for (current_order =3D MAX_PAGE_ORDER; current_order >=3D min_order;
-				--current_order) {
-		area =3D &(zone->free_area[current_order]);
-		fallback_mt =3D find_suitable_fallback(area, current_order,
-						     start_migratetype, true);
-
-		/* No block in that order */
-		if (fallback_mt =3D=3D -1)
-			continue;
-
-		/* Advanced into orders too low to claim, abort */
-		if (fallback_mt =3D=3D -2)
-			break;
-
-		page =3D get_page_from_free_area(area, fallback_mt);
+	for (c =3D 0; c < ARRAY_SIZE(cat_search); c++) {
+		for (current_order =3D MAX_PAGE_ORDER;
+		     current_order >=3D min_order; --current_order) {
+			if (!should_try_claim_block(current_order,
+						    start_migratetype))
+				break;
+			page =3D __rmqueue_sb_find_fallback(zone, current_order,
+						start_migratetype,
+						&fallback_mt, cat_search[c]);
+			if (!page)
+				continue;
=20
-		/*
-		 * For unmovable/reclaimable stealing, prefer pages from
-		 * tainted superpageblocks (already contaminated) to keep clean
-		 * superpageblocks clean for future 1GB allocations.
-		 */
-		if (start_migratetype !=3D MIGRATE_MOVABLE &&
-		    zone->superpageblocks && page) {
-			struct superpageblock *sb;
-			struct page *alt;
-			int scanned =3D 0;
-
-			sb =3D pfn_to_superpageblock(zone, page_to_pfn(page));
-			if (sb && spb_get_category(sb) =3D=3D SB_CLEAN) {
-				list_for_each_entry(alt,
-						    &area->free_list[fallback_mt],
-						    buddy_list) {
-					struct superpageblock *asb;
-
-					if (++scanned > SPB_SCAN_LIMIT)
-						break;
-					asb =3D pfn_to_superpageblock(zone,
-							page_to_pfn(alt));
-					if (asb && spb_get_category(asb) =3D=3D
-					    SB_TAINTED) {
-						page =3D alt;
-						break;
-					}
-				}
+			page =3D try_to_claim_block(zone, page, current_order,
+						  order, start_migratetype,
+						  fallback_mt, alloc_flags);
+			if (page) {
+				trace_mm_page_alloc_extfrag(page, order,
+					current_order, start_migratetype,
+					fallback_mt);
+				return page;
 			}
 		}
-
-		page =3D try_to_claim_block(zone, page, current_order, order,
-					  start_migratetype, fallback_mt,
-					  alloc_flags);
-		if (page) {
-			trace_mm_page_alloc_extfrag(page, order, current_order,
-						    start_migratetype, fallback_mt);
-			return page;
-		}
 	}
=20
 	return NULL;
@@ -2992,19 +3520,23 @@ static __always_inline struct page *
 __rmqueue_steal(struct zone *zone, int order, int start_migratetype)
 {
 	struct superpageblock *sb;
-	struct free_area *area;
 	int current_order;
 	struct page *page;
 	int fallback_mt;
=20
+	/*
+	 * Search per-superpageblock free lists for fallback migratetypes.
+	 * Superpageblocks are always enabled for populated zones.
+	 */
 	for (current_order =3D order; current_order < NR_PAGE_ORDERS; current_ord=
er++) {
-		area =3D &(zone->free_area[current_order]);
-		fallback_mt =3D find_suitable_fallback(area, current_order,
-						     start_migratetype, false);
-		if (fallback_mt =3D=3D -1)
+		page =3D __rmqueue_sb_find_fallback(zone, current_order,
+					start_migratetype,
+					&fallback_mt,
+					SB_SEARCH_PREFERRED | SB_SEARCH_FALLBACK);
+
+		if (!page)
 			continue;
=20
-		page =3D get_page_from_free_area(area, fallback_mt);
 		page_del_and_expand(zone, page, order, current_order, fallback_mt);
=20
 		/*
@@ -3239,33 +3771,11 @@ static bool rmqueue_bulk(struct zone *zone, unsigne=
d int order,
 		goto out;
=20
 	/*
-	 * Phase 2: Zone too fragmented for whole pageblocks.
-	 * Sweep zone free lists top-down for same-migratetype
-	 * chunks. Avoids cross-type stealing and keeps PCP
-	 * functional under fragmentation.
-	 *
-	 * No ownership claim or PagePCPBuddy - these are
-	 * sub-pageblock fragments cached for batching only.
-	 *
-	 * Stop above the requested order -- at that point,
-	 * phase 3's __rmqueue() does the same lookup but with
-	 * migratetype fallback.
+	 * Phase 2 was removed: it swept zone free lists for sub-pageblock
+	 * fragments, which are always empty when superpageblocks are enabled.
+	 * Phase 3's __rmqueue() -> __rmqueue_smallest() properly searches
+	 * per-superpageblock free lists at all orders.
 	 */
-	for (o =3D pageblock_order - 1;
-	     o > (int)order && refilled < pages_needed; o--) {
-		struct free_area *area =3D &zone->free_area[o];
-		struct page *page;
-
-		while (refilled + (1 << o) <=3D pages_needed) {
-			page =3D get_page_from_free_area(area, migratetype);
-			if (!page)
-				break;
-
-			del_page_from_free_list(page, zone, o, migratetype);
-			pcp_enqueue_tail(pcp, page, migratetype, o);
-			refilled +=3D 1 << o;
-		}
-	}
=20
 	/*
 	 * Phase 3: Last resort. Use __rmqueue() which does
@@ -4367,10 +4877,19 @@ static bool unreserve_highatomic_pageblock(const st=
ruct alloc_context *ac,
=20
 		spin_lock_irqsave(&zone->lock, flags);
 		for (order =3D 0; order < NR_PAGE_ORDERS; order++) {
-			struct free_area *area =3D &(zone->free_area[order]);
+			struct free_area *area;
+			struct superpageblock *sb;
 			unsigned long size;
-
-			page =3D get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
+			unsigned long i;
+
+			page =3D NULL;
+			/* Search per-superpageblock free lists */
+			for (i =3D 0; i < zone->nr_superpageblocks && !page; i++) {
+				sb =3D &zone->superpageblocks[i];
+				area =3D &sb->free_area[order];
+				page =3D get_page_from_free_area(area,
+							       MIGRATE_HIGHATOMIC);
+			}
 			if (!page)
 				continue;
=20
@@ -4501,29 +5020,20 @@ bool __zone_watermark_ok(struct zone *z, unsigned i=
nt order, unsigned long mark,
 	if (!order)
 		return true;
=20
-	/* For a high-order request, check at least one suitable page is free */
+	/*
+	 * For a high-order request, check at least one suitable page is free.
+	 * Zone free_area nr_free is shadowed -- it includes pages on
+	 * per-superpageblock free lists. A non-zero nr_free means the allocator
+	 * will find pages on superpageblock lists even if zone list heads are
+	 * empty.
+	 */
 	for (o =3D order; o < NR_PAGE_ORDERS; o++) {
 		struct free_area *area =3D &z->free_area[o];
-		int mt;
=20
 		if (!area->nr_free)
 			continue;
=20
-		for (mt =3D 0; mt < MIGRATE_PCPTYPES; mt++) {
-			if (!free_area_empty(area, mt))
-				return true;
-		}
-
-#ifdef CONFIG_CMA
-		if ((alloc_flags & ALLOC_CMA) &&
-		    !free_area_empty(area, MIGRATE_CMA)) {
-			return true;
-		}
-#endif
-		if ((alloc_flags & (ALLOC_HIGHATOMIC|ALLOC_OOM)) &&
-		    !free_area_empty(area, MIGRATE_HIGHATOMIC)) {
-			return true;
-		}
+		return true;
 	}
 	return false;
 }
@@ -8991,11 +9501,12 @@ static int superpageblock_debugfs_show(struct seq_f=
ile *m, void *v)
 		/* Per-superpageblock detail */
 		for (i =3D 0; i < zone->nr_superpageblocks; i++) {
 			sb =3D &zone->superpageblocks[i];
-			seq_printf(m, "  sb[%lu] pfn=3D0x%lx: unmov=3D%u recl=3D%u mov=3D%u rsv=
=3D%u free=3D%u total=3D%u\n",
+			seq_printf(m, "  sb[%lu] pfn=3D0x%lx: unmov=3D%u recl=3D%u mov=3D%u rsv=
=3D%u free=3D%u total=3D%u free_pages=3D%lu\n",
 				   i, sb->start_pfn,
 				   sb->nr_unmovable, sb->nr_reclaimable,
 				   sb->nr_movable, sb->nr_reserved,
-				   sb->nr_free, sb->total_pageblocks);
+				   sb->nr_free, sb->total_pageblocks,
+				   sb->nr_free_pages);
 		}
 	}
 	return 0;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 7b48b84287a7..9133254b6b87 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1575,41 +1575,51 @@ static int frag_show(struct seq_file *m, void *arg)
 static void pagetypeinfo_showfree_print(struct seq_file *m,
 					pg_data_t *pgdat, struct zone *zone)
 {
+	unsigned long counts[MIGRATE_TYPES][NR_PAGE_ORDERS] =3D { };
+	bool overflow[MIGRATE_TYPES][NR_PAGE_ORDERS] =3D { };
+	unsigned long sb_idx, nr_sbs =3D zone->nr_superpageblocks;
 	int order, mtype;
=20
+	/*
+	 * Free pages live on per-superpageblock free lists. Walk the SPBs,
+	 * accumulating per (migratetype, order) counts. The 100000 cap per
+	 * cell limits time under zone->lock; this is a debugging interface,
+	 * knowing there is "a lot" of one size is sufficient. zone->lock is
+	 * dropped between SPBs, so concurrent memory hotplug may produce
+	 * inconsistent counts -- acceptable for a debug-only interface.
+	 */
+	for (sb_idx =3D 0; sb_idx < nr_sbs; sb_idx++) {
+		struct superpageblock *sb =3D &zone->superpageblocks[sb_idx];
+
+		for (order =3D 0; order < NR_PAGE_ORDERS; order++) {
+			struct free_area *area =3D &sb->free_area[order];
+			struct list_head *curr;
+
+			for (mtype =3D 0; mtype < MIGRATE_TYPES; mtype++) {
+				if (overflow[mtype][order])
+					continue;
+				list_for_each(curr, &area->free_list[mtype]) {
+					if (++counts[mtype][order] >=3D 100000) {
+						overflow[mtype][order] =3D true;
+						break;
+					}
+				}
+			}
+		}
+		spin_unlock_irq(&zone->lock);
+		cond_resched();
+		spin_lock_irq(&zone->lock);
+	}
+
 	for (mtype =3D 0; mtype < MIGRATE_TYPES; mtype++) {
 		seq_printf(m, "Node %4d, zone %8s, type %12s ",
 					pgdat->node_id,
 					zone->name,
 					migratetype_names[mtype]);
-		for (order =3D 0; order < NR_PAGE_ORDERS; ++order) {
-			unsigned long freecount =3D 0;
-			struct free_area *area;
-			struct list_head *curr;
-			bool overflow =3D false;
-
-			area =3D &(zone->free_area[order]);
-
-			list_for_each(curr, &area->free_list[mtype]) {
-				/*
-				 * Cap the free_list iteration because it might
-				 * be really large and we are under a spinlock
-				 * so a long time spent here could trigger a
-				 * hard lockup detector. Anyway this is a
-				 * debugging tool so knowing there is a handful
-				 * of pages of this order should be more than
-				 * sufficient.
-				 */
-				if (++freecount >=3D 100000) {
-					overflow =3D true;
-					break;
-				}
-			}
-			seq_printf(m, "%s%6lu ", overflow ? ">" : "", freecount);
-			spin_unlock_irq(&zone->lock);
-			cond_resched();
-			spin_lock_irq(&zone->lock);
-		}
+		for (order =3D 0; order < NR_PAGE_ORDERS; order++)
+			seq_printf(m, "%s%6lu ",
+				   overflow[mtype][order] ? ">" : "",
+				   counts[mtype][order]);
 		seq_putc(m, '\n');
 	}
 }
--=20
2.54.0

From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id C62D83F0A9F
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:56 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289266; cv=none;
 b=rdXqgYRNxplUL1LaSEw2r8wdXXP86CkCuaKTPyqKkb8NxgMXQrIXSpBoAi6kKDOvPISVToLAVR7yh/E+Ic3PDrJRDnebQMmOn0RNHWVY5td4+ICv+emN3rE0G2+ohEdAO7pHQJx4sojH2eTbXx1su+F1kMnQxnNerQykV+UDUUo=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289266; c=relaxed/simple;
	bh=zOhwH3aFCNnFLCkhGNZ0FhaNmuAyaNof7B9o3nUGgkc=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=jehpSaSawIcL630KtmvlBH8MinBOQJjHqTla3c0Hp2127EMLIgQxhQswk2UMQPrs4DJHTpkgrPxmbLXzxl+ks858KrNl90ZLGD4F4pl6wYLL8ZIrOBE2ztMxeUmAP1wgQJobZou81KHF0bDb445c5lWatdpAmbP/dOItr0VOljk=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=I34VUAdR; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="I34VUAdR"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:MIME-Version:References:In-Reply-To:
	Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-Type:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=RO0fqhIiurFpA3oc5oMi5WLVFRVYBJxY3P6Jo+KfnbU=; b=I34VUAdR1fM0HfWYXNiLy5zOT4
	dghLCG68qzPKa8W1nLPURpxd/GN8aPN4neypltU/BXe6gSEJg9aoz8EPwtQNU3yC90dVbJXz0hnkg
	FawS4yZZDIH+Q/NApgZa8joLu4A2DG0yn5hV0ii3O2hHAMi1YLihEv71qezcBLZhrprLfbXFJSoF/
	P9QqPrV097bmwPbxU7wclsVzSK9GYS6X5b+Ms5Xtff0djQ1cifxXU0hS3nwGVlQPUL0bt1oekbBlm
	Nicbn9LtJ+1nzfAp5B7sQanJWc8ctGXtT4Tr5+HDV0mgEM3+6/ozHicHJ4rnhf82lgi5xETPB6Tqy
	qnNKocVA==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPM-0000000024Q-1ouG;
	Wed, 20 May 2026 11:00:28 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 15/40] mm: page_alloc: add background superpageblock
 defragmentation worker
Date: Wed, 20 May 2026 10:59:21 -0400
Message-ID: <20260520150018.2491267-16-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

Add an event-driven background worker that evacuates movable pages
from tainted superpageblocks when free space runs low.  Each super-
pageblock has its own work_struct, so defrag targets the specific
superpageblock that needs it rather than scanning the entire system.

Defrag is triggered from spb_update_list() when a tainted super-
pageblock drops below threshold: 1 or fewer free pageblocks, or less
than 2 pageblocks worth of free pages.  The worker evacuates movable
pageblocks until free space recovers: at least 2 free pageblocks or
3 pageblocks worth of free pages, or no movable pages remain.  Clean
superpageblocks (only free + movable) are never defragged; super-
pageblocks with no movable pages are skipped.

The worker calls evacuate_pageblock() directly from within its own
work_struct, so the older per-pageblock evacuate plumbing
(queue_pageblock_evacuate, evacuate_item, evacuate_pool,
evacuate_freelist, evacuate_item_alloc/free, evacuate_work_fn,
evacuate_irq_work_fn, pgdat->evacuate_pending,
pgdat->evacuate_irq_work, and their per-pgdat init in
pageblock_evacuate_init()) is no longer used and is dropped, along
with its sole remaining call site in try_to_claim_block().

Memory-hotplug correctness: this commit introduces the per-SPB
defrag_work / defrag_irq_work fields.  The resize loop in
resize_zone_superpageblocks() already runs init_zone_spb_lists() on
the first-time path and rewires per-SPB list heads after the kvmalloc
copy (from the previous commit), but the defrag work_structs need
their own init both for *copied* SPBs (the memcpy leaves them with
embedded list pointers that reference the old array's per-SPB storage)
and for *newly allocated* SPBs (boot-time init via the
pageblock_evacuate_init late_initcall only walks SPBs that exist at
boot, so hot-added SPBs would otherwise have zero-initialized
defrag_work and crash on first defrag attempt).  Call
init_superpageblock_defrag(sb) right after init_one_superpageblock(sb)
in the new-SPB loop, and add it to the copied-SPB fixup loop as well.

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 include/linux/mmzone.h |  19 ++-
 mm/internal.h          |   2 +
 mm/mm_init.c           |  11 ++
 mm/page_alloc.c        | 325 +++++++++++++++++++++++++++++------------
 4 files changed, 259 insertions(+), 98 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 85846bb041a8..6cba69603918 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1034,6 +1034,23 @@ struct superpageblock {
 	 */
 	struct free_area	free_area[NR_PAGE_ORDERS];
=20
+#ifdef CONFIG_COMPACTION
+	/* Background defragmentation work for this superpageblock */
+	struct work_struct	defrag_work;
+	struct irq_work		defrag_irq_work;
+	bool			defrag_active;
+	/*
+	 * Back-off state after a no-op defrag pass: defer the next attempt
+	 * until either nr_free_pages has grown by at least pageblock_nr_pages
+	 * or a cooldown elapses, so allocator hot paths cannot re-arm
+	 * defrag_work many times per second on an SB that cannot make progress.
+	 * defrag_last_no_progress_jiffies =3D=3D 0 means the previous pass made
+	 * progress (or no pass has run yet).
+	 */
+	unsigned long		defrag_last_no_progress_jiffies;
+	unsigned long		defrag_last_no_progress_pages;
+#endif
+
 	/* Identity */
 	unsigned long		start_pfn;
 	struct zone		*zone;
@@ -1632,8 +1649,6 @@ typedef struct pglist_data {
 	struct task_struct *kcompactd;
 	bool proactive_compact_trigger;
 	struct workqueue_struct *evacuate_wq;
-	struct llist_head evacuate_pending;
-	struct irq_work evacuate_irq_work;
 #endif
 	/*
 	 * This is a per-node reserve of pages that are not available
diff --git a/mm/internal.h b/mm/internal.h
index 7091dc557f1f..c0dbc2e4b7f0 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1102,9 +1102,11 @@ void init_cma_reserved_pageblock(struct page *page);
 #endif /* CONFIG_COMPACTION || CONFIG_CMA */
=20
 #ifdef CONFIG_COMPACTION
+void init_superpageblock_defrag(struct superpageblock *sb);
 void superpageblock_clear_has_movable(struct zone *zone, struct page *page=
);
 void superpageblock_set_has_movable(struct zone *zone, struct page *page);
 #else
+static inline void init_superpageblock_defrag(struct superpageblock *sb) {}
 static inline void superpageblock_clear_has_movable(struct zone *zone,
 						    struct page *page) {}
 static inline void superpageblock_set_has_movable(struct zone *zone,
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 92e5f396cbd7..ee5dcd89e31e 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1726,6 +1726,14 @@ void __meminit resize_zone_superpageblocks(struct zo=
ne *zone)
 		init_one_superpageblock(sb, zone,
 					new_sb_base + (i << SUPERPAGEBLOCK_ORDER),
 					zone_start, zone_end);
+		/*
+		 * Boot-time defrag work init in pageblock_evacuate_init()
+		 * is a late_initcall and only walks SPBs that exist at
+		 * that point. Newly hot-added SPBs need their work structs
+		 * initialized here, mirroring the reinit loop below for
+		 * copied SPBs.
+		 */
+		init_superpageblock_defrag(sb);
 	}
=20
 	/*
@@ -1779,6 +1787,9 @@ void __meminit resize_zone_superpageblocks(struct zon=
e *zone)
 						list_replace(old_list, new_list);
 				}
 			}
+
+			/* Reinitialize defrag work structs (contain stale pointers) */
+			init_superpageblock_defrag(sb);
 		}
 	}
=20
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b9c957fb4783..530ddc73e90a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -63,10 +63,6 @@
 #include "shuffle.h"
 #include "page_reporting.h"
=20
-#ifdef CONFIG_COMPACTION
-static void queue_pageblock_evacuate(struct zone *zone, unsigned long pfn);
-#endif
-
 /* Free Page Internal flags: for internal, non-pcp variants of free_pages(=
). */
 typedef int __bitwise fpi_t;
=20
@@ -709,8 +705,15 @@ static inline enum sb_fullness sb_get_fullness(struct =
superpageblock *sb,
  *
  * Called after counters change. Removes from current list (if any)
  * and adds to the appropriate list based on current fullness and
- * taint status.
+ * taint status. Also triggers background defragmentation if the
+ * superpageblock is tainted and running low on free space.
  */
+#ifdef CONFIG_COMPACTION
+static void spb_maybe_start_defrag(struct superpageblock *sb);
+#else
+static inline void spb_maybe_start_defrag(struct superpageblock *sb) {}
+#endif
+
 static void spb_update_list(struct superpageblock *sb)
 {
 	struct zone *zone =3D sb->zone;
@@ -727,6 +730,8 @@ static void spb_update_list(struct superpageblock *sb)
 	cat =3D spb_get_category(sb);
 	full =3D sb_get_fullness(sb, cat);
 	list_add_tail(&sb->list, &zone->spb_lists[cat][full]);
+
+	spb_maybe_start_defrag(sb);
 }
=20
 /**
@@ -3311,11 +3316,6 @@ try_to_claim_block(struct zone *zone, struct page *p=
age,
 		if (sb)
 			spb_update_list(sb);
=20
-		if ((start_type =3D=3D MIGRATE_UNMOVABLE ||
-		     start_type =3D=3D MIGRATE_RECLAIMABLE) &&
-		    get_pfnblock_bit(start_page, start_pfn,
-				     PB_has_movable))
-			queue_pageblock_evacuate(zone, start_pfn);
 #endif
 		return __rmqueue_smallest(zone, order, start_type);
 	}
@@ -8188,42 +8188,14 @@ void __init page_alloc_sysctl_init(void)
=20
 #ifdef CONFIG_COMPACTION
 /*
- * Pageblock evacuation: asynchronously migrate movable pages out of
- * pageblocks that were stolen for unmovable/reclaimable allocations.
- * This keeps unmovable/reclaimable allocations concentrated in fewer
- * pageblocks, reducing long-term fragmentation.
- *
- * Uses a global pool of 64 pre-allocated work items (~3.5KB total)
- * and a per-pgdat workqueue to keep migration node-local.
+ * Pageblock evacuation: synchronously migrate movable pages out of a
+ * pageblock to consolidate fragmentation. Driven by the background
+ * superpageblock defragmentation worker (see below); has no per-pageblock
+ * scheduling infrastructure of its own.
  */
=20
-struct evacuate_item {
-	struct work_struct	work;
-	struct zone		*zone;
-	unsigned long		start_pfn;
-	struct llist_node	free_node;
-};
-
-#define NR_EVACUATE_ITEMS	64
-static struct evacuate_item evacuate_pool[NR_EVACUATE_ITEMS];
-static struct llist_head evacuate_freelist;
-
-static struct evacuate_item *evacuate_item_alloc(void)
-{
-	struct llist_node *node;
-
-	node =3D llist_del_first(&evacuate_freelist);
-	if (!node)
-		return NULL;
-	return container_of(node, struct evacuate_item, free_node);
-}
-
-static void evacuate_item_free(struct evacuate_item *item)
-{
-	llist_add(&item->free_node, &evacuate_freelist);
-}
-
-static void evacuate_pageblock(struct zone *zone, unsigned long start_pfn)
+static void evacuate_pageblock(struct zone *zone, unsigned long start_pfn,
+			       bool force)
 {
 	unsigned long end_pfn =3D start_pfn + pageblock_nr_pages;
 	unsigned long pfn =3D start_pfn;
@@ -8241,8 +8213,14 @@ static void evacuate_pageblock(struct zone *zone, un=
signed long start_pfn)
 		.gfp_mask =3D GFP_HIGHUSER_MOVABLE,
 	};
=20
-	/* Verify this pageblock is still worth evacuating */
-	if (get_pageblock_migratetype(pfn_to_page(start_pfn)) =3D=3D MIGRATE_MOVA=
BLE)
+	/*
+	 * Verify this pageblock is still worth evacuating.
+	 * Skip if it reverted to MOVABLE (steal was undone) -- unless
+	 * force is set (background defrag wants to clear movable pages
+	 * out of tainted superpageblocks regardless of pageblock type).
+	 */
+	if (!force &&
+	    get_pageblock_migratetype(pfn_to_page(start_pfn)) =3D=3D MIGRATE_MOVA=
BLE)
 		return;
=20
 	INIT_LIST_HEAD(&cc.migratepages);
@@ -8297,86 +8275,215 @@ static void evacuate_pageblock(struct zone *zone, =
unsigned long start_pfn)
 		putback_movable_pages(&cc.migratepages);
 }
=20
-static void evacuate_work_fn(struct work_struct *work)
+/*
+ * Background superpageblock defragmentation.
+ *
+ * Evacuate movable pageblocks from tainted superpageblocks to consolidate
+ * contamination. Triggered on-demand when a tainted superpageblock runs
+ * low on free space, rather than running on a fixed timer.
+ *
+ * Goals for tainted superpageblocks:
+ * - At least 2 free pageblocks if movable pageblocks still exist
+ * - Or 3 pageblocks worth of free pages while movable pages remain
+ * - Skip superpageblocks with no movable pages (nothing to evacuate)
+ */
+
+/* Target free space: 3 pageblocks worth of free pages */
+#define SPB_DEFRAG_FREE_PAGES_TARGET	(3UL * pageblock_nr_pages)
+
+/**
+ * spb_needs_defrag - Check if a superpageblock needs defragmentation
+ * @sb: superpageblock to check (may be NULL)
+ *
+ * Returns false for NULL, non-tainted, or clean superpageblocks.
+ * A tainted superpageblock needs defrag if it has movable pages that can
+ * be evacuated AND free space is running low (1 or fewer free
+ * pageblocks, or less than 2 pageblocks worth of free pages).
+ */
+/*
+ * Cooldown between defrag attempts that made no progress, in seconds.
+ * Long enough to keep the allocator hot path quiet on saturated SBs;
+ * short enough that a freshly-freed pageblock isn't ignored for long.
+ */
+#define SPB_DEFRAG_NOOP_COOLDOWN_SECS	5
+
+static bool spb_needs_defrag(struct superpageblock *sb)
 {
-	struct evacuate_item *item =3D container_of(work, struct evacuate_item,
-						  work);
-	evacuate_pageblock(item->zone, item->start_pfn);
-	evacuate_item_free(item);
+	if (!sb)
+		return false;
+
+	if (spb_get_category(sb) !=3D SB_TAINTED)
+		return false;
+
+	/*
+	 * Back off if the previous pass made no progress: do not retry until
+	 * either the cooldown elapses or free pages have grown by at least a
+	 * pageblock's worth (a hint that there might be new material to
+	 * consolidate or evacuate).
+	 */
+	if (sb->defrag_last_no_progress_jiffies &&
+	    time_before(jiffies, sb->defrag_last_no_progress_jiffies +
+				 SPB_DEFRAG_NOOP_COOLDOWN_SECS * HZ) &&
+	    sb->nr_free_pages < sb->defrag_last_no_progress_pages +
+				pageblock_nr_pages)
+		return false;
+
+	/*
+	 * Tainted superpageblocks: evacuate movable pages to concentrate
+	 * unmovable/reclaimable allocations.  Migration targets are
+	 * allocated system-wide, so no internal free space is needed.
+	 * Maintain the tainted reserve so unmovable claims always
+	 * find room in existing tainted superpageblocks.
+	 */
+	return sb->nr_movable > 0 &&
+	       sb->nr_free < SPB_TAINTED_RESERVE;
 }
=20
 /**
- * evacuate_irq_work_fn - IRQ work callback to drain pending evacuations
- * @work: the irq_work embedded in pg_data_t
+ * spb_defrag_done - Check if defrag target has been reached
+ * @sb: superpageblock being defragmented
  *
- * queue_work() can deadlock when called from inside the page allocator
- * because it may try to allocate memory with locks already held.
- * Use irq_work to defer the queue_work() calls to a safe context.
+ * Stop defragmenting when the superpageblock has enough free space
+ * or there are no more movable pages to evacuate.
+ */
+static bool spb_defrag_done(struct superpageblock *sb)
+{
+	/*
+	 * Tainted superpageblocks: keep evacuating movable pages until
+	 * the reserve of free pageblocks is restored, or until there
+	 * are no more movable pages to evacuate.
+	 */
+	return !sb->nr_movable ||
+	       sb->nr_free >=3D SPB_TAINTED_RESERVE;
+}
+
+/**
+ * spb_defrag_superpageblock - evacuate movable pages from a tainted super=
pageblock
+ * @sb: the tainted superpageblock to defragment
+ *
+ * Find any pageblock with movable pages (PB_has_movable) and evacuate
+ * them, leaving only unmovable, reclaimable, and free pages behind.
+ * Stop when the free space target is reached.
  */
-static void evacuate_irq_work_fn(struct irq_work *work)
+static void spb_defrag_superpageblock(struct superpageblock *sb)
 {
-	pg_data_t *pgdat =3D container_of(work, pg_data_t,
-					evacuate_irq_work);
-	struct llist_node *pending;
-	struct evacuate_item *item, *next;
+	unsigned long pfn, end_pfn;
+	struct zone *zone =3D sb->zone;
=20
-	if (!pgdat->evacuate_wq)
+	if (!sb->nr_movable)
 		return;
=20
+	end_pfn =3D sb->start_pfn + SUPERPAGEBLOCK_NR_PAGES;
+
+	for (pfn =3D sb->start_pfn; pfn < end_pfn; pfn +=3D pageblock_nr_pages) {
+		struct page *page;
+
+		if (spb_defrag_done(sb))
+			return;
+
+		if (!pfn_valid(pfn))
+			continue;
+
+		page =3D pfn_to_page(pfn);
+
+		/* Skip pageblocks without movable pages */
+		if (!get_pfnblock_bit(page, pfn, PB_has_movable))
+			continue;
+
+		/* Skip if fully free -- nothing to evacuate */
+		if (get_pfnblock_bit(page, pfn, PB_all_free))
+			continue;
+
+		evacuate_pageblock(zone, pfn, true);
+	}
+}
+
+static void spb_defrag_work_fn(struct work_struct *work)
+{
+	struct superpageblock *sb =3D container_of(work, struct superpageblock,
+					     defrag_work);
+	u16 nr_free_before =3D sb->nr_free;
+	unsigned long flags;
+
+	spb_defrag_superpageblock(sb);
+
 	/*
-	 * Collect all pending items first, then queue them.  Use _safe
-	 * because evacuate_work_fn() may run immediately on another
-	 * CPU and free the item before we follow the next pointer.
+	 * If this pass produced no new free pageblocks, arm the no-progress
+	 * cooldown so spb_needs_defrag() rejects re-arms until either time
+	 * passes or nr_free_pages grows enough to suggest new material to
+	 * work on.  Use jiffies | 1 so the field is never accidentally zero.
 	 */
-	pending =3D llist_del_all(&pgdat->evacuate_pending);
-	llist_for_each_entry_safe(item, next, pending, free_node) {
-		INIT_WORK(&item->work, evacuate_work_fn);
-		queue_work(pgdat->evacuate_wq, &item->work);
+	if (sb->nr_free =3D=3D nr_free_before) {
+		sb->defrag_last_no_progress_jiffies =3D jiffies | 1;
+		sb->defrag_last_no_progress_pages =3D sb->nr_free_pages;
+	} else {
+		sb->defrag_last_no_progress_jiffies =3D 0;
 	}
+
+	/*
+	 * Allow new defrag requests for this superpageblock.  Clear under
+	 * zone->lock to match the read/set sites in spb_maybe_start_defrag();
+	 * without this a missed re-arm window exists on weakly-ordered arches
+	 * when the worker retires just before the next allocator caller checks
+	 * defrag_active.
+	 */
+	spin_lock_irqsave(&sb->zone->lock, flags);
+	sb->defrag_active =3D false;
+	spin_unlock_irqrestore(&sb->zone->lock, flags);
 }
=20
 /**
- * queue_pageblock_evacuate - schedule async evacuation of movable pages
- * @zone: the zone containing the pageblock
- * @pfn: start PFN of the pageblock (must be pageblock-aligned)
+ * spb_defrag_irq_work_fn - IRQ work callback to safely queue defrag work
+ * @work: the irq_work embedded in struct superpageblock
  *
- * Called from the page allocator when a movable pageblock is claimed
- * for unmovable or reclaimable allocations. Queues the pageblock for
- * background migration of its remaining movable pages. Uses irq_work
- * to defer the actual queue_work() call outside the allocator's lock
- * context.
+ * queue_work() can deadlock when called from inside the page allocator
+ * because it may try to allocate memory with locks already held.
+ * Use irq_work to defer the queue_work() call to a safe context.
  */
-static void queue_pageblock_evacuate(struct zone *zone, unsigned long pfn)
+static void spb_defrag_irq_work_fn(struct irq_work *work)
 {
-	struct evacuate_item *item;
-	pg_data_t *pgdat =3D zone->zone_pgdat;
+	struct superpageblock *sb =3D container_of(work, struct superpageblock,
+					     defrag_irq_work);
+	pg_data_t *pgdat =3D sb->zone->zone_pgdat;
=20
-	if (!pgdat->evacuate_irq_work.func)
+	if (pgdat->evacuate_wq)
+		queue_work(pgdat->evacuate_wq, &sb->defrag_work);
+}
+
+/**
+ * spb_maybe_start_defrag - Trigger defrag if a superpageblock needs it
+ * @sb: superpageblock whose counters just changed
+ *
+ * Called from counter update paths (under zone->lock). If the
+ * superpageblock is tainted and running low on free space, schedule
+ * irq_work to queue defrag work outside the allocator's lock context.
+ * The irq_work handler is set up by pageblock_evacuate_init();
+ * before that runs, defrag_irq_work.func is NULL and we skip.
+ */
+static void spb_maybe_start_defrag(struct superpageblock *sb)
+{
+	if (!spb_needs_defrag(sb))
 		return;
=20
-	item =3D evacuate_item_alloc();
-	if (!item)
+	/* Don't pile up work items; one defrag pass per superpageblock at a time=
 */
+	if (sb->defrag_active)
 		return;
=20
-	item->zone =3D zone;
-	item->start_pfn =3D pfn;
-	llist_add(&item->free_node, &pgdat->evacuate_pending);
-	irq_work_queue(&pgdat->evacuate_irq_work);
+	if (sb->defrag_irq_work.func) {
+		sb->defrag_active =3D true;
+		irq_work_queue(&sb->defrag_irq_work);
+	}
 }
=20
 static int __init pageblock_evacuate_init(void)
 {
-	int nid, i;
-
-	/* Initialize the global freelist of work items */
-	init_llist_head(&evacuate_freelist);
-	for (i =3D 0; i < NR_EVACUATE_ITEMS; i++)
-		llist_add(&evacuate_pool[i].free_node, &evacuate_freelist);
+	int nid;
=20
 	/* Create a per-pgdat workqueue */
 	for_each_online_node(nid) {
 		pg_data_t *pgdat =3D NODE_DATA(nid);
 		char name[32];
+		int z;
=20
 		snprintf(name, sizeof(name), "kevacuate/%d", nid);
 		pgdat->evacuate_wq =3D alloc_workqueue(name, WQ_MEM_RECLAIM, 1);
@@ -8385,14 +8492,40 @@ static int __init pageblock_evacuate_init(void)
 			continue;
 		}
=20
-		init_llist_head(&pgdat->evacuate_pending);
-		init_irq_work(&pgdat->evacuate_irq_work,
-			      evacuate_irq_work_fn);
+		/* Initialize per-superpageblock defrag work structs */
+		for (z =3D 0; z < MAX_NR_ZONES; z++) {
+			struct zone *zone =3D &pgdat->node_zones[z];
+			unsigned long j;
+
+			if (!zone->superpageblocks)
+				continue;
+
+			for (j =3D 0; j < zone->nr_superpageblocks; j++) {
+				INIT_WORK(&zone->superpageblocks[j].defrag_work,
+					  spb_defrag_work_fn);
+				init_irq_work(&zone->superpageblocks[j].defrag_irq_work,
+					      spb_defrag_irq_work_fn);
+			}
+		}
 	}
=20
 	return 0;
 }
 late_initcall(pageblock_evacuate_init);
+
+/**
+ * init_superpageblock_defrag - initialize defrag work structs for a super=
pageblock
+ * @sb: superpageblock to initialize
+ *
+ * Called during memory hotplug from resize_zone_superpageblocks(); the
+ * boot path, pageblock_evacuate_init(), open-codes the same two calls.
+ * Safe to call repeatedly on one superpageblock (reinits work structs).
+ */
+void init_superpageblock_defrag(struct superpageblock *sb)
+{
+	INIT_WORK(&sb->defrag_work, spb_defrag_work_fn);
+	init_irq_work(&sb->defrag_irq_work, spb_defrag_irq_work_fn);
+}
 #endif /* CONFIG_COMPACTION */
=20
 #ifdef CONFIG_CONTIG_ALLOC
--=20
2.54.0
From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id B4D2C3F0AB8
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:42 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289252; cv=none;
 b=Oiy/M9WozskjXB7h4gzE/lEVZXFqcJB/nVa17+/TzLSdKRjeArPM6Zg75jb8RDjtjsLHLgI6w2IwBkZJCZ+Xdyc8XwFA0FC2GKSc3um5qAc7r9oq8v0aS50Kjhoxx7ypZKMXHWGMHcUnDgC7BOMTpPwalA+19Vnu6qdWZJ6SZIU=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289252; c=relaxed/simple;
	bh=wbLL1OIyv0FX2GYi5Rsv93MD3ypPszRC533Fv4TCRUo=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=tariHDSLJWoq5qExSB8UeOpplMi1ac6V5LUndKZ31ZacKVf8akF872AY/fEpo2zxu14tgkwi2Cgh/HevbC3KMcvpWW1RkyEFK+GQ8xnvOPPWPOhUzWcmvd4MJUErK4poYhJlZgh9fTgJ4T678OS28Jp2MnH9qsdAZLtLv8o88qY=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=gIUAUZ5c; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="gIUAUZ5c"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:MIME-Version:References:In-Reply-To:
	Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-Type:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=aXenHoeHOgHgeBlGRM9WW9ffCAAzTAUJFryLoW48gj8=; b=gIUAUZ5cFcoJFrjGwzCASCJMZ4
	1ZmNU3h/vNkDRRDZ8A6jQKCTOv4ZPYWl5auCHT2uJ7smyBOTLCuSFPsxfqyen9Dq8fdm2fAPg7cPP
	R5SVfSCJSYWi0oWRCfqueFIakOKKVhmIWiTTT5SzdBBN3+xk57i+cx3fMgujoWTZUBY+6i3Txq/TD
	3tZG2ZiHbbuHj7y6iT1lm73KI2wVc/GMQdFet1FC5V84ch6dNAooa55/I5w9qPAPdAhq8xhrfWB8c
	y3xXFoTFrAc+7euEuq6cFCdy50fWdTfOPMfHbhosphsifXr6F+au+6nvtH1vUMOmAGe5jfk6WF2BZ
	6Pj8DQKg==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPM-0000000024Q-1wKR;
	Wed, 20 May 2026 11:00:28 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 16/40] mm: compaction: walk per-superpageblock free lists
 for migration targets
Date: Wed, 20 May 2026 10:59:22 -0400
Message-ID: <20260520150018.2491267-17-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

With superpageblocks (SPBs), free pages live on per-SPB free lists
rather than on the zone-level free lists. Standard compaction's
free-page scanner needs to walk the per-SPB free lists to find migration
targets; without this, kcompactd would see "nothing free" even when SPBs
hold plenty of order-9 buddies. fast_find_migrateblock() and
__compact_finished() are converted to walk the per-SPB lists as well.

Also wire superpageblock_set_has_movable() and the corresponding clear
calls into the migration-source-isolation and free-page-isolation paths,
so pageblock movability bookkeeping stays correct as compaction shuffles
contents around.

Fix the PB_has_movable check for zones whose start_pfn is not
pageblock-aligned (e.g. DMA32 with reserved memory at the bottom): skip
the bit update when the pageblock start falls below zone_start_pfn.

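This is the compaction-side infrastructure for SPB-aware standard
compaction. Subsequent commits add the predicates that let kcompactd
skip useless tainted SPBs.

On the page_alloc.c side, the tainted-SPB evacuation loop is split out
as spb_defrag_tainted() and gains a per-SPB defrag_cursor, so that each
pass resumes at the pageblock where the previous pass stopped.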

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 include/linux/mmzone.h |   1 +
 mm/compaction.c        | 337 ++++++++++++++++++++++++++++-------------
 mm/page_alloc.c        | 135 ++++++++++++-----
 3 files changed, 330 insertions(+), 143 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 6cba69603918..e7d760a689f9 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1039,6 +1039,7 @@ struct superpageblock {
 	struct work_struct	defrag_work;
 	struct irq_work		defrag_irq_work;
 	bool			defrag_active;
+	unsigned long		defrag_cursor;
 	/*
 	 * Back-off state after a no-op defrag pass: defer the next attempt
 	 * until either nr_free_pages has grown by at least pageblock_nr_pages
diff --git a/mm/compaction.c b/mm/compaction.c
index 6d2aefdbc0c8..e4ba21072435 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -867,7 +867,8 @@ isolate_migratepages_block(struct compact_control *cc, =
unsigned long low_pfn,
 	bool skip_on_failure =3D false;
 	unsigned long next_skip_pfn =3D 0;
 	bool skip_updated =3D false;
-	bool movable_skipped =3D false;
+	bool movable_seen =3D false;
+	bool pb_cleared =3D false;
 	int ret =3D 0;
=20
 	cc->migrate_pfn =3D low_pfn;
@@ -964,6 +965,26 @@ isolate_migratepages_block(struct compact_control *cc,=
 unsigned long low_pfn,
 				goto isolate_abort;
 			}
 			valid_page =3D page;
+
+			/*
+			 * Clear PB_has_movable up-front. The scan below will
+			 * re-set it if any movable page is encountered. This
+			 * self-corrects stale bits left behind when movable
+			 * content was previously freed without the bit being
+			 * cleared (e.g. PB held both movable and unmovable
+			 * pages, so mark_pageblock_free was never reached).
+			 * A racing allocator that places a movable page in
+			 * this PB will set the bit too; both setters are
+			 * idempotent, so the bit ends up correctly set.
+			 */
+			if (pageblock_start_pfn(start_pfn) >=3D
+			    cc->zone->zone_start_pfn &&
+			    get_pfnblock_bit(valid_page, low_pfn,
+					     PB_has_movable)) {
+				superpageblock_clear_has_movable(cc->zone,
+								 valid_page);
+				pb_cleared =3D true;
+			}
 		}
=20
 		if (PageHuge(page)) {
@@ -979,12 +1000,8 @@ isolate_migratepages_block(struct compact_control *cc=
, unsigned long low_pfn,
 					low_pfn +=3D (1UL << order) - 1;
 					nr_scanned +=3D (1UL << order) - 1;
 				}
-				/*
-				 * Skipped a movable page; clearing
-				 * PB_has_movable here would orphan SPB type
-				 * counters (debugfs invariant 1).
-				 */
-				movable_skipped =3D true;
+				/* HugeTLB page is movable content. */
+				movable_seen =3D true;
 				goto isolate_fail;
 			}
 			/* for alloc_contig case */
@@ -1064,12 +1081,8 @@ isolate_migratepages_block(struct compact_control *c=
c, unsigned long low_pfn,
 					low_pfn +=3D (1UL << order) - 1;
 					nr_scanned +=3D (1UL << order) - 1;
 				}
-				/*
-				 * Skipped a movable compound page; clearing
-				 * PB_has_movable here would orphan SPB type
-				 * counters (debugfs invariant 1).
-				 */
-				movable_skipped =3D true;
+				/* THP/compound page is movable content. */
+				movable_seen =3D true;
 				goto isolate_fail;
 			}
 		}
@@ -1088,19 +1101,21 @@ isolate_migratepages_block(struct compact_control *=
cc, unsigned long low_pfn,
 					locked =3D NULL;
 				}
=20
+				/* movable_ops page is movable content. */
+				movable_seen =3D true;
 				if (isolate_movable_ops_page(page, mode)) {
 					folio =3D page_folio(page);
 					goto isolate_success;
 				}
-				movable_skipped =3D true;
 			}
=20
 			/*
-			 * Non-LRU non-movable_ops page: still occupies the
-			 * pageblock, so clearing PB_has_movable here would
-			 * orphan SPB type counters (debugfs invariant 1).
+			 * Non-LRU, non-movable_ops page (slab, pgtable,
+			 * reserved, ...): not movable content. Do NOT mark
+			 * the PB as having movable pages; if it had no other
+			 * movable pages, the up-front clear of PB_has_movable
+			 * stays in effect.
 			 */
-			movable_skipped =3D true;
 			goto isolate_fail;
 		}
=20
@@ -1113,6 +1128,14 @@ isolate_migratepages_block(struct compact_control *c=
c, unsigned long low_pfn,
 		if (unlikely(!folio))
 			goto isolate_fail;
=20
+		/*
+		 * LRU folio reference acquired: this PB definitely
+		 * contains movable content. Mark it now so any abort
+		 * before isolate_success/isolate_fail_put still
+		 * triggers the post-loop PB_has_movable re-set.
+		 */
+		movable_seen =3D true;
+
 		/*
 		 * Migration will fail if an anonymous page is pinned in memory,
 		 * so avoid taking lru_lock and isolating it unnecessarily in an
@@ -1266,7 +1289,8 @@ isolate_migratepages_block(struct compact_control *cc=
, unsigned long low_pfn,
 			lruvec_unlock_irqrestore(locked, flags);
 			locked =3D NULL;
 		}
-		movable_skipped =3D true;
+		/* Page was LRU; treat as movable content even though we couldn't take i=
t. */
+		movable_seen =3D true;
 		folio_put(folio);
=20
 isolate_fail:
@@ -1330,17 +1354,31 @@ isolate_migratepages_block(struct compact_control *=
cc, unsigned long low_pfn,
 		if (!cc->no_set_skip_hint && valid_page && !skip_updated)
 			set_pageblock_skip(valid_page);
 		update_cached_migrate(cc, low_pfn);
+	}
+
+	/*
+	 * PB_has_movable was cleared up-front when this PB was first
+	 * entered. Re-set it unless a complete scan of the pageblock
+	 * proved no movable content exists. Re-setting is required on:
+	 *   - any partial scan (low_pfn !=3D end_pfn): we can't conclude
+	 *     the PB is movable-free without seeing every PFN
+	 *   - nr_isolated > 0: pages may fail migration and return to
+	 *     this PB, so the bit must persist
+	 *   - movable_seen: hugeTLB/THP/movable_ops/LRU content was
+	 *     observed, even if it could not be isolated
+	 * The set is idempotent (a racing allocator may set it too).
+	 */
+	if (pb_cleared && valid_page &&
+	    (low_pfn !=3D end_pfn || nr_isolated || movable_seen)) {
+		unsigned long pb_pfn =3D pageblock_start_pfn(start_pfn);
=20
 		/*
-		 * Full pageblock scanned with no movable pages isolated.
-		 * Only clear PB_has_movable if no movable pages were
-		 * seen at all. If movable pages exist but could not be
-		 * isolated (pinned, writeback, dirty, etc.), leave the
-		 * flag set so a future migration attempt can try again.
+		 * start_pfn may not be pageblock-aligned when the zone
+		 * start is not aligned (e.g. DMA zone at PFN 1). Skip
+		 * the update if the pageblock start falls below the zone.
 		 */
-		if (!nr_isolated && !movable_skipped && valid_page)
-			superpageblock_clear_has_movable(cc->zone,
-							valid_page);
+		if (pb_pfn >=3D cc->zone->zone_start_pfn)
+			superpageblock_set_has_movable(cc->zone, valid_page);
 	}
=20
 	trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
@@ -1557,6 +1595,7 @@ static void fast_isolate_freepages(struct compact_con=
trol *cc)
 	unsigned long low_pfn, min_pfn, highest =3D 0;
 	unsigned long nr_isolated =3D 0;
 	unsigned long distance;
+	unsigned long si, nr_spb;
 	struct page *page =3D NULL;
 	bool scan_start =3D false;
 	int order;
@@ -1594,45 +1633,66 @@ static void fast_isolate_freepages(struct compact_c=
ontrol *cc)
 	for (order =3D cc->search_order;
 	     !page && order >=3D 0;
 	     order =3D next_search_order(cc, order)) {
-		struct free_area *area =3D &cc->zone->free_area[order];
-		struct list_head *freelist;
-		struct page *freepage;
+		struct list_head *freelist =3D NULL;
+		struct page *freepage =3D NULL;
 		unsigned long flags;
 		unsigned int order_scanned =3D 0;
 		unsigned long high_pfn =3D 0;
=20
-		if (!area->nr_free)
+		if (!cc->zone->free_area[order].nr_free)
 			continue;
=20
 		spin_lock_irqsave(&cc->zone->lock, flags);
-		freelist =3D &area->free_list[MIGRATE_MOVABLE];
-		list_for_each_entry_reverse(freepage, freelist, buddy_list) {
-			unsigned long pfn;
-
-			order_scanned++;
-			nr_scanned++;
-			pfn =3D page_to_pfn(freepage);
-
-			if (pfn >=3D highest)
-				highest =3D max(pageblock_start_pfn(pfn),
-					      cc->zone->zone_start_pfn);
-
-			if (pfn >=3D low_pfn) {
-				cc->fast_search_fail =3D 0;
-				cc->search_order =3D order;
-				page =3D freepage;
-				break;
+
+		/*
+		 * With superpageblocks, free pages live on per-SPB free
+		 * lists rather than zone-level free lists.  Iterate all
+		 * SPBs to find candidate pages.
+		 */
+		nr_spb =3D cc->zone->nr_superpageblocks;
+		for (si =3D 0; !page && order_scanned < limit; si++) {
+			struct free_area *area;
+
+			if (nr_spb) {
+				if (si >=3D nr_spb)
+					break;
+				area =3D &cc->zone->superpageblocks[si].free_area[order];
+			} else {
+				if (si > 0)
+					break;
+				area =3D &cc->zone->free_area[order];
 			}
=20
-			if (pfn >=3D min_pfn && pfn > high_pfn) {
-				high_pfn =3D pfn;
+			freelist =3D &area->free_list[MIGRATE_MOVABLE];
+			list_for_each_entry_reverse(freepage,
+						    freelist,
+						    buddy_list) {
+				unsigned long pfn;
+
+				order_scanned++;
+				nr_scanned++;
+				pfn =3D page_to_pfn(freepage);
+
+				if (pfn >=3D highest)
+					highest =3D max(
+					    pageblock_start_pfn(pfn),
+					    cc->zone->zone_start_pfn);
+
+				if (pfn >=3D low_pfn) {
+					cc->fast_search_fail =3D 0;
+					cc->search_order =3D order;
+					page =3D freepage;
+					break;
+				}
=20
-				/* Shorten the scan if a candidate is found */
-				limit >>=3D 1;
-			}
+				if (pfn >=3D min_pfn && pfn > high_pfn) {
+					high_pfn =3D pfn;
+					limit >>=3D 1;
+				}
=20
-			if (order_scanned >=3D limit)
-				break;
+				if (order_scanned >=3D limit)
+					break;
+			}
 		}
=20
 		/* Use a maximum candidate pfn if a preferred one was not found */
@@ -1641,10 +1701,24 @@ static void fast_isolate_freepages(struct compact_c=
ontrol *cc)
=20
 			/* Update freepage for the list reorder below */
 			freepage =3D page;
+
+			/*
+			 * high_pfn page may be on a different SPB's list
+			 * than the last one scanned; fix up freelist.
+			 */
+			if (cc->zone->nr_superpageblocks) {
+				struct superpageblock *sb;
+
+				sb =3D pfn_to_superpageblock(cc->zone,
+							   high_pfn);
+				if (sb)
+					freelist =3D &sb->free_area[order].free_list[MIGRATE_MOVABLE];
+			}
 		}
=20
 		/* Reorder to so a future search skips recent pages */
-		move_freelist_head(freelist, freepage);
+		if (freelist && freepage)
+			move_freelist_head(freelist, freepage);
=20
 		/* Isolate the page if available */
 		if (page) {
@@ -1985,6 +2059,7 @@ static unsigned long fast_find_migrateblock(struct co=
mpact_control *cc)
 	unsigned long distance;
 	unsigned long pfn =3D cc->migrate_pfn;
 	unsigned long high_pfn;
+	unsigned long si, nr_spb;
 	int order;
 	bool found_block =3D false;
=20
@@ -2038,47 +2113,73 @@ static unsigned long fast_find_migrateblock(struct =
compact_control *cc)
 	for (order =3D cc->order - 1;
 	     order >=3D PAGE_ALLOC_COSTLY_ORDER && !found_block && nr_scanned < l=
imit;
 	     order--) {
-		struct free_area *area =3D &cc->zone->free_area[order];
-		struct list_head *freelist;
 		unsigned long flags;
 		struct page *freepage;
=20
-		if (!area->nr_free)
+		if (!cc->zone->free_area[order].nr_free)
 			continue;
=20
 		spin_lock_irqsave(&cc->zone->lock, flags);
-		freelist =3D &area->free_list[MIGRATE_MOVABLE];
-		list_for_each_entry(freepage, freelist, buddy_list) {
-			unsigned long free_pfn;
=20
-			if (nr_scanned++ >=3D limit) {
-				move_freelist_tail(freelist, freepage);
-				break;
+		/*
+		 * With superpageblocks, free pages live on per-SPB free
+		 * lists.  Iterate all SPBs to find candidates.
+		 */
+		nr_spb =3D cc->zone->nr_superpageblocks;
+		for (si =3D 0; !found_block && nr_scanned < limit; si++) {
+			struct free_area *area;
+			struct list_head *freelist;
+
+			if (nr_spb) {
+				if (si >=3D nr_spb)
+					break;
+				area =3D &cc->zone->superpageblocks[si].free_area[order];
+			} else {
+				if (si > 0)
+					break;
+				area =3D &cc->zone->free_area[order];
 			}
=20
-			free_pfn =3D page_to_pfn(freepage);
-			if (free_pfn < high_pfn) {
-				/*
-				 * Avoid if skipped recently. Ideally it would
-				 * move to the tail but even safe iteration of
-				 * the list assumes an entry is deleted, not
-				 * reordered.
-				 */
-				if (get_pageblock_skip(freepage))
-					continue;
-
-				/* Reorder to so a future search skips recent pages */
-				move_freelist_tail(freelist, freepage);
-
-				update_fast_start_pfn(cc, free_pfn);
-				pfn =3D pageblock_start_pfn(free_pfn);
-				if (pfn < cc->zone->zone_start_pfn)
-					pfn =3D cc->zone->zone_start_pfn;
-				cc->fast_search_fail =3D 0;
-				found_block =3D true;
-				break;
+			freelist =3D &area->free_list[MIGRATE_MOVABLE];
+			list_for_each_entry(freepage, freelist,
+					    buddy_list) {
+				unsigned long free_pfn;
+
+				if (nr_scanned++ >=3D limit) {
+					move_freelist_tail(freelist,
+							   freepage);
+					break;
+				}
+
+				free_pfn =3D page_to_pfn(freepage);
+				if (free_pfn < high_pfn) {
+					/*
+					 * Avoid if skipped recently.
+					 * Ideally it would move to
+					 * the tail but even safe
+					 * iteration of the list
+					 * assumes an entry is deleted,
+					 * not reordered.
+					 */
+					if (get_pageblock_skip(freepage))
+						continue;
+
+					move_freelist_tail(freelist,
+							   freepage);
+
+					update_fast_start_pfn(cc,
+							      free_pfn);
+					pfn =3D pageblock_start_pfn(
+							free_pfn);
+					if (pfn < cc->zone->zone_start_pfn)
+						pfn =3D cc->zone->zone_start_pfn;
+					cc->fast_search_fail =3D 0;
+					found_block =3D true;
+					break;
+				}
 			}
 		}
+
 		spin_unlock_irqrestore(&cc->zone->lock, flags);
 	}
=20
@@ -2292,6 +2393,7 @@ static bool should_proactive_compact_node(pg_data_t *=
pgdat)
 static enum compact_result __compact_finished(struct compact_control *cc)
 {
 	unsigned int order;
+	unsigned long si, nr_spb;
 	const int migratetype =3D cc->migratetype;
 	int ret;
=20
@@ -2364,33 +2466,56 @@ static enum compact_result __compact_finished(struc=
t compact_control *cc)
=20
 	/* Direct compactor: Is a suitable page free? */
 	ret =3D COMPACT_NO_SUITABLE_PAGE;
+	nr_spb =3D cc->zone->nr_superpageblocks;
 	for (order =3D cc->order; order < NR_PAGE_ORDERS; order++) {
-		struct free_area *area =3D &cc->zone->free_area[order];
+		/* Zone-level nr_free is maintained even with SPBs */
+		if (!cc->zone->free_area[order].nr_free)
+			continue;
=20
-		/* Job done if page is free of the right migratetype */
-		if (!free_area_empty(area, migratetype))
-			return COMPACT_SUCCESS;
+		/*
+		 * With superpageblocks, free pages live on per-SPB free
+		 * lists.  Check all SPBs for a suitable page.
+		 */
+		for (si =3D 0; ; si++) {
+			struct free_area *area;
+
+			if (nr_spb) {
+				if (si >=3D nr_spb)
+					break;
+				area =3D &cc->zone->superpageblocks[si].free_area[order];
+			} else {
+				if (si > 0)
+					break;
+				area =3D &cc->zone->free_area[order];
+			}
+
+			/* Job done if page is free of the right migratetype */
+			if (!free_area_empty(area, migratetype))
+				return COMPACT_SUCCESS;
=20
 #ifdef CONFIG_CMA
-		/* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
-		if (migratetype =3D=3D MIGRATE_MOVABLE &&
-			!free_area_empty(area, MIGRATE_CMA))
-			return COMPACT_SUCCESS;
+			/* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
+			if (migratetype =3D=3D MIGRATE_MOVABLE &&
+				!free_area_empty(area, MIGRATE_CMA))
+				return COMPACT_SUCCESS;
 #endif
-		/*
-		 * Job done if allocation would steal freepages from
-		 * other migratetype buddy lists.
-		 */
-		if (find_suitable_fallback(area, order, migratetype, true) >=3D 0)
 			/*
-			 * Movable pages are OK in any pageblock. If we are
-			 * stealing for a non-movable allocation, make sure
-			 * we finish compacting the current pageblock first
-			 * (which is assured by the above migrate_pfn align
-			 * check) so it is as free as possible and we won't
-			 * have to steal another one soon.
+			 * Job done if allocation would steal freepages from
+			 * other migratetype buddy lists.
 			 */
-			return COMPACT_SUCCESS;
+			if (find_suitable_fallback(area, order, migratetype,
+						   true) >=3D 0)
+				/*
+				 * Movable pages are OK in any pageblock. If we
+				 * are stealing for a non-movable allocation,
+				 * make sure we finish compacting the current
+				 * pageblock first (which is assured by the
+				 * above migrate_pfn align check) so it is as
+				 * free as possible and we won't have to steal
+				 * another one soon.
+				 */
+				return COMPACT_SUCCESS;
+		}
 	}
=20
 out:
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 530ddc73e90a..3c11c8c5ce6a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -8288,17 +8288,13 @@ static void evacuate_pageblock(struct zone *zone, u=
nsigned long start_pfn,
  * - Skip superpageblocks with no movable pages (nothing to evacuate)
  */
=20
-/* Target free space: 3 pageblocks worth of free pages */
-#define SPB_DEFRAG_FREE_PAGES_TARGET	(3UL * pageblock_nr_pages)
-
 /**
  * spb_needs_defrag - Check if a superpageblock needs defragmentation
  * @sb: superpageblock to check (may be NULL)
  *
- * Returns false for NULL, non-tainted, or clean superpageblocks.
- * A tainted superpageblock needs defrag if it has movable pages that can
- * be evacuated AND free space is running low (1 or fewer free
- * pageblocks, or less than 2 pageblocks worth of free pages).
+ * Defrag here is the per-SPB tainted-pool evacuation worker. Clean SPBs
+ * are handled by standard compaction (kcompactd), so this predicate
+ * returns false for them, as it does for a NULL @sb.
  */
 /*
  * Cooldown between defrag attempts that made no progress, in seconds.
@@ -8312,14 +8308,11 @@ static bool spb_needs_defrag(struct superpageblock =
*sb)
 	if (!sb)
 		return false;
=20
-	if (spb_get_category(sb) !=3D SB_TAINTED)
-		return false;
-
 	/*
 	 * Back off if the previous pass made no progress: do not retry until
 	 * either the cooldown elapses or free pages have grown by at least a
 	 * pageblock's worth (a hint that there might be new material to
-	 * consolidate or evacuate).
+	 * evacuate).
 	 */
 	if (sb->defrag_last_no_progress_jiffies &&
 	    time_before(jiffies, sb->defrag_last_no_progress_jiffies +
@@ -8330,21 +8323,24 @@ static bool spb_needs_defrag(struct superpageblock =
*sb)
=20
 	/*
 	 * Tainted superpageblocks: evacuate movable pages to concentrate
-	 * unmovable/reclaimable allocations.  Migration targets are
-	 * allocated system-wide, so no internal free space is needed.
-	 * Maintain the tainted reserve so unmovable claims always
-	 * find room in existing tainted superpageblocks.
+	 * unmovable/reclaimable allocations.  Maintain the tainted reserve
+	 * so unmovable claims always find room in existing tainted
+	 * superpageblocks.
 	 */
-	return sb->nr_movable > 0 &&
-	       sb->nr_free < SPB_TAINTED_RESERVE;
+	if (spb_get_category(sb) =3D=3D SB_TAINTED)
+		return sb->nr_movable > 0 &&
+		       sb->nr_free < SPB_TAINTED_RESERVE;
+
+	/* Clean SPBs: kcompactd handles consolidation; nothing to do here. */
+	return false;
 }
=20
 /**
- * spb_defrag_done - Check if defrag target has been reached
+ * spb_defrag_done - Check if defrag should stop
  * @sb: superpageblock being defragmented
  *
- * Stop defragmenting when the superpageblock has enough free space
- * or there are no more movable pages to evacuate.
+ * Only meaningful for tainted SPBs.  Clean SPBs never reach this from
+ * the SPB defrag worker (spb_needs_defrag returns false for them).
  */
 static bool spb_defrag_done(struct superpageblock *sb)
 {
@@ -8353,49 +8349,112 @@ static bool spb_defrag_done(struct superpageblock =
*sb)
 	 * the reserve of free pageblocks is restored, or until there
 	 * are no more movable pages to evacuate.
 	 */
-	return !sb->nr_movable ||
-	       sb->nr_free >=3D SPB_TAINTED_RESERVE;
+	if (spb_get_category(sb) =3D=3D SB_TAINTED)
+		return !sb->nr_movable ||
+		       sb->nr_free >=3D SPB_TAINTED_RESERVE;
+
+	/* Clean SPBs should not be handled here. */
+	return true;
+}
+
+static void spb_clear_skip_bits(struct superpageblock *sb)
+{
+	unsigned long pfn, end_pfn;
+	struct zone *zone =3D sb->zone;
+
+	end_pfn =3D sb->start_pfn + SUPERPAGEBLOCK_NR_PAGES;
+
+	for (pfn =3D sb->start_pfn; pfn < end_pfn; pfn +=3D pageblock_nr_pages) {
+		struct page *page;
+
+		if (!pfn_valid(pfn))
+			continue;
+		if (!zone_spans_pfn(zone, pfn))
+			continue;
+
+		page =3D pfn_to_page(pfn);
+		clear_pageblock_skip(page);
+	}
 }
=20
 /**
- * spb_defrag_superpageblock - evacuate movable pages from a tainted super=
pageblock
+ * spb_defrag_tainted - evacuate movable pages from a tainted superpageblo=
ck
  * @sb: the tainted superpageblock to defragment
  *
  * Find any pageblock with movable pages (PB_has_movable) and evacuate
  * them, leaving only unmovable, reclaimable, and free pages behind.
  * Stop when the free space target is reached.
  */
-static void spb_defrag_superpageblock(struct superpageblock *sb)
+static void spb_defrag_tainted(struct superpageblock *sb)
 {
-	unsigned long pfn, end_pfn;
+	unsigned long pfn, end_pfn, start_pfn, cursor;
 	struct zone *zone =3D sb->zone;
+	bool wrapped =3D false;
=20
 	if (!sb->nr_movable)
 		return;
=20
-	end_pfn =3D sb->start_pfn + SUPERPAGEBLOCK_NR_PAGES;
+	start_pfn =3D sb->start_pfn;
+	end_pfn =3D start_pfn + SUPERPAGEBLOCK_NR_PAGES;
=20
-	for (pfn =3D sb->start_pfn; pfn < end_pfn; pfn +=3D pageblock_nr_pages) {
+	cursor =3D sb->defrag_cursor;
+	if (cursor < start_pfn || cursor >=3D end_pfn) {
+		cursor =3D start_pfn;
+		spb_clear_skip_bits(sb);
+	}
+
+	pfn =3D cursor;
+
+	while (pfn < end_pfn) {
 		struct page *page;
=20
 		if (spb_defrag_done(sb))
-			return;
+			goto out;
=20
 		if (!pfn_valid(pfn))
-			continue;
+			goto next;
+
+		if (!zone_spans_pfn(zone, pfn))
+			goto next;
=20
 		page =3D pfn_to_page(pfn);
=20
-		/* Skip pageblocks without movable pages */
 		if (!get_pfnblock_bit(page, pfn, PB_has_movable))
-			continue;
+			goto next;
=20
-		/* Skip if fully free -- nothing to evacuate */
 		if (get_pfnblock_bit(page, pfn, PB_all_free))
-			continue;
+			goto next;
+
+		if (get_pageblock_skip(page))
+			goto next;
=20
 		evacuate_pageblock(zone, pfn, true);
+next:
+		pfn +=3D pageblock_nr_pages;
+		if (pfn >=3D end_pfn && !wrapped) {
+			spb_clear_skip_bits(sb);
+			pfn =3D start_pfn;
+			wrapped =3D true;
+		}
+		if (wrapped && pfn > cursor)
+			break;
 	}
+out:
+	sb->defrag_cursor =3D pfn;
+}
+
+/**
+ * spb_defrag_superpageblock - defragment a tainted superpageblock
+ * @sb: the superpageblock to defragment
+ *
+ * Tainted SPBs are evacuated by spb_defrag_tainted.  Clean SPBs are
+ * handled by standard compaction (kcompactd) and never reach this
+ * dispatcher (spb_needs_defrag returns false for them).
+ */
+static void spb_defrag_superpageblock(struct superpageblock *sb)
+{
+	if (spb_get_category(sb) =3D=3D SB_TAINTED)
+		spb_defrag_tainted(sb);
 }
=20
 static void spb_defrag_work_fn(struct work_struct *work)
@@ -8455,10 +8514,12 @@ static void spb_defrag_irq_work_fn(struct irq_work =
*work)
  * @sb: superpageblock whose counters just changed
  *
  * Called from counter update paths (under zone->lock). If the
- * superpageblock is tainted and running low on free space, schedule
- * irq_work to queue defrag work outside the allocator's lock context.
- * The irq_work handler is set up by pageblock_evacuate_init();
- * before that runs, defrag_irq_work.func is NULL and we skip.
+ * superpageblock needs defragmentation -- either evacuation of movable
+ * pages from a tainted superpageblock, or internal compaction of a
+ * clean superpageblock -- schedule irq_work to queue defrag work outside
+ * the allocator's lock context. The irq_work handler is set up by
+ * pageblock_evacuate_init(); before that runs, defrag_irq_work.func
+ * is NULL and we skip.
  */
 static void spb_maybe_start_defrag(struct superpageblock *sb)
 {
--=20
2.54.0
From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id D63E73F44EA
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:51 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289261; cv=none;
 b=s5G+MsNGQNS+vxpfuP18jHYv2fUX1YO56cd1i0sLkSTLR7GobbgY/Z1TysmaZ8Z3fUMmgGMQEnsugwVbbtK+fYPSTAmcL+Mu90dnHouhmaUdAAVIaqborHxe8hQlrJWfv6kBKNgA7UEQC7sg3bCaoG4NnXcoV09hIWQGN2Ze91k=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289261; c=relaxed/simple;
	bh=z5Usx0p1WxX4rrgW79WKs/rU1ndBW0cBAo3r+9EfiYY=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=HUquRNzh8VNefH/yLUt9MuVpZo+yjYO/WvQ7opJHyKoxl4w9YG+sPCkVZtKOg1Y5axf7HLT5gtAjhTWt+Vq0MkOg2LeNlzlmiyVwDTb1hkI0uHQcmWJbS8/HifXoJiPJ9WAVXQSOq4nUzppmNHDk4oTBVG9ETAaF1ufC7oACNlY=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=Mn4fAAwI; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="Mn4fAAwI"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:MIME-Version:References:In-Reply-To:
	Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-Type:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=oTAVpne4dz1fuDA8W/prd0eclefaTEcXKU8By+o+BtM=; b=Mn4fAAwIAh8xgfm/ll5osiycux
	VQrzGezEmAlD19+730RJd4EA++yroMkxIhd5KKvhYXEvyxpthOGyC8Ii2Bp1zOSzP5BFNplM91otp
	l6WFd6fJIb3HaqGin5SEWwADGn3fvJIu1mpde/joRSZCiQWl+GPFdLdTAf03BFbmi9mkH9pXkTdeh
	uUtTdIOV3pJbbAWgdQhSNGTFje6Epq3TSiOwaTjVniasxdY9PP7NcB4TaBAfKwgUOud0aUcilAFwv
	abcrM2aq+dAdjLrJeex4lXoCxaGk68oD1wLF6NEV6uWD5o+yTRwJ+svwDsy/F/hjRbhAD7DMmtOoo
	bOUXH7vg==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPM-0000000024Q-24o2;
	Wed, 20 May 2026 11:00:28 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 17/40] mm: page_alloc: superpageblock-aware contiguous and
 higher order allocation
Date: Wed, 20 May 2026 10:59:23 -0400
Message-ID: <20260520150018.2491267-18-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

Add superpageblock-aware contiguous page allocation that leverages SPB
metadata to first try candidate ranges in empty (all-free) and clean
superpageblocks, before falling back to scanning all memory with
alloc_contig_range(). The SPB metadata identifies exactly which 1GB regions
have only free pages, making CMA and large contiguous allocations more
targeted.

Track contiguous allocations in superpageblock metadata by marking fully-
covered SPBs with contig_allocated, moving them to the spb_isolated list so
they don't participate in allocation steering. The marking loop starts at
ALIGN(start, spb_pages) so that a range not starting on an SPB boundary
only marks the superpageblocks it fully covers.

Hook superpageblock-aware allocation into __alloc_pages_direct_compact()
for THP/mTHP and high-order unmovable/reclaimable allocations. For movable
allocations at pageblock_order or above, try spb_try_alloc_contig() first.
For unmovable/reclaimable, evacuate movable pages from tainted
superpageblocks to create buddy coalescing opportunities. Both paths fall
through to traditional compaction if the SPB approach fails.

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 include/linux/mmzone.h |   2 +
 mm/page_alloc.c        | 484 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 479 insertions(+), 7 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e7d760a689f9..a0124c170ac0 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1020,6 +1020,7 @@ struct superpageblock {
 	u16			nr_movable;
 	u16			nr_reserved;	/* holes, firmware, etc. */
 	u16			total_pageblocks; /* zone-clipped total */
+	bool			contig_allocated; /* all pages held by contig alloc */
=20
 	/* Total free pages across all per-superpageblock free lists */
 	unsigned long		nr_free_pages;
@@ -1107,6 +1108,7 @@ struct zone {
=20
 	/* Superpageblock fullness lists for allocation steering */
 	struct list_head	spb_empty;	/* completely free superpageblocks */
+	struct list_head	spb_isolated;	/* fully isolated (1GB contig alloc) */
 	struct list_head	spb_lists[__NR_SB_CATEGORIES][__NR_SB_FULLNESS];
=20
 	/* zone_start_pfn =3D=3D zone_start_paddr >> PAGE_SHIFT */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3c11c8c5ce6a..b6a07bd72c0b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -710,8 +710,30 @@ static inline enum sb_fullness sb_get_fullness(struct =
superpageblock *sb,
  */
 #ifdef CONFIG_COMPACTION
 static void spb_maybe_start_defrag(struct superpageblock *sb);
+static bool spb_needs_defrag(struct superpageblock *sb);
+static bool spb_evacuate_for_order(struct zone *zone, unsigned int order,
+				  int migratetype);
 #else
 static inline void spb_maybe_start_defrag(struct superpageblock *sb) {}
+static inline bool spb_needs_defrag(struct superpageblock *sb) { return fa=
lse; }
+static inline bool spb_evacuate_for_order(struct zone *zone, unsigned int =
order,
+					 int migratetype)
+{
+	return false;
+}
+#endif
+
+#ifdef CONFIG_CONTIG_ALLOC
+static struct page *spb_try_alloc_contig(struct zone *zone,
+					unsigned long nr_pages,
+					gfp_t gfp_mask);
+#else
+static inline struct page *spb_try_alloc_contig(struct zone *zone,
+					       unsigned long nr_pages,
+					       gfp_t gfp_mask)
+{
+	return NULL;
+}
 #endif
=20
 static void spb_update_list(struct superpageblock *sb)
@@ -722,6 +744,11 @@ static void spb_update_list(struct superpageblock *sb)
=20
 	list_del_init(&sb->list);
=20
+	if (sb->contig_allocated) {
+		list_add_tail(&sb->list, &zone->spb_isolated);
+		return;
+	}
+
 	if (sb->nr_free =3D=3D sb->total_pageblocks) {
 		list_add_tail(&sb->list, &zone->spb_empty);
 		return;
@@ -872,6 +899,45 @@ void __meminit init_pageblock_migratetype(struct page =
*page,
 	}
 }
=20
+#ifdef CONFIG_CONTIG_ALLOC
+/**
+ * superpageblock_contig_mark - Mark/unmark SPBs for contiguous allocation
+ * @start: start PFN of the contiguous range
+ * @end: end PFN (exclusive) of the contiguous range
+ * @allocated: true when allocated, false when freed
+ *
+ * Called after a successful contiguous allocation (or before freeing) to
+ * mark fully-covered superpageblocks as contig_allocated. This moves them
+ * to the spb_isolated list so they don't participate in allocation steeri=
ng,
+ * and makes them visible in debugfs.
+ */
+static void superpageblock_contig_mark(unsigned long start, unsigned long =
end,
+				       bool allocated)
+{
+	struct zone *zone =3D page_zone(pfn_to_page(start));
+	unsigned long spb_pages =3D SUPERPAGEBLOCK_NR_PAGES;
+	unsigned long pfn;
+	unsigned long flags;
+
+	/* Only track full-SPB contiguous allocations */
+	if (end - start < spb_pages)
+		return;
+
+	spin_lock_irqsave(&zone->lock, flags);
+	for (pfn =3D ALIGN(start, spb_pages); pfn + spb_pages <=3D end;
+	     pfn +=3D spb_pages) {
+		struct superpageblock *sb =3D pfn_to_superpageblock(zone, pfn);
+
+		if (!sb)
+			continue;
+
+		sb->contig_allocated =3D allocated;
+		spb_update_list(sb);
+	}
+	spin_unlock_irqrestore(&zone->lock, flags);
+}
+#endif /* CONFIG_CONTIG_ALLOC */
+
 #ifdef CONFIG_DEBUG_VM
 static int page_outside_zone_boundaries(struct zone *zone, struct page *pa=
ge)
 {
@@ -4311,6 +4377,17 @@ static void __free_frozen_pages(struct page *page, u=
nsigned int order,
=20
 void free_frozen_pages(struct page *page, unsigned int order)
 {
+#ifdef CONFIG_CONTIG_ALLOC
+	/*
+	 * If freeing a superpageblock-sized (or larger) range, clear the
+	 * contig_allocated flag so the SPB returns to normal allocation.
+	 */
+	if (order >=3D SUPERPAGEBLOCK_ORDER) {
+		unsigned long pfn =3D page_to_pfn(page);
+
+		superpageblock_contig_mark(pfn, pfn + (1UL << order), false);
+	}
+#endif
 	__free_frozen_pages(page, order, FPI_NONE);
 }
=20
@@ -5511,6 +5588,69 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigne=
d int order,
 	if (!order)
 		return NULL;
=20
+	/*
+	 * Superpageblock-aware contiguous allocation for movable high-order
+	 * allocations. Use superpageblock metadata to find clean ranges and
+	 * evacuate them via alloc_contig_frozen_range, bypassing the
+	 * blind compaction scanner entirely.
+	 */
+	if (order >=3D pageblock_order &&
+	    ac->migratetype =3D=3D MIGRATE_MOVABLE) {
+		struct zoneref *z;
+		struct zone *zone;
+
+		for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
+					       ac->highest_zoneidx,
+					       ac->nodemask) {
+			page =3D spb_try_alloc_contig(zone, 1UL << order,
+						   gfp_mask);
+			if (page) {
+				/*
+				 * spb_try_alloc_contig() returns a page that
+				 * has already been prepped by
+				 * alloc_contig_frozen_range_noprof() (either
+				 * via the __GFP_COMP head-prep branch or via
+				 * split_free_frozen_pages()->post_alloc_hook
+				 * for non-compound).  Do not prep again here:
+				 * a second prep_new_page() trips the new-page
+				 * sanity checks and double-applies KASAN tag
+				 * setup / init-on-alloc.
+				 */
+				*compact_result =3D COMPACT_SUCCESS;
+				count_vm_event(COMPACTSUCCESS);
+				return page;
+			}
+		}
+	}
+
+	/*
+	 * Superpageblock-aware targeted evacuation for unmovable/reclaimable
+	 * high-order allocations. Instead of blind compaction, find
+	 * pageblocks of the right migratetype in tainted superpageblocks
+	 * and evacuate their movable pages to create buddy coalescing
+	 * opportunities.
+	 */
+	if (ac->migratetype =3D=3D MIGRATE_UNMOVABLE ||
+	    ac->migratetype =3D=3D MIGRATE_RECLAIMABLE) {
+		struct zoneref *z;
+		struct zone *zone;
+
+		for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
+					       ac->highest_zoneidx,
+					       ac->nodemask) {
+			if (spb_evacuate_for_order(zone, order,
+						  ac->migratetype)) {
+				page =3D get_page_from_freelist(gfp_mask, order,
+							     alloc_flags, ac);
+				if (page) {
+					*compact_result =3D COMPACT_SUCCESS;
+					count_vm_event(COMPACTSUCCESS);
+					return page;
+				}
+			}
+		}
+	}
+
 	psi_memstall_enter(&pflags);
 	delayacct_compact_start();
 	noreclaim_flag =3D memalloc_noreclaim_save();
@@ -8692,9 +8832,16 @@ static void split_free_frozen_pages(struct list_head=
 *list, gfp_t gfp_mask)
 static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_ma=
sk)
 {
 	const gfp_t reclaim_mask =3D __GFP_IO | __GFP_FS | __GFP_RECLAIM;
+	/*
+	 * __GFP_NOMEMALLOC is allowed: GFP_TRANSHUGE sets it, and the SPB
+	 * fast path that calls into here (spb_try_alloc_contig) does not
+	 * invoke the rest of the contig-alloc reclaim/compaction machinery
+	 * that would otherwise consult memalloc reserves, so accepting the
+	 * flag is safe and lets THP allocations through.
+	 */
 	const gfp_t action_mask =3D __GFP_COMP | __GFP_RETRY_MAYFAIL | __GFP_NOWA=
RN |
 				  __GFP_ZERO | __GFP_ZEROTAGS | __GFP_SKIP_ZERO |
-				  __GFP_SKIP_KASAN;
+				  __GFP_SKIP_KASAN | __GFP_NOMEMALLOC;
 	const gfp_t cc_action_mask =3D __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
=20
 	/*
@@ -8889,6 +9036,8 @@ int alloc_contig_frozen_range_noprof(unsigned long st=
art, unsigned long end,
 	}
 done:
 	undo_isolate_page_range(start, end);
+	if (!ret)
+		superpageblock_contig_mark(start, end, true);
 	return ret;
 }
 EXPORT_SYMBOL(alloc_contig_frozen_range_noprof);
@@ -8983,6 +9132,287 @@ static bool zone_spans_last_pfn(const struct zone *=
zone,
 	return zone_spans_pfn(zone, last_pfn);
 }
=20
+/*
+ * Maximum superpageblock candidates to collect for contiguous allocation.
+ * Collected under zone->lock, then tried without it.
+ */
+#define SPB_CONTIG_MAX_CANDIDATES 4
+
+#ifdef CONFIG_COMPACTION
+/**
+ * sb_collect_contig_candidates - Find superpageblock ranges for contiguou=
s alloc
+ * @zone: zone to search (must hold zone->lock)
+ * @nr_pages: number of contiguous pages needed
+ * @pfns: output array of candidate start PFNs
+ * @max: maximum candidates to collect
+ *
+ * For superpageblock-sized (1GB) allocations:
+ *   1. Empty superpageblocks first -- no evacuation needed
+ *   2. Clean superpageblocks from almost-empty to full -- less evacuation=
 work
+ *
+ * For pageblock-sized (2MB+) sub-superpageblock allocations:
+ *   1. Clean superpageblocks from fullest to almost-empty -- pack allocat=
ions
+ *      to preserve empty superpageblocks for 1GB
+ *   2. Empty superpageblocks as last resort
+ *
+ * Returns number of candidates found.
+ */
+static int sb_collect_contig_candidates(struct zone *zone,
+					unsigned long nr_pages,
+					unsigned long *pfns, int max)
+{
+	struct superpageblock *sb;
+	int full, n =3D 0;
+
+	lockdep_assert_held(&zone->lock);
+
+	if (nr_pages >=3D SUPERPAGEBLOCK_NR_PAGES) {
+		/* 1GB+: empty superpageblocks first (no evacuation needed) */
+		list_for_each_entry(sb, &zone->spb_empty, list) {
+			if (sb->total_pageblocks < SUPERPAGEBLOCK_NR_PAGEBLOCKS)
+				continue;
+			pfns[n++] =3D sb->start_pfn;
+			if (n >=3D max)
+				return n;
+		}
+		/* Then clean superpageblocks, almost-empty first (less work) */
+		for (full =3D __NR_SB_FULLNESS - 1; full >=3D 0; full--) {
+			list_for_each_entry(sb,
+					    &zone->spb_lists[SB_CLEAN][full],
+					    list) {
+				if (sb->total_pageblocks <
+				    SUPERPAGEBLOCK_NR_PAGEBLOCKS)
+					continue;
+				pfns[n++] =3D sb->start_pfn;
+				if (n >=3D max)
+					return n;
+			}
+		}
+		return n;
+	}
+
+	/*
+	 * 2MB+ sub-superpageblock allocations.
+	 * Walk clean superpageblocks fullest-first -- pack allocations into
+	 * partial superpageblocks to preserve empty ones for 1GB use.
+	 * Pick one candidate per superpageblock for diversity.
+	 */
+	for (full =3D SB_FULL_75; full < __NR_SB_FULLNESS; full++) {
+		list_for_each_entry(sb, &zone->spb_lists[SB_CLEAN][full], list) {
+			unsigned long pfn, sb_end;
+
+			sb_end =3D sb->start_pfn +
+				(unsigned long)sb->total_pageblocks *
+				pageblock_nr_pages;
+			pfn =3D ALIGN(sb->start_pfn, nr_pages);
+
+			if (pfn + nr_pages <=3D sb_end) {
+				pfns[n++] =3D pfn;
+				if (n >=3D max)
+					return n;
+			}
+		}
+	}
+	/* Empty superpageblocks as last resort for 2MB */
+	list_for_each_entry(sb, &zone->spb_empty, list) {
+		unsigned long pfn =3D ALIGN(sb->start_pfn, nr_pages);
+		unsigned long sb_end =3D sb->start_pfn +
+			(unsigned long)sb->total_pageblocks *
+			pageblock_nr_pages;
+
+		if (pfn + nr_pages <=3D sb_end) {
+			pfns[n++] =3D pfn;
+			if (n >=3D max)
+				return n;
+		}
+	}
+	return n;
+}
+
+/**
+ * spb_try_alloc_contig - Superpageblock-aware contiguous page allocation
+ * @zone: zone to allocate from
+ * @nr_pages: number of contiguous pages needed (>=3D pageblock_nr_pages)
+ * @gfp_mask: GFP mask for allocation
+ *
+ * Use superpageblock metadata to quickly find suitable ranges for contigu=
ous
+ * allocation, avoiding the brute-force PFN scan. Each candidate is tried
+ * twice to handle transient failures (e.g., temporary page pins, racing
+ * allocations), then falls through to the next candidate.
+ *
+ * Returns: page pointer on success, NULL on failure.
+ */
+static struct page *spb_try_alloc_contig(struct zone *zone,
+					unsigned long nr_pages,
+					gfp_t gfp_mask)
+{
+	unsigned long pfns[SPB_CONTIG_MAX_CANDIDATES];
+	unsigned long flags;
+	int nr_candidates, i;
+
+	if (nr_pages < pageblock_nr_pages)
+		return NULL;
+
+	spin_lock_irqsave(&zone->lock, flags);
+	nr_candidates =3D sb_collect_contig_candidates(zone, nr_pages,
+						     pfns,
+						     SPB_CONTIG_MAX_CANDIDATES);
+	spin_unlock_irqrestore(&zone->lock, flags);
+
+	for (i =3D 0; i < nr_candidates; i++) {
+		int attempts;
+
+		for (attempts =3D 0; attempts < 2; attempts++) {
+			int ret;
+
+			ret =3D alloc_contig_frozen_range_noprof(pfns[i],
+					pfns[i] + nr_pages,
+					ACR_FLAGS_NONE, gfp_mask);
+			if (!ret)
+				return pfn_to_page(pfns[i]);
+			/*
+			 * -EINVAL is a permanent gfp_mask incompatibility,
+			 * not a transient race; retrying is wasted lock-
+			 * acquire churn and would also fail on every other
+			 * candidate.  Give up immediately.
+			 */
+			if (ret =3D=3D -EINVAL)
+				return NULL;
+		}
+
+		/*
+		 * Failed on this candidate -- rotate its superpageblock to the
+		 * tail of its list so the next call tries fresh candidates.
+		 */
+		spin_lock_irqsave(&zone->lock, flags);
+		{
+			struct superpageblock *sb =3D
+				pfn_to_superpageblock(zone, pfns[i]);
+			if (sb) {
+				struct list_head *head;
+
+				if (sb->nr_free =3D=3D sb->total_pageblocks)
+					head =3D &zone->spb_empty;
+				else
+					head =3D &zone->spb_lists
+						[spb_get_category(sb)]
+						[sb_get_fullness(sb, spb_get_category(sb))];
+				list_move_tail(&sb->list, head);
+			}
+		}
+		spin_unlock_irqrestore(&zone->lock, flags);
+	}
+	return NULL;
+}
+
+/**
+ * sb_collect_evacuate_candidates - Find pageblocks for targeted evacuation
+ * @zone: zone to search (must hold zone->lock)
+ * @migratetype: desired migratetype (MIGRATE_UNMOVABLE or MIGRATE_RECLAIM=
ABLE)
+ * @sb_pfns: output array of tainted superpageblock start PFNs
+ * @max: maximum candidates to collect
+ *
+ * Find tainted superpageblocks containing pageblocks of the desired migra=
tetype
+ * that also have movable pages to evacuate. Evacuating movable pages from
+ * these pageblocks creates buddy coalescing opportunities for high-order
+ * allocations of the desired migratetype.
+ *
+ * Returns number of candidate superpageblock PFNs found.
+ */
+static int sb_collect_evacuate_candidates(struct zone *zone, int migratety=
pe,
+					  unsigned long *sb_pfns, int max)
+{
+	struct superpageblock *sb;
+	int full, n =3D 0;
+
+	lockdep_assert_held(&zone->lock);
+
+	for (full =3D 0; full < __NR_SB_FULLNESS; full++) {
+		list_for_each_entry(sb, &zone->spb_lists[SB_TAINTED][full],
+				    list) {
+			bool has_matching;
+
+			if (!sb->nr_movable)
+				continue;
+
+			if (migratetype =3D=3D MIGRATE_UNMOVABLE)
+				has_matching =3D sb->nr_unmovable > 0;
+			else if (migratetype =3D=3D MIGRATE_RECLAIMABLE)
+				has_matching =3D sb->nr_reclaimable > 0;
+			else
+				continue;
+
+			if (!has_matching)
+				continue;
+
+			sb_pfns[n++] =3D sb->start_pfn;
+			if (n >=3D max)
+				return n;
+		}
+	}
+	return n;
+}
+
+/**
+ * spb_evacuate_for_order - Targeted evacuation of movable pages from
+ *                         unmovable/reclaimable pageblocks
+ * @zone: zone to work on
+ * @order: allocation order that failed
+ * @migratetype: desired migratetype (MIGRATE_UNMOVABLE or MIGRATE_RECLAIM=
ABLE)
+ *
+ * Instead of blind compaction, use superpageblock metadata to find pagebl=
ocks
+ * of the right migratetype in tainted superpageblocks and evacuate their
+ * movable pages. This creates buddy coalescing opportunities within
+ * the pageblock, enabling higher-order allocations.
+ *
+ * Returns true if evacuation was performed (caller should retry allocatio=
n).
+ */
+static bool spb_evacuate_for_order(struct zone *zone, unsigned int order,
+				  int migratetype)
+{
+	unsigned long sb_pfns[SPB_CONTIG_MAX_CANDIDATES];
+	unsigned long flags;
+	int nr_sbs, i;
+	bool did_evacuate =3D false;
+
+	spin_lock_irqsave(&zone->lock, flags);
+	nr_sbs =3D sb_collect_evacuate_candidates(zone, migratetype,
+						sb_pfns,
+						SPB_CONTIG_MAX_CANDIDATES);
+	spin_unlock_irqrestore(&zone->lock, flags);
+
+	for (i =3D 0; i < nr_sbs && !did_evacuate; i++) {
+		unsigned long pfn, end_pfn;
+
+		end_pfn =3D sb_pfns[i] + SUPERPAGEBLOCK_NR_PAGES;
+		for (pfn =3D sb_pfns[i]; pfn < end_pfn;
+		     pfn +=3D pageblock_nr_pages) {
+			struct page *page;
+
+			if (!pfn_valid(pfn))
+				continue;
+
+			/* Superpageblocks can straddle zone boundaries. */
+			if (!zone_spans_pfn(zone, pfn))
+				continue;
+
+			page =3D pfn_to_page(pfn);
+
+			if (get_pfnblock_migratetype(page, pfn) !=3D migratetype)
+				continue;
+
+			if (!get_pfnblock_bit(page, pfn, PB_has_movable))
+				continue;
+
+			evacuate_pageblock(zone, pfn, true);
+			did_evacuate =3D true;
+			break;
+		}
+	}
+	return did_evacuate;
+}
+#endif /* CONFIG_COMPACTION */
+
 /**
  * alloc_contig_frozen_pages() -- tries to find and allocate contiguous ra=
nge of frozen pages
  * @nr_pages:	Number of contiguous pages to allocate
@@ -9016,9 +9446,29 @@ struct page *alloc_contig_frozen_pages_noprof(unsign=
ed long nr_pages,
 	struct zonelist *zonelist;
 	struct zone *zone;
 	struct zoneref *z;
+	struct page *page;
 	bool skip_hugetlb =3D true;
 	bool skipped_hugetlb =3D false;
=20
+	/*
+	 * First pass: superpageblock-aware search. Use superpageblock metadata
+	 * to quickly find suitable ranges, avoiding the brute-force PFN
+	 * scan. For 1GB allocations this walks spb_empty then
+	 * spb_lists[SB_CLEAN]; for 2MB+ it finds evacuatable pageblocks
+	 * in clean superpageblocks.
+	 */
+	if (nr_pages >=3D pageblock_nr_pages) {
+		zonelist =3D node_zonelist(nid, gfp_mask);
+		for_each_zone_zonelist_nodemask(zone, z, zonelist,
+					       gfp_zone(gfp_mask), nodemask) {
+			page =3D spb_try_alloc_contig(zone, nr_pages, gfp_mask);
+			if (page)
+				return page;
+		}
+	}
+
+	/* Second pass: brute-force PFN scan (existing fallback) */
+
 retry:
 	zonelist =3D node_zonelist(nid, gfp_mask);
 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
@@ -9113,6 +9563,8 @@ void free_contig_frozen_range(unsigned long pfn, unsi=
gned long nr_pages)
 	if (WARN_ON_ONCE(first_page !=3D compound_head(first_page)))
 		return;
=20
+	superpageblock_contig_mark(pfn, pfn + nr_pages, false);
+
 	if (PageHead(first_page)) {
 		WARN_ON_ONCE(order !=3D compound_order(first_page));
 		free_frozen_pages(first_page, order);
@@ -9132,9 +9584,13 @@ EXPORT_SYMBOL(free_contig_frozen_range);
  */
 void free_contig_range(unsigned long pfn, unsigned long nr_pages)
 {
+	unsigned long end =3D pfn + nr_pages;
+
 	if (WARN_ON_ONCE(PageHead(pfn_to_page(pfn))))
 		return;
=20
+	superpageblock_contig_mark(pfn, end, false);
+
 	for (; nr_pages--; pfn++)
 		__free_page(pfn_to_page(pfn));
 }
@@ -9677,6 +10133,15 @@ static int superpageblock_debugfs_show(struct seq_f=
ile *m, void *v)
 		if (empty_count)
 			seq_printf(m, "  empty: %d\n", empty_count);
=20
+		{
+			int isolated_count =3D 0;
+
+			list_for_each_entry(sb, &zone->spb_isolated, list)
+				isolated_count++;
+			if (isolated_count)
+				seq_printf(m, "  contig_alloc: %d\n", isolated_count);
+		}
+
 		for (cat =3D 0; cat < __NR_SB_CATEGORIES; cat++) {
 			for (full =3D 0; full < __NR_SB_FULLNESS; full++) {
 				int count =3D 0;
@@ -9695,12 +10160,17 @@ static int superpageblock_debugfs_show(struct seq_=
file *m, void *v)
 		/* Per-superpageblock detail */
 		for (i =3D 0; i < zone->nr_superpageblocks; i++) {
 			sb =3D &zone->superpageblocks[i];
-			seq_printf(m, "  sb[%lu] pfn=3D0x%lx: unmov=3D%u recl=3D%u mov=3D%u rsv=
=3D%u free=3D%u total=3D%u free_pages=3D%lu\n",
-				   i, sb->start_pfn,
-				   sb->nr_unmovable, sb->nr_reclaimable,
-				   sb->nr_movable, sb->nr_reserved,
-				   sb->nr_free, sb->total_pageblocks,
-				   sb->nr_free_pages);
+			if (sb->contig_allocated)
+				seq_printf(m, "  sb[%lu] pfn=3D0x%lx: contig_allocated total=3D%u\n",
+					   i, sb->start_pfn,
+					   sb->total_pageblocks);
+			else
+				seq_printf(m, "  sb[%lu] pfn=3D0x%lx: unmov=3D%u recl=3D%u mov=3D%u rs=
v=3D%u free=3D%u total=3D%u free_pages=3D%lu\n",
+					   i, sb->start_pfn,
+					   sb->nr_unmovable, sb->nr_reclaimable,
+					   sb->nr_movable, sb->nr_reserved,
+					   sb->nr_free, sb->total_pageblocks,
+					   sb->nr_free_pages);
 		}
 	}
 	return 0;
--=20
2.54.0
From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id D50153F0777
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:41 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289249; cv=none;
 b=VXAxrNCs9pgBYIQUZbgBDSWqwf7hcoQWenOxUrwgl9W8kTx/HvfL0j/NajhyztT50/c1k8XdESaj9QSO0RpYx0iNBXuF8eCGkkaZ4PxDGrtlYc2EZF0aOa7Ic0p+nY60+SfGTxdxBVXfOXGAsj5hVzrJC9k7Wb3d85Oeg+Ae0iU=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289249; c=relaxed/simple;
	bh=yw+fKVRNY2L6bTmB1ETuUvtiFWbVxAY0sZ7tYpTK6iU=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=mPXiHP/XjJpPk+NCG8pMVSluBwlxw8AbzNE3njauYcuQeHM6tAZYaXEPmbPil+pAObSIGqE4B9eFDnM0y+MVKk0xFaFE3wsEI29Ag77Jk68YHVxcogQ9ffdtsxBXAQ07HwaWL16rFDEOHI+jYK1zlPr0QdLsR2borh6X1uUYNYI=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=GSdjivBQ; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="GSdjivBQ"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:MIME-Version:References:In-Reply-To:
	Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-Type:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=oR+Qcf2D3se1DFkmwZrR9GIoGiGN/MvwrIeUwpMAxpY=; b=GSdjivBQyu9RnwdzSA7o+upiVM
	7ULQxOYtTYARmik6vKgdaNpz1WP4vMNeKWyrgp8OSTWOiCp/hdgH9CHTRrCIFkK9cckQGw/X0oJDd
	Zg2L44EERWJyw2E8DYPhhqvKTSk6zH2qBh07ayFzABtjCTsxp30w4ThUoiFFylNbwnWRqKFjBvI/1
	36h27iLInG1+0UmkzxvQtfYOhE356aAuAS6EsD8pvP6W8jeWz9xC+q6fjH2Ij0Hnk112ZxmaVSc3t
	voNmuqqSe+LlPljZysRA2BznlLjE57GFWqYVMOY06+J+IhBkYpTpYn9uKbhaooHB5lOJspVC12Mjc
	ck2VGXhw==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPM-0000000024Q-2CqZ;
	Wed, 20 May 2026 11:00:28 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 18/40] mm: page_alloc: prevent atomic allocations from
 tainting clean SPBs
Date: Wed, 20 May 2026 10:59:24 -0400
Message-ID: <20260520150018.2491267-19-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

Non-DIRECT_RECLAIM (atomic) allocations that fail with
ALLOC_NOFRAGMENT previously dropped the flag entirely and retried,
allowing them to taint clean superpageblocks.  This was the primary
source of taint spreading observed on production systems.

Stage the relaxation in three steps that keep atomic allocations
inside tainted SPBs as long as possible:

1. Extend Pass 2 in __rmqueue_smallest with a sub-pageblock phase
   (Pass 2b).  Pass 2 only finds whole free pageblocks (>=3D pageblock
   order) in tainted SPBs.  Pass 2b searches for sub-pageblock-order
   free blocks and uses try_to_claim_block() to claim a pageblock
   that has enough compatible pages.  This finds pages in tainted
   SPBs that have fragmented free space but no whole free pageblocks.

2. Add an ALLOC_NOFRAG_TAINTED_OK intermediate flag.  Instead of
   going directly from ALLOC_NOFRAGMENT to no protection, atomic
   allocations first retry with ALLOC_NOFRAG_TAINTED_OK, which
   allows __rmqueue_steal to search tainted SPBs only.  Clean and
   empty SPBs remain protected.  Only if steal from tainted SPBs
   also fails is ALLOC_NOFRAGMENT fully dropped as a last resort.

3. Bypass the pageblock compatibility threshold inside
   try_to_claim_block() when the call originates from the
   tainted-SPB walk in Pass 2b.  The
   free_pages + alike_pages >=3D 1 << (pageblock_order - 1) gate was
   designed to prevent the cross-fragment-fallback path from
   spreading mixing into clean SPBs; inside an already-tainted SPB
   the fragmentation has already been accepted, and the threshold
   rejects the typical fragmented-MOVABLE-pageblock case Pass 2b is
   meant to reclaim.  Without the bypass Pass 2b would be largely a
   no-op.

For callers that pass __GFP_NORETRY, the relaxation sequence is
wrong in principle.  The NORETRY contract is "I have a fallback;
don't go to extreme lengths."  Network skb_page_frag_refill, slab
high-order allocations, and similar hot-path callers use NORETRY
exactly so the allocator can return NULL and let their own fallback
(smaller frag, lower-order slab, etc.) take over.  Tainting a clean
superpageblock to satisfy such a request is a lasting cost -- the
SPB stays tainted for the remainder of the workload's lifetime,
blocking 1 GiB hugepage allocation from that region -- that
outlives the single allocation that triggered it.  Skip the
relaxation steps for NORETRY callers and return NULL immediately;
their fallback path absorbs the failure cleanly.

Observed on a 250 GB system running the superpageblock series:
an atomic order-3 alloc from swapper context (PCP refill,
gfp=3D0x152820 =3D __GFP_HIGH | __GFP_KSWAPD_RECLAIM | __GFP_NOWARN |
__GFP_NORETRY | __GFP_COMP | __GFP_HARDWALL) tainted a fresh clean
SPB at boot+~90 min despite ALLOC_NOFRAGMENT being set, because
the atomic-retry path stripped the flag.  The caller had a NORETRY
fallback ready; the taint was gratuitous.

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 mm/internal.h   |   1 +
 mm/page_alloc.c | 120 +++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 110 insertions(+), 11 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index c0dbc2e4b7f0..e6d61dbc18d9 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1511,6 +1511,7 @@ unsigned int reclaim_clean_pages_from_list(struct zon=
e *zone,
 #define ALLOC_HIGHATOMIC	0x200 /* Allows access to MIGRATE_HIGHATOMIC */
 #define ALLOC_TRYLOCK		0x400 /* Only use spin_trylock in allocation path */
 #define ALLOC_KSWAPD		0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAI=
M set */
+#define ALLOC_NOFRAG_TAINTED_OK	0x1000 /* NOFRAGMENT, but allow steal from=
 tainted SPBs */
=20
 /* Flags that allow allocations below the min watermark. */
 #define ALLOC_RESERVES (ALLOC_NON_BLOCK|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC=
|ALLOC_OOM)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b6a07bd72c0b..6884f638a97c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2729,6 +2729,10 @@ static struct page *__rmqueue_from_sb(struct zone *z=
one, unsigned int order,
  */
 static struct page *claim_whole_block(struct zone *zone, struct page *page,
 		  int current_order, int order, int new_type, int old_type);
+static struct page *try_to_claim_block(struct zone *zone, struct page *pag=
e,
+		  int current_order, int order, int start_type,
+		  int block_type, unsigned int alloc_flags,
+		  bool from_tainted_spb);
=20
 static __always_inline
 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
@@ -2798,6 +2802,11 @@ struct page *__rmqueue_smallest(struct zone *zone, u=
nsigned int order,
 	 * free list (reset by mark_pageblock_free), so the search above
 	 * misses them. Claim them inline to keep non-movable allocations
 	 * concentrated in already-tainted superpageblocks.
+	 *
+	 * Try whole pageblock orders first (preferred for PCP buddy optimization=
),
+	 * then fall back to sub-pageblock orders. Sub-pageblock claiming uses
+	 * try_to_claim_block which checks whether the pageblock has enough
+	 * compatible pages to justify claiming it.
 	 */
 	if (!movable && !is_migrate_cma(migratetype)) {
 		for (full =3D SB_FULL; full < __NR_SB_FULLNESS; full++) {
@@ -2830,6 +2839,43 @@ struct page *__rmqueue_smallest(struct zone *zone, u=
nsigned int order,
 				}
 			}
 		}
+		/* Pass 2b: sub-pageblock orders in tainted SPBs */
+		for (full =3D SB_FULL; full < __NR_SB_FULLNESS; full++) {
+			list_for_each_entry(sb,
+				&zone->spb_lists[SB_TAINTED][full], list) {
+				int co;
+
+				if (!sb->nr_free_pages)
+					continue;
+				for (co =3D min_t(int, pageblock_order - 1,
+						NR_PAGE_ORDERS - 1);
+				     co >=3D (int)order;
+				     --co) {
+					current_order =3D co;
+					area =3D &sb->free_area[current_order];
+					page =3D get_page_from_free_area(
+						area, MIGRATE_MOVABLE);
+					if (!page)
+						continue;
+					if (get_pageblock_isolate(page))
+						continue;
+					if (is_migrate_cma(
+					    get_pageblock_migratetype(page)))
+						continue;
+					page =3D try_to_claim_block(zone, page,
+						current_order, order,
+						migratetype, MIGRATE_MOVABLE,
+						0, true);
+					if (!page)
+						continue;
+					trace_mm_page_alloc_zone_locked(
+						page, order, migratetype,
+						pcp_allowed_order(order) &&
+						migratetype < MIGRATE_PCPTYPES);
+					return page;
+				}
+			}
+		}
 	}
=20
 	/* Empty superpageblocks: try before falling back to non-preferred catego=
ry */
@@ -3298,11 +3344,17 @@ claim_whole_block(struct zone *zone, struct page *p=
age,
  * not, we check the pageblock for constituent pages; if at least half of =
the
  * pages are free or compatible, we can still claim the whole block, so pa=
ges
  * freed in the future will be put on the correct free list.
+ *
+ * @from_tainted_spb: caller has already verified the block lives in a tai=
nted
+ * superpageblock, where SPB-level fragmentation has already been accepted.
+ * Skip the per-pageblock compatibility threshold so we can absorb non-mov=
able
+ * demand into the existing tainted SPB instead of tainting a fresh clean =
one.
  */
 static struct page *
 try_to_claim_block(struct zone *zone, struct page *page,
 		   int current_order, int order, int start_type,
-		   int block_type, unsigned int alloc_flags)
+		   int block_type, unsigned int alloc_flags,
+		   bool from_tainted_spb)
 {
 	int free_pages, movable_pages, alike_pages;
 	unsigned long start_pfn;
@@ -3362,8 +3414,14 @@ try_to_claim_block(struct zone *zone, struct page *p=
age,
 	/*
 	 * If a sufficient number of pages in the block are either free or of
 	 * compatible migratability as our allocation, claim the whole block.
-	 */
-	if (free_pages + alike_pages >=3D (1 << (pageblock_order-1)) ||
+	 * The compatibility threshold protects clean MOVABLE pageblocks from
+	 * being relabeled when most of their pages are still in-use movable
+	 * allocations. Inside a tainted SPB the protection is unnecessary:
+	 * fragmentation has already been accepted at the SPB level, and
+	 * relabeling is much cheaper than tainting a fresh clean SPB.
+	 */
+	if (from_tainted_spb ||
+	    free_pages + alike_pages >=3D (1 << (pageblock_order-1)) ||
 			page_group_by_mobility_disabled) {
 		__move_freepages_block(zone, start_pfn, block_type, start_type);
 		set_pageblock_migratetype(pfn_to_page(start_pfn), start_type);
@@ -3565,7 +3623,8 @@ __rmqueue_claim(struct zone *zone, int order, int sta=
rt_migratetype,
=20
 			page =3D try_to_claim_block(zone, page, current_order,
 						  order, start_migratetype,
-						  fallback_mt, alloc_flags);
+						  fallback_mt, alloc_flags,
+						  false);
 			if (page) {
 				trace_mm_page_alloc_extfrag(page, order,
 					current_order, start_migratetype,
@@ -3583,12 +3642,23 @@ __rmqueue_claim(struct zone *zone, int order, int s=
tart_migratetype,
  * the block as its current migratetype, potentially causing fragmentation.
  */
 static __always_inline struct page *
-__rmqueue_steal(struct zone *zone, int order, int start_migratetype)
+__rmqueue_steal(struct zone *zone, int order, int start_migratetype,
+		unsigned int alloc_flags)
 {
 	struct superpageblock *sb;
 	int current_order;
 	struct page *page;
 	int fallback_mt;
+	unsigned int search_cats;
+
+	/*
+	 * When ALLOC_NOFRAG_TAINTED_OK is set, only steal from tainted
+	 * SPBs to avoid tainting clean ones. Otherwise search all categories.
+	 */
+	if (alloc_flags & ALLOC_NOFRAG_TAINTED_OK)
+		search_cats =3D SB_SEARCH_PREFERRED;
+	else
+		search_cats =3D SB_SEARCH_PREFERRED | SB_SEARCH_FALLBACK;
=20
 	/*
 	 * Search per-superpageblock free lists for fallback migratetypes.
@@ -3598,7 +3668,7 @@ __rmqueue_steal(struct zone *zone, int order, int sta=
rt_migratetype)
 		page =3D __rmqueue_sb_find_fallback(zone, current_order,
 					start_migratetype,
 					&fallback_mt,
-					SB_SEARCH_PREFERRED | SB_SEARCH_FALLBACK);
+					search_cats);
=20
 		if (!page)
 			continue;
@@ -3698,8 +3768,10 @@ __rmqueue(struct zone *zone, unsigned int order, int=
 migratetype,
 		}
 		fallthrough;
 	case RMQUEUE_STEAL:
-		if (!(alloc_flags & ALLOC_NOFRAGMENT)) {
-			page =3D __rmqueue_steal(zone, order, migratetype);
+		if (!(alloc_flags & ALLOC_NOFRAGMENT) ||
+		    (alloc_flags & ALLOC_NOFRAG_TAINTED_OK)) {
+			page =3D __rmqueue_steal(zone, order, migratetype,
+					       alloc_flags);
 			if (page) {
 				*mode =3D RMQUEUE_STEAL;
 				return page;
@@ -5408,9 +5480,35 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int =
order, int alloc_flags,
 	/*
 	 * It's possible on a UMA machine to get through all zones that are
 	 * fragmented. If avoiding fragmentation, reset and try again.
-	 */
-	if (no_fallback && !defrag_mode) {
-		alloc_flags &=3D ~ALLOC_NOFRAGMENT;
+	 *
+	 * For allocations that can do direct reclaim, keep NOFRAGMENT set
+	 * and let the slowpath try reclaim and compaction to free pages in
+	 * already-tainted superpageblocks before allowing clean SPBs to be
+	 * tainted.
+	 *
+	 * Atomic allocations cannot reclaim, but try an intermediate step
+	 * first: allow steal/claim from tainted SPBs only. This avoids
+	 * tainting clean SPBs while still finding pages in tainted ones.
+	 * Only drop NOFRAGMENT entirely if that also fails.
+	 *
+	 * Exception: callers that explicitly opted into failure with
+	 * __GFP_NORETRY have a fallback path of their own (a smaller
+	 * order, a different cache, returning NULL from a best-effort
+	 * cache refill, etc.). Tainting a clean superpageblock is a
+	 * lasting cost that outlives this allocation; it is not justified
+	 * to absorb it just to satisfy a caller that already has a
+	 * cheaper escape hatch. Return NULL and let the caller's fallback
+	 * run instead.
+	 */
+	if (no_fallback && !defrag_mode &&
+	    !(gfp_mask & __GFP_DIRECT_RECLAIM)) {
+		if (gfp_mask & __GFP_NORETRY)
+			return NULL;
+		if (!(alloc_flags & ALLOC_NOFRAG_TAINTED_OK)) {
+			alloc_flags |=3D ALLOC_NOFRAG_TAINTED_OK;
+			goto retry;
+		}
+		alloc_flags &=3D ~(ALLOC_NOFRAGMENT | ALLOC_NOFRAG_TAINTED_OK);
 		goto retry;
 	}
=20
--=20
2.54.0
From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id AB9A33F54D4
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:54 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289265; cv=none;
 b=rP/SYq2B7U6KoEh4H6BFXUKTySfbOpaKqrFmJJIhSHJYb1FrHJFXb+mDJ8EoJ7du/yeDPmxjqQvw2yJC5Qp8Vssn/cpa86LXcOaWvbdXX4CJ5mD7Y97yD8fzEPkIPQZo2TG6gcxyy+0uG8v859/QRBmLGSNw9Td3TFSirDw4EH8=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289265; c=relaxed/simple;
	bh=Ee4lGK5d3tleJz8cvo/Fvzc3W6r/3Eyb+mPVDQFlJIM=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=MMdlOC1bYVaLmSwbQ/Txo6agkXkW+Gfw6S0ePh/ms2g8s8vQvNw2mR8+K+7j2DkMQxAHx5GCXGfIpeKlcYmxUwDMhkXcCDuILR6zpNKQiQhcP3vlCIsWMpBRJCCoTJdtACKaawzQnfZwSNle7iQH439rwY6CxvVNwyJzHMrp5GM=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=XDjS0uf/; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="XDjS0uf/"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:MIME-Version:References:In-Reply-To:
	Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-Type:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=gQXjUbTM4iHYcDcs5d+InjFN6hA+vO3mGOWCUeiALP4=; b=XDjS0uf/xvBIXecJpX3z5Qn4T/
	3UpZRI8V5GmDC1Ef4EgttMNcKxUNwk6+1FzzDy+6ll6srrwu4uq76bXWpjHJWpt2H5EKZwJX2Ljog
	/y2bNYBTWJUPkUXzdMTiH4ORK0v+ij64xY0JL+lXm9vSb5zWkxVv3euYlzrCt6CR6n4NvSOYREOZg
	k3RJFUxJeVXzb3Kn7yfhX4hE+ITtiKNx0DQNI2+fbs/xOkFpY1CtDwYq24mizxBGhnCCvQcoy16cU
	9G5k3Xkuqqcho51ZXq9LMnApRD/9ZCsoOBT1G2y35QYt1Zei5Qapq0H9O3QyG0Lvnn2/N+L2YRkJ+
	4mxVt48g==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPM-0000000024Q-2Is2;
	Wed, 20 May 2026 11:00:28 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 19/40] mm: page_alloc: aggressively pack non-movable
 allocs in tainted SPBs on large systems
Date: Wed, 20 May 2026 10:59:25 -0400
Message-ID: <20260520150018.2491267-20-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

On systems with many superpageblocks, sub-pageblock MOVABLE fragments
within already-tainted SPBs were being skipped by __rmqueue_claim()
due to the ALLOC_NOFRAGMENT pageblock_order floor. This caused the
allocator to fall through to clean SPBs, tainting them unnecessarily.

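Introduce SPB_AGGRESSIVE_THRESHOLD: on systems with many
superpageblocks (more than 8 in the zone being allocated from), relax
the min_order floor for the preferred category (tainted SPBs) so
non-movable allocations consume free space there at any granularity.
On small systems, preserve the pageblock_order floor to protect
MOVABLE capacity within tainted SPBs.

Also, when ALLOC_NOFRAGMENT is set, __rmqueue_claim() now skips the
non-preferred superpageblock categories entirely, and rmqueue_bulk()
forces ALLOC_NOFRAGMENT for its whole-pageblock refill of non-movable,
non-CMA migratetypes.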

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 mm/page_alloc.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 68 insertions(+), 2 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6884f638a97c..63151e99bd53 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2659,6 +2659,24 @@ static void prep_new_page(struct page *page, unsigne=
d int order, gfp_t gfp_flags
  */
 #define SPB_TAINTED_RESERVE	4
=20
+/*
+ * On systems with many superpageblocks, we can afford to "write off"
+ * tainted superpageblocks by aggressively packing unmovable/reclaimable
+ * allocations into them -- even sub-pageblock fragments -- to keep clean
+ * superpageblocks clean for future 1GB hugepage and contiguous allocation=
s.
+ *
+ * On small systems (few superpageblocks), each SPB represents a large
+ * fraction of total memory. Aggressively claiming sub-pageblock movable
+ * fragments from tainted SPBs would destroy MOVABLE capacity that the
+ * system can't afford to lose, with little benefit since there are too
+ * few SPBs to meaningfully separate movable from unmovable anyway.
+ *
+ * This threshold controls the crossover: above it, prefer concentrating
+ * non-movable allocations in tainted SPBs at any granularity; below it,
+ * only claim whole free pageblocks from tainted SPBs.
+ */
+#define SPB_AGGRESSIVE_THRESHOLD	8
+
 /**
  * sb_preferred_for_movable - Find the fullest clean superpageblock for mo=
vable
  * @zone: zone to search
@@ -3585,6 +3603,7 @@ __rmqueue_claim(struct zone *zone, int order, int sta=
rt_migratetype,
 {
 	int current_order;
 	int min_order =3D order;
+	int nofrag_min_order =3D order;
 	struct page *page;
 	int fallback_mt;
 	static const unsigned int cat_search[] =3D {
@@ -3598,9 +3617,18 @@ __rmqueue_claim(struct zone *zone, int order, int st=
art_migratetype,
 	 * Do not steal pages from freelists belonging to other pageblocks
 	 * i.e. orders < pageblock_order. If there are no local zones free,
 	 * the zonelists will be reiterated without ALLOC_NOFRAGMENT.
+	 *
+	 * Only apply this restriction to empty and clean superpageblocks.
+	 * Claiming within already-tainted superpageblocks does not cause
+	 * new fragmentation, and skipping them wastes free space that
+	 * could prevent tainting clean superpageblocks.
+	 *
+	 * When ALLOC_NOFRAGMENT is set, skip empty and clean superpageblocks
+	 * entirely to avoid tainting them. The slowpath will try reclaim and
+	 * compaction first, and only drop ALLOC_NOFRAGMENT as a last resort.
 	 */
 	if (order < pageblock_order && alloc_flags & ALLOC_NOFRAGMENT)
-		min_order =3D pageblock_order;
+		nofrag_min_order =3D pageblock_order;
=20
 	/*
 	 * Find the largest available free page in a fallback migratetype.
@@ -3610,6 +3638,31 @@ __rmqueue_claim(struct zone *zone, int order, int st=
art_migratetype,
 	 * ones.
 	 */
 	for (c =3D 0; c < ARRAY_SIZE(cat_search); c++) {
+		/*
+		 * When avoiding fragmentation, do not search clean/empty
+		 * superpageblocks for fallback pages. Tainting a clean SPB
+		 * is the worst outcome -- better to fail and let the slowpath
+		 * try reclaim and compaction in already-tainted SPBs first.
+		 */
+		if ((alloc_flags & ALLOC_NOFRAGMENT) &&
+		    cat_search[c] !=3D SB_SEARCH_PREFERRED)
+			continue;
+
+		/*
+		 * For the preferred category (tainted SPBs for non-movable),
+		 * search all orders down to the allocation order on systems
+		 * with enough superpageblocks that we can afford to write off
+		 * tainted ones. These SPBs are already tainted, so sub-pageblock
+		 * stealing doesn't cause additional fragmentation.
+		 *
+		 * On small systems, keep the pageblock_order floor to preserve
+		 * MOVABLE capacity within tainted SPBs -- see comment at
+		 * SPB_AGGRESSIVE_THRESHOLD.
+		 */
+		min_order =3D (cat_search[c] =3D=3D SB_SEARCH_PREFERRED &&
+			     zone->nr_superpageblocks > SPB_AGGRESSIVE_THRESHOLD) ?
+			    order : nofrag_min_order;
+
 		for (current_order =3D MAX_PAGE_ORDER;
 		     current_order >=3D min_order; --current_order) {
 			if (!should_try_claim_block(current_order,
@@ -3881,8 +3934,18 @@ static bool rmqueue_bulk(struct zone *zone, unsigned=
 int order,
 	 * For movable allocations, prefer pageblocks from the
 	 * fullest clean superpageblock to pack allocations and
 	 * preserve empty superpageblocks for 1GB hugepages.
+	 *
+	 * For non-movable allocations, force ALLOC_NOFRAGMENT so
+	 * __rmqueue cannot steal a whole pageblock out of a clean
+	 * SPB. Stealing is the worst possible outcome for a bulk
+	 * refill: a single network or slab burst can taint dozens
+	 * of clean pageblocks. Phase 2 will adopt sub-pageblock
+	 * fragments from tainted SPBs before Phase 3 falls back to
+	 * the original alloc_flags (which may eventually steal at
+	 * the requested order, a much smaller fragmentation event).
 	 */
 	while (refilled + pageblock_nr_pages <=3D pages_needed) {
+		unsigned int p1_alloc_flags =3D alloc_flags;
 		struct page *page =3D NULL;
=20
 		if (migratetype =3D=3D MIGRATE_MOVABLE) {
@@ -3892,11 +3955,14 @@ static bool rmqueue_bulk(struct zone *zone, unsigne=
d int order,
 			if (sb)
 				page =3D __rmqueue_from_sb(zone, pageblock_order,
 							 migratetype, sb);
+		} else if (!is_migrate_cma(migratetype)) {
+			p1_alloc_flags =3D (p1_alloc_flags | ALLOC_NOFRAGMENT) &
+					 ~ALLOC_NOFRAG_TAINTED_OK;
 		}
 		if (!page)
 			page =3D __rmqueue(zone, pageblock_order,
 					 migratetype,
-					 alloc_flags, &rmqm);
+					 p1_alloc_flags, &rmqm);
 		if (!page)
 			break;
=20
--=20
2.54.0
From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 85ADE3E9F9D
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:51 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289256; cv=none;
 b=n4OzPVrl8yOkRe6DyTNVs5b+VHnPD5esP52D9Xw5Sk7kBBqtO5E5KVq8HtLWwf3JstLOF22cCtSGifEjsAwERjYhRkEVm8aj6TxDJSyOV2AV30QJpd844Z14pjk64+hX96xiWmrquic2XKiwpJ3DR+zTmdw3ajy1oIjN4DMECnQ=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289256; c=relaxed/simple;
	bh=U4JXi6NGhV9U1k8x7eMbksFk0opqbZtz3iM4mo1Nj3k=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=Pmf7cZX+2JAQXTU1LYAawPAtkflYjK9lz6UkbEP1dt1wcDT+ZquyqxpLdgVJG8nDxGPFpDhsTAEfPTknVDQrFNNXM84nl14k5bIahLHk1Mv+iNESlZTe5tyS4EgnkXBQGha0cgKKKk4FgC9XfEtXJB54pKseJ9t42SS/UGrOyNE=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=brf4iceO; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="brf4iceO"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:MIME-Version:References:In-Reply-To:
	Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-Type:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=b+Uw4FvKv/DMZzRr9dlauNErROj+6HFTPfymYFoRYco=; b=brf4iceOt3injvroqoR/CYYfuQ
	vOf01R8yUWmIrUsL9n94ZBh7Sou30+aKSgBkeDJK4CxqGuzknTVFC7qZ1i4fjS3Ndt7/O4rOGWkVJ
	/kKtieczn/y2oYDH7CaI9TMtFcHB7MYgh5Y3CQ13gp0H3EayyFlHe97XK1q3opr63DWg8jjqjedDU
	ovDvRidBUfZw24X5uAZbP6vm8VIP8w7RbuSZPaLKzBEdMhFryHNswVtXiMy2nAswVsDWKWUv4UVJs
	N/VnI4mLXMuet4UOGZHxKAoi+iIwnhkS5SmlCPbHXrAFcqTU6/7KPr65AsA25793iOA27TRHlAwYS
	vqIzsAUg==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPM-0000000024Q-2SAa;
	Wed, 20 May 2026 11:00:28 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 20/40] mm: page_alloc: prefer reclaim over tainting clean
 superpageblocks
Date: Wed, 20 May 2026 10:59:26 -0400
Message-ID: <20260520150018.2491267-21-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

When the allocator needs pages for unmovable or reclaimable allocations
and tainted superpageblocks are exhausted, it currently falls through to
clean superpageblocks immediately, permanently tainting them. This
defeats the purpose of superpageblock anti-fragmentation.

Restructure the allocation fallback cascade to try reclaim and compaction
before tainting clean superpageblocks:

1. Reorder __rmqueue_smallest to search each preferred SPB completely
   before moving to the next source. Within each preferred SPB, try
   whole-pageblock allocations first (for PCP buddy optimization),
   then fall back to sub-pageblock allocations. This ensures that
   sub-pageblock free pages in existing tainted SPBs are used before
   tainting empty or clean SPBs. The pass order is:
   - Preferred SPBs: whole pageblock first, then sub-pageblock
   - Whole pageblock inline claim from tainted SPBs (non-movable only)
   - Whole pageblock from empty SPBs
   - Fallback to non-preferred SPBs

2. Preserve ALLOC_NOFRAGMENT through the slowpath by calling
   alloc_flags_nofragment() after gfp_to_alloc_flags(). Previously
   the slowpath only set NOFRAGMENT for defrag_mode, losing the SPB
   protection that the fastpath established.

3. After reclaim and compaction have both been tried and failed, drop
   ALLOC_NOFRAGMENT unconditionally as a last resort before OOM.
   Previously this was gated on defrag_mode.

Testing shows that with this change, clean superpageblocks maintain
unmov=3D0 throughout a heavy mixed workload (swap pressure, filesystem
metadata, anonymous memory cycling, compaction, hugepage allocation),
where previously 2-3 additional SPBs would become tainted with 7-8
unmovable pageblocks each.

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 mm/page_alloc.c | 74 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 61 insertions(+), 13 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 63151e99bd53..093be0d930c0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2781,11 +2781,23 @@ struct page *__rmqueue_smallest(struct zone *zone, =
unsigned int order,
 	 * concentrate non-movable allocations into fewer superpageblocks.
 	 * For movable, prefer clean superpageblocks to keep them homogeneous.
 	 *
-	 * Search empty superpageblocks between the preferred and fallback
-	 * category passes to avoid movable allocations consuming free
-	 * pageblocks in tainted superpageblocks (which unmovable needs for
-	 * future CLAIMs), and vice versa.
+	 * Prefer whole pageblock allocations (>=3D pageblock_order) over
+	 * sub-pageblock allocations because whole pageblocks enable the
+	 * PCP buddy optimization for fast subsequent allocations.
+	 *
+	 * Search order:
+	 * 1. Preferred SPBs: whole pageblock first, then sub-pageblock
+	 * 2. Whole pageblock inline claim from tainted SPBs (non-movable only)
+	 * 3. Whole pageblock from empty SPBs
+	 * 4. Fallback to non-preferred SPBs
+	 *
+	 * Pass 1 tries whole pageblock first for PCP buddy optimization,
+	 * then falls back to sub-pageblock within the same preferred SPBs.
+	 * This ensures we never taint empty/clean SPBs while preferred
+	 * SPBs still have free pages at any order.
 	 */
+
+	/* Pass 1: preferred SPBs -- whole pageblock first, then sub-pageblock */
 	for (full =3D SB_FULL; full < __NR_SB_FULLNESS; full++) {
 		enum sb_category cat =3D cat_order[movable][0];
=20
@@ -2793,7 +2805,8 @@ struct page *__rmqueue_smallest(struct zone *zone, un=
signed int order,
 			&zone->spb_lists[cat][full], list) {
 			if (!sb->nr_free_pages)
 				continue;
-			for (current_order =3D order;
+			/* Try whole pageblock (or larger) first for PCP buddy */
+			for (current_order =3D max(order, pageblock_order);
 			     current_order < NR_PAGE_ORDERS;
 			     ++current_order) {
 				area =3D &sb->free_area[current_order];
@@ -2810,15 +2823,34 @@ struct page *__rmqueue_smallest(struct zone *zone, =
unsigned int order,
 					migratetype < MIGRATE_PCPTYPES);
 				return page;
 			}
+			/* Then try sub-pageblock (no PCP buddy) */
+			if (order < pageblock_order) {
+				for (current_order =3D order;
+				     current_order < pageblock_order;
+				     ++current_order) {
+					area =3D &sb->free_area[current_order];
+					page =3D get_page_from_free_area(
+						area, migratetype);
+					if (!page)
+						continue;
+					page_del_and_expand(zone, page,
+						order, current_order,
+						migratetype);
+					trace_mm_page_alloc_zone_locked(
+						page, order, migratetype,
+						pcp_allowed_order(order) &&
+						migratetype < MIGRATE_PCPTYPES);
+					return page;
+				}
+			}
 		}
 	}
=20
 	/*
-	 * For non-movable allocations, try to reclaim free pageblocks
-	 * from tainted superpageblocks before looking at empty or clean
-	 * ones. Free pageblocks in tainted SBs have pages on the MOVABLE
-	 * free list (reset by mark_pageblock_free), so the search above
-	 * misses them. Claim them inline to keep non-movable allocations
+	 * Pass 2: for non-movable allocations, try to claim free pageblocks
+	 * from tainted superpageblocks. Free pageblocks in tainted SBs have
+	 * pages on the MOVABLE free list (reset by mark_pageblock_free), so
+	 * pass 1 misses them. Claim them inline to keep non-movable allocations
 	 * concentrated in already-tainted superpageblocks.
 	 *
 	 * Try whole pageblock orders first (preferred for PCP buddy optimization=
),
@@ -2896,7 +2928,7 @@ struct page *__rmqueue_smallest(struct zone *zone, un=
signed int order,
 		}
 	}
=20
-	/* Empty superpageblocks: try before falling back to non-preferred catego=
ry */
+	/* Pass 3: whole pageblock from empty superpageblocks */
 	list_for_each_entry(sb, &zone->spb_empty, list) {
 		if (!sb->nr_free_pages)
 			continue;
@@ -6422,6 +6454,17 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int =
order,
 	if (!zonelist_zone(ac->preferred_zoneref))
 		goto nopage;
=20
+	/*
+	 * Preserve ALLOC_NOFRAGMENT through the slowpath so that reclaim
+	 * and compaction are tried before allowing clean superpageblocks
+	 * to be tainted. The fast path sets this via alloc_flags_nofragment()
+	 * but gfp_to_alloc_flags() only sets it for defrag_mode. Re-add it
+	 * here so the slowpath retries with NOFRAGMENT still protecting
+	 * clean SPBs until the last-resort drop below.
+	 */
+	alloc_flags |=3D alloc_flags_nofragment(
+				zonelist_zone(ac->preferred_zoneref), gfp_mask);
+
 	/*
 	 * Check for insane configurations where the cpuset doesn't contain
 	 * any suitable zone to satisfy the request - e.g. non-movable
@@ -6561,8 +6604,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int =
order,
 				&compaction_retries))
 		goto retry;
=20
-	/* Reclaim/compaction failed to prevent the fallback */
-	if (defrag_mode && (alloc_flags & ALLOC_NOFRAGMENT)) {
+	/*
+	 * Reclaim and compaction have been tried but could not free enough
+	 * pages in already-tainted superpageblocks. Drop NOFRAGMENT as a
+	 * last resort to allow claiming from clean/empty SPBs and stealing
+	 * across migratetype boundaries. This is better than OOM-killing.
+	 */
+	if (alloc_flags & ALLOC_NOFRAGMENT) {
 		alloc_flags &=3D ~ALLOC_NOFRAGMENT;
 		goto retry;
 	}
--=20
2.54.0
From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id E24FA3F1651
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:42 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289250; cv=none;
 b=JyoBlGjE919rZtJvtBi4gymSafKk1JSTAVu3266iAmhvaWvcfw/tC4GvNVQC5oTlILXyq1s+VkkKZuBFr4VR/dsmBg/AMjtJ17CsMcGEpmUBH67r7Vrcl/4wwV7al+Zgao6Qwl9RLaDAzOxMzgZghgq+ZmuXiCChgOpFGWChkY0=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289250; c=relaxed/simple;
	bh=mtWst9XfFI2ermy1jb5exOY5tf7wCKx/+GaYAWMOIRM=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=UY/7T/YCPFLZPkOOL2UmCt6i/zVzbsvK6VxGbON3HhDH/n9nqoxZsQ4umJNvg36UycJhnFP2yMCKagR5UjV1EJ1PNgIT6FWtUzE5kg6zPQ3sVb6z5FkxSRItVNTui4MNORfMx5qEV85hPLlfO2V1Aiwb4TxG2r/QemfiK1RFnj0=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=haLL/lLV; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="haLL/lLV"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:MIME-Version:References:In-Reply-To:
	Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-Type:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=soU8EMGTnZp6JdKODlWY7ua2z0I6BshZRtKHimKYnmk=; b=haLL/lLV5W/Xd8lSRXfoSB0b0o
	bo6kV1YJOWQqBWKTQfPvFTKRclEZnxrTmDxof7eRKya0afSNS50EX80+u3tzEW0GxGwCrU1yDLK2a
	oyzWKlC73Hb+kw+jr3x5ddVLJ8y+0eddvnkNioVLy+BEt2IE5DeyBY7pT937YLuGPPEA9ZYj+RvxF
	LG92t4qfm04AXbL9xxWwAfFmJqT5i0omlKjQQNQ3s47G6Y9vidY/0ZIkQEcGgA5TaSL3mE/0NVNT5
	vjZC4+DVHelK4wQb7DQG2t+iP1OPLg6XaKfOgkG93ZSo2z/h6bOXnZO/Y/TX9aK7k3pyBTgdORLv6
	Mn9u7crA==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPM-0000000024Q-2atp;
	Wed, 20 May 2026 11:00:28 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 21/40] mm: page_alloc: adopt partial pageblocks from
 tainted superpageblocks
Date: Wed, 20 May 2026 10:59:27 -0400
Message-ID: <20260520150018.2491267-22-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

Add Phase 2 to rmqueue_bulk: when refilling PCP for unmovable or
reclaimable allocations, search tainted superpageblocks for partially-free
pageblocks with sub-pageblock buddy entries of the requested migratetype.

Claim ownership of the pageblock and move the found entry to PCP with
PCPBuddy marking.  Phase 0 (the existing owned-block recovery phase)
picks up remaining buddy entries on subsequent refills, so there is no
need to sweep the entire pageblock eagerly.

This concentrates non-movable allocations into already-tainted
superpageblocks, reducing fragmentation spread to clean superpageblocks.

Pageblock-ownership handling: a pageblock encoded as pbd->cpu=3D=3D0 is
unowned and may be claimed; a non-zero value means another CPU's PCP
has frozen pages from this block.  In the latter case the refill walk
keeps following the pageblock (the merge pass at __free_one_page can
reabsorb the other CPU's PCPBuddy entries in the same lock acquire,
clearing ownership before the walk finishes), instead of unconditionally
skipping it.  Without this, busy multi-CPU systems with high tainted-SPB
occupancy would skip every already-touched pageblock in Phase 2 and let
clean SPBs taint instead -- the exact failure Phase 2 was added to
prevent.

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 mm/page_alloc.c | 131 ++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 117 insertions(+), 14 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 093be0d930c0..8027412da866 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1090,7 +1090,7 @@ static inline void set_buddy_order(struct page *page,=
 unsigned int order)
  * - Set when Phase 0/1 restore or acquire whole pageblocks.
  * - Propagated to split remainders in pcp_rmqueue_smallest().
  * - Set on freed pages from owned blocks routed to the owner PCP.
- * - NOT set for Phase 2/3 fragments or zone-owned frees.
+ * - NOT set for Phase 3 fragments or zone-owned frees.
  * - The merge pass in free_pcppages_bulk() only processes
  *   PagePCPBuddy pages, ensuring it never touches pages on
  *   another CPU's PCP list.
@@ -3871,15 +3871,15 @@ __rmqueue(struct zone *zone, unsigned int order, in=
t migratetype,
  * under a single hold of the lock, for efficiency.  Add them to the
  * freelist of @pcp.
  *
- * When @pcp is non-NULL and @count > 1 (normal pageset), uses a four-phase
+ * When @pcp is non-NULL and @count > 1 (normal pageset), uses a multi-pha=
se
  * approach:
- *   Phase 0: Recover previously owned, partially drained blocks.
- *   Phase 1: Acquire whole pageblocks, claim ownership, set PagePCPBuddy.
- *            These pages are eligible for PCP-level buddy merging.
- *   Phase 2: Grab sub-pageblock fragments of the same migratetype.
- *   Phase 3: Fall back to __rmqueue() with migratetype fallback.
- *   Phase 2/3 pages are cached for batching only -- no ownership claim,
- *   no PagePCPBuddy, no PCP-level merging.
+ *   Phase 0:   Recover previously owned, partially drained blocks.
+ *   Phase 1:   Acquire whole pageblocks, claim ownership, set PagePCPBudd=
y.
+ *              These pages are eligible for PCP-level buddy merging.
+ *   Phase 2:   Adopt partial pageblocks from tainted SPBs (non-movable on=
ly).
+ *              Claims ownership so Pass 0 can recover buddy entries later.
+ *   Phase 3:   Fall back to __rmqueue() with migratetype fallback.
+ *              No ownership claim, no PagePCPBuddy, no PCP-level merging.
  *
  * When @pcp is NULL or @count <=3D 1 (boot pageset), acquires individual
  * pages of the requested order directly.
@@ -3897,7 +3897,7 @@ static bool rmqueue_bulk(struct zone *zone, unsigned =
int order,
 	int cpu =3D smp_processor_id();
 	unsigned long refilled =3D 0;
 	unsigned long flags;
-	int o;
+	unsigned int o;
=20
 	if (unlikely(alloc_flags & ALLOC_TRYLOCK)) {
 		if (!spin_trylock_irqsave(&zone->lock, flags))
@@ -4007,11 +4007,114 @@ static bool rmqueue_bulk(struct zone *zone, unsign=
ed int order,
 		goto out;
=20
 	/*
-	 * Phase 2 was removed: it swept zone free lists for sub-pageblock
-	 * fragments, which are always empty when superpageblocks are enabled.
-	 * Phase 3's __rmqueue() -> __rmqueue_smallest() properly searches
-	 * per-superpageblock free lists at all orders.
+	 * Phase 2: Adopt partial pageblocks from tainted SPBs.
+	 *
+	 * Phase 1 only grabs whole free pageblocks. When a tainted SPB
+	 * has partially-used pageblocks with free sub-pageblock buddy
+	 * entries, Phase 1 can't use them. Phase 3 can find them via
+	 * __rmqueue_smallest, but without ownership or PCPBuddy marking,
+	 * so they fragment further on drain.
+	 *
+	 * This phase bridges the gap: find a sub-pageblock free entry
+	 * in a tainted SPB and claim ownership of its pageblock. Pass 0
+	 * will pick up remaining buddy entries on subsequent refills.
+	 *
+	 * Only for unmovable/reclaimable -- movable should use clean SPBs.
 	 */
+	if (migratetype !=3D MIGRATE_MOVABLE &&
+	    !is_migrate_cma(migratetype)) {
+		enum sb_fullness full;
+
+		for (full =3D SB_FULL; full < __NR_SB_FULLNESS; full++) {
+			struct superpageblock *sb;
+
+			list_for_each_entry(sb,
+				&zone->spb_lists[SB_TAINTED][full], list) {
+				struct page *page;
+				int found_order =3D -1;
+				bool claim_pb;
+
+				if (sb->nr_free_pages < pageblock_nr_pages / 4)
+					continue;
+
+				/*
+				 * Find a sub-pageblock free entry for our
+				 * migratetype, starting from the largest order.
+				 *
+				 * Use a post-decrement loop so the unsigned
+				 * counter cannot underflow when @order is 0;
+				 * the previous signed counter relied on the
+				 * mixed signed/unsigned comparison wrapping
+				 * to a huge value, which UBSAN flagged and
+				 * which let the loop walk free_area[-1].
+				 */
+				for (o =3D pageblock_order; o-- > order; ) {
+					struct free_area *area;
+
+					area =3D &sb->free_area[o];
+					page =3D get_page_from_free_area(
+						area, migratetype);
+					if (page) {
+						found_order =3D o;
+						break;
+					}
+				}
+				if (found_order < 0)
+					continue;
+
+				/*
+				 * Found a free fragment in a tainted SPB. Take
+				 * it from the buddy.
+				 *
+				 * If the source pageblock is unowned, claim it:
+				 * mark our pages PagePCPBuddy and register the
+				 * block on owned_blocks so Pass 0 can recover
+				 * remaining fragments on future refills.
+				 *
+				 * If the source pageblock is already owned by
+				 * some CPU (us or another), take the page as a
+				 * plain non-PCPBuddy fragment -- the same way
+				 * Phase 3 / __rmqueue_smallest would. Setting
+				 * PagePCPBuddy here would let two CPUs hold
+				 * PCPBuddy pages from the same pageblock, and
+				 * the PCP merge pass could then corrupt the
+				 * other CPU's PCP list.
+				 *
+				 * Set PB_has_<migratetype> either way (bypasses
+				 * page_del_and_expand which normally does the
+				 * PB_has tracking); idempotent if already set.
+				 */
+				pbd =3D pfn_to_pageblock(page,
+						       page_to_pfn(page));
+				claim_pb =3D (pbd->cpu =3D=3D 0);
+
+				del_page_from_free_list(page, zone,
+							found_order,
+							migratetype);
+				__spb_set_has_type(page, migratetype);
+				if (claim_pb) {
+					set_pcpblock_owner(page, cpu);
+					__SetPagePCPBuddy(page);
+				}
+				pcp_enqueue_tail(pcp, page, migratetype,
+						 found_order);
+				refilled +=3D 1 << found_order;
+
+				/*
+				 * Register for Phase 0 recovery so future
+				 * drains from this pageblock can be swept
+				 * back efficiently. Only meaningful when we
+				 * actually claimed ownership above.
+				 */
+				if (claim_pb && list_empty(&pbd->cpu_node))
+					list_add(&pbd->cpu_node,
+						 &pcp->owned_blocks);
+
+				if (refilled >=3D pages_needed)
+					goto out;
+			}
+		}
+	}
=20
 	/*
 	 * Phase 3: Last resort. Use __rmqueue() which does
--=20
2.54.0
From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 038243F44E4
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:50 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289255; cv=none;
 b=DaHjmD+86G6iHyOUfA5yVbZ7ukqtayD/RQ6urVgrDT9wn1LoUk6FWEmpcU3SfC34wcPhenanvcfzW0RuleZupAgwmmVSs2QSWbGw2RxRflYWtdPgH1Trq8sGiQ5wt0RlLzpK+QKLY3BWV45BHdlirFS3QD/N/ojugJJS26op8Ag=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289255; c=relaxed/simple;
	bh=CYJ+cYBNmtonKiSOG59OLHc5C51h4xmSg9F/jy8nbC8=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=K0PCZ5MYWqj6sb1I/81cndICsULRtx/K0WYIeN4d7XDd2R7n2L7El/4gtXtqEXbBytnoFi3LH/el/QwwpiBKUfo/BCDWg+xONPXoKhMyAfe2+4lMNOMbaOnqLaN11zRKuPL/82woUIR6G5XwZzITh0ipNrOejUQSejiEQwwYrr8=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=Ik8r2kWH; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="Ik8r2kWH"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:MIME-Version:References:In-Reply-To:
	Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-Type:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=NwVqkzhkrdkV2AHWI0B8Z//Qg+XaJB1kiB31HcDYG68=; b=Ik8r2kWHczdta/uHFI6arjJ/fs
	Nuxve6+zYFQbMfUKTwu96aIJ4bsaeE+6DUs44cnubs5qsxKw0aaAWtMR0RHAAFTSPt+KdvWwU5s6B
	gnf+YgRFfElKUeUymHTUeVN8Vynvnt13z0GRYwsqZbu0FaP3v5XKzANik0nY4Gu6oJW7Fo04gHXCd
	DN5h4lZkinXbZtGOmbw0HCC0Vw3FPiGZ6SDDwlOGRQu7H4L0OGaA82V8zYj66iPDt03r1pfQPiRrU
	Lary9QMc44z5C8lN3kzxld/zd1Ypydi8z59mKk300kVFrVi+gd906D4rfRDAU3KMMjgfyX5Siv24Q
	IDUUEwWw==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPM-0000000024Q-2iBW;
	Wed, 20 May 2026 11:00:28 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 22/40] mm: page_alloc: add CONFIG_DEBUG_VM sanity checks
 for SPB counters
Date: Wed, 20 May 2026 10:59:28 -0400
Message-ID: <20260520150018.2491267-23-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

Add spb_debug_check() and call it after every site that mutates the
per-superpageblock type counters (nr_free / nr_unmovable / nr_reclaimable
/ nr_movable). Each counter must be <=3D total_pageblocks; a violation
indicates that a PB_has_<mt> bit transition was missed by one of the
allocation, free, claim, or evacuation paths and the counter has drifted
out of sync with the bits.

VM_WARN_ONCE keeps the production cost zero (CONFIG_DEBUG_VM only) while
giving us a single place to catch counter drift early during stress
testing instead of debugging it from a much later misaccounting symptom.

Relax three pre-existing VM_WARN_ONCE checks: __add_to_free_list and
__del_page_from_free_list no longer warn for MIGRATE_ISOLATE /
MIGRATE_CMA pageblocks, and move_to_free_list no longer warns for
MIGRATE_CMA pageblocks. Those legitimately carry
stale per-type counters from the isolation/un-isolation flow, and the
warnings would fire spuriously once spb_debug_check() exposes that path
under load.

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 mm/page_alloc.c | 41 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 38 insertions(+), 3 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8027412da866..e267390a5948 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -477,6 +477,32 @@ static inline int migratetype_to_has_bit(int migratety=
pe)
 	}
 }
=20
+#ifdef CONFIG_DEBUG_VM
+static void spb_debug_check(struct superpageblock *sb, const char *caller)
+{
+	u16 total =3D sb->total_pageblocks;
+
+	VM_WARN_ONCE(sb->nr_free > total,
+		     "%s: nr_free %u > total %u (zone=3D%s sb=3D%lu)\n",
+		     caller, sb->nr_free, total, sb->zone->name,
+		     (unsigned long)(sb - sb->zone->superpageblocks));
+	VM_WARN_ONCE(sb->nr_unmovable > total,
+		     "%s: nr_unmovable %u > total %u (zone=3D%s sb=3D%lu)\n",
+		     caller, sb->nr_unmovable, total, sb->zone->name,
+		     (unsigned long)(sb - sb->zone->superpageblocks));
+	VM_WARN_ONCE(sb->nr_reclaimable > total,
+		     "%s: nr_reclaimable %u > total %u (zone=3D%s sb=3D%lu)\n",
+		     caller, sb->nr_reclaimable, total, sb->zone->name,
+		     (unsigned long)(sb - sb->zone->superpageblocks));
+	VM_WARN_ONCE(sb->nr_movable > total,
+		     "%s: nr_movable %u > total %u (zone=3D%s sb=3D%lu)\n",
+		     caller, sb->nr_movable, total, sb->zone->name,
+		     (unsigned long)(sb - sb->zone->superpageblocks));
+}
+#else
+static inline void spb_debug_check(struct superpageblock *sb, const char *=
caller) {}
+#endif
+
 /*
  * __spb_set_has_type - set PB_has_* and increment type counter
  *
@@ -508,6 +534,7 @@ static void __spb_set_has_type(struct page *page, int m=
igratetype)
 			sb->nr_movable++;
 			break;
 		}
+		spb_debug_check(sb, "__spb_set_has_type");
 	}
 }
=20
@@ -545,6 +572,7 @@ static void __spb_clear_has_type(struct page *page, int=
 migratetype)
 				sb->nr_movable--;
 			break;
 		}
+		spb_debug_check(sb, "__spb_clear_has_type");
 	}
 }
=20
@@ -778,6 +806,7 @@ static void superpageblock_pb_now_free(struct page *pag=
e)
 		return;
=20
 	sb->nr_free++;
+	spb_debug_check(sb, "pb_now_free");
=20
 	spb_update_list(sb);
 }
@@ -800,6 +829,7 @@ static void superpageblock_pb_now_used(struct page *pag=
e)
=20
 	if (sb->nr_free)
 		sb->nr_free--;
+	spb_debug_check(sb, "pb_now_used");
=20
 	spb_update_list(sb);
 }
@@ -1265,7 +1295,9 @@ static inline void __add_to_free_list(struct page *pa=
ge, struct zone *zone,
 	struct free_area *area =3D pfn_sb_free_area(zone, pfn, order, &sb);
 	int nr_pages =3D 1 << order;
=20
-	VM_WARN_ONCE(get_pageblock_migratetype(page) !=3D migratetype,
+	VM_WARN_ONCE(get_pageblock_migratetype(page) !=3D migratetype &&
+		     !is_migrate_isolate(get_pageblock_migratetype(page)) &&
+		     !is_migrate_cma(get_pageblock_migratetype(page)),
 		     "page type is %d, passed migratetype is %d (nr=3D%d)\n",
 		     get_pageblock_migratetype(page), migratetype, nr_pages);
=20
@@ -1299,7 +1331,8 @@ static inline void move_to_free_list(struct page *pag=
e, struct zone *zone,
 	int nr_pages =3D 1 << order;
=20
 	/* Free page moving can fail, so it happens before the type update */
-	VM_WARN_ONCE(get_pageblock_migratetype(page) !=3D old_mt,
+	VM_WARN_ONCE(get_pageblock_migratetype(page) !=3D old_mt &&
+		     !is_migrate_cma(get_pageblock_migratetype(page)),
 		     "page type is %d, passed migratetype is %d (nr=3D%d)\n",
 		     get_pageblock_migratetype(page), old_mt, nr_pages);
=20
@@ -1324,7 +1357,9 @@ static inline void __del_page_from_free_list(struct p=
age *page, struct zone *zon
 	struct free_area *area =3D pfn_sb_free_area(zone, pfn, order, &sb);
 	int nr_pages =3D 1 << order;
=20
-        VM_WARN_ONCE(get_pageblock_migratetype(page) !=3D migratetype,
+	VM_WARN_ONCE(get_pageblock_migratetype(page) !=3D migratetype &&
+		     !is_migrate_isolate(get_pageblock_migratetype(page)) &&
+		     !is_migrate_cma(get_pageblock_migratetype(page)),
 		     "page type is %d, passed migratetype is %d (nr=3D%d)\n",
 		     get_pageblock_migratetype(page), migratetype, nr_pages);
=20
--=20
2.54.0
From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 2D8E93D1CA2
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:42 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289250; cv=none;
 b=ei8ZvOs3kMYOqmYuUddEpemw3qFeEL0dAwOoDhwchlcQt+71wIjaoTbKr6PUdD+UXk8dZn5bHW1jYQCtBkDY6p/3R9S8yRM9aVTtFoIs0TVNEGHsFepoCbLQs1SYpceuVk476eXWw+6xtOlzWatjcmES1waGa12t8mRvYfwfoQU=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289250; c=relaxed/simple;
	bh=3P6kWS23yU+yCSdGrEawo/AfzR+cUMX9w+03fvwGnxI=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=IuS+O66Ur2vRTafLi2NXGNzBgmEdTVLbZvLxscmujzWQIEFMoZGKemTMfW+94BFkaxIbS8zswohrdlr4TXocXPP1BG/Vsvn7iOndGJXkECmTkafHva/ihFEUvFnnFH33/nuf/C5EaBhFWEDDPlKt9RqQvWcUxEi0FEK4YKg/Ixk=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=ltbB5qd3; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="ltbB5qd3"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:MIME-Version:References:In-Reply-To:
	Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-Type:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=s+pNIgKBYOMF9XTykE71T7SGn/qzSW7h0asRcRtULUM=; b=ltbB5qd3Y5BdyzT1CztNk42OiQ
	y0urApjVkoRnFEpKAihu/PvTiKTtJjd3Yn4inEq83JzsVoy6n11JXOYFPM9CAtbszQb6NwAf2yYWw
	G88/7FcLQmhKyW1tSVYWlk8f4y543H3WOxeUMuapVI0+RCQYH5JOdg3icge57MPy7kq/PYGa+85qA
	2daw2m7KhirjMPySTP/R+YmvEE/NkNVnELa5IPRbQ8W923JQAO5rCaWcvueQUyH0Od6HVyWc1IO53
	tL0LLR//a7eZcIUk9T7x4izq5VPqUtwHrzCVyTeZPf27iQOUCNlKh6XYvkDRcolISEDCzMuXjq9fz
	KtYppxqg==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPM-0000000024Q-2oiO;
	Wed, 20 May 2026 11:00:28 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 23/40] mm: page_alloc: targeted evacuation and dynamic
 reserves for tainted SPBs
Date: Wed, 20 May 2026 10:59:29 -0400
Message-ID: <20260520150018.2491267-24-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

Reduce tainted-superpageblock proliferation with three changes:

1. Dynamic SPB_TAINTED_RESERVE.  Scale the movable-steering reserve
   with SPB size (~3% of pageblocks, minimum 4).  For a
   512-pageblock SPB this gives 16 reserved pageblocks instead of
   the previous flat 4, triggering async defrag 4x earlier and
   keeping more headroom for unmovable claims.

2. Targeted evacuation before NOFRAGMENT drop.  When the slowpath
   is about to drop ALLOC_NOFRAGMENT for an unmovable/reclaimable
   allocation, first try evacuating movable pages from tainted
   SPBs so the retry can be satisfied via Pass 2 (claim_whole_block)
   without forcing __rmqueue_claim to taint a clean SPB.

3. Single-pass evacuation with a relaxed source-PB filter, sized
   for the slowpath: up to 16 candidate SPBs per call and up to 8
   pageblocks evacuated per candidate SPB.

Stack impact: sb_pfns[] grows from 32 bytes to 128 bytes -- trivial
for an 8K/16K kernel stack.

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 mm/page_alloc.c | 334 +++++++++++++++++++++++++++++++-----------------
 1 file changed, 219 insertions(+), 115 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e267390a5948..b4794ba7024f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2691,8 +2691,16 @@ static void prep_new_page(struct page *page, unsigne=
d int order, gfp_t gfp_flags
  * fewer than this many free pageblocks, ensuring that unmovable claims
  * always find room in existing tainted superpageblocks instead of spilling
  * into clean ones.
+ *
+ * Scale with SPB size: reserve ~3% of pageblocks (minimum 4).
+ * For a 512-pageblock SPB this gives 16 reserved pageblocks.
  */
-#define SPB_TAINTED_RESERVE	4
+#define SPB_TAINTED_RESERVE_MIN	4
+
+static inline u16 spb_tainted_reserve(const struct superpageblock *sb)
+{
+	return max_t(u16, SPB_TAINTED_RESERVE_MIN, sb->total_pageblocks / 32);
+}
=20
 /*
  * On systems with many superpageblocks, we can afford to "write off"
@@ -3005,7 +3013,7 @@ struct page *__rmqueue_smallest(struct zone *zone, un=
signed int order,
 				 * with few free pageblocks to reserve space
 				 * for future unmovable/reclaimable claims.
 				 */
-				if (sb->nr_free <=3D SPB_TAINTED_RESERVE)
+				if (sb->nr_free <=3D spb_tainted_reserve(sb))
 					continue;
 				for (current_order =3D order;
 				     current_order < NR_PAGE_ORDERS;
@@ -3582,7 +3590,7 @@ __rmqueue_sb_find_fallback(struct zone *zone, unsigne=
d int order,
 					&sb->free_area[order];
=20
 				if (movable && cat =3D=3D SB_TAINTED &&
-				    sb->nr_free <=3D SPB_TAINTED_RESERVE)
+				    sb->nr_free <=3D spb_tainted_reserve(sb))
 					continue;
=20
 				for (i =3D 0; i < MIGRATE_PCPTYPES - 1; i++) {
@@ -3631,7 +3639,7 @@ __rmqueue_sb_find_fallback(struct zone *zone, unsigne=
d int order,
 					&sb->free_area[order];
=20
 				if (movable && cat =3D=3D SB_TAINTED &&
-				    sb->nr_free <=3D SPB_TAINTED_RESERVE)
+				    sb->nr_free <=3D spb_tainted_reserve(sb))
 					continue;
=20
 				for (i =3D 0; i < MIGRATE_PCPTYPES - 1; i++) {
@@ -6744,9 +6752,33 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int =
order,
=20
 	/*
 	 * Reclaim and compaction have been tried but could not free enough
-	 * pages in already-tainted superpageblocks. Drop NOFRAGMENT as a
-	 * last resort to allow claiming from clean/empty SPBs and stealing
-	 * across migratetype boundaries. This is better than OOM-killing.
+	 * pages in already-tainted superpageblocks. Before dropping
+	 * NOFRAGMENT, try targeted evacuation of movable pages from
+	 * tainted SPBs to create free pageblocks for unmovable claims.
+	 */
+	if ((alloc_flags & ALLOC_NOFRAGMENT) &&
+	    (ac->migratetype =3D=3D MIGRATE_UNMOVABLE ||
+	     ac->migratetype =3D=3D MIGRATE_RECLAIMABLE)) {
+		struct zoneref *z;
+		struct zone *zone;
+
+		for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
+					       ac->highest_zoneidx,
+					       ac->nodemask) {
+			if (spb_evacuate_for_order(zone, order,
+						  ac->migratetype)) {
+				page =3D get_page_from_freelist(gfp_mask, order,
+							     alloc_flags, ac);
+				if (page)
+					goto got_pg;
+			}
+		}
+	}
+
+	/*
+	 * Targeted evacuation could not free enough either. Drop
+	 * NOFRAGMENT as a last resort to allow claiming from clean/empty
+	 * SPBs. This is better than OOM-killing.
 	 */
 	if (alloc_flags & ALLOC_NOFRAGMENT) {
 		alloc_flags &=3D ~ALLOC_NOFRAGMENT;
@@ -8819,7 +8851,7 @@ static bool spb_needs_defrag(struct superpageblock *s=
b)
 	 */
 	if (spb_get_category(sb) =3D=3D SB_TAINTED)
 		return sb->nr_movable > 0 &&
-		       sb->nr_free < SPB_TAINTED_RESERVE;
+		       sb->nr_free < spb_tainted_reserve(sb);
=20
 	/* Clean SPBs: kcompactd handles consolidation; nothing to do here. */
 	return false;
@@ -8841,7 +8873,7 @@ static bool spb_defrag_done(struct superpageblock *sb)
 	 */
 	if (spb_get_category(sb) =3D=3D SB_TAINTED)
 		return !sb->nr_movable ||
-		       sb->nr_free >=3D SPB_TAINTED_RESERVE;
+		       sb->nr_free >=3D spb_tainted_reserve(sb);
=20
 	/* Clean SPBs should not be handled here. */
 	return true;
@@ -9077,6 +9109,184 @@ void init_superpageblock_defrag(struct superpageblo=
ck *sb)
 	INIT_WORK(&sb->defrag_work, spb_defrag_work_fn);
 	init_irq_work(&sb->defrag_irq_work, spb_defrag_irq_work_fn);
 }
+
+/*
+ * Maximum tainted superpageblock candidates per spb_evacuate_for_order ca=
ll.
+ * Collected under zone->lock, then evacuated without it. Larger than the
+ * contig-allocation candidate cap because evacuation runs from the slowpa=
th
+ * after reclaim/compaction failed: we need a meaningful chance of freeing=
 a
+ * non-MOV-claimable pageblock before the slowpath escalates to dropping
+ * ALLOC_NOFRAGMENT (which lets __rmqueue_claim taint clean SPBs). Sized to
+ * scan a meaningful fraction of a typical tainted-pool population.
+ */
+#define SPB_EVACUATE_MAX_CANDIDATES 16
+
+/*
+ * Maximum pageblocks to evacuate per candidate SPB inside
+ * spb_evacuate_for_order. Each evacuation triggers page migration which is
+ * O(pages_per_pageblock) wall-clock cost, so this caps per-call latency.
+ * Bumped from 3 to 8 to free more capacity per slowpath escalation pass.
+ * Combined cap: SPB_EVACUATE_MAX_CANDIDATES * SPB_EVACUATE_MAX_PB_PER_SB
+ * pageblocks per call (16 * 8 =3D 128 pageblocks, 256 MiB on x86).
+ */
+#define SPB_EVACUATE_MAX_PB_PER_SB 8
+
+/**
+ * sb_collect_evacuate_candidates - Find tainted SPBs for targeted evacuat=
ion
+ * @zone: zone to search (must hold zone->lock)
+ * @sb_pfns: output array of tainted superpageblock start PFNs
+ * @max: maximum candidates to collect
+ *
+ * Walk the per-zone tainted SPB lists and collect the start PFNs of
+ * superpageblocks that still hold movable pages, up to @max entries.
+ * The caller (spb_evacuate_for_order) drops zone->lock before actually
+ * evacuating each candidate.
+ *
+ * Returns number of candidate superpageblock PFNs found.
+ */
+static int sb_collect_evacuate_candidates(struct zone *zone,
+					  unsigned long *sb_pfns, int max)
+{
+	struct superpageblock *sb;
+	int full, n =3D 0;
+
+	lockdep_assert_held(&zone->lock);
+
+	for (full =3D 0; full < __NR_SB_FULLNESS; full++) {
+		list_for_each_entry(sb, &zone->spb_lists[SB_TAINTED][full],
+				    list) {
+			if (!sb->nr_movable)
+				continue;
+
+			sb_pfns[n++] =3D sb->start_pfn;
+			if (n >=3D max)
+				return n;
+		}
+	}
+	return n;
+}
+
+/*
+ * Evacuate MOV content out of any pageblock in the given range that has i=
t.
+ *
+ * The previous version filtered on the source pageblock's migratetype tag,
+ * which made evacuation blind to MOV stragglers living in PBs whose tag d=
id
+ * not match the current allocation's requesting type:
+ *
+ *   - PASS_2C / PASS_2D borrows set PB_has_<requesting_mt> on a MOV-tagged
+ *     PB without changing the tag. The borrowed pages return to the MOV
+ *     free list when freed, so a MOV-tagged PB can host non-MOV PB_has bi=
ts
+ *     and MOV content simultaneously.
+ *
+ *   - When __spb_set_has_type adds a non-MOV bit on a PB, the PB tag is n=
ot
+ *     re-evaluated. PBs accumulate has-bits over time without their tag
+ *     necessarily reflecting current content.
+ *
+ * Drop the migratetype tag filter and accept any PB with PB_has_movable s=
et.
+ * Skip only the cases whose semantics forbid touching them here:
+ *   - MIGRATE_ISOLATE     under quarantine
+ *   - CMA                 own allocator
+ *   - MIGRATE_HIGHATOMIC  reserve, evac would race the reservation logic
+ *
+ * Returns number of pageblocks evacuated.
+ */
+static int evacuate_pb_range(struct zone *zone, unsigned long start_pfn,
+			     unsigned long end_pfn, int max)
+{
+	unsigned long pfn;
+	int nr_evacuated =3D 0;
+
+	for (pfn =3D start_pfn; pfn < end_pfn; pfn +=3D pageblock_nr_pages) {
+		struct page *page;
+		int pb_mt;
+
+		if (!pfn_valid(pfn))
+			continue;
+
+		if (!zone_spans_pfn(zone, pfn))
+			continue;
+
+		page =3D pfn_to_page(pfn);
+
+		if (!get_pfnblock_bit(page, pfn, PB_has_movable))
+			continue;
+
+		pb_mt =3D get_pfnblock_migratetype(page, pfn);
+		if (is_migrate_isolate(pb_mt) ||
+		    is_migrate_cma(pb_mt) ||
+		    pb_mt =3D=3D MIGRATE_HIGHATOMIC)
+			continue;
+
+		evacuate_pageblock(zone, pfn, true);
+		if (++nr_evacuated >=3D max)
+			break;
+	}
+	return nr_evacuated;
+}
+
+/**
+ * spb_evacuate_for_order - Targeted evacuation of movable pages from
+ *                         tainted superpageblocks
+ * @zone: zone to work on
+ * @order: allocation order that failed
+ * @migratetype: requesting migratetype (informational; the evacuation
+ *               loop itself is migratetype-independent)
+ *
+ * Collect tainted superpageblocks with movable content under zone->lock,
+ * then drop the lock and walk each candidate's pageblocks, evacuating
+ * any PB that has PB_has_movable set and does not have a special tag
+ * (ISOLATE, CMA, HIGHATOMIC). PBs that are pure MOV become empty whole
+ * pageblocks for Pass 2 / claim_whole_block on retry; mixed PBs lose
+ * their MOV stragglers so future allocations of the dominant type stop
+ * competing with MOV residue.
+ *
+ * Returns true if evacuation was performed (caller should retry allocatio=
n).
+ */
+static bool spb_evacuate_for_order(struct zone *zone, unsigned int order,
+				  int migratetype)
+{
+	unsigned long sb_pfns[SPB_EVACUATE_MAX_CANDIDATES];
+	unsigned long flags;
+	int nr_sbs, i;
+	unsigned int attempts =3D 0;
+	bool did_evacuate =3D false;
+
+	/*
+	 * Single-pass evacuation: collect candidate tainted SPBs (anything
+	 * with MOV content), then walk each one's pageblocks evacuating MOV
+	 * content from any non-special PB. evacuate_pb_range filters by
+	 * PB_has_movable, so this is a no-op on PBs that have no MOV content.
+	 *
+	 * Two effects accumulate:
+	 *   - PBs that are pure MOV become empty -> free MOV pageblock,
+	 *     claimable by Pass 2 / claim_whole_block on the retry.
+	 *   - PBs that are mixed (e.g., UNMOV + MOV stragglers) lose the MOV
+	 *     stragglers, so future allocations of the dominant type can use
+	 *     the PB without competing with the MOV residue.
+	 *
+	 * The previous two-phase design tried to do these separately and
+	 * filtered evacuation by source PB tag. That left MOV content
+	 * stranded in PBs whose tag did not match either phase, and gave up
+	 * after one phase even though the other phase could have helped.
+	 */
+	spin_lock_irqsave(&zone->lock, flags);
+	nr_sbs =3D sb_collect_evacuate_candidates(zone, sb_pfns,
+						SPB_EVACUATE_MAX_CANDIDATES);
+	spin_unlock_irqrestore(&zone->lock, flags);
+
+	for (i =3D 0; i < nr_sbs; i++) {
+		unsigned long end_pfn =3D sb_pfns[i] + SUPERPAGEBLOCK_NR_PAGES;
+		int n;
+
+		n =3D evacuate_pb_range(zone, sb_pfns[i], end_pfn,
+				      SPB_EVACUATE_MAX_PB_PER_SB);
+		attempts +=3D n;
+		if (n)
+			did_evacuate =3D true;
+	}
+
+	return did_evacuate;
+}
 #endif /* CONFIG_COMPACTION */
=20
 #ifdef CONFIG_CONTIG_ALLOC
@@ -9655,112 +9865,6 @@ static struct page *spb_try_alloc_contig(struct zon=
e *zone,
 	return NULL;
 }
=20
-/**
- * sb_collect_evacuate_candidates - Find pageblocks for targeted evacuation
- * @zone: zone to search (must hold zone->lock)
- * @migratetype: desired migratetype (MIGRATE_UNMOVABLE or MIGRATE_RECLAIM=
ABLE)
- * @sb_pfns: output array of tainted superpageblock start PFNs
- * @max: maximum candidates to collect
- *
- * Find tainted superpageblocks containing pageblocks of the desired migra=
tetype
- * that also have movable pages to evacuate. Evacuating movable pages from
- * these pageblocks creates buddy coalescing opportunities for high-order
- * allocations of the desired migratetype.
- *
- * Returns number of candidate superpageblock PFNs found.
- */
-static int sb_collect_evacuate_candidates(struct zone *zone, int migratety=
pe,
-					  unsigned long *sb_pfns, int max)
-{
-	struct superpageblock *sb;
-	int full, n =3D 0;
-
-	lockdep_assert_held(&zone->lock);
-
-	for (full =3D 0; full < __NR_SB_FULLNESS; full++) {
-		list_for_each_entry(sb, &zone->spb_lists[SB_TAINTED][full],
-				    list) {
-			bool has_matching;
-
-			if (!sb->nr_movable)
-				continue;
-
-			if (migratetype =3D=3D MIGRATE_UNMOVABLE)
-				has_matching =3D sb->nr_unmovable > 0;
-			else if (migratetype =3D=3D MIGRATE_RECLAIMABLE)
-				has_matching =3D sb->nr_reclaimable > 0;
-			else
-				continue;
-
-			if (!has_matching)
-				continue;
-
-			sb_pfns[n++] =3D sb->start_pfn;
-			if (n >=3D max)
-				return n;
-		}
-	}
-	return n;
-}
-
-/**
- * spb_evacuate_for_order - Targeted evacuation of movable pages from
- *                         unmovable/reclaimable pageblocks
- * @zone: zone to work on
- * @order: allocation order that failed
- * @migratetype: desired migratetype (MIGRATE_UNMOVABLE or MIGRATE_RECLAIM=
ABLE)
- *
- * Instead of blind compaction, use superpageblock metadata to find pagebl=
ocks
- * of the right migratetype in tainted superpageblocks and evacuate their
- * movable pages. This creates buddy coalescing opportunities within
- * the pageblock, enabling higher-order allocations.
- *
- * Returns true if evacuation was performed (caller should retry allocatio=
n).
- */
-static bool spb_evacuate_for_order(struct zone *zone, unsigned int order,
-				  int migratetype)
-{
-	unsigned long sb_pfns[SPB_CONTIG_MAX_CANDIDATES];
-	unsigned long flags;
-	int nr_sbs, i;
-	bool did_evacuate =3D false;
-
-	spin_lock_irqsave(&zone->lock, flags);
-	nr_sbs =3D sb_collect_evacuate_candidates(zone, migratetype,
-						sb_pfns,
-						SPB_CONTIG_MAX_CANDIDATES);
-	spin_unlock_irqrestore(&zone->lock, flags);
-
-	for (i =3D 0; i < nr_sbs && !did_evacuate; i++) {
-		unsigned long pfn, end_pfn;
-
-		end_pfn =3D sb_pfns[i] + SUPERPAGEBLOCK_NR_PAGES;
-		for (pfn =3D sb_pfns[i]; pfn < end_pfn;
-		     pfn +=3D pageblock_nr_pages) {
-			struct page *page;
-
-			if (!pfn_valid(pfn))
-				continue;
-
-			/* Superpageblocks can straddle zone boundaries. */
-			if (!zone_spans_pfn(zone, pfn))
-				continue;
-
-			page =3D pfn_to_page(pfn);
-
-			if (get_pfnblock_migratetype(page, pfn) !=3D migratetype)
-				continue;
-
-			if (!get_pfnblock_bit(page, pfn, PB_has_movable))
-				continue;
-
-			evacuate_pageblock(zone, pfn, true);
-			did_evacuate =3D true;
-			break;
-		}
-	}
-	return did_evacuate;
-}
 #endif /* CONFIG_COMPACTION */
=20
 /**
--=20
2.54.0
From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id E62E83F1659
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:42 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289248; cv=none;
 b=drVv4fXJmywRQFsGvFlPJSHCNaN2MDAFoDNxGm0ZY7/M55sXTugrCCKjxfp1XomE9pkbOIeX8FoI9JljbNWjYu8uREsBJ+NO1GAWXNX5Ji22y9U7hc0oMRqYe66xfZYllqpw+IEykK7Eyhd9ll3ne1+DnbOEg+Zh4EyhrcpVd7g=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289248; c=relaxed/simple;
	bh=Ll86wWv6MaIW/0HGrwCuX5Rbwgs8lS58QUtkp1PIA28=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=Vvg8E2MNtt/+Pi14OwNYed/FmaJl8XCxkKWkXpZhnDzGtZnMmJFpLpMI1nx9c0u+mzp5vzi3BPn3SO0sHhOYTk2tCHCZk5ZrYUIu/U6xa7cs9OZKjdA03PEnCG0PnTOYaGNl668Tmey3ZGfcRUpCDJOh+BAtgFiWm1IlFg/XhO4=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=SfRF7MLY; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="SfRF7MLY"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:MIME-Version:References:In-Reply-To:
	Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-Type:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=FbmaVyF/aGay1UdLwoY7Lt8blnO3tO+P/LpIkjcJR/U=; b=SfRF7MLYk9xj5qBGEvpHO0ldSU
	QQS3xxzy06qwE4Ajgkm4BCl84Q+axWwqlquuDIS5DL+K1lZGtcbkGyMCMhjAofxJe0QhYNk9jvxjK
	DtwclI6+uVf5qEtukidxaD8hgZpHkRi3ZSxz2zDUmxhL9AG+R9XqQ5auVkciIkCdFBwK/9FAr9A/4
	TwzdLjm9yzyNKSLNQapuGpJ/aAHAjIkK+dFV9MUzYu7GYGS6dEiNkD4+2tm3nJuSZvjMnYHOaRCQG
	uwi5qccxPGKPwXC7Uh+Xdvxd1GlA73SCYNjxtlHb8jcQwUBwFJf5Gv1crIN89PCMOUVzJp1gfOIvk
	p6rFBL7w==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPM-0000000024Q-2y1s;
	Wed, 20 May 2026 11:00:28 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 24/40] mm: page_alloc: prevent UNMOVABLE/RECLAIMABLE
 mixing in pageblocks
Date: Wed, 20 May 2026 10:59:30 -0400
Message-ID: <20260520150018.2491267-25-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

Inside a tainted SPB, free pages of UNMOVABLE and RECLAIMABLE
allocations cannot be told apart by the buddy allocator's
compatibility heuristic (alike_pages =3D=3D 0 between the two non-movable
types in try_to_claim_block). Once a pageblock holds in-use pages of
both, any sticky UNMOVABLE pinhole prevents the RECLAIMABLE pages
from coalescing into useful higher-order chunks when they drain back
to the buddy. The PB's free capacity is permanently capped at
order-1 dust regardless of how much of it actually returns. Sticky
RECLAIMABLE pages (active dentries, locked btrfs extent-buffer folios,
NOFS slab) are unavoidable; the cost is paid in internal fragmentation.

Two paths in the page allocator create UNMOVABLE<->RECLAIMABLE
mixing today:

  1. try_to_claim_block() relabels a partial PB whenever the 50%
     threshold "free_pages + alike_pages >=3D pageblock_nr_pages/2"
     passes. For UNMOV<->RECL, alike_pages =3D=3D 0, so the rule
     degenerates to free_pages >=3D pageblock_nr_pages/2 (256 pages
     with 512-page pageblocks). A PB with 256 in-use UNMOV pages plus
     256 free pages passes and is relabeled RECL. Both
     PB_has_unmovable and PB_has_reclaimable are then set.

  2. __rmqueue_steal() takes a single foreign-type page out of a
     PB without relabeling the PB. A UNMOVABLE allocation stealing
     from a RECLAIMABLE-labeled PB sets PB_has_unmovable on top of
     the existing PB_has_reclaimable.

Tighten both paths:

  - Add noncompatible_cross_type() helper that detects the
    UNMOV<->RECL pair (MOVABLE may still mix with either since
    movable pages can be migrated out).

  - In try_to_claim_block(), require a fully-free PB
    (free_pages =3D=3D pageblock_nr_pages) for any cross-type relabel,
    regardless of from_tainted_spb. The other-type bit inherited
    from the prior label is stale on a fully-free PB (no in-use
    pages of either type) so clear it during the relabel rather
    than leaving the PB visibly mixed in PB_has_* state.

  - In __rmqueue_steal(), pass a new SB_SKIP_CROSS_TYPE flag to
    __rmqueue_sb_find_fallback() so the cross-type fallback entry
    in fallbacks[] is skipped. Steal then falls through to the
    MIGRATE_MOVABLE second fallback instead of single-page-stealing
    into a foreign non-movable PB.

The from_tainted_spb=3Dtrue caller of try_to_claim_block() is
unaffected because it hardcodes block_type=3DMIGRATE_MOVABLE. The
claim_whole_block() branch (current_order >=3D pageblock_order) is
also unaffected: it requires PB_all_free, so the PB is fully free
of any prior type.

Pageblocks that were already mixed before this change stay mixed; the
change only prevents new UNMOVABLE/RECLAIMABLE mixing from being created.

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 mm/page_alloc.c | 108 +++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 83 insertions(+), 25 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b4794ba7024f..988cf6f27938 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3073,6 +3073,23 @@ static int fallbacks[MIGRATE_PCPTYPES][MIGRATE_PCPTY=
PES - 1] =3D {
 	[MIGRATE_RECLAIMABLE] =3D { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE   },
 };
=20
+/*
+ * UNMOVABLE and RECLAIMABLE allocations should not share the same
+ * pageblock. Their free pages are interchangeable on the buddy free
+ * lists (alike_pages =3D=3D 0 between them), so once a PB holds both
+ * types the buddy can no longer tell them apart and any sticky
+ * UNMOVABLE pinhole prevents the RECLAIMABLE pages from coalescing
+ * into useful higher-order chunks when they drain back. MOVABLE may
+ * mix with either, since MOVABLE pages can be migrated out.
+ */
+static inline bool noncompatible_cross_type(int start_type, int fallback_t=
ype)
+{
+	return (start_type =3D=3D MIGRATE_UNMOVABLE &&
+		fallback_type =3D=3D MIGRATE_RECLAIMABLE) ||
+	       (start_type =3D=3D MIGRATE_RECLAIMABLE &&
+		fallback_type =3D=3D MIGRATE_UNMOVABLE);
+}
+
 #ifdef CONFIG_CMA
 static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zo=
ne,
 					unsigned int order)
@@ -3450,11 +3467,10 @@ try_to_claim_block(struct zone *zone, struct page *=
page,
 		   bool from_tainted_spb)
 {
 	int free_pages, movable_pages, alike_pages;
-	unsigned long start_pfn;
 #ifdef CONFIG_COMPACTION
-	struct page *start_page;
 	struct superpageblock *sb;
 #endif
+	unsigned long start_pfn;
=20
 	/*
 	 * Don't steal from pageblocks that are isolated for
@@ -3512,32 +3528,48 @@ try_to_claim_block(struct zone *zone, struct page *=
page,
 	 * allocations. Inside a tainted SPB the protection is unnecessary:
 	 * fragmentation has already been accepted at the SPB level, and
 	 * relabeling is much cheaper than tainting a fresh clean SPB.
-	 */
-	if (from_tainted_spb ||
-	    free_pages + alike_pages >=3D (1 << (pageblock_order-1)) ||
-			page_group_by_mobility_disabled) {
-		__move_freepages_block(zone, start_pfn, block_type, start_type);
-		set_pageblock_migratetype(pfn_to_page(start_pfn), start_type);
-#ifdef CONFIG_COMPACTION
-		/*
-		 * Track actual page contents in pageblock flags and
-		 * update superpageblock counters so the SPB moves to
-		 * the correct fullness list for steering.
-		 */
-		start_page =3D pfn_to_page(start_pfn);
-		__spb_set_has_type(start_page, start_type);
-		if (block_type !=3D start_type)
-			__spb_set_has_type(start_page, block_type);
-
-		sb =3D pfn_to_superpageblock(zone, start_pfn);
-		if (sb)
-			spb_update_list(sb);
+	 *
+	 * UNMOVABLE<->RECLAIMABLE cross-type claims override these rules:
+	 * once mixed, sticky pinholes of one type prevent the other from
+	 * coalescing into useful higher-order free chunks even after drain.
+	 * Only relabel a fully-free PB in that case, regardless of whether
+	 * the SPB is tainted.
+	 */
+	if (noncompatible_cross_type(start_type, block_type)) {
+		if (free_pages !=3D pageblock_nr_pages)
+			return NULL;
+	} else if (!from_tainted_spb &&
+		   free_pages + alike_pages < (1 << (pageblock_order-1)) &&
+		   !page_group_by_mobility_disabled) {
+		return NULL;
+	}
=20
-#endif
-		return __rmqueue_smallest(zone, order, start_type);
+	__move_freepages_block(zone, start_pfn, block_type, start_type);
+	set_pageblock_migratetype(pfn_to_page(start_pfn), start_type);
+#ifdef CONFIG_COMPACTION
+	/*
+	 * Track actual page contents in pageblock flags and update
+	 * superpageblock counters so the SPB moves to the correct
+	 * fullness list for steering.
+	 *
+	 * For cross-type UNMOVABLE<->RECLAIMABLE relabel (which by the
+	 * predicate above only fires on a fully-free PB), the inherited
+	 * PB_has_<block_type> bit is stale -- there are no in-use pages
+	 * of that type. Clear it so the resulting PB is unmixed.
+	 */
+	__spb_set_has_type(pfn_to_page(start_pfn), start_type);
+	if (block_type !=3D start_type) {
+		if (noncompatible_cross_type(start_type, block_type))
+			__spb_clear_has_type(pfn_to_page(start_pfn), block_type);
+		else
+			__spb_set_has_type(pfn_to_page(start_pfn), block_type);
 	}
=20
-	return NULL;
+	sb =3D pfn_to_superpageblock(zone, start_pfn);
+	if (sb)
+		spb_update_list(sb);
+#endif
+	return __rmqueue_smallest(zone, order, start_type);
 }
=20
 /*
@@ -3561,6 +3593,13 @@ try_to_claim_block(struct zone *zone, struct page *p=
age,
 #define SB_SEARCH_EMPTY		(1 << 1)
 #define SB_SEARCH_FALLBACK	(1 << 2)
 #define SB_SEARCH_ALL		(SB_SEARCH_PREFERRED | SB_SEARCH_EMPTY | SB_SEARCH_=
FALLBACK)
+/*
+ * Skip UNMOVABLE<->RECLAIMABLE cross-type fallback. Used by the steal
+ * path to prevent landing single foreign-type pages into a PB labeled
+ * with the other non-movable type -- a steal does not relabel the PB
+ * so cross-type stealing creates permanent mixing.
+ */
+#define SB_SKIP_CROSS_TYPE	(1 << 3)
=20
 static struct page *
 __rmqueue_sb_find_fallback(struct zone *zone, unsigned int order,
@@ -3597,6 +3636,10 @@ __rmqueue_sb_find_fallback(struct zone *zone, unsign=
ed int order,
 					int fmt =3D fallbacks[start_migratetype][i];
 					struct page *page;
=20
+					if ((search_cats & SB_SKIP_CROSS_TYPE) &&
+					    noncompatible_cross_type(start_migratetype, fmt))
+						continue;
+
 					page =3D get_page_from_free_area(area,
 								       fmt);
 					if (page) {
@@ -3618,6 +3661,10 @@ __rmqueue_sb_find_fallback(struct zone *zone, unsign=
ed int order,
 				int fmt =3D fallbacks[start_migratetype][i];
 				struct page *page;
=20
+				if ((search_cats & SB_SKIP_CROSS_TYPE) &&
+				    noncompatible_cross_type(start_migratetype, fmt))
+					continue;
+
 				page =3D get_page_from_free_area(area,
 							       fmt);
 				if (page) {
@@ -3646,6 +3693,10 @@ __rmqueue_sb_find_fallback(struct zone *zone, unsign=
ed int order,
 					int fmt =3D fallbacks[start_migratetype][i];
 					struct page *page;
=20
+					if ((search_cats & SB_SKIP_CROSS_TYPE) &&
+					    noncompatible_cross_type(start_migratetype, fmt))
+						continue;
+
 					page =3D get_page_from_free_area(area,
 								       fmt);
 					if (page) {
@@ -3782,11 +3833,18 @@ __rmqueue_steal(struct zone *zone, int order, int s=
tart_migratetype,
 	/*
 	 * When ALLOC_NOFRAG_TAINTED_OK is set, only steal from tainted
 	 * SPBs to avoid tainting clean ones. Otherwise search all categories.
+	 *
+	 * Always skip UNMOVABLE<->RECLAIMABLE cross-type fallback. The steal
+	 * path takes a single page without relabeling its PB, so a cross-type
+	 * steal would land an UNMOVABLE page in a RECLAIMABLE-labeled PB
+	 * (or vice versa) and create permanent mixing. Falling through to
+	 * MIGRATE_MOVABLE (the second fallback) is preferable.
 	 */
 	if (alloc_flags & ALLOC_NOFRAG_TAINTED_OK)
 		search_cats =3D SB_SEARCH_PREFERRED;
 	else
 		search_cats =3D SB_SEARCH_PREFERRED | SB_SEARCH_FALLBACK;
+	search_cats |=3D SB_SKIP_CROSS_TYPE;
=20
 	/*
 	 * Search per-superpageblock free lists for fallback migratetypes.
--=20
2.54.0
From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id E34C13F1654
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:42 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289247; cv=none;
 b=EfZYGs0ksP8NXa212AexCM+aAi8xSsLXt21Uf6Bqm1kveJJcf2kg6YpENJlKogXa7gzCLs7Sa2TXKDegl5CyHPAViYH5/qIsKQLENcps2NpDI46qpjA3AlGbp3ZSLDmc4IAUnlUiJLdSu5WOBiyymRXKzkIvgEqk9OR+KcQvNsc=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289247; c=relaxed/simple;
	bh=IGg+OxLzTFDwKAMndxzoV6L/lUdZ6k5kckGkIECt/rM=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=MLQl4okTFgT+LqmVYUGJ7/m0g8/jzMTcqQau3cg9BXFQVGf3BS1XLrFhfV79YnaKH9ELqE4PpwDC/xhvuegzcWvv50k9bE/v3oFEJ4Ief1pzWRa8d6te+P6hXFoP/l+CW7CoJCnvZNoNBwXzQVv+h7NinKoI3vlChLqyZe3fCk8=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=d76rjulT; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="d76rjulT"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:MIME-Version:References:In-Reply-To:
	Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-Type:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=vullDLo/ir7bMpSdhQSH6gjcQ40j69FGPX0V2CPSNvM=; b=d76rjulTzoVkULTv1OxRtTQ/MH
	6zAClVWWDPrRzpHsfUrw2PY/9PbgP3FCgbKptt5OaFuS1/epwjMwAN2unLyQPuuYawV8iiimc8iEu
	Zmq+8+NSvNWcnJnX1gH65XCm18nJAf6cCxuWxJyTJ7pfX9INUMzDTOWsixLwC/hK8ujDKQCsL+XPU
	+TfyUHgHWYDmkhyn8NeYAyB5jukI9PXnD7flRj+Hm9z0VRsVvuDDyO9hCvgkKb6OhkqLdR6FuJsyi
	0Z2GpE3qenZSfIcyUcJl+Wi5CLZOQA0nUx7m7W2WFQNnpmUeJ5dqW71lezIvcN3fabj4D0hw2Hlju
	dQbki8aw==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPM-0000000024Q-35zc;
	Wed, 20 May 2026 11:00:28 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 25/40] mm: trigger deferred SPB evac when atomic allocs
 would taint a clean SPB
Date: Wed, 20 May 2026 10:59:31 -0400
Message-ID: <20260520150018.2491267-26-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

Hook queue_spb_evacuate() into __rmqueue_claim() so that whenever a
non-movable allocation is about to claim a pageblock from an empty or
clean superpageblock as a fallback (i.e. cat_search[c] is not
SB_SEARCH_PREFERRED), a deferred spb_evacuate_for_order() is scheduled
on the zone's pgdat workqueue.

The current allocation still proceeds and taints the clean SPB this
time, but the deferred evacuation creates free pageblocks inside
existing tainted SPBs so the next caller hitting the same trigger can
claim from the tainted pool instead of tainting another clean SPB.

Movable allocations are excluded because their preferred category is
SB_CLEAN; falling back from clean to tainted does not taint anything
new and so does not need the hint.

The trigger is gated inside queue_spb_evacuate() by a single-flight bit
per (migratetype, order), a 10ms per-migratetype throttle, and a
tainted-pool precheck, so it is safe to fire from this hot path without
storming the workqueue.

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 include/linux/mmzone.h |  18 ++++
 mm/page_alloc.c        | 198 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 215 insertions(+), 1 deletion(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index a0124c170ac0..db719335b32a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1236,6 +1236,22 @@ struct zone {
 	unsigned int		compact_considered;
 	unsigned int		compact_defer_shift;
 	int			compact_order_failed;
+
+	/*
+	 * Atomic-context SPB evacuation deferral state.
+	 *
+	 * spb_evac_in_flight: bitmap indexed by
+	 *   migratetype * NR_PAGE_ORDERS + order, set on enqueue and
+	 *   cleared by the worker after spb_evacuate_for_order returns.
+	 *   Provides single-flight gating per (migratetype, order).
+	 *
+	 * spb_evac_last: jiffies of the last enqueue per migratetype,
+	 *   used as a 10ms throttle to prevent wakeup storms from
+	 *   concurrent atomic allocations.
+	 */
+	DECLARE_BITMAP(spb_evac_in_flight,
+		       MIGRATE_PCPTYPES * NR_PAGE_ORDERS);
+	unsigned long		spb_evac_last[MIGRATE_PCPTYPES];
 #endif
=20
 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
@@ -1652,6 +1668,8 @@ typedef struct pglist_data {
 	struct task_struct *kcompactd;
 	bool proactive_compact_trigger;
 	struct workqueue_struct *evacuate_wq;
+	struct llist_head spb_evac_pending;
+	struct irq_work spb_evac_irq_work;
 #endif
 	/*
 	 * This is a per-node reserve of pages that are not available
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 988cf6f27938..dfbfed056bbb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -741,6 +741,8 @@ static void spb_maybe_start_defrag(struct superpagebloc=
k *sb);
 static bool spb_needs_defrag(struct superpageblock *sb);
 static bool spb_evacuate_for_order(struct zone *zone, unsigned int order,
 				  int migratetype);
+static void queue_spb_evacuate(struct zone *zone, unsigned int order,
+			       int migratetype);
 #else
 static inline void spb_maybe_start_defrag(struct superpageblock *sb) {}
 static inline bool spb_needs_defrag(struct superpageblock *sb) { return fa=
lse; }
@@ -749,6 +751,8 @@ static inline bool spb_evacuate_for_order(struct zone *=
zone, unsigned int order,
 {
 	return false;
 }
+static inline void queue_spb_evacuate(struct zone *zone, unsigned int orde=
r,
+				      int migratetype) {}
 #endif
=20
 #ifdef CONFIG_CONTIG_ALLOC
@@ -3800,6 +3804,18 @@ __rmqueue_claim(struct zone *zone, int order, int st=
art_migratetype,
 			if (!page)
 				continue;
=20
+			/*
+			 * About to claim from an empty or clean superpageblock
+			 * for a non-movable allocation -- this taints a fresh
+			 * SPB.  Defer an evacuation pass over the tainted pool
+			 * so subsequent allocations can claim freed
+			 * pageblocks instead of repeating this fallback.
+			 */
+			if (cat_search[c] !=3D SB_SEARCH_PREFERRED &&
+			    start_migratetype !=3D MIGRATE_MOVABLE)
+				queue_spb_evacuate(zone, order,
+						   start_migratetype);
+
 			page =3D try_to_claim_block(zone, page, current_order,
 						  order, start_migratetype,
 						  fallback_mt, alloc_flags,
@@ -8855,6 +8871,177 @@ static void evacuate_pageblock(struct zone *zone, u=
nsigned long start_pfn,
 		putback_movable_pages(&cc.migratepages);
 }
=20
+/*
+ * Atomic-context SPB evacuation deferral.
+ *
+ * When an atomic allocation in __rmqueue_claim is about to taint a
+ * clean superpageblock because the tainted pool has no free page at
+ * the requested (order, migratetype), schedule a deferred call to
+ * spb_evacuate_for_order. That frees pageblocks inside tainted SPBs so
+ * subsequent allocations can claim them instead of tainting more clean
+ * SPBs.
+ *
+ * Two-step deferral mirrors the pageblock-evacuate path: irq_work to
+ * leave allocator lock context, then queue_work to reach process
+ * context where spb_evacuate_for_order can sleep in migrate_pages.
+ */
+
+struct spb_evac_request {
+	struct work_struct	work;
+	struct zone		*zone;
+	unsigned int		order;
+	int			migratetype;
+	struct llist_node	free_node;
+};
+
+#define NR_SPB_EVAC_REQUESTS	64
+static struct spb_evac_request spb_evac_pool[NR_SPB_EVAC_REQUESTS];
+static struct llist_head spb_evac_freelist;
+/*
+ * llist_del_first() requires single-consumer or external locking.
+ * queue_spb_evacuate() runs under zone->lock, but different zones
+ * hold different locks, so consumers race.  spb_evac_request_free()
+ * uses llist_add which is multi-producer-safe and stays lockless.
+ */
+static DEFINE_SPINLOCK(spb_evac_freelist_lock);
+
+static struct spb_evac_request *spb_evac_request_alloc(void)
+{
+	struct llist_node *node;
+
+	spin_lock(&spb_evac_freelist_lock);
+	node =3D llist_del_first(&spb_evac_freelist);
+	spin_unlock(&spb_evac_freelist_lock);
+	if (!node)
+		return NULL;
+	return container_of(node, struct spb_evac_request, free_node);
+}
+
+static void spb_evac_request_free(struct spb_evac_request *req)
+{
+	llist_add(&req->free_node, &spb_evac_freelist);
+}
+
+static void spb_evac_work_fn(struct work_struct *work)
+{
+	struct spb_evac_request *req =3D container_of(work,
+						    struct spb_evac_request,
+						    work);
+	struct zone *zone =3D req->zone;
+	unsigned int order =3D req->order;
+	int mt =3D req->migratetype;
+
+	spb_evacuate_for_order(zone, order, mt);
+
+	/*
+	 * Clearing the in-flight bit lets a future caller hitting the
+	 * same (mt, order) re-enqueue evacuation.  Ordering between this
+	 * worker's SPB state changes and the future caller's
+	 * tainted_pool_has_free walk is provided by zone->lock taken
+	 * inside spb_evacuate_for_order and by the future caller.
+	 */
+	clear_bit(mt * NR_PAGE_ORDERS + order, zone->spb_evac_in_flight);
+	spb_evac_request_free(req);
+}
+
+static void spb_evac_irq_work_fn(struct irq_work *work)
+{
+	pg_data_t *pgdat =3D container_of(work, pg_data_t,
+					spb_evac_irq_work);
+	struct llist_node *pending;
+	struct spb_evac_request *req, *next;
+
+	if (!pgdat->evacuate_wq)
+		return;
+
+	pending =3D llist_del_all(&pgdat->spb_evac_pending);
+	llist_for_each_entry_safe(req, next, pending, free_node) {
+		INIT_WORK(&req->work, spb_evac_work_fn);
+		queue_work(pgdat->evacuate_wq, &req->work);
+	}
+}
+
+/*
+ * Walk tainted SPBs to check whether any has a free page at the given
+ * order and migratetype.  When this returns true, a clean-SPB claim is
+ * not pool depletion but a try_to_claim_block over-rejection: skip the
+ * deferred evacuation since it cannot help.
+ */
+static bool tainted_pool_has_free(struct zone *zone, unsigned int order,
+				  int migratetype)
+{
+	struct superpageblock *sb;
+	int full;
+
+	lockdep_assert_held(&zone->lock);
+
+	for (full =3D 0; full < __NR_SB_FULLNESS; full++) {
+		list_for_each_entry(sb, &zone->spb_lists[SB_TAINTED][full],
+				    list) {
+			struct free_area *fa =3D &sb->free_area[order];
+
+			if (fa->nr_free &&
+			    !list_empty(&fa->free_list[migratetype]))
+				return true;
+		}
+	}
+	return false;
+}
+
+/**
+ * queue_spb_evacuate - schedule deferred SPB evacuation from atomic conte=
xt
+ * @zone: zone that just failed to find a free page in the tainted pool
+ * @order: requested allocation order
+ * @migratetype: requested migratetype (UNMOVABLE or RECLAIMABLE only)
+ *
+ * Caller must hold zone->lock; the tainted-pool walk asserts it.
+ *
+ * Single-flight gated per (zone, migratetype, order) and throttled to
+ * one enqueue per 10ms per (zone, migratetype).  Pool exhaustion
+ * silently drops the request; the next caller hitting the same trigger
+ * will retry.
+ */
+static void queue_spb_evacuate(struct zone *zone, unsigned int order,
+			       int migratetype)
+{
+	pg_data_t *pgdat =3D zone->zone_pgdat;
+	struct spb_evac_request *req;
+	unsigned int bit;
+
+	lockdep_assert_held(&zone->lock);
+
+	if (!pgdat->spb_evac_irq_work.func)
+		return;
+	if (order >=3D NR_PAGE_ORDERS || migratetype >=3D MIGRATE_PCPTYPES)
+		return;
+
+	if (time_before(jiffies,
+			zone->spb_evac_last[migratetype] + HZ / 100))
+		return;
+
+	bit =3D migratetype * NR_PAGE_ORDERS + order;
+	if (test_and_set_bit(bit, zone->spb_evac_in_flight))
+		return;
+
+	if (tainted_pool_has_free(zone, order, migratetype)) {
+		clear_bit(bit, zone->spb_evac_in_flight);
+		return;
+	}
+
+	req =3D spb_evac_request_alloc();
+	if (!req) {
+		clear_bit(bit, zone->spb_evac_in_flight);
+		return;
+	}
+
+	zone->spb_evac_last[migratetype] =3D jiffies;
+	req->zone =3D zone;
+	req->order =3D order;
+	req->migratetype =3D migratetype;
+	llist_add(&req->free_node, &pgdat->spb_evac_pending);
+	irq_work_queue(&pgdat->spb_evac_irq_work);
+}
+
 /*
  * Background superpageblock defragmentation.
  *
@@ -9118,7 +9305,12 @@ static void spb_maybe_start_defrag(struct superpageb=
lock *sb)
=20
 static int __init pageblock_evacuate_init(void)
 {
-	int nid;
+	int nid, i;
+
+	/* Initialize the global freelist of SPB evacuate requests */
+	init_llist_head(&spb_evac_freelist);
+	for (i =3D 0; i < NR_SPB_EVAC_REQUESTS; i++)
+		llist_add(&spb_evac_pool[i].free_node, &spb_evac_freelist);
=20
 	/* Create a per-pgdat workqueue */
 	for_each_online_node(nid) {
@@ -9133,6 +9325,10 @@ static int __init pageblock_evacuate_init(void)
 			continue;
 		}
=20
+		init_llist_head(&pgdat->spb_evac_pending);
+		init_irq_work(&pgdat->spb_evac_irq_work,
+			      spb_evac_irq_work_fn);
+
 		/* Initialize per-superpageblock defrag work structs */
 		for (z =3D 0; z < MAX_NR_ZONES; z++) {
 			struct zone *zone =3D &pgdat->node_zones[z];
--=20
2.54.0
From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id BD86A3EF648
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:55 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289263; cv=none;
 b=Qo8jk3PrZ6ughvZuBC2wFG+Vt/y+t/V+nWMMLE0szWauSHK63Lg5Al7TUpC59jmjTNKKcGviE5JGJEeWD3CfhIMXwIJxIuAvhqLegsB94l9WTdHSERLu6v3VJI67pozmBdbXDoaiq6UXNStaDsc2/7qMW+Xmk5VYyeDkTGtnM6g=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289263; c=relaxed/simple;
	bh=r23BunYu+z3kgoljSFUV7yw7jqt8wJcnjfDwjXwnEIg=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=qumTKUBpS+vcZ+jab3ut32kiGLCskac06OaPLAJjYCfkwBRo66X9+/9WQPtoIQkOqeETadda6L0oMJj5k1ATLuES+nEpkYuCFH9CozeYQ3NieWNMZF/ip16DcJNRY9ojO+o0oLUb6FIwuw/nSNTW2mrczJiCjnOQEC/F97m/IO0=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=CsZ0AD/7; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="CsZ0AD/7"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:MIME-Version:References:In-Reply-To:
	Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-Type:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=I/TcUjtUKEbuTDWUw2WODukw7GuJFX0QHxyJ5ij3iHs=; b=CsZ0AD/7/nkVIebM6aFXUIh0WP
	gYNLJZCd88R656ND3hZKlDVSzcJRAEFEItdfAR0mLXc3Z85VwGkFpmak0wwTNKNakDAEw2EZtWvJG
	H5S6LOHgukPjCFbwCpMlhbOEGNaU2qn/NQMPtoCQpcruE6gS2Q65HlnR4iQNTOhzNFcIdHrsWshnA
	TOuBSp0z9Bv6CpJ1syP08XxKUu9M/B7U4z5C0wXWsLpz+SAeO7TWkQmwX+KUeQbIjSI09L4z2r4Xx
	YYub66NbhXWoqeT7R0juZoIRPHk78zPkIr7+huOwOigtDlf18hBDJnAFllXJtp8kFnO5naRGUIBNy
	9KnYhsWA==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPM-0000000024Q-3Ewm;
	Wed, 20 May 2026 11:00:28 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 26/40] mm: page_alloc: refuse fragmenting fallback for
 callers with cheap fallback
Date: Wed, 20 May 2026 10:59:32 -0400
Message-ID: <20260520150018.2491267-27-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

A coarse bail-out gate in get_page_from_freelist's slowpath retry,
returning NULL to keep atomic-shape allocations from tainting clean
SPBs, would break early-boot in QEMU: cred_init's slab cache create
reaches the slowpath with gfp =3D __GFP_COMP (gfp_allowed_mask =3D
GFP_BOOT_MASK strips __GFP_RECLAIM from GFP_KERNEL during boot), has
no fallback path, and panics when a coarse gate refuses the
allocation.

Add a finer-grained refusal anchored in __rmqueue, where the SPB-aware
free-list walk already runs:

  - Add ALLOC_HIGHORDER_OPTIONAL, set in gfp_to_alloc_flags() for two
    shapes:

      1. Explicit fallback declaration: __GFP_NORETRY without
         __GFP_RETRY_MAYFAIL. Used by THP, slab high-order refill,
         skb_page_frag_refill on full sockets, etc.

      2. Atomic-context shape: no __GFP_DIRECT_RECLAIM, no
         __GFP_NOMEMALLOC, no __GFP_NOFAIL. Catches GFP_ATOMIC,
         GFP_NOWAIT, including ALLOC_HIGHATOMIC consumers (which still
         get a second crack at the dedicated MIGRATE_HIGHATOMIC reserve
         in rmqueue_buddy after __rmqueue returns NULL).

    __GFP_MEMALLOC and __GFP_NOFAIL never get the flag -- they must
    succeed even at the cost of fresh-SPB taint.

  - Add struct spb_tainted_walk to record what __rmqueue_smallest's
    Pass 1 saw on the SB_TAINTED list (any free pages, any free PB,
    below-reserve pageblock count). Thread it through the function's
    new fourth argument; non-walking call sites pass NULL.

  - In __rmqueue, allocate the walk on the stack for callers with
    ALLOC_HIGHORDER_OPTIONAL set on a non-movable, non-CMA migratetype.
    Force *mode back to RMQUEUE_NORMAL on every call so rmqueue_bulk
    Phase 3 can't reuse a memoised RMQUEUE_CLAIM/STEAL state to skip
    the gate across iterations.

  - After __rmqueue_smallest returns NULL, check the walk: if a tainted
    SPB has free pages or a free pageblock that could absorb this
    allocation after evacuation, return NULL and bump
    SPB_HIGHORDER_REFUSED. Skip RMQUEUE_CLAIM and RMQUEUE_STEAL
    entirely (both can taint clean SPBs). The slowpath will eventually
    drop NOFRAGMENT and let the allocation proceed only for the
    callers that lack ALLOC_HIGHORDER_OPTIONAL -- i.e. the truly
    must-not-fail consumers.

  - Before falling through to Pass 3 (empty SPBs) inside
    __rmqueue_smallest, kick queue_spb_evacuate() when the walk saw a
    tainted SPB at or below its reserve threshold of free pageblocks,
    so future allocations have a home, with movable pages evacuated,
    in an already-tainted SPB.

  - Add SPB_HIGHORDER_REFUSED vm event counter (events, not refused
    allocations: a single high-level alloc that retries can be counted
    multiple times across per-zone attempts).

The early-boot SB_TAINTED list is empty, so the walk records nothing,
the refusal does not engage, and __rmqueue falls through to
RMQUEUE_CLAIM which taints the first SPB normally (the first taint is
unavoidable). cred_init's slab create succeeds, boot succeeds.

Tested in a 16 GB QEMU VM under combined sb-stress + UDP-loopback +
fork/mmap storms (~480s); 2 tainted Normal SPBs out of 13 (boot
baseline 1, +1 during stress); 11 clean SPBs distributed movable load;
no kernel BUG, oops, hang, or panic.

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 include/linux/vm_event_item.h |   5 ++
 mm/internal.h                 |   1 +
 mm/page_alloc.c               | 115 ++++++++++++++++++++++++++++++++--
 mm/vmstat.c                   |   1 +
 4 files changed, 116 insertions(+), 6 deletions(-)

diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 03fe95f5a020..4a8513d5fc3e 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -76,6 +76,11 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		CMA_ALLOC_SUCCESS,
 		CMA_ALLOC_FAIL,
 #endif
+		SPB_HIGHORDER_REFUSED,	/*
+					 * refused fragmenting fallback to keep
+					 * a clean SPB clean when a tainted SPB
+					 * still has free pageblocks
+					 */
 		UNEVICTABLE_PGCULLED,	/* culled to noreclaim list */
 		UNEVICTABLE_PGSCANNED,	/* scanned for reclaimability */
 		UNEVICTABLE_PGRESCUED,	/* rescued from noreclaim list */
diff --git a/mm/internal.h b/mm/internal.h
index e6d61dbc18d9..f52575202a96 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1512,6 +1512,7 @@ unsigned int reclaim_clean_pages_from_list(struct zon=
e *zone,
 #define ALLOC_TRYLOCK		0x400 /* Only use spin_trylock in allocation path */
 #define ALLOC_KSWAPD		0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAI=
M set */
 #define ALLOC_NOFRAG_TAINTED_OK	0x1000 /* NOFRAGMENT, but allow steal from=
 tainted SPBs */
+#define ALLOC_HIGHORDER_OPTIONAL 0x2000 /* caller can fall back to a lower=
 order */
=20
 /* Flags that allow allocations below the min watermark. */
 #define ALLOC_RESERVES (ALLOC_NON_BLOCK|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC=
|ALLOC_OOM)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dfbfed056bbb..e4ecddb428c3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2799,9 +2799,21 @@ static struct page *try_to_claim_block(struct zone *=
zone, struct page *page,
 		  int block_type, unsigned int alloc_flags,
 		  bool from_tainted_spb);
=20
+/*
+ * Snapshot of tainted-SPB state observed while __rmqueue_smallest walks t=
he
+ * free lists. Lets the caller (currently __rmqueue) decide whether to ref=
use
+ * a fragmenting fallback when an existing tainted SPB could absorb the de=
mand
+ * once it is evacuated.
+ */
+struct spb_tainted_walk {
+	bool saw_free_pages;	/* tainted SPB has any free pages, any order */
+	bool saw_free_pb;	/* tainted SPB has at least one free pageblock */
+	bool saw_below_reserve;	/* tainted SPB has nr_free <=3D spb_tainted_reser=
ve */
+};
+
 static __always_inline
 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
-						int migratetype)
+				int migratetype, struct spb_tainted_walk *walk)
 {
 	unsigned int current_order;
 	struct free_area *area;
@@ -2850,6 +2862,20 @@ struct page *__rmqueue_smallest(struct zone *zone, u=
nsigned int order,
=20
 		list_for_each_entry(sb,
 			&zone->spb_lists[cat][full], list) {
+			/*
+			 * Snapshot tainted-SPB capacity before the
+			 * nr_free_pages skip: an SPB with a free pageblock
+			 * but nothing on the requested-MT freelist still
+			 * counts as "could absorb this allocation after evac".
+			 */
+			if (walk && cat =3D=3D SB_TAINTED) {
+				if (sb->nr_free_pages)
+					walk->saw_free_pages =3D true;
+				if (sb->nr_free)
+					walk->saw_free_pb =3D true;
+				if (sb->nr_free <=3D spb_tainted_reserve(sb))
+					walk->saw_below_reserve =3D true;
+			}
 			if (!sb->nr_free_pages)
 				continue;
 			/* Try whole pageblock (or larger) first for PCP buddy */
@@ -2975,6 +3001,16 @@ struct page *__rmqueue_smallest(struct zone *zone, u=
nsigned int order,
 		}
 	}
=20
+	/*
+	 * About to fall through to Pass 3 (empty SPBs) or Pass 4 fallback,
+	 * which risks tainting a clean SPB. If the tainted-SPB walk above
+	 * showed that some tainted SPB is below its reserve threshold of
+	 * free pageblocks, kick deferred evacuation so future allocations
+	 * have a movable-evicted home in an already-tainted SPB.
+	 */
+	if (walk && walk->saw_below_reserve)
+		queue_spb_evacuate(zone, order, migratetype);
+
 	/* Pass 3: whole pageblock from empty superpageblocks */
 	list_for_each_entry(sb, &zone->spb_empty, list) {
 		if (!sb->nr_free_pages)
@@ -3098,7 +3134,7 @@ static inline bool noncompatible_cross_type(int start=
_type, int fallback_type)
 static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zo=
ne,
 					unsigned int order)
 {
-	return __rmqueue_smallest(zone, order, MIGRATE_CMA);
+	return __rmqueue_smallest(zone, order, MIGRATE_CMA, NULL);
 }
 #else
 static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
@@ -3573,7 +3609,7 @@ try_to_claim_block(struct zone *zone, struct page *pa=
ge,
 	if (sb)
 		spb_update_list(sb);
 #endif
-	return __rmqueue_smallest(zone, order, start_type);
+	return __rmqueue_smallest(zone, order, start_type, NULL);
 }
=20
 /*
@@ -3920,8 +3956,29 @@ static __always_inline struct page *
 __rmqueue(struct zone *zone, unsigned int order, int migratetype,
 	  unsigned int alloc_flags, enum rmqueue_mode *mode)
 {
+	struct spb_tainted_walk walk =3D { };
+	struct spb_tainted_walk *walkp =3D NULL;
 	struct page *page;
=20
+	/*
+	 * Track tainted-SPB state for non-movable, non-CMA callers that
+	 * signaled they have a cheap fallback (atomic shape or explicit
+	 * NORETRY). We use that to refuse a fragmenting CLAIM/STEAL when a
+	 * tainted SPB still has free pageblocks waiting to be evacuated.
+	 *
+	 * Force *mode back to RMQUEUE_NORMAL so the walk + refusal check
+	 * runs on every call. rmqueue_bulk Phase 3 chains many __rmqueue
+	 * calls reusing *mode; without this reset, a single successful
+	 * RMQUEUE_CLAIM/STEAL on the first iteration would let every
+	 * subsequent iteration skip the case RMQUEUE_NORMAL block and taint
+	 * additional clean SPBs unchecked.
+	 */
+	if (migratetype !=3D MIGRATE_MOVABLE && !is_migrate_cma(migratetype) &&
+	    (alloc_flags & ALLOC_HIGHORDER_OPTIONAL)) {
+		walkp =3D &walk;
+		*mode =3D RMQUEUE_NORMAL;
+	}
+
 	if (IS_ENABLED(CONFIG_CMA)) {
 		/*
 		 * Balance movable allocations between regular and CMA areas by
@@ -3948,9 +4005,22 @@ __rmqueue(struct zone *zone, unsigned int order, int=
 migratetype,
 	 */
 	switch (*mode) {
 	case RMQUEUE_NORMAL:
-		page =3D __rmqueue_smallest(zone, order, migratetype);
+		page =3D __rmqueue_smallest(zone, order, migratetype, walkp);
 		if (page)
 			return page;
+		/*
+		 * Refuse to fragment a clean SPB when a tainted SPB already
+		 * holds free pages or a free pageblock that could absorb
+		 * this allocation after evacuation. The caller has a cheap
+		 * fallback (lower-order retry, vmalloc, single-page fragment,
+		 * drop the packet, etc.) -- better that than tainting fresh
+		 * capacity. Pre-Pass-3 evac trigger in __rmqueue_smallest
+		 * already kicked deferred eviction.
+		 */
+		if (walkp && (walk.saw_free_pages || walk.saw_free_pb)) {
+			count_vm_event(SPB_HIGHORDER_REFUSED);
+			return NULL;
+		}
 		fallthrough;
 	case RMQUEUE_CMA:
 		if (alloc_flags & ALLOC_CMA) {
@@ -5073,7 +5143,8 @@ struct page *rmqueue_buddy(struct zone *preferred_zon=
e, struct zone *zone,
 			spin_lock_irqsave(&zone->lock, flags);
 		}
 		if (alloc_flags & ALLOC_HIGHATOMIC)
-			page =3D __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
+			page =3D __rmqueue_smallest(zone, order,
+						  MIGRATE_HIGHATOMIC, NULL);
 		if (!page) {
 			enum rmqueue_mode rmqm =3D RMQUEUE_NORMAL;
=20
@@ -5086,7 +5157,9 @@ struct page *rmqueue_buddy(struct zone *preferred_zon=
e, struct zone *zone,
 			 * high-order atomic allocation in the future.
 			 */
 			if (!page && (alloc_flags & (ALLOC_OOM|ALLOC_NON_BLOCK)))
-				page =3D __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
+				page =3D __rmqueue_smallest(zone, order,
+							  MIGRATE_HIGHATOMIC,
+							  NULL);
=20
 			if (!page) {
 				spin_unlock_irqrestore(&zone->lock, flags);
@@ -6435,6 +6508,36 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int orde=
r)
 	if (defrag_mode)
 		alloc_flags |=3D ALLOC_NOFRAGMENT;
=20
+	/*
+	 * Mark callers that have a cheap fallback if the page allocator returns
+	 * NULL, so __rmqueue can refuse to taint a clean SPB when an existing
+	 * tainted SPB still has free pageblocks waiting to be evacuated.
+	 *
+	 * Two shapes qualify:
+	 *
+	 *  1. Explicit fallback declaration: __GFP_NORETRY without
+	 *     __GFP_RETRY_MAYFAIL. Used by THP, slab high-order refill,
+	 *     skb_page_frag_refill on full sockets, etc.
+	 *
+	 *  2. Atomic-context shape: no __GFP_DIRECT_RECLAIM, no __GFP_NOMEMALLOC,
+	 *     no __GFP_NOFAIL. These callers (GFP_ATOMIC, GFP_NOWAIT, including
+	 *     ALLOC_HIGHATOMIC consumers) have implicit fallbacks: drop the
+	 *     packet, demote the slab order, return ENOMEM up the slowpath,
+	 *     retry from process context with GFP_KERNEL, etc. ALLOC_HIGHATOMIC
+	 *     callers also get a second crack at the dedicated MIGRATE_HIGHATOMIC
+	 *     reserve in rmqueue_buddy after __rmqueue returns NULL.
+	 *     Tainting a 1 GiB SPB to satisfy any of them is a long-lived
+	 *     fragmentation event for short-lived data.
+	 *
+	 * __GFP_MEMALLOC (reclaim recursion) and __GFP_NOFAIL (declared cannot
+	 * fail) are excluded -- they must succeed even at the cost of taint.
+	 */
+	if ((gfp_mask & __GFP_NORETRY) && !(gfp_mask & __GFP_RETRY_MAYFAIL))
+		alloc_flags |=3D ALLOC_HIGHORDER_OPTIONAL;
+	else if (!(gfp_mask & (__GFP_DIRECT_RECLAIM | __GFP_NOMEMALLOC |
+			       __GFP_NOFAIL)))
+		alloc_flags |=3D ALLOC_HIGHORDER_OPTIONAL;
+
 	return alloc_flags;
 }
=20
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9133254b6b87..0be1b969f493 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1388,6 +1388,7 @@ const char * const vmstat_text[] =3D {
 	[I(CMA_ALLOC_SUCCESS)]			=3D "cma_alloc_success",
 	[I(CMA_ALLOC_FAIL)]			=3D "cma_alloc_fail",
 #endif
+	[I(SPB_HIGHORDER_REFUSED)]		=3D "spb_highorder_refused",
 	[I(UNEVICTABLE_PGCULLED)]		=3D "unevictable_pgs_culled",
 	[I(UNEVICTABLE_PGSCANNED)]		=3D "unevictable_pgs_scanned",
 	[I(UNEVICTABLE_PGRESCUED)]		=3D "unevictable_pgs_rescued",
--=20
2.54.0
From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id BE16D3F4DC2
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:51 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289258; cv=none;
 b=J6M7j7yKXLmT601tC5Z38GqLFPhKnT6ajFjwVCZZl63KVIL26gBjV5JuEQJdu3tx4F5nCNZMU3dok3acUlhII0LGK9Hrk71ohYv9efLlOENp32gcXP0jYwWoK3njUzxAbcc62FPRkd4u+wvGMXfqQWmCRUn+YFxGxDk2a2v8Zco=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289258; c=relaxed/simple;
	bh=D4HeCpzu1j1agSFijM4b9T2jwmvivUzDq/9xVJBuIwI=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version:Content-Type;
 b=WKCSeNhWXq2Y2vFXdM1h+kmfNqSNIR4+M5/rj/9kCn/WoDyKot06bpOXEG1rn59jRjf/2UebnYKycZE4JZJAweoRikAPrMwrNyX1lr1uvffZI709UcF++8IAHbFhs7kE9KT8Z5CwIqgVGpfiBAooyu0kjOPGRq41Yg2AXNj2+r4=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=l/gkW9W6; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="l/gkW9W6"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:Content-Type:MIME-Version:References:
	In-Reply-To:Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=EEdP7NoP36KTb44zc6aIOMe8hvnM+eOFVJfuIoIwnjw=; b=l/gkW9W67R9khRVnsUcUIkfZFs
	elpeejPCQcer+Dm8sGdBEHkuTlufAtjMCMIP7K+cRckWkcUOlxy9OXxiNKSbu7ZcsxxxV6Gdu7E9X
	zOtup70nJ+gsHbCCr6hV7QuwmcjBr5cZ3tAgbRKi7Jh1jDvgJlPtXECteABmdA4njIKRFaH96nO8b
	ZAEGCIOdQmHqldq4yk3Pl9SF0M/eLBeNl2uuKZVmWt4Qzq5qyr7iE7KlOvcENaltH+N5Y8++kLH8j
	MH9kJW3C7U7fawx9W1pYwwESxsGiCsBvBTSRPJ40XjWO69wHDTUKzT2n7Der0aIkjqgm6pMCBD2YR
	fxwmMWNw==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPM-0000000024Q-3LUf;
	Wed, 20 May 2026 11:00:28 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 27/40] mm: page_alloc: cross-migratetype buddy borrow
 within tainted SPBs
Date: Wed, 20 May 2026 10:59:33 -0400
Message-ID: <20260520150018.2491267-28-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: quoted-printable

When pages are freed via __free_one_page they're placed on the
per-SPB free_list determined by their pageblock's migratetype, not
the original allocation's migratetype.  Slab-heavy and cache-heavy
workloads both expose structural mismatches that leave non-movable
allocations stranded:

  - RECLAIMABLE pageblocks fill up densely with live slab objects
    (e.g. btrfs_inode caches), leaving very few sub-pageblock free
    fragments on the RECL free list.
  - UNMOVABLE pageblocks accumulate sparse free space from vmalloc
    and raw-alloc churn -- tens of thousands of free pages, all
    on the UNMOV free list.
  - MOVABLE-tagged pageblocks in tainted SPBs absorb freed
    page-cache and anon-LRU pages, accumulating large pools all on
    the MOVABLE free list -- invisible to non-movable demand even
    though the tainted SPB has plenty of unused space.

Add two new passes between Pass 2b and Pass 3 of __rmqueue_smallest,
both restricted to SB_TAINTED (clean SPBs must not be polluted with
cross-type mixing) and both purely transient borrows (no pageblock
relabel; the borrowed page returns to its source list when freed):

  Pass 2c -- cross-non-movable borrow.  UNMOV alloc tries the
  RECL free list; RECL alloc tries the UNMOV free list.  Restricted
  to UNMOV <-> RECL.

  Pass 2d -- cross-MOV borrow.  Non-movable alloc tries the
  MOVABLE free list of a tainted SPB.  Tradeoff: the borrowed
  UNMOV/RECL content blocks compaction of its source pageblock
  until freed; restricted to SB_TAINTED so contamination is bounded
  to one pageblock inside an already-tainted SPB.  The alternative
  -- Pass 3 tainting a fresh clean SPB -- removes a 1 GiB region
  from the clean pool, which is strictly worse for the anti-
  fragmentation invariant the series is built around.

PB_has_<requested_type> is set via __spb_set_has_type so spb_defrag
accounting reflects that the pageblock now hosts our type's
content.  PB_has_<source_type> stays set since other buddies of
that type remain.

Movable allocations don't participate (they have Pass 4) and CMA
is skipped.  Observable as SPB_ALLOC_OUTCOME_PASS_2C and
SPB_ALLOC_OUTCOME_PASS_2D on the spb_alloc_walk tracepoint.

Live measurement on a 250 GB system with btrfs root
(Stage 1 + simplified Stage 2a) at boot+7min: the number of tainted
Normal-zone SPBs had grown from a baseline of 4 to 12, even though
the existing 11 had between 825 and 87,062 free pages each, ALL on
the UNMOV list, while the workload kept allocating RECL btrfs_inode
slab pages.  Pass 2c lets those allocs absorb into the existing
UNMOV-listed free pool rather than creating fresh tainted SPBs;
Pass 2d extends the same idea to the MOV-listed free pool that
page-cache reclaim leaves behind.

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 mm/page_alloc.c | 156 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 156 insertions(+)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e4ecddb428c3..ce8cd99dd283 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2820,6 +2820,7 @@ struct page *__rmqueue_smallest(struct zone *zone, un=
signed int order,
 	struct page *page;
 	int full;
 	struct superpageblock *sb;
+	int opposite_mt;
 	/*
 	 * Category search order: 2 passes.
 	 * Movable: clean first, then tainted (pack into clean SBs).
@@ -2999,6 +3000,161 @@ struct page *__rmqueue_smallest(struct zone *zone, =
unsigned int order,
 				}
 			}
 		}
+
+		/*
+		 * Pass 2c: cross-non-movable borrow within tainted SPBs.
+		 *
+		 * If we're a non-movable alloc and Pass 1/2/2b couldn't find a
+		 * buddy on our migratetype's free list anywhere, but tainted
+		 * SPBs have free buddies on the *opposite* non-movable type's
+		 * free list, take one of those.
+		 *
+		 * Why this happens: when pages are freed, __free_one_page puts
+		 * them on the free_list determined by their pageblock's tag,
+		 * not the original allocation's migratetype. Slab caches tend
+		 * to be dense (RECL pageblocks fill up; few sub-PB fragments),
+		 * while UNMOV pageblocks accumulate sparse free space from
+		 * vmalloc/raw alloc churn. Net effect: tainted SPBs frequently
+		 * have tens of thousands of free pages all on the UNMOV list,
+		 * invisible to RECL allocs (or vice versa). Without this pass,
+		 * the alloc falls through to Pass 3 and taints a fresh clean
+		 * SPB even though the existing tainted ones have plenty of
+		 * unused space.
+		 *
+		 * We do NOT relabel the source pageblock. The buddy is taken
+		 * from @opposite_mt's free list and the splits go back on
+		 * @opposite_mt's list (page_del_and_expand uses the same mt
+		 * for delete and expand). The pageblock tag is unchanged, so
+		 * the page returns to @opposite_mt's list when freed via
+		 * __free_one_page. Effectively a borrow: the alloc takes a
+		 * physical page from a UNMOV-tagged pageblock for a RECL
+		 * use, and the page cycles back to UNMOV's list on free.
+		 *
+		 * We do set PB_has_<migratetype> via __spb_set_has_type so
+		 * spb_defrag accounting reflects that this pageblock now hosts
+		 * our migratetype's content too. PB_has_<opposite_mt> stays
+		 * set since other buddies of that type remain.
+		 *
+		 * Restricted to UNMOV =E2=86=94 RECL. Movable allocations don't
+		 * participate (they have their own Pass 4 fallback path).
+		 *
+		 * Restricted to SB_TAINTED to avoid spreading mixing into
+		 * clean SPBs.
+		 */
+		opposite_mt =3D -1;
+		if (migratetype =3D=3D MIGRATE_UNMOVABLE)
+			opposite_mt =3D MIGRATE_RECLAIMABLE;
+		else if (migratetype =3D=3D MIGRATE_RECLAIMABLE)
+			opposite_mt =3D MIGRATE_UNMOVABLE;
+
+		if (opposite_mt >=3D 0) {
+			for (full =3D SB_FULL; full < __NR_SB_FULLNESS; full++) {
+				list_for_each_entry(sb,
+					&zone->spb_lists[SB_TAINTED][full], list) {
+					int co;
+
+					if (!sb->nr_free_pages)
+						continue;
+					for (co =3D min_t(int, pageblock_order - 1,
+							NR_PAGE_ORDERS - 1);
+					     co >=3D (int)order;
+					     --co) {
+						current_order =3D co;
+						area =3D &sb->free_area[current_order];
+						page =3D get_page_from_free_area(
+							area, opposite_mt);
+						if (!page)
+							continue;
+						if (get_pageblock_isolate(page))
+							continue;
+						if (is_migrate_cma(
+						    get_pageblock_migratetype(page)))
+							continue;
+						page_del_and_expand(zone, page,
+							order, current_order,
+							opposite_mt);
+						__spb_set_has_type(page,
+							migratetype);
+						trace_mm_page_alloc_zone_locked(
+							page, order, migratetype,
+							pcp_allowed_order(order) &&
+							migratetype < MIGRATE_PCPTYPES);
+						return page;
+					}
+				}
+			}
+		}
+
+		/*
+		 * Pass 2d: cross-MOV borrow within tainted SPBs.
+		 *
+		 * If Pass 1/2/2b/2c all failed, the next step is Pass 3
+		 * which would taint a fresh clean SPB. Before that, try
+		 * to borrow an individual buddy from a tainted SPB's
+		 * MIGRATE_MOVABLE free list.
+		 *
+		 * Tainted SPBs accumulate large amounts of free space on
+		 * the MOV free list (e.g. reclaimed page-cache pages
+		 * whose pageblock tag is MOVABLE). Pass 1 cannot see
+		 * those for non-movable allocs, Pass 2/2b cannot claim a
+		 * whole pageblock when sb->nr_free =3D=3D 0, and Pass 2c is
+		 * restricted to UNMOV<->RECL. The result is a tainted
+		 * SPB with tens to hundreds of thousands of free pages
+		 * all unreachable from non-movable demand.
+		 *
+		 * Borrow semantics mirror Pass 2c: take a buddy from the
+		 * MOVABLE free list without relabeling the source
+		 * pageblock. The page is used for the requesting non-
+		 * movable mt for the lifetime of the allocation, then on
+		 * free returns to the MOVABLE list.
+		 *
+		 * Cost: the borrowed UNMOV/RECL content blocks
+		 * compaction of its source pageblock until freed.
+		 * Restricted to SB_TAINTED so the contamination is
+		 * bounded to an already-tainted SPB; the alternative
+		 * (Pass 3) taints a fresh clean SPB and removes a 1 GiB
+		 * region from the clean pool, which is strictly worse.
+		 *
+		 * Skipped for movable allocs (they have Pass 4) and for
+		 * CMA allocs.
+		 */
+		if (!movable && !is_migrate_cma(migratetype)) {
+			for (full =3D SB_FULL; full < __NR_SB_FULLNESS; full++) {
+				list_for_each_entry(sb,
+					&zone->spb_lists[SB_TAINTED][full], list) {
+					int co;
+
+					if (!sb->nr_free_pages)
+						continue;
+					for (co =3D min_t(int, pageblock_order - 1,
+							NR_PAGE_ORDERS - 1);
+					     co >=3D (int)order;
+					     --co) {
+						current_order =3D co;
+						area =3D &sb->free_area[current_order];
+						page =3D get_page_from_free_area(
+							area, MIGRATE_MOVABLE);
+						if (!page)
+							continue;
+						if (get_pageblock_isolate(page))
+							continue;
+						if (is_migrate_cma(
+						    get_pageblock_migratetype(page)))
+							continue;
+						page_del_and_expand(zone, page,
+							order, current_order,
+							MIGRATE_MOVABLE);
+						__spb_set_has_type(page,
+							migratetype);
+						trace_mm_page_alloc_zone_locked(
+							page, order, migratetype,
+							pcp_allowed_order(order) &&
+							migratetype < MIGRATE_PCPTYPES);
+						return page;
+					}
+				}
+			}
+		}
 	}
=20
 	/*
--=20
2.54.0

From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id AB09C22301
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:42 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289252; cv=none;
 b=AZDvdJc2rqUZE7FVyomPrcWbYywi8rQ4UPwTk44UOJIVldZv53A40MHbDDyWSGnsgMZc14K5KP1cYTYGFpBzr545tXBjk3etxMgaDRmtc4ChdZgLjn+u/gpLZDPG9b6NwVia/MZeUpbGwddSgP95Y4LLas+P2pIyhv3dd6gISyI=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289252; c=relaxed/simple;
	bh=mh+GyO5+xCkv8g1BVuh3W2WzpJ6cVffVsqpWqRXCOGo=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=hFJ267OKFw13pseSClZmAJfh99/ZC8e/7ynteUmInCnTXkgxMOkRjE+HxtbV4C6qFi8iJMZWEGnOZ6QsiXYz8oFMhScoz8wlgAyPmM6xNweEeElMv20O9Th35//+s03DWWSs/y8vcw7wn2c0RZKQtv3iX4zS+ZsmgQmcy6YZTdE=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=IK62loQP; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="IK62loQP"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:MIME-Version:References:In-Reply-To:
	Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-Type:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=nWY/9Z0pSPkko20mW+Mqac0ySxLdrDURge4Q5L7V2go=; b=IK62loQP49oPfNwokflVkxXfOo
	mElq46Ss0KHVp/j+YQT1L+D0WvdjTPcvsyR9coYN87gyH41otH1v+Pvdo6P7427D9//64R0+OeYd/
	vWY+ldU5pW+x1Cop4cRuC8zoN06N4FUCTIA1grRMwzQ7gZJupKaOE+judn8e7o1ss2LGN/c1svI5L
	7iOom7rrMiLtvuJvj8WG2aNd640hC/fRKju32RjICED7+pu31J9uGFy98XbBkZ+wXiaUb897CKZUu
	QkDAzdkxAo2SdpnbUDR/GPqWFzb1koiqHdVMS3pIDjFl8YlTnk2ynetBMHZT58OLwONQyGhueYRlP
	3TuaQwvQ==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPM-0000000024Q-3SC6;
	Wed, 20 May 2026 11:00:28 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 28/40] mm: page_alloc: drive slab shrink from SPB
 anti-fragmentation pressure
Date: Wed, 20 May 2026 10:59:34 -0400
Message-ID: <20260520150018.2491267-29-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

GFP_KERNEL slab callers (dentry/inode/page-table caches) have no
"cheap fallback" the ALLOC_HIGHORDER_OPTIONAL refusal gate can use to
return NULL, and reach __rmqueue_claim/_steal whenever the tainted-SPB
pool runs out of headroom.  Without an external pressure release valve,
sustained slab growth eventually drains the tainted pool, every clean
SPB starts absorbing one taint, and fragmentation grows until equilibrium
at a much higher tainted-SPB count than the workload memory-footprint
warrants.

The pageblock-evacuation worker (spb_evacuate_for_order /
queue_spb_evacuate) already runs from these pressure points, but it can
only consolidate movable pages out of tainted SPBs.  Slab content
stranded in tainted SPBs blocks free pageblocks from re-coalescing and
forces new taints when movable supply runs out.

Add a parallel slab-shrink mechanism that mirrors the evacuation
infrastructure: a per-pgdat irq_work that bridges from allocator-lock
context out to a workqueue, a single work_struct embedded in the pgdat
(shrink_slab() is node-scoped, so no pool of request descriptors is
needed), and queue_spb_slab_shrink() with queue_work()-native
single-flight semantics (queue_work() absorbs per-alloc bursts at
near-zero cost via test-and-set on WORK_STRUCT_PENDING_BIT).  The worker
calls shrink_slab() with the zone's nid, walking node-local shrinkers
from DEF_PRIORITY toward 0 until either no shrinker reports progress or
SPB_SLAB_SHRINK_TARGET_OBJS objects (4 * pageblock_nr_pages) have been
freed.

Trigger sites:

  * Demand, alongside the existing queue_spb_evacuate calls:
    __rmqueue_smallest pre-Pass-3 when spb_tainted_walk reports
    saw_below_reserve; __rmqueue_claim when a non-movable claim is
    about to taint a clean SPB.

  * Supply, unconditional at the end of spb_evacuate_for_order:
    keeps headroom for the next burst when MOV supply runs out.

  * Proactive high-water, on the success paths of Pass 1
    SB_TAINTED / Pass 2 / 2b / 2c / 2d: when a non-movable
    allocation consumes from a tainted SPB whose nr_free_pages
    has fallen below spb_tainted_reserve worth of pages.
    Demand-trigger placement alone is too late -- tainted SPBs
    can keep absorbing pressure for tens of minutes before
    exhaustion, by which point the only option left is to taint
    a fresh SPB.

The proactive high-water trigger is implemented by a helper,
spb_react_to_tainted_alloc(), which is shared with the per-SPB defrag
worker -- since both pressure-release mechanisms apply to the same
tainted-SPB state, the helper invokes spb_maybe_start_defrag() alongside
queue_spb_slab_shrink() under the same high-water gate. The defrag
worker's own cooldown gate inside spb_needs_defrag() makes this cheap to
call from every relevant success path. The end-of-spb_evacuate_for_order
trigger calls queue_spb_slab_shrink() directly.

A last-chance defrag walk runs just before the Pass 3 fall-through to a
clean SPB: list_for_each_entry over zone->spb_lists[SB_TAINTED] calling
spb_maybe_start_defrag() on each. This catches tainted SPBs that are
stuck in expired-cooldown state because no allocator activity recently
touched them (spb_update_list, the routine event-driven trigger, only
fires on bucket transitions).

Per-pgdat init (init_pgdat_spb_state) wires irq_work and work_struct
fields at boot via the pageblock_evacuate_init late_initcall and
lazily on the memory-hotplug path; without it the trigger sites'
`.func !=3D NULL` gate would leave the mechanism inert system-wide.

shrink_slab is location-agnostic -- it doesn't know about SPBs -- but
since slab pages live in already-tainted SPBs (that is where they were
allocated), the freed pages naturally land back in the tainted pool,
restoring headroom without spreading the taint to clean SPBs.

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 include/linux/mmzone.h        |  12 ++
 include/linux/vm_event_item.h |   5 +
 mm/internal.h                 |   2 +
 mm/mm_init.c                  |   8 +
 mm/page_alloc.c               | 281 +++++++++++++++++++++++++++++++---
 mm/vmstat.c                   |   2 +
 6 files changed, 290 insertions(+), 20 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index db719335b32a..46eb5012d18b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1670,6 +1670,18 @@ typedef struct pglist_data {
 	struct workqueue_struct *evacuate_wq;
 	struct llist_head spb_evac_pending;
 	struct irq_work spb_evac_irq_work;
+
+	/*
+	 * SPB-driven slab reclaim: single work item per pgdat (shrink_slab
+	 * is node-scoped, so one work in-flight per node is the max).
+	 * queue_work() gives us single-flight semantics for free -- fresh
+	 * triggers no-op while a pass is in progress.
+	 *
+	 * irq_work defers the queue_work() call outside the allocator's
+	 * lock context to avoid pool->lock vs hrtimer_bases.lock inversion.
+	 */
+	struct irq_work spb_slab_shrink_irq_work;
+	struct work_struct spb_slab_shrink_work;
 #endif
 	/*
 	 * This is a per-node reserve of pages that are not available
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 4a8513d5fc3e..87c82f9d7fb7 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -81,6 +81,11 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 					 * a clean SPB clean when a tainted SPB
 					 * still has free pageblocks
 					 */
+		SPB_SLAB_SHRINK_QUEUED,	/*
+					 * queued a deferred slab shrink to
+					 * reclaim space inside tainted SPBs
+					 */
+		SPB_SLAB_SHRINK_RAN,	/* slab shrink worker ran a pass */
 		UNEVICTABLE_PGCULLED,	/* culled to noreclaim list */
 		UNEVICTABLE_PGSCANNED,	/* scanned for reclaimability */
 		UNEVICTABLE_PGRESCUED,	/* rescued from noreclaim list */
diff --git a/mm/internal.h b/mm/internal.h
index f52575202a96..9854d76ebf36 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1103,10 +1103,12 @@ void init_cma_reserved_pageblock(struct page *page);
=20
 #ifdef CONFIG_COMPACTION
 void init_superpageblock_defrag(struct superpageblock *sb);
+void init_pgdat_spb_state(pg_data_t *pgdat);
 void superpageblock_clear_has_movable(struct zone *zone, struct page *page=
);
 void superpageblock_set_has_movable(struct zone *zone, struct page *page);
 #else
 static inline void init_superpageblock_defrag(struct superpageblock *sb) {}
+static inline void init_pgdat_spb_state(pg_data_t *pgdat) {}
 static inline void superpageblock_clear_has_movable(struct zone *zone,
 						    struct page *page) {}
 static inline void superpageblock_set_has_movable(struct zone *zone,
diff --git a/mm/mm_init.c b/mm/mm_init.c
index ee5dcd89e31e..af71ef8393c6 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1695,6 +1695,14 @@ void __meminit resize_zone_superpageblocks(struct zo=
ne *zone)
 	if (!zone->superpageblocks)
 		init_zone_spb_lists(zone);
=20
+	/*
+	 * Lazily initialize pgdat-level SPB state (evacuate_wq, evac llist,
+	 * shrink work). pageblock_evacuate_init() is a late_initcall and
+	 * only walks for_each_online_node, so a node that gets its first
+	 * memory via hotplug needs this here. Idempotent.
+	 */
+	init_pgdat_spb_state(zone->zone_pgdat);
+
 	alloc_size =3D new_nr_sbs * sizeof(struct superpageblock);
 	new_sbs =3D kvmalloc_node(alloc_size, GFP_KERNEL | __GFP_ZERO, nid);
 	if (!new_sbs) {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ce8cd99dd283..6dadfe9d59d9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -743,6 +743,7 @@ static bool spb_evacuate_for_order(struct zone *zone, u=
nsigned int order,
 				  int migratetype);
 static void queue_spb_evacuate(struct zone *zone, unsigned int order,
 			       int migratetype);
+static void queue_spb_slab_shrink(struct zone *zone);
 #else
 static inline void spb_maybe_start_defrag(struct superpageblock *sb) {}
 static inline bool spb_needs_defrag(struct superpageblock *sb) { return fa=
lse; }
@@ -753,6 +754,7 @@ static inline bool spb_evacuate_for_order(struct zone *=
zone, unsigned int order,
 }
 static inline void queue_spb_evacuate(struct zone *zone, unsigned int orde=
r,
 				      int migratetype) {}
+static inline void queue_spb_slab_shrink(struct zone *zone) {}
 #endif
=20
 #ifdef CONFIG_CONTIG_ALLOC
@@ -2706,6 +2708,47 @@ static inline u16 spb_tainted_reserve(const struct s=
uperpageblock *sb)
 	return max_t(u16, SPB_TAINTED_RESERVE_MIN, sb->total_pageblocks / 32);
 }
=20
+/*
+ * High-water threshold for proactively kicking the slab shrinker. When a
+ * non-movable allocation consumes from a tainted SPB whose total free
+ * pages have fallen below spb_tainted_reserve worth of pages, queue a
+ * shrink so we start freeing slab memory before the SPB is exhausted.
+ *
+ * Compared against nr_free_pages rather than nr_free (whole pageblocks):
+ * sub-pageblock allocations and fragmented free space don't move the
+ * pageblock count, but they do consume the SPB's freeable capacity, and
+ * we can't assume slab reclaim will produce whole pageblocks either.
+ */
+static inline bool spb_below_shrink_high_water(const struct superpageblock=
 *sb)
+{
+	return sb->nr_free_pages <
+		(unsigned long)spb_tainted_reserve(sb) * pageblock_nr_pages;
+}
+
+/*
+ * spb_react_to_tainted_alloc - kick reclaim machinery on a tainted-SPB al=
loc.
+ *
+ * Called from each PASS_1/2/2B/2C/2D success path after a successful
+ * allocation against a tainted SPB. If the SPB is below its shrink
+ * high-water mark, queue the SPB-driven slab shrink and try to start
+ * the per-SPB defrag worker. Slab shrink is single-flight and defrag
+ * has its own cooldown gate, so this is cheap on every such allocation.
+ *
+ * Skips quickly when the SPB is not tainted (e.g. movable allocation
+ * landing on a clean SPB) or when the high-water mark hasn't been
+ * crossed.
+ */
+static inline void spb_react_to_tainted_alloc(struct superpageblock *sb,
+					      struct zone *zone)
+{
+	if (spb_get_category(sb) !=3D SB_TAINTED)
+		return;
+	if (!spb_below_shrink_high_water(sb))
+		return;
+	queue_spb_slab_shrink(zone);
+	spb_maybe_start_defrag(sb);
+}
+
 /*
  * On systems with many superpageblocks, we can afford to "write off"
  * tainted superpageblocks by aggressively packing unmovable/reclaimable
@@ -2891,6 +2934,8 @@ struct page *__rmqueue_smallest(struct zone *zone, un=
signed int order,
 				page_del_and_expand(zone, page,
 					order, current_order,
 					migratetype);
+				if (cat =3D=3D SB_TAINTED)
+					spb_react_to_tainted_alloc(sb, zone);
 				trace_mm_page_alloc_zone_locked(
 					page, order, migratetype,
 					pcp_allowed_order(order) &&
@@ -2910,6 +2955,8 @@ struct page *__rmqueue_smallest(struct zone *zone, un=
signed int order,
 					page_del_and_expand(zone, page,
 						order, current_order,
 						migratetype);
+					if (cat =3D=3D SB_TAINTED)
+						spb_react_to_tainted_alloc(sb, zone);
 					trace_mm_page_alloc_zone_locked(
 						page, order, migratetype,
 						pcp_allowed_order(order) &&
@@ -2955,6 +3002,7 @@ struct page *__rmqueue_smallest(struct zone *zone, un=
signed int order,
 					page =3D claim_whole_block(zone, page,
 						current_order, order,
 						migratetype, MIGRATE_MOVABLE);
+					spb_react_to_tainted_alloc(sb, zone);
 					trace_mm_page_alloc_zone_locked(
 						page, order, migratetype,
 						pcp_allowed_order(order) &&
@@ -2992,6 +3040,7 @@ struct page *__rmqueue_smallest(struct zone *zone, un=
signed int order,
 						0, true);
 					if (!page)
 						continue;
+					spb_react_to_tainted_alloc(sb, zone);
 					trace_mm_page_alloc_zone_locked(
 						page, order, migratetype,
 						pcp_allowed_order(order) &&
@@ -3075,6 +3124,7 @@ struct page *__rmqueue_smallest(struct zone *zone, un=
signed int order,
 							opposite_mt);
 						__spb_set_has_type(page,
 							migratetype);
+						spb_react_to_tainted_alloc(sb, zone);
 						trace_mm_page_alloc_zone_locked(
 							page, order, migratetype,
 							pcp_allowed_order(order) &&
@@ -3146,6 +3196,7 @@ struct page *__rmqueue_smallest(struct zone *zone, un=
signed int order,
 							MIGRATE_MOVABLE);
 						__spb_set_has_type(page,
 							migratetype);
+						spb_react_to_tainted_alloc(sb, zone);
 						trace_mm_page_alloc_zone_locked(
 							page, order, migratetype,
 							pcp_allowed_order(order) &&
@@ -3163,9 +3214,34 @@ struct page *__rmqueue_smallest(struct zone *zone, u=
nsigned int order,
 	 * showed that some tainted SPB is below its reserve threshold of
 	 * free pageblocks, kick deferred evacuation so future allocations
 	 * have a movable-evicted home in an already-tainted SPB.
+	 *
+	 * Queue slab shrink alongside evacuation: even when movable evac
+	 * succeeds, shrinking slab in parallel keeps headroom available
+	 * for the next burst, when the movable supply may have run out.
 	 */
-	if (walk && walk->saw_below_reserve)
+	if (walk && walk->saw_below_reserve) {
 		queue_spb_evacuate(zone, order, migratetype);
+		queue_spb_slab_shrink(zone);
+	}
+
+	/*
+	 * Last-chance defrag trigger before tainting a fresh clean SPB.
+	 * Walk the tainted-SPB list and try to wake the per-SPB defrag
+	 * worker on each. Catches SPBs that are stuck in expired-cooldown
+	 * state because no allocator activity has touched them recently
+	 * (the routine event-driven trigger from spb_update_list only
+	 * fires on bucket transitions, not on every alloc). Once the
+	 * cooldown has expired, spb_maybe_start_defrag() will requeue
+	 * work; otherwise the gate inside spb_needs_defrag() no-ops
+	 * cheaply. Bounded by nr_tainted_spbs and only runs when we are
+	 * already on the slow path of fragmenting the clean pool.
+	 */
+	for (full =3D SB_FULL; full < __NR_SB_FULLNESS; full++) {
+		list_for_each_entry(sb,
+			&zone->spb_lists[SB_TAINTED][full], list) {
+			spb_maybe_start_defrag(sb);
+		}
+	}
=20
 	/* Pass 3: whole pageblock from empty superpageblocks */
 	list_for_each_entry(sb, &zone->spb_empty, list) {
@@ -4001,12 +4077,17 @@ __rmqueue_claim(struct zone *zone, int order, int s=
tart_migratetype,
 			 * for a non-movable allocation -- this taints a fresh
 			 * SPB.  Defer an evacuation pass over the tainted pool
 			 * so subsequent allocations can reclaim freed
-			 * pageblocks instead of repeating this fallback.
+			 * pageblocks instead of repeating this fallback. Also
+			 * kick a slab shrink so the tainted pool gets fresh
+			 * headroom (movable evac alone can't free pages held
+			 * by slab).
 			 */
 			if (cat_search[c] !=3D SB_SEARCH_PREFERRED &&
-			    start_migratetype !=3D MIGRATE_MOVABLE)
+			    start_migratetype !=3D MIGRATE_MOVABLE) {
 				queue_spb_evacuate(zone, order,
 						   start_migratetype);
+				queue_spb_slab_shrink(zone);
+			}
=20
 			page =3D try_to_claim_block(zone, page, current_order,
 						  order, start_migratetype,
@@ -9301,6 +9382,133 @@ static void queue_spb_evacuate(struct zone *zone, u=
nsigned int order,
 	irq_work_queue(&pgdat->spb_evac_irq_work);
 }
=20
+/*
+ * SPB-driven slab reclaim.
+ *
+ * When tainted SPBs run low on free pageblocks under sustained
+ * non-movable pressure (slab inode/dentry/page-table caches), the
+ * pageblock-evacuation worker can only consolidate *movable* pages out
+ * of tainted SPBs. Non-movable slab content stays put, so once the
+ * movable supply is drained the only way to recover headroom in a
+ * tainted SPB is to shrink the slab caches whose pages live there.
+ *
+ * shrink_slab() is node-scoped, so one work item per pgdat is enough:
+ * a single embedded work_struct. queue_work() returns false while the
+ * work is still pending, so bursts of triggers collapse into at most one
+ * queued pass (plus the pass that may already be running).
+ *
+ * shrink_slab() itself is location-agnostic -- it walks all registered
+ * shrinkers and frees objects whose backing pages may live in any
+ * zone or SPB. That is fine here because any slab page reclaimed
+ * frees space the next allocation can reuse without tainting a fresh
+ * SPB. We pass the pgdat's nid so node-aware shrinkers prefer caches
+ * local to the pressured node.
+ */
+
+/*
+ * Per-invocation budget: walk shrinkers from DEF_PRIORITY (scan 1/4096
+ * of each cache) down toward 0 (full scan), stopping when shrinkers
+ * report no more progress or we have freed a pageblock-sized chunk.
+ * The trigger frequency is what controls overall reclaim rate; this
+ * loop just bounds latency per worker run.
+ */
+#define SPB_SLAB_SHRINK_TARGET_OBJS	(pageblock_nr_pages * 4UL)
+
+static void spb_slab_shrink_work_fn(struct work_struct *work)
+{
+	pg_data_t *pgdat =3D container_of(work, pg_data_t,
+					spb_slab_shrink_work);
+	int nid =3D pgdat->node_id;
+	unsigned long freed =3D 0;
+	int prio =3D DEF_PRIORITY;
+
+	count_vm_event(SPB_SLAB_SHRINK_RAN);
+
+	while (freed < SPB_SLAB_SHRINK_TARGET_OBJS && prio >=3D 0) {
+		unsigned long delta =3D 0;
+		struct mem_cgroup *memcg;
+
+		/*
+		 * Walk the memcg hierarchy starting at the root, the same
+		 * pattern shrink_one_node uses for global slab reclaim.
+		 * Some cgroups may not be present on the node that is
+		 * being shrunk, but many allocators will use any memory.
+		 */
+		memcg =3D mem_cgroup_iter(NULL, NULL, NULL);
+		do {
+			delta +=3D shrink_slab(GFP_KERNEL, nid, memcg, prio);
+		} while ((memcg =3D mem_cgroup_iter(NULL, memcg, NULL)) !=3D NULL);
+
+		if (!delta)
+			break;
+		freed +=3D delta;
+		/*
+		 * Increase aggressiveness each round; DEF_PRIORITY scans
+		 * a small slice of each cache, prio 0 scans the whole
+		 * thing. Most workloads find enough at one or two
+		 * iterations below DEF_PRIORITY.
+		 */
+		prio--;
+	}
+}
+
+/**
+ * spb_slab_shrink_irq_work_fn - IRQ work callback to safely queue slab sh=
rink
+ * @work: the irq_work embedded in pg_data_t
+ *
+ * queue_work() can create a lock ordering issue when called from inside
+ * the page allocator under hrtimer_bases.lock (via debug_objects_fill_pool
+ * allocations during hrtimer re-enqueue), because pool->lock depends on
+ * hrtimer_bases.lock through the scheduler path. Use irq_work to defer
+ * the queue_work() call to a safe context, matching the pattern used by
+ * spb_evac_irq_work_fn and spb_defrag_irq_work_fn.
+ */
+static void spb_slab_shrink_irq_work_fn(struct irq_work *work)
+{
+	pg_data_t *pgdat =3D container_of(work, struct pglist_data,
+					spb_slab_shrink_irq_work);
+
+	if (pgdat->evacuate_wq &&
+	    queue_work(pgdat->evacuate_wq, &pgdat->spb_slab_shrink_work))
+		count_vm_event(SPB_SLAB_SHRINK_QUEUED);
+}
+
+/**
+ * queue_spb_slab_shrink - schedule deferred slab shrink for SPB pressure
+ * @zone: zone whose tainted-SPB pool is running low
+ *
+ * Single-flight via queue_work(): if the work is already pending, it
+ * returns false and we no-op. There is no time-based
+ * throttle -- the rate at which fresh shrink runs can fire is bounded
+ * by how fast the worker completes (one full pass freeing up to
+ * SPB_SLAB_SHRINK_TARGET_OBJS objects).
+ *
+ * Callable from any context: the irq_work deferral ensures that
+ * queue_work() runs outside the allocator's lock context, avoiding
+ * pool->lock vs hrtimer_bases.lock ordering issues.
+ *
+ * Pairs with queue_spb_evacuate: evacuation moves movable pages out
+ * of tainted SPBs to free up whole pageblocks; this shrinks slab to
+ * free up the remaining (non-movable) pages. We queue both because
+ * even when movable evacuation succeeds, shrinking slab in parallel
+ * keeps headroom available for the next burst, when movable supply
+ * may have run out.
+ */
+static void queue_spb_slab_shrink(struct zone *zone)
+{
+	pg_data_t *pgdat =3D zone->zone_pgdat;
+
+	if (!pgdat->spb_slab_shrink_irq_work.func)
+		return;
+
+	/* Defer queue_work() to irq context to keep pool->lock acquisition
+	 * out of the allocator's lock context (avoids pool->lock vs
+	 * hrtimer_bases.lock inversion via debug_objects_fill_pool).
+	 * No throttle here: queue_work()'s built-in single-flight semantics
+	 * absorb per-alloc bursts at near-zero cost. */
+	irq_work_queue(&pgdat->spb_slab_shrink_irq_work);
+}
+
 /*
  * Background superpageblock defragmentation.
  *
@@ -9562,6 +9770,42 @@ static void spb_maybe_start_defrag(struct superpageb=
lock *sb)
 	}
 }
=20
+/**
+ * init_pgdat_spb_state - allocate evacuate_wq and initialize SPB work str=
ucts
+ * @pgdat: node being initialized
+ *
+ * Idempotent: returns immediately if pgdat->evacuate_wq is already set
+ * (a previous call succeeded). Best-effort: if alloc_workqueue() fails,
+ * pgdat->evacuate_wq stays NULL and SPB defrag/shrink for this node is
+ * disabled (the runtime callers all check for NULL evacuate_wq).
+ *
+ * Called from pageblock_evacuate_init() at boot for every online node, and
+ * lazily from resize_zone_superpageblocks() when memory hotplug brings up
+ * a node whose pgdat-level SPB state was never initialized (because the
+ * node was empty at boot -- late_initcall only walks for_each_online_node=
).
+ */
+void init_pgdat_spb_state(pg_data_t *pgdat)
+{
+	char name[32];
+
+	if (pgdat->evacuate_wq)
+		return;
+
+	snprintf(name, sizeof(name), "kevacuate/%d", pgdat->node_id);
+	pgdat->evacuate_wq =3D alloc_workqueue(name, WQ_MEM_RECLAIM, 1);
+	if (!pgdat->evacuate_wq) {
+		pr_warn("Failed to create evacuate workqueue for node %d\n",
+			pgdat->node_id);
+		return;
+	}
+
+	init_llist_head(&pgdat->spb_evac_pending);
+	init_irq_work(&pgdat->spb_evac_irq_work, spb_evac_irq_work_fn);
+	init_irq_work(&pgdat->spb_slab_shrink_irq_work,
+		      spb_slab_shrink_irq_work_fn);
+	INIT_WORK(&pgdat->spb_slab_shrink_work, spb_slab_shrink_work_fn);
+}
+
 static int __init pageblock_evacuate_init(void)
 {
 	int nid, i;
@@ -9571,22 +9815,14 @@ static int __init pageblock_evacuate_init(void)
 	for (i =3D 0; i < NR_SPB_EVAC_REQUESTS; i++)
 		llist_add(&spb_evac_pool[i].free_node, &spb_evac_freelist);
=20
-	/* Create a per-pgdat workqueue */
+	/* Initialize per-pgdat SPB workqueue and work structs */
 	for_each_online_node(nid) {
 		pg_data_t *pgdat =3D NODE_DATA(nid);
-		char name[32];
 		int z;
=20
-		snprintf(name, sizeof(name), "kevacuate/%d", nid);
-		pgdat->evacuate_wq =3D alloc_workqueue(name, WQ_MEM_RECLAIM, 1);
-		if (!pgdat->evacuate_wq) {
-			pr_warn("Failed to create evacuate workqueue for node %d\n", nid);
+		init_pgdat_spb_state(pgdat);
+		if (!pgdat->evacuate_wq)
 			continue;
-		}
-
-		init_llist_head(&pgdat->spb_evac_pending);
-		init_irq_work(&pgdat->spb_evac_irq_work,
-			      spb_evac_irq_work_fn);
=20
 		/* Initialize per-superpageblock defrag work structs */
 		for (z =3D 0; z < MAX_NR_ZONES; z++) {
@@ -9596,12 +9832,8 @@ static int __init pageblock_evacuate_init(void)
 			if (!zone->superpageblocks)
 				continue;
=20
-			for (j =3D 0; j < zone->nr_superpageblocks; j++) {
-				INIT_WORK(&zone->superpageblocks[j].defrag_work,
-					  spb_defrag_work_fn);
-				init_irq_work(&zone->superpageblocks[j].defrag_irq_work,
-					      spb_defrag_irq_work_fn);
-			}
+			for (j =3D 0; j < zone->nr_superpageblocks; j++)
+				init_superpageblock_defrag(&zone->superpageblocks[j]);
 		}
 	}
=20
@@ -9798,6 +10030,15 @@ static bool spb_evacuate_for_order(struct zone *zon=
e, unsigned int order,
 			did_evacuate =3D true;
 	}
=20
+	/*
+	 * Always kick a slab shrink after an evacuation pass -- even when
+	 * MOV evacuation succeeded. Slab content stranded inside tainted
+	 * SPBs can only be freed by shrinking the cache; doing it now keeps
+	 * headroom available for the next burst, when the MOV supply may
+	 * have run out and evac alone would have nothing to do.
+	 */
+	queue_spb_slab_shrink(zone);
+
 	return did_evacuate;
 }
 #endif /* CONFIG_COMPACTION */
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 0be1b969f493..5fd9a150d0a5 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1389,6 +1389,8 @@ const char * const vmstat_text[] =3D {
 	[I(CMA_ALLOC_FAIL)]			=3D "cma_alloc_fail",
 #endif
 	[I(SPB_HIGHORDER_REFUSED)]		=3D "spb_highorder_refused",
+	[I(SPB_SLAB_SHRINK_QUEUED)]		=3D "spb_slab_shrink_queued",
+	[I(SPB_SLAB_SHRINK_RAN)]		=3D "spb_slab_shrink_ran",
 	[I(UNEVICTABLE_PGCULLED)]		=3D "unevictable_pgs_culled",
 	[I(UNEVICTABLE_PGSCANNED)]		=3D "unevictable_pgs_scanned",
 	[I(UNEVICTABLE_PGRESCUED)]		=3D "unevictable_pgs_rescued",
--=20
2.54.0
From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 2F9483EC2E3
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:42 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289250; cv=none;
 b=I39yQUYmGUZUMe4LrFblqQZPG0lQ8faTIgiQtf9tGudBxiZWIbs0kAARxw5WZn0Rm6gA+MMQPFvMYOwHOrcBMNZIFPS4F7dYMEloCJIF/8flVpPG+1Vj+U+xEWrt6bPahoTs6QOstEUF+dB/nN/ngDpgiv/HRm74+H/hZy7ROQU=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289250; c=relaxed/simple;
	bh=K1m8knU3cvytwyuYRKi68yAbe5m5MdpIE8ofs+MRjoY=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=bRnPzZRD0c9D4VnIQijz3qe+PU4sFNgHAFUurlX+Qt5AqAkv6WCsMFwzfsIM41Y/qpkupYzMLBb5o/d/7+grD4hsG/L3ehyfugsMifw1U7592YewZmlNucovJqgYNt5IdUOR9xnIibBkEVSqZUM4KlL5S3AavbCPjL+4fKVPb9U=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=AHZz4tEu; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="AHZz4tEu"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:MIME-Version:References:In-Reply-To:
	Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-Type:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=0ilHxWrxV0w+HHwXx+JoGzn2at6PBghAZehbCb2Te4A=; b=AHZz4tEuTx0U9Sznit2wEgRbR7
	kIvcRHDUgXlwmnrS/+174PZcFKRBy93M/HLDHenQAhDMPze7gSZbeC2gLWgudZk1jJlS9pHOBKdIh
	CstV3FZyUzOzqKxN0sGzUsfY0XGGGbrNFgCL1Ku2qHDgXJdLvjZeujedUIihU6inXov8FBQlGfu6l
	LXxHR6UdI9dgHF7tWyuKAvD7xlYiMoAjG6iNtyJFemYSaRXIPeCmVeivLzOgy0UZYpzx2OF2lq5Sm
	3BF/hyhACc+UrA7+qlzivHP8HaYSj0RMek21WPIfc8cGOf17aQlXmDdr/f/gAis8NPU87wrSaVBxb
	i5nJuP9Q==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPM-0000000024Q-3ZRl;
	Wed, 20 May 2026 11:00:28 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 29/40] mm: page_reporting: walk per-superpageblock free
 lists
Date: Wed, 20 May 2026 10:59:35 -0400
Message-ID: <20260520150018.2491267-30-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

After the SPB rework, free pages live on per-superpageblock free lists
(zone->superpageblocks[i].free_area[order].free_list[mt]) rather than
on a single zone-level list. page_reporting_cycle() was still walking
the now-empty zone-level list, so virtio-balloon free page reporting
silently became a no-op on systems with superpageblocks: no pages were
ever isolated, no MADV_DONTNEED hints reached the host, and any guest
memory backing balloon-eligible pages stayed resident on the host.

Refactor the per-list walk into page_reporting_cycle_list() taking an
explicit list_head and a pointer to the shared budget, then have
page_reporting_cycle() iterate every SPB in the zone for the requested
(order, mt). The budget is shared across the whole walk so a fragmented
zone does not multiply the rate-limit. The zone-level shadow nr_free
(maintained by __add_to_free_list / __del_page_from_free_list) is used
both for the early-out and for the budget total; that shadow already
sums all SPBs.

Hold the memory hotplug read lock around the SPB walk.
resize_zone_superpageblocks() swaps zone->superpageblocks under
zone->lock and immediately kvfree()s the old array with no RCU grace
period. The helper drops zone->lock during prdev->report() (which can
sleep) and resumes operating on a list_head pointer that lives inside
an SPB; without get_online_mems(), that pointer can become a dangling
reference if hotplug runs in the unlock window.

The zone-level fallback path is retained for zones whose SPB array has
not yet been allocated (e.g. unpopulated hotplug zones).

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 mm/page_reporting.c | 149 ++++++++++++++++++++++++++------------------
 1 file changed, 90 insertions(+), 59 deletions(-)

diff --git a/mm/page_reporting.c b/mm/page_reporting.c
index 7418f2e500bb..836d97879b8d 100644
--- a/mm/page_reporting.c
+++ b/mm/page_reporting.c
@@ -6,6 +6,7 @@
 #include <linux/export.h>
 #include <linux/module.h>
 #include <linux/delay.h>
+#include <linux/memory_hotplug.h>
 #include <linux/scatterlist.h>
=20
 #include "page_reporting.h"
@@ -138,116 +139,68 @@ page_reporting_drain(struct page_reporting_dev_info =
*prdev,
 }
=20
 /*
- * The page reporting cycle consists of 4 stages, fill, report, drain, and
- * idle. We will cycle through the first 3 stages until we cannot obtain a
- * full scatterlist of pages, in that case we will switch to idle.
+ * Walk a single free_list (zone-level or per-superpageblock), pulling
+ * unreported pages into the scatterlist and calling prdev->report() each
+ * time the scatterlist fills. Updates *budget and *offset across calls so
+ * the caller can spread one budget across multiple lists (e.g. one per SP=
B).
  */
 static int
-page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *z=
one,
-		     unsigned int order, unsigned int mt,
-		     struct scatterlist *sgl, unsigned int *offset)
+page_reporting_cycle_list(struct page_reporting_dev_info *prdev,
+			  struct zone *zone, struct list_head *list,
+			  unsigned int order, struct scatterlist *sgl,
+			  unsigned int *offset, long *budget)
 {
-	struct free_area *area =3D &zone->free_area[order];
-	struct list_head *list =3D &area->free_list[mt];
 	unsigned int page_len =3D PAGE_SIZE << order;
 	struct page *page, *next;
-	long budget;
 	int err =3D 0;
=20
-	/*
-	 * Perform early check, if free area is empty there is
-	 * nothing to process so we can skip this free_list.
-	 */
 	if (list_empty(list))
-		return err;
+		return 0;
=20
 	spin_lock_irq(&zone->lock);
=20
-	/*
-	 * Limit how many calls we will be making to the page reporting
-	 * device for this list. By doing this we avoid processing any
-	 * given list for too long.
-	 *
-	 * The current value used allows us enough calls to process over a
-	 * sixteenth of the current list plus one additional call to handle
-	 * any pages that may have already been present from the previous
-	 * list processed. This should result in us reporting all pages on
-	 * an idle system in about 30 seconds.
-	 *
-	 * The division here should be cheap since PAGE_REPORTING_CAPACITY
-	 * should always be a power of 2.
-	 */
-	budget =3D DIV_ROUND_UP(area->nr_free, PAGE_REPORTING_CAPACITY * 16);
-
-	/* loop through free list adding unreported pages to sg list */
 	list_for_each_entry_safe(page, next, list, lru) {
-		/* We are going to skip over the reported pages. */
 		if (PageReported(page))
 			continue;
=20
-		/*
-		 * If we fully consumed our budget then update our
-		 * state to indicate that we are requesting additional
-		 * processing and exit this list.
-		 */
-		if (budget < 0) {
+		if (*budget < 0) {
 			atomic_set(&prdev->state, PAGE_REPORTING_REQUESTED);
 			next =3D page;
 			break;
 		}
=20
-		/* Attempt to pull page from list and place in scatterlist */
 		if (*offset) {
 			if (!__isolate_free_page(page, order)) {
 				next =3D page;
 				break;
 			}
=20
-			/* Add page to scatter list */
 			--(*offset);
 			sg_set_page(&sgl[*offset], page, page_len, 0);
=20
 			continue;
 		}
=20
-		/*
-		 * Make the first non-reported page in the free list
-		 * the new head of the free list before we release the
-		 * zone lock.
-		 */
 		if (!list_is_first(&page->lru, list))
 			list_rotate_to_front(&page->lru, list);
=20
-		/* release lock before waiting on report processing */
 		spin_unlock_irq(&zone->lock);
=20
-		/* begin processing pages in local list */
 		err =3D prdev->report(prdev, sgl, PAGE_REPORTING_CAPACITY);
=20
-		/* reset offset since the full list was reported */
 		*offset =3D PAGE_REPORTING_CAPACITY;
+		(*budget)--;
=20
-		/* update budget to reflect call to report function */
-		budget--;
-
-		/* reacquire zone lock and resume processing */
 		spin_lock_irq(&zone->lock);
=20
-		/* flush reported pages from the sg list */
 		page_reporting_drain(prdev, sgl, PAGE_REPORTING_CAPACITY, !err);
=20
-		/*
-		 * Reset next to first entry, the old next isn't valid
-		 * since we dropped the lock to report the pages
-		 */
 		next =3D list_first_entry(list, struct page, lru);
=20
-		/* exit on error */
 		if (err)
 			break;
 	}
=20
-	/* Rotate any leftover pages to the head of the freelist */
 	if (!list_entry_is_head(next, list, lru) && !list_is_first(&next->lru, li=
st))
 		list_rotate_to_front(&next->lru, list);
=20
@@ -256,6 +209,84 @@ page_reporting_cycle(struct page_reporting_dev_info *p=
rdev, struct zone *zone,
 	return err;
 }
=20
+/*
+ * The page reporting cycle consists of 4 stages, fill, report, drain, and
+ * idle. We will cycle through the first 3 stages until we cannot obtain a
+ * full scatterlist of pages, in that case we will switch to idle.
+ *
+ * With superpageblocks, free pages live on per-SPB free_lists rather than=
 a
+ * single zone-level list, so the cycle iterates every SPB for the request=
ed
+ * (order, mt). The budget is shared across the entire walk so that
+ * fragmented zones do not produce a budget multiplier.
+ */
+static int
+page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *z=
one,
+		     unsigned int order, unsigned int mt,
+		     struct scatterlist *sgl, unsigned int *offset)
+{
+	long budget;
+	int err =3D 0;
+
+	/*
+	 * Early exit if the per-zone shadow says there is nothing free at
+	 * this order in any SPB. Avoids touching every SPB's list head.
+	 */
+	if (!data_race(zone->free_area[order].nr_free))
+		return 0;
+
+	/*
+	 * Limit how many calls we will be making to the page reporting
+	 * device. By doing this we avoid processing any given (order, mt)
+	 * for too long.
+	 *
+	 * The current value used allows us enough calls to process over a
+	 * sixteenth of the current free pool plus one additional call to
+	 * handle any pages that may have already been present from the
+	 * previous list processed. This should result in us reporting all
+	 * pages on an idle system in about 30 seconds.
+	 *
+	 * The division here should be cheap since PAGE_REPORTING_CAPACITY
+	 * should always be a power of 2.
+	 */
+	budget =3D DIV_ROUND_UP(data_race(zone->free_area[order].nr_free),
+			      PAGE_REPORTING_CAPACITY * 16);
+
+	/*
+	 * Block memory hotplug for the SPB walk. resize_zone_superpageblocks()
+	 * swaps zone->superpageblocks under zone->lock and immediately
+	 * kvfree()s the old array, with no RCU grace period. The helper drops
+	 * zone->lock during prdev->report() and resumes using a list_head
+	 * pointer into an SPB; without holding mem_hotplug_lock for read,
+	 * that pointer can become a dangling reference into freed memory.
+	 */
+	get_online_mems();
+
+	if (zone->nr_superpageblocks) {
+		unsigned long sb_idx, nr_sbs =3D zone->nr_superpageblocks;
+
+		for (sb_idx =3D 0; sb_idx < nr_sbs; sb_idx++) {
+			struct list_head *list =3D
+				&zone->superpageblocks[sb_idx].free_area[order].free_list[mt];
+
+			err =3D page_reporting_cycle_list(prdev, zone, list,
+							order, sgl, offset,
+							&budget);
+			if (err || budget < 0)
+				break;
+		}
+	} else {
+		/* No SPBs (e.g. unpopulated zone); fall back to zone-level list. */
+		struct list_head *list =3D &zone->free_area[order].free_list[mt];
+
+		err =3D page_reporting_cycle_list(prdev, zone, list, order,
+						sgl, offset, &budget);
+	}
+
+	put_online_mems();
+
+	return err;
+}
+
 static int
 page_reporting_process_zone(struct page_reporting_dev_info *prdev,
 			    struct scatterlist *sgl, struct zone *zone)
--=20
2.54.0
From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id E958F3F58D6
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:55 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289263; cv=none;
 b=U5y0U5ka3eR0VuxJmLQFbXTRrEfwAYCk7X1wJr56gDOUGRZoPTtw1WUs4q6hCXRM/KSAHN4F4kvfaxyip9jxJm6Aa8uN1rosu4xLeiBB5iYnu+BkQH8V00zjrTI7qsIMHElVuuafc/fnP4VSayxoVmX7Bpyd2CyviYC2bNVhxjQ=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289263; c=relaxed/simple;
	bh=+kk1PZdiImIr1dKOTa+ewHznoIGgF8KrHzSzKdOss/o=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=n2LnZHjrOtVaU9/8Wsxmp8l13wiVrwme2YyBMmlP/I9KSGyNpcbCbq051FyiFA9FMorTx1uoIQqSnZhojt0x4Sfodt9gtff4EJUxKOVC/ep1PsTYY8YB3lMd5vlvtTsArGu350z/dt701XjxqJSXevRdae+icxYG6kQ5+Awyo44=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=kHajg5X9; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="kHajg5X9"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:MIME-Version:References:In-Reply-To:
	Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-Type:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=FZOK6h58T4z/LIHuXmRraHuYbG6z8gXR7nqP+GNo268=; b=kHajg5X985RLx3rxaGTf2XVaPD
	76I6dDk8PrQizDgUV5HI5hcX6tMKj+GUooPY4X9BlzFgqWoOBVH9wQkwOE+GEURlEpuAKcMZuNQT0
	3ol0VVNAQ5mE6mrRBtMZIZf3OSC5s3XkC1G71FZ3UJtA3ddgaUZoDPpprIhgEQ2hQrqsgCymNkgm1
	Rb2h2uUV/2Z+9HRitqLvbZenecb/l882JfBGvJAMUpm0w3KHmHecmAfbzEa0FJaLG+a0PDY+2P2KX
	EZugxcfEcj4TsLMovFNVl3yMfvgp00aPGfJG82UXJi7kegZkuopGtUujezDGg4p6hHcFbJp2w7dtR
	nGA7mB4w==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPM-0000000024Q-3h1w;
	Wed, 20 May 2026 11:00:28 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 30/40] mm: show_mem: collect migratetype letters from
 per-superpageblock lists
Date: Wed, 20 May 2026 10:59:36 -0400
Message-ID: <20260520150018.2491267-31-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

show_mem()'s per-order line includes a parenthesized set of letters
(UME, etc.) indicating which migratetypes have free pages at that
order. This was computed by checking free_area_empty() on
zone->free_area[order].free_list[type]. After the SPB rework, those
zone-level list heads are always empty -- free pages live on per-
superpageblock lists -- so the migratetype letters never appeared.

Iterate every SPB in the zone for each order, OR'ing in any non-empty
migratetype lists, with an early exit once all migratetypes have been
seen. The shadow nr_free count remains correct (zone->free_area[].
nr_free is updated by __add_to_free_list / __del_page_from_free_list
to sum across all SPBs).

Falls back to the zone-level free_area for zones whose SPB array has
not yet been allocated.

The whole loop runs under spin_lock_irqsave(&zone->lock) and never
drops the lock, so it cannot race with memory hotplug resizing the SPB
array. Worst-case work is bounded (NR_PAGE_ORDERS * MIGRATE_TYPES *
nr_superpageblocks list_empty() pointer compares per zone), which is
acceptable for a diagnostic path.

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 mm/show_mem.c | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/mm/show_mem.c b/mm/show_mem.c
index d08f1263480a..ce7f43416199 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -367,16 +367,31 @@ static void show_free_areas(unsigned int filter, node=
mask_t *nodemask, int max_z
=20
 		spin_lock_irqsave(&zone->lock, flags);
 		for (order =3D 0; order < NR_PAGE_ORDERS; order++) {
-			struct free_area *area =3D &zone->free_area[order];
+			unsigned long sb_idx;
+			unsigned long nr_lists =3D zone->nr_superpageblocks ? : 1;
 			int type;
=20
-			nr[order] =3D area->nr_free;
+			nr[order] =3D zone->free_area[order].nr_free;
 			total +=3D nr[order] << order;
=20
+			/*
+			 * Collect the migratetypes present at this order. After
+			 * the SPB rework, free pages live on per-superpageblock
+			 * free lists, so check each SPB. Stop early once all
+			 * migratetypes have been observed.
+			 */
 			types[order] =3D 0;
-			for (type =3D 0; type < MIGRATE_TYPES; type++) {
-				if (!free_area_empty(area, type))
-					types[order] |=3D 1 << type;
+			for (sb_idx =3D 0; sb_idx < nr_lists; sb_idx++) {
+				struct free_area *area =3D zone->nr_superpageblocks ?
+					&zone->superpageblocks[sb_idx].free_area[order] :
+					&zone->free_area[order];
+
+				for (type =3D 0; type < MIGRATE_TYPES; type++) {
+					if (!free_area_empty(area, type))
+						types[order] |=3D 1 << type;
+				}
+				if (types[order] =3D=3D (1 << MIGRATE_TYPES) - 1)
+					break;
 			}
 		}
 		spin_unlock_irqrestore(&zone->lock, flags);
--=20
2.54.0
From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id CE7663F39C1
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:51 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289257; cv=none;
 b=VKx2zxYFL/lwCl437BrMljTy54pPdRT8LSyAENsYpSUBWyBj5zJA5Ueq9U3QhD//HXbSPbxn1tSoGxbRvdiIONUgKqH+bVH4FTZ+QjGFIfHsYlpuRtmpghysQTNi5nT7/QTWxWjiW8PCLfKErOLdhAeKFhYwFZcbyrHKHT5Ha7Q=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289257; c=relaxed/simple;
	bh=Q1ecHK68Jltx1W9CbyPPBwUyRpxXYq1H/j0XOr1c2lQ=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=Oh9sN63K2geW6gg3Q1OkRwqslCqAV/WekQKQos85A/3eTi6C47TsGZ5eRyNZnxUEE0g8qfVZy4dMkgHOLcsSRZWuHNPqPqyb9JBVZzKXm9sXSjMTwt7nArLIg4HvHi4O+lsNtyqMsaAGUnjm6d8K/v27jkfDeFttLr8Ex+JyYN4=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=hOpQGzKJ; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="hOpQGzKJ"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:MIME-Version:References:In-Reply-To:
	Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-Type:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=V73fGF4L85rQyR6G/fi/9PDMTPCkAr5GDPqZo9IXQjg=; b=hOpQGzKJeYN1l7DrULohp+i2G3
	4msBWr6gPA6KazkCeDd1ojsNgMK/ljBfcImp9tctWWMEHLCfWjIqjV18ouZhKskUSQuJGjeV4LFRa
	QMvLTL4NJyZgbx/18bnSdxNnxfdNPyl8yvyrd3t+pZjvhnuwB6S1edYd6Esf5HSlPYsiKvMuDgDPv
	xoCnC/Qq+MuLBQKvK+BzECiUUhnF9E6ZjFEDxdEqDFJVXpV/fATxij7xqpLd7pRLuXM2stEBBt2XF
	IV+07L2v3lcp07smH9SSNWh1h8GhTLmCAZo/FN21M0/+K3vFirc22Lut0frUhRZLOOIvdL7P4ePNI
	rDYqqyAg==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPM-0000000024Q-3mye;
	Wed, 20 May 2026 11:00:28 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 31/40] mm: page_alloc: per-(zone, order,
 mt) PASS_1 hint cache
Date: Wed, 20 May 2026 10:59:37 -0400
Message-ID: <20260520150018.2491267-32-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

PASS_1 of __rmqueue_smallest walks &zone->spb_lists[cat][full]
linearly. Under steady workload on a 250 GB test system, the median
walk depth was ~50 SPBs and 20-57% of allocations visited 100+ SPBs.

Cache the SPB that last satisfied a PASS_1 alloc for each
(zone, order, migratetype) tuple, in two layers:

  - per-zone hint (zone->sb_hint[order][mt]) -- visible to all CPUs,
    serialized by zone->lock.
  - per-CPU hint indexed by zone_idx -- cache-hot, contention-free.
    Each slot stores (zone *, sb *) because zone_idx is per-pgdat
    (not globally unique on NUMA); the zone-pointer check on read
    prevents a cross-node SPB from being handed back to the wrong
    zone's accounting.

Stale hints are harmless: try_alloc_from_sb_pass1() returns NULL and
the standard list walk runs as before. On PASS_1 success both hints
are refreshed. resize_zone_superpageblocks() calls
spb_invalidate_warm_hints() under zone->lock to clear both hint layers
before the old SPB array is freed, preventing a use-after-free across
memory hotplug-add.

Hint hits show up in tracepoint:kmem:spb_alloc_walk as the [0, 5)
bucket because n_spbs_visited stays 0; no new tracepoint needed.
Skipped for migratetype >=3D MIGRATE_PCPTYPES (HIGHATOMIC/CMA/ISOLATE
are already cheap or rare).

Measurement on the same test system with this commit applied:

  median walk depth:        ~50 SPBs   ->   ~5
  tail (>=3D100 SPB visits):  20-57%     ->   0.4%
  hint hit rate (n=3D0):                 ->   99%

Memory cost: ~320 B per zone + ~2.6 KB per CPU
(MAX_NR_ZONES * NR_PAGE_ORDERS * MIGRATE_PCPTYPES * sizeof(slot)).

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 include/linux/mmzone.h |  11 +++
 mm/internal.h          |   2 +
 mm/mm_init.c           |   8 ++
 mm/page_alloc.c        | 173 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 194 insertions(+)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 46eb5012d18b..c9c248d5b14e 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1111,6 +1111,17 @@ struct zone {
 	struct list_head	spb_isolated;	/* fully isolated (1GB contig alloc) */
 	struct list_head	spb_lists[__NR_SB_CATEGORIES][__NR_SB_FULLNESS];
=20
+	/*
+	 * PASS_1 fast-path hint: most-recent SPB that satisfied a
+	 * (order, mt) PASS_1 allocation. Stale hints are harmless -- the hint
+	 * try-alloc just falls through to the standard list walk on miss.
+	 * Sized for [0..NR_PAGE_ORDERS) x PCPTYPES; HIGHATOMIC/CMA/ISOLATE
+	 * skip the hint (already cheap or rare). Invalidated by
+	 * spb_invalidate_warm_hints() when the SPB array is resized
+	 * (memory hotplug add).
+	 */
+	struct superpageblock	*sb_hint[NR_PAGE_ORDERS][MIGRATE_PCPTYPES];
+
 	/* zone_start_pfn =3D=3D zone_start_paddr >> PAGE_SHIFT */
 	unsigned long		zone_start_pfn;
=20
diff --git a/mm/internal.h b/mm/internal.h
index 9854d76ebf36..3a847dcfb03f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1119,6 +1119,8 @@ static inline void superpageblock_set_has_movable(str=
uct zone *zone,
 void resize_zone_superpageblocks(struct zone *zone);
 #endif
=20
+void spb_invalidate_warm_hints(struct zone *zone);
+
 struct cma;
=20
 #ifdef CONFIG_CMA
diff --git a/mm/mm_init.c b/mm/mm_init.c
index af71ef8393c6..19a338ed1bdf 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1837,6 +1837,14 @@ void __meminit resize_zone_superpageblocks(struct zo=
ne *zone)
 	zone->superpageblock_base_pfn =3D new_sb_base;
 	zone->spb_kvmalloced =3D true;
=20
+	/*
+	 * Invalidate PASS_1 hints under zone->lock so that no
+	 * concurrent allocator (also entering __rmqueue_smallest under
+	 * zone->lock) can dereference an old SPB pointer that is about
+	 * to be freed below.
+	 */
+	spb_invalidate_warm_hints(zone);
+
 	spin_unlock_irqrestore(&zone->lock, flags);
=20
 	/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6dadfe9d59d9..116d9cc0a493 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2854,6 +2854,109 @@ struct spb_tainted_walk {
 	bool saw_below_reserve;	/* tainted SPB has nr_free <=3D spb_tainted_reser=
ve */
 };
=20
+/*
+ * PASS_1 fast-path hint: most-recent SPB this CPU successfully
+ * allocated from for a given (zone, order, migratetype). Combined with
+ * the per-zone zone->sb_hint[][], this lets PASS_1 skip the linear walk
+ * of spb_lists[cat][full] in the common case. Stale hints are
+ * harmless -- the try-alloc just falls through to the standard list walk
+ * on miss.
+ *
+ * The slot stores both the zone pointer and the SPB pointer because
+ * zone_idx(zone) is per-pgdat (not globally unique on NUMA), so two
+ * nodes' ZONE_NORMAL share the same array index. The zone-pointer check
+ * on read prevents a cross-node SPB from being handed back to the wrong
+ * zone (which would corrupt per-zone NR_FREE_PAGES accounting).
+ */
+struct spb_warm_hint_slot {
+	struct zone		*zone;
+	struct superpageblock	*sb;
+};
+struct spb_warm_hints {
+	struct spb_warm_hint_slot slot[MAX_NR_ZONES][NR_PAGE_ORDERS][MIGRATE_PCPT=
YPES];
+};
+static DEFINE_PER_CPU(struct spb_warm_hints, spb_warm_hints);
+
+/**
+ * spb_invalidate_warm_hints - drop all cached hints into @zone
+ * @zone: zone whose SPB array is about to change
+ *
+ * Called from memory hotplug paths that resize zone->superpageblocks
+ * (and therefore invalidate every SPB pointer for @zone). Must be
+ * called with zone->lock held; the lock serializes against any CPU
+ * using a hint for @zone inside __rmqueue_smallest (also under that
+ * zone->lock), so allocators see either pre-invalidation state (old
+ * SPB pointers, still-valid old array) or post-invalidation state
+ * (NULL slots) -- never stale pointers into a freed array.
+ */
+void spb_invalidate_warm_hints(struct zone *zone)
+{
+	enum zone_type zidx =3D zone_idx(zone);
+	int cpu, order, mt;
+
+	lockdep_assert_held(&zone->lock);
+
+	memset(zone->sb_hint, 0, sizeof(zone->sb_hint));
+
+	for_each_possible_cpu(cpu) {
+		struct spb_warm_hints *h =3D per_cpu_ptr(&spb_warm_hints, cpu);
+
+		for (order =3D 0; order < NR_PAGE_ORDERS; order++) {
+			for (mt =3D 0; mt < MIGRATE_PCPTYPES; mt++) {
+				if (h->slot[zidx][order][mt].zone !=3D zone)
+					continue;
+				h->slot[zidx][order][mt].zone =3D NULL;
+				h->slot[zidx][order][mt].sb =3D NULL;
+			}
+		}
+	}
+}
+
+/*
+ * Try to allocate from a single SPB using PASS_1 semantics:
+ * whole pageblock first (PCP-buddy friendly), then sub-pageblock.
+ * Returns the page on success, NULL on miss. Caller is responsible
+ * for hint updates and shrinker queueing.
+ */
+static struct page *try_alloc_from_sb_pass1(struct zone *zone,
+					    struct superpageblock *sb,
+					    unsigned int order,
+					    int migratetype)
+{
+	unsigned int current_order;
+	struct free_area *area;
+	struct page *page;
+
+	if (!sb->nr_free_pages)
+		return NULL;
+
+	for (current_order =3D max(order, pageblock_order);
+	     current_order < NR_PAGE_ORDERS;
+	     ++current_order) {
+		area =3D &sb->free_area[current_order];
+		page =3D get_page_from_free_area(area, migratetype);
+		if (!page)
+			continue;
+		page_del_and_expand(zone, page, order,
+				    current_order, migratetype);
+		return page;
+	}
+	if (order < pageblock_order) {
+		for (current_order =3D order;
+		     current_order < pageblock_order;
+		     ++current_order) {
+			area =3D &sb->free_area[current_order];
+			page =3D get_page_from_free_area(area, migratetype);
+			if (!page)
+				continue;
+			page_del_and_expand(zone, page, order,
+					    current_order, migratetype);
+			return page;
+		}
+	}
+	return NULL;
+}
+
 static __always_inline
 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
 				int migratetype, struct spb_tainted_walk *walk)
@@ -2875,6 +2978,58 @@ struct page *__rmqueue_smallest(struct zone *zone, u=
nsigned int order,
 	};
 	int movable =3D (migratetype =3D=3D MIGRATE_MOVABLE) ? 1 : 0;
=20
+	/*
+	 * PASS_1 fast-path: try per-CPU then per-zone hint SPB before the
+	 * linear list walk. The hint stores the SPB that last satisfied a
+	 * PASS_1 alloc for this (zone, order, migratetype). On hit, we
+	 * skip the entire spb_lists walk. Skip for HIGHATOMIC/CMA/ISOLATE
+	 * -- those paths are already cheap (atomic-NORETRY skip) or rare.
+	 */
+	if (migratetype < MIGRATE_PCPTYPES) {
+		enum zone_type zidx =3D zone_idx(zone);
+		struct superpageblock *cpu_hint =3D NULL, *zone_hint;
+		struct spb_warm_hint_slot *slot;
+
+		slot =3D this_cpu_ptr(
+			&spb_warm_hints.slot[zidx][order][migratetype]);
+		/*
+		 * Validate slot->zone =3D=3D zone: zone_idx is per-pgdat, so
+		 * on NUMA the same slot index is shared by every node's
+		 * zone of this type. Without this check, a hint written
+		 * from one node would be returned to allocations on
+		 * another node and corrupt the wrong zone's accounting.
+		 */
+		if (slot->zone =3D=3D zone)
+			cpu_hint =3D slot->sb;
+		if (cpu_hint) {
+			page =3D try_alloc_from_sb_pass1(zone, cpu_hint,
+						       order, migratetype);
+			if (page) {
+				spb_react_to_tainted_alloc(cpu_hint, zone);
+				trace_mm_page_alloc_zone_locked(page, order,
+				    migratetype,
+				    pcp_allowed_order(order) &&
+				    migratetype < MIGRATE_PCPTYPES);
+				return page;
+			}
+		}
+		zone_hint =3D zone->sb_hint[order][migratetype];
+		if (zone_hint && zone_hint !=3D cpu_hint) {
+			page =3D try_alloc_from_sb_pass1(zone, zone_hint,
+						       order, migratetype);
+			if (page) {
+				spb_react_to_tainted_alloc(zone_hint, zone);
+				slot->zone =3D zone;
+				slot->sb =3D zone_hint;
+				trace_mm_page_alloc_zone_locked(page, order,
+				    migratetype,
+				    pcp_allowed_order(order) &&
+				    migratetype < MIGRATE_PCPTYPES);
+				return page;
+			}
+		}
+	}
+
 	/*
 	 * Search per-superpageblock free lists for pages of the requested
 	 * migratetype, walking superpageblocks from fullest to emptiest
@@ -2940,6 +3095,15 @@ struct page *__rmqueue_smallest(struct zone *zone, u=
nsigned int order,
 					page, order, migratetype,
 					pcp_allowed_order(order) &&
 					migratetype < MIGRATE_PCPTYPES);
+				if (migratetype < MIGRATE_PCPTYPES) {
+					struct spb_warm_hint_slot *slot;
+
+					zone->sb_hint[order][migratetype] =3D sb;
+					slot =3D this_cpu_ptr(&spb_warm_hints.slot
+					    [zone_idx(zone)][order][migratetype]);
+					slot->zone =3D zone;
+					slot->sb =3D sb;
+				}
 				return page;
 			}
 			/* Then try sub-pageblock (no PCP buddy) */
@@ -2961,6 +3125,15 @@ struct page *__rmqueue_smallest(struct zone *zone, u=
nsigned int order,
 						page, order, migratetype,
 						pcp_allowed_order(order) &&
 						migratetype < MIGRATE_PCPTYPES);
+					if (migratetype < MIGRATE_PCPTYPES) {
+						struct spb_warm_hint_slot *slot;
+
+						zone->sb_hint[order][migratetype] =3D sb;
+						slot =3D this_cpu_ptr(&spb_warm_hints.slot
+						    [zone_idx(zone)][order][migratetype]);
+						slot->zone =3D zone;
+						slot->sb =3D sb;
+					}
 					return page;
 				}
 			}
--=20
2.54.0
From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 3CD4C3EEAD6
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:49 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289260; cv=none;
 b=DmakCvl4eF07MdIJhlGOVjFp4U2b0Et/0EJqshJJmpA7R9Q76GY2kwoBfwO3DsTKTa7WlbVlKQ2+zwA2cdguEA40Y+/9LbkNygEnhpLGQhGRI/VN/MN1NLjfWC7/9qUVREV+7p2o0oLxDsZio2cg9khb5A8E/vGgsLRupuKD5vU=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289260; c=relaxed/simple;
	bh=TIW0YEtE+oYb4M7MMJDqXm72Gm9H3t78Bbk6bOshKpA=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=ZYuZq7fzEAtMZPz7yYSbrHMj1ADNMk6USKkpdJGIXVETaLSuPx1vQCiJ1TqDDz+K+0rGbPIEkOjvY2Jr2nmPSiok+KG7wjIrUwHM8bfqqdWN1PgF0EZB9fONEQ2gkp2sZDTg7XwwxzTWkUaI8GEqFJUa+cYn2zvRnZnPBzJigDU=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=JS7U779I; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="JS7U779I"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:MIME-Version:References:In-Reply-To:
	Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-Type:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=LP/AzFvTKoIDyrXjf92uY+voMQvTqKcRBE5XC1e952Y=; b=JS7U779IELjaE2QGCB0k/b0Vx5
	QPIc1MtZFEDE+nVd1Y87prVHr2R+SVew+KMU3yQIgfGF6fC8om3IjfKmUYrAtLVxupOX/v5UmJ+pA
	kqTdyCcjnhGY1PNprycURjqSXYthGmuy6FHwZlh3T0D3y7JxKBdgzftKggPxMHW8kLaQNI0K9yefB
	phBymDjMe/UEk+hgL5Ht0l+2hDTdBRFVN4PfnWFAqk7oCtRwFD8Ac9+2GS6hW7Vl6NNLr7uRig7wA
	C/HTbM3dMd5kVrrkov6RAXStqcYpSUTgHzchPOPMgn4gdS99Z92m2VNmbzxNdD43lkF/bbxh71muN
	8AqbBAeg==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPM-0000000024Q-3tUc;
	Wed, 20 May 2026 11:00:28 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 32/40] mm: debug: prevent infinite recursion in
 dump_page() with CMA
Date: Wed, 20 May 2026 10:59:38 -0400
Message-ID: <20260520150018.2491267-33-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

dump_page() calls is_migrate_cma_folio() which expands to
get_pfnblock_migratetype(&folio->page, pfn).  That helper resolves
the pageblock via pfn_to_pageblock(), and on !CONFIG_SPARSEMEM
configurations pfn_to_pageblock() reads page_zone(page) to compute
the per-zone pageblock_data offset.

When dump_page() is invoked on a page whose zone is not initialised
(unavailable PFN ranges, very early boot, or a poisoned struct page),
the page_zone() lookup returns garbage and the VM_BUG_ON_PAGE on
!zone_spans_pfn() further down that call chain fires.  The
BUG handler then calls dump_page() on the same page, which re-enters
the same code path, hits the same BUG, and recurses until the kernel
runs out of stack.

Guard the is_migrate_cma_folio() call with pfn_valid() and only
resolve page_zone() once that has succeeded; only then run
zone_spans_pfn() before classifying the page.  dump_page() can now
safely report on pages without a meaningful zone, and the "CMA"
suffix is only printed if the page is genuinely in a CMA pageblock.

Found by: dump_page() called from a VM_BUG_ON_PAGE in early boot
hitting a page in an unavailable range, recursing until stack
exhaustion.

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 mm/debug.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/mm/debug.c b/mm/debug.c
index d4542d5d202b..e233520b009c 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -73,6 +73,7 @@ static void __dump_folio(const struct folio *folio, const=
 struct page *page,
 {
 	struct address_space *mapping =3D folio_mapping(folio);
 	int mapcount =3D atomic_read(&page->_mapcount) + 1;
+	bool cma =3D false;
 	char *type =3D "";
=20
 	if (page_mapcount_is_type(mapcount))
@@ -112,9 +113,24 @@ static void __dump_folio(const struct folio *folio, co=
nst struct page *page,
 	 * "isolate" again in the meantime, but since we are just dumping the
 	 * state for debugging, it should be fine to accept a bit of
 	 * inaccuracy here due to racing.
+	 *
+	 * Guard the is_migrate_cma_folio() call with pfn_valid() and
+	 * zone_spans_pfn(). The macro calls get_pfnblock_migratetype()
+	 * which calls get_pfnblock_flags_word() which has a VM_BUG_ON_PAGE
+	 * for !zone_spans_pfn(). If that fires, dump_page() recurses
+	 * infinitely. Call page_zone() only after pfn_valid() to avoid
+	 * dereferencing uninitialized zone data during early boot.
 	 */
+#ifdef CONFIG_CMA
+	if (pfn_valid(pfn)) {
+		struct zone *zone =3D page_zone(page);
+
+		if (zone_spans_pfn(zone, pfn))
+			cma =3D is_migrate_cma_folio(folio, pfn);
+	}
+#endif
 	pr_warn("%sflags: %pGp%s\n", type, &folio->flags,
-		is_migrate_cma_folio(folio, pfn) ? " CMA" : "");
+		cma ? " CMA" : "");
 	if (page_has_type(&folio->page))
 		pr_warn("page_type: %x(%s)\n", folio->page.page_type >> 24,
 				page_type_name(folio->page.page_type));
--=20
2.54.0
From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id CE4A43EFFC2;
	Wed, 20 May 2026 15:00:50 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289259; cv=none;
 b=Br6lSCcn482EonEqH92nIAzAOkrrWiWjbP5/GX8aOqahYH17GBboYJb7zqP5Jc+fgIeIK8SLEIiNCbI8n8bU5Yup6J2s4onZlDxYsSCSkTmZC389lf58lQjUjidhHg7GxHFL6MYvuzxrHLmNxQbimf0yoXqfqjzqHx6ccdVgokU=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289259; c=relaxed/simple;
	bh=h4YAvjMwugtBeP5rgTTnM3/4ur6xJ+sNlmMg9e/BRN0=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=T8wuTSr37y/yQ2szr4S6MyLrbNpu8FPFpKz66t59hxd6C0tVlBOwWfVaGX76xv17YizFZQeWHsRxglYqjdoq88zN6vdqhxKgHdAFnF7HDBB49CFQmkFdZBkmMVHAE3gmbMub12dfyy96T++ir8UpMtbiR7bOR3dXq+v9WcSMEuE=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=f5AW/Fqk; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="f5AW/Fqk"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:MIME-Version:References:In-Reply-To:
	Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-Type:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=kgTR8MXGEfuZkiCxXrjBV2lPgK1Sp/fx+wzmK8Ohvys=; b=f5AW/FqkAgj7z0DGFu9w4cBw2N
	WcrdccVsn6mnShQ3g+LF1B382noR9hBdYJxGn2eAt8L+SrCyApBFclSQV+UcMOEFo62dnI6hwc8M6
	KEIegcWCFXj4oFpj4WW7RW7A2hmyIiz0pIEx0sJzTVKVDfSM2TZWBjCa8WqwzSzmYaGsLBICkVHEr
	RClmwLZr/1msvSbgFj+2fhtj6LXzNBAhcoIuH6P8IW1To04EhBbRJa/tFv7ERy5EP1AJRpbqTqI35
	8FJMQtiQKZ+VO6kHdehJgW+gCdOWvXTFr0tcQHRcVGWMpYiZYgJiqHq8wEXb1OUON+kH4t0c1l95x
	kkRVT+sw==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPM-0000000024Q-3zfV;
	Wed, 20 May 2026 11:00:28 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>,
	"Rafael J. Wysocki" <rafael@kernel.org>,
	Len Brown <lenb@kernel.org>,
	Pavel Machek <pavel@kernel.org>,
	linux-pm@vger.kernel.org
Subject: [RFC PATCH 33/40] PM: hibernate: walk per-superpageblock free lists
 in mark_free_pages
Date: Wed, 20 May 2026 10:59:39 -0400
Message-ID: <20260520150018.2491267-34-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

mark_free_pages() walks the buddy allocator's free lists and calls
swsusp_set_page_free() on each free page so it is omitted from the
hibernation image. After the SPB rework, free pages live on per-
superpageblock free lists rather than the zone-level free list, so
the existing list_for_each_entry() walk over zone->free_area[order]
.free_list[t] found nothing. The hibernation snapshot then treated
every free page as needing to be saved, wasting image space and risking
OOM during the snapshot.

Wrap the existing per-page walk in an SPB iteration loop. When the
zone has no SPBs (e.g. an unpopulated hotplug zone), fall back to the
zone-level free list. The whole function still runs under
spin_lock_irqsave(&zone->lock) without drops, so there are no lock-
order or hotplug concerns.

Build-tested as part of the full mm series; kernel/power/snapshot.o
compiles cleanly in that build.  An -Werror=3Dreturn-type warning in
enough_free_mem() exists on some configurations but is unrelated to
this change and predates the SPB series.

Cc: Rafael J. Wysocki <rafael@kernel.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Pavel Machek <pavel@kernel.org>
Cc: linux-pm@vger.kernel.org
Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 kernel/power/snapshot.c | 35 +++++++++++++++++++++++++----------
 1 file changed, 25 insertions(+), 10 deletions(-)

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index a564650734dc..f96885bc46f8 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1270,17 +1270,32 @@ static void mark_free_pages(struct zone *zone)
 	}
=20
 	for_each_migratetype_order(order, t) {
-		list_for_each_entry(page,
-				&zone->free_area[order].free_list[t], buddy_list) {
-			unsigned long i;
-
-			pfn =3D page_to_pfn(page);
-			for (i =3D 0; i < (1UL << order); i++) {
-				if (!--page_count) {
-					touch_nmi_watchdog();
-					page_count =3D WD_PAGE_COUNT;
+		unsigned long sb_idx;
+		unsigned long nr_lists =3D zone->nr_superpageblocks ? : 1;
+
+		/*
+		 * After the SPB rework, free pages live on per-superpageblock
+		 * free lists. Walk every SPB's list for this (order, mt) cell.
+		 * If the zone has no SPBs (unpopulated zone), fall back to the
+		 * zone-level list head so that any pre-SPB pages are still
+		 * marked.
+		 */
+		for (sb_idx =3D 0; sb_idx < nr_lists; sb_idx++) {
+			struct list_head *list =3D zone->nr_superpageblocks ?
+				&zone->superpageblocks[sb_idx].free_area[order].free_list[t] :
+				&zone->free_area[order].free_list[t];
+
+			list_for_each_entry(page, list, buddy_list) {
+				unsigned long i;
+
+				pfn =3D page_to_pfn(page);
+				for (i =3D 0; i < (1UL << order); i++) {
+					if (!--page_count) {
+						touch_nmi_watchdog();
+						page_count =3D WD_PAGE_COUNT;
+					}
+					swsusp_set_page_free(pfn_to_page(pfn + i));
 				}
-				swsusp_set_page_free(pfn_to_page(pfn + i));
 			}
 		}
 	}
--=20
2.54.0
From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id F08EB3F44E2;
	Wed, 20 May 2026 15:00:50 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289255; cv=none;
 b=EqktdrrR8LJ0FmFK2TmVKdx38dyQKeWTpytAOLBlRsz0bDSqWcA565FR4hTVoEzCyGXMY7iinWTONH7vf+mTuZRuGI/fTmoaTwop02mMF4jPtPSpcU1Fl08oDFOI/Q//Nl+lZ4dDsymjPDRwDTA04NohMoipI2H5VMzcLFlIdzY=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289255; c=relaxed/simple;
	bh=VbsWC5gPtY7GuOMHXLDcQOEdjwkLf3J3QvARVmlpMN4=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version:Content-Type;
 b=FeYgXPl/SyxxbdUmwpn5CfIPk8xXCKSucE4DSywcgQVOsicdz+AaHVhHGRnCLctJknLkkTpzvip6U+wR62BoljzK3lPeU6CPyjIFH89z5lHWP6z0IhfU9IdvgHRKAlT037fsV0cwfXvtCISjadk+9+Nha7cxtmVero9u3w7BHcs=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=LjnIdbjC; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="LjnIdbjC"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:Content-Type:MIME-Version:References:
	In-Reply-To:Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=dOkWLdcLGvAao+PCfxgE3TfEgj2k6nE6LANeJbPhpH4=; b=LjnIdbjCuo6IWVmPiKouCG4b4t
	E8kNUfsX6Qdd+h3/AftBcmpx7qQSahkKWjilxZu9hYMslD6/6Ip3XaxMwGOdx6QHpl0zgdYPbBP3H
	Hb7+HO9cNuZpkXmDFXM6rluxShfJVUrRzNANHgg7GETKqTbf3nWNMcm9J9fSUf1+gA7P7V9Sod52X
	GvPvhsK1KxhQxf5yqhdmAEcwmz1i6MKrrPirRjR+eKJ2n8m2NwgVSFFqfXLHLL9kZCa5IVF+klUc4
	M/a1W2wW+0NyuODGVZy1Rix7ZW91ldKKwrrF7mRExCUATFfJj/P+G16oPGUicheCB7el3oZEfIaCq
	iBeUDFxA==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPN-0000000024Q-0414;
	Wed, 20 May 2026 11:00:29 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>,
	Chris Mason <clm@meta.com>,
	David Sterba <dsterba@suse.com>,
	Boris Burkov <boris@bur.io>,
	linux-btrfs@vger.kernel.org
Subject: [RFC PATCH 34/40] btrfs: allocate eb-attached btree pages as movable
Date: Wed, 20 May 2026 10:59:40 -0400
Message-ID: <20260520150018.2491267-35-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: quoted-printable

Extent buffer pages allocated by alloc_extent_buffer() are attached to
btree_inode->i_mapping (the buffer_tree path), reach the LRU, and are
served by the btree_migrate_folio aops in fs/btrfs/disk-io.c. They are
migratable in practice once their owning extent buffer hits refs =3D=3D 1,
which happens naturally as tree roots rotate. The buddy allocator
classifies them by GFP, however, and bare GFP_NOFS lands them in
MIGRATE_UNMOVABLE pageblocks. The result: every btree_inode page we
read in pins an unmovable pageblock from the superpageblock allocator's
perspective, even though the page itself can be moved.

Add __GFP_MOVABLE to that one allocation site (alloc_extent_buffer's
call to alloc_eb_folio_array). alloc_eb_folio_array gains a `bool
movable` argument, which it translates into a new `gfp_t extra_gfp`
parameter of btrfs_alloc_page_array; btrfs_alloc_folio_array gains the
same extra_gfp parameter. All other call sites pass false or 0.

Three categories of caller stay on bare GFP_NOFS, deliberately:

  - alloc_dummy_extent_buffer / btrfs_clone_extent_buffer: the
    resulting eb is EXTENT_BUFFER_UNMAPPED, folio->mapping stays NULL,
    the folios never enter LRU, never get migrate_folio aops. Tagging
    them __GFP_MOVABLE would violate the page allocator's migrability
    contract and they would defeat compaction in MOVABLE pageblocks
    where isolate_migratepages_block skips non-LRU non-movable_ops
    pages outright.

  - btrfs_alloc_page_array callers in fs/btrfs/raid56.c (stripe
    pages), fs/btrfs/inode.c (encoded reads), fs/btrfs/ioctl.c (uring
    encoded reads), fs/btrfs/relocation.c (relocation buffers): same
    contract violation. raid56 stripe_pages additionally persist in
    the stripe cache (RBIO_CACHE_SIZE=3D1024) well beyond a single I/O,
    so they are not transient enough to hand-wave the contract.

  - btrfs_alloc_folio_array caller in fs/btrfs/scrub.c (stripe
    folios): same -- stripe->folios[] are private buffers freed via
    folio_put in release_scrub_stripe.

This change targets the dominant fragmentation source observed on the
superpageblock series: ~28 GB of btree_inode pages parked across
many tainted superpageblocks on a 250 GB test system with btrfs root,
preventing 1 GiB hugepage allocation from those regions. With the
movable hint, those pages now land in MOVABLE pageblocks where the
existing background defragger drains them through the standard
PB_has_movable gate, no LRU-sample fallback needed.

Cc: Chris Mason <clm@meta.com>
Cc: David Sterba <dsterba@suse.com>
Cc: Boris Burkov <boris@bur.io>
Cc: linux-btrfs@vger.kernel.org
Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
Reviewed-by: Boris Burkov <boris@bur.io>
---
 fs/btrfs/extent_io.c  | 69 ++++++++++++++++++++++++++++++-------------
 fs/btrfs/extent_io.h  |  4 +--
 fs/btrfs/inode.c      |  2 +-
 fs/btrfs/ioctl.c      |  2 +-
 fs/btrfs/raid56.c     |  6 ++--
 fs/btrfs/relocation.c |  2 +-
 fs/btrfs/scrub.c      |  3 +-
 7 files changed, 59 insertions(+), 29 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 2275189b7860..563c4a7eaa36 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -620,24 +620,33 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
 }
=20
 /*
- * Populate every free slot in a provided array with folios using GFP_NOFS.
+ * Populate every free slot in a provided array with folios using
+ * GFP_NOFS plus optional caller-supplied flags.
  *
- * @nr_folios:   number of folios to allocate
- * @order:	 the order of the folios to be allocated
- * @folio_array: the array to fill with folios; any existing non-NULL entr=
ies in
- *		 the array will be skipped
+ * @nr_folios:    number of folios to allocate
+ * @order:	  folio order
+ * @folio_array:  array to fill with folios; non-NULL entries are skipped
+ * @extra_gfp:    extra GFP flags OR'd into GFP_NOFS. Pass __GFP_MOVABLE
+ *                only when the resulting folios will be attached to an
+ *                address_space whose aops can migrate them (added to
+ *                LRU, e.g. btree_inode->i_mapping, served by the
+ *                btree_migrate_folio aops). No caller does so today;
+ *                pass 0 for everything else: folios that stay in
+ *                driver-owned arrays never reach LRU and never register
+ *                movable_ops, so they cannot satisfy the __GFP_MOVABLE
+ *                migrability contract.
  *
  * Return: 0        if all folios were able to be allocated;
  *         -ENOMEM  otherwise, the partially allocated folios would be fre=
ed and
  *                  the array slots zeroed
  */
 int btrfs_alloc_folio_array(unsigned int nr_folios, unsigned int order,
-			    struct folio **folio_array)
+			    struct folio **folio_array, gfp_t extra_gfp)
 {
 	for (int i =3D 0; i < nr_folios; i++) {
 		if (folio_array[i])
 			continue;
-		folio_array[i] =3D folio_alloc(GFP_NOFS, order);
+		folio_array[i] =3D folio_alloc(GFP_NOFS | extra_gfp, order);
 		if (!folio_array[i])
 			goto error;
 	}
@@ -652,21 +661,27 @@ int btrfs_alloc_folio_array(unsigned int nr_folios, u=
nsigned int order,
 }
=20
 /*
- * Populate every free slot in a provided array with pages, using GFP_NOFS.
+ * Populate every free slot in a provided array with pages, using GFP_NOFS
+ * plus optional caller-supplied flags.
  *
- * @nr_pages:   number of pages to allocate
- * @page_array: the array to fill with pages; any existing non-null entrie=
s in
- *		the array will be skipped
- * @nofail:	whether using __GFP_NOFAIL flag
+ * @nr_pages:    number of pages to allocate
+ * @page_array:  array to fill; non-NULL entries are skipped
+ * @nofail:      whether to use __GFP_NOFAIL
+ * @extra_gfp:   extra GFP flags OR'd into the base mask. The only value u=
sed
+ *               today is __GFP_MOVABLE, which the extent-buffer real-mapp=
ing
+ *               path passes when the resulting pages will be attached to
+ *               btree_inode->i_mapping. See btrfs_alloc_folio_array() for
+ *               the full migrability rationale.
  *
  * Return: 0        if all pages were able to be allocated;
  *         -ENOMEM  otherwise, the partially allocated pages would be free=
d and
  *                  the array slots zeroed
  */
 int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
-			   bool nofail)
+			   bool nofail, gfp_t extra_gfp)
 {
-	const gfp_t gfp =3D nofail ? (GFP_NOFS | __GFP_NOFAIL) : GFP_NOFS;
+	const gfp_t gfp =3D (nofail ? (GFP_NOFS | __GFP_NOFAIL) : GFP_NOFS) |
+			  extra_gfp;
 	unsigned int allocated;
=20
 	for (allocated =3D 0; allocated < nr_pages;) {
@@ -689,14 +704,23 @@ int btrfs_alloc_page_array(unsigned int nr_pages, str=
uct page **page_array,
  * Populate needed folios for the extent buffer.
  *
  * For now, the folios populated are always in order 0 (aka, single page).
+ *
+ * @movable: pass true only when the resulting pages will be attached to
+ *           btree_inode->i_mapping (the alloc_extent_buffer real path).
+ *           Cloned/dummy extent buffers (EXTENT_BUFFER_UNMAPPED) leave
+ *           folio->mapping NULL, never enter the LRU, and never get the
+ *           btree_migrate_folio aops, so __GFP_MOVABLE would violate the
+ *           page-allocator's migrability contract for them.
  */
-static int alloc_eb_folio_array(struct extent_buffer *eb, bool nofail)
+static int alloc_eb_folio_array(struct extent_buffer *eb, bool nofail,
+				bool movable)
 {
 	struct page *page_array[INLINE_EXTENT_BUFFER_PAGES] =3D { 0 };
 	int num_pages =3D num_extent_pages(eb);
 	int ret;
=20
-	ret =3D btrfs_alloc_page_array(num_pages, page_array, nofail);
+	ret =3D btrfs_alloc_page_array(num_pages, page_array, nofail,
+				     movable ? __GFP_MOVABLE : 0);
 	if (ret < 0)
 		return ret;
=20
@@ -3097,7 +3121,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(const=
 struct extent_buffer *src)
 	 */
 	set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
=20
-	ret =3D alloc_eb_folio_array(new, false);
+	ret =3D alloc_eb_folio_array(new, false, false);
 	if (ret)
 		goto release_eb;
=20
@@ -3138,7 +3162,7 @@ struct extent_buffer *alloc_dummy_extent_buffer(struc=
t btrfs_fs_info *fs_info,
 	if (!eb)
 		return NULL;
=20
-	ret =3D alloc_eb_folio_array(eb, false);
+	ret =3D alloc_eb_folio_array(eb, false, false);
 	if (ret)
 		goto release_eb;
=20
@@ -3491,8 +3515,13 @@ struct extent_buffer *alloc_extent_buffer(struct btr=
fs_fs_info *fs_info,
 	}
=20
 reallocate:
-	/* Allocate all pages first. */
-	ret =3D alloc_eb_folio_array(eb, true);
+	/*
+	 * Allocate all pages first. These will be attached to
+	 * btree_inode->i_mapping below (added to LRU, served by
+	 * btree_migrate_folio), so request __GFP_MOVABLE so the
+	 * page allocator places them in MOVABLE pageblocks.
+	 */
+	ret =3D alloc_eb_folio_array(eb, true, true);
 	if (ret < 0) {
 		btrfs_free_folio_state(prealloc);
 		goto out;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index b310a5145cf6..5e263f07b59d 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -387,9 +387,9 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle=
 *trans,
 			      struct extent_buffer *buf);
=20
 int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
-			   bool nofail);
+			   bool nofail, gfp_t extra_gfp);
 int btrfs_alloc_folio_array(unsigned int nr_folios, unsigned int order,
-			    struct folio **folio_array);
+			    struct folio **folio_array, gfp_t extra_gfp);
=20
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 bool find_lock_delalloc_range(struct inode *inode,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 906d5c21ebc4..85f56ab815f9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9659,7 +9659,7 @@ ssize_t btrfs_encoded_read_regular(struct kiocb *iocb=
, struct iov_iter *iter,
 	pages =3D kzalloc_objs(struct page *, nr_pages, GFP_NOFS);
 	if (!pages)
 		return -ENOMEM;
-	ret =3D btrfs_alloc_page_array(nr_pages, pages, false);
+	ret =3D btrfs_alloc_page_array(nr_pages, pages, false, 0);
 	if (ret) {
 		ret =3D -ENOMEM;
 		goto out;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a39460bf68a7..77091915cacc 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4621,7 +4621,7 @@ static int btrfs_uring_read_extent(struct kiocb *iocb=
, struct iov_iter *iter,
 	pages =3D kzalloc_objs(struct page *, nr_pages, GFP_NOFS);
 	if (!pages)
 		return -ENOMEM;
-	ret =3D btrfs_alloc_page_array(nr_pages, pages, 0);
+	ret =3D btrfs_alloc_page_array(nr_pages, pages, 0, 0);
 	if (ret) {
 		ret =3D -ENOMEM;
 		goto out_fail;
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 08ee8f316d96..4135bac62be1 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1123,7 +1123,7 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rb=
io)
 {
 	int ret;
=20
-	ret =3D btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, false);
+	ret =3D btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, false,=
 0);
 	if (ret < 0)
 		return ret;
 	/* Mapping all sectors */
@@ -1138,7 +1138,7 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_=
bio *rbio)
 	int ret;
=20
 	ret =3D btrfs_alloc_page_array(rbio->nr_pages - data_pages,
-				     rbio->stripe_pages + data_pages, false);
+				     rbio->stripe_pages + data_pages, false, 0);
 	if (ret < 0)
 		return ret;
=20
@@ -1732,7 +1732,7 @@ static int alloc_rbio_data_pages(struct btrfs_raid_bi=
o *rbio)
 	const int data_pages =3D rbio->nr_data * rbio->stripe_npages;
 	int ret;
=20
-	ret =3D btrfs_alloc_page_array(data_pages, rbio->stripe_pages, false);
+	ret =3D btrfs_alloc_page_array(data_pages, rbio->stripe_pages, false, 0);
 	if (ret < 0)
 		return ret;
=20
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 3ebaf5880125..6f6d25724fb8 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4038,7 +4038,7 @@ static int copy_remapped_data(struct btrfs_fs_info *f=
s_info, u64 old_addr,
 	if (!pages)
 		return -ENOMEM;
=20
-	ret =3D btrfs_alloc_page_array(nr_pages, pages, 0);
+	ret =3D btrfs_alloc_page_array(nr_pages, pages, 0, 0);
 	if (ret) {
 		ret =3D -ENOMEM;
 		goto end;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 1ac609239cbe..4089e80077cc 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -369,7 +369,8 @@ static int init_scrub_stripe(struct btrfs_fs_info *fs_i=
nfo,
=20
 	ASSERT(BTRFS_STRIPE_LEN >> min_folio_shift <=3D SCRUB_STRIPE_MAX_FOLIOS);
 	ret =3D btrfs_alloc_folio_array(BTRFS_STRIPE_LEN >> min_folio_shift,
-				      fs_info->block_min_order, stripe->folios);
+				      fs_info->block_min_order, stripe->folios,
+				      0);
 	if (ret < 0)
 		goto error;
=20
--=20
2.54.0

From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id E35B63F1656
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:42 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289252; cv=none;
 b=erg8ELhduZ1IDwvT9Jw1rkxRXXGg9GqlSTtYOY/1uUqXLfG+IhfbYqq3qGFpo1Kplp7quxiHUurJdxYkGxQntNSYAv0/GdJDpdGegKn62l7ApXlf73V7NOLvbjefb1FPPCJM+/cxZg1A+D5BADelZ5I21s634vwUbAskKfIh+wA=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289252; c=relaxed/simple;
	bh=Zxw21QhkEmmTsancD+oD4auUWB1s3qfsrELSXjsx020=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=Qx00NWtB88zJz7IRSppV4Nz5PGbzBB5kcKuSyp6GWBIsg62F3q5QrWmWbDWA0WVH5L1CcTjWhD1dPCvm/BjKcKscd1baSqrCSnY/97dyvknJmMTn/16kH92jXPm1N/S30j8r0nlWnko3HoUXvZZPiAUnN0l9hj8pbuWWLqFh8Tk=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=Sh7iI7xi; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="Sh7iI7xi"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:MIME-Version:References:In-Reply-To:
	Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-Type:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=ulNo65hNaNqGoC5B35kMqlTnxEKU9cQsEPkam/uQDmc=; b=Sh7iI7xiJxSGjU3a8N4C+J2UcI
	+xW9/Z9it2c/nlh9yunllK2rdjPju5EwaRPLc1hSC6AZVIDZqc7g+B3fMcZrYQSiiYVvmKpcVfaFm
	W0wylpEMNCp9kXqgSUkWDQPLHqikPjArdQmZFu7NQIIUufOiP+JG02RkGxnXCkr9naMgKWlRJxXz1
	fNS1uW0CupLCqKYrQ7m+txfUjm6+08/ji/SgfC7dISGoI8BnnxSDqeclcjw55mNukNcVb1vG3VZV+
	7JDrzfVrOpHeF6cdPoJqDcV0bGNkd+Vx64RLEGS6N4ltPPayrpgrzsbpMcazfe3M8jFvm4COoKlkn
	6Bl9+bqQ==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPN-0000000024Q-0DQU;
	Wed, 20 May 2026 11:00:29 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 35/40] mm: page_alloc: refuse best-effort high-order
 allocs servable at lower orders
Date: Wed, 20 May 2026 10:59:41 -0400
Message-ID: <20260520150018.2491267-36-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

Several common high-order allocation patterns are best-effort: the caller
prefers a single large page for performance but has an order-decrement
fallback (or equivalent retry path) and is happy to accept failure of the
high-order attempt. Examples:

  kvmalloc()              kmalloc attempt has a vmalloc fallback
  vmalloc()               vm_area_alloc_pages decrements order on NULL
  alloc_skb_with_frags()  decrements order on NULL per fragment

The convention these callers share is to strip __GFP_DIRECT_RECLAIM and
set __GFP_NOWARN on the high-order attempt, signaling 'I don't want this
to block on direct reclaim and I'm fine with failure being silent'.

Without further hints, get_page_from_freelist's relax sequence treats
these as atomic allocs that must succeed and escalates: it adds
ALLOC_NOFRAG_TAINTED_OK (allowing PASS_2/2B claim_whole_block to relabel
a MOV pageblock inside a tainted SPB) and then drops ALLOC_NOFRAGMENT
entirely (allowing __rmqueue_claim/_steal to taint a clean SPB). The
caller's order-decrement fallback never runs because the high-order
attempt 'succeeds' by tainting.

The fix at the call sites is to add __GFP_NORETRY (kmalloc_gfp_adjust
already does this for kvmalloc). Generalize: in the relax sequence,
before dropping NOFRAGMENT, detect the 'best-effort high-order with
fallback' pattern by:

   order > 0
   __GFP_NOWARN set
   __GFP_NOFAIL not set
   __GFP_DIRECT_RECLAIM already cleared (the relax-sequence gate above)

If the tainted pool can plausibly serve a smaller (or same) order alloc
on the caller's retry, refuse the current attempt instead of escalating.
'Plausibly serve' means any tainted SPB has either:

  - nr_movable > 0  (MOV content exists; reclaim/migration can free
                     pageblocks at the order the caller's retry needs,
                     including orders >=3D the requested order -- e.g.
                     four THPs in the SPB can yield an order-7 buddy
                     for an order-7 unmovable alloc once the THPs are
                     migrated), OR
  - a free buddy on the requesting migratetype's own list at an order
    < requested (a smaller PASS_1 retry would succeed directly), OR
  - a free buddy on the opposite non-MOV list at an order < requested
    (PASS_2C borrow at the smaller order would succeed) -- only relevant
    for UNMOV/RECL allocs.

The MOV-content check alone covers the common case cheaply (one counter
read per tainted SPB) and works even when the movable memory exists at
orders larger than the alloc -- which is exactly when the per-order
free_list walk would miss it.

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 mm/page_alloc.c | 122 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 122 insertions(+)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 116d9cc0a493..2791a52b61da 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2708,6 +2708,101 @@ static inline u16 spb_tainted_reserve(const struct =
superpageblock *sb)
 	return max_t(u16, SPB_TAINTED_RESERVE_MIN, sb->total_pageblocks / 32);
 }
=20
+/*
+ * spb_tainted_can_serve_smaller - could a smaller-order @migratetype alloc
+ * be satisfied from any tainted SPB of @zone (now or after evac/reclaim)?
+ *
+ * "Yes" if either:
+ *   - some tainted SPB has nr_movable > 0 (MOV content exists; reclaim or
+ *     compaction/evac can free pageblocks at any order the caller's
+ *     order-decrement fallback might want, including orders >=3D the orig=
inal
+ *     requested order -- e.g. four THPs in the SPB can yield an order-7
+ *     buddy for an order-7 unmovable alloc once the THPs are migrated), OR
+ *   - some tainted SPB has a free buddy on the requesting migratetype's
+ *     own list at an order < @order (a smaller PASS_1 retry would
+ *     succeed directly), OR
+ *   - some tainted SPB has a free buddy on the opposite non-MOV list at
+ *     an order < @order (PASS_2C borrow at the smaller order would
+ *     succeed) -- only meaningful for UNMOV/RECL allocs.
+ *
+ * Used by the get_page_from_freelist relax sequence to discriminate
+ * "the caller has an order-decrement fallback that the tainted pool can
+ * eventually serve" from "the alloc must escalate to dropping
+ * ALLOC_NOFRAGMENT and tainting a clean SPB".
+ *
+ * Walks zone->spb_lists[SB_TAINTED][*] under zone->lock: spb_update_list()
+ * mutates these same lists under zone->lock, so a lockless walk would race
+ * with list-cursor reassignment (list_move from a concurrent allocator
+ * caller could splice the cursor onto a different list and turn the walk
+ * into an infinite loop or crash on a corrupted list_head). The lock is
+ * only trylocked because ALLOC_TRYLOCK (nolock) callers get here too and
+ * must not spin on zone->lock. A NULL @zone or contended lock returns
+ * false. Bounded by the tainted SPB count plus constant work per SPB.
+ */
+static bool spb_tainted_can_serve_smaller(struct zone *zone,
+					  unsigned int order,
+					  int migratetype)
+{
+	struct superpageblock *sb;
+	unsigned long flags;
+	bool found =3D false;
+	int full;
+	unsigned int o;
+	int opposite_mt =3D -1;
+
+	if (!zone || order =3D=3D 0)
+		return false;
+
+	if (migratetype =3D=3D MIGRATE_UNMOVABLE)
+		opposite_mt =3D MIGRATE_RECLAIMABLE;
+	else if (migratetype =3D=3D MIGRATE_RECLAIMABLE)
+		opposite_mt =3D MIGRATE_UNMOVABLE;
+
+	/* ALLOC_TRYLOCK callers (e.g. NMI) must never spin on zone->lock. */
+	if (!spin_trylock_irqsave(&zone->lock, flags))
+		return false;
+
+	for (full =3D 0; full < __NR_SB_FULLNESS && !found; full++) {
+		list_for_each_entry(sb, &zone->spb_lists[SB_TAINTED][full],
+				    list) {
+			/*
+			 * MOV content can be reclaimed (LRU folios) or
+			 * migrated (compaction / spb_evacuate_for_order),
+			 * making the SPB able to host a smaller (or even
+			 * same-order) non-MOV alloc on the retry. Cheap
+			 * counter check, covers most real cases.
+			 */
+			if (sb->nr_movable > 0) {
+				found =3D true;
+				break;
+			}
+
+			if (!sb->nr_free_pages)
+				continue;
+
+			/*
+			 * No MOV content but there might be a same-mt or
+			 * opposite-non-MOV buddy at a smaller order that a
+			 * PASS_1 retry / PASS_2C borrow could serve.
+			 */
+			for (o =3D 0; o < order; o++) {
+				struct free_area *area =3D &sb->free_area[o];
+
+				if (!list_empty(&area->free_list[migratetype]) ||
+				    (opposite_mt >=3D 0 &&
+				     !list_empty(&area->free_list[opposite_mt]))) {
+					found =3D true;
+					break;
+				}
+			}
+			if (found)
+				break;
+		}
+	}
+	spin_unlock_irqrestore(&zone->lock, flags);
+	return found;
+}
+
 /*
  * High-water threshold for proactively kicking the slab shrinker. When a
  * non-movable allocation consumes from a tainted SPB whose total free
@@ -6303,8 +6398,35 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int =
order, int alloc_flags,
 	 */
 	if (no_fallback && !defrag_mode &&
 	    !(gfp_mask & __GFP_DIRECT_RECLAIM)) {
+		struct zone *pref =3D zonelist_zone(ac->preferred_zoneref);
+
 		if (gfp_mask & __GFP_NORETRY)
 			return NULL;
+
+		/*
+		 * Best-effort high-order caller convention: stripping
+		 * __GFP_DIRECT_RECLAIM, setting __GFP_NOWARN, omitting
+		 * __GFP_NOFAIL, and asking for a high order indicates the
+		 * caller has an order-decrement fallback (kvmalloc's
+		 * vmalloc fallback, vmalloc's order-decrement loop,
+		 * alloc_skb_with_frags's order-decrement loop, ...).
+		 *
+		 * If the tainted-SPB pool has movable content, or a free
+		 * buddy at a lower order on a list a smaller retry could
+		 * use, refuse this attempt so the caller's order-decrement
+		 * uses that tainted space instead of forcing us to
+		 * drop ALLOC_NOFRAGMENT and taint a clean SPB.
+		 *
+		 * Same intent as adding __GFP_NORETRY at every such
+		 * caller, but applied centrally so we cover both existing
+		 * and future callers without per-call-site fixes.
+		 */
+		if (order > 0 && (gfp_mask & __GFP_NOWARN) &&
+		    !(gfp_mask & __GFP_NOFAIL) &&
+		    spb_tainted_can_serve_smaller(pref, order,
+						  ac->migratetype))
+			return NULL;
+
 		if (!(alloc_flags & ALLOC_NOFRAG_TAINTED_OK)) {
 			alloc_flags |=3D ALLOC_NOFRAG_TAINTED_OK;
 			goto retry;
--=20
2.54.0
From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 3CEEA3F1653
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:49 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289258; cv=none;
 b=JPW7o6yNmiBmJ/g8+UXJhIU0+Ej5AwH2zLHgj4OUaHBBiKkaHha1kWIWW/ixodG6W0GVrzB8rMCe+tupqp5HWlJ4QHOgYMobwigiaapmbS8m1R/oDZKDmDvBi62+XP2bWjlDAWwiTxVVWHtQsu+3rLNihhEyULW8UgZD4PyVS+0=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289258; c=relaxed/simple;
	bh=PYSlL1i606gycxUg6rXIwlBsnRTwz3am9HkNgCkSRCc=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=T97bqAWQxC7rrpIGTq7ckNp5hf0jTyr762YRp99H13WBME7kJKGTJ1n43O5y8VmuuiOfWWXEC8MFJUfRNWGzXeaXwq4I0IoGp45QTEOF5vjc0H2IWbFU24zW7AOaG1wLlD6HencoVzfNrT8pVbSTKl0fefrozx0kV4GYYk3f0Bc=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=aCNhk919; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="aCNhk919"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:MIME-Version:References:In-Reply-To:
	Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-Type:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=SZrOQk0GM156TvBoVM+zLzXRO2Iy08Gh16ie8a5/0l4=; b=aCNhk9191WRg6r+R2XCSmJGb9N
	XWpYCBqFGqjSuPZAdPk6o4CKReJBXt0pyJ0YNYvbfuErrkbzrlrNhADsUtIYQS9bvj2tS5Uihh81K
	QVvIciHWPeHBse8536TVD6l1Sf/9I3fxdZrKloib+zUmpbYGprwmbA8gDCqriOra8R8qX7Yb8kRzl
	Xzxi42+yhqMJHEx6pJO6/EK/dPBEWr/r63Nget93fY9H/dQmaEgKZJw8Oa4LPzOH2oRXQrPqom484
	Iyi2AOHaVQgtFtbhbHJU1NrCrAgyHdJI1BUu/gFKM38WS1t4FIIi0X7QLB2TFOtlN0jM1aJXqbVp8
	XeYfTlDA==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPN-0000000024Q-0NIZ;
	Wed, 20 May 2026 11:00:29 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 36/40] mm: page_alloc: set ALLOC_NOFRAGMENT on
 alloc_frozen_pages_nolock_noprof
Date: Wed, 20 May 2026 10:59:42 -0400
Message-ID: <20260520150018.2491267-37-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

The no-lock atomic page allocator (alloc_frozen_pages_nolock) bypasses
__alloc_pages_slowpath() entirely and calls get_page_from_freelist()
directly. The slowpath is where ALLOC_NOFRAGMENT is normally set for
non-movable allocations, so the nolock path runs without the "skip
clean SPBs" guard.

Add ALLOC_NOFRAGMENT to the alloc_flags. The function is best-effort
and callers already handle NULL. The flag steers the alloc toward
tainted-SPB sub-pageblock space first; for order > 0 the existing
auto-refuse in the get_page_from_freelist relax sequence returns NULL
when the tainted pool can serve a smaller order, and for order =3D 0
the alloc falls back to claiming a clean SPB only if the tainted
pool is exhausted of suitable fragments.

ALLOC_NOFRAGMENT is purely a pageblock-category steering hint; it
does not depend on __GFP_DIRECT_RECLAIM and does not introduce any
sleeping locks, so it composes cleanly with ALLOC_TRYLOCK in NMI,
hardirq, and PREEMPT_RT contexts.

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 mm/page_alloc.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2791a52b61da..4e45fac14622 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -11525,7 +11525,23 @@ struct page *alloc_frozen_pages_nolock_noprof(gfp_=
t gfp_flags, int nid, unsigned
 	 */
 	gfp_t alloc_gfp =3D __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC | __GFP_=
COMP
 			| gfp_flags;
-	unsigned int alloc_flags =3D ALLOC_TRYLOCK;
+	/*
+	 * ALLOC_NOFRAGMENT steers this best-effort no-lock allocation
+	 * toward tainted-SPB sub-pageblock space before fragmenting any
+	 * clean superpageblock. Without it, atomic kmalloc_nolock callers
+	 * (e.g. BPF storage from sched_process_exec tracepoints) hit the
+	 * fallback path on every PCP miss and convert a movable pageblock
+	 * in a clean SPB to UNMOVABLE -- tainting a 1 GiB hugepage
+	 * candidate for an order-0 atomic alloc. With NOFRAGMENT set:
+	 *   - order > 0: the get_page_from_freelist relax sequence
+	 *     auto-refuses to drop NOFRAGMENT when the tainted pool can
+	 *     serve a smaller order, returning NULL to the caller.
+	 *   - order =3D 0: prefers tainted-SPB fragments first; only falls
+	 *     back to claiming a clean-SPB pageblock if the tainted pool
+	 *     is exhausted of suitable fragments.
+	 * Callers of alloc_pages_nolock() already handle NULL.
+	 */
+	unsigned int alloc_flags =3D ALLOC_TRYLOCK | ALLOC_NOFRAGMENT;
 	struct alloc_context ac =3D { };
 	struct page *page;
=20
--=20
2.54.0
From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 136033F44F0
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:51 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289256; cv=none;
 b=PcB0tDZpZ5y4ffriHODtDjQ2BapdAwQmP0zMSdsQ3Ylmhacp2Snqgx6C5V5uReuqY8SzpraL5s2n8qdEAe1MvmGa74J1q+GQtOOIkimX9dhYf/WrJy2D/zkoxMh12xr01DdkguViE0f+Iwda7vbCfeVqP8iIjDjT2JqSkl+O24g=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289256; c=relaxed/simple;
	bh=1MfIrcRrUVJPvpWutLd4EUqFR1xFf2rn4QP9FHwunrk=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=UTqnnZa1Y/fasqdQ5CZT3A/lzUyPAwSv2omQTCaPhnklySp1Iqm8fN91TpSVJe7jBO5QRv9OiDA1y+wjoroByBBoWJTafMebzCCvxDOOJHTzaMj010LJ5WWVQaejORavLled5XF5zz9WLJZtd4/Ru3ujto9p+PjHndNw42oCRyo=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=iFzmLLq7; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="iFzmLLq7"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:MIME-Version:References:In-Reply-To:
	Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-Type:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=QW4fFEcF7RsWKk8iCSpAqzPzouHeAUq+juaYAvjgTXo=; b=iFzmLLq7qnngkcCaltwJB4Tthr
	T8mODk7BlOY+POMVK0yzvJA+nnYXDdCvUyNuuhzrg6sAp8wP8JwwotdRbYcF6XSiuB9/hkfE2QWGP
	jBAw0chcYeP2GehQJSA18owqSf2zRdecb8xpTnEb/P1bzPjvL+12C2Z+BU5WALcZKz0/ttjEgy5b1
	ZLmc/luYOkJaOyBqMs0bf3mizTPsw2InvuC4FEguf28LlxRq0hRdaUEZhgdra2grumCAvidfKhbpY
	IS0pYR432j1AjwDeNT2P5AQkpsA1fCjc1OO4sJHDw0tuAcjaGXh0RQHptDtL0uG6FNhMSjl6Q0NKl
	Ac820/ZA==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPN-0000000024Q-0ShO;
	Wed, 20 May 2026 11:00:29 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 37/40] mm: page_alloc: move spb_get_category and
 spb_tainted_reserve to mmzone.h
Date: Wed, 20 May 2026 10:59:43 -0400
Message-ID: <20260520150018.2491267-38-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

Pure move with no semantic change.  Both helpers will be needed in
mm/compaction.c by upcoming SPB-skip predicates; keep a single source
of truth for them in <linux/mmzone.h> alongside pfn_to_superpageblock().

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 include/linux/mmzone.h | 30 ++++++++++++++++++++++++++++++
 mm/page_alloc.c        | 27 +++++----------------------
 2 files changed, 35 insertions(+), 22 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c9c248d5b14e..d51dbca59656 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1296,6 +1296,36 @@ static inline struct superpageblock *pfn_to_superpag=
eblock(struct zone *zone,
 	return &zone->superpageblocks[idx];
 }
=20
+/**
+ * spb_get_category - Determine if a superpageblock is clean or tainted
+ * @sb: superpageblock to classify
+ *
+ * Clean means only free and movable pageblocks.  Unmovable, reclaimable
+ * or reserved (memory hole: never a 1GB hugepage) pageblocks taint it.
+ */
+static inline enum sb_category spb_get_category(struct superpageblock *sb)
+{
+	if (sb->nr_unmovable || sb->nr_reclaimable || sb->nr_reserved)
+		return SB_TAINTED;
+	return SB_CLEAN;
+}
+
+/*
+ * Minimum free-pageblock reserve a tainted SPB tries to maintain so the
+ * allocator can satisfy non-movable allocations without tainting fresh
+ * clean SPBs.  Used by spb_tainted_reserve() as a floor for the per-SPB
+ * reserve; the page allocator and compaction both consult
+ * spb_tainted_reserve() to decide when a tainted SPB has so few free
+ * pageblocks left that movable allocations should look elsewhere.
+ * Reserve is total_pageblocks / 32 (~3%, min 4): 16 for a 512-block SPB.
+ */
+#define SPB_TAINTED_RESERVE_MIN	4
+
+static inline u16 spb_tainted_reserve(const struct superpageblock *sb)
+{
+	return max_t(u16, SPB_TAINTED_RESERVE_MIN, sb->total_pageblocks / 32);
+}
+
 enum pgdat_flags {
 	PGDAT_WRITEBACK,		/* reclaim scanning has recently found
 					 * many pages under writeback
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4e45fac14622..62edbdf0c3f3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -673,22 +673,8 @@ void superpageblock_set_has_movable(struct zone *zone,=
 struct page *page)
 }
 #endif /* CONFIG_COMPACTION */
=20
-/**
- * spb_get_category - Determine if a superpageblock is clean or tainted
- * @sb: superpageblock to classify
- *
- * A superpageblock is clean if it contains only free and movable pagebloc=
ks.
- * Any unmovable, reclaimable, or reserved pageblocks make it tainted.
- * Reserved pageblocks (memory holes) taint the superpageblock because it
- * can never be used for 1GB hugepages, making it a better home for
- * unmovable/reclaimable allocations.
- */
-static inline enum sb_category spb_get_category(struct superpageblock *sb)
-{
-	if (sb->nr_unmovable || sb->nr_reclaimable || sb->nr_reserved)
-		return SB_TAINTED;
-	return SB_CLEAN;
-}
+/* spb_get_category() lives in <linux/mmzone.h> for shared use with
+ * mm/compaction.c. */
=20
 /**
  * sb_get_fullness - Determine the fullness bucket for a superpageblock
@@ -2700,13 +2686,10 @@ static void prep_new_page(struct page *page, unsign=
ed int order, gfp_t gfp_flags
  *
  * Scale with SPB size: reserve ~3% of pageblocks (minimum 4).
  * For a 512-pageblock SPB this gives 16 reserved pageblocks.
+ *
+ * SPB_TAINTED_RESERVE_MIN and spb_tainted_reserve() live in
+ * <linux/mmzone.h> for shared use with mm/compaction.c.
  */
-#define SPB_TAINTED_RESERVE_MIN	4
-
-static inline u16 spb_tainted_reserve(const struct superpageblock *sb)
-{
-	return max_t(u16, SPB_TAINTED_RESERVE_MIN, sb->total_pageblocks / 32);
-}
=20
 /*
  * spb_tainted_can_serve_smaller - could a smaller-order @migratetype alloc
--=20
2.54.0
From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id CE6833F1662
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:51 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289255; cv=none;
 b=mVOoiLggdcYEoNZtK2up2rGFT7oz6DXM4IA4PEVzjfaIow5B2qijLmifYPUda7Nmu6aTLV5xRQkplRU77FMKiseMX5rQn/MNGKcrGd/R6vV2fG4nOEowPq+l5k3XLNExSQFWfDsZMMmrL3cP73OlHrqEzHyJNs6ee8kx+qeKvbY=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289255; c=relaxed/simple;
	bh=SPT4Eq7v36unK79Lqd1lxyUobeTeyxLOnxyWUwPZKLQ=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=sUjrS/58wATaSwMyhTUtQ0OUAmzqeEk0pfipbPauRBwPQK6ivCUfSAmndeM0rQSFFQaOSJsuOEfo/LhAYlUrJbMAwzozDhoPZ19Ov+LBWk+YAZ00PD4MXxBA+WyTLYW/YSHxcQUAHJhGNIemi3dWhVJUdniTJ0pQ2WaSU3BnX+4=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=ng/wS2QP; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="ng/wS2QP"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:MIME-Version:References:In-Reply-To:
	Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-Type:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=uXkdeLHggCi+vFtKz5fxgfs3r4RSBpj9N/haZRnpGWk=; b=ng/wS2QPNAoNUjMcCITdZpjGyD
	5FSsZe/eW2L4IzCJ4dBed+n/ZuJgwwDcK5k27IpNliI4efJ0rTMwZsxebn5ApXkaRFEKF2j2fzxXN
	FZGQQQ6hhFLqg6buA34BNF3c6KpNJp6rKfCpHD2+KGcG4KNWZ2kn/2hbXmaFcSEUqwdcqs6AFKe4B
	rML1EbV5LyxbDuBBJ+75iEnBexLzik8nkGdV+r24uNX1sut0FK3H88VCCfEwouZacXecmm48xQiZ5
	1yvZak5ph2d8ppyL7MfvQZ9Ee44A+Aejdh76bUOuKfiRNK2TmJL1tl6ef5N4NlQhYUz06d3BBZBpR
	sXawct+A==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPN-0000000024Q-0ZBm;
	Wed, 20 May 2026 11:00:29 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 38/40] mm: compaction: skip empty tainted superpageblocks
 as migration source
Date: Wed, 20 May 2026 10:59:44 -0400
Message-ID: <20260520150018.2491267-39-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

Tainted superpageblocks with nr_movable =3D=3D 0 hold only unmovable and/or
reclaimable pages.  None are migration candidates (compaction only moves
migratable pages), so scanning them is pure overhead.  Bail out at the
top of isolate_migratepages_block() when the current pageblock falls
inside such an SPB, returning 0 without performing any per-page scan.

The caller (isolate_migratepages) then advances to the next pageblock
and re-enters; each pageblock in the empty tainted SPB takes the same
fast bail, so the whole SPB is effectively skipped without per-page
scan work.  A tainted SPB with nr_movable > 0 still gets scanned
normally, preserving correctness.

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 mm/compaction.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/mm/compaction.c b/mm/compaction.c
index e4ba21072435..f9de52875c88 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -873,6 +873,27 @@ isolate_migratepages_block(struct compact_control *cc,=
 unsigned long low_pfn,
=20
 	cc->migrate_pfn =3D low_pfn;
=20
+	/*
+	 * Fast-path: a tainted SPB with no movable content has nothing to
+	 * migrate from this pageblock.  Bail out immediately and let the
+	 * caller move on to the next pageblock; the caller will re-enter
+	 * this function for each subsequent pageblock in the SPB and bail
+	 * the same way, so the whole SPB is effectively skipped without
+	 * any per-page scan work.
+	 */
+	if (cc->zone->nr_superpageblocks) {
+		struct superpageblock *sb =3D
+			pfn_to_superpageblock(cc->zone, low_pfn);
+
+		if (sb && spb_get_category(sb) =3D=3D SB_TAINTED &&
+		    sb->nr_movable =3D=3D 0) {
+			unsigned long sb_end =3D
+				ALIGN(low_pfn + 1, SUPERPAGEBLOCK_NR_PAGES);
+			cc->migrate_pfn =3D min(sb_end, end_pfn);
+			return 0;
+		}
+	}
+
 	/*
 	 * Ensure that there are not too many pages isolated from the LRU
 	 * list by either parallel reclaimers or compaction. If there are,
--=20
2.54.0
From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 1A0213F44F5
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:50 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289259; cv=none;
 b=C2/iPSTMI9ovt9ANZEYWLe0M6qmPEmvoMoXGILP5LUvNYum5MAef5X5WUuKoF7nvR5o3F1bW1dLDi/6Vo6Q4UJ167ASUJOCDR+EykpR9oxTN+g4lqguXf7ZKbAKfmg2yJgS69zA9zw5cz0QVq/H1hRRLAArYIT/NXKsvZvh3008=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289259; c=relaxed/simple;
	bh=x9eQDprk9ZUCDTUcWiYjtCD1ExQMSHG5JPhC9QQ07dg=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=FGSGLYg3Ons3bMpE7PyPVELSDuCVSOW/vUNfh0gDp0XxkWj3RUVOqJlgTMWQ8zf2wGQXb3DcpPygyMQRv684Rb9q+NshEzcinpzta6ECD03YXM1z0pp8c3jrr7JCtmrblJ8ko4yAQv0xgerCjWaQYMUthqGaQswsvLUU0airAss=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=lQTEdoYg; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="lQTEdoYg"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:MIME-Version:References:In-Reply-To:
	Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-Type:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=jtZKbuOjheumhRbPauF+MORFobu5zhRF12dkuYJiXIw=; b=lQTEdoYgv1COf2hRyTnyxUch/k
	KGfNdIHjMbmbk2ikfos0xrzFi/X+hIGzQcgqiZJ6zh0XJWWfTrYnybAz+S3K9OJjVAcwhf9pJN+IJ
	dIfaTEmk8ghXLoYH0SZEtJtZD2CfEmdWTb31M2rxxQxM6tirwnjvEwy9Y/TbxhcoXiF8WCpb1/SDQ
	QgugPCGH7bKSsL/q7ax8ujNPcGrw8jZpqg8vCnEKSlzhi9A1fUOky9MHc88+YnHVZiDATVGaxX2Rt
	dSDFARfZtDYatuEE4vC0CTQSr0w2j1j7loXjTRv4Y/TdhHTQR8ztvTNFzSHjdUCyykp5jGrb/9+pe
	WhcP0Hfg==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPN-0000000024Q-0eoY;
	Wed, 20 May 2026 11:00:29 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 39/40] mm: compaction: respect tainted SPB reserve in
 destination selection
Date: Wed, 20 May 2026 10:59:45 -0400
Message-ID: <20260520150018.2491267-40-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

Tainted SPBs reserve headroom (per
spb_tainted_reserve()) so the allocator can satisfy non-movable
allocations without tainting fresh clean SPBs.  Compaction's free-page
scanner can erode that reserve by picking destination pages from
tainted-SPB free lists -- exactly the headroom we need for unmovable
demand.

Refuse tainted-SPB pageblocks as compaction destinations once nr_free
is at or below spb_tainted_reserve(sb).  Clean SPBs (no reserve) stay
fully eligible.  This naturally biases compaction toward producing
free pageblocks inside clean SPBs, where they're useful for hugepage
allocation.

Combined with the source-side skip and the kcompactd wake fix, this
is sufficient for kcompactd to handle clean-SPB consolidation alone.

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 mm/compaction.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/mm/compaction.c b/mm/compaction.c
index f9de52875c88..3d1015dffa82 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1503,6 +1503,21 @@ static bool suitable_migration_target(struct compact=
_control *cc,
 	if (cc->ignore_block_suitable)
 		return true;
=20
+	/*
+	 * Tainted superpageblocks reserve some headroom for non-movable
+	 * allocations.  Don't spill compaction migration into that reserve --
+	 * doing so erodes the headroom the allocator was holding to avoid
+	 * tainting fresh clean SPBs.  Clean SPBs (no reserve) stay eligible.
+	 */
+	if (cc->zone->nr_superpageblocks) {
+		struct superpageblock *sb =3D
+			pfn_to_superpageblock(cc->zone, page_to_pfn(page));
+
+		if (sb && spb_get_category(sb) =3D=3D SB_TAINTED &&
+		    sb->nr_free <=3D spb_tainted_reserve(sb))
+			return false;
+	}
+
 	/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
 	if (is_migrate_movable(get_pageblock_migratetype(page)))
 		return true;
--=20
2.54.0
From nobody Sun May 24 23:29:01 2026
Received: from shelob.surriel.com (shelob.surriel.com [96.67.55.147])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id A8D483E9C1F
	for <linux-kernel@vger.kernel.org>; Wed, 20 May 2026 15:00:50 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=96.67.55.147
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1779289258; cv=none;
 b=jEEwlzfyCQIHKzJf07Bj+trjrV6sf5UthOBgTw7+D6hN+T1DAF1SS4p6RoP7Ri2V/Ib1LGlDRw49HczawkzhWGy+/XE0HlaNhnrOGa71bJEzNt1M8/2CQBbBChwZkWVwn6JgzBQhA5nXN+xgiqUIlCOOvwZIhqPTD6mYqaMNiUY=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1779289258; c=relaxed/simple;
	bh=h0CXryZn+tAFkPebXbpscjtPcwgexfOgK+dzOEkel8o=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version:Content-Type;
 b=jtMzxcM/m0zAAYb3mrubVJpcDN0iQDdWwAZWJPLZGC39AvmX5McO5B3ITs/otHoQv9RztSsE/XksPS65ftFodWQdKCeHhkZ8nMpHu+Yvqwp6tRVYA9ov8Gym7MzPF/Go6TjjzD4WINc2dziOkb4jvN53h8m5vFxb05WhaoWmLNk=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com;
 spf=pass smtp.mailfrom=surriel.com;
 dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b=fxwNmRbC; arc=none smtp.client-ip=96.67.55.147
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=surriel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=surriel.com header.i=@surriel.com
 header.b="fxwNmRbC"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=surriel.com
	; s=mail; h=Content-Transfer-Encoding:Content-Type:MIME-Version:References:
	In-Reply-To:Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To:Content-ID:
	Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
	:Resent-Message-ID:List-Id:List-Help:List-Unsubscribe:List-Subscribe:
	List-Post:List-Owner:List-Archive;
	bh=ajxze11Zz5IYegQgvseCccUEAj4dKmOcGzpk63x4DmE=; b=fxwNmRbCWQjMvKt01NI6kuNNEr
	tfMKIS8Nw6jGac5gVnCT7SCNoFYDJtXHduKZ98tBxMMhfIxho5XwMbXzm/pbtg+xsuN1+48Xb7iEN
	K3ju23m4SZwCzSQCCDrNAGwT8GxUVgsrDdKeBsFHCgpykGPrqP+jouQhs8qklDBnzZX5w+a4FubmT
	cZve0qDrMd41FQKRGlwPVkVu1b5M13Ljx+8Uq/3F+EuSQKjaWHi8X0R4FTpQOp10fMjwUJid3ZkCs
	TZ/QmFpa1oNX9P2zFuHikZXPBeGgY0Di8jqUYRPDOHTSTB2MAyErPw0Dlgk4u2vDLFEWsf7sfrWzr
	J+SuuOpQ==;
Received: from fangorn.home.surriel.com ([10.0.13.7])
	by shelob.surriel.com with esmtpsa  (TLS1.2) tls
 TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
	(Exim 4.97.1)
	(envelope-from <riel@surriel.com>)
	id 1wPiPN-0000000024Q-0sb6;
	Wed, 20 May 2026 11:00:29 -0400
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com,
	linux-mm@kvack.org,
	david@kernel.org,
	willy@infradead.org,
	surenb@google.com,
	hannes@cmpxchg.org,
	ljs@kernel.org,
	ziy@nvidia.com,
	usama.arif@linux.dev,
	fvdl@google.com,
	Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 40/40] mm: page_alloc: SPB tracepoint instrumentation
 [DO-NOT-MERGE]
Date: Wed, 20 May 2026 10:59:46 -0400
Message-ID: <20260520150018.2491267-41-riel@surriel.com>
X-Mailer: git-send-email 2.54.0
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
References: <20260520150018.2491267-1-riel@surriel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: quoted-printable

Bundle all SPB anti-fragmentation diagnostic tracepoints into a single
commit so the entire instrumentation can be dropped before upstream
submission.

Tracepoint definitions (include/trace/events/kmem.h):
  - spb_alloc_walk            -- exit point of every __rmqueue_smallest
                                 call with outcome and SPB visit count
  - spb_alloc_fall_through    -- fires when PASS 1/2/2b/2c all failed
                                 and the allocator is about to taint
                                 a fresh clean SPB (PASS 3 / steal)
  - spb_pb_taint              -- every PB_has_<mt> bit transition
  - spb_claim_block_refused   -- try_to_claim_block exits with reason
  - spb_evacuate_for_order_done -- evac phase completion summary
  - spb_alloc_atomic_relax    -- atomic NORETRY relaxation events

Plus #define constant extensions:
  - SPB_ALLOC_OUTCOME_PASS_2D =3D 8 extends the spb_alloc_walk outcome
    set for the cross-MOV borrow path.
  - SPB_ATOMIC_RELAX_NOWARN_LOWER_ORDER =3D 3 extends the
    spb_alloc_atomic_relax step set for the best-effort high-order
    refusal path.

Tracepoint emission scaffolding and call sites (mm/page_alloc.c):
  - alloc_flags parameter on __rmqueue_smallest (plumbed through all
    callers; passed as 0 by callers without an alloc_flags context),
    consumed by the trace_spb_alloc_walk emit
  - n_spbs_visited counter + SPB_WALK_DONE macro in __rmqueue_smallest
  - bool first/last in __spb_set_has_type / __spb_clear_has_type
  - if-stmt brace + trace_spb_claim_block_refused in try_to_claim_block
    early-return paths (isolate, CMA, zone-boundary, noncompat-cross)
  - struct zone *pref + trace_spb_alloc_atomic_relax in slowpath
    NORETRY/NOFRAG-tainted relaxation
  - phase1_attempts/phase2_attempts counters +
    trace_spb_evacuate_for_order_done
  - trace_printk("SB first unmovable/reclaimable") on first-of-type
    transitions per SPB

Designed for diagnostics only; the behavioral commits in this series
provide the SPB anti-fragmentation machinery, this commit is purely
instrumentation.

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 include/trace/events/kmem.h | 373 ++++++++++++++++++++++++++++++++++++
 mm/page_alloc.c             | 154 +++++++++++++--
 2 files changed, 514 insertions(+), 13 deletions(-)

diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index cd7920c81f85..6ca63908a620 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -266,6 +266,379 @@ TRACE_EVENT(mm_page_pcpu_drain,
 		__entry->order, __entry->migratetype)
 );
=20
+/*
+ * spb_pb_taint action encoding.
+ */
+#define SPB_PB_TAINT_ACTION_SET		0   /* set PB_has_<mt> */
+#define SPB_PB_TAINT_ACTION_CLEAR	1   /* clear PB_has_<mt> */
+
+#define show_spb_pb_taint_action(a)				\
+	__print_symbolic(a,					\
+		{ SPB_PB_TAINT_ACTION_SET,	"SET"   },	\
+		{ SPB_PB_TAINT_ACTION_CLEAR,	"CLEAR" })
+
+/*
+ * Per-call tracepoint at every PB_has_<migratetype> bit transition.
+ * Distinct from the existing trace_printk lines (which only fire on
+ * the FIRST 0->1 transition per (SPB, migratetype)) =E2=80=94 this fires =
on
+ * EVERY successful set/clear, and includes a flag for whether this
+ * call also caused a 0<->1 transition at the SPB-level counter
+ * (i.e., is_first_or_last for this (SPB, mt) combination).
+ *
+ * Use to answer "who is painting/clearing PB_has bits and at what
+ * rate?" =E2=80=94 most useful when investigating runaway tainting or when
+ * Stage 1 / sync evac should be clearing bits but isn't.
+ *
+ * High volume: bounded by the rate of PB_has_* bit changes, which
+ * is typically per-allocation. Static-key gated to zero overhead
+ * when detached.
+ */
+TRACE_EVENT(spb_pb_taint,
+
+	TP_PROTO(struct page *page, int migratetype, int action,
+		 bool is_first_or_last),
+
+	TP_ARGS(page, migratetype, action, is_first_or_last),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	pfn			)
+		__field(	int,		migratetype		)
+		__field(	int,		action			)
+		__field(	bool,		is_first_or_last	)
+	),
+
+	TP_fast_assign(
+		__entry->pfn			=3D page_to_pfn(page);
+		__entry->migratetype		=3D migratetype;
+		__entry->action			=3D action;
+		__entry->is_first_or_last	=3D is_first_or_last;
+	),
+
+	TP_printk("pfn=3D0x%lx mt=3D%d action=3D%s first_or_last=3D%d",
+		__entry->pfn,
+		__entry->migratetype,
+		show_spb_pb_taint_action(__entry->action),
+		__entry->is_first_or_last)
+);
+
+/*
+ * spb_claim_block_refused reason encoding.
+ */
+#define SPB_CLAIM_REFUSED_ISOLATE		0
+#define SPB_CLAIM_REFUSED_CMA			1
+#define SPB_CLAIM_REFUSED_ZONE_BOUNDARY		2
+#define SPB_CLAIM_REFUSED_CROSS_TYPE_NOT_FREE	3
+#define SPB_CLAIM_REFUSED_INSUFFICIENT_COMPAT	4
+
+#define show_spb_claim_refused_reason(r)				\
+	__print_symbolic(r,						\
+		{ SPB_CLAIM_REFUSED_ISOLATE,         "ISOLATE"        },	\
+		{ SPB_CLAIM_REFUSED_CMA,             "CMA"            },	\
+		{ SPB_CLAIM_REFUSED_ZONE_BOUNDARY,   "ZONE_BOUNDARY"  },	\
+		{ SPB_CLAIM_REFUSED_CROSS_TYPE_NOT_FREE, "CROSS_TYPE_NOT_FREE" }, \
+		{ SPB_CLAIM_REFUSED_INSUFFICIENT_COMPAT, "INSUFFICIENT_COMPAT" })
+
+/*
+ * Per-refusal tracepoint inside try_to_claim_block. The function can
+ * fail for several reasons: pageblock isolated for evacuation, CMA
+ * pageblock, zone boundary straddle, cross-type relabel that requires
+ * a fully-free PB, or the heuristic threshold that says too few pages
+ * in the block are compatible. Visibility into WHICH reason fires how
+ * often informs Stage 4 design (e.g., is the heuristic gate the
+ * dominant cause of allocations spilling to clean SPBs?).
+ *
+ * Volume: bounded by the rate of fallback attempts, which is rare
+ * compared to total allocations.
+ */
+TRACE_EVENT(spb_claim_block_refused,
+
+	TP_PROTO(struct page *page, int start_type, int block_type,
+		 int reason),
+
+	TP_ARGS(page, start_type, block_type, reason),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	pfn		)
+		__field(	int,		start_type	)
+		__field(	int,		block_type	)
+		__field(	int,		reason		)
+	),
+
+	TP_fast_assign(
+		__entry->pfn		=3D page_to_pfn(page);
+		__entry->start_type	=3D start_type;
+		__entry->block_type	=3D block_type;
+		__entry->reason		=3D reason;
+	),
+
+	TP_printk("pfn=3D0x%lx start_mt=3D%d block_mt=3D%d reason=3D%s",
+		__entry->pfn,
+		__entry->start_type,
+		__entry->block_type,
+		show_spb_claim_refused_reason(__entry->reason))
+);
+
+/*
+ * Per-call tracepoint at the exit of spb_evacuate_for_order, the
+ * synchronous slowpath evacuator called from
+ * __alloc_pages_direct_compact. Captures how many evacuate_pageblock
+ * calls were attempted in each phase:
+ *   - Phase 1: coalesce within existing same-mt pageblocks
+ *   - Phase 2: evacuate whole movable pageblocks to create free PBs
+ *
+ * Together with pgmigrate_success/pgmigrate_fail counter deltas, this
+ * lets us answer "is slowpath sync evacuation actually creating
+ * useful free pageblocks, or are the migrations EAGAINing on busy
+ * ebs?" =E2=80=94 directly informs whether the per-call budget caps need
+ * tuning.
+ *
+ * Low volume: ~one event per direct-compact slowpath visit.
+ */
+TRACE_EVENT(spb_evacuate_for_order_done,
+
+	TP_PROTO(struct zone *zone, unsigned int order, int migratetype,
+		 unsigned int phase1_attempts, unsigned int phase2_attempts,
+		 bool did_evacuate),
+
+	TP_ARGS(zone, order, migratetype, phase1_attempts,
+		phase2_attempts, did_evacuate),
+
+	TP_STRUCT__entry(
+		__string(	name,			zone->name	)
+		__field(	unsigned int,		order		)
+		__field(	int,			migratetype	)
+		__field(	unsigned int,		phase1_attempts	)
+		__field(	unsigned int,		phase2_attempts	)
+		__field(	bool,			did_evacuate	)
+	),
+
+	TP_fast_assign(
+		__assign_str(name);
+		__entry->order			=3D order;
+		__entry->migratetype		=3D migratetype;
+		__entry->phase1_attempts	=3D phase1_attempts;
+		__entry->phase2_attempts	=3D phase2_attempts;
+		__entry->did_evacuate		=3D did_evacuate;
+	),
+
+	TP_printk("zone=3D%s order=3D%u mt=3D%d p1=3D%u p2=3D%u did_evac=3D%d",
+		__get_str(name),
+		__entry->order,
+		__entry->migratetype,
+		__entry->phase1_attempts,
+		__entry->phase2_attempts,
+		__entry->did_evacuate)
+);
+
+/*
+ * spb_alloc_atomic_relax step encoding.
+ */
+#define SPB_ATOMIC_RELAX_NORETRY_SKIP	0   /* NORETRY caller =E2=80=94 retu=
rn NULL */
+#define SPB_ATOMIC_RELAX_ADD_TAINTED_OK	1   /* add ALLOC_NOFRAG_TAINTED_OK=
 retry */
+#define SPB_ATOMIC_RELAX_DROP_NOFRAGMENT 2  /* drop ALLOC_NOFRAGMENT retry=
 */
+#define SPB_ATOMIC_RELAX_NOWARN_LOWER_ORDER 3  /* NOWARN best-effort + tai=
nted has lower order */
+
+#define show_spb_atomic_relax_step(s)					\
+	__print_symbolic(s,						\
+		{ SPB_ATOMIC_RELAX_NORETRY_SKIP,        "NORETRY_SKIP"    }, \
+		{ SPB_ATOMIC_RELAX_ADD_TAINTED_OK,      "ADD_TAINTED_OK"  }, \
+		{ SPB_ATOMIC_RELAX_DROP_NOFRAGMENT,     "DROP_NOFRAGMENT" }, \
+		{ SPB_ATOMIC_RELAX_NOWARN_LOWER_ORDER,  "NOWARN_LOWER_ORDER" })
+
+/*
+ * Per-event tracepoint at each atomic-allocation NOFRAGMENT-relaxation
+ * step in get_page_from_freelist. Captures NORETRY-skip exits (caller
+ * had a fallback so we returned NULL), the NOWARN best-effort refusal,
+ * and the two retries (add NOFRAG_TAINTED_OK; drop NOFRAGMENT entirely).
+ *
+ * Use to quantify how often each step fires under the workload.
+ * Validates the NORETRY-skip change is paying off.
+ *
+ * Volume: only on atomic allocs that exhaust the tainted pool =E2=80=94
+ * typically rare on a healthy system.
+ */
+TRACE_EVENT(spb_alloc_atomic_relax,
+
+	TP_PROTO(struct zone *zone, unsigned int order, int migratetype,
+		 gfp_t gfp_mask, int step),
+
+	TP_ARGS(zone, order, migratetype, gfp_mask, step),
+
+	TP_STRUCT__entry(
+		__string(	name,			zone->name	)
+		__field(	unsigned int,		order		)
+		__field(	int,			migratetype	)
+		__field(	unsigned long,		gfp_mask	)
+		__field(	int,			step		)
+	),
+
+	TP_fast_assign(
+		__assign_str(name);
+		__entry->order		=3D order;
+		__entry->migratetype	=3D migratetype;
+		__entry->gfp_mask	=3D (__force unsigned long)gfp_mask;
+		__entry->step		=3D step;
+	),
+
+	TP_printk("zone=3D%s order=3D%u mt=3D%d gfp=3D%s step=3D%s",
+		__get_str(name),
+		__entry->order,
+		__entry->migratetype,
+		show_gfp_flags(__entry->gfp_mask),
+		show_spb_atomic_relax_step(__entry->step))
+);
+
+/*
+ * spb_alloc_walk outcome encoding. SUCCESS_* values name which Pass
+ * inside __rmqueue_smallest produced the page. NO_PAGE means the
+ * function returned NULL (all passes failed).
+ */
+#define SPB_ALLOC_OUTCOME_NO_PAGE	0
+#define SPB_ALLOC_OUTCOME_PASS_1	1   /* preferred SPBs */
+#define SPB_ALLOC_OUTCOME_PASS_2	2   /* claim_whole_block from tainted */
+#define SPB_ALLOC_OUTCOME_PASS_2B	3   /* sub-PB claim from tainted */
+#define SPB_ALLOC_OUTCOME_PASS_2C	4   /* cross-non-movable borrow */
+#define SPB_ALLOC_OUTCOME_PASS_3	5   /* empty SPB (taints fresh SPB) */
+#define SPB_ALLOC_OUTCOME_PASS_4	6   /* movable falls back to tainted */
+#define SPB_ALLOC_OUTCOME_ZONE_FALLBACK	7  /* zone-level free_area (hotplu=
g edge) */
+#define SPB_ALLOC_OUTCOME_PASS_2D	8   /* cross-MOV borrow within tainted */
+
+#define show_spb_alloc_outcome(o)				\
+	__print_symbolic(o,					\
+		{ SPB_ALLOC_OUTCOME_NO_PAGE,	"NO_PAGE"  },	\
+		{ SPB_ALLOC_OUTCOME_PASS_1,	"PASS_1"   },	\
+		{ SPB_ALLOC_OUTCOME_PASS_2,	"PASS_2"   },	\
+		{ SPB_ALLOC_OUTCOME_PASS_2B,	"PASS_2B"  },	\
+		{ SPB_ALLOC_OUTCOME_PASS_2C,	"PASS_2C"  },	\
+		{ SPB_ALLOC_OUTCOME_PASS_2D,	"PASS_2D"  },	\
+		{ SPB_ALLOC_OUTCOME_PASS_3,	"PASS_3"   },	\
+		{ SPB_ALLOC_OUTCOME_PASS_4,	"PASS_4"   },	\
+		{ SPB_ALLOC_OUTCOME_ZONE_FALLBACK, "ZONE_FB" })
+
+/*
+ * Per-allocation tracepoint at every exit of __rmqueue_smallest.
+ * Captures how many SPBs were walked before the allocation was
+ * satisfied (or determined unsatisfiable).
+ *
+ * Use this to characterize the cost of the linear spb_lists walk:
+ *   - typical walk depth per allocation
+ *   - per-(order, migratetype) walk-depth distribution
+ *   - whether some workloads see pathologically long walks
+ *
+ * High-volume tracepoint (~1 emission per allocation, ~hundreds of
+ * thousands per second on busy systems). The static-key gating in
+ * the caller keeps cost at ~1 ns when the tracepoint is detached.
+ * When attached, expect ~100 ns/event (~10% CPU on a saturated
+ * allocator). Filter by walk depth to reduce volume:
+ *   tracepoint:kmem:spb_alloc_walk /args->n_spbs_visited > 5/ { ... }
+ */
+TRACE_EVENT(spb_alloc_walk,
+
+	TP_PROTO(struct zone *zone, unsigned int order, int migratetype,
+		 unsigned int alloc_flags, int outcome,
+		 unsigned int n_spbs_visited),
+
+	TP_ARGS(zone, order, migratetype, alloc_flags, outcome,
+		n_spbs_visited),
+
+	TP_STRUCT__entry(
+		__string(	name,			zone->name	)
+		__field(	unsigned int,		order		)
+		__field(	int,			migratetype	)
+		__field(	unsigned int,		alloc_flags	)
+		__field(	int,			outcome		)
+		__field(	unsigned int,		n_spbs_visited	)
+	),
+
+	TP_fast_assign(
+		__assign_str(name);
+		__entry->order			=3D order;
+		__entry->migratetype		=3D migratetype;
+		__entry->alloc_flags		=3D alloc_flags;
+		__entry->outcome		=3D outcome;
+		__entry->n_spbs_visited		=3D n_spbs_visited;
+	),
+
+	TP_printk("zone=3D%s order=3D%u mt=3D%d alloc_flags=3D0x%x outcome=3D%s n=
_spbs_visited=3D%u",
+		__get_str(name),
+		__entry->order,
+		__entry->migratetype,
+		__entry->alloc_flags,
+		show_spb_alloc_outcome(__entry->outcome),
+		__entry->n_spbs_visited)
+);
+
+/*
+ * Diagnostic tracepoint fired when __rmqueue_smallest's tainted-SPB
+ * passes (Pass 1/2/2b/2c) all failed and the allocator is about to
+ * fall through to Pass 3 (which may taint a clean SPB) or to the
+ * fallback paths in __rmqueue_claim/__rmqueue_steal.
+ *
+ * Captures enough state to answer "why didn't an existing tainted SPB
+ * absorb this allocation?":
+ *   - n_tainted_with_buddy: count of tainted SPBs whose free_area at
+ *     the requested order has a non-empty free_list of the requested
+ *     migratetype. >0 means buddies WERE available =E2=80=94 Pass 1 missed
+ *     them somehow. 0 means the tainted pool genuinely had nothing at
+ *     the right (order, mt).
+ *   - walk flags: snapshot of struct spb_tainted_walk gathered during
+ *     Pass 1's walk. saw_free_pages =3D any tainted SPB had any free
+ *     pages anywhere; saw_free_pb =3D any tainted SPB had a wholly-free
+ *     pageblock; saw_below_reserve =3D any tainted SPB was at or below
+ *     its reserve threshold.
+ *
+ * Fires once per fall-through event, so volume scales with the rate
+ * at which clean-SPB tainting becomes a possibility =E2=80=94 typically r=
are
+ * once the workload reaches steady state.
+ */
+TRACE_EVENT(spb_alloc_fall_through,
+
+	TP_PROTO(struct zone *zone, unsigned int order, int migratetype,
+		 unsigned int alloc_flags,
+		 unsigned int n_tainted, unsigned int n_tainted_with_buddy,
+		 bool saw_free_pages, bool saw_free_pb,
+		 bool saw_below_reserve),
+
+	TP_ARGS(zone, order, migratetype, alloc_flags,
+		n_tainted, n_tainted_with_buddy,
+		saw_free_pages, saw_free_pb, saw_below_reserve),
+
+	TP_STRUCT__entry(
+		__string(	name,			zone->name		)
+		__field(	unsigned int,		order			)
+		__field(	int,			migratetype		)
+		__field(	unsigned int,		alloc_flags		)
+		__field(	unsigned int,		n_tainted		)
+		__field(	unsigned int,		n_tainted_with_buddy	)
+		__field(	bool,			saw_free_pages		)
+		__field(	bool,			saw_free_pb		)
+		__field(	bool,			saw_below_reserve	)
+	),
+
+	TP_fast_assign(
+		__assign_str(name);
+		__entry->order			=3D order;
+		__entry->migratetype		=3D migratetype;
+		__entry->alloc_flags		=3D alloc_flags;
+		__entry->n_tainted		=3D n_tainted;
+		__entry->n_tainted_with_buddy	=3D n_tainted_with_buddy;
+		__entry->saw_free_pages		=3D saw_free_pages;
+		__entry->saw_free_pb		=3D saw_free_pb;
+		__entry->saw_below_reserve	=3D saw_below_reserve;
+	),
+
+	TP_printk("zone=3D%s order=3D%u mt=3D%d alloc_flags=3D0x%x n_tainted=3D%u=
 n_tainted_with_buddy=3D%u walk=3D[fp=3D%d fpb=3D%d below=3D%d]",
+		__get_str(name),
+		__entry->order,
+		__entry->migratetype,
+		__entry->alloc_flags,
+		__entry->n_tainted,
+		__entry->n_tainted_with_buddy,
+		__entry->saw_free_pages,
+		__entry->saw_free_pb,
+		__entry->saw_below_reserve)
+);
+
 TRACE_EVENT(mm_page_alloc_extfrag,
=20
 	TP_PROTO(struct page *page,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 62edbdf0c3f3..a6cb09273347 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -522,18 +522,39 @@ static void __spb_set_has_type(struct page *page, int=
 migratetype)
 		return;
=20
 	if (!get_pfnblock_bit(page, pfn, bit)) {
+		bool first =3D false;
+
 		set_pfnblock_bit(page, pfn, bit);
 		switch (bit) {
 		case PB_has_unmovable:
 			sb->nr_unmovable++;
+			first =3D (sb->nr_unmovable =3D=3D 1);
+			if (first)
+				trace_printk("SB first unmovable: zone=3D%s sb=3D%lu pfn=3D%lu mt=3D%d=
 rsv=3D%u mov=3D%u recl=3D%u free=3D%u\n",
+					     sb->zone->name,
+					     (unsigned long)(sb - sb->zone->superpageblocks),
+					     pfn, migratetype,
+					     sb->nr_reserved, sb->nr_movable,
+					     sb->nr_reclaimable, sb->nr_free);
 			break;
 		case PB_has_reclaimable:
 			sb->nr_reclaimable++;
+			first =3D (sb->nr_reclaimable =3D=3D 1);
+			if (first)
+				trace_printk("SB first reclaimable: zone=3D%s sb=3D%lu pfn=3D%lu mt=3D=
%d rsv=3D%u mov=3D%u unmov=3D%u free=3D%u\n",
+					     sb->zone->name,
+					     (unsigned long)(sb - sb->zone->superpageblocks),
+					     pfn, migratetype,
+					     sb->nr_reserved, sb->nr_movable,
+					     sb->nr_unmovable, sb->nr_free);
 			break;
 		case PB_has_movable:
 			sb->nr_movable++;
+			first =3D (sb->nr_movable =3D=3D 1);
 			break;
 		}
+		trace_spb_pb_taint(page, migratetype,
+				   SPB_PB_TAINT_ACTION_SET, first);
 		spb_debug_check(sb, "__spb_set_has_type");
 	}
 }
@@ -557,21 +578,28 @@ static void __spb_clear_has_type(struct page *page, i=
nt migratetype)
 		return;
=20
 	if (get_pfnblock_bit(page, pfn, bit)) {
+		bool last =3D false;
+
 		clear_pfnblock_bit(page, pfn, bit);
 		switch (bit) {
 		case PB_has_unmovable:
 			if (sb->nr_unmovable)
 				sb->nr_unmovable--;
+			last =3D (sb->nr_unmovable =3D=3D 0);
 			break;
 		case PB_has_reclaimable:
 			if (sb->nr_reclaimable)
 				sb->nr_reclaimable--;
+			last =3D (sb->nr_reclaimable =3D=3D 0);
 			break;
 		case PB_has_movable:
 			if (sb->nr_movable)
 				sb->nr_movable--;
+			last =3D (sb->nr_movable =3D=3D 0);
 			break;
 		}
+		trace_spb_pb_taint(page, migratetype,
+				   SPB_PB_TAINT_ACTION_CLEAR, last);
 		spb_debug_check(sb, "__spb_clear_has_type");
 	}
 }
@@ -3037,7 +3065,8 @@ static struct page *try_alloc_from_sb_pass1(struct zo=
ne *zone,
=20
 static __always_inline
 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
-				int migratetype, struct spb_tainted_walk *walk)
+				int migratetype, unsigned int alloc_flags,
+				struct spb_tainted_walk *walk)
 {
 	unsigned int current_order;
 	struct free_area *area;
@@ -3045,6 +3074,17 @@ struct page *__rmqueue_smallest(struct zone *zone, u=
nsigned int order,
 	int full;
 	struct superpageblock *sb;
 	int opposite_mt;
+	/*
+	 * Diagnostic counter for the spb_alloc_walk tracepoint. Counts how
+	 * many SPBs were visited (across all Passes) before this allocation
+	 * succeeded or fell through. Used to characterize the cost of the
+	 * linear spb_lists walk and identify pathological cases.
+	 */
+	unsigned int n_spbs_visited =3D 0;
+
+#define SPB_WALK_DONE(_outcome) \
+	trace_spb_alloc_walk(zone, order, migratetype, alloc_flags, \
+			     (_outcome), n_spbs_visited)
 	/*
 	 * Category search order: 2 passes.
 	 * Movable: clean first, then tainted (pack into clean SBs).
@@ -3088,6 +3128,7 @@ struct page *__rmqueue_smallest(struct zone *zone, un=
signed int order,
 				    migratetype,
 				    pcp_allowed_order(order) &&
 				    migratetype < MIGRATE_PCPTYPES);
+				SPB_WALK_DONE(SPB_ALLOC_OUTCOME_PASS_1);
 				return page;
 			}
 		}
@@ -3103,6 +3144,7 @@ struct page *__rmqueue_smallest(struct zone *zone, un=
signed int order,
 				    migratetype,
 				    pcp_allowed_order(order) &&
 				    migratetype < MIGRATE_PCPTYPES);
+				SPB_WALK_DONE(SPB_ALLOC_OUTCOME_PASS_1);
 				return page;
 			}
 		}
@@ -3139,6 +3181,7 @@ struct page *__rmqueue_smallest(struct zone *zone, un=
signed int order,
=20
 		list_for_each_entry(sb,
 			&zone->spb_lists[cat][full], list) {
+			n_spbs_visited++;
 			/*
 			 * Snapshot tainted-SPB capacity before the
 			 * nr_free_pages skip: an SPB with a free pageblock
@@ -3173,6 +3216,7 @@ struct page *__rmqueue_smallest(struct zone *zone, un=
signed int order,
 					page, order, migratetype,
 					pcp_allowed_order(order) &&
 					migratetype < MIGRATE_PCPTYPES);
+				SPB_WALK_DONE(SPB_ALLOC_OUTCOME_PASS_1);
 				if (migratetype < MIGRATE_PCPTYPES) {
 					struct spb_warm_hint_slot *slot;
=20
@@ -3203,6 +3247,7 @@ struct page *__rmqueue_smallest(struct zone *zone, un=
signed int order,
 						page, order, migratetype,
 						pcp_allowed_order(order) &&
 						migratetype < MIGRATE_PCPTYPES);
+					SPB_WALK_DONE(SPB_ALLOC_OUTCOME_PASS_1);
 					if (migratetype < MIGRATE_PCPTYPES) {
 						struct spb_warm_hint_slot *slot;
=20
@@ -3234,6 +3279,7 @@ struct page *__rmqueue_smallest(struct zone *zone, un=
signed int order,
 		for (full =3D SB_FULL; full < __NR_SB_FULLNESS; full++) {
 			list_for_each_entry(sb,
 				&zone->spb_lists[SB_TAINTED][full], list) {
+				n_spbs_visited++;
 				if (!sb->nr_free)
 					continue;
 				for (current_order =3D max_t(unsigned int,
@@ -3258,6 +3304,7 @@ struct page *__rmqueue_smallest(struct zone *zone, un=
signed int order,
 						page, order, migratetype,
 						pcp_allowed_order(order) &&
 						migratetype < MIGRATE_PCPTYPES);
+					SPB_WALK_DONE(SPB_ALLOC_OUTCOME_PASS_2);
 					return page;
 				}
 			}
@@ -3268,6 +3315,7 @@ struct page *__rmqueue_smallest(struct zone *zone, un=
signed int order,
 				&zone->spb_lists[SB_TAINTED][full], list) {
 				int co;
=20
+				n_spbs_visited++;
 				if (!sb->nr_free_pages)
 					continue;
 				for (co =3D min_t(int, pageblock_order - 1,
@@ -3296,6 +3344,7 @@ struct page *__rmqueue_smallest(struct zone *zone, un=
signed int order,
 						page, order, migratetype,
 						pcp_allowed_order(order) &&
 						migratetype < MIGRATE_PCPTYPES);
+					SPB_WALK_DONE(SPB_ALLOC_OUTCOME_PASS_2B);
 					return page;
 				}
 			}
@@ -3353,6 +3402,7 @@ struct page *__rmqueue_smallest(struct zone *zone, un=
signed int order,
 					&zone->spb_lists[SB_TAINTED][full], list) {
 					int co;
=20
+					n_spbs_visited++;
 					if (!sb->nr_free_pages)
 						continue;
 					for (co =3D min_t(int, pageblock_order - 1,
@@ -3380,6 +3430,7 @@ struct page *__rmqueue_smallest(struct zone *zone, un=
signed int order,
 							page, order, migratetype,
 							pcp_allowed_order(order) &&
 							migratetype < MIGRATE_PCPTYPES);
+						SPB_WALK_DONE(SPB_ALLOC_OUTCOME_PASS_2C);
 						return page;
 					}
 				}
@@ -3425,6 +3476,7 @@ struct page *__rmqueue_smallest(struct zone *zone, un=
signed int order,
 					&zone->spb_lists[SB_TAINTED][full], list) {
 					int co;
=20
+					n_spbs_visited++;
 					if (!sb->nr_free_pages)
 						continue;
 					for (co =3D min_t(int, pageblock_order - 1,
@@ -3452,6 +3504,7 @@ struct page *__rmqueue_smallest(struct zone *zone, un=
signed int order,
 							page, order, migratetype,
 							pcp_allowed_order(order) &&
 							migratetype < MIGRATE_PCPTYPES);
+						SPB_WALK_DONE(SPB_ALLOC_OUTCOME_PASS_2D);
 						return page;
 					}
 				}
@@ -3494,8 +3547,40 @@ struct page *__rmqueue_smallest(struct zone *zone, u=
nsigned int order,
 		}
 	}
=20
+	/*
+	 * Diagnostic: capture per-fall-through state so we can answer
+	 * "why didn't an existing tainted SPB absorb this allocation?".
+	 * The count loop walks the tainted-SPB lists looking for any SPB
+	 * with a free buddy at the requested (order, migratetype). >0
+	 * means buddies were available -- Pass 1 missed them. 0 means
+	 * the tainted pool genuinely had nothing usable. Loop is bounded
+	 * by the number of tainted SPBs and runs only on the slow path
+	 * (this is the fall-through to Pass 3/Pass 4). Skipped when no walk
+	 * state was passed or the tracepoint is inactive: no production cost.
+	 */
+	if (walk && trace_spb_alloc_fall_through_enabled()) {
+		unsigned int n_tainted =3D 0, n_with_buddy =3D 0;
+
+		for (full =3D SB_FULL; full < __NR_SB_FULLNESS; full++) {
+			list_for_each_entry(sb,
+				&zone->spb_lists[SB_TAINTED][full], list) {
+				n_tainted++;
+				if (!list_empty(
+				    &sb->free_area[order].free_list[migratetype]))
+					n_with_buddy++;
+			}
+		}
+		trace_spb_alloc_fall_through(zone, order, migratetype,
+					     alloc_flags,
+					     n_tainted, n_with_buddy,
+					     walk->saw_free_pages,
+					     walk->saw_free_pb,
+					     walk->saw_below_reserve);
+	}
+
 	/* Pass 3: whole pageblock from empty superpageblocks */
 	list_for_each_entry(sb, &zone->spb_empty, list) {
+		n_spbs_visited++;
 		if (!sb->nr_free_pages)
 			continue;
 		for (current_order =3D max(order, pageblock_order);
@@ -3511,6 +3596,7 @@ struct page *__rmqueue_smallest(struct zone *zone, un=
signed int order,
 				migratetype,
 				pcp_allowed_order(order) &&
 				migratetype < MIGRATE_PCPTYPES);
+			SPB_WALK_DONE(SPB_ALLOC_OUTCOME_PASS_3);
 			return page;
 		}
 	}
@@ -3529,6 +3615,7 @@ struct page *__rmqueue_smallest(struct zone *zone, un=
signed int order,
=20
 			list_for_each_entry(sb,
 				&zone->spb_lists[cat][full], list) {
+				n_spbs_visited++;
 				if (!sb->nr_free_pages)
 					continue;
 				/*
@@ -3553,6 +3640,7 @@ struct page *__rmqueue_smallest(struct zone *zone, un=
signed int order,
 						page, order, migratetype,
 						pcp_allowed_order(order) &&
 						migratetype < MIGRATE_PCPTYPES);
+					SPB_WALK_DONE(SPB_ALLOC_OUTCOME_PASS_4);
 					return page;
 				}
 			}
@@ -3577,10 +3665,13 @@ struct page *__rmqueue_smallest(struct zone *zone, =
unsigned int order,
 		trace_mm_page_alloc_zone_locked(page, order, migratetype,
 				pcp_allowed_order(order) &&
 				migratetype < MIGRATE_PCPTYPES);
+		SPB_WALK_DONE(SPB_ALLOC_OUTCOME_ZONE_FALLBACK);
 		return page;
 	}
=20
+	SPB_WALK_DONE(SPB_ALLOC_OUTCOME_NO_PAGE);
 	return NULL;
+#undef SPB_WALK_DONE
 }
=20
=20
@@ -3617,7 +3708,7 @@ static inline bool noncompatible_cross_type(int start=
_type, int fallback_type)
 static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zo=
ne,
 					unsigned int order)
 {
-	return __rmqueue_smallest(zone, order, MIGRATE_CMA, NULL);
+	return __rmqueue_smallest(zone, order, MIGRATE_CMA, 0, NULL);
 }
 #else
 static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
@@ -3999,8 +4090,11 @@ try_to_claim_block(struct zone *zone, struct page *p=
age,
 	 * Don't steal from pageblocks that are isolated for
 	 * evacuation -- that would undo the work in progress.
 	 */
-	if (get_pageblock_isolate(page))
+	if (get_pageblock_isolate(page)) {
+		trace_spb_claim_block_refused(page, start_type, block_type,
+					      SPB_CLAIM_REFUSED_ISOLATE);
 		return NULL;
+	}
=20
 	/*
 	 * Never steal from CMA pageblocks.  CMA pages freed through
@@ -4009,8 +4103,11 @@ try_to_claim_block(struct zone *zone, struct page *p=
age,
 	 * fallback search.  Stealing would corrupt CMA by changing
 	 * the pageblock type away from MIGRATE_CMA.
 	 */
-	if (is_migrate_cma(get_pageblock_migratetype(page)))
+	if (is_migrate_cma(get_pageblock_migratetype(page))) {
+		trace_spb_claim_block_refused(page, start_type, block_type,
+					      SPB_CLAIM_REFUSED_CMA);
 		return NULL;
+	}
=20
 	/* Take ownership for orders >=3D pageblock_order */
 	if (current_order >=3D pageblock_order)
@@ -4019,8 +4116,11 @@ try_to_claim_block(struct zone *zone, struct page *p=
age,
=20
 	/* moving whole block can fail due to zone boundary conditions */
 	if (!prep_move_freepages_block(zone, page, &start_pfn, &free_pages,
-				       &movable_pages))
+				       &movable_pages)) {
+		trace_spb_claim_block_refused(page, start_type, block_type,
+					      SPB_CLAIM_REFUSED_ZONE_BOUNDARY);
 		return NULL;
+	}
=20
 	/*
 	 * Determine how many pages are compatible with our allocation.
@@ -4059,11 +4159,17 @@ try_to_claim_block(struct zone *zone, struct page *=
page,
 	 * the SPB is tainted.
 	 */
 	if (noncompatible_cross_type(start_type, block_type)) {
-		if (free_pages !=3D pageblock_nr_pages)
+		if (free_pages !=3D pageblock_nr_pages) {
+			trace_spb_claim_block_refused(page, start_type,
+				block_type,
+				SPB_CLAIM_REFUSED_CROSS_TYPE_NOT_FREE);
 			return NULL;
+		}
 	} else if (!from_tainted_spb &&
 		   free_pages + alike_pages < (1 << (pageblock_order-1)) &&
 		   !page_group_by_mobility_disabled) {
+		trace_spb_claim_block_refused(page, start_type, block_type,
+			SPB_CLAIM_REFUSED_INSUFFICIENT_COMPAT);
 		return NULL;
 	}
=20
@@ -4092,7 +4198,7 @@ try_to_claim_block(struct zone *zone, struct page *pa=
ge,
 	if (sb)
 		spb_update_list(sb);
 #endif
-	return __rmqueue_smallest(zone, order, start_type, NULL);
+	return __rmqueue_smallest(zone, order, start_type, 0, NULL);
 }
=20
 /*
@@ -4493,7 +4599,8 @@ __rmqueue(struct zone *zone, unsigned int order, int =
migratetype,
 	 */
 	switch (*mode) {
 	case RMQUEUE_NORMAL:
-		page =3D __rmqueue_smallest(zone, order, migratetype, walkp);
+		page =3D __rmqueue_smallest(zone, order, migratetype,
+					  alloc_flags, walkp);
 		if (page)
 			return page;
 		/*
@@ -5632,7 +5739,8 @@ struct page *rmqueue_buddy(struct zone *preferred_zon=
e, struct zone *zone,
 		}
 		if (alloc_flags & ALLOC_HIGHATOMIC)
 			page =3D __rmqueue_smallest(zone, order,
-						  MIGRATE_HIGHATOMIC, NULL);
+						  MIGRATE_HIGHATOMIC,
+						  alloc_flags, NULL);
 		if (!page) {
 			enum rmqueue_mode rmqm =3D RMQUEUE_NORMAL;
=20
@@ -5647,7 +5755,7 @@ struct page *rmqueue_buddy(struct zone *preferred_zon=
e, struct zone *zone,
 			if (!page && (alloc_flags & (ALLOC_OOM|ALLOC_NON_BLOCK)))
 				page =3D __rmqueue_smallest(zone, order,
 							  MIGRATE_HIGHATOMIC,
-							  NULL);
+							  alloc_flags, NULL);
=20
 			if (!page) {
 				spin_unlock_irqrestore(&zone->lock, flags);
@@ -6383,8 +6491,12 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int =
order, int alloc_flags,
 	    !(gfp_mask & __GFP_DIRECT_RECLAIM)) {
 		struct zone *pref =3D zonelist_zone(ac->preferred_zoneref);
=20
-		if (gfp_mask & __GFP_NORETRY)
+		if (gfp_mask & __GFP_NORETRY) {
+			trace_spb_alloc_atomic_relax(pref, order,
+				ac->migratetype, gfp_mask,
+				SPB_ATOMIC_RELAX_NORETRY_SKIP);
 			return NULL;
+		}
=20
 		/*
 		 * Best-effort high-order callers convention: stripping
@@ -6407,13 +6519,22 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int=
 order, int alloc_flags,
 		if (order > 0 && (gfp_mask & __GFP_NOWARN) &&
 		    !(gfp_mask & __GFP_NOFAIL) &&
 		    spb_tainted_can_serve_smaller(pref, order,
-						  ac->migratetype))
+						  ac->migratetype)) {
+			trace_spb_alloc_atomic_relax(pref, order,
+				ac->migratetype, gfp_mask,
+				SPB_ATOMIC_RELAX_NOWARN_LOWER_ORDER);
 			return NULL;
-
+		}
 		if (!(alloc_flags & ALLOC_NOFRAG_TAINTED_OK)) {
+			trace_spb_alloc_atomic_relax(pref, order,
+				ac->migratetype, gfp_mask,
+				SPB_ATOMIC_RELAX_ADD_TAINTED_OK);
 			alloc_flags |=3D ALLOC_NOFRAG_TAINTED_OK;
 			goto retry;
 		}
+		trace_spb_alloc_atomic_relax(pref, order,
+			ac->migratetype, gfp_mask,
+			SPB_ATOMIC_RELAX_DROP_NOFRAGMENT);
 		alloc_flags &=3D ~(ALLOC_NOFRAGMENT | ALLOC_NOFRAG_TAINTED_OK);
 		goto retry;
 	}
@@ -10317,6 +10438,13 @@ static bool spb_evacuate_for_order(struct zone *zo=
ne, unsigned int order,
 	 */
 	queue_spb_slab_shrink(zone);
=20
+	/*
+	 * The tracepoint signature retains phase1_attempts / phase2_attempts
+	 * for ABI continuity with existing observers; report the merged total
+	 * in phase1_attempts and 0 in phase2_attempts.
+	 */
+	trace_spb_evacuate_for_order_done(zone, order, migratetype,
+			attempts, 0, did_evacuate);
 	return did_evacuate;
 }
 #endif /* CONFIG_COMPACTION */
--=20
2.54.0