There are many places in the kernel where we need to zeroout larger
chunks but the maximum segment we can zeroout at a time by ZERO_PAGE
is limited by PAGE_SIZE.
This is especially annoying in block devices and filesystems where we
attach multiple ZERO_PAGEs to the bio in different bvecs. With multipage
bvec support in block layer, it is much more efficient to send out
larger zero pages as a part of single bvec.
This concern was raised during the review of adding LBS support to
XFS[1][2].
Usually huge_zero_folio is allocated on demand, and it will be
deallocated by the shrinker if there are no users of it left.
Add a config option STATIC_PMD_ZERO_PAGE that will always allocate
the huge_zero_folio in .bss, and it will never be freed. This makes using the
huge_zero_folio without having to pass any mm struct and call put_folio
in the destructor.
As STATIC_PMD_ZERO_PAGE does not depend on THP, declare huge_zero_folio
and huge_zero_pfn outside of the THP ifdef.
It can only be enabled from x86_64, but it is an optional config. We
could expand it more architectures in the future.
[1] https://lore.kernel.org/linux-xfs/20231027051847.GA7885@lst.de/
[2] https://lore.kernel.org/linux-xfs/ZitIK5OnR7ZNY0IG@infradead.org/
Suggested-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
---
Questions:
- Can we call __split_huge_zero_page_pmd() on static PMD page?
arch/x86/Kconfig | 1 +
arch/x86/include/asm/pgtable.h | 8 ++++++++
arch/x86/kernel/head_64.S | 8 ++++++++
include/linux/mm.h | 16 +++++++++++++++-
mm/Kconfig | 13 +++++++++++++
mm/huge_memory.c | 24 ++++++++++++++++++++----
mm/memory.c | 19 +++++++++++++++++++
7 files changed, 84 insertions(+), 5 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 340e5468980e..c3a9d136ec0a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -153,6 +153,7 @@ config X86
select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP if X86_64
select ARCH_WANT_HUGETLB_VMEMMAP_PREINIT if X86_64
select ARCH_WANTS_THP_SWAP if X86_64
+ select ARCH_HAS_STATIC_PMD_ZERO_PAGE if X86_64
select ARCH_HAS_PARANOID_L1D_FLUSH
select ARCH_WANT_IRQS_OFF_ACTIVATE_MM
select BUILDTIME_TABLE_SORT
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 774430c3abff..7013a7d26da5 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -47,6 +47,14 @@ void ptdump_walk_user_pgd_level_checkwx(void);
#define debug_checkwx_user() do { } while (0)
#endif
+#ifdef CONFIG_STATIC_PMD_ZERO_PAGE
+/*
+ * PMD_ZERO_PAGE is a global shared PMD page that is always zero.
+ */
+extern unsigned long empty_pmd_zero_page[(PMD_SIZE) / sizeof(unsigned long)]
+ __visible;
+#endif
+
/*
* ZERO_PAGE is a global shared page that is always zero: used
* for zero-mapped memory areas etc..
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 3e9b3a3bd039..86aaa53fd619 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -714,6 +714,14 @@ EXPORT_SYMBOL(phys_base)
#include "../xen/xen-head.S"
__PAGE_ALIGNED_BSS
+
+#ifdef CONFIG_STATIC_PMD_ZERO_PAGE
+SYM_DATA_START_PAGE_ALIGNED(empty_pmd_zero_page)
+ .skip PMD_SIZE
+SYM_DATA_END(empty_pmd_zero_page)
+EXPORT_SYMBOL(empty_pmd_zero_page)
+#endif
+
SYM_DATA_START_PAGE_ALIGNED(empty_zero_page)
.skip PAGE_SIZE
SYM_DATA_END(empty_zero_page)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c8fbeaacf896..b20d60d68b3c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4018,10 +4018,10 @@ static inline bool vma_is_special_huge(const struct vm_area_struct *vma)
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern struct folio *huge_zero_folio;
extern unsigned long huge_zero_pfn;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline bool is_huge_zero_folio(const struct folio *folio)
{
return READ_ONCE(huge_zero_folio) == folio;
@@ -4032,9 +4032,23 @@ static inline bool is_huge_zero_pmd(pmd_t pmd)
return pmd_present(pmd) && READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd);
}
+#ifdef CONFIG_STATIC_PMD_ZERO_PAGE
+static inline struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
+{
+ return READ_ONCE(huge_zero_folio);
+}
+
+static inline void mm_put_huge_zero_folio(struct mm_struct *mm)
+{
+ return;
+}
+
+#else
struct folio *mm_get_huge_zero_folio(struct mm_struct *mm);
void mm_put_huge_zero_folio(struct mm_struct *mm);
+#endif /* CONFIG_STATIC_PMD_ZERO_PAGE */
+
#else
static inline bool is_huge_zero_folio(const struct folio *folio)
{
diff --git a/mm/Kconfig b/mm/Kconfig
index 781be3240e21..fd1c51995029 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -826,6 +826,19 @@ config ARCH_WANTS_THP_SWAP
config MM_ID
def_bool n
+config ARCH_HAS_STATIC_PMD_ZERO_PAGE
+ def_bool n
+
+config STATIC_PMD_ZERO_PAGE
+ bool "Allocate a PMD page for zeroing"
+ depends on ARCH_HAS_STATIC_PMD_ZERO_PAGE
+ help
+ Typically huge_zero_folio, which is a PMD page of zeroes, is allocated
+ on demand and deallocated when not in use. This option will
+ allocate a PMD sized zero page in .bss and huge_zero_folio will
+ use it instead allocating dynamically.
+ Not suitable for memory constrained systems.
+
menuconfig TRANSPARENT_HUGEPAGE
bool "Transparent Hugepage Support"
depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 101b67ab2eb6..c12ca7134e88 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -75,9 +75,6 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
struct shrink_control *sc);
static bool split_underused_thp = true;
-static atomic_t huge_zero_refcount;
-struct folio *huge_zero_folio __read_mostly;
-unsigned long huge_zero_pfn __read_mostly = ~0UL;
unsigned long huge_anon_orders_always __read_mostly;
unsigned long huge_anon_orders_madvise __read_mostly;
unsigned long huge_anon_orders_inherit __read_mostly;
@@ -208,6 +205,23 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
return orders;
}
+#ifdef CONFIG_STATIC_PMD_ZERO_PAGE
+static int huge_zero_page_shrinker_init(void)
+{
+ return 0;
+}
+
+static void huge_zero_page_shrinker_exit(void)
+{
+ return;
+}
+#else
+
+static struct shrinker *huge_zero_page_shrinker;
+static atomic_t huge_zero_refcount;
+struct folio *huge_zero_folio __read_mostly;
+unsigned long huge_zero_pfn __read_mostly = ~0UL;
+
static bool get_huge_zero_page(void)
{
struct folio *zero_folio;
@@ -288,7 +302,6 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
return 0;
}
-static struct shrinker *huge_zero_page_shrinker;
static int huge_zero_page_shrinker_init(void)
{
huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
@@ -307,6 +320,7 @@ static void huge_zero_page_shrinker_exit(void)
return;
}
+#endif
#ifdef CONFIG_SYSFS
static ssize_t enabled_show(struct kobject *kobj,
@@ -2843,6 +2857,8 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
pte_t *pte;
int i;
+ // FIXME: can this be called with static zero page?
+ VM_BUG_ON(IS_ENABLED(CONFIG_STATIC_PMD_ZERO_PAGE));
/*
* Leave pmd empty until pte is filled note that it is fine to delay
* notification until mmu_notifier_invalidate_range_end() as we are
diff --git a/mm/memory.c b/mm/memory.c
index 8eba595056fe..77721f5ae043 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -159,6 +159,25 @@ static int __init init_zero_pfn(void)
}
early_initcall(init_zero_pfn);
+#ifdef CONFIG_STATIC_PMD_ZERO_PAGE
+struct folio *huge_zero_folio __read_mostly;
+unsigned long huge_zero_pfn __read_mostly = ~0UL;
+
+static int __init init_pmd_zero_pfn(void)
+{
+ huge_zero_folio = virt_to_folio(empty_pmd_zero_page);
+ huge_zero_pfn = page_to_pfn(virt_to_page(empty_pmd_zero_page));
+
+ __folio_set_head(huge_zero_folio);
+ prep_compound_head((struct page *)huge_zero_folio, PMD_ORDER);
+ /* Ensure zero folio won't have large_rmappable flag set. */
+ folio_clear_large_rmappable(huge_zero_folio);
+
+ return 0;
+}
+early_initcall(init_pmd_zero_pfn);
+#endif
+
void mm_trace_rss_stat(struct mm_struct *mm, int member)
{
trace_rss_stat(mm, member);
--
2.49.0
Hello,
kernel test robot noticed "WARNING:at_mm/gup.c:#try_grab_folio" on:
commit: 8e628a9d6cc5c377ae06b7821f8280cd6ff2a20f ("[PATCH 3/5] mm: add static PMD zero page")
url: https://github.com/intel-lab-lkp/linux/commits/Pankaj-Raghav/mm-move-huge_zero_page-declaration-from-huge_mm-h-to-mm-h/20250612-185248
patch link: https://lore.kernel.org/all/20250612105100.59144-4-p.raghav@samsung.com/
patch subject: [PATCH 3/5] mm: add static PMD zero page
in testcase: trinity
version: trinity-x86_64-ba2360ed-1_20241228
with following parameters:
runtime: 300s
group: group-03
nr_groups: 5
config: x86_64-randconfig-077-20250618
compiler: clang-20
test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 16G
(please refer to attached dmesg/kmsg for entire log/backtrace)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <oliver.sang@intel.com>
| Closes: https://lore.kernel.org/oe-lkp/202506201441.2f96266-lkp@intel.com
[ 379.105772][ T4274] ------------[ cut here ]------------
[ 379.107617][ T4274] WARNING: CPU: 0 PID: 4274 at mm/gup.c:148 try_grab_folio (mm/gup.c:148 (discriminator 12))
[ 379.109660][ T4274] Modules linked in:
[ 379.111018][ T4274] CPU: 0 UID: 65534 PID: 4274 Comm: trinity-c3 Not tainted 6.16.0-rc1-00003-g8e628a9d6cc5 #1 PREEMPT(voluntary)
[ 379.113741][ T4274] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
[ 379.116285][ T4274] RIP: 0010:try_grab_folio (mm/gup.c:148 (discriminator 12))
[ 379.117678][ T4274] Code: 00 48 01 1d 6f 95 3f 0b 48 c7 c7 38 95 55 8f be 08 00 00 00 e8 76 98 0f 00 48 01 1d 47 08 ac 0d e9 e4 fe ff ff e8 c5 2c cd ff <0f> 0b b8 f4 ff ff ff e9 d5 fe ff ff 44 89 f1 80 e1 07 80 c1 03 38
All code
========
0: 00 48 01 add %cl,0x1(%rax)
3: 1d 6f 95 3f 0b sbb $0xb3f956f,%eax
8: 48 c7 c7 38 95 55 8f mov $0xffffffff8f559538,%rdi
f: be 08 00 00 00 mov $0x8,%esi
14: e8 76 98 0f 00 call 0xf988f
19: 48 01 1d 47 08 ac 0d add %rbx,0xdac0847(%rip) # 0xdac0867
20: e9 e4 fe ff ff jmp 0xffffffffffffff09
25: e8 c5 2c cd ff call 0xffffffffffcd2cef
2a:* 0f 0b ud2 <-- trapping instruction
2c: b8 f4 ff ff ff mov $0xfffffff4,%eax
31: e9 d5 fe ff ff jmp 0xffffffffffffff0b
36: 44 89 f1 mov %r14d,%ecx
39: 80 e1 07 and $0x7,%cl
3c: 80 c1 03 add $0x3,%cl
3f: 38 .byte 0x38
Code starting with the faulting instruction
===========================================
0: 0f 0b ud2
2: b8 f4 ff ff ff mov $0xfffffff4,%eax
7: e9 d5 fe ff ff jmp 0xfffffffffffffee1
c: 44 89 f1 mov %r14d,%ecx
f: 80 e1 07 and $0x7,%cl
12: 80 c1 03 add $0x3,%cl
15: 38 .byte 0x38
[ 379.122288][ T4274] RSP: 0018:ffffc90003eafc00 EFLAGS: 00010246
[ 379.123803][ T4274] RAX: 0000000000000000 RBX: 0000000000000001 RCX: 0000000000000000
[ 379.125678][ T4274] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
[ 379.127640][ T4274] RBP: 0000000000210008 R08: 0000000000000000 R09: 0000000000000000
[ 379.129505][ T4274] R10: 0000000000000000 R11: 0000000000000000 R12: dffffc0000000000
[ 379.131448][ T4274] R13: ffffea0000398000 R14: ffffea0000398034 R15: ffffea0000398000
[ 379.133373][ T4274] FS: 00007f8feed44740(0000) GS:0000000000000000(0000) knlGS:0000000000000000
[ 379.135522][ T4274] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 379.137151][ T4274] CR2: 000000000000006e CR3: 0000000157d77000 CR4: 00000000000406f0
[ 379.139063][ T4274] Call Trace:
[ 379.139969][ T4274] <TASK>
[ 379.140739][ T4274] follow_huge_pmd (mm/gup.c:767)
[ 379.141902][ T4274] __get_user_pages (mm/gup.c:993)
[ 379.143221][ T4274] populate_vma_page_range (mm/gup.c:1926 (discriminator 1))
[ 379.144519][ T4274] __mm_populate (mm/gup.c:2029)
[ 379.145559][ T4274] vm_mmap_pgoff (include/linux/mm.h:? mm/util.c:584)
[ 379.146769][ T4274] ? entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130)
[ 379.148226][ T4274] do_syscall_64 (arch/x86/entry/syscall_64.c:?)
[ 379.149357][ T4274] ? lockdep_hardirqs_on_prepare (kernel/locking/lockdep.c:473 (discriminator 3))
[ 379.150866][ T4274] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130)
[ 379.152205][ T4274] RIP: 0033:0x7f8feee48719
[ 379.153354][ T4274] Code: 08 89 e8 5b 5d c3 66 2e 0f 1f 84 00 00 00 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d b7 06 0d 00 f7 d8 64 89 01 48
All code
========
0: 08 89 e8 5b 5d c3 or %cl,-0x3ca2a418(%rcx)
6: 66 2e 0f 1f 84 00 00 cs nopw 0x0(%rax,%rax,1)
d: 00 00 00
10: 90 nop
11: 48 89 f8 mov %rdi,%rax
14: 48 89 f7 mov %rsi,%rdi
17: 48 89 d6 mov %rdx,%rsi
1a: 48 89 ca mov %rcx,%rdx
1d: 4d 89 c2 mov %r8,%r10
20: 4d 89 c8 mov %r9,%r8
23: 4c 8b 4c 24 08 mov 0x8(%rsp),%r9
28: 0f 05 syscall
2a:* 48 3d 01 f0 ff ff cmp $0xfffffffffffff001,%rax <-- trapping instruction
30: 73 01 jae 0x33
32: c3 ret
33: 48 8b 0d b7 06 0d 00 mov 0xd06b7(%rip),%rcx # 0xd06f1
3a: f7 d8 neg %eax
3c: 64 89 01 mov %eax,%fs:(%rcx)
3f: 48 rex.W
Code starting with the faulting instruction
===========================================
0: 48 3d 01 f0 ff ff cmp $0xfffffffffffff001,%rax
6: 73 01 jae 0x9
8: c3 ret
9: 48 8b 0d b7 06 0d 00 mov 0xd06b7(%rip),%rcx # 0xd06c7
10: f7 d8 neg %eax
12: 64 89 01 mov %eax,%fs:(%rcx)
15: 48 rex.W
[ 379.157899][ T4274] RSP: 002b:00007ffc477ec658 EFLAGS: 00000246 ORIG_RAX: 0000000000000009
[ 379.159864][ T4274] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007f8feee48719
[ 379.161627][ T4274] RDX: 0000000000000004 RSI: 0000000000200000 RDI: 0000000000000000
[ 379.163409][ T4274] RBP: 00007f8fed769058 R08: ffffffffffffffff R09: 0000000000000000
[ 379.165255][ T4274] R10: 0000000004008862 R11: 0000000000000246 R12: 0000000000000009
[ 379.167147][ T4274] R13: 00007f8feed446c0 R14: 00007f8fed769058 R15: 00007f8fed769000
[ 379.169043][ T4274] </TASK>
[ 379.169891][ T4274] irq event stamp: 771243
[ 379.170971][ T4274] hardirqs last enabled at (771255): __console_unlock (arch/x86/include/asm/irqflags.h:42 arch/x86/include/asm/irqflags.h:119 arch/x86/include/asm/irqflags.h:159 kernel/printk/printk.c:344 kernel/printk/printk.c:2885)
[ 379.173320][ T4274] hardirqs last disabled at (771278): __console_unlock (kernel/printk/printk.c:342 (discriminator 9) kernel/printk/printk.c:2885 (discriminator 9))
[ 379.175642][ T4274] softirqs last enabled at (771272): handle_softirqs (arch/x86/include/asm/preempt.h:27 kernel/softirq.c:426 kernel/softirq.c:607)
[ 379.177979][ T4274] softirqs last disabled at (771263): __irq_exit_rcu (arch/x86/include/asm/jump_label.h:36 kernel/softirq.c:682)
[ 379.180133][ T4274] ---[ end trace 0000000000000000 ]---
The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20250620/202506201441.2f96266-lkp@intel.com
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
© 2016 - 2026 Red Hat, Inc.