There are many places in the kernel where we need to zero out larger
chunks, but the maximum segment we can zero out at a time with ZERO_PAGE
is limited to PAGE_SIZE.
This is especially annoying in block devices and filesystems where we
attach multiple ZERO_PAGEs to the bio in different bvecs. With multipage
bvec support in the block layer, it is much more efficient to send out
larger zero pages as part of a single bvec.
This concern was raised during the review of adding LBS support to
XFS[1][2].
Usually huge_zero_folio is allocated on demand, and it will be
deallocated by the shrinker if there are no users of it left.
Add a config option STATIC_PMD_ZERO_PAGE that will always allocate
the huge_zero_folio in .bss, and it will never be freed. This makes it
possible to use the huge_zero_folio without having to pass any mm struct
or call put_folio in the destructor.
As STATIC_PMD_ZERO_PAGE does not depend on THP, declare huge_zero_folio
and huge_zero_pfn outside of the THP ifdef.
It can only be enabled on x86_64, but it is an optional config. We
could expand it to more architectures in the future.
[1] https://lore.kernel.org/linux-xfs/20231027051847.GA7885@lst.de/
[2] https://lore.kernel.org/linux-xfs/ZitIK5OnR7ZNY0IG@infradead.org/
Suggested-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
---
Questions:
- Can we call __split_huge_zero_page_pmd() on the static PMD page?
arch/x86/Kconfig | 1 +
arch/x86/include/asm/pgtable.h | 8 ++++++++
arch/x86/kernel/head_64.S | 8 ++++++++
include/linux/mm.h | 16 +++++++++++++++-
mm/Kconfig | 13 +++++++++++++
mm/huge_memory.c | 24 ++++++++++++++++++++----
mm/memory.c | 19 +++++++++++++++++++
7 files changed, 84 insertions(+), 5 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 340e5468980e..c3a9d136ec0a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -153,6 +153,7 @@ config X86
select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP if X86_64
select ARCH_WANT_HUGETLB_VMEMMAP_PREINIT if X86_64
select ARCH_WANTS_THP_SWAP if X86_64
+ select ARCH_HAS_STATIC_PMD_ZERO_PAGE if X86_64
select ARCH_HAS_PARANOID_L1D_FLUSH
select ARCH_WANT_IRQS_OFF_ACTIVATE_MM
select BUILDTIME_TABLE_SORT
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 774430c3abff..7013a7d26da5 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -47,6 +47,14 @@ void ptdump_walk_user_pgd_level_checkwx(void);
#define debug_checkwx_user() do { } while (0)
#endif
+#ifdef CONFIG_STATIC_PMD_ZERO_PAGE
+/*
+ * PMD_ZERO_PAGE is a global shared PMD page that is always zero.
+ */
+extern unsigned long empty_pmd_zero_page[(PMD_SIZE) / sizeof(unsigned long)]
+ __visible;
+#endif
+
/*
* ZERO_PAGE is a global shared page that is always zero: used
* for zero-mapped memory areas etc..
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 3e9b3a3bd039..86aaa53fd619 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -714,6 +714,14 @@ EXPORT_SYMBOL(phys_base)
#include "../xen/xen-head.S"
__PAGE_ALIGNED_BSS
+
+#ifdef CONFIG_STATIC_PMD_ZERO_PAGE
+SYM_DATA_START_PAGE_ALIGNED(empty_pmd_zero_page)
+ .skip PMD_SIZE
+SYM_DATA_END(empty_pmd_zero_page)
+EXPORT_SYMBOL(empty_pmd_zero_page)
+#endif
+
SYM_DATA_START_PAGE_ALIGNED(empty_zero_page)
.skip PAGE_SIZE
SYM_DATA_END(empty_zero_page)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c8fbeaacf896..b20d60d68b3c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4018,10 +4018,10 @@ static inline bool vma_is_special_huge(const struct vm_area_struct *vma)
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern struct folio *huge_zero_folio;
extern unsigned long huge_zero_pfn;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline bool is_huge_zero_folio(const struct folio *folio)
{
return READ_ONCE(huge_zero_folio) == folio;
@@ -4032,9 +4032,23 @@ static inline bool is_huge_zero_pmd(pmd_t pmd)
return pmd_present(pmd) && READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd);
}
+#ifdef CONFIG_STATIC_PMD_ZERO_PAGE
+static inline struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
+{
+ return READ_ONCE(huge_zero_folio);
+}
+
+static inline void mm_put_huge_zero_folio(struct mm_struct *mm)
+{
+ return;
+}
+
+#else
struct folio *mm_get_huge_zero_folio(struct mm_struct *mm);
void mm_put_huge_zero_folio(struct mm_struct *mm);
+#endif /* CONFIG_STATIC_PMD_ZERO_PAGE */
+
#else
static inline bool is_huge_zero_folio(const struct folio *folio)
{
diff --git a/mm/Kconfig b/mm/Kconfig
index 781be3240e21..fd1c51995029 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -826,6 +826,19 @@ config ARCH_WANTS_THP_SWAP
config MM_ID
def_bool n
+config ARCH_HAS_STATIC_PMD_ZERO_PAGE
+ def_bool n
+
+config STATIC_PMD_ZERO_PAGE
+ bool "Allocate a PMD page for zeroing"
+ depends on ARCH_HAS_STATIC_PMD_ZERO_PAGE
+ help
+ Typically huge_zero_folio, which is a PMD page of zeroes, is allocated
+ on demand and deallocated when not in use. This option will
+ allocate a PMD sized zero page in .bss and huge_zero_folio will
+ use it instead of allocating dynamically.
+ Not suitable for memory constrained systems.
+
menuconfig TRANSPARENT_HUGEPAGE
bool "Transparent Hugepage Support"
depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 101b67ab2eb6..c12ca7134e88 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -75,9 +75,6 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
struct shrink_control *sc);
static bool split_underused_thp = true;
-static atomic_t huge_zero_refcount;
-struct folio *huge_zero_folio __read_mostly;
-unsigned long huge_zero_pfn __read_mostly = ~0UL;
unsigned long huge_anon_orders_always __read_mostly;
unsigned long huge_anon_orders_madvise __read_mostly;
unsigned long huge_anon_orders_inherit __read_mostly;
@@ -208,6 +205,23 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
return orders;
}
+#ifdef CONFIG_STATIC_PMD_ZERO_PAGE
+static int huge_zero_page_shrinker_init(void)
+{
+ return 0;
+}
+
+static void huge_zero_page_shrinker_exit(void)
+{
+ return;
+}
+#else
+
+static struct shrinker *huge_zero_page_shrinker;
+static atomic_t huge_zero_refcount;
+struct folio *huge_zero_folio __read_mostly;
+unsigned long huge_zero_pfn __read_mostly = ~0UL;
+
static bool get_huge_zero_page(void)
{
struct folio *zero_folio;
@@ -288,7 +302,6 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
return 0;
}
-static struct shrinker *huge_zero_page_shrinker;
static int huge_zero_page_shrinker_init(void)
{
huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
@@ -307,6 +320,7 @@ static void huge_zero_page_shrinker_exit(void)
return;
}
+#endif
#ifdef CONFIG_SYSFS
static ssize_t enabled_show(struct kobject *kobj,
@@ -2843,6 +2857,8 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
pte_t *pte;
int i;
+ // FIXME: can this be called with static zero page?
+ VM_BUG_ON(IS_ENABLED(CONFIG_STATIC_PMD_ZERO_PAGE));
/*
* Leave pmd empty until pte is filled note that it is fine to delay
* notification until mmu_notifier_invalidate_range_end() as we are
diff --git a/mm/memory.c b/mm/memory.c
index 8eba595056fe..77721f5ae043 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -159,6 +159,25 @@ static int __init init_zero_pfn(void)
}
early_initcall(init_zero_pfn);
+#ifdef CONFIG_STATIC_PMD_ZERO_PAGE
+struct folio *huge_zero_folio __read_mostly;
+unsigned long huge_zero_pfn __read_mostly = ~0UL;
+
+static int __init init_pmd_zero_pfn(void)
+{
+ huge_zero_folio = virt_to_folio(empty_pmd_zero_page);
+ huge_zero_pfn = page_to_pfn(virt_to_page(empty_pmd_zero_page));
+
+ __folio_set_head(huge_zero_folio);
+ prep_compound_head((struct page *)huge_zero_folio, PMD_ORDER);
+ /* Ensure zero folio won't have large_rmappable flag set. */
+ folio_clear_large_rmappable(huge_zero_folio);
+
+ return 0;
+}
+early_initcall(init_pmd_zero_pfn);
+#endif
+
void mm_trace_rss_stat(struct mm_struct *mm, int member)
{
trace_rss_stat(mm, member);
--
2.49.0
Hello, kernel test robot noticed "WARNING:at_mm/gup.c:#try_grab_folio" on: commit: 8e628a9d6cc5c377ae06b7821f8280cd6ff2a20f ("[PATCH 3/5] mm: add static PMD zero page") url: https://github.com/intel-lab-lkp/linux/commits/Pankaj-Raghav/mm-move-huge_zero_page-declaration-from-huge_mm-h-to-mm-h/20250612-185248 patch link: https://lore.kernel.org/all/20250612105100.59144-4-p.raghav@samsung.com/ patch subject: [PATCH 3/5] mm: add static PMD zero page in testcase: trinity version: trinity-x86_64-ba2360ed-1_20241228 with following parameters: runtime: 300s group: group-03 nr_groups: 5 config: x86_64-randconfig-077-20250618 compiler: clang-20 test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 16G (please refer to attached dmesg/kmsg for entire log/backtrace) If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot <oliver.sang@intel.com> | Closes: https://lore.kernel.org/oe-lkp/202506201441.2f96266-lkp@intel.com [ 379.105772][ T4274] ------------[ cut here ]------------ [ 379.107617][ T4274] WARNING: CPU: 0 PID: 4274 at mm/gup.c:148 try_grab_folio (mm/gup.c:148 (discriminator 12)) [ 379.109660][ T4274] Modules linked in: [ 379.111018][ T4274] CPU: 0 UID: 65534 PID: 4274 Comm: trinity-c3 Not tainted 6.16.0-rc1-00003-g8e628a9d6cc5 #1 PREEMPT(voluntary) [ 379.113741][ T4274] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014 [ 379.116285][ T4274] RIP: 0010:try_grab_folio (mm/gup.c:148 (discriminator 12)) [ 379.117678][ T4274] Code: 00 48 01 1d 6f 95 3f 0b 48 c7 c7 38 95 55 8f be 08 00 00 00 e8 76 98 0f 00 48 01 1d 47 08 ac 0d e9 e4 fe ff ff e8 c5 2c cd ff <0f> 0b b8 f4 ff ff ff e9 d5 fe ff ff 44 89 f1 80 e1 07 80 c1 03 38 All code ======== 0: 00 48 01 add %cl,0x1(%rax) 3: 1d 6f 95 3f 0b sbb $0xb3f956f,%eax 8: 48 c7 c7 38 95 55 8f mov $0xffffffff8f559538,%rdi f: be 08 00 00 00 mov $0x8,%esi 14: e8 76 98 0f 00 
call 0xf988f 19: 48 01 1d 47 08 ac 0d add %rbx,0xdac0847(%rip) # 0xdac0867 20: e9 e4 fe ff ff jmp 0xffffffffffffff09 25: e8 c5 2c cd ff call 0xffffffffffcd2cef 2a:* 0f 0b ud2 <-- trapping instruction 2c: b8 f4 ff ff ff mov $0xfffffff4,%eax 31: e9 d5 fe ff ff jmp 0xffffffffffffff0b 36: 44 89 f1 mov %r14d,%ecx 39: 80 e1 07 and $0x7,%cl 3c: 80 c1 03 add $0x3,%cl 3f: 38 .byte 0x38 Code starting with the faulting instruction =========================================== 0: 0f 0b ud2 2: b8 f4 ff ff ff mov $0xfffffff4,%eax 7: e9 d5 fe ff ff jmp 0xfffffffffffffee1 c: 44 89 f1 mov %r14d,%ecx f: 80 e1 07 and $0x7,%cl 12: 80 c1 03 add $0x3,%cl 15: 38 .byte 0x38 [ 379.122288][ T4274] RSP: 0018:ffffc90003eafc00 EFLAGS: 00010246 [ 379.123803][ T4274] RAX: 0000000000000000 RBX: 0000000000000001 RCX: 0000000000000000 [ 379.125678][ T4274] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 [ 379.127640][ T4274] RBP: 0000000000210008 R08: 0000000000000000 R09: 0000000000000000 [ 379.129505][ T4274] R10: 0000000000000000 R11: 0000000000000000 R12: dffffc0000000000 [ 379.131448][ T4274] R13: ffffea0000398000 R14: ffffea0000398034 R15: ffffea0000398000 [ 379.133373][ T4274] FS: 00007f8feed44740(0000) GS:0000000000000000(0000) knlGS:0000000000000000 [ 379.135522][ T4274] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 379.137151][ T4274] CR2: 000000000000006e CR3: 0000000157d77000 CR4: 00000000000406f0 [ 379.139063][ T4274] Call Trace: [ 379.139969][ T4274] <TASK> [ 379.140739][ T4274] follow_huge_pmd (mm/gup.c:767) [ 379.141902][ T4274] __get_user_pages (mm/gup.c:993) [ 379.143221][ T4274] populate_vma_page_range (mm/gup.c:1926 (discriminator 1)) [ 379.144519][ T4274] __mm_populate (mm/gup.c:2029) [ 379.145559][ T4274] vm_mmap_pgoff (include/linux/mm.h:? mm/util.c:584) [ 379.146769][ T4274] ? entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) [ 379.148226][ T4274] do_syscall_64 (arch/x86/entry/syscall_64.c:?) [ 379.149357][ T4274] ? 
lockdep_hardirqs_on_prepare (kernel/locking/lockdep.c:473 (discriminator 3)) [ 379.150866][ T4274] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) [ 379.152205][ T4274] RIP: 0033:0x7f8feee48719 [ 379.153354][ T4274] Code: 08 89 e8 5b 5d c3 66 2e 0f 1f 84 00 00 00 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d b7 06 0d 00 f7 d8 64 89 01 48 All code ======== 0: 08 89 e8 5b 5d c3 or %cl,-0x3ca2a418(%rcx) 6: 66 2e 0f 1f 84 00 00 cs nopw 0x0(%rax,%rax,1) d: 00 00 00 10: 90 nop 11: 48 89 f8 mov %rdi,%rax 14: 48 89 f7 mov %rsi,%rdi 17: 48 89 d6 mov %rdx,%rsi 1a: 48 89 ca mov %rcx,%rdx 1d: 4d 89 c2 mov %r8,%r10 20: 4d 89 c8 mov %r9,%r8 23: 4c 8b 4c 24 08 mov 0x8(%rsp),%r9 28: 0f 05 syscall 2a:* 48 3d 01 f0 ff ff cmp $0xfffffffffffff001,%rax <-- trapping instruction 30: 73 01 jae 0x33 32: c3 ret 33: 48 8b 0d b7 06 0d 00 mov 0xd06b7(%rip),%rcx # 0xd06f1 3a: f7 d8 neg %eax 3c: 64 89 01 mov %eax,%fs:(%rcx) 3f: 48 rex.W Code starting with the faulting instruction =========================================== 0: 48 3d 01 f0 ff ff cmp $0xfffffffffffff001,%rax 6: 73 01 jae 0x9 8: c3 ret 9: 48 8b 0d b7 06 0d 00 mov 0xd06b7(%rip),%rcx # 0xd06c7 10: f7 d8 neg %eax 12: 64 89 01 mov %eax,%fs:(%rcx) 15: 48 rex.W [ 379.157899][ T4274] RSP: 002b:00007ffc477ec658 EFLAGS: 00000246 ORIG_RAX: 0000000000000009 [ 379.159864][ T4274] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007f8feee48719 [ 379.161627][ T4274] RDX: 0000000000000004 RSI: 0000000000200000 RDI: 0000000000000000 [ 379.163409][ T4274] RBP: 00007f8fed769058 R08: ffffffffffffffff R09: 0000000000000000 [ 379.165255][ T4274] R10: 0000000004008862 R11: 0000000000000246 R12: 0000000000000009 [ 379.167147][ T4274] R13: 00007f8feed446c0 R14: 00007f8fed769058 R15: 00007f8fed769000 [ 379.169043][ T4274] </TASK> [ 379.169891][ T4274] irq event stamp: 771243 [ 379.170971][ T4274] hardirqs last enabled at (771255): __console_unlock 
(arch/x86/include/asm/irqflags.h:42 arch/x86/include/asm/irqflags.h:119 arch/x86/include/asm/irqflags.h:159 kernel/printk/printk.c:344 kernel/printk/printk.c:2885) [ 379.173320][ T4274] hardirqs last disabled at (771278): __console_unlock (kernel/printk/printk.c:342 (discriminator 9) kernel/printk/printk.c:2885 (discriminator 9)) [ 379.175642][ T4274] softirqs last enabled at (771272): handle_softirqs (arch/x86/include/asm/preempt.h:27 kernel/softirq.c:426 kernel/softirq.c:607) [ 379.177979][ T4274] softirqs last disabled at (771263): __irq_exit_rcu (arch/x86/include/asm/jump_label.h:36 kernel/softirq.c:682) [ 379.180133][ T4274] ---[ end trace 0000000000000000 ]--- The kernel config and materials to reproduce are available at: https://download.01.org/0day-ci/archive/20250620/202506201441.2f96266-lkp@intel.com -- 0-DAY CI Kernel Test Service https://github.com/intel/lkp-tests/wiki
© 2016 - 2025 Red Hat, Inc.