[PATCH 3/5] mm: add static PMD zero page

Pankaj Raghav posted 5 patches 4 months ago
[PATCH 3/5] mm: add static PMD zero page
Posted by Pankaj Raghav 4 months ago
There are many places in the kernel where we need to zero out larger
chunks, but the maximum segment we can zero out at a time via ZERO_PAGE
is limited to PAGE_SIZE.

This is especially annoying in block devices and filesystems where we
attach multiple ZERO_PAGEs to the bio in different bvecs. With multipage
bvec support in block layer, it is much more efficient to send out
larger zero pages as a part of single bvec.

This concern was raised during the review of adding LBS support to
XFS[1][2].

Usually huge_zero_folio is allocated on demand, and it will be
deallocated by the shrinker if there are no users of it left.

Add a config option STATIC_PMD_ZERO_PAGE that will always allocate
the huge_zero_folio in .bss, and it will never be freed. This makes it
possible to use the huge_zero_folio without having to pass any mm struct
and without having to call put_folio in the destructor.

As STATIC_PMD_ZERO_PAGE does not depend on THP, declare huge_zero_folio
and huge_zero_pfn outside of the THP ifdef.

It can only be enabled on x86_64 for now, and it is an optional config.
We could expand it to more architectures in the future.

[1] https://lore.kernel.org/linux-xfs/20231027051847.GA7885@lst.de/
[2] https://lore.kernel.org/linux-xfs/ZitIK5OnR7ZNY0IG@infradead.org/

Suggested-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
---
Questions:
- Can we call __split_huge_zero_page_pmd() on static PMD page?

 arch/x86/Kconfig               |  1 +
 arch/x86/include/asm/pgtable.h |  8 ++++++++
 arch/x86/kernel/head_64.S      |  8 ++++++++
 include/linux/mm.h             | 16 +++++++++++++++-
 mm/Kconfig                     | 13 +++++++++++++
 mm/huge_memory.c               | 24 ++++++++++++++++++++----
 mm/memory.c                    | 19 +++++++++++++++++++
 7 files changed, 84 insertions(+), 5 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 340e5468980e..c3a9d136ec0a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -153,6 +153,7 @@ config X86
 	select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP	if X86_64
 	select ARCH_WANT_HUGETLB_VMEMMAP_PREINIT if X86_64
 	select ARCH_WANTS_THP_SWAP		if X86_64
+	select ARCH_HAS_STATIC_PMD_ZERO_PAGE	if X86_64
 	select ARCH_HAS_PARANOID_L1D_FLUSH
 	select ARCH_WANT_IRQS_OFF_ACTIVATE_MM
 	select BUILDTIME_TABLE_SORT
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 774430c3abff..7013a7d26da5 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -47,6 +47,14 @@ void ptdump_walk_user_pgd_level_checkwx(void);
 #define debug_checkwx_user()	do { } while (0)
 #endif
 
+#ifdef CONFIG_STATIC_PMD_ZERO_PAGE
+/*
+ * PMD_ZERO_PAGE is a global shared PMD page that is always zero.
+ */
+extern unsigned long empty_pmd_zero_page[(PMD_SIZE) / sizeof(unsigned long)]
+	__visible;
+#endif
+
 /*
  * ZERO_PAGE is a global shared page that is always zero: used
  * for zero-mapped memory areas etc..
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 3e9b3a3bd039..86aaa53fd619 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -714,6 +714,14 @@ EXPORT_SYMBOL(phys_base)
 #include "../xen/xen-head.S"
 
 	__PAGE_ALIGNED_BSS
+
+#ifdef CONFIG_STATIC_PMD_ZERO_PAGE
+SYM_DATA_START_PAGE_ALIGNED(empty_pmd_zero_page)
+	.skip PMD_SIZE
+SYM_DATA_END(empty_pmd_zero_page)
+EXPORT_SYMBOL(empty_pmd_zero_page)
+#endif
+
 SYM_DATA_START_PAGE_ALIGNED(empty_zero_page)
 	.skip PAGE_SIZE
 SYM_DATA_END(empty_zero_page)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c8fbeaacf896..b20d60d68b3c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4018,10 +4018,10 @@ static inline bool vma_is_special_huge(const struct vm_area_struct *vma)
 
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 extern struct folio *huge_zero_folio;
 extern unsigned long huge_zero_pfn;
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 static inline bool is_huge_zero_folio(const struct folio *folio)
 {
 	return READ_ONCE(huge_zero_folio) == folio;
@@ -4032,9 +4032,23 @@ static inline bool is_huge_zero_pmd(pmd_t pmd)
 	return pmd_present(pmd) && READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd);
 }
 
+#ifdef CONFIG_STATIC_PMD_ZERO_PAGE
+static inline struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
+{
+	return READ_ONCE(huge_zero_folio);
+}
+
+static inline void mm_put_huge_zero_folio(struct mm_struct *mm)
+{
+	return;
+}
+
+#else
 struct folio *mm_get_huge_zero_folio(struct mm_struct *mm);
 void mm_put_huge_zero_folio(struct mm_struct *mm);
 
+#endif /* CONFIG_STATIC_PMD_ZERO_PAGE */
+
 #else
 static inline bool is_huge_zero_folio(const struct folio *folio)
 {
diff --git a/mm/Kconfig b/mm/Kconfig
index 781be3240e21..fd1c51995029 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -826,6 +826,19 @@ config ARCH_WANTS_THP_SWAP
 config MM_ID
 	def_bool n
 
+config ARCH_HAS_STATIC_PMD_ZERO_PAGE
+	def_bool n
+
+config STATIC_PMD_ZERO_PAGE
+	bool "Allocate a PMD page for zeroing"
+	depends on ARCH_HAS_STATIC_PMD_ZERO_PAGE
+	help
+	  Typically huge_zero_folio, which is a PMD page of zeroes, is allocated
+	  on demand and deallocated when not in use. This option will
+	  allocate a PMD sized zero page in .bss and huge_zero_folio will
+	  use it instead of allocating it dynamically.
+	  Not suitable for memory constrained systems.
+
 menuconfig TRANSPARENT_HUGEPAGE
 	bool "Transparent Hugepage Support"
 	depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 101b67ab2eb6..c12ca7134e88 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -75,9 +75,6 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
 					 struct shrink_control *sc);
 static bool split_underused_thp = true;
 
-static atomic_t huge_zero_refcount;
-struct folio *huge_zero_folio __read_mostly;
-unsigned long huge_zero_pfn __read_mostly = ~0UL;
 unsigned long huge_anon_orders_always __read_mostly;
 unsigned long huge_anon_orders_madvise __read_mostly;
 unsigned long huge_anon_orders_inherit __read_mostly;
@@ -208,6 +205,23 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
 	return orders;
 }
 
+#ifdef CONFIG_STATIC_PMD_ZERO_PAGE
+static int huge_zero_page_shrinker_init(void)
+{
+	return 0;
+}
+
+static void huge_zero_page_shrinker_exit(void)
+{
+	return;
+}
+#else
+
+static struct shrinker *huge_zero_page_shrinker;
+static atomic_t huge_zero_refcount;
+struct folio *huge_zero_folio __read_mostly;
+unsigned long huge_zero_pfn __read_mostly = ~0UL;
+
 static bool get_huge_zero_page(void)
 {
 	struct folio *zero_folio;
@@ -288,7 +302,6 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
 	return 0;
 }
 
-static struct shrinker *huge_zero_page_shrinker;
 static int huge_zero_page_shrinker_init(void)
 {
 	huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
@@ -307,6 +320,7 @@ static void huge_zero_page_shrinker_exit(void)
 	return;
 }
 
+#endif
 
 #ifdef CONFIG_SYSFS
 static ssize_t enabled_show(struct kobject *kobj,
@@ -2843,6 +2857,8 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
 	pte_t *pte;
 	int i;
 
+	// FIXME: can this be called with static zero page?
+	VM_BUG_ON(IS_ENABLED(CONFIG_STATIC_PMD_ZERO_PAGE));
 	/*
 	 * Leave pmd empty until pte is filled note that it is fine to delay
 	 * notification until mmu_notifier_invalidate_range_end() as we are
diff --git a/mm/memory.c b/mm/memory.c
index 8eba595056fe..77721f5ae043 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -159,6 +159,25 @@ static int __init init_zero_pfn(void)
 }
 early_initcall(init_zero_pfn);
 
+#ifdef CONFIG_STATIC_PMD_ZERO_PAGE
+struct folio *huge_zero_folio __read_mostly;
+unsigned long huge_zero_pfn __read_mostly = ~0UL;
+
+static int __init init_pmd_zero_pfn(void)
+{
+	huge_zero_folio = virt_to_folio(empty_pmd_zero_page);
+	huge_zero_pfn = page_to_pfn(virt_to_page(empty_pmd_zero_page));
+
+	__folio_set_head(huge_zero_folio);
+	prep_compound_head((struct page *)huge_zero_folio, PMD_ORDER);
+	/* Ensure zero folio won't have large_rmappable flag set. */
+	folio_clear_large_rmappable(huge_zero_folio);
+
+	return 0;
+}
+early_initcall(init_pmd_zero_pfn);
+#endif
+
 void mm_trace_rss_stat(struct mm_struct *mm, int member)
 {
 	trace_rss_stat(mm, member);
-- 
2.49.0
Re: [PATCH 3/5] mm: add static PMD zero page
Posted by kernel test robot 3 months, 2 weeks ago

Hello,

kernel test robot noticed "WARNING:at_mm/gup.c:#try_grab_folio" on:

commit: 8e628a9d6cc5c377ae06b7821f8280cd6ff2a20f ("[PATCH 3/5] mm: add static PMD zero page")
url: https://github.com/intel-lab-lkp/linux/commits/Pankaj-Raghav/mm-move-huge_zero_page-declaration-from-huge_mm-h-to-mm-h/20250612-185248
patch link: https://lore.kernel.org/all/20250612105100.59144-4-p.raghav@samsung.com/
patch subject: [PATCH 3/5] mm: add static PMD zero page

in testcase: trinity
version: trinity-x86_64-ba2360ed-1_20241228
with following parameters:

	runtime: 300s
	group: group-03
	nr_groups: 5



config: x86_64-randconfig-077-20250618
compiler: clang-20
test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 16G

(please refer to attached dmesg/kmsg for entire log/backtrace)



If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <oliver.sang@intel.com>
| Closes: https://lore.kernel.org/oe-lkp/202506201441.2f96266-lkp@intel.com


[  379.105772][ T4274] ------------[ cut here ]------------
[ 379.107617][ T4274] WARNING: CPU: 0 PID: 4274 at mm/gup.c:148 try_grab_folio (mm/gup.c:148 (discriminator 12)) 
[  379.109660][ T4274] Modules linked in:
[  379.111018][ T4274] CPU: 0 UID: 65534 PID: 4274 Comm: trinity-c3 Not tainted 6.16.0-rc1-00003-g8e628a9d6cc5 #1 PREEMPT(voluntary)
[  379.113741][ T4274] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
[ 379.116285][ T4274] RIP: 0010:try_grab_folio (mm/gup.c:148 (discriminator 12)) 
[ 379.117678][ T4274] Code: 00 48 01 1d 6f 95 3f 0b 48 c7 c7 38 95 55 8f be 08 00 00 00 e8 76 98 0f 00 48 01 1d 47 08 ac 0d e9 e4 fe ff ff e8 c5 2c cd ff <0f> 0b b8 f4 ff ff ff e9 d5 fe ff ff 44 89 f1 80 e1 07 80 c1 03 38
All code
========
   0:	00 48 01             	add    %cl,0x1(%rax)
   3:	1d 6f 95 3f 0b       	sbb    $0xb3f956f,%eax
   8:	48 c7 c7 38 95 55 8f 	mov    $0xffffffff8f559538,%rdi
   f:	be 08 00 00 00       	mov    $0x8,%esi
  14:	e8 76 98 0f 00       	call   0xf988f
  19:	48 01 1d 47 08 ac 0d 	add    %rbx,0xdac0847(%rip)        # 0xdac0867
  20:	e9 e4 fe ff ff       	jmp    0xffffffffffffff09
  25:	e8 c5 2c cd ff       	call   0xffffffffffcd2cef
  2a:*	0f 0b                	ud2		<-- trapping instruction
  2c:	b8 f4 ff ff ff       	mov    $0xfffffff4,%eax
  31:	e9 d5 fe ff ff       	jmp    0xffffffffffffff0b
  36:	44 89 f1             	mov    %r14d,%ecx
  39:	80 e1 07             	and    $0x7,%cl
  3c:	80 c1 03             	add    $0x3,%cl
  3f:	38                   	.byte 0x38

Code starting with the faulting instruction
===========================================
   0:	0f 0b                	ud2
   2:	b8 f4 ff ff ff       	mov    $0xfffffff4,%eax
   7:	e9 d5 fe ff ff       	jmp    0xfffffffffffffee1
   c:	44 89 f1             	mov    %r14d,%ecx
   f:	80 e1 07             	and    $0x7,%cl
  12:	80 c1 03             	add    $0x3,%cl
  15:	38                   	.byte 0x38
[  379.122288][ T4274] RSP: 0018:ffffc90003eafc00 EFLAGS: 00010246
[  379.123803][ T4274] RAX: 0000000000000000 RBX: 0000000000000001 RCX: 0000000000000000
[  379.125678][ T4274] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
[  379.127640][ T4274] RBP: 0000000000210008 R08: 0000000000000000 R09: 0000000000000000
[  379.129505][ T4274] R10: 0000000000000000 R11: 0000000000000000 R12: dffffc0000000000
[  379.131448][ T4274] R13: ffffea0000398000 R14: ffffea0000398034 R15: ffffea0000398000
[  379.133373][ T4274] FS:  00007f8feed44740(0000) GS:0000000000000000(0000) knlGS:0000000000000000
[  379.135522][ T4274] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  379.137151][ T4274] CR2: 000000000000006e CR3: 0000000157d77000 CR4: 00000000000406f0
[  379.139063][ T4274] Call Trace:
[  379.139969][ T4274]  <TASK>
[ 379.140739][ T4274] follow_huge_pmd (mm/gup.c:767) 
[ 379.141902][ T4274] __get_user_pages (mm/gup.c:993) 
[ 379.143221][ T4274] populate_vma_page_range (mm/gup.c:1926 (discriminator 1)) 
[ 379.144519][ T4274] __mm_populate (mm/gup.c:2029) 
[ 379.145559][ T4274] vm_mmap_pgoff (include/linux/mm.h:? mm/util.c:584) 
[ 379.146769][ T4274] ? entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) 
[ 379.148226][ T4274] do_syscall_64 (arch/x86/entry/syscall_64.c:?) 
[ 379.149357][ T4274] ? lockdep_hardirqs_on_prepare (kernel/locking/lockdep.c:473 (discriminator 3)) 
[ 379.150866][ T4274] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) 
[  379.152205][ T4274] RIP: 0033:0x7f8feee48719
[ 379.153354][ T4274] Code: 08 89 e8 5b 5d c3 66 2e 0f 1f 84 00 00 00 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d b7 06 0d 00 f7 d8 64 89 01 48
All code
========
   0:	08 89 e8 5b 5d c3    	or     %cl,-0x3ca2a418(%rcx)
   6:	66 2e 0f 1f 84 00 00 	cs nopw 0x0(%rax,%rax,1)
   d:	00 00 00 
  10:	90                   	nop
  11:	48 89 f8             	mov    %rdi,%rax
  14:	48 89 f7             	mov    %rsi,%rdi
  17:	48 89 d6             	mov    %rdx,%rsi
  1a:	48 89 ca             	mov    %rcx,%rdx
  1d:	4d 89 c2             	mov    %r8,%r10
  20:	4d 89 c8             	mov    %r9,%r8
  23:	4c 8b 4c 24 08       	mov    0x8(%rsp),%r9
  28:	0f 05                	syscall
  2a:*	48 3d 01 f0 ff ff    	cmp    $0xfffffffffffff001,%rax		<-- trapping instruction
  30:	73 01                	jae    0x33
  32:	c3                   	ret
  33:	48 8b 0d b7 06 0d 00 	mov    0xd06b7(%rip),%rcx        # 0xd06f1
  3a:	f7 d8                	neg    %eax
  3c:	64 89 01             	mov    %eax,%fs:(%rcx)
  3f:	48                   	rex.W

Code starting with the faulting instruction
===========================================
   0:	48 3d 01 f0 ff ff    	cmp    $0xfffffffffffff001,%rax
   6:	73 01                	jae    0x9
   8:	c3                   	ret
   9:	48 8b 0d b7 06 0d 00 	mov    0xd06b7(%rip),%rcx        # 0xd06c7
  10:	f7 d8                	neg    %eax
  12:	64 89 01             	mov    %eax,%fs:(%rcx)
  15:	48                   	rex.W
[  379.157899][ T4274] RSP: 002b:00007ffc477ec658 EFLAGS: 00000246 ORIG_RAX: 0000000000000009
[  379.159864][ T4274] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007f8feee48719
[  379.161627][ T4274] RDX: 0000000000000004 RSI: 0000000000200000 RDI: 0000000000000000
[  379.163409][ T4274] RBP: 00007f8fed769058 R08: ffffffffffffffff R09: 0000000000000000
[  379.165255][ T4274] R10: 0000000004008862 R11: 0000000000000246 R12: 0000000000000009
[  379.167147][ T4274] R13: 00007f8feed446c0 R14: 00007f8fed769058 R15: 00007f8fed769000
[  379.169043][ T4274]  </TASK>
[  379.169891][ T4274] irq event stamp: 771243
[ 379.170971][ T4274] hardirqs last enabled at (771255): __console_unlock (arch/x86/include/asm/irqflags.h:42 arch/x86/include/asm/irqflags.h:119 arch/x86/include/asm/irqflags.h:159 kernel/printk/printk.c:344 kernel/printk/printk.c:2885) 
[ 379.173320][ T4274] hardirqs last disabled at (771278): __console_unlock (kernel/printk/printk.c:342 (discriminator 9) kernel/printk/printk.c:2885 (discriminator 9)) 
[ 379.175642][ T4274] softirqs last enabled at (771272): handle_softirqs (arch/x86/include/asm/preempt.h:27 kernel/softirq.c:426 kernel/softirq.c:607) 
[ 379.177979][ T4274] softirqs last disabled at (771263): __irq_exit_rcu (arch/x86/include/asm/jump_label.h:36 kernel/softirq.c:682) 
[  379.180133][ T4274] ---[ end trace 0000000000000000 ]---


The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20250620/202506201441.2f96266-lkp@intel.com



-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki