mm, arch: A more robust approach to sync top level kernel page tables

[RFC V1 PATCH mm-hotfixes 1/3] mm: introduce and use {pgd,p4d}_populate_kernel()

Posted by Harry Yoo 7 months ago

Introduce and use {pgd,p4d}_populate_kernel() in core MM code when
populating PGD and P4D entries corresponding to the kernel address
space. The main purpose of these helpers is to ensure synchronization of
the kernel portion of the top-level page tables whenever such an entry
is populated.

Until now, the kernel has relied on each architecture to handle
synchronization of top-level page tables in an ad-hoc manner.
For example, see commit 9b861528a801 ("x86-64, mem: Update all PGDs for
direct mapping and vmemmap mapping changes").

However, this approach has proven fragile, as it's easy to forget to
perform the necessary synchronization when introducing new changes.

To address this, introduce _kernel() varients of the page table
population helpers that invoke architecture-specific hooks to properly
synchronize the page tables.

For now, it only targets x86_64, so only PGD and P4D level helpers are
introduced. In theory, PUD and PMD level helpers can be added later if
needed by other architectures.

Currently this is a no-op, as no architecture defines
__HAVE_ARCH_SYNC_KERNEL_PGTABLE.

Cc: <stable@vger.kernel.org>
Suggested-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Harry Yoo <harry.yoo@oracle.com>
---
 include/asm-generic/pgalloc.h |  4 ++++
 include/linux/pgalloc.h       |  0
 mm/kasan/init.c               | 10 +++++-----
 mm/percpu.c                   |  4 ++--
 mm/sparse-vmemmap.c           |  4 ++--
 5 files changed, 13 insertions(+), 9 deletions(-)
 create mode 100644 include/linux/pgalloc.h

diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h
index 3c8ec3bfea44..6cac1ce64e01 100644
--- a/include/asm-generic/pgalloc.h
+++ b/include/asm-generic/pgalloc.h
@@ -295,6 +295,10 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 	__pgd_free(mm, pgd);
 }
 #endif
+#ifndef __HAVE_ARCH_SYNC_KERNEL_PGTABLE
+#define pgd_populate_kernel(addr, pgd, p4d) pgd_populate(&init_mm, pgd, p4d)
+#define p4d_populate_kernel(addr, p4d, pud) p4d_populate(&init_mm, p4d, pud)
+#endif
 
 #endif /* CONFIG_MMU */
 
diff --git a/include/linux/pgalloc.h b/include/linux/pgalloc.h
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/mm/kasan/init.c b/mm/kasan/init.c
index ced6b29fcf76..43de820ee282 100644
--- a/mm/kasan/init.c
+++ b/mm/kasan/init.c
@@ -191,7 +191,7 @@ static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr,
 			pud_t *pud;
 			pmd_t *pmd;
 
-			p4d_populate(&init_mm, p4d,
+			p4d_populate_kernel(addr, p4d,
 					lm_alias(kasan_early_shadow_pud));
 			pud = pud_offset(p4d, addr);
 			pud_populate(&init_mm, pud,
@@ -212,7 +212,7 @@ static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr,
 			} else {
 				p = early_alloc(PAGE_SIZE, NUMA_NO_NODE);
 				pud_init(p);
-				p4d_populate(&init_mm, p4d, p);
+				p4d_populate_kernel(addr, p4d, p);
 			}
 		}
 		zero_pud_populate(p4d, addr, next);
@@ -251,10 +251,10 @@ int __ref kasan_populate_early_shadow(const void *shadow_start,
 			 * puds,pmds, so pgd_populate(), pud_populate()
 			 * is noops.
 			 */
-			pgd_populate(&init_mm, pgd,
+			pgd_populate_kernel(addr, pgd,
 					lm_alias(kasan_early_shadow_p4d));
 			p4d = p4d_offset(pgd, addr);
-			p4d_populate(&init_mm, p4d,
+			p4d_populate_kernel(addr, p4d,
 					lm_alias(kasan_early_shadow_pud));
 			pud = pud_offset(p4d, addr);
 			pud_populate(&init_mm, pud,
@@ -273,7 +273,7 @@ int __ref kasan_populate_early_shadow(const void *shadow_start,
 				if (!p)
 					return -ENOMEM;
 			} else {
-				pgd_populate(&init_mm, pgd,
+				pgd_populate_kernel(addr, pgd,
 					early_alloc(PAGE_SIZE, NUMA_NO_NODE));
 			}
 		}
diff --git a/mm/percpu.c b/mm/percpu.c
index 782cc148b39c..57450a03c432 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -3134,13 +3134,13 @@ void __init __weak pcpu_populate_pte(unsigned long addr)
 
 	if (pgd_none(*pgd)) {
 		p4d = memblock_alloc_or_panic(P4D_TABLE_SIZE, P4D_TABLE_SIZE);
-		pgd_populate(&init_mm, pgd, p4d);
+		pgd_populate_kernel(addr, pgd, p4d);
 	}
 
 	p4d = p4d_offset(pgd, addr);
 	if (p4d_none(*p4d)) {
 		pud = memblock_alloc_or_panic(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
-		p4d_populate(&init_mm, p4d, pud);
+		p4d_populate_kernel(addr, p4d, pud);
 	}
 
 	pud = pud_offset(p4d, addr);
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index fd2ab5118e13..e275310ac708 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -229,7 +229,7 @@ p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
 		if (!p)
 			return NULL;
 		pud_init(p);
-		p4d_populate(&init_mm, p4d, p);
+		p4d_populate_kernel(addr, p4d, p);
 	}
 	return p4d;
 }
@@ -241,7 +241,7 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
 		void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
 		if (!p)
 			return NULL;
-		pgd_populate(&init_mm, pgd, p);
+		pgd_populate_kernel(addr, pgd, p);
 	}
 	return pgd;
 }
-- 
2.43.0

Re: [RFC V1 PATCH mm-hotfixes 1/3] mm: introduce and use {pgd,p4d}_populate_kernel()

Posted by David Hildenbrand 7 months ago

On 09.07.25 15:16, Harry Yoo wrote:
> Introduce and use {pgd,p4d}_populate_kernel() in core MM code when
> populating PGD and P4D entries corresponding to the kernel address
> space. The main purpose of these helpers is to ensure synchronization of
> the kernel portion of the top-level page tables whenever such an entry
> is populated.
> 
> Until now, the kernel has relied on each architecture to handle
> synchronization of top-level page tables in an ad-hoc manner.
> For example, see commit 9b861528a801 ("x86-64, mem: Update all PGDs for
> direct mapping and vmemmap mapping changes").
> 
> However, this approach has proven fragile, as it's easy to forget to
> perform the necessary synchronization when introducing new changes.
> 
> To address this, introduce _kernel() varients of the page table

s/varients/variants/

> population helpers that invoke architecture-specific hooks to properly
> synchronize the page tables.

I was expecting to see the sync be done in common code -- such that it 
cannot be missed :)

But it's really just rerouting to the arch code where the sync can be 
done, correct?

-- 
Cheers,

David / dhildenb

Re: [RFC V1 PATCH mm-hotfixes 1/3] mm: introduce and use {pgd,p4d}_populate_kernel()

Posted by Harry Yoo 7 months ago

On Fri, Jul 11, 2025 at 06:18:44PM +0200, David Hildenbrand wrote:
> On 09.07.25 15:16, Harry Yoo wrote:
> > Introduce and use {pgd,p4d}_populate_kernel() in core MM code when
> > populating PGD and P4D entries corresponding to the kernel address
> > space. The main purpose of these helpers is to ensure synchronization of
> > the kernel portion of the top-level page tables whenever such an entry
> > is populated.
> > 
> > Until now, the kernel has relied on each architecture to handle
> > synchronization of top-level page tables in an ad-hoc manner.
> > For example, see commit 9b861528a801 ("x86-64, mem: Update all PGDs for
> > direct mapping and vmemmap mapping changes").
> > 
> > However, this approach has proven fragile, as it's easy to forget to
> > perform the necessary synchronization when introducing new changes.
> > 
> > To address this, introduce _kernel() varients of the page table
> 
> s/varients/variants/

Will fix. Thanks.

> > population helpers that invoke architecture-specific hooks to properly
> > synchronize the page tables.
> 
> I was expecting to see the sync be done in common code -- such that it
> cannot be missed :)

You mean something like an arch-independent implementation of
sync_global_pgds()?

That would be a "much more robust" approach ;)

To do that, the kernel would need to maintain a list of page tables that
have kernel portion mapped and perform the sync in the common code.

But determining which page tables to add to the list would be highly
architecture-specific. For example, I think some architectures use separate
page tables for kernel space, unlike x86 (e.g., arm64 TTBR1, SPARC) and
user page tables should not be affected.

While doing the sync in common code might be a more robust option
in the long term, I'm afraid that making it work correctly across
all architectures would be challenging, due to differences in how each
architecture manages the kernel address space.

> But it's really just rerouting to the arch code where the sync can be done,
> correct?

Yes, that's correct.

Thanks for taking a look!

-- 
Cheers,
Harry / Hyeonggon

Re: [RFC V1 PATCH mm-hotfixes 1/3] mm: introduce and use {pgd,p4d}_populate_kernel()

Posted by Mike Rapoport 7 months ago

On Sun, Jul 13, 2025 at 08:39:53PM +0900, Harry Yoo wrote:
> On Fri, Jul 11, 2025 at 06:18:44PM +0200, David Hildenbrand wrote:
> > On 09.07.25 15:16, Harry Yoo wrote:
> > > Introduce and use {pgd,p4d}_populate_kernel() in core MM code when
> > > populating PGD and P4D entries corresponding to the kernel address
> > > space. The main purpose of these helpers is to ensure synchronization of
> > > the kernel portion of the top-level page tables whenever such an entry
> > > is populated.
> > > 
> > > Until now, the kernel has relied on each architecture to handle
> > > synchronization of top-level page tables in an ad-hoc manner.
> > > For example, see commit 9b861528a801 ("x86-64, mem: Update all PGDs for
> > > direct mapping and vmemmap mapping changes").
> > > 
> > > However, this approach has proven fragile, as it's easy to forget to
> > > perform the necessary synchronization when introducing new changes.
> > > 
> > > To address this, introduce _kernel() varients of the page table
> > 
> > s/varients/variants/
> 
> Will fix. Thanks.
> 
> > > population helpers that invoke architecture-specific hooks to properly
> > > synchronize the page tables.
> > 
> > I was expecting to see the sync be done in common code -- such that it
> > cannot be missed :)
> 
> You mean something like an arch-independent implementation of
> sync_global_pgds()?
>
> That would be a "much more robust" approach ;)
> 
> To do that, the kernel would need to maintain a list of page tables that
> have kernel portion mapped and perform the sync in the common code.
> 
> But determining which page tables to add to the list would be highly
> architecture-specific. For example, I think some architectures use separate
> page tables for kernel space, unlike x86 (e.g., arm64 TTBR1, SPARC) and
> user page tables should not be affected.

sync_global_pgds() can be still implemented per architecture, but it can be
called from the common code.
We already have something like that for vmalloc that calls
arch_sync_kernel_mappings(). It's implemented only by x86-32 and arm, other
architectures do not define it.

> While doing the sync in common code might be a more robust option
> in the long term, I'm afraid that making it work correctly across
> all architectures would be challenging, due to differences in how each
> architecture manages the kernel address space.
> 
> > But it's really just rerouting to the arch code where the sync can be done,
> > correct?
> 
> Yes, that's correct.
> 
> Thanks for taking a look!
> 
> -- 
> Cheers,
> Harry / Hyeonggon

-- 
Sincerely yours,
Mike.

Re: [RFC V1 PATCH mm-hotfixes 1/3] mm: introduce and use {pgd,p4d}_populate_kernel()

Posted by Harry Yoo 7 months ago

On Sun, Jul 13, 2025 at 08:56:10PM +0300, Mike Rapoport wrote:
> On Sun, Jul 13, 2025 at 08:39:53PM +0900, Harry Yoo wrote:
> > On Fri, Jul 11, 2025 at 06:18:44PM +0200, David Hildenbrand wrote:
> > > > population helpers that invoke architecture-specific hooks to properly
> > > > synchronize the page tables.
> > > 
> > > I was expecting to see the sync be done in common code -- such that it
> > > cannot be missed :)
> > 
> > You mean something like an arch-independent implementation of
> > sync_global_pgds()?
> >
> > That would be a "much more robust" approach ;)
> > 
> > To do that, the kernel would need to maintain a list of page tables that
> > have kernel portion mapped and perform the sync in the common code.
> > 
> > But determining which page tables to add to the list would be highly
> > architecture-specific. For example, I think some architectures use separate
> > page tables for kernel space, unlike x86 (e.g., arm64 TTBR1, SPARC) and
> > user page tables should not be affected.
> 
> sync_global_pgds() can be still implemented per architecture, but it can be
> called from the common code.

A good point, and that can be done!

Actually, that was the initial plan and I somehow thought that
you can't determine if the architecture is using 5-level or 4-level paging
and decide whether to call arch_sync_kernel_pagetables(). But looking at
how it's done in vmalloc, I think it can be done in a similar way.

> We already have something like that for vmalloc that calls
> arch_sync_kernel_mappings(). It's implemented only by x86-32 and arm, other
> architectures do not define it.

It is indeed a good example and was helpful.
Thank you for the comment, Mike!

> -- 
> Sincerely yours,
> Mike.

-- 
Cheers,
Harry / Hyeonggon

Re: [RFC V1 PATCH mm-hotfixes 1/3] mm: introduce and use {pgd,p4d}_populate_kernel()

Posted by Harry Yoo 6 months, 4 weeks ago

On Mon, Jul 14, 2025 at 05:10:44PM +0900, Harry Yoo wrote:
> On Sun, Jul 13, 2025 at 08:56:10PM +0300, Mike Rapoport wrote:
> > On Sun, Jul 13, 2025 at 08:39:53PM +0900, Harry Yoo wrote:
> > > On Fri, Jul 11, 2025 at 06:18:44PM +0200, David Hildenbrand wrote:
> > > > > population helpers that invoke architecture-specific hooks to properly
> > > > > synchronize the page tables.
> > > > 
> > > > I was expecting to see the sync be done in common code -- such that it
> > > > cannot be missed :)
> > > 
> > > You mean something like an arch-independent implementation of
> > > sync_global_pgds()?
> > >
> > > That would be a "much more robust" approach ;)
> > > 
> > > To do that, the kernel would need to maintain a list of page tables that
> > > have kernel portion mapped and perform the sync in the common code.
> > > 
> > > But determining which page tables to add to the list would be highly
> > > architecture-specific. For example, I think some architectures use separate
> > > page tables for kernel space, unlike x86 (e.g., arm64 TTBR1, SPARC) and
> > > user page tables should not be affected.
> > 
> > sync_global_pgds() can be still implemented per architecture, but it can be
> > called from the common code.
> 
> A good point, and that can be done!
> 
> Actually, that was the initial plan and I somehow thought that
> you can't determine if the architecture is using 5-level or 4-level paging
> and decide whether to call arch_sync_kernel_pagetables(). But looking at
> how it's done in vmalloc, I think it can be done in a similar way.
> 
> > We already have something like that for vmalloc that calls
> > arch_sync_kernel_mappings(). It's implemented only by x86-32 and arm, other
> > architectures do not define it.
> 
> It is indeed a good example and was helpful.
> Thank you for the comment, Mike!

[Adding Joerg to Cc]

Wait, after reading more of the history on the synchronization of page
tables for the vmalloc area, I realized that at least on x86-64, all PGD
entries for vmalloc are preallocated [1].

But in this case I'm not sure adding/removing memory to/from the
system is performance critical enough to warrant a similar optimization.

I'll stick with the current approach unless someone argues otherwise.

[1] https://lore.kernel.org/all/20200721095953.6218-2-joro@8bytes.org

Also, vmalloc and other features that use apply_to_page_range() are not
affected by this change, as they have their own ways to synchronize kernel
mappings.

Perhaps that can be unified later but given that this series needs to be
backported later, I'd prefer to fix the bug first and defer cleanups
to a later time.

Thanks!

-- 
Cheers,
Harry / Hyeonggon