[PATCH v5 2/7] x86/mm: Use a single cache hot per-CPU variable to record pgdir_shift

Posted by Ard Biesheuvel 7 months ago
From: Ard Biesheuvel <ardb@kernel.org>

Shrink the global variable 'pgdir_shift' to a single byte, and move it
to the cache hot per-CPU variable section so that it can be accessed
virtually for free.

Set the variable from asm code before entering C so that C code is
guaranteed to always consistently observe the correct value.

For the decompressor, provide an alternative that tests the CR4.LA57
control bit directly. This is more costly in principle, but that does
not matter for the early boot code, and the LA57 control bit is
guaranteed to be in sync with the number of paging levels in use.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/x86/boot/compressed/misc.h         |  6 +++++-
 arch/x86/boot/compressed/pgtable_64.c   |  2 --
 arch/x86/boot/startup/map_kernel.c      |  1 -
 arch/x86/include/asm/page_64_types.h    |  2 +-
 arch/x86/include/asm/pgtable_64_types.h | 13 +++++++++++--
 arch/x86/kernel/head64.c                |  2 --
 arch/x86/kernel/head_64.S               |  5 +++++
 arch/x86/mm/pgtable.c                   |  4 ++++
 8 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index db1048621ea2..97b80d08f03c 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -28,6 +28,10 @@
 #define __pa(x)  ((unsigned long)(x))
 #define __va(x)  ((void *)((unsigned long)(x)))
 
+#ifdef CONFIG_X86_64
+#define pgdir_shift()	(native_read_cr4() & X86_CR4_LA57 ? 48 : 39)
+#endif
+
 #include <linux/linkage.h>
 #include <linux/screen_info.h>
 #include <linux/elf.h>
@@ -189,7 +193,7 @@ static inline int count_immovable_mem_regions(void) { return 0; }
 #endif
 
 /* ident_map_64.c */
-extern unsigned int __pgtable_l5_enabled, pgdir_shift, ptrs_per_p4d;
+extern unsigned int __pgtable_l5_enabled, ptrs_per_p4d;
 extern void kernel_add_identity_map(unsigned long start, unsigned long end);
 
 /* Used by PAGE_KERN* macros: */
diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c
index bdd26050dff7..898a4e66e401 100644
--- a/arch/x86/boot/compressed/pgtable_64.c
+++ b/arch/x86/boot/compressed/pgtable_64.c
@@ -12,7 +12,6 @@
 
 /* __pgtable_l5_enabled needs to be in .data to avoid being cleared along with .bss */
 unsigned int __section(".data") __pgtable_l5_enabled;
-unsigned int __section(".data") pgdir_shift = 39;
 unsigned int __section(".data") ptrs_per_p4d = 1;
 
 /* Buffer to preserve trampoline memory */
@@ -123,7 +122,6 @@ asmlinkage void configure_5level_paging(struct boot_params *bp, void *pgtable)
 
 		/* Initialize variables for 5-level paging */
 		__pgtable_l5_enabled = 1;
-		pgdir_shift = 48;
 		ptrs_per_p4d = 512;
 	}
 
diff --git a/arch/x86/boot/startup/map_kernel.c b/arch/x86/boot/startup/map_kernel.c
index 332dbe6688c4..06306c5d016f 100644
--- a/arch/x86/boot/startup/map_kernel.c
+++ b/arch/x86/boot/startup/map_kernel.c
@@ -24,7 +24,6 @@ static inline bool check_la57_support(void)
 		return false;
 
 	__pgtable_l5_enabled	= 1;
-	pgdir_shift		= 48;
 	ptrs_per_p4d		= 512;
 
 	return true;
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 7400dab373fe..e1fccd9116c3 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -48,7 +48,7 @@
 /* See Documentation/arch/x86/x86_64/mm.rst for a description of the memory map. */
 
 #define __PHYSICAL_MASK_SHIFT	52
-#define __VIRTUAL_MASK_SHIFT	(pgtable_l5_enabled() ? 56 : 47)
+#define __VIRTUAL_MASK_SHIFT	(pgdir_shift() + 8)
 
 #define TASK_SIZE_MAX		task_size_max()
 #define DEFAULT_MAP_WINDOW	((1UL << 47) - PAGE_SIZE)
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 1481b234465a..3ee747f596e3 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -7,6 +7,7 @@
 #ifndef __ASSEMBLER__
 #include <linux/types.h>
 #include <asm/kaslr.h>
+#include <asm/percpu.h>
 
 /*
  * These are used to make use of C type-checking..
@@ -23,6 +24,15 @@ typedef struct { pmdval_t pmd; } pmd_t;
 
 extern unsigned int __pgtable_l5_enabled;
 
+#ifndef pgdir_shift
+DECLARE_PER_CPU_CACHE_HOT(u8, __pgdir_shift);
+
+static __always_inline __attribute_const__ u8 pgdir_shift(void)
+{
+	return this_cpu_read_stable(__pgdir_shift);
+}
+#endif /* pgdir_shift */
+
 #ifdef USE_EARLY_PGTABLE_L5
 /*
  * cpu_feature_enabled() is not available in early boot code.
@@ -36,7 +46,6 @@ static inline bool pgtable_l5_enabled(void)
 #define pgtable_l5_enabled() cpu_feature_enabled(X86_FEATURE_LA57)
 #endif /* USE_EARLY_PGTABLE_L5 */
 
-extern unsigned int pgdir_shift;
 extern unsigned int ptrs_per_p4d;
 
 #endif	/* !__ASSEMBLER__ */
@@ -44,7 +53,7 @@ extern unsigned int ptrs_per_p4d;
 /*
  * PGDIR_SHIFT determines what a top-level page table entry can map
  */
-#define PGDIR_SHIFT	pgdir_shift
+#define PGDIR_SHIFT	pgdir_shift()
 #define PTRS_PER_PGD	512
 
 /*
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 533fcf5636fc..e2d9e709ec01 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -52,8 +52,6 @@ SYM_PIC_ALIAS(next_early_pgt);
 pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
 
 unsigned int __pgtable_l5_enabled __ro_after_init;
-unsigned int pgdir_shift __ro_after_init = 39;
-EXPORT_SYMBOL(pgdir_shift);
 unsigned int ptrs_per_p4d __ro_after_init = 1;
 EXPORT_SYMBOL(ptrs_per_p4d);
 
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 3e9b3a3bd039..d35b75a5f892 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -71,6 +71,11 @@ SYM_CODE_START_NOALIGN(startup_64)
 	xorl	%edx, %edx
 	wrmsr
 
+	movq	%cr4, %rax
+	btl	$X86_CR4_LA57_BIT, %eax
+	jnc	0f
+	movb	$48, PER_CPU_VAR(__pgdir_shift)
+0:
 	call	startup_64_setup_gdt_idt
 
 	/* Now switch to __KERNEL_CS so IRET works reliably */
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 62777ba4de1a..a7eaeaa595f8 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -46,6 +46,10 @@ void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
 }
 
 #if CONFIG_PGTABLE_LEVELS > 4
+DEFINE_PER_CPU_CACHE_HOT(u8, __pgdir_shift) = 39;
+EXPORT_SYMBOL_GPL(__pgdir_shift);
+SYM_PIC_ALIAS(__pgdir_shift);
+
 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
 {
 	paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
-- 
2.49.0.1101.gccaa498523-goog
Re: [PATCH v5 2/7] x86/mm: Use a single cache hot per-CPU variable to record pgdir_shift
Posted by Kirill A. Shutemov 7 months ago
On Tue, May 20, 2025 at 12:41:41PM +0200, Ard Biesheuvel wrote:
> From: Ard Biesheuvel <ardb@kernel.org>
> 
> Shrink the global variable 'pgdir_shift' to a single byte, and move it
> to the cache hot per-CPU variable section so that it can be accessed
> virtually for free.

Hm. I am not convinced about per-CPU being useful here. Read-only cache
lines can be shared between cores anyway.

Putting it to __ro_after_init should do the trick, no?
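
I.e., keep more or less what head64.c has today (modulo where exactly it
gets bumped to 48):

	unsigned int pgdir_shift __ro_after_init = 39;
	EXPORT_SYMBOL(pgdir_shift);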

-- 
  Kiryl Shutsemau / Kirill A. Shutemov
Re: [PATCH v5 2/7] x86/mm: Use a single cache hot per-CPU variable to record pgdir_shift
Posted by Ard Biesheuvel 7 months ago
On Tue, 20 May 2025 at 13:03, Kirill A. Shutemov <kirill@shutemov.name> wrote:
>
> On Tue, May 20, 2025 at 12:41:41PM +0200, Ard Biesheuvel wrote:
> > From: Ard Biesheuvel <ardb@kernel.org>
> >
> > Shrink the global variable 'pgdir_shift' to a single byte, and move it
> > to the cache hot per-CPU variable section so that it can be accessed
> > virtually for free.
>
> Hm. I am not convinced about per-CPU being useful here. Read-only cache
> lines can be shared between cores anyway.
>
> Putting it to __ro_after_init should do the trick, no?
>

Are you saying we shouldn't optimize prematurely? You must be new here :-)

But seriously, whatever works is fine with me, as long as the
observable return value of pgtable_l5_enabled() never changes.
Re: [PATCH v5 2/7] x86/mm: Use a single cache hot per-CPU variable to record pgdir_shift
Posted by Borislav Petkov 7 months ago
On Tue, May 20, 2025 at 01:28:52PM +0200, Ard Biesheuvel wrote:
> Are you saying we shouldn't optimize prematurely? You must be new here :-)

Right, but do you see __pgdir_shift on a seriously hot path to warrant this?

I'd expect this to happen the other way around, really: "Hey, pgdir_shift is
getting accessed a lot, let's stick it somewhere where we can access it
quickly..."

Thx.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette
Re: [PATCH v5 2/7] x86/mm: Use a single cache hot per-CPU variable to record pgdir_shift
Posted by Ard Biesheuvel 6 months, 4 weeks ago
On Tue, 20 May 2025 at 16:35, Borislav Petkov <bp@alien8.de> wrote:
>
> On Tue, May 20, 2025 at 01:28:52PM +0200, Ard Biesheuvel wrote:
> > Are you saying we shouldn't optimize prematurely? You must be new here :-)
>
> Right, but do you see __pgdir_shift on a seriously hot path to warrant this?
>

No. But if you had read the next couple of patches, you would have
noticed that PGDIR_SHIFT, PTRS_PER_P4D and pgtable_l5_enabled() will
all be derived from this variable, and the latter currently uses code
patching (in cpu_feature_enabled())

This is also explained in the cover letter btw
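
(That is, the current

	#define pgtable_l5_enabled() cpu_feature_enabled(X86_FEATURE_LA57)

definition, visible in the pgtable_64_types.h hunk above.)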
Re: [PATCH v5 2/7] x86/mm: Use a single cache hot per-CPU variable to record pgdir_shift
Posted by Borislav Petkov 6 months, 4 weeks ago
On Tue, May 20, 2025 at 07:03:37PM +0200, Ard Biesheuvel wrote:
> No. But if you had read the next couple of patches, you would have
> noticed that PGDIR_SHIFT, PTRS_PER_P4D and pgtable_l5_enabled() will
> all be derived from this variable, and the latter currently uses code
> patching (in cpu_feature_enabled())
> 
> This is also explained in the cover letter btw

Yes, I saw that.

The question remains: are the *users* - PGDIR_SHIFT, etc - on some hot path
which I'm not seeing?

For example pgd_index() is called in a bunch of places and I guess that adds
up. But without measuring that, we won't know for sure.

Looking at an example:

# ./arch/x86/include/asm/pgtable_64_types.h:32:         return this_cpu_read_stable(__pgdir_shift);
#APP
# 32 "./arch/x86/include/asm/pgtable_64_types.h" 1
        movb %gs:__pgdir_shift(%rip), %cl       #, pfo_val__
# 0 "" 2

...

        movq    40(%rdi), %rax  # ppd_20(D)->vaddr, tmp128
        shrq    %cl, %rax       # pfo_val__, tmp128
# arch/x86/boot/startup/sme.c:105:      pgd_p = ppd->pgd + pgd_index(ppd->vaddr);
        movq    8(%rdi), %rcx   # ppd_20(D)->pgd, ppd_20(D)->pgd
# arch/x86/boot/startup/sme.c:105:      pgd_p = ppd->pgd + pgd_index(ppd->vaddr);
        andl    $511, %eax      #, tmp130
# arch/x86/boot/startup/sme.c:105:      pgd_p = ppd->pgd + pgd_index(ppd->vaddr);
        leaq    (%rcx,%rax,8), %rsi     #, pgd_p

that looks like two insns to me: the RIP-relative mov to %cl and then the
shift.

If you use a "normal" variable, that would be also two insns, no?

Or am I way off?

Because if not, the percpu thing doesn't buy you anything...

Thx.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette
Re: [PATCH v5 2/7] x86/mm: Use a single cache hot per-CPU variable to record pgdir_shift
Posted by Ard Biesheuvel 6 months, 4 weeks ago
On Tue, 20 May 2025 at 19:38, Borislav Petkov <bp@alien8.de> wrote:
>
> On Tue, May 20, 2025 at 07:03:37PM +0200, Ard Biesheuvel wrote:
> > No. But if you had read the next couple of patches, you would have
> > noticed that PGDIR_SHIFT, PTRS_PER_P4D and pgtable_l5_enabled() will
> > all be derived from this variable, and the latter currently uses code
> > patching (in cpu_feature_enabled())
> >
> > This is also explained in the cover letter btw
>
> Yes, I saw that.
>
> The question remains: are the *users* - PGDIR_SHIFT, etc - on some hot path
> which I'm not seeing?
>
> For example pgd_index() is called in a bunch of places and I guess that adds
> up. But without measuring that, we won't know for sure.
>

Look at pgtable_l5_enabled() please, that is the important one.

> Looking at an example:
>
> # ./arch/x86/include/asm/pgtable_64_types.h:32:         return this_cpu_read_stable(__pgdir_shift);
> #APP
> # 32 "./arch/x86/include/asm/pgtable_64_types.h" 1
>         movb %gs:__pgdir_shift(%rip), %cl       #, pfo_val__
> # 0 "" 2
>
> ...
>
>         movq    40(%rdi), %rax  # ppd_20(D)->vaddr, tmp128
>         shrq    %cl, %rax       # pfo_val__, tmp128
> # arch/x86/boot/startup/sme.c:105:      pgd_p = ppd->pgd + pgd_index(ppd->vaddr);
>         movq    8(%rdi), %rcx   # ppd_20(D)->pgd, ppd_20(D)->pgd
> # arch/x86/boot/startup/sme.c:105:      pgd_p = ppd->pgd + pgd_index(ppd->vaddr);
>         andl    $511, %eax      #, tmp130
> # arch/x86/boot/startup/sme.c:105:      pgd_p = ppd->pgd + pgd_index(ppd->vaddr);
>         leaq    (%rcx,%rax,8), %rsi     #, pgd_p
>
> that looks like two insns to me: the RIP-relative mov to %cl and then the
> shift.
>
> If you use a "normal" variable, that would be also two insns, no?
>
> Or am I way off?
>
> Because if not, the percpu thing doesn't buy you anything...
>

The variable access is identical in terms of instructions, the only
difference is the %gs offset being applied, and the fact that using
cache hot data is guaranteed not to increase the number of cachelines
covering the working set of any existing workload (the region is
bounded to a fixed number of cachelines)

Happy to keep this as a simple __ro_after_init variable if there is
consensus between the tip maintainers that we don't need this perf
advantage, or we can use some kind of code patching that is not CPU
feature based or ternary alternative based to short circuit these
things (and pgtable_l5_enabled() in particular) after boot.
Re: [PATCH v5 2/7] x86/mm: Use a single cache hot per-CPU variable to record pgdir_shift
Posted by Borislav Petkov 6 months, 4 weeks ago
On Tue, May 20, 2025 at 07:46:33PM +0200, Ard Biesheuvel wrote:
> Look at pgtable_l5_enabled() please, that is the important one.

OMG. :-)

# 32 "./arch/x86/include/asm/pgtable_64_types.h" 1
        movb %gs:__pgdir_shift(%rip), %al       #, pfo_val__
# 0 "" 2
# ./arch/x86/include/asm/pgtable.h:1178:        if (!pgtable_l5_enabled())
#NO_APP
        testb   $1, %al #, pfo_val__

I hadn't seen this fun yet:

!(pgdir_shift() & 1)

(39 is odd and 48 is even, so bit 0 alone distinguishes 4-level from
5-level.)

> The variable access is identical in terms of instructions, the only
> difference is the %gs offset being applied, and the fact that using
> cache hot data is guaranteed not to increase the number of cachelines
> covering the working set of any existing workload (the region is
> bounded to a fixed number of cachelines)

Yes, but look at all the callers. I hardly see any serious hot paths to care
about the %gs offset being applied.

And I'll be really surprised if that *shows* in any sensible benchmark...

Also, it doesn't make any sense for a *global* system property - what paging
level the machine uses - to be in a *per-CPU* variable. That's a global value
which is the same everywhere. So why waste space?

> Happy to keep this as a simple __ro_after_init variable if there is
> consensus between the tip maintainers that we don't need this perf

Yeah, IMO, no need. But let's see what the others say.

Thx.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette
Re: [PATCH v5 2/7] x86/mm: Use a single cache hot per-CPU variable to record pgdir_shift
Posted by Linus Torvalds 6 months, 4 weeks ago
On Tue, 20 May 2025 at 11:01, Borislav Petkov <bp@alien8.de> wrote:
>
> OMG. :-)
>
> # 32 "./arch/x86/include/asm/pgtable_64_types.h" 1
>         movb %gs:__pgdir_shift(%rip), %al       #, pfo_val__
> # 0 "" 2
> # ./arch/x86/include/asm/pgtable.h:1178:        if (!pgtable_l5_enabled())
> #NO_APP
>         testb   $1, %al #, pfo_val__

That's garbage.

Gcc should be able to turn it into just a

        testb $1,%gs:__pgdir_shift(%rip)

What happens if pgtable_l5_enabled() is made to use  __raw_cpu_read()?
With a compiler that is new enough to support USE_X86_SEG_SUPPORT?

Oh, and it looks like we messed up __raw_cpu_read_stable(), because
that *always* uses the inline asm, so it doesn't allow the compiler to
just DTRT and do things like the above.

                Linus
Re: [PATCH v5 2/7] x86/mm: Use a single cache hot per-CPU variable to record pgdir_shift
Posted by Ard Biesheuvel 6 months, 4 weeks ago
On Tue, 20 May 2025 at 20:28, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> On Tue, 20 May 2025 at 11:01, Borislav Petkov <bp@alien8.de> wrote:
> >
> > OMG. :-)
> >
> > # 32 "./arch/x86/include/asm/pgtable_64_types.h" 1
> >         movb %gs:__pgdir_shift(%rip), %al       #, pfo_val__
> > # 0 "" 2
> > # ./arch/x86/include/asm/pgtable.h:1178:        if (!pgtable_l5_enabled())
> > #NO_APP
> >         testb   $1, %al #, pfo_val__
>
> That's garbage.
>
> Gcc should be able to turn it into just a
>
>         testb $1,%gs:__pgdir_shift(%rip)
>
> What happens if pgtable_l5_enabled() is made to use  __raw_cpu_read()?
> With a compiler that is new enough to support USE_X86_SEG_SUPPORT?
>
> Oh, and it looks like we messed up __raw_cpu_read_stable(), because
> that *always* uses the inline asm, so it doesn't allow the compiler to
> just DTRT and do things like the above.
>

I suppose that is what the this_cpu_read_const() is for, but that
requires a const alias declaration and an entry in the linker script.

If we decide to go with a per-cpu var, I can add this
USE_X86_SEG_SUPPORT codepath like we have in other places - that
should give the compiler sufficient insight into the fact that this
value never changes, and generate optimal code as a result.
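
Roughly something like this (just a sketch - the exact accessor would
need to be double-checked against percpu.h):

	static __always_inline __attribute_const__ u8 pgdir_shift(void)
	{
	#ifdef CONFIG_USE_X86_SEG_SUPPORT
		/* plain %gs address space load the compiler is free to fold/CSE */
		return raw_cpu_read(__pgdir_shift);
	#else
		return this_cpu_read_stable(__pgdir_shift);
	#endif
	}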

For an ordinary variable, I suppose we can still declare it as const,
__attribute__((const)) etc if it is defined in and set from the asm
startup code, resulting in the same potential for optimization
(without alternatives or runtime const hacks).
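
Something along these lines (sketch only - the definition would live in
the asm startup code, which is the only writer and runs before any C):

	/* written only from asm before C entry, hence const from C's view */
	extern const u8 __pgdir_shift;

	static __always_inline __attribute_const__ u8 pgdir_shift(void)
	{
		return __pgdir_shift;
	}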
Re: [PATCH v5 2/7] x86/mm: Use a single cache hot per-CPU variable to record pgdir_shift
Posted by Borislav Petkov 6 months, 4 weeks ago
On Tue, May 20, 2025 at 11:28:28AM -0700, Linus Torvalds wrote:
> What happens if pgtable_l5_enabled() is made to use  __raw_cpu_read()?
> With a compiler that is new enough to support USE_X86_SEG_SUPPORT?

Before we go play with this more, I see the same thing with gcc-13 and gcc-14
on debian.

Those should be fairly new, I'd think...

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette