[PATCH v3 1/7] x86/cpu: Use a new feature flag for 5 level paging

Ard Biesheuvel posted 7 patches 8 months, 4 weeks ago
There is a newer version of this series
[PATCH v3 1/7] x86/cpu: Use a new feature flag for 5 level paging
Posted by Ard Biesheuvel 8 months, 4 weeks ago
From: Ard Biesheuvel <ardb@kernel.org>

Currently, the LA57 CPU feature flag is taken to mean two different
things at once:
- whether the CPU implements the LA57 extension, and is therefore
  capable of supporting 5 level paging;
- whether 5 level paging is currently in use.

This means the LA57 capability of the hardware is hidden when a LA57
capable CPU is forced to run with 4 levels of paging. It also means the
the ordinary CPU capability detection code will happily set the LA57
capability and it needs to be cleared explicitly afterwards to avoid
inconsistencies.

Separate the two so that the CPU hardware capability can be identified
unambigously in all cases.

To avoid breaking existing users that might assume that 5 level paging
is being used when the "la57" string is visible in /proc/cpuinfo,
repurpose that string to mean that 5-level paging is in use, and add a
new string la57_capable to indicate that the CPU feature is implemented
by the hardware.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/x86/include/asm/cpufeatures.h               |  3 ++-
 arch/x86/include/asm/page_64.h                   |  2 +-
 arch/x86/include/asm/pgtable_64_types.h          |  2 +-
 arch/x86/kernel/cpu/common.c                     | 16 ++--------------
 arch/x86/kvm/x86.h                               |  4 ++--
 drivers/iommu/amd/init.c                         |  4 ++--
 drivers/iommu/intel/svm.c                        |  4 ++--
 tools/testing/selftests/kvm/x86/set_sregs_test.c |  2 +-
 8 files changed, 13 insertions(+), 24 deletions(-)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index f67a93fc9391..d59bee5907e7 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -395,7 +395,7 @@
 #define X86_FEATURE_AVX512_BITALG	(16*32+12) /* "avx512_bitalg" Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */
 #define X86_FEATURE_TME			(16*32+13) /* "tme" Intel Total Memory Encryption */
 #define X86_FEATURE_AVX512_VPOPCNTDQ	(16*32+14) /* "avx512_vpopcntdq" POPCNT for vectors of DW/QW */
-#define X86_FEATURE_LA57		(16*32+16) /* "la57" 5-level page tables */
+#define X86_FEATURE_LA57		(16*32+16) /* "la57_hw" 5-level page tables */
 #define X86_FEATURE_RDPID		(16*32+22) /* "rdpid" RDPID instruction */
 #define X86_FEATURE_BUS_LOCK_DETECT	(16*32+24) /* "bus_lock_detect" Bus Lock detect */
 #define X86_FEATURE_CLDEMOTE		(16*32+25) /* "cldemote" CLDEMOTE instruction */
@@ -483,6 +483,7 @@
 #define X86_FEATURE_PREFER_YMM		(21*32+ 8) /* Avoid ZMM registers due to downclocking */
 #define X86_FEATURE_APX			(21*32+ 9) /* Advanced Performance Extensions */
 #define X86_FEATURE_INDIRECT_THUNK_ITS	(21*32+10) /* Use thunk for indirect branches in lower half of cacheline */
+#define X86_FEATURE_5LEVEL_PAGING	(21*32+11) /* "la57" Whether 5 levels of page tables are in use */
 
 /*
  * BUG word(s)
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index d3aab6f4e59a..acfa61ad0725 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -86,7 +86,7 @@ static __always_inline unsigned long task_size_max(void)
 	unsigned long ret;
 
 	alternative_io("movq %[small],%0","movq %[large],%0",
-			X86_FEATURE_LA57,
+			X86_FEATURE_5LEVEL_PAGING,
 			"=r" (ret),
 			[small] "i" ((1ul << 47)-PAGE_SIZE),
 			[large] "i" ((1ul << 56)-PAGE_SIZE));
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index e83721db18c9..d31ae12663bb 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -34,7 +34,7 @@ static inline bool pgtable_l5_enabled(void)
 	return __pgtable_l5_enabled;
 }
 #else
-#define pgtable_l5_enabled() cpu_feature_enabled(X86_FEATURE_LA57)
+#define pgtable_l5_enabled() cpu_feature_enabled(X86_FEATURE_5LEVEL_PAGING)
 #endif /* USE_EARLY_PGTABLE_L5 */
 
 #else
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 114aaaf6ae8a..6f7827015834 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1755,20 +1755,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 	setup_clear_cpu_cap(X86_FEATURE_PCID);
 #endif
 
-	/*
-	 * Later in the boot process pgtable_l5_enabled() relies on
-	 * cpu_feature_enabled(X86_FEATURE_LA57). If 5-level paging is not
-	 * enabled by this point we need to clear the feature bit to avoid
-	 * false-positives at the later stage.
-	 *
-	 * pgtable_l5_enabled() can be false here for several reasons:
-	 *  - 5-level paging is disabled compile-time;
-	 *  - it's 32-bit kernel;
-	 *  - machine doesn't support 5-level paging;
-	 *  - user specified 'no5lvl' in kernel command line.
-	 */
-	if (!pgtable_l5_enabled())
-		setup_clear_cpu_cap(X86_FEATURE_LA57);
+	if (IS_ENABLED(CONFIG_X86_5LEVEL) && (native_read_cr4() & X86_CR4_LA57))
+		setup_force_cpu_cap(X86_FEATURE_5LEVEL_PAGING);
 
 	detect_nopl();
 }
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 9dc32a409076..d2c093f17ae5 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -243,7 +243,7 @@ static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu)
 
 static inline u8 max_host_virt_addr_bits(void)
 {
-	return kvm_cpu_cap_has(X86_FEATURE_LA57) ? 57 : 48;
+	return kvm_cpu_cap_has(X86_FEATURE_5LEVEL_PAGING) ? 57 : 48;
 }
 
 /*
@@ -603,7 +603,7 @@ static inline bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 		__reserved_bits |= X86_CR4_FSGSBASE;    \
 	if (!__cpu_has(__c, X86_FEATURE_PKU))           \
 		__reserved_bits |= X86_CR4_PKE;         \
-	if (!__cpu_has(__c, X86_FEATURE_LA57))          \
+	if (!__cpu_has(__c, X86_FEATURE_5LEVEL_PAGING))          \
 		__reserved_bits |= X86_CR4_LA57;        \
 	if (!__cpu_has(__c, X86_FEATURE_UMIP))          \
 		__reserved_bits |= X86_CR4_UMIP;        \
diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index 14aa0d77df26..083fca8f8b97 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -3084,7 +3084,7 @@ static int __init early_amd_iommu_init(void)
 		goto out;
 
 	/* 5 level guest page table */
-	if (cpu_feature_enabled(X86_FEATURE_LA57) &&
+	if (cpu_feature_enabled(X86_FEATURE_5LEVEL_PAGING) &&
 	    FIELD_GET(FEATURE_GATS, amd_iommu_efr) == GUEST_PGTABLE_5_LEVEL)
 		amd_iommu_gpt_level = PAGE_MODE_5_LEVEL;
 
@@ -3691,7 +3691,7 @@ __setup("ivrs_acpihid",		parse_ivrs_acpihid);
 bool amd_iommu_pasid_supported(void)
 {
 	/* CPU page table size should match IOMMU guest page table size */
-	if (cpu_feature_enabled(X86_FEATURE_LA57) &&
+	if (cpu_feature_enabled(X86_FEATURE_5LEVEL_PAGING) &&
 	    amd_iommu_gpt_level != PAGE_MODE_5_LEVEL)
 		return false;
 
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index ba93123cb4eb..1f615e6d06ec 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -37,7 +37,7 @@ void intel_svm_check(struct intel_iommu *iommu)
 		return;
 	}
 
-	if (cpu_feature_enabled(X86_FEATURE_LA57) &&
+	if (cpu_feature_enabled(X86_FEATURE_5LEVEL_PAGING) &&
 	    !cap_fl5lp_support(iommu->cap)) {
 		pr_err("%s SVM disabled, incompatible paging mode\n",
 		       iommu->name);
@@ -165,7 +165,7 @@ static int intel_svm_set_dev_pasid(struct iommu_domain *domain,
 		return PTR_ERR(dev_pasid);
 
 	/* Setup the pasid table: */
-	sflags = cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0;
+	sflags = cpu_feature_enabled(X86_FEATURE_5LEVEL_PAGING) ? PASID_FLAG_FL5LP : 0;
 	ret = __domain_setup_first_level(iommu, dev, pasid,
 					 FLPT_DEFAULT_DID, mm->pgd,
 					 sflags, old);
diff --git a/tools/testing/selftests/kvm/x86/set_sregs_test.c b/tools/testing/selftests/kvm/x86/set_sregs_test.c
index f4095a3d1278..de78665fa675 100644
--- a/tools/testing/selftests/kvm/x86/set_sregs_test.c
+++ b/tools/testing/selftests/kvm/x86/set_sregs_test.c
@@ -52,7 +52,7 @@ static uint64_t calc_supported_cr4_feature_bits(void)
 
 	if (kvm_cpu_has(X86_FEATURE_UMIP))
 		cr4 |= X86_CR4_UMIP;
-	if (kvm_cpu_has(X86_FEATURE_LA57))
+	if (kvm_cpu_has(X86_FEATURE_5LEVEL_PAGING))
 		cr4 |= X86_CR4_LA57;
 	if (kvm_cpu_has(X86_FEATURE_VMX))
 		cr4 |= X86_CR4_VMXE;
-- 
2.49.0.1101.gccaa498523-goog
Re: [PATCH v3 1/7] x86/cpu: Use a new feature flag for 5 level paging
Posted by Borislav Petkov 8 months, 4 weeks ago
On Wed, May 14, 2025 at 12:42:44PM +0200, Ard Biesheuvel wrote:
> From: Ard Biesheuvel <ardb@kernel.org>
> 
> Currently, the LA57 CPU feature flag is taken to mean two different
> things at once:
> - whether the CPU implements the LA57 extension, and is therefore
>   capable of supporting 5 level paging;
> - whether 5 level paging is currently in use.

Btw, that gunk:

We had started simplifying the whole 5-level crap:

https://lore.kernel.org/all/20240621164406.256314-1-kirill.shutemov@linux.intel.com/

Shivank, I hear the performance issues got resolved in the meantime?

Thx.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette
Re: [PATCH v3 1/7] x86/cpu: Use a new feature flag for 5 level paging
Posted by Shivank Garg 8 months, 4 weeks ago

On 5/15/2025 6:41 PM, Borislav Petkov wrote:
> On Wed, May 14, 2025 at 12:42:44PM +0200, Ard Biesheuvel wrote:
>> From: Ard Biesheuvel <ardb@kernel.org>
>>
>> Currently, the LA57 CPU feature flag is taken to mean two different
>> things at once:
>> - whether the CPU implements the LA57 extension, and is therefore
>>   capable of supporting 5 level paging;
>> - whether 5 level paging is currently in use.
> 
> Btw, that gunk:
> 
> We had started simplifying the whole 5-level crap:
> 
> https://lore.kernel.org/all/20240621164406.256314-1-kirill.shutemov@linux.intel.com/
> 
> Shivank, I hear the performance issues got resolved in the meantime?
> 
> Thx.
> 

Hi Boris,

I've re-tested the performance concerns we discussed earlier regarding 5-level paging.
Recent tests on a current kernel don't show any performance issues:

AMD EPYC Zen 5 (SMT enabled).
Linux HEAD 6.15.0-rc6+ 088d13246a46

lmbench/lat_pagefault:
numactl --membind=1 --cpunodebind=1 bin/x86_64-linux-gnu/lat_pagefault -N 100 1GB_randomfile

Output values (50 runs, Mean, 2.5 percentile and 97.5 percentile, in microseconds):

4-level (no5lvl option)
Mean: 0.138876
     2.5%     97.5%
0.1384988 0.1392532

4-level (CONFIG_X86_5LEVEL=n)
Mean: 0.137958
     2.5%     97.5%
0.1376473 0.1382687

5-level
Mean: 0.138904
     2.5%     97.5%
0.1384789 0.1393291

After repeating the experiments a few times, the observed difference(~1%) in mean values
is under noise levels.
I think these results address the performance concerns previously raised[1]. I don't
foresee any issues in proceeding with the 5-level paging implementation
simplification efforts[2].

[1] https://lore.kernel.org/all/80734605-1926-4ac7-9c63-006fe3ea6b6a@amd.com
[2] https://lore.kernel.org/all/20240621164406.256314-1-kirill.shutemov@linux.intel.com

Thanks,
Shivank
Re: [PATCH v3 1/7] x86/cpu: Use a new feature flag for 5 level paging
Posted by Borislav Petkov 8 months, 4 weeks ago
On Thu, May 15, 2025 at 11:50:17PM +0530, Shivank Garg wrote:
> I've re-tested the performance concerns we discussed earlier regarding 5-level paging.
> Recent tests on a current kernel don't show any performance issues:
> 
> AMD EPYC Zen 5 (SMT enabled).
> Linux HEAD 6.15.0-rc6+ 088d13246a46
> 
> lmbench/lat_pagefault:
> numactl --membind=1 --cpunodebind=1 bin/x86_64-linux-gnu/lat_pagefault -N 100 1GB_randomfile
> 
> Output values (50 runs, Mean, 2.5 percentile and 97.5 percentile, in microseconds):
> 
> 4-level (no5lvl option)
> Mean: 0.138876
>      2.5%     97.5%
> 0.1384988 0.1392532
> 
> 4-level (CONFIG_X86_5LEVEL=n)
> Mean: 0.137958
>      2.5%     97.5%
> 0.1376473 0.1382687
> 
> 5-level
> Mean: 0.138904
>      2.5%     97.5%
> 0.1384789 0.1393291
> 
> After repeating the experiments a few times, the observed difference(~1%) in mean values
> is under noise levels.
> I think these results address the performance concerns previously raised[1]. I don't
> foresee any issues in proceeding with the 5-level paging implementation
> simplification efforts[2].
> 
> [1] https://lore.kernel.org/all/80734605-1926-4ac7-9c63-006fe3ea6b6a@amd.com
> [2] https://lore.kernel.org/all/20240621164406.256314-1-kirill.shutemov@linux.intel.com

I guess Kirill could dust off his patchset from [2] and that would get rid of
CONFIG_X86_5LEVEL and likely simplify that aspect considerably...

I'd say.

And then Ard's patches would get even simpler...

Thx.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette
Re: [PATCH v3 1/7] x86/cpu: Use a new feature flag for 5 level paging
Posted by Kirill A. Shutemov 8 months, 3 weeks ago
On Thu, May 15, 2025 at 09:11:31PM +0200, Borislav Petkov wrote:
> On Thu, May 15, 2025 at 11:50:17PM +0530, Shivank Garg wrote:
> > I've re-tested the performance concerns we discussed earlier regarding 5-level paging.
> > Recent tests on a current kernel don't show any performance issues:
> > 
> > AMD EPYC Zen 5 (SMT enabled).
> > Linux HEAD 6.15.0-rc6+ 088d13246a46
> > 
> > lmbench/lat_pagefault:
> > numactl --membind=1 --cpunodebind=1 bin/x86_64-linux-gnu/lat_pagefault -N 100 1GB_randomfile
> > 
> > Output values (50 runs, Mean, 2.5 percentile and 97.5 percentile, in microseconds):
> > 
> > 4-level (no5lvl option)
> > Mean: 0.138876
> >      2.5%     97.5%
> > 0.1384988 0.1392532
> > 
> > 4-level (CONFIG_X86_5LEVEL=n)
> > Mean: 0.137958
> >      2.5%     97.5%
> > 0.1376473 0.1382687
> > 
> > 5-level
> > Mean: 0.138904
> >      2.5%     97.5%
> > 0.1384789 0.1393291
> > 
> > After repeating the experiments a few times, the observed difference(~1%) in mean values
> > is under noise levels.
> > I think these results address the performance concerns previously raised[1]. I don't
> > foresee any issues in proceeding with the 5-level paging implementation
> > simplification efforts[2].
> > 
> > [1] https://lore.kernel.org/all/80734605-1926-4ac7-9c63-006fe3ea6b6a@amd.com
> > [2] https://lore.kernel.org/all/20240621164406.256314-1-kirill.shutemov@linux.intel.com
> 
> I guess Kirill could dust off his patchset from [2] and that would get rid of
> CONFIG_X86_5LEVEL and likely simplify that aspect considerably...

https://lore.kernel.org/all/20250516091534.3414310-1-kirill.shutemov@linux.intel.com/

-- 
  Kiryl Shutsemau / Kirill A. Shutemov
Re: [PATCH v3 1/7] x86/cpu: Use a new feature flag for 5 level paging
Posted by Ard Biesheuvel 8 months, 4 weeks ago
On Thu, 15 May 2025 at 14:11, Borislav Petkov <bp@alien8.de> wrote:
>
> On Wed, May 14, 2025 at 12:42:44PM +0200, Ard Biesheuvel wrote:
> > From: Ard Biesheuvel <ardb@kernel.org>
> >
> > Currently, the LA57 CPU feature flag is taken to mean two different
> > things at once:
> > - whether the CPU implements the LA57 extension, and is therefore
> >   capable of supporting 5 level paging;
> > - whether 5 level paging is currently in use.
>
> Btw, that gunk:
>
> We had started simplifying the whole 5-level crap:
>
> https://lore.kernel.org/all/20240621164406.256314-1-kirill.shutemov@linux.intel.com/
>
> Shivank, I hear the performance issues got resolved in the meantime?
>

It would be interesting to know whether CONFIG_X86_5LEVEL=n deviates
from CONFIG_X86_5LEVEL=y with 'no5lvl' on the command line. If passing
'no5lvl' makes up for the performance hit, then I don't think the
performance issues should stop us from removing this Kconfig symbol.
Re: [PATCH v3 1/7] x86/cpu: Use a new feature flag for 5 level paging
Posted by David Laight 8 months, 3 weeks ago
On Thu, 15 May 2025 14:33:50 +0100
Ard Biesheuvel <ardb@kernel.org> wrote:

> On Thu, 15 May 2025 at 14:11, Borislav Petkov <bp@alien8.de> wrote:
> >
> > On Wed, May 14, 2025 at 12:42:44PM +0200, Ard Biesheuvel wrote:  
> > > From: Ard Biesheuvel <ardb@kernel.org>
> > >
> > > Currently, the LA57 CPU feature flag is taken to mean two different
> > > things at once:
> > > - whether the CPU implements the LA57 extension, and is therefore
> > >   capable of supporting 5 level paging;
> > > - whether 5 level paging is currently in use.  
> >
> > Btw, that gunk:
> >
> > We had started simplifying the whole 5-level crap:
> >
> > https://lore.kernel.org/all/20240621164406.256314-1-kirill.shutemov@linux.intel.com/
> >
> > Shivank, I hear the performance issues got resolved in the meantime?
> >  
> 
> It would be interesting to know whether CONFIG_X86_5LEVEL=n deviates
> from CONFIG_X86_5LEVEL=y with 'no5lvl' on the command line. If passing
> 'no5lvl' makes up for the performance hit, then I don't think the
> performance issues should stop us from removing this Kconfig symbol.

You might then want a Kconfig option to invert the default for the
command line option (and an inverted command line option).

	David
Re: [PATCH v3 1/7] x86/cpu: Use a new feature flag for 5 level paging
Posted by Borislav Petkov 8 months, 4 weeks ago
On Wed, May 14, 2025 at 12:42:44PM +0200, Ard Biesheuvel wrote:
> diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
> index f67a93fc9391..d59bee5907e7 100644
> --- a/arch/x86/include/asm/cpufeatures.h
> +++ b/arch/x86/include/asm/cpufeatures.h
> @@ -395,7 +395,7 @@
>  #define X86_FEATURE_AVX512_BITALG	(16*32+12) /* "avx512_bitalg" Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */
>  #define X86_FEATURE_TME			(16*32+13) /* "tme" Intel Total Memory Encryption */
>  #define X86_FEATURE_AVX512_VPOPCNTDQ	(16*32+14) /* "avx512_vpopcntdq" POPCNT for vectors of DW/QW */
> -#define X86_FEATURE_LA57		(16*32+16) /* "la57" 5-level page tables */
> +#define X86_FEATURE_LA57		(16*32+16) /* "la57_hw" 5-level page tables */

Is there any real reason to expose that flag in /proc/cpuinfo?

>  #define X86_FEATURE_RDPID		(16*32+22) /* "rdpid" RDPID instruction */
>  #define X86_FEATURE_BUS_LOCK_DETECT	(16*32+24) /* "bus_lock_detect" Bus Lock detect */
>  #define X86_FEATURE_CLDEMOTE		(16*32+25) /* "cldemote" CLDEMOTE instruction */
> @@ -483,6 +483,7 @@
>  #define X86_FEATURE_PREFER_YMM		(21*32+ 8) /* Avoid ZMM registers due to downclocking */
>  #define X86_FEATURE_APX			(21*32+ 9) /* Advanced Performance Extensions */
>  #define X86_FEATURE_INDIRECT_THUNK_ITS	(21*32+10) /* Use thunk for indirect branches in lower half of cacheline */
> +#define X86_FEATURE_5LEVEL_PAGING	(21*32+11) /* "la57" Whether 5 levels of page tables are in use */

Or can we stick to that one?

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette
Re: [PATCH v3 1/7] x86/cpu: Use a new feature flag for 5 level paging
Posted by Ard Biesheuvel 8 months, 4 weeks ago
On Thu, 15 May 2025 at 10:52, Borislav Petkov <bp@alien8.de> wrote:
>
> On Wed, May 14, 2025 at 12:42:44PM +0200, Ard Biesheuvel wrote:
> > diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
> > index f67a93fc9391..d59bee5907e7 100644
> > --- a/arch/x86/include/asm/cpufeatures.h
> > +++ b/arch/x86/include/asm/cpufeatures.h
> > @@ -395,7 +395,7 @@
> >  #define X86_FEATURE_AVX512_BITALG    (16*32+12) /* "avx512_bitalg" Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */
> >  #define X86_FEATURE_TME                      (16*32+13) /* "tme" Intel Total Memory Encryption */
> >  #define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* "avx512_vpopcntdq" POPCNT for vectors of DW/QW */
> > -#define X86_FEATURE_LA57             (16*32+16) /* "la57" 5-level page tables */
> > +#define X86_FEATURE_LA57             (16*32+16) /* "la57_hw" 5-level page tables */
>
> Is there any real reason to expose that flag in /proc/cpuinfo?
>

I'd lean towards not lying about whether the CPU is la57 capable in
/proc/cpuinfo if we don't have to - this flag directly reflects the
CPUID leaf.

> >  #define X86_FEATURE_RDPID            (16*32+22) /* "rdpid" RDPID instruction */
> >  #define X86_FEATURE_BUS_LOCK_DETECT  (16*32+24) /* "bus_lock_detect" Bus Lock detect */
> >  #define X86_FEATURE_CLDEMOTE         (16*32+25) /* "cldemote" CLDEMOTE instruction */
> > @@ -483,6 +483,7 @@
> >  #define X86_FEATURE_PREFER_YMM               (21*32+ 8) /* Avoid ZMM registers due to downclocking */
> >  #define X86_FEATURE_APX                      (21*32+ 9) /* Advanced Performance Extensions */
> >  #define X86_FEATURE_INDIRECT_THUNK_ITS       (21*32+10) /* Use thunk for indirect branches in lower half of cacheline */
> > +#define X86_FEATURE_5LEVEL_PAGING    (21*32+11) /* "la57" Whether 5 levels of page tables are in use */
>
> Or can we stick to that one?
>

This is arguably the important one: as long as "la57" does not change
meaning, we should be fine from compatibility pov.
Re: [PATCH v3 1/7] x86/cpu: Use a new feature flag for 5 level paging
Posted by Borislav Petkov 8 months, 4 weeks ago
On Thu, May 15, 2025 at 11:17:47AM +0100, Ard Biesheuvel wrote:
> I'd lean towards not lying about whether the CPU is la57 capable in
> /proc/cpuinfo if we don't have to - this flag directly reflects the
> CPUID leaf.

We have a whole documentation about that whole "not lying" thing: short
version - use kcpuid or some other cpuid tool:

Documentation/arch/x86/cpuinfo.rst

> This is arguably the important one: as long as "la57" does not change
> meaning, we should be fine from compatibility pov.

Yes, those flags in /proc/cpuinfo should show that something is enabled and in
use. And it makes sense here. And you can't change that string anyway anymore.

Whether the hw is capable is not important for userspace and there are a lot
better means for it.

Thx.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette
Re: [PATCH v3 1/7] x86/cpu: Use a new feature flag for 5 level paging
Posted by Ard Biesheuvel 8 months, 4 weeks ago
On Thu, 15 May 2025 at 11:39, Borislav Petkov <bp@alien8.de> wrote:
>
> On Thu, May 15, 2025 at 11:17:47AM +0100, Ard Biesheuvel wrote:
> > I'd lean towards not lying about whether the CPU is la57 capable in
> > /proc/cpuinfo if we don't have to - this flag directly reflects the
> > CPUID leaf.
>
> We have a whole documentation about that whole "not lying" thing: short
> version - use kcpuid or some other cpuid tool:
>
> Documentation/arch/x86/cpuinfo.rst
>
> > This is arguably the important one: as long as "la57" does not change
> > meaning, we should be fine from compatibility pov.
>
> Yes, those flags in /proc/cpuinfo should show that something is enabled and in
> use. And it makes sense here. And you can't change that string anyway anymore.
>
> Whether the hw is capable is not important for userspace and there are a lot
> better means for it.
>

Fair enough.
Re: [PATCH v3 1/7] x86/cpu: Use a new feature flag for 5 level paging
Posted by Ingo Molnar 8 months, 4 weeks ago
* Ard Biesheuvel <ardb+git@google.com> wrote:

> From: Ard Biesheuvel <ardb@kernel.org>
> 
> Currently, the LA57 CPU feature flag is taken to mean two different
> things at once:
> - whether the CPU implements the LA57 extension, and is therefore
>   capable of supporting 5 level paging;
> - whether 5 level paging is currently in use.
> 
> This means the LA57 capability of the hardware is hidden when a LA57
> capable CPU is forced to run with 4 levels of paging. It also means the
> the ordinary CPU capability detection code will happily set the LA57
> capability and it needs to be cleared explicitly afterwards to avoid
> inconsistencies.
> 
> Separate the two so that the CPU hardware capability can be identified
> unambigously in all cases.
> 
> To avoid breaking existing users that might assume that 5 level paging
> is being used when the "la57" string is visible in /proc/cpuinfo,
> repurpose that string to mean that 5-level paging is in use, and add a
> new string la57_capable to indicate that the CPU feature is implemented
> by the hardware.
> 
> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> ---
>  arch/x86/include/asm/cpufeatures.h               |  3 ++-
>  arch/x86/include/asm/page_64.h                   |  2 +-
>  arch/x86/include/asm/pgtable_64_types.h          |  2 +-
>  arch/x86/kernel/cpu/common.c                     | 16 ++--------------
>  arch/x86/kvm/x86.h                               |  4 ++--
>  drivers/iommu/amd/init.c                         |  4 ++--
>  drivers/iommu/intel/svm.c                        |  4 ++--
>  tools/testing/selftests/kvm/x86/set_sregs_test.c |  2 +-
>  8 files changed, 13 insertions(+), 24 deletions(-)
> 
> diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
> index f67a93fc9391..d59bee5907e7 100644
> --- a/arch/x86/include/asm/cpufeatures.h
> +++ b/arch/x86/include/asm/cpufeatures.h
> @@ -395,7 +395,7 @@
>  #define X86_FEATURE_AVX512_BITALG	(16*32+12) /* "avx512_bitalg" Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */
>  #define X86_FEATURE_TME			(16*32+13) /* "tme" Intel Total Memory Encryption */
>  #define X86_FEATURE_AVX512_VPOPCNTDQ	(16*32+14) /* "avx512_vpopcntdq" POPCNT for vectors of DW/QW */
> -#define X86_FEATURE_LA57		(16*32+16) /* "la57" 5-level page tables */
> +#define X86_FEATURE_LA57		(16*32+16) /* "la57_hw" 5-level page tables */
>  #define X86_FEATURE_RDPID		(16*32+22) /* "rdpid" RDPID instruction */
>  #define X86_FEATURE_BUS_LOCK_DETECT	(16*32+24) /* "bus_lock_detect" Bus Lock detect */
>  #define X86_FEATURE_CLDEMOTE		(16*32+25) /* "cldemote" CLDEMOTE instruction */
> @@ -483,6 +483,7 @@
>  #define X86_FEATURE_PREFER_YMM		(21*32+ 8) /* Avoid ZMM registers due to downclocking */
>  #define X86_FEATURE_APX			(21*32+ 9) /* Advanced Performance Extensions */
>  #define X86_FEATURE_INDIRECT_THUNK_ITS	(21*32+10) /* Use thunk for indirect branches in lower half of cacheline */
> +#define X86_FEATURE_5LEVEL_PAGING	(21*32+11) /* "la57" Whether 5 levels of page tables are in use */

So there's a new complication here, KVM doesn't like the use of 
synthethic CPU flags, for understandable reasons:

  inlined from ‘intel_pmu_set_msr’ at arch/x86/kvm/vmx/pmu_intel.c:369:7:
  ...
  ./arch/x86/kvm/reverse_cpuid.h:102:9: note: in expansion of macro ‘BUILD_BUG_ON’
    102 |         BUILD_BUG_ON(x86_leaf == CPUID_LNX_5);
        |         ^~~~~~~~~~~~

(See x86-64 allmodconfig)

Even though previously X86_FEATURE_LA57 was effectively a synthethic 
CPU flag (it got artificially turned off by the Linux kernel if 5-level 
paging was disabled) ...

So I think the most straightforward solution would be to do the change 
below, and pass through LA57 flag if 5-level paging is enabled in the 
host kernel. This is similar to as if the firmware turned off LA57, and 
it doesn't bring in all the early-boot complications bare metal has. It 
should also match the previous behavior I think.

Thoughts?

Thanks,

	Ingo

=================>

 arch/x86/kvm/cpuid.c | 6 ++++++
 arch/x86/kvm/x86.h   | 4 ++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 571c906ffcbf..d951d71aea3b 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -1221,6 +1221,12 @@ void kvm_set_cpu_caps(void)
 		kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
 		kvm_cpu_cap_clear(X86_FEATURE_RDPID);
 	}
+	/*
+	 * Clear the LA57 flag in the guest if the host kernel
+	 * does not have 5-level paging support:
+	 */
+	if (kvm_cpu_cap_has(X86_FEATURE_LA57) && !pgtable_l5_enabled())
+		kvm_cpu_cap_clear(X86_FEATURE_LA57);
 }
 EXPORT_SYMBOL_GPL(kvm_set_cpu_caps);
 
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index d2c093f17ae5..9dc32a409076 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -243,7 +243,7 @@ static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu)
 
 static inline u8 max_host_virt_addr_bits(void)
 {
-	return kvm_cpu_cap_has(X86_FEATURE_5LEVEL_PAGING) ? 57 : 48;
+	return kvm_cpu_cap_has(X86_FEATURE_LA57) ? 57 : 48;
 }
 
 /*
@@ -603,7 +603,7 @@ static inline bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 		__reserved_bits |= X86_CR4_FSGSBASE;    \
 	if (!__cpu_has(__c, X86_FEATURE_PKU))           \
 		__reserved_bits |= X86_CR4_PKE;         \
-	if (!__cpu_has(__c, X86_FEATURE_5LEVEL_PAGING))          \
+	if (!__cpu_has(__c, X86_FEATURE_LA57))          \
 		__reserved_bits |= X86_CR4_LA57;        \
 	if (!__cpu_has(__c, X86_FEATURE_UMIP))          \
 		__reserved_bits |= X86_CR4_UMIP;        \
Re: [PATCH v3 1/7] x86/cpu: Use a new feature flag for 5 level paging
Posted by Ard Biesheuvel 8 months, 4 weeks ago
On Thu, 15 May 2025 at 08:45, Ingo Molnar <mingo@kernel.org> wrote:
>
>
> * Ard Biesheuvel <ardb+git@google.com> wrote:
>
> > From: Ard Biesheuvel <ardb@kernel.org>
> >
> > Currently, the LA57 CPU feature flag is taken to mean two different
> > things at once:
> > - whether the CPU implements the LA57 extension, and is therefore
> >   capable of supporting 5 level paging;
> > - whether 5 level paging is currently in use.
> >
> > This means the LA57 capability of the hardware is hidden when a LA57
> > capable CPU is forced to run with 4 levels of paging. It also means the
> > the ordinary CPU capability detection code will happily set the LA57
> > capability and it needs to be cleared explicitly afterwards to avoid
> > inconsistencies.
> >
> > Separate the two so that the CPU hardware capability can be identified
> > unambigously in all cases.
> >
> > To avoid breaking existing users that might assume that 5 level paging
> > is being used when the "la57" string is visible in /proc/cpuinfo,
> > repurpose that string to mean that 5-level paging is in use, and add a
> > new string la57_capable to indicate that the CPU feature is implemented
> > by the hardware.
> >
> > Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> > ---
> >  arch/x86/include/asm/cpufeatures.h               |  3 ++-
> >  arch/x86/include/asm/page_64.h                   |  2 +-
> >  arch/x86/include/asm/pgtable_64_types.h          |  2 +-
> >  arch/x86/kernel/cpu/common.c                     | 16 ++--------------
> >  arch/x86/kvm/x86.h                               |  4 ++--
> >  drivers/iommu/amd/init.c                         |  4 ++--
> >  drivers/iommu/intel/svm.c                        |  4 ++--
> >  tools/testing/selftests/kvm/x86/set_sregs_test.c |  2 +-
> >  8 files changed, 13 insertions(+), 24 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
> > index f67a93fc9391..d59bee5907e7 100644
> > --- a/arch/x86/include/asm/cpufeatures.h
> > +++ b/arch/x86/include/asm/cpufeatures.h
> > @@ -395,7 +395,7 @@
> >  #define X86_FEATURE_AVX512_BITALG    (16*32+12) /* "avx512_bitalg" Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */
> >  #define X86_FEATURE_TME                      (16*32+13) /* "tme" Intel Total Memory Encryption */
> >  #define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* "avx512_vpopcntdq" POPCNT for vectors of DW/QW */
> > -#define X86_FEATURE_LA57             (16*32+16) /* "la57" 5-level page tables */
> > +#define X86_FEATURE_LA57             (16*32+16) /* "la57_hw" 5-level page tables */
> >  #define X86_FEATURE_RDPID            (16*32+22) /* "rdpid" RDPID instruction */
> >  #define X86_FEATURE_BUS_LOCK_DETECT  (16*32+24) /* "bus_lock_detect" Bus Lock detect */
> >  #define X86_FEATURE_CLDEMOTE         (16*32+25) /* "cldemote" CLDEMOTE instruction */
> > @@ -483,6 +483,7 @@
> >  #define X86_FEATURE_PREFER_YMM               (21*32+ 8) /* Avoid ZMM registers due to downclocking */
> >  #define X86_FEATURE_APX                      (21*32+ 9) /* Advanced Performance Extensions */
> >  #define X86_FEATURE_INDIRECT_THUNK_ITS       (21*32+10) /* Use thunk for indirect branches in lower half of cacheline */
> > +#define X86_FEATURE_5LEVEL_PAGING    (21*32+11) /* "la57" Whether 5 levels of page tables are in use */
>
> So there's a new complication here, KVM doesn't like the use of
> synthethic CPU flags, for understandable reasons:
>
>   inlined from ‘intel_pmu_set_msr’ at arch/x86/kvm/vmx/pmu_intel.c:369:7:
>   ...
>   ./arch/x86/kvm/reverse_cpuid.h:102:9: note: in expansion of macro ‘BUILD_BUG_ON’
>     102 |         BUILD_BUG_ON(x86_leaf == CPUID_LNX_5);
>         |         ^~~~~~~~~~~~
>
> (See x86-64 allmodconfig)
>
> Even though previously X86_FEATURE_LA57 was effectively a synthethic
> CPU flag (it got artificially turned off by the Linux kernel if 5-level
> paging was disabled) ...
>
> So I think the most straightforward solution would be to do the change
> below, and pass through LA57 flag if 5-level paging is enabled in the
> host kernel. This is similar to as if the firmware turned off LA57, and
> it doesn't bring in all the early-boot complications bare metal has. It
> should also match the previous behavior I think.
>
> Thoughts?
>
> Thanks,
>
>         Ingo
>
> =================>
>
>  arch/x86/kvm/cpuid.c | 6 ++++++
>  arch/x86/kvm/x86.h   | 4 ++--
>  2 files changed, 8 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> index 571c906ffcbf..d951d71aea3b 100644
> --- a/arch/x86/kvm/cpuid.c
> +++ b/arch/x86/kvm/cpuid.c
> @@ -1221,6 +1221,12 @@ void kvm_set_cpu_caps(void)
>                 kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
>                 kvm_cpu_cap_clear(X86_FEATURE_RDPID);
>         }
> +       /*
> +        * Clear the LA57 flag in the guest if the host kernel
> +        * does not have 5-level paging support:
> +        */
> +       if (kvm_cpu_cap_has(X86_FEATURE_LA57) && !pgtable_l5_enabled())
> +               kvm_cpu_cap_clear(X86_FEATURE_LA57);
>  }
>  EXPORT_SYMBOL_GPL(kvm_set_cpu_caps);
>
> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
> index d2c093f17ae5..9dc32a409076 100644
> --- a/arch/x86/kvm/x86.h
> +++ b/arch/x86/kvm/x86.h
> @@ -243,7 +243,7 @@ static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu)
>
>  static inline u8 max_host_virt_addr_bits(void)
>  {
> -       return kvm_cpu_cap_has(X86_FEATURE_5LEVEL_PAGING) ? 57 : 48;
> +       return kvm_cpu_cap_has(X86_FEATURE_LA57) ? 57 : 48;

Based on the comment that documents is_noncanonical_address(), it
seems to me that this should be using cpu_has(X86_FEATURE_LA57) rather
than kvm_cpu_cap_has(X86_FEATURE_LA57). Whether the host actually runs
with 5-level paging should be irrelevant, it only matters whether it
is capable of doing so.

>  }
>
>  /*
> @@ -603,7 +603,7 @@ static inline bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
>                 __reserved_bits |= X86_CR4_FSGSBASE;    \
>         if (!__cpu_has(__c, X86_FEATURE_PKU))           \
>                 __reserved_bits |= X86_CR4_PKE;         \
> -       if (!__cpu_has(__c, X86_FEATURE_5LEVEL_PAGING))          \
> +       if (!__cpu_has(__c, X86_FEATURE_LA57))          \
>                 __reserved_bits |= X86_CR4_LA57;        \

This could be

if (!__cpu_has(__c, X86_FEATURE_LA57) || !pgtable_l5_enabled())
    __reserved_bits |= X86_CR4_LA57;

>         if (!__cpu_has(__c, X86_FEATURE_UMIP))          \
>                 __reserved_bits |= X86_CR4_UMIP;        \
Re: [PATCH v3 1/7] x86/cpu: Use a new feature flag for 5 level paging
Posted by Sean Christopherson 8 months, 4 weeks ago
On Thu, May 15, 2025, Ard Biesheuvel wrote:
> On Thu, 15 May 2025 at 08:45, Ingo Molnar <mingo@kernel.org> wrote:
> > > diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
> > > index f67a93fc9391..d59bee5907e7 100644
> > > --- a/arch/x86/include/asm/cpufeatures.h
> > > +++ b/arch/x86/include/asm/cpufeatures.h
> > > @@ -395,7 +395,7 @@
> > >  #define X86_FEATURE_AVX512_BITALG    (16*32+12) /* "avx512_bitalg" Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */
> > >  #define X86_FEATURE_TME                      (16*32+13) /* "tme" Intel Total Memory Encryption */
> > >  #define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* "avx512_vpopcntdq" POPCNT for vectors of DW/QW */
> > > -#define X86_FEATURE_LA57             (16*32+16) /* "la57" 5-level page tables */
> > > +#define X86_FEATURE_LA57             (16*32+16) /* "la57_hw" 5-level page tables */
> > >  #define X86_FEATURE_RDPID            (16*32+22) /* "rdpid" RDPID instruction */
> > >  #define X86_FEATURE_BUS_LOCK_DETECT  (16*32+24) /* "bus_lock_detect" Bus Lock detect */
> > >  #define X86_FEATURE_CLDEMOTE         (16*32+25) /* "cldemote" CLDEMOTE instruction */
> > > @@ -483,6 +483,7 @@
> > >  #define X86_FEATURE_PREFER_YMM               (21*32+ 8) /* Avoid ZMM registers due to downclocking */
> > >  #define X86_FEATURE_APX                      (21*32+ 9) /* Advanced Performance Extensions */
> > >  #define X86_FEATURE_INDIRECT_THUNK_ITS       (21*32+10) /* Use thunk for indirect branches in lower half of cacheline */
> > > +#define X86_FEATURE_5LEVEL_PAGING    (21*32+11) /* "la57" Whether 5 levels of page tables are in use */
> >
> > So there's a new complication here, KVM doesn't like the use of
> > synthethic CPU flags, for understandable reasons:
> >
> >   inlined from ‘intel_pmu_set_msr’ at arch/x86/kvm/vmx/pmu_intel.c:369:7:
> >   ...
> >   ./arch/x86/kvm/reverse_cpuid.h:102:9: note: in expansion of macro ‘BUILD_BUG_ON’
> >     102 |         BUILD_BUG_ON(x86_leaf == CPUID_LNX_5);
> >         |         ^~~~~~~~~~~~
> >
> > (See x86-64 allmodconfig)
> >
> > Even though previously X86_FEATURE_LA57 was effectively a synthethic
> > CPU flag (it got artificially turned off by the Linux kernel if 5-level
> > paging was disabled) ...

KVM doesn't care if the kernel clears CPUID feature flags.  In fact, KVM depends
on it in many cases.  What KVM cares about is that the bit number matches CPUID
(hardware defined bit) for features that are exposed to guests.

> > So I think the most straightforward solution would be to do the change
> > below, and pass through LA57 flag if 5-level paging is enabled in the
> > host kernel. This is similar to as if the firmware turned off LA57, and
> > it doesn't bring in all the early-boot complications bare metal has. It
> > should also match the previous behavior I think.
> >
> > Thoughts?
> >
> > Thanks,
> >
> >         Ingo
> >
> > =================>
> >
> >  arch/x86/kvm/cpuid.c | 6 ++++++
> >  arch/x86/kvm/x86.h   | 4 ++--
> >  2 files changed, 8 insertions(+), 2 deletions(-)
> >
> > diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> > index 571c906ffcbf..d951d71aea3b 100644
> > --- a/arch/x86/kvm/cpuid.c
> > +++ b/arch/x86/kvm/cpuid.c
> > @@ -1221,6 +1221,12 @@ void kvm_set_cpu_caps(void)
> >                 kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
> >                 kvm_cpu_cap_clear(X86_FEATURE_RDPID);
> >         }
> > +       /*
> > +        * Clear the LA57 flag in the guest if the host kernel
> > +        * does not have 5-level paging support:

No, KVM very intentionally goes out of it's way to support LA57 in the guest
irrespective of host kernel support.  I.e. KVM already looks at CPUID directly
and supports virtualizing LA57 if it's supported in hardware, the kernel's
feature flags are ignored (for LA57).

> > +        */
> > +       if (kvm_cpu_cap_has(X86_FEATURE_LA57) && !pgtable_l5_enabled())
> > +               kvm_cpu_cap_clear(X86_FEATURE_LA57);
> >  }
> >  EXPORT_SYMBOL_GPL(kvm_set_cpu_caps);
> >
> > diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
> > index d2c093f17ae5..9dc32a409076 100644
> > --- a/arch/x86/kvm/x86.h
> > +++ b/arch/x86/kvm/x86.h
> > @@ -243,7 +243,7 @@ static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu)
> >
> >  static inline u8 max_host_virt_addr_bits(void)
> >  {
> > -       return kvm_cpu_cap_has(X86_FEATURE_5LEVEL_PAGING) ? 57 : 48;
> > +       return kvm_cpu_cap_has(X86_FEATURE_LA57) ? 57 : 48;
> 
> Based on the comment that documents is_noncanonical_address(), it
> seems to me that this should be using cpu_has(X86_FEATURE_LA57) rather
> than kvm_cpu_cap_has(X86_FEATURE_LA57). Whether the host actually runs
> with 5-level paging should be irrelevant, it only matters whether it
> is capable of doing so.

Ard's patches are completely wrong for KVM, and amusingly, completely unecessary.

Simply drop all of the KVM changes, including the selftest change.  And if you
want to save yourselves some time in the future, use scripts/get_maintainer.pl.

> >  }
> >
> >  /*
> > @@ -603,7 +603,7 @@ static inline bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
> >                 __reserved_bits |= X86_CR4_FSGSBASE;    \
> >         if (!__cpu_has(__c, X86_FEATURE_PKU))           \
> >                 __reserved_bits |= X86_CR4_PKE;         \
> > -       if (!__cpu_has(__c, X86_FEATURE_5LEVEL_PAGING))          \
> > +       if (!__cpu_has(__c, X86_FEATURE_LA57))          \
> >                 __reserved_bits |= X86_CR4_LA57;        \
> 
> This could be
> 
> if (!__cpu_has(__c, X86_FEATURE_LA57) || !pgtable_l5_enabled())
>     __reserved_bits |= X86_CR4_LA57;

No, all of this is wrong.  __kvm_is_valid_cr4() is used check guest CR4 against
KVM's supported set of features CPUID, and also to check guest CR4 against the
vCPU's supported set of features.

> 
> >         if (!__cpu_has(__c, X86_FEATURE_UMIP))          \
> >                 __reserved_bits |= X86_CR4_UMIP;        \
Re: [PATCH v3 1/7] x86/cpu: Use a new feature flag for 5 level paging
Posted by Ard Biesheuvel 8 months, 3 weeks ago
On Fri, 16 May 2025 at 00:24, Sean Christopherson <seanjc@google.com> wrote:
>
...
> Ard's patches are completely wrong for KVM, and amusingly, completely unecessary.
>
> Simply drop all of the KVM changes, including the selftest change.  And if you
> want to save yourselves some time in the future, use scripts/get_maintainer.pl.
>

Understood. Thanks for taking a look.
Re: [PATCH v3 1/7] x86/cpu: Use a new feature flag for 5 level paging
Posted by Kirill A. Shutemov 8 months, 4 weeks ago
On Thu, May 15, 2025 at 09:45:52AM +0200, Ingo Molnar wrote:
> 
> * Ard Biesheuvel <ardb+git@google.com> wrote:
> 
> > From: Ard Biesheuvel <ardb@kernel.org>
> > 
> > Currently, the LA57 CPU feature flag is taken to mean two different
> > things at once:
> > - whether the CPU implements the LA57 extension, and is therefore
> >   capable of supporting 5 level paging;
> > - whether 5 level paging is currently in use.
> > 
> > This means the LA57 capability of the hardware is hidden when a LA57
> > capable CPU is forced to run with 4 levels of paging. It also means the
> > the ordinary CPU capability detection code will happily set the LA57
> > capability and it needs to be cleared explicitly afterwards to avoid
> > inconsistencies.
> > 
> > Separate the two so that the CPU hardware capability can be identified
> > unambigously in all cases.
> > 
> > To avoid breaking existing users that might assume that 5 level paging
> > is being used when the "la57" string is visible in /proc/cpuinfo,
> > repurpose that string to mean that 5-level paging is in use, and add a
> > new string la57_capable to indicate that the CPU feature is implemented
> > by the hardware.
> > 
> > Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> > ---
> >  arch/x86/include/asm/cpufeatures.h               |  3 ++-
> >  arch/x86/include/asm/page_64.h                   |  2 +-
> >  arch/x86/include/asm/pgtable_64_types.h          |  2 +-
> >  arch/x86/kernel/cpu/common.c                     | 16 ++--------------
> >  arch/x86/kvm/x86.h                               |  4 ++--
> >  drivers/iommu/amd/init.c                         |  4 ++--
> >  drivers/iommu/intel/svm.c                        |  4 ++--
> >  tools/testing/selftests/kvm/x86/set_sregs_test.c |  2 +-
> >  8 files changed, 13 insertions(+), 24 deletions(-)
> > 
> > diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
> > index f67a93fc9391..d59bee5907e7 100644
> > --- a/arch/x86/include/asm/cpufeatures.h
> > +++ b/arch/x86/include/asm/cpufeatures.h
> > @@ -395,7 +395,7 @@
> >  #define X86_FEATURE_AVX512_BITALG	(16*32+12) /* "avx512_bitalg" Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */
> >  #define X86_FEATURE_TME			(16*32+13) /* "tme" Intel Total Memory Encryption */
> >  #define X86_FEATURE_AVX512_VPOPCNTDQ	(16*32+14) /* "avx512_vpopcntdq" POPCNT for vectors of DW/QW */
> > -#define X86_FEATURE_LA57		(16*32+16) /* "la57" 5-level page tables */
> > +#define X86_FEATURE_LA57		(16*32+16) /* "la57_hw" 5-level page tables */
> >  #define X86_FEATURE_RDPID		(16*32+22) /* "rdpid" RDPID instruction */
> >  #define X86_FEATURE_BUS_LOCK_DETECT	(16*32+24) /* "bus_lock_detect" Bus Lock detect */
> >  #define X86_FEATURE_CLDEMOTE		(16*32+25) /* "cldemote" CLDEMOTE instruction */
> > @@ -483,6 +483,7 @@
> >  #define X86_FEATURE_PREFER_YMM		(21*32+ 8) /* Avoid ZMM registers due to downclocking */
> >  #define X86_FEATURE_APX			(21*32+ 9) /* Advanced Performance Extensions */
> >  #define X86_FEATURE_INDIRECT_THUNK_ITS	(21*32+10) /* Use thunk for indirect branches in lower half of cacheline */
> > +#define X86_FEATURE_5LEVEL_PAGING	(21*32+11) /* "la57" Whether 5 levels of page tables are in use */
> 
> So there's a new complication here, KVM doesn't like the use of 
> synthethic CPU flags, for understandable reasons:
> 
>   inlined from ‘intel_pmu_set_msr’ at arch/x86/kvm/vmx/pmu_intel.c:369:7:
>   ...
>   ./arch/x86/kvm/reverse_cpuid.h:102:9: note: in expansion of macro ‘BUILD_BUG_ON’
>     102 |         BUILD_BUG_ON(x86_leaf == CPUID_LNX_5);
>         |         ^~~~~~~~~~~~
> 
> (See x86-64 allmodconfig)
> 
> Even though previously X86_FEATURE_LA57 was effectively a synthethic 
> CPU flag (it got artificially turned off by the Linux kernel if 5-level 
> paging was disabled) ...
> 
> So I think the most straightforward solution would be to do the change 
> below, and pass through LA57 flag if 5-level paging is enabled in the 
> host kernel. This is similar to as if the firmware turned off LA57, and 
> it doesn't bring in all the early-boot complications bare metal has. It 
> should also match the previous behavior I think.
> 
> Thoughts?
> 
> Thanks,
> 
> 	Ingo
> 
> =================>
> 
>  arch/x86/kvm/cpuid.c | 6 ++++++
>  arch/x86/kvm/x86.h   | 4 ++--
>  2 files changed, 8 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> index 571c906ffcbf..d951d71aea3b 100644
> --- a/arch/x86/kvm/cpuid.c
> +++ b/arch/x86/kvm/cpuid.c
> @@ -1221,6 +1221,12 @@ void kvm_set_cpu_caps(void)
>  		kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
>  		kvm_cpu_cap_clear(X86_FEATURE_RDPID);
>  	}
> +	/*
> +	 * Clear the LA57 flag in the guest if the host kernel
> +	 * does not have 5-level paging support:
> +	 */
> +	if (kvm_cpu_cap_has(X86_FEATURE_LA57) && !pgtable_l5_enabled())

X86_FEATURE_LA57 check seems redundant.

> +		kvm_cpu_cap_clear(X86_FEATURE_LA57);
>  }
>  EXPORT_SYMBOL_GPL(kvm_set_cpu_caps);
>  
> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
> index d2c093f17ae5..9dc32a409076 100644
> --- a/arch/x86/kvm/x86.h
> +++ b/arch/x86/kvm/x86.h
> @@ -243,7 +243,7 @@ static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu)
>  
>  static inline u8 max_host_virt_addr_bits(void)
>  {
> -	return kvm_cpu_cap_has(X86_FEATURE_5LEVEL_PAGING) ? 57 : 48;
> +	return kvm_cpu_cap_has(X86_FEATURE_LA57) ? 57 : 48;
>  }
>  
>  /*
> @@ -603,7 +603,7 @@ static inline bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
>  		__reserved_bits |= X86_CR4_FSGSBASE;    \
>  	if (!__cpu_has(__c, X86_FEATURE_PKU))           \
>  		__reserved_bits |= X86_CR4_PKE;         \
> -	if (!__cpu_has(__c, X86_FEATURE_5LEVEL_PAGING))          \
> +	if (!__cpu_has(__c, X86_FEATURE_LA57))          \
>  		__reserved_bits |= X86_CR4_LA57;        \
>  	if (!__cpu_has(__c, X86_FEATURE_UMIP))          \
>  		__reserved_bits |= X86_CR4_UMIP;        \

-- 
  Kiryl Shutsemau / Kirill A. Shutemov
Re: [PATCH v3 1/7] x86/cpu: Use a new feature flag for 5 level paging
Posted by Ingo Molnar 8 months, 4 weeks ago
* Kirill A. Shutemov <kirill@shutemov.name> wrote:

> On Thu, May 15, 2025 at 09:45:52AM +0200, Ingo Molnar wrote:
> > 
> > * Ard Biesheuvel <ardb+git@google.com> wrote:
> > 
> > > From: Ard Biesheuvel <ardb@kernel.org>
> > > 
> > > Currently, the LA57 CPU feature flag is taken to mean two different
> > > things at once:
> > > - whether the CPU implements the LA57 extension, and is therefore
> > >   capable of supporting 5 level paging;
> > > - whether 5 level paging is currently in use.
> > > 
> > > This means the LA57 capability of the hardware is hidden when a LA57
> > > capable CPU is forced to run with 4 levels of paging. It also means the
> > > the ordinary CPU capability detection code will happily set the LA57
> > > capability and it needs to be cleared explicitly afterwards to avoid
> > > inconsistencies.
> > > 
> > > Separate the two so that the CPU hardware capability can be identified
> > > unambigously in all cases.
> > > 
> > > To avoid breaking existing users that might assume that 5 level paging
> > > is being used when the "la57" string is visible in /proc/cpuinfo,
> > > repurpose that string to mean that 5-level paging is in use, and add a
> > > new string la57_capable to indicate that the CPU feature is implemented
> > > by the hardware.
> > > 
> > > Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> > > ---
> > >  arch/x86/include/asm/cpufeatures.h               |  3 ++-
> > >  arch/x86/include/asm/page_64.h                   |  2 +-
> > >  arch/x86/include/asm/pgtable_64_types.h          |  2 +-
> > >  arch/x86/kernel/cpu/common.c                     | 16 ++--------------
> > >  arch/x86/kvm/x86.h                               |  4 ++--
> > >  drivers/iommu/amd/init.c                         |  4 ++--
> > >  drivers/iommu/intel/svm.c                        |  4 ++--
> > >  tools/testing/selftests/kvm/x86/set_sregs_test.c |  2 +-
> > >  8 files changed, 13 insertions(+), 24 deletions(-)
> > > 
> > > diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
> > > index f67a93fc9391..d59bee5907e7 100644
> > > --- a/arch/x86/include/asm/cpufeatures.h
> > > +++ b/arch/x86/include/asm/cpufeatures.h
> > > @@ -395,7 +395,7 @@
> > >  #define X86_FEATURE_AVX512_BITALG	(16*32+12) /* "avx512_bitalg" Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */
> > >  #define X86_FEATURE_TME			(16*32+13) /* "tme" Intel Total Memory Encryption */
> > >  #define X86_FEATURE_AVX512_VPOPCNTDQ	(16*32+14) /* "avx512_vpopcntdq" POPCNT for vectors of DW/QW */
> > > -#define X86_FEATURE_LA57		(16*32+16) /* "la57" 5-level page tables */
> > > +#define X86_FEATURE_LA57		(16*32+16) /* "la57_hw" 5-level page tables */
> > >  #define X86_FEATURE_RDPID		(16*32+22) /* "rdpid" RDPID instruction */
> > >  #define X86_FEATURE_BUS_LOCK_DETECT	(16*32+24) /* "bus_lock_detect" Bus Lock detect */
> > >  #define X86_FEATURE_CLDEMOTE		(16*32+25) /* "cldemote" CLDEMOTE instruction */
> > > @@ -483,6 +483,7 @@
> > >  #define X86_FEATURE_PREFER_YMM		(21*32+ 8) /* Avoid ZMM registers due to downclocking */
> > >  #define X86_FEATURE_APX			(21*32+ 9) /* Advanced Performance Extensions */
> > >  #define X86_FEATURE_INDIRECT_THUNK_ITS	(21*32+10) /* Use thunk for indirect branches in lower half of cacheline */
> > > +#define X86_FEATURE_5LEVEL_PAGING	(21*32+11) /* "la57" Whether 5 levels of page tables are in use */
> > 
> > So there's a new complication here, KVM doesn't like the use of 
> > synthethic CPU flags, for understandable reasons:
> > 
> >   inlined from ‘intel_pmu_set_msr’ at arch/x86/kvm/vmx/pmu_intel.c:369:7:
> >   ...
> >   ./arch/x86/kvm/reverse_cpuid.h:102:9: note: in expansion of macro ‘BUILD_BUG_ON’
> >     102 |         BUILD_BUG_ON(x86_leaf == CPUID_LNX_5);
> >         |         ^~~~~~~~~~~~
> > 
> > (See x86-64 allmodconfig)
> > 
> > Even though previously X86_FEATURE_LA57 was effectively a synthethic 
> > CPU flag (it got artificially turned off by the Linux kernel if 5-level 
> > paging was disabled) ...
> > 
> > So I think the most straightforward solution would be to do the change 
> > below, and pass through LA57 flag if 5-level paging is enabled in the 
> > host kernel. This is similar to as if the firmware turned off LA57, and 
> > it doesn't bring in all the early-boot complications bare metal has. It 
> > should also match the previous behavior I think.
> > 
> > Thoughts?
> > 
> > Thanks,
> > 
> > 	Ingo
> > 
> > =================>
> > 
> >  arch/x86/kvm/cpuid.c | 6 ++++++
> >  arch/x86/kvm/x86.h   | 4 ++--
> >  2 files changed, 8 insertions(+), 2 deletions(-)
> > 
> > diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> > index 571c906ffcbf..d951d71aea3b 100644
> > --- a/arch/x86/kvm/cpuid.c
> > +++ b/arch/x86/kvm/cpuid.c
> > @@ -1221,6 +1221,12 @@ void kvm_set_cpu_caps(void)
> >  		kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
> >  		kvm_cpu_cap_clear(X86_FEATURE_RDPID);
> >  	}
> > +	/*
> > +	 * Clear the LA57 flag in the guest if the host kernel
> > +	 * does not have 5-level paging support:
> > +	 */
> > +	if (kvm_cpu_cap_has(X86_FEATURE_LA57) && !pgtable_l5_enabled())
> 
> X86_FEATURE_LA57 check seems redundant.

Possibly. I wanted to limit the clearing of X86_FEATURE_LA57 only to 
precisely the case where it was set to begin with. Ie. don't clear it 
when it's not present to begin with.

It shouldn't matter, as this is KVM-module-init-only code.

Thanks,

	Ingo
Re: [PATCH v3 1/7] x86/cpu: Use a new feature flag for 5 level paging
Posted by Ingo Molnar 8 months, 4 weeks ago
* Ard Biesheuvel <ardb+git@google.com> wrote:

> From: Ard Biesheuvel <ardb@kernel.org>
> 
> Currently, the LA57 CPU feature flag is taken to mean two different
> things at once:
> - whether the CPU implements the LA57 extension, and is therefore
>   capable of supporting 5 level paging;
> - whether 5 level paging is currently in use.
> 
> This means the LA57 capability of the hardware is hidden when a LA57
> capable CPU is forced to run with 4 levels of paging. It also means the
> the ordinary CPU capability detection code will happily set the LA57
> capability and it needs to be cleared explicitly afterwards to avoid
> inconsistencies.
> 
> Separate the two so that the CPU hardware capability can be identified
> unambigously in all cases.
> 
> To avoid breaking existing users that might assume that 5 level paging
> is being used when the "la57" string is visible in /proc/cpuinfo,
> repurpose that string to mean that 5-level paging is in use, and add a
> new string la57_capable to indicate that the CPU feature is implemented
> by the hardware.

So the new string ended up being "la57_hw", not "la57_capable". :-)

> -#define X86_FEATURE_LA57		(16*32+16) /* "la57" 5-level page tables */
> +#define X86_FEATURE_LA57		(16*32+16) /* "la57_hw" 5-level page tables */

I fixed the changelog and kept la57_hw.

BTW., I too was considering these variants for the new flag:

	la57_support
	la57_cap
	la57_capable

	  - These are easy to confuse with 5-level paging software 
	    support in the kernel, ie. the name doesn't sufficiently 
	    disambiguate that this flag is about hardware support.

	la57_cpu

	  - While this makes it clear that it's about the CPU, the _cpu 
	    postfix often denotes something related to a specific CPU, 
	    so it's a tiny bit confusing in this context.

... and each had disadvantages, as listed, and "la57_hw" seemed the 
least ambiguous in this context.

Thanks,

	Ingo