From: Ard Biesheuvel <ardb@kernel.org>
Currently, the LA57 CPU feature flag is taken to mean two different
things at once:
- whether the CPU implements the LA57 extension, and is therefore
capable of supporting 5 level paging;
- whether 5 level paging is currently in use.
This means the LA57 capability of the hardware is hidden when a LA57
capable CPU is forced to run with 4 levels of paging. It also means
the ordinary CPU capability detection code will happily set the LA57
capability and it needs to be cleared explicitly afterwards to avoid
inconsistencies.
Separate the two so that the CPU hardware capability can be identified
unambiguously in all cases.
To avoid breaking existing users that might assume that 5 level paging
is being used when the "la57" string is visible in /proc/cpuinfo,
repurpose that string to mean that 5-level paging is in use, and add a
new string la57_capable to indicate that the CPU feature is implemented
by the hardware.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
arch/x86/include/asm/cpufeatures.h | 3 ++-
arch/x86/include/asm/page_64.h | 2 +-
arch/x86/include/asm/pgtable_64_types.h | 2 +-
arch/x86/kernel/cpu/common.c | 16 ++--------------
drivers/iommu/amd/init.c | 4 ++--
drivers/iommu/intel/svm.c | 4 ++--
6 files changed, 10 insertions(+), 21 deletions(-)
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index f67a93fc9391..5c19bee0af11 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -395,7 +395,7 @@
#define X86_FEATURE_AVX512_BITALG (16*32+12) /* "avx512_bitalg" Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */
#define X86_FEATURE_TME (16*32+13) /* "tme" Intel Total Memory Encryption */
#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* "avx512_vpopcntdq" POPCNT for vectors of DW/QW */
-#define X86_FEATURE_LA57 (16*32+16) /* "la57" 5-level page tables */
+#define X86_FEATURE_LA57 (16*32+16) /* 57-bit linear addressing */
#define X86_FEATURE_RDPID (16*32+22) /* "rdpid" RDPID instruction */
#define X86_FEATURE_BUS_LOCK_DETECT (16*32+24) /* "bus_lock_detect" Bus Lock detect */
#define X86_FEATURE_CLDEMOTE (16*32+25) /* "cldemote" CLDEMOTE instruction */
@@ -483,6 +483,7 @@
#define X86_FEATURE_PREFER_YMM (21*32+ 8) /* Avoid ZMM registers due to downclocking */
#define X86_FEATURE_APX (21*32+ 9) /* Advanced Performance Extensions */
#define X86_FEATURE_INDIRECT_THUNK_ITS (21*32+10) /* Use thunk for indirect branches in lower half of cacheline */
+#define X86_FEATURE_5LEVEL_PAGING (21*32+11) /* "la57" Whether 5 levels of page tables are in use */
/*
* BUG word(s)
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index d3aab6f4e59a..acfa61ad0725 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -86,7 +86,7 @@ static __always_inline unsigned long task_size_max(void)
unsigned long ret;
alternative_io("movq %[small],%0","movq %[large],%0",
- X86_FEATURE_LA57,
+ X86_FEATURE_5LEVEL_PAGING,
"=r" (ret),
[small] "i" ((1ul << 47)-PAGE_SIZE),
[large] "i" ((1ul << 56)-PAGE_SIZE));
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index eee06f77b245..bf4c33ae24d7 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -34,7 +34,7 @@ static inline bool pgtable_l5_enabled(void)
return __pgtable_l5_enabled;
}
#else
-#define pgtable_l5_enabled() cpu_feature_enabled(X86_FEATURE_LA57)
+#define pgtable_l5_enabled() cpu_feature_enabled(X86_FEATURE_5LEVEL_PAGING)
#endif /* USE_EARLY_PGTABLE_L5 */
#else
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 8feb8fd2957a..67cdbd916830 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1755,20 +1755,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
setup_clear_cpu_cap(X86_FEATURE_PCID);
#endif
- /*
- * Later in the boot process pgtable_l5_enabled() relies on
- * cpu_feature_enabled(X86_FEATURE_LA57). If 5-level paging is not
- * enabled by this point we need to clear the feature bit to avoid
- * false-positives at the later stage.
- *
- * pgtable_l5_enabled() can be false here for several reasons:
- * - 5-level paging is disabled compile-time;
- * - it's 32-bit kernel;
- * - machine doesn't support 5-level paging;
- * - user specified 'no5lvl' in kernel command line.
- */
- if (!pgtable_l5_enabled())
- setup_clear_cpu_cap(X86_FEATURE_LA57);
+ if (native_read_cr4() & X86_CR4_LA57)
+ setup_force_cpu_cap(X86_FEATURE_5LEVEL_PAGING);
detect_nopl();
}
diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index 14aa0d77df26..083fca8f8b97 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -3084,7 +3084,7 @@ static int __init early_amd_iommu_init(void)
goto out;
/* 5 level guest page table */
- if (cpu_feature_enabled(X86_FEATURE_LA57) &&
+ if (cpu_feature_enabled(X86_FEATURE_5LEVEL_PAGING) &&
FIELD_GET(FEATURE_GATS, amd_iommu_efr) == GUEST_PGTABLE_5_LEVEL)
amd_iommu_gpt_level = PAGE_MODE_5_LEVEL;
@@ -3691,7 +3691,7 @@ __setup("ivrs_acpihid", parse_ivrs_acpihid);
bool amd_iommu_pasid_supported(void)
{
/* CPU page table size should match IOMMU guest page table size */
- if (cpu_feature_enabled(X86_FEATURE_LA57) &&
+ if (cpu_feature_enabled(X86_FEATURE_5LEVEL_PAGING) &&
amd_iommu_gpt_level != PAGE_MODE_5_LEVEL)
return false;
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index ba93123cb4eb..1f615e6d06ec 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -37,7 +37,7 @@ void intel_svm_check(struct intel_iommu *iommu)
return;
}
- if (cpu_feature_enabled(X86_FEATURE_LA57) &&
+ if (cpu_feature_enabled(X86_FEATURE_5LEVEL_PAGING) &&
!cap_fl5lp_support(iommu->cap)) {
pr_err("%s SVM disabled, incompatible paging mode\n",
iommu->name);
@@ -165,7 +165,7 @@ static int intel_svm_set_dev_pasid(struct iommu_domain *domain,
return PTR_ERR(dev_pasid);
/* Setup the pasid table: */
- sflags = cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0;
+ sflags = cpu_feature_enabled(X86_FEATURE_5LEVEL_PAGING) ? PASID_FLAG_FL5LP : 0;
ret = __domain_setup_first_level(iommu, dev, pasid,
FLPT_DEFAULT_DID, mm->pgd,
sflags, old);
--
2.49.0.1101.gccaa498523-goog
On Sat, May 17, 2025 at 11:16:41AM +0200, Ard Biesheuvel wrote:
> diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
> index f67a93fc9391..5c19bee0af11 100644
> --- a/arch/x86/include/asm/cpufeatures.h
> +++ b/arch/x86/include/asm/cpufeatures.h
> @@ -395,7 +395,7 @@
> #define X86_FEATURE_AVX512_BITALG (16*32+12) /* "avx512_bitalg" Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */
> #define X86_FEATURE_TME (16*32+13) /* "tme" Intel Total Memory Encryption */
> #define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* "avx512_vpopcntdq" POPCNT for vectors of DW/QW */
> -#define X86_FEATURE_LA57 (16*32+16) /* "la57" 5-level page tables */
> +#define X86_FEATURE_LA57 (16*32+16) /* 57-bit linear addressing */
> #define X86_FEATURE_RDPID (16*32+22) /* "rdpid" RDPID instruction */
> #define X86_FEATURE_BUS_LOCK_DETECT (16*32+24) /* "bus_lock_detect" Bus Lock detect */
> #define X86_FEATURE_CLDEMOTE (16*32+25) /* "cldemote" CLDEMOTE instruction */
> @@ -483,6 +483,7 @@
> #define X86_FEATURE_PREFER_YMM (21*32+ 8) /* Avoid ZMM registers due to downclocking */
> #define X86_FEATURE_APX (21*32+ 9) /* Advanced Performance Extensions */
> #define X86_FEATURE_INDIRECT_THUNK_ITS (21*32+10) /* Use thunk for indirect branches in lower half of cacheline */
> +#define X86_FEATURE_5LEVEL_PAGING (21*32+11) /* "la57" Whether 5 levels of page tables are in use */
I don't think we need this second flag - you can simply clear the existing
one. Diff ontop below:
---
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 371eaf3f300e..3b34e7c6d1b9 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -395,7 +395,7 @@
#define X86_FEATURE_AVX512_BITALG (16*32+12) /* "avx512_bitalg" Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */
#define X86_FEATURE_TME (16*32+13) /* "tme" Intel Total Memory Encryption */
#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* "avx512_vpopcntdq" POPCNT for vectors of DW/QW */
-#define X86_FEATURE_LA57 (16*32+16) /* 57-bit linear addressing */
+#define X86_FEATURE_LA57 (16*32+16) /* "la57" 57-bit linear addressing */
#define X86_FEATURE_RDPID (16*32+22) /* "rdpid" RDPID instruction */
#define X86_FEATURE_BUS_LOCK_DETECT (16*32+24) /* "bus_lock_detect" Bus Lock detect */
#define X86_FEATURE_CLDEMOTE (16*32+25) /* "cldemote" CLDEMOTE instruction */
@@ -483,7 +483,6 @@
#define X86_FEATURE_PREFER_YMM (21*32+ 8) /* Avoid ZMM registers due to downclocking */
#define X86_FEATURE_APX (21*32+ 9) /* Advanced Performance Extensions */
#define X86_FEATURE_INDIRECT_THUNK_ITS (21*32+10) /* Use thunk for indirect branches in lower half of cacheline */
-#define X86_FEATURE_5LEVEL_PAGING (21*32+11) /* "la57" Whether 5 levels of page tables are in use */
/*
* BUG word(s)
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 754be17cc8c2..015d23f3e01f 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -85,7 +85,7 @@ static __always_inline unsigned long task_size_max(void)
unsigned long ret;
alternative_io("movq %[small],%0","movq %[large],%0",
- X86_FEATURE_5LEVEL_PAGING,
+ X86_FEATURE_LA57,
"=r" (ret),
[small] "i" ((1ul << 47)-PAGE_SIZE),
[large] "i" ((1ul << 56)-PAGE_SIZE));
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 92176887f8eb..4604f924d8b8 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -33,7 +33,7 @@ static inline bool pgtable_l5_enabled(void)
return __pgtable_l5_enabled;
}
#else
-#define pgtable_l5_enabled() cpu_feature_enabled(X86_FEATURE_5LEVEL_PAGING)
+#define pgtable_l5_enabled() cpu_feature_enabled(X86_FEATURE_LA57)
#endif /* USE_EARLY_PGTABLE_L5 */
extern unsigned int pgdir_shift;
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 67cdbd916830..104944e93902 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1755,8 +1755,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
setup_clear_cpu_cap(X86_FEATURE_PCID);
#endif
- if (native_read_cr4() & X86_CR4_LA57)
- setup_force_cpu_cap(X86_FEATURE_5LEVEL_PAGING);
+ if (!(native_read_cr4() & X86_CR4_LA57))
+ setup_clear_cpu_cap(X86_FEATURE_LA57);
detect_nopl();
}
diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index 083fca8f8b97..14aa0d77df26 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -3084,7 +3084,7 @@ static int __init early_amd_iommu_init(void)
goto out;
/* 5 level guest page table */
- if (cpu_feature_enabled(X86_FEATURE_5LEVEL_PAGING) &&
+ if (cpu_feature_enabled(X86_FEATURE_LA57) &&
FIELD_GET(FEATURE_GATS, amd_iommu_efr) == GUEST_PGTABLE_5_LEVEL)
amd_iommu_gpt_level = PAGE_MODE_5_LEVEL;
@@ -3691,7 +3691,7 @@ __setup("ivrs_acpihid", parse_ivrs_acpihid);
bool amd_iommu_pasid_supported(void)
{
/* CPU page table size should match IOMMU guest page table size */
- if (cpu_feature_enabled(X86_FEATURE_5LEVEL_PAGING) &&
+ if (cpu_feature_enabled(X86_FEATURE_LA57) &&
amd_iommu_gpt_level != PAGE_MODE_5_LEVEL)
return false;
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index 1f615e6d06ec..ba93123cb4eb 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -37,7 +37,7 @@ void intel_svm_check(struct intel_iommu *iommu)
return;
}
- if (cpu_feature_enabled(X86_FEATURE_5LEVEL_PAGING) &&
+ if (cpu_feature_enabled(X86_FEATURE_LA57) &&
!cap_fl5lp_support(iommu->cap)) {
pr_err("%s SVM disabled, incompatible paging mode\n",
iommu->name);
@@ -165,7 +165,7 @@ static int intel_svm_set_dev_pasid(struct iommu_domain *domain,
return PTR_ERR(dev_pasid);
/* Setup the pasid table: */
- sflags = cpu_feature_enabled(X86_FEATURE_5LEVEL_PAGING) ? PASID_FLAG_FL5LP : 0;
+ sflags = cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0;
ret = __domain_setup_first_level(iommu, dev, pasid,
FLPT_DEFAULT_DID, mm->pgd,
sflags, old);
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
* Borislav Petkov <bp@alien8.de> wrote: > On Sat, May 17, 2025 at 11:16:41AM +0200, Ard Biesheuvel wrote: > > diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h > > index f67a93fc9391..5c19bee0af11 100644 > > --- a/arch/x86/include/asm/cpufeatures.h > > +++ b/arch/x86/include/asm/cpufeatures.h > > @@ -395,7 +395,7 @@ > > #define X86_FEATURE_AVX512_BITALG (16*32+12) /* "avx512_bitalg" Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */ > > #define X86_FEATURE_TME (16*32+13) /* "tme" Intel Total Memory Encryption */ > > #define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* "avx512_vpopcntdq" POPCNT for vectors of DW/QW */ > > -#define X86_FEATURE_LA57 (16*32+16) /* "la57" 5-level page tables */ > > +#define X86_FEATURE_LA57 (16*32+16) /* 57-bit linear addressing */ > > #define X86_FEATURE_RDPID (16*32+22) /* "rdpid" RDPID instruction */ > > #define X86_FEATURE_BUS_LOCK_DETECT (16*32+24) /* "bus_lock_detect" Bus Lock detect */ > > #define X86_FEATURE_CLDEMOTE (16*32+25) /* "cldemote" CLDEMOTE instruction */ > > @@ -483,6 +483,7 @@ > > #define X86_FEATURE_PREFER_YMM (21*32+ 8) /* Avoid ZMM registers due to downclocking */ > > #define X86_FEATURE_APX (21*32+ 9) /* Advanced Performance Extensions */ > > #define X86_FEATURE_INDIRECT_THUNK_ITS (21*32+10) /* Use thunk for indirect branches in lower half of cacheline */ > > +#define X86_FEATURE_5LEVEL_PAGING (21*32+11) /* "la57" Whether 5 levels of page tables are in use */ > > I don't think we need this second flag - you can simply clear the existing > one. That's what the old code did, and it was an error to do that, we almost never do that for CPU hardware capability flags: - Do we clear the PAE flag just because the kernel isn't PAE? We don't. - Do we clear the CX8 flag just because it's a UP kernel? We don't. - Do we clear the VMX/SVM flag just because KVM isn't running? We don't. - etc. etc. 
The handling of the LA57 flag is the odd one out, and it was a mistake for the 5-level paging kernel to clear the LA57 flag. The second best thing we can do is to have a sane, constant LA57 flag for the hardware capability, and introduce a synthetic flag that is set conditionally (X86_FEATURE_5LEVEL_PAGING) - which is how it should have been done originally, and to maintain compatibility, expose the synthetic flag in /proc/cpuinfo as 'la57' to maintain the ABI. And let's remember this the next time someone submits a kernel series with CPU flag clearing... ;-) Thanks, Ingo
On Mon, May 19, 2025 at 03:08:56PM +0200, Ingo Molnar wrote:
> The second best thing we can do is to have a sane, constant LA57 flag
> for the hardware capability, and introduce a synthethic flag that is
> set conditionally (X86_FEATURE_5LEVEL_PAGING) - which is how it should
> have been done originally, and to maintain compatibility, expose the
> synthethic flag in /proc/cpuinfo as 'la57' to maintain the ABI.
- we don't expose every CPUID flag in /proc/cpuinfo for obvious reasons:
Documentation/arch/x86/cpuinfo.rst
- if you want to mirror CPUID *capability* flags with X86_FEATURE flags *and*
use the same alternatives infrastructure to test *enabled* feature flags,
then you almost always must define *two* flags - a capability one and an
enabled one. I don't think we want that.
And since we're dealing with ancient infrastructure which has grown warts over
the years, we all - x86 maintainers - need to decide here how we should go
forward. I have raised these questions multiple times but we have never
discussed it properly.
Also, Ahmed and tglx are working on a unified CPUID view where you can test
capability. Which means, what is enabled can be used solely by the
X86_FEATURE flags but I haven't looked at his set yet.
So it is high time we sit down and hammer out the rules for the feature flags
as apparently what we have now is a total mess.
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On Mon, May 19 2025 at 15:19, Borislav Petkov wrote:
> On Mon, May 19, 2025 at 03:08:56PM +0200, Ingo Molnar wrote:
>> The second best thing we can do is to have a sane, constant LA57 flag
>> for the hardware capability, and introduce a synthethic flag that is
>> set conditionally (X86_FEATURE_5LEVEL_PAGING) - which is how it should
>> have been done originally, and to maintain compatibility, expose the
>> synthethic flag in /proc/cpuinfo as 'la57' to maintain the ABI.
>
> - we don't expose every CPUID flag in /proc/cpuinfo for obvious reasons:
>
> Documentation/arch/x86/cpuinfo.rst
>
> - if you want to mirror CPUID *capability* flags with X86_FEATURE flags *and*
> use the same alternatives infrastructure to test *enabled* feature flags,
> then you almost always must define *two* flags - a capability one or an
> enabled one. I don't think we want that.
>
> And since we're dealing with ancient infrastructure which has grown warts over
> the years, we all - x86 maintainers - need to decide here how we should go
> forward. I have raised these questions multiple times but we have never
> discussed it properly.
>
> Also, Ahmed and tglx are working on a unified CPUID view where you can test
> capability. Which means, what is enabled can be used solely by the
> X86_FEATURE flags but I haven't looked at his set yet.
>
> So it is high time we sit down and hammer out the rules for the feature flags
> as apparently what we have now is a total mess.
There are several issues with the handling of CPUID and feature flags:
1) The evaluation of the CPUID information is a complete maze and
has been expanded as people see fit without ever thinking about
a proper design.
As a result many CPUID instances are read out a gazillion times
to evaluate only parts of them and most of it is done with
magically hardcoded constants and the most convoluted macros.
2) The in memory representation is as stupid as it gets. Some CPUID
features are stored as is, others are analyzed and individual bits
are stored in some artificial bit field.
3) There is no clear separation between global and per CPU
information. Real CPUID features are supposed to be global and
identical across all CPUs. Per CPU data in CPUID is information
which is related to topologies of all sorts (APIC, caches, TLBs,
power etc.)
4) Drivers having access to CPUID is just wrong. We've had issues
with that in the past because drivers evaluated CPUID themselves and
missed that the core code had stuff disabled.
So what we (mostly Ahmed) are trying to do here is to create a 1:1 view
of the CPUID information in memory with proper data structures, which
are generated out of the CPUID data base.
I know people will whine that this is a waste of memory, but we are
talking about a few kilobytes of data and not about insanely large structs.
That means we end up with something like this:
union cpuid_info {
unsigned long bits[SIZE];
struct cpuid_data data;
};
struct cpuid_data {
struct leaf_0 leaf_0;
struct leaf_1 leaf_1;
....
struct leaf_N leaf_N;
};
Some of the leafs will be arrays, but that's an implementation
detail. Versus CPU feature bits this means that we are no longer
looking at a condensed linear bitmap of feature bits, but at a sparse
populated bitmap. The bitmap offsets of the individual feature bits are
computed at build time and no longer hardcoded in some horribly
formatted header file.
This allows to:
1) Read _all_ CPUID information once into memory
2) Make _all_ subsequent CPUID related operations memory based
3) Remove CPUID() from drivers and other places and let them retrieve
the information out of the relevant leaf structures by doing
if (!info->data.leaf_N.foo_enabled)
return;
foo = info->data.leaf_N.foo;
instead of
cpuid($N, eax, ebx, dummy1, dummy2);
if (!(eax & MAGIC_NUMBER))
return;
foo = MAGIC_MACRO_MESS(ebx);
That also makes sure that everything in the kernel looks at a
consistent piece of information, especially when the boot code,
mitigations, command line options etc. disabled certain features.
Now what about software defined (artificial) feature bits including BUG
bits?
We still need them and there is no reason why we would replace them with
something else. But, what we want to do here, is basically the same as
we do for the real CPUID information:
Create and document real artificial leafs (there is enough reserved
number space in the CPUID numbering scheme) and put those into the
CPUID database as well.
That allows to handle this stuff in the same way as any other CPUID
data and the autogeneration of bit offsets and text information for
cpuinfo just works the same way.
Coming back to the original question with the example of LA57 and the
actual enablement. There is no reason why we can't have the actual CPUID
bit and a software defined bit.
The way how this should work is:
1) The CPUID info is in data.leaf_07.la57
2) The enablement bit is in data.leaf_linux_N.la57 or such
The CPUID database contains the entries for those in leaf_07:
<bit16 len="1" id="la57" desc="57-bit linear addresses (five-level paging)">
<vendors>
<vendor>Intel</vendor>
<vendor>AMD</vendor>
</vendors>
<linux feature="true" proc="false" />
</bit16>
and leaf_linux_N:
<bit3 len="1" id="la57" desc="57-bit linear addresses (five-level paging)">
<vendors>
<vendor>Linux</vendor>
</vendors>
<linux feature="true" proc="true" />
</bit3>
As the "proc" property of leaf_07.la57 is false, the bit won't be
exposed in cpuinfo, but the software defined bit will.
This also means, that we switch to a model where the software defined
bits are no longer subject to random introduction and removal. We
simply keep them around, mark them as no longer used and introduce new
ones with proper documentation. That requires due process, which
prevents the adhoc messing around with feature bits, which has us bitten
more than once in the past.
Aside of code clarity and simplification there is another benefit of
having such a proper defined and documented view:
Analysis in crash dumps becomes easy for tooling because both the
kernel and the tools can rely on machine generated data which comes
from a single source, i.e. the x86 cpuid data base.
Versus global and per CPU information.
Global information contains:
- all feature bits, which can be mapped into the X86_FEATURE machinery
- all globally valid information which is only accessed via software,
e.g. the name string or some numerical data, which is the same on
all CPUs
Per CPU information contains:
- all leafs which truly contain per CPU information (topology, caches,
power ...)
This will obviously take some time to hash out and implement, but in the
long run this is the right thing to do.
Thanks,
tglx
On Wed, May 21, 2025 at 05:23:37PM +0200, Thomas Gleixner wrote: > 4) Drivers having access to CPUID is just wrong. We've had issues > with that in the past because drivers evaluated CPUID themself and > missed that the core code had stuff disabled. I had this patch that read the module instructions and failed loading if they used 'fancy' instructions. Do you want me to revive that?
On Thu, May 22, 2025, Peter Zijlstra wrote: > On Wed, May 21, 2025 at 05:23:37PM +0200, Thomas Gleixner wrote: > > > 4) Drivers having access to CPUID is just wrong. We've had issues > > with that in the past because drivers evaluated CPUID themself and > > missed that the core code had stuff disabled. > > I had this patch that read the module instructions and failed loading if > they used 'fancy' instructions. Do you want me to revive that? Unless you want to grant exceptions, that's not going to fly for KVM. KVM makes heavy use of CPUID, the consumption/output of which is firmly entrenched in KVM's ABI.
On Thu, May 22 2025 at 08:08, Sean Christopherson wrote:
> On Thu, May 22, 2025, Peter Zijlstra wrote:
>> On Wed, May 21, 2025 at 05:23:37PM +0200, Thomas Gleixner wrote:
>>
>> > 4) Drivers having access to CPUID is just wrong. We've had issues
>> > with that in the past because drivers evaluated CPUID themself and
>> > missed that the core code had stuff disabled.
>>
>> I had this patch that read the module instructions and failed loading if
>> they used 'fancy' instructions. Do you want me to revive that?
Once we have the new infrastructure in place....
> Unless you want to grant exceptions, that's not going to fly for KVM. KVM makes
> heavy use of CPUID, the consumption/output of which is firmly entrenched in KVM's
> ABI.
If there is a full in memory copy of all CPUID leafs, then what needs KVM beyond
reading it from there?
Thanks,
tglx
On Thu, May 22, 2025, Thomas Gleixner wrote: > On Thu, May 22 2025 at 08:08, Sean Christopherson wrote: > > On Thu, May 22, 2025, Peter Zijlstra wrote: > >> On Wed, May 21, 2025 at 05:23:37PM +0200, Thomas Gleixner wrote: > >> > >> > 4) Drivers having access to CPUID is just wrong. We've had issues > >> > with that in the past because drivers evaluated CPUID themself and > >> > missed that the core code had stuff disabled. > >> > >> I had this patch that read the module instructions and failed loading if > >> they used 'fancy' instructions. Do you want me to revive that? > > Once we have the new infrastructure in place.... > > > Unless you want to grant exceptions, that's not going to fly for KVM. KVM makes > > heavy use of CPUID, the consumption/output of which is firmly entrenched in KVM's > > ABI. > > If there is a full in memory copy of all CPUID leafs, then what needs KVM beyond > reading it from there? Ah, I missed that context. If it's a truly full (e.g. includes the XSTATE sizes sub-leafs and all that jazz) and unmodified copy, then it'll work. If it's a modified/filtered copy, it might work? I'd have to think/dig more. I'm pretty sure the only CPUID-based feature that KVM supports based solely on hardware capabilities is LA57, and it sounds like that will have special handling anyways. My bigger concern is cases where the kernel _adds_ features. KVM's default handling of features is to advertise support if and only both the host kernel and hardware support a feature. I.e. KVM does a bitwise-AND of CPUID and boot_cpu_data.x86_capability. KVM does opt-in for a handful of features, but they are the exception, not the rule. I don't see an obvious way to maintain that behavior without KVM doing CPUID itself.
On Wed, May 21, 2025 at 05:23:37PM +0200, Thomas Gleixner wrote:
> Now what about software defined (artificial) feature bits including BUG
> bits?
>
> We still need them and there is no reason why we would replace them with
> something else. But, what we want to do here, is basically the same as
> we do for the real CPUID information:
>
> Create and document real artifical leafs (there is enough reserved
> number space in the CPUID numbering scheme) and put those into the
> CPUID database as well.
I presume here, when the kernel patch is being sent, the accompanying CPUID db
patch needs to go out too?
> That allows to handle this stuff in the same way as any other CPUID
> data and the autogeneration of bit offsets and text information for
> cpuinfo just works the same way.
>
> Coming back to the original question with the example of LA57 and the
> actual enablement. There is no reason why we can't have the actual CPUID
> bit and a software defined bit.
>
> The way how this should work is:
>
> 1) The CPUID info is in data.leaf_07.la57
>
> 2) The enablement bit is in data.leaf_linux_N.la57 or such
>
> The CPUID database contains the entries for those in leaf_07:
>
> <bit16 len="1" id="la57" desc="57-bit linear addresses (five-level paging)">
> <vendors>
> <vendor>Intel</vendor>
> <vendor>AMD</vendor>
> </vendors>
> <linux feature="true" proc="false" />
> </bit16>
>
> and leaf_linux_N:
>
> <bit3 len="1" id="la57" desc="57-bit linear addresses (five-level paging)">
> <vendors>
> <vendor>Linux</vendor>
> </vendors>
> <linux feature="true" proc="true" />
> </bit3>
>
> As the "proc" property of leaf_07.la57 is false, the bit won't be
> exposed in cpuinfo, but the software defined bit will.
>
> This also means, that we switch to a model where the software defined
> bits are not longer subject to random introduction and removal. We
> simply keep them around, mark them as not longer used and introduce new
> ones with proper documentation. That requires due process, which
> prevents the adhoc messing around with feature bits, which has us bitten
> more than once in the past.
Right, so in this particular example with la57, the CPUID bit which denotes
that the hw is capable of doing 5-level paging is needed only during kernel
init so that we can know whether we should even try to setup 5-level paging.
After that, the rest of the kernel will need to look only at "our" bit which
means, 5-level is *enabled*.
Because that's what the code cares for - whether it is running on 5-level or
not.
And 5-level *enabled* implies 5-level possible. So that first bit is kinda
redundant and perhaps even confusing. That's why I think merging the two bits
simplifies things.
You're already basically doing that with proc="false" but it should be even
less visible. No one besides us cares if the hw is capable - users care if the
feature is enabled or not.
I'd say...
Thx.
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On Wed, May 21 2025 at 20:11, Borislav Petkov wrote:
> On Wed, May 21, 2025 at 05:23:37PM +0200, Thomas Gleixner wrote:
>> Now what about software defined (artificial) feature bits including BUG
>> bits?
>>
>> We still need them and there is no reason why we would replace them with
>> something else. But, what we want to do here, is basically the same as
>> we do for the real CPUID information:
>>
>> Create and document real artifical leafs (there is enough reserved
>> number space in the CPUID numbering scheme) and put those into the
>> CPUID database as well.
>
> I presume here, when the kernel patch is being sent, the accompanying CPUID db
> patch needs to go out too?
Yes, and the process is that the leaf/bit needs to be in the data base so
that the headers containing the new leaf/bit can be auto generated.
>> This also means, that we switch to a model where the software defined
>> bits are not longer subject to random introduction and removal. We
>> simply keep them around, mark them as not longer used and introduce new
>> ones with proper documentation. That requires due process, which
>> prevents the adhoc messing around with feature bits, which has us bitten
>> more than once in the past.
>
> Right, so in this particular example with la57, the CPUID bit which denotes
> that the hw is capable of doing 5-level paging is needed only during kernel
> init so that we can know whether we should even try to setup 5-level paging.
>
> After that, the rest of the kernel will need to look only at "our" bit which
> means, 5-level is *enabled*.
>
> Because that's what the code cares for - whether it is running on 5-level or
> not.
>
> And 5-level *enabled* implies 5-level possible. So that first bit is kinda
> redundant and perhaps even confusing. That's why I think merging the two bits
> simplifies things.
>
> You're already basically doing that with proc="false" but it should be even
> less visible. No one besides us cares if the hw is capable - users care if the
> feature is enabled or not.
Kinda, but you're conflating things here. leaf_7.la57 is a hardware
property and leaf_linux_$N.la57 is a software property.
Of course you might say, that clearing leaf_7.la57 is sufficient to achieve
this.
But in fact, "clearing" the hardware view is the wrong thing to do from
a conceptual point of view. The hardware view is "cast in stone" no
matter what and having a clear distinction of a separate software view
will make things way more clear and understandable.
I've stared at code for hours just to figure out that there was some
obscure way to end up with a disabled feature bit.
Having a software view in the first place makes it clear that this is
subject to a logical operation of 'hardware capable' _and_ 'software
willing' instead of some hidden and obscure 'pretend that the hardware
is not capable' magic.
Clear conceptual separation is the only way to keep sanity in this ever
increasing complexity nightmare.
Claiming that saving 5 bits of extra memory is a brilliant idea was
even wrong 30 years ago when all of this madness started.
I freely admit that I was thinking the same way back then, because I was
coming from really memory-constrained microcontroller systems. But in the
context of Linux and contemporary hardware complexity we really need to
shift the focus to comprehensible concepts and strict abstractions
between the hardware and software point of view.
Thanks,
tglx
On Wed, May 21, 2025 at 08:56:19PM +0200, Thomas Gleixner wrote:
> Yes, and the process is that the leaf/bit needs to be in the data base so
> that the headers containing the new leaf/bit can be auto generated.
Ack.
> Kinda, but you're conflating things here. leaf_7.la57 is a hardware
> property and leaf_linux_$N.la57 is a software property.
>
> Of course you might say, that clearing leaf_7.la57 is sufficient to achieve
> this.
>
> But in fact, "clearing" the hardware view is the wrong thing to do from
> a conceptual point of view. The hardware view is "cast in stone" no
> matter what and having a clear distinction of a separate software view
> will make things way more clear and understandable.
Right, if I had a time machine, I would fly back and define this thing in
a whole different way: X86_FEATURE would be what the kernel has enabled and if
the bits completely or partially overlap with the definition of a respective
CPUID bit, so be it. But they would not parrot CPUID.
IOW, I would completely decouple X86_FEATURE_ bits from CPUID and have all
in-kernel users check X86_FEATURE flags and not touch CPUID at all.
But ok, in another life...
> I've stared at code for hours just to figure out that there was some
> obscure way to end up with a disabled feature bit.
>
> Having a software view in the first place makes it clear that this is
> subject to a logical operation of 'hardware capable' _and_ 'software
> willing' instead of some hidden and obscure 'pretend that the hardware
> is not capable' magic.
>
> Clear conceptual separation is the only way to keep sanity in this ever
> increasing complexity nightmare.
As long as we all agree and follow this, I'm a happy camper. Ofc, there will
be confusion where: "hey, I'm checking the CPUID bit but the kernel has
disabled it and marked this in the synthetic bit, blabla..." but once we know
and agree to what goes where, it should work.
> Claiming that saving 5 bits of extra memory is a brilliant idea was
> even wrong 30 years ago when all of this madness started.
>
> I freely admit that I was thinking the same way back then, because I was
> coming from really memory-constrained microcontroller systems. But in the
> context of Linux and contemporary hardware complexity we really need to
> shift the focus to comprehensible concepts and strict abstractions
> between the hardware and software point of view.
Right.
And as said, arch/x86/ should be solely looking at the hw view and
representing to the rest what is enabled or not. But I'm preaching to the
choir here.
Thx.
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On Wed, May 21 2025 at 21:29, Borislav Petkov wrote:
> On Wed, May 21, 2025 at 08:56:19PM +0200, Thomas Gleixner wrote:
>> But in fact, "clearing" the hardware view is the wrong thing to do from
>> a conceptual point of view. The hardware view is "cast in stone" no
>> matter what and having a clear distinction of a separate software view
>> will make things way more clear and understandable.
>
> Right, if I had a time machine, I would fly back and define this thing in
> a whole different way: X86_FEATURE would be what the kernel has enabled and if
> the bits completely or partially overlap with the definition of a respective
> CPUID bit, so be it. But they would not parrot CPUID.
>
> IOW, I would completely decouple X86_FEATURE_ bits from CPUID and have all
> in-kernel users check X86_FEATURE flags and not touch CPUID at all.
>
> But ok, in another life...
We can do that now.
>> I've stared at code for hours just to figure out that there was some
>> obscure way to end up with a disabled feature bit.
>>
>> Having a software view in the first place makes it clear that this is
>> subject to a logical operation of 'hardware capable' _and_ 'software
>> willing' instead of some hidden and obscure 'pretend that the hardware
>> is not capable' magic.
>>
>> Clear conceptual separation is the only way to keep sanity in this ever
>> increasing complexity nightmare.
>
> As long as we all agree and follow this, I'm a happy camper. Ofc, there will
> be confusion where: "hey, I'm checking the CPUID bit but the kernel has
> disabled it and marked this in the synthetic bit, blabla..." but once we know
> and agree to what goes where, it should work.
The fact that user space can check CPUID and make uninformed assumptions
about the kernel's view was a design fail in the first place. Of course
everyone assumed that nothing needs the kernel to be involved and all of
this will magically support itself, but that was delusional as we all
know.
In the long run we really want to disable user space CPUID and emulate
it when the hardware supports CPUID faulting, which should be made an
architectural feature IMO.
>> Claiming that saving 5 bits of extra memory is a brilliant idea was
>> even wrong 30 years ago when all of this madness started.
>>
>> I freely admit that I was thinking the same way back then, because I was
>> coming from really memory-constrained microcontroller systems. But in the
>> context of Linux and contemporary hardware complexity we really need to
>> shift the focus to comprehensible concepts and strict abstractions
>> between the hardware and software point of view.
>
> Right.
>
> And as said, arch/x86/ should be solely looking at the hw view and
> representing to the rest what is enabled or not. But I'm preaching to the
> choir here.
Let me restrict this further:
arch/x86/kernel/cpu/cpuid_eval.c::cpuid_eval() (or whatever name the
bikeshed painting debate agrees on) should be the only place which
can use the actual CPUID instruction.
Thanks,
tglx
On Wed, May 21, 2025 at 09:41:00PM +0200, Thomas Gleixner wrote:
> > Right, if I had a time machine, I would fly back and define this thing in
> > a whole different way: X86_FEATURE would be what the kernel has enabled and if
> > the bits completely or partially overlap with the definition of a respective
> > CPUID bit, so be it. But they would not parrot CPUID.
> >
> > IOW, I would completely decouple X86_FEATURE_ bits from CPUID and have all
> > in-kernel users check X86_FEATURE flags and not touch CPUID at all.
> >
> > But ok, in another life...
>
> We can do that now.
Yeah, we will have to because of alternatives...
> The fact that user space can check CPUID and make uninformed assumptions
> about the kernel's view was a design fail in the first place.
Yeah.
> Of course everyone assumed that nothing needs the kernel to be involved and
> all of this will magically support itself, but that was delusional as we all
> know.
Yeah, look at glibc doing CPUID and making decisions about what to use. Now
imagine kernel disables something under its feet. Boom.
> In the long run we really want to disable user space CPUID and emulate
> it when the hardware supports CPUID faulting, which should be made an
> architectural feature IMO.
Both vendors do support it. I probably should do the AMD side. It is about
time...
> Let me restrict this further:
>
> arch/x86/kernel/cpu/cpuid_eval.c::cpuid_eval() (or whatever name the
> bikeshed painting debate agrees on) should be the only place which
> can use the actual CPUID instruction.
Right.
Thx.
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On Wed, May 21 2025 at 21:48, Borislav Petkov wrote:
> On Wed, May 21, 2025 at 09:41:00PM +0200, Thomas Gleixner wrote:
>> In the long run we really want to disable user space CPUID and emulate
>> it when the hardware supports CPUID faulting, which should be made an
>> architectural feature IMO.
>
> Both vendors do support it. I probably should do the AMD side. It is about
> time...
Both vendors support it, but it's not an architectural feature and it
really should be one.
That ensures it can't go away just because some hardware dude thinks
again that the three extra gates are overkill as all of this can be
handled in software ....
Thanks,
tglx
On Mon, 19 May 2025 at 11:41, Borislav Petkov <bp@alien8.de> wrote: > > On Sat, May 17, 2025 at 11:16:41AM +0200, Ard Biesheuvel wrote: > > diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h > > index f67a93fc9391..5c19bee0af11 100644 > > --- a/arch/x86/include/asm/cpufeatures.h > > +++ b/arch/x86/include/asm/cpufeatures.h > > @@ -395,7 +395,7 @@ > > #define X86_FEATURE_AVX512_BITALG (16*32+12) /* "avx512_bitalg" Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */ > > #define X86_FEATURE_TME (16*32+13) /* "tme" Intel Total Memory Encryption */ > > #define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* "avx512_vpopcntdq" POPCNT for vectors of DW/QW */ > > -#define X86_FEATURE_LA57 (16*32+16) /* "la57" 5-level page tables */ > > +#define X86_FEATURE_LA57 (16*32+16) /* 57-bit linear addressing */ > > #define X86_FEATURE_RDPID (16*32+22) /* "rdpid" RDPID instruction */ > > #define X86_FEATURE_BUS_LOCK_DETECT (16*32+24) /* "bus_lock_detect" Bus Lock detect */ > > #define X86_FEATURE_CLDEMOTE (16*32+25) /* "cldemote" CLDEMOTE instruction */ > > @@ -483,6 +483,7 @@ > > #define X86_FEATURE_PREFER_YMM (21*32+ 8) /* Avoid ZMM registers due to downclocking */ > > #define X86_FEATURE_APX (21*32+ 9) /* Advanced Performance Extensions */ > > #define X86_FEATURE_INDIRECT_THUNK_ITS (21*32+10) /* Use thunk for indirect branches in lower half of cacheline */ > > +#define X86_FEATURE_5LEVEL_PAGING (21*32+11) /* "la57" Whether 5 levels of page tables are in use */ > > I don't think we need this second flag - you can simply clear the existing > one. That is what the old code does. It results in the flag transiently being set and cleared again, which is what I am trying to avoid. > Diff ontop below: > How is that not just a revert?
On Mon, May 19, 2025 at 11:46:34AM +0200, Ard Biesheuvel wrote:
> That is what the old code does. It results in the flag transiently
> being set and cleared again, which is what I am trying to avoid.
Right, something like this clumsy thing ontop. It'll have to be macro-ized
properly and we had macros for those somewhere - need to grep...
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 104944e93902..a6a1892a9215 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -704,7 +704,10 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c)
}
/* Aligned to unsigned long to avoid split lock in atomic bitmap ops */
-__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long));
+__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long)) = {
+ [X86_FEATURE_LA57 / 32] = BIT(X86_FEATURE_LA57 & 0x1f)
+};
+
__u32 cpu_caps_set[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long));
#ifdef CONFIG_X86_32
asm looks correct to me:
.type cpu_caps_cleared, @object
.size cpu_caps_cleared, 96
cpu_caps_cleared:
.zero 64
.long 65536
.zero 28
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On Mon, 19 May 2025 at 14:16, Borislav Petkov <bp@alien8.de> wrote:
>
> On Mon, May 19, 2025 at 11:46:34AM +0200, Ard Biesheuvel wrote:
> > That is what the old code does. It results in the flag transiently
> > being set and cleared again, which is what I am trying to avoid.
>
> Right, something like this clumsy thing ontop. It'll have to be macro-ized
> properly and we had macros for those somewhere - need to grep...
>
> diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
> index 104944e93902..a6a1892a9215 100644
> --- a/arch/x86/kernel/cpu/common.c
> +++ b/arch/x86/kernel/cpu/common.c
> @@ -704,7 +704,10 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c)
> }
>
> /* Aligned to unsigned long to avoid split lock in atomic bitmap ops */
> -__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long));
> +__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long)) = {
> + [X86_FEATURE_LA57 / 32] = BIT(X86_FEATURE_LA57 & 0x1f)
> +};
> +
> __u32 cpu_caps_set[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long));
>
Setting the bit in cpu_caps_cleared[] is the easy part. I'd lean
towards something like the below instead, though.
*However*, this will still result in the associated bit in
boot_cpu_data.x86_capability[] to get set and cleared again if the CPU
is LA57 capable but not running with 5 levels of paging. Any code that
evaluates pgtable_l5_enabled() in the meantime will get the wrong
value, which is why we have these KAsan false positives etc. The whole
point of these changes is that pgtable_l5_enabled() is guaranteed to
always produce the correct value.
We could modify the CPU capability detection code to take
cpu_caps_cleared[] into account whenever it sets any bits in
cpu_capability[], but that is going to be a lot more intrusive in the
end. Using a fake capability that will only get set or cleared
explicitly is much cleaner.
I still think avoiding a CPU capability altogether (and testing
CR4.LA57 directly combined with a ternary alternative) is the better
solution, but that was shot down by Linus on aesthetic grounds.
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -67,9 +67,10 @@ SYM_CODE_START_NOALIGN(startup_64)
mov %cr4, %rax
btl $X86_CR4_LA57_BIT, %eax
jnc 0f
setup_force_cpu_cap X86_FEATURE_LA57
+ jmp 1f
+0: setup_clear_cpu_cap X86_FEATURE_LA57
+1:
/* Set up the stack for verify_cpu() */
leaq __top_init_kernel_stack(%rip), %rsp
On Mon, May 19, 2025 at 02:15:41PM +0200, Borislav Petkov wrote:
> On Mon, May 19, 2025 at 11:46:34AM +0200, Ard Biesheuvel wrote:
> > That is what the old code does. It results in the flag transiently
> > being set and cleared again, which is what I am trying to avoid.
>
> Right, something like this clumsy thing ontop. It'll have to be macro-ized
> properly and we had macros for those somewhere - need to grep...
With trivial macros:
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 893cbca37fe9..fa6c4e395ce3 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -38,6 +38,9 @@ enum cpuid_leafs
NR_CPUID_WORDS,
};
+#define CAP_WORD(f) ((f) / 32)
+#define CAP_BIT(f) BIT((f) & 0x1f)
+
extern const char * const x86_cap_flags[NCAPINTS*32];
extern const char * const x86_power_flags[32];
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 104944e93902..6d2c7e5694e9 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -704,7 +704,10 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c)
}
/* Aligned to unsigned long to avoid split lock in atomic bitmap ops */
-__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long));
+__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long)) = {
+ [CAP_WORD(X86_FEATURE_LA57)] = CAP_BIT(X86_FEATURE_LA57)
+};
+
__u32 cpu_caps_set[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long));
#ifdef CONFIG_X86_32
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On Sat, 17 May 2025 at 11:16, Ard Biesheuvel <ardb+git@google.com> wrote: > > From: Ard Biesheuvel <ardb@kernel.org> > > Currently, the LA57 CPU feature flag is taken to mean two different > things at once: > - whether the CPU implements the LA57 extension, and is therefore > capable of supporting 5 level paging; > - whether 5 level paging is currently in use. > > This means the LA57 capability of the hardware is hidden when a LA57 > capable CPU is forced to run with 4 levels of paging. It also means the > the ordinary CPU capability detection code will happily set the LA57 > capability and it needs to be cleared explicitly afterwards to avoid > inconsistencies. > > Separate the two so that the CPU hardware capability can be identified > unambigously in all cases. > > To avoid breaking existing users that might assume that 5 level paging > is being used when the "la57" string is visible in /proc/cpuinfo, > repurpose that string to mean that 5-level paging is in use. >, and add a > new string la57_capable to indicate that the CPU feature is implemented > by the hardware. > ^^^ forgot to drop this
* Ard Biesheuvel <ardb@kernel.org> wrote:
> >, and add a
> > new string la57_capable to indicate that the CPU feature is implemented
> > by the hardware.
> >
>
> ^^^ forgot to drop this
I've rewritten this paragraph to:
The 'la57' flag's behavior in /proc/cpuinfo remains unchanged.
Thanks,
Ingo
The following commit has been merged into the x86/core branch of tip:
Commit-ID: ae41ee699c6c89850d11ba64a282490f287d9be7
Gitweb: https://git.kernel.org/tip/ae41ee699c6c89850d11ba64a282490f287d9be7
Author: Ard Biesheuvel <ardb@kernel.org>
AuthorDate: Sat, 17 May 2025 11:16:41 +02:00
Committer: Ingo Molnar <mingo@kernel.org>
CommitterDate: Mon, 19 May 2025 10:37:21 +02:00
x86/cpu: Use a new feature flag for 5-level paging
Currently, the LA57 CPU feature flag is taken to mean two different
things at once:
- Whether the CPU implements the LA57 extension, and is therefore
capable of supporting 5-level paging;
- whether 5-level paging is currently in use.
This means the LA57 capability of the hardware is hidden when a LA57
capable CPU is forced to run with 4-level paging. It also means
the ordinary CPU capability detection code will happily set the LA57
capability and it needs to be cleared explicitly afterwards to avoid
inconsistencies.
Separate the two so that the CPU hardware capability can be identified
unambiguously in all cases.
The 'la57' flag's behavior in /proc/cpuinfo remains unchanged.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Ahmed S. Darwish <darwi@linutronix.de>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20250517091639.3807875-9-ardb+git@google.com
---
arch/x86/include/asm/cpufeatures.h | 3 ++-
arch/x86/include/asm/page_64.h | 2 +-
arch/x86/include/asm/pgtable_64_types.h | 2 +-
arch/x86/kernel/cpu/common.c | 16 ++--------------
drivers/iommu/amd/init.c | 4 ++--
drivers/iommu/intel/svm.c | 4 ++--
6 files changed, 10 insertions(+), 21 deletions(-)
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index f67a93f..5c19bee 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -395,7 +395,7 @@
#define X86_FEATURE_AVX512_BITALG (16*32+12) /* "avx512_bitalg" Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */
#define X86_FEATURE_TME (16*32+13) /* "tme" Intel Total Memory Encryption */
#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* "avx512_vpopcntdq" POPCNT for vectors of DW/QW */
-#define X86_FEATURE_LA57 (16*32+16) /* "la57" 5-level page tables */
+#define X86_FEATURE_LA57 (16*32+16) /* 57-bit linear addressing */
#define X86_FEATURE_RDPID (16*32+22) /* "rdpid" RDPID instruction */
#define X86_FEATURE_BUS_LOCK_DETECT (16*32+24) /* "bus_lock_detect" Bus Lock detect */
#define X86_FEATURE_CLDEMOTE (16*32+25) /* "cldemote" CLDEMOTE instruction */
@@ -483,6 +483,7 @@
#define X86_FEATURE_PREFER_YMM (21*32+ 8) /* Avoid ZMM registers due to downclocking */
#define X86_FEATURE_APX (21*32+ 9) /* Advanced Performance Extensions */
#define X86_FEATURE_INDIRECT_THUNK_ITS (21*32+10) /* Use thunk for indirect branches in lower half of cacheline */
+#define X86_FEATURE_5LEVEL_PAGING (21*32+11) /* "la57" Whether 5 levels of page tables are in use */
/*
* BUG word(s)
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 015d23f..754be17 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -85,7 +85,7 @@ static __always_inline unsigned long task_size_max(void)
unsigned long ret;
alternative_io("movq %[small],%0","movq %[large],%0",
- X86_FEATURE_LA57,
+ X86_FEATURE_5LEVEL_PAGING,
"=r" (ret),
[small] "i" ((1ul << 47)-PAGE_SIZE),
[large] "i" ((1ul << 56)-PAGE_SIZE));
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 4604f92..9217688 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -33,7 +33,7 @@ static inline bool pgtable_l5_enabled(void)
return __pgtable_l5_enabled;
}
#else
-#define pgtable_l5_enabled() cpu_feature_enabled(X86_FEATURE_LA57)
+#define pgtable_l5_enabled() cpu_feature_enabled(X86_FEATURE_5LEVEL_PAGING)
#endif /* USE_EARLY_PGTABLE_L5 */
extern unsigned int pgdir_shift;
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 8feb8fd..67cdbd9 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1755,20 +1755,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
setup_clear_cpu_cap(X86_FEATURE_PCID);
#endif
- /*
- * Later in the boot process pgtable_l5_enabled() relies on
- * cpu_feature_enabled(X86_FEATURE_LA57). If 5-level paging is not
- * enabled by this point we need to clear the feature bit to avoid
- * false-positives at the later stage.
- *
- * pgtable_l5_enabled() can be false here for several reasons:
- * - 5-level paging is disabled compile-time;
- * - it's 32-bit kernel;
- * - machine doesn't support 5-level paging;
- * - user specified 'no5lvl' in kernel command line.
- */
- if (!pgtable_l5_enabled())
- setup_clear_cpu_cap(X86_FEATURE_LA57);
+ if (native_read_cr4() & X86_CR4_LA57)
+ setup_force_cpu_cap(X86_FEATURE_5LEVEL_PAGING);
detect_nopl();
}
diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index 14aa0d7..083fca8 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -3084,7 +3084,7 @@ static int __init early_amd_iommu_init(void)
goto out;
/* 5 level guest page table */
- if (cpu_feature_enabled(X86_FEATURE_LA57) &&
+ if (cpu_feature_enabled(X86_FEATURE_5LEVEL_PAGING) &&
FIELD_GET(FEATURE_GATS, amd_iommu_efr) == GUEST_PGTABLE_5_LEVEL)
amd_iommu_gpt_level = PAGE_MODE_5_LEVEL;
@@ -3691,7 +3691,7 @@ __setup("ivrs_acpihid", parse_ivrs_acpihid);
bool amd_iommu_pasid_supported(void)
{
/* CPU page table size should match IOMMU guest page table size */
- if (cpu_feature_enabled(X86_FEATURE_LA57) &&
+ if (cpu_feature_enabled(X86_FEATURE_5LEVEL_PAGING) &&
amd_iommu_gpt_level != PAGE_MODE_5_LEVEL)
return false;
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index ba93123..1f615e6 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -37,7 +37,7 @@ void intel_svm_check(struct intel_iommu *iommu)
return;
}
- if (cpu_feature_enabled(X86_FEATURE_LA57) &&
+ if (cpu_feature_enabled(X86_FEATURE_5LEVEL_PAGING) &&
!cap_fl5lp_support(iommu->cap)) {
pr_err("%s SVM disabled, incompatible paging mode\n",
iommu->name);
@@ -165,7 +165,7 @@ static int intel_svm_set_dev_pasid(struct iommu_domain *domain,
return PTR_ERR(dev_pasid);
/* Setup the pasid table: */
- sflags = cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0;
+ sflags = cpu_feature_enabled(X86_FEATURE_5LEVEL_PAGING) ? PASID_FLAG_FL5LP : 0;
ret = __domain_setup_first_level(iommu, dev, pasid,
FLPT_DEFAULT_DID, mm->pgd,
sflags, old);
* tip-bot2 for Ard Biesheuvel <tip-bot2@linutronix.de> wrote: > The following commit has been merged into the x86/core branch of tip: > > Commit-ID: ae41ee699c6c89850d11ba64a282490f287d9be7 > Gitweb: https://git.kernel.org/tip/ae41ee699c6c89850d11ba64a282490f287d9be7 > Author: Ard Biesheuvel <ardb@kernel.org> > AuthorDate: Sat, 17 May 2025 11:16:41 +02:00 > Committer: Ingo Molnar <mingo@kernel.org> > CommitterDate: Mon, 19 May 2025 10:37:21 +02:00 > > x86/cpu: Use a new feature flag for 5-level paging Update: although Linus tentatively acked this series, I've removed this commit because it's still under discussion - it appears the synthetic CPU flag approach is not universally accepted yet. Thanks, Ingo
© 2016 - 2025 Red Hat, Inc.