Currently no system with AMD INVLPGB support requires the page table
isolation mitigation. However, people could still enable PTI manually,
or a vulnerability could be found in the future that makes PTI useful
on certain AMD CPUs.
Additionally, there are systems that support Intel RAR TLB invalidation,
where PTI is a useful mitigation.
The combination of PTI and broadcast TLB flush has a problem:
- invalidate_user_asid() sets a bit corresponding to the process PCID
- SWITCH_TO_USER_CR3 tests and clears a bit corresponding to the process PCID
However, the PCIDs no longer fit in the range 0-6 when using global ASIDs.
Enlarge user_pcid_flush_mask to fit the PCID numbers that can be present when
using broadcast TLB flushing. This takes up 256 or 512 bytes per CPU, depending
on whether or not page table isolation is built into the kernel.
Signed-off-by: Rik van Riel <riel@surriel.com>
Fixes: c3ed3f5b2550 ("x86/mm: userspace & pageout flushing using Intel RAR")
Cc: stable@kernel.org
---
 arch/x86/include/asm/tlbflush.h | 23 +++++++++++++++++++++++
 arch/x86/mm/tlb.c               | 19 -------------------
 2 files changed, 23 insertions(+), 19 deletions(-)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index e9b81876ebe4..9a1317e0f140 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -23,6 +23,25 @@ void __flush_tlb_all(void);
#define TLB_FLUSH_ALL -1UL
#define TLB_GENERATION_INVALID 0
+/*
+ * When enabled, MITIGATION_PAGE_TABLE_ISOLATION consumes a single bit for
+ * user/kernel switches
+ */
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
+# define PTI_CONSUMED_PCID_BITS 1
+#else
+# define PTI_CONSUMED_PCID_BITS 0
+#endif
+
+#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
+
+/*
+ * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account
+ * for them being zero-based. Another -1 is because PCID 0 is reserved for
+ * use by non-PCID-aware users.
+ */
+#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
+
void cr4_update_irqsoff(unsigned long set, unsigned long clear);
unsigned long cr4_read_shadow(void);
@@ -121,7 +140,11 @@ struct tlb_state {
* the corresponding user PCID needs a flush next time we
* switch to it; see SWITCH_TO_USER_CR3.
*/
+#if defined(CONFIG_X86_TLB_BROADCAST_TLB_FLUSH) && defined(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)
+ unsigned long user_pcid_flush_mask[(1 << CR3_AVAIL_PCID_BITS) / BITS_PER_LONG];
+#else
unsigned short user_pcid_flush_mask;
+#endif
/*
* Access to this CR4 shadow and to H/W CR4 is protected by
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 39f80111e6f1..40b40b36d257 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -90,25 +90,6 @@
*
*/
-/*
- * When enabled, MITIGATION_PAGE_TABLE_ISOLATION consumes a single bit for
- * user/kernel switches
- */
-#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
-# define PTI_CONSUMED_PCID_BITS 1
-#else
-# define PTI_CONSUMED_PCID_BITS 0
-#endif
-
-#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
-
-/*
- * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account
- * for them being zero-based. Another -1 is because PCID 0 is reserved for
- * use by non-PCID-aware users.
- */
-#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
-
/*
* Given @asid, compute kPCID
*/
--
2.49.0
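[ Aside: a minimal userspace sketch of the sizing arithmetic behind the
  "256 or 512 bytes per CPU" figure in the changelog above, assuming
  X86_CR3_PCID_BITS == 12 as on current x86. This is a stand-alone
  demo, not kernel code. ]

#include <stdio.h>

#define X86_CR3_PCID_BITS 12

int main(void)
{
	/* One flush bit per possible PCID; PTI consumes one PCID bit. */
	int bits_with_pti    = X86_CR3_PCID_BITS - 1;	/* 11 */
	int bits_without_pti = X86_CR3_PCID_BITS;	/* 12 */

	printf("PTI built in:  %d bytes per CPU\n", (1 << bits_with_pti) / 8);    /* 256 */
	printf("PTI left out:  %d bytes per CPU\n", (1 << bits_without_pti) / 8); /* 512 */
	return 0;
}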
* Rik van Riel <riel@surriel.com> wrote:
> @@ -121,7 +140,11 @@ struct tlb_state {
> * the corresponding user PCID needs a flush next time we
> * switch to it; see SWITCH_TO_USER_CR3.
> */
> +#if defined(CONFIG_X86_TLB_BROADCAST_TLB_FLUSH) && defined(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)
> + unsigned long user_pcid_flush_mask[(1 << CR3_AVAIL_PCID_BITS) / BITS_PER_LONG];
> +#else
> unsigned short user_pcid_flush_mask;
> +#endif
1)
CONFIG_X86_TLB_BROADCAST_TLB_FLUSH doesn't actually exist, the name is
CONFIG_BROADCAST_TLB_FLUSH.
This patch could not possibly have been tested on a vanilla kernel on
actual hardware in the PTI usecase it claims to fix...
2)
Testing aside, this definition and the usage of user_pcid_flush_mask is
unnecessarily confusing, as it also defines user_pcid_flush_mask when
it's not actually used by runtime code. user_pcid_flush_mask is only
used if CONFIG_MITIGATION_PAGE_TABLE_ISOLATION=y, so as an interim step
we could make this more obvious:
#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
# ifdef CONFIG_BROADCAST_TLB_FLUSH
	unsigned long user_pcid_flush_mask[(1 << CR3_AVAIL_PCID_BITS) / BITS_PER_LONG];
# else
	unsigned short user_pcid_flush_mask;
# endif
#endif
And wrap the body of invalidate_user_asid() in an #ifdef
CONFIG_MITIGATION_PAGE_TABLE_ISOLATION. The entry assembly code is
already under such an #ifdef.
And once we do that it becomes obvious that the two definitions of
user_pcid_flush_mask can be merged by doing something like:
#ifdef CONFIG_BROADCAST_TLB_FLUSH
# define CR3_AVAIL_PCID_LONGS ((1 << CR3_AVAIL_PCID_BITS) / BITS_PER_LONG)
#else
# define CR3_AVAIL_PCID_LONGS 1
#endif

#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
	unsigned long user_pcid_flush_mask[CR3_AVAIL_PCID_LONGS];
#endif
(Or so, if I counted my bits and longs right.)
And we can drop the ugly & fragile type cast in invalidate_user_asid():
- __set_bit(kern_pcid(asid),
- (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
+ __set_bit(kern_pcid(asid), this_cpu_ptr(cpu_tlbstate.user_pcid_flush_mask));
And yeah, this means user_pcid_flush_mask is a long on
!CONFIG_BROADCAST_TLB_FLUSH kernels, while it was a short before.
Literally nobody cares, because it's enabled on all distro kernels that
have CPU_SUP_AMD:
config BROADCAST_TLB_FLUSH
def_bool y
depends on CPU_SUP_AMD && 64BIT
Literally no Linux distribution is going to have this option disabled.
Ie. we'd be uglifying and obfuscating the code for a config that people
aren't actually using...
3)
If we are going to grow user_pcid_flush_mask from 2 bytes to 256 bytes
then please reorder 'struct tlb_state' for cache efficiency: at minimum
the ::cr4 shadow should move to before ::user_pcid_flush_mask. But I
think we should probably move user_pcid_flush_mask to the end of the
structure, where it does the least damage to cache layout.
Thanks,
Ingo
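[ Aside: a toy illustration of the cache-layout concern in point 3,
  with placeholder field names standing in for struct tlb_state; only
  the 32-long mask size comes from the patch. ]

#include <stdio.h>
#include <stddef.h>

struct tlb_state_toy {
	unsigned long hot_a;				/* frequently accessed */
	unsigned long user_pcid_flush_mask[32];		/* the 256-byte mask   */
	unsigned long cr4_shadow;			/* also hot            */
};

int main(void)
{
	/*
	 * The mid-struct mask pushes cr4_shadow out to offset 264, four
	 * 64-byte cache lines past hot_a; moving the mask to the end of
	 * the structure keeps the hot fields packed together.
	 */
	printf("cr4_shadow at offset %zu\n",
	       offsetof(struct tlb_state_toy, cr4_shadow));
	return 0;
}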
On Sat, 2025-05-17 at 09:59 +0200, Ingo Molnar wrote:
>
> CONFIG_X86_TLB_BROADCAST_TLB_FLUSH doesn't actually exist, the name
> is
> CONFIG_BROADCAST_TLB_FLUSH.
>
Argh, cut'n'pasted from the wrong tree :(
>
> we could make this more obvious:
>
> And we can drop the ugly & fragile type cast in invalidate_user_asid():
>
> - __set_bit(kern_pcid(asid),
> -           (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
> + __set_bit(kern_pcid(asid), this_cpu_ptr(cpu_tlbstate.user_pcid_flush_mask));
>
That is a really nice improvement, and it almost
works, too ;)
In file included from ./arch/x86/include/asm/bitops.h:430,
                 from ./include/linux/bitops.h:68:
./include/asm-generic/bitops/instrumented-non-atomic.h:26:54: note:
expected ‘volatile long unsigned int *’ but argument is of type
‘long unsigned int (*)[32]’
   26 | ___set_bit(unsigned long nr, volatile unsigned long *addr)
      |                              ~~~~~~~~~~~~~~~~~~~~~~~~^~~~
I ended up settling for this:
	__set_bit(kern_pcid(asid),
		  this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask[0]));
> 3)
>
> If we are going to grow user_pcid_flush_mask from 2 bytes to 256
> bytes
> then please reorder 'struct tlb_state' for cache efficiency: at
> minimum
> the ::cr4 shadow should move to before ::user_pcid_flush_mask. But I
> think we should probably move user_pcid_flush_mask to the end of the
> structure, where it does the least damage to cache layout.
Done. V2 to follow in another email.
Thank you!
--
All Rights Reversed.
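[ Aside: the gcc complaint above, reduced to a stand-alone demo;
  set_bit_demo() and the mask array are hypothetical. The point is
  plain C semantics: an array name passed as an argument decays to a
  pointer to its first element, while a pointer to the array itself
  (which the this_cpu_ptr(&...) form produces here) is a distinct
  type. ]

static void set_bit_demo(unsigned long nr, volatile unsigned long *addr)
{
	addr[nr / (8 * sizeof(long))] |= 1UL << (nr % (8 * sizeof(long)));
}

static unsigned long mask[32];

int main(void)
{
	set_bit_demo(5, mask);		/* OK: decays to unsigned long *    */
	set_bit_demo(5, &mask[0]);	/* OK: explicit first-element form  */
	/* set_bit_demo(5, &mask);	   error: 'unsigned long (*)[32]'  */
	return 0;
}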
On Sat, 17 May 2025 09:59:48 +0200
Ingo Molnar <mingo@kernel.org> wrote:
> [many good suggestions]
Thank you for your suggestions. The code looks better now, and the
compiler allowed almost all of your ideas; the one difference is
that gcc 14.2.1 is unhappy when ___set_bit() is passed a pointer
to an array of unsigned longs:
In file included from ./arch/x86/include/asm/bitops.h:430,
                 from ./include/linux/bitops.h:68:
./include/asm-generic/bitops/instrumented-non-atomic.h:26:54: note:
expected ‘volatile long unsigned int *’ but argument is of type
‘long unsigned int (*)[32]’
   26 | ___set_bit(unsigned long nr, volatile unsigned long *addr)
      |                              ~~~~~~~~~~~~~~~~~~~~~~~~^~~~
But it is perfectly happy with getting the address of the first long
in that array, which works for both array size 1 and array size 32.
Thank you!
---8<---
From 96958fa1ed02e2434305fc7b0e37374eee899daf Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@meta.com>
Date: Fri, 16 May 2025 08:37:04 -0700
Subject: [PATCH] x86/mm: resize user_pcid_flush_mask for PTI / broadcast TLB flush combination
Currently no system with AMD INVLPGB support requires the page table
isolation mitigation. However, people could still enable PTI manually,
or a vulnerability could be found in the future that makes PTI useful
on certain AMD CPUs.
Additionally, there are systems that support Intel RAR TLB invalidation,
where PTI is a useful mitigation.
The combination of PTI and broadcast TLB flush has a problem:
- invalidate_user_asid() sets a bit corresponding to the process PCID in user_pcid_flush_mask
- SWITCH_TO_USER_CR3 tests and clears a bit corresponding to the process PCID in user_pcid_flush_mask
However, the PCIDs no longer fit in the range 0-6 when using global ASIDs.
Enlarge user_pcid_flush_mask to fit the PCID numbers that can be present when
using broadcast TLB flushing. This takes up 256 or 512 bytes per CPU, depending
on whether or not page table isolation is built into the kernel.
Signed-off-by: Rik van Riel <riel@surriel.com>
Fixes: c3ed3f5b2550 ("x86/mm: userspace & pageout flushing using Intel RAR")
Cc: stable@kernel.org
---
 arch/x86/include/asm/tlbflush.h | 42 ++++++++++++++++++++++++++-------
 arch/x86/kernel/asm-offsets.c   |  2 ++
 arch/x86/mm/tlb.c               | 28 +++-------------------
 3 files changed, 39 insertions(+), 33 deletions(-)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index e9b81876ebe4..cc9935bbbd45 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -23,6 +23,31 @@ void __flush_tlb_all(void);
#define TLB_FLUSH_ALL -1UL
#define TLB_GENERATION_INVALID 0
+/*
+ * When enabled, MITIGATION_PAGE_TABLE_ISOLATION consumes a single bit for
+ * user/kernel switches
+ */
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
+# define PTI_CONSUMED_PCID_BITS 1
+#else
+# define PTI_CONSUMED_PCID_BITS 0
+#endif
+
+#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
+
+/*
+ * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account
+ * for them being zero-based. Another -1 is because PCID 0 is reserved for
+ * use by non-PCID-aware users.
+ */
+#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
+
+#ifdef CONFIG_BROADCAST_TLB_FLUSH
+# define CR3_AVAIL_PCID_LONGS ((1 << CR3_AVAIL_PCID_BITS) / BITS_PER_LONG)
+#else
+# define CR3_AVAIL_PCID_LONGS 1
+#endif
+
void cr4_update_irqsoff(unsigned long set, unsigned long clear);
unsigned long cr4_read_shadow(void);
@@ -115,14 +140,6 @@ struct tlb_state {
*/
u8 lam;
#endif
-
- /*
- * Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate
- * the corresponding user PCID needs a flush next time we
- * switch to it; see SWITCH_TO_USER_CR3.
- */
- unsigned short user_pcid_flush_mask;
-
/*
* Access to this CR4 shadow and to H/W CR4 is protected by
* disabling interrupts when modifying either one.
@@ -149,6 +166,15 @@ struct tlb_state {
* context 0.
*/
struct tlb_context ctxs[TLB_NR_DYN_ASIDS];
+
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
+ /*
+ * Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate
+ * the corresponding user PCID needs a flush next time we
+ * switch to it; see SWITCH_TO_USER_CR3.
+ */
+ unsigned long user_pcid_flush_mask[CR3_AVAIL_PCID_LONGS];
+#endif
};
DECLARE_PER_CPU_ALIGNED(struct tlb_state, cpu_tlbstate);
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 6259b474073b..8c41a2e5a53e 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -103,8 +103,10 @@ static void __used common(void)
BLANK();
DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
/* TLB state for the entry code */
OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask);
+#endif
/* Layout info for cpu_entry_area */
OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 39f80111e6f1..f5761e8be77f 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -90,25 +90,6 @@
*
*/
-/*
- * When enabled, MITIGATION_PAGE_TABLE_ISOLATION consumes a single bit for
- * user/kernel switches
- */
-#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
-# define PTI_CONSUMED_PCID_BITS 1
-#else
-# define PTI_CONSUMED_PCID_BITS 0
-#endif
-
-#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
-
-/*
- * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account
- * for them being zero-based. Another -1 is because PCID 0 is reserved for
- * use by non-PCID-aware users.
- */
-#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
-
/*
* Given @asid, compute kPCID
*/
@@ -543,10 +524,7 @@ static void broadcast_tlb_flush(struct flush_tlb_info *info)
*/
static inline void invalidate_user_asid(u16 asid)
{
- /* There is no user ASID if address space separation is off */
- if (!IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION))
- return;
-
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
/*
* We only have a single ASID if PCID is off and the CR3
* write will have flushed it.
@@ -557,8 +535,8 @@ static inline void invalidate_user_asid(u16 asid)
if (!static_cpu_has(X86_FEATURE_PTI))
return;
- __set_bit(kern_pcid(asid),
- (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
+ __set_bit(kern_pcid(asid), this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask[0]));
+#endif
}
static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, unsigned long lam,
--
2.49.0
* Rik van Riel <riel@surriel.com> wrote:
> But it is perfectly happy with getting the address of the first long
> in that array, which works for both array size 1 and array size 32.
Yeah. Testing is still a question: does this boot on a system with PCID
&& INVLPGB features, with PTI force-enabled ("pti=on" boot option)?
Thanks,
Ingo
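[ Aside: one userspace way to confirm PTI is active after booting with
  pti=on; the sysfs vulnerabilities file is the kernel's standard
  interface, though this snippet is not from the thread. ]

#include <stdio.h>

int main(void)
{
	char buf[128];
	FILE *f = fopen("/sys/devices/system/cpu/vulnerabilities/meltdown", "r");

	if (f) {
		if (fgets(buf, sizeof(buf), f))
			printf("meltdown: %s", buf);	/* "Mitigation: PTI" when enabled */
		fclose(f);
	}
	return 0;
}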
On Sun, 2025-05-18 at 08:25 +0200, Ingo Molnar wrote:
>
> * Rik van Riel <riel@surriel.com> wrote:
>
> > But it is perfectly happy with getting the address of the first
> > long
> > in that array, which works for both array size 1 and array size 32.
>
> Yeah. Testing is still a question: does this boot on a system with
> PCID
> && INVLPGB features, with PTI force-enabled ("pti=on" boot option)?
This version runs, and shows no signs of corruption, with
last_global_asid up over a few hundred due to various
programs running and exiting.
--
All Rights Reversed.