Previous commits have added a software signal that tracks which CR3 (kernel
or user) is in use for any given CPU.
Combined with:
o the CR3 switch itself being a flush for non-global mappings
o global mappings under kPTI being limited to the CEA and entry text
we now have a way to safely defer (kernel) TLB flush IPIs targeting
NOHZ_FULL CPUs executing in userspace (i.e. with the user CR3 loaded).
When sending a kernel TLB flush IPI to a NOHZ_FULL CPU, check whether it is
using the user CR3, and if it is, do not interrupt it; instead rely on the
implicit flush of non-global mappings performed by the CR3 write when that CPU
next switches to the kernel CR3.
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
---
arch/x86/include/asm/tlbflush.h | 1 +
arch/x86/mm/tlb.c | 34 ++++++++++++++++++++++++++-------
mm/vmalloc.c | 30 ++++++++++++++++++++++++-----
3 files changed, 53 insertions(+), 12 deletions(-)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 3b3aceee701e6..8bae150206665 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -22,6 +22,7 @@ DECLARE_PER_CPU_PAGE_ALIGNED(bool, kernel_cr3_loaded);
#endif
void __flush_tlb_all(void);
+void flush_tlb_kernel_range_deferrable(unsigned long start, unsigned long end);
#define TLB_FLUSH_ALL -1UL
#define TLB_GENERATION_INVALID 0
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index f5b93e01e3472..e08f16474f074 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -13,6 +13,7 @@
#include <linux/mmu_notifier.h>
#include <linux/mmu_context.h>
#include <linux/kvm_types.h>
+#include <linux/sched/isolation.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
@@ -1530,23 +1531,24 @@ static void do_kernel_range_flush(void *info)
flush_tlb_one_kernel(addr);
}
-static void kernel_tlb_flush_all(struct flush_tlb_info *info)
+static void kernel_tlb_flush_all(smp_cond_func_t cond, struct flush_tlb_info *info)
{
if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
invlpgb_flush_all();
else
- on_each_cpu(do_flush_tlb_all, NULL, 1);
+ on_each_cpu_cond(cond, do_flush_tlb_all, NULL, 1);
}
-static void kernel_tlb_flush_range(struct flush_tlb_info *info)
+static void kernel_tlb_flush_range(smp_cond_func_t cond, struct flush_tlb_info *info)
{
if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
invlpgb_kernel_range_flush(info);
else
- on_each_cpu(do_kernel_range_flush, info, 1);
+ on_each_cpu_cond(cond, do_kernel_range_flush, info, 1);
}
-void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+static inline void
+__flush_tlb_kernel_range(smp_cond_func_t cond, unsigned long start, unsigned long end)
{
struct flush_tlb_info *info;
@@ -1556,13 +1558,31 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
TLB_GENERATION_INVALID);
if (info->end == TLB_FLUSH_ALL)
- kernel_tlb_flush_all(info);
+ kernel_tlb_flush_all(cond, info);
else
- kernel_tlb_flush_range(info);
+ kernel_tlb_flush_range(cond, info);
put_flush_tlb_info();
}
+void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+ __flush_tlb_kernel_range(NULL, start, end);
+}
+
+#ifdef CONFIG_TRACK_CR3
+static bool flush_tlb_kernel_cond(int cpu, void *info)
+{
+ return housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE) ||
+ per_cpu(kernel_cr3_loaded, cpu);
+}
+
+void flush_tlb_kernel_range_deferrable(unsigned long start, unsigned long end)
+{
+ __flush_tlb_kernel_range(flush_tlb_kernel_cond, start, end);
+}
+#endif
+
/*
* This can be used from process context to figure out what the value of
* CR3 is without needing to do a (slow) __read_cr3().
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e286c2d2068cb..55b7bafe26016 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -501,6 +501,26 @@ void vunmap_range_noflush(unsigned long start, unsigned long end)
__vunmap_range_noflush(start, end);
}
+/*
+ * !!! BIG FAT WARNING !!!
+ *
+ * The CPU is free to cache any part of the paging hierarchy it wants at any
+ * time. It's also free to set accessed and dirty bits at any time, even for
+ * instructions that may never execute architecturally.
+ *
+ * This means that deferring a TLB flush affecting freed page-table-pages (IOW,
+ * keeping them in a CPU's paging hierarchy cache) is a recipe for disaster.
+ *
+ * This isn't a problem for deferral of TLB flushes in vmalloc, because
+ * page-table-pages used for vmap() mappings are never freed - see how
+ * __vunmap_range_noflush() walks the whole mapping but only clears the leaf PTEs.
+ * If this ever changes, TLB flush deferral will cause misery.
+ */
+void __weak flush_tlb_kernel_range_deferrable(unsigned long start, unsigned long end)
+{
+ flush_tlb_kernel_range(start, end);
+}
+
/**
* vunmap_range - unmap kernel virtual addresses
* @addr: start of the VM area to unmap
@@ -514,7 +534,7 @@ void vunmap_range(unsigned long addr, unsigned long end)
{
flush_cache_vunmap(addr, end);
vunmap_range_noflush(addr, end);
- flush_tlb_kernel_range(addr, end);
+ flush_tlb_kernel_range_deferrable(addr, end);
}
static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
@@ -2366,7 +2386,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
nr_purge_nodes = cpumask_weight(&purge_nodes);
if (nr_purge_nodes > 0) {
- flush_tlb_kernel_range(start, end);
+ flush_tlb_kernel_range_deferrable(start, end);
/* One extra worker is per a lazy_max_pages() full set minus one. */
nr_purge_helpers = atomic_long_read(&vmap_lazy_nr) / lazy_max_pages();
@@ -2469,7 +2489,7 @@ static void free_unmap_vmap_area(struct vmap_area *va)
flush_cache_vunmap(va->va_start, va->va_end);
vunmap_range_noflush(va->va_start, va->va_end);
if (debug_pagealloc_enabled_static())
- flush_tlb_kernel_range(va->va_start, va->va_end);
+ flush_tlb_kernel_range_deferrable(va->va_start, va->va_end);
free_vmap_area_noflush(va);
}
@@ -2916,7 +2936,7 @@ static void vb_free(unsigned long addr, unsigned long size)
vunmap_range_noflush(addr, addr + size);
if (debug_pagealloc_enabled_static())
- flush_tlb_kernel_range(addr, addr + size);
+ flush_tlb_kernel_range_deferrable(addr, addr + size);
spin_lock(&vb->lock);
@@ -2981,7 +3001,7 @@ static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
free_purged_blocks(&purge_list);
if (!__purge_vmap_area_lazy(start, end, false) && flush)
- flush_tlb_kernel_range(start, end);
+ flush_tlb_kernel_range_deferrable(start, end);
mutex_unlock(&vmap_purge_lock);
}
--
2.52.0