[PATCH v4 10/12] x86/mm: Move flush_tlb_info back to the stack

Chuyi Zhou posted 12 patches 22 hours ago
[PATCH v4 10/12] x86/mm: Move flush_tlb_info back to the stack
Posted by Chuyi Zhou 22 hours ago
Commit 3db6d5a5ecaf ("x86/mm/tlb: Remove 'struct flush_tlb_info' from the
stack") converted flush_tlb_info from stack variable to per-CPU variable.
This brought about a performance improvement of around 3% in an extreme test.
However, it also required that all flush_tlb* operations keep preemption
disabled entirely to prevent concurrent modifications of flush_tlb_info.
flush_tlb* needs to send IPIs to remote CPUs and synchronously wait for
all remote CPUs to complete their local TLB flushes. The process could
take tens of milliseconds when interrupts are disabled or with a large
number of remote CPUs.

From the perspective of improving kernel real-time performance, this patch
reverts flush_tlb_info back to a stack variable and aligns it with
SMP_CACHE_BYTES. In certain configurations, SMP_CACHE_BYTES may be large,
so the alignment size is limited to 64. This is a preparation for enabling
preemption during TLB flush in the next patch.

To evaluate the performance impact of this patch, use the following script
to reproduce the microbenchmark mentioned in commit 3db6d5a5ecaf
("x86/mm/tlb: Remove 'struct flush_tlb_info' from the stack"). The test
environment is an Ice Lake system (Intel(R) Xeon(R) Platinum 8336C) with
128 CPUs and 2 NUMA nodes. During the test, the threads were bound to
specific CPUs, and both pti and mitigations were disabled:

    #include <stdio.h>
    #include <stdlib.h>
    #include <pthread.h>
    #include <sys/mman.h>
    #include <sys/time.h>
    #include <unistd.h>

    /* Benchmark parameters: ops per timed run, background spinner threads,
     * number of runs, and the size of the page being flushed. */
    #define NUM_OPS 1000000
    #define NUM_THREADS 3
    #define NUM_RUNS 5
    #define PAGE_SIZE 4096

    /* Set to 1 by main() to ask the spinner threads to exit.
     * NOTE(review): volatile is not a proper cross-thread synchronization
     * primitive (an atomic flag would be stricter), but it is sufficient
     * for this simple stop signal in a benchmark. */
    volatile int stop_threads = 0;

    /* Spin on one CPU until main() raises stop_threads, so that remote
     * CPUs exist as targets for the TLB-flush IPIs being benchmarked. */
    void *busy_wait_thread(void *arg) {
        for (;;) {
            if (stop_threads)
                break;
            __asm__ volatile ("nop");
        }
        return NULL;
    }

    /* Return the current wall-clock time in microseconds since the epoch. */
    long long get_usec() {
        struct timeval now;
        gettimeofday(&now, NULL);
        return (long long)now.tv_sec * 1000000LL + now.tv_usec;
    }

    /*
     * Drive the benchmark: start NUM_THREADS spinner threads, then time
     * NUM_OPS cycles of "dirty one page, MADV_DONTNEED it" for NUM_RUNS
     * runs. Each MADV_DONTNEED zaps the page and triggers the TLB flush
     * whose latency this patch series is concerned with.
     * Returns 0 on success; exits with status 1 on any setup failure.
     */
    int main() {
        pthread_t threads[NUM_THREADS];
        char *addr;
        int i, r;

        /* One anonymous page that every iteration dirties and then drops. */
        addr = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE
		| MAP_ANONYMOUS, -1, 0);
        if (addr == MAP_FAILED) {
            perror("mmap");
            exit(1);
        }

        /* Busy background threads give the flush IPIs remote targets. */
        for (i = 0; i < NUM_THREADS; i++) {
            if (pthread_create(&threads[i], NULL, busy_wait_thread, NULL))
                exit(1);
        }

        /* Fix: keep the phrase "background threads" on one line; the old
         * format string embedded a stray newline mid-phrase. */
        printf("Running benchmark: %d runs, %d ops each, %d background threads\n",
               NUM_RUNS, NUM_OPS, NUM_THREADS);

        for (r = 0; r < NUM_RUNS; r++) {
            long long start, end;

            start = get_usec();
            for (i = 0; i < NUM_OPS; i++) {
                addr[0] = 1;  /* fault the page back in after the zap */
                if (madvise(addr, PAGE_SIZE, MADV_DONTNEED)) {
                    perror("madvise");
                    exit(1);
                }
            }
            end = get_usec();

            double duration = (double)(end - start);
            double avg_lat = duration / NUM_OPS;
            printf("Run %d: Total time %.2f us, Avg latency %.4f us/op\n",
            r + 1, duration, avg_lat);
        }

        /* Signal the spinners to exit and reap them before cleanup. */
        stop_threads = 1;
        for (i = 0; i < NUM_THREADS; i++)
            pthread_join(threads[i], NULL);
        munmap(addr, PAGE_SIZE);
        return 0;
    }

                   base   on-stack-aligned  on-stack-not-aligned
                   ----       ---------      -----------
avg (usec/op)     2.5278       2.5261         2.5508
stddev            0.0007       0.0027         0.0023

The benchmark results show that the average latency difference between the
baseline (base) and the properly aligned stack variable (on-stack-aligned)
is within the standard deviation (stddev). This indicates that the
variations are caused by testing noise, and reverting to a stack variable
with proper alignment causes no performance regression compared to the
per-CPU implementation. The unaligned version (on-stack-not-aligned) shows
a minor performance drop. This demonstrates that we can improve the
real-time performance without sacrificing performance.

Suggested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Suggested-by: Nadav Amit <nadav.amit@gmail.com>
Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
---
 arch/x86/include/asm/tlbflush.h |  8 +++-
 arch/x86/mm/tlb.c               | 72 +++++++++------------------------
 2 files changed, 27 insertions(+), 53 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 0545fe75c3fa..f4e4505d4ece 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -211,6 +211,12 @@ extern u16 invlpgb_count_max;
 
 extern void initialize_tlbstate_and_flush(void);
 
+#if SMP_CACHE_BYTES > 64
+#define FLUSH_TLB_INFO_ALIGN 64
+#else
+#define FLUSH_TLB_INFO_ALIGN SMP_CACHE_BYTES
+#endif
+
 /*
  * TLB flushing:
  *
@@ -249,7 +255,7 @@ struct flush_tlb_info {
 	u8			stride_shift;
 	u8			freed_tables;
 	u8			trim_cpumask;
-};
+} __aligned(FLUSH_TLB_INFO_ALIGN);
 
 void flush_tlb_local(void);
 void flush_tlb_one_user(unsigned long addr);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index af43d177087e..cfc3a72477f5 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1373,28 +1373,12 @@ void flush_tlb_multi(const struct cpumask *cpumask,
  */
 unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
 
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info);
-
-#ifdef CONFIG_DEBUG_VM
-static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
-#endif
-
-static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
-			unsigned long start, unsigned long end,
-			unsigned int stride_shift, bool freed_tables,
-			u64 new_tlb_gen)
+static void get_flush_tlb_info(struct flush_tlb_info *info,
+			       struct mm_struct *mm,
+			       unsigned long start, unsigned long end,
+			       unsigned int stride_shift, bool freed_tables,
+			       u64 new_tlb_gen)
 {
-	struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info);
-
-#ifdef CONFIG_DEBUG_VM
-	/*
-	 * Ensure that the following code is non-reentrant and flush_tlb_info
-	 * is not overwritten. This means no TLB flushing is initiated by
-	 * interrupt handlers and machine-check exception handlers.
-	 */
-	BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
-#endif
-
 	/*
 	 * If the number of flushes is so large that a full flush
 	 * would be faster, do a full flush.
@@ -1412,32 +1396,22 @@ static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
 	info->new_tlb_gen	= new_tlb_gen;
 	info->initiating_cpu	= smp_processor_id();
 	info->trim_cpumask	= 0;
-
-	return info;
-}
-
-static void put_flush_tlb_info(void)
-{
-#ifdef CONFIG_DEBUG_VM
-	/* Complete reentrancy prevention checks */
-	barrier();
-	this_cpu_dec(flush_tlb_info_idx);
-#endif
 }
 
 void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 				unsigned long end, unsigned int stride_shift,
 				bool freed_tables)
 {
-	struct flush_tlb_info *info;
+	struct flush_tlb_info _info;
+	struct flush_tlb_info *info = &_info;
 	int cpu = get_cpu();
 	u64 new_tlb_gen;
 
 	/* This is also a barrier that synchronizes with switch_mm(). */
 	new_tlb_gen = inc_mm_tlb_gen(mm);
 
-	info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables,
-				  new_tlb_gen);
+	get_flush_tlb_info(&_info, mm, start, end, stride_shift, freed_tables,
+			   new_tlb_gen);
 
 	/*
 	 * flush_tlb_multi() is not optimized for the common case in which only
@@ -1457,7 +1431,6 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 		local_irq_enable();
 	}
 
-	put_flush_tlb_info();
 	put_cpu();
 	mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
 }
@@ -1527,19 +1500,16 @@ static void kernel_tlb_flush_range(struct flush_tlb_info *info)
 
 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
-	struct flush_tlb_info *info;
+	struct flush_tlb_info info;
 
 	guard(preempt)();
+	get_flush_tlb_info(&info, NULL, start, end, PAGE_SHIFT, false,
+			   TLB_GENERATION_INVALID);
 
-	info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false,
-				  TLB_GENERATION_INVALID);
-
-	if (info->end == TLB_FLUSH_ALL)
-		kernel_tlb_flush_all(info);
+	if (info.end == TLB_FLUSH_ALL)
+		kernel_tlb_flush_all(&info);
 	else
-		kernel_tlb_flush_range(info);
-
-	put_flush_tlb_info();
+		kernel_tlb_flush_range(&info);
 }
 
 /*
@@ -1707,12 +1677,11 @@ EXPORT_SYMBOL_FOR_KVM(__flush_tlb_all);
 
 void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 {
-	struct flush_tlb_info *info;
+	struct flush_tlb_info info;
 
 	int cpu = get_cpu();
-
-	info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false,
-				  TLB_GENERATION_INVALID);
+	get_flush_tlb_info(&info, NULL, 0, TLB_FLUSH_ALL, 0, false,
+			   TLB_GENERATION_INVALID);
 	/*
 	 * flush_tlb_multi() is not optimized for the common case in which only
 	 * a local TLB flush is needed. Optimize this use-case by calling
@@ -1722,17 +1691,16 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 		invlpgb_flush_all_nonglobals();
 		batch->unmapped_pages = false;
 	} else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
-		flush_tlb_multi(&batch->cpumask, info);
+		flush_tlb_multi(&batch->cpumask, &info);
 	} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
 		lockdep_assert_irqs_enabled();
 		local_irq_disable();
-		flush_tlb_func(info);
+		flush_tlb_func(&info);
 		local_irq_enable();
 	}
 
 	cpumask_clear(&batch->cpumask);
 
-	put_flush_tlb_info();
 	put_cpu();
 }
 
-- 
2.20.1