[PATCH v3 10/12] x86/mm: Move flush_tlb_info back to the stack

Chuyi Zhou posted 12 patches 2 weeks, 5 days ago
There is a newer version of this series
[PATCH v3 10/12] x86/mm: Move flush_tlb_info back to the stack
Posted by Chuyi Zhou 2 weeks, 5 days ago
Commit 3db6d5a5ecaf ("x86/mm/tlb: Remove 'struct flush_tlb_info' from the
stack") converted flush_tlb_info from stack variable to per-CPU variable.
stack") converted flush_tlb_info from stack variable to per-CPU variable.
This brought about a performance improvement of around 3% in an extreme test.
However, it also required that all flush_tlb* operations keep preemption
disabled entirely to prevent concurrent modifications of flush_tlb_info.
flush_tlb* needs to send IPIs to remote CPUs and synchronously wait for
all remote CPUs to complete their local TLB flushes. The process could
take tens of milliseconds when interrupts are disabled or with a large
number of remote CPUs.

From the perspective of improving kernel real-time performance, this patch
reverts flush_tlb_info back to stack variables. This is a preparation for
enabling preemption during TLB flush in next patch.

To evaluate the performance impact of this patch, use the following script
to reproduce the microbenchmark mentioned in commit 3db6d5a5ecaf
("x86/mm/tlb: Remove 'struct flush_tlb_info' from the stack"). The test
environment is an Ice Lake system (Intel(R) Xeon(R) Platinum 8336C) with
128 CPUs and 2 NUMA nodes:

    #include <stdio.h>
    #include <stdlib.h>
    #include <pthread.h>
    #include <sys/mman.h>
    #include <sys/time.h>
    #include <unistd.h>

    #define NUM_OPS 1000000
    #define NUM_THREADS 3
    #define NUM_RUNS 5
    #define PAGE_SIZE 4096

    volatile int stop_threads = 0;

    /* Spin on the CPU until the main thread raises stop_threads. */
    void *busy_wait_thread(void *arg) {
        (void)arg;
        for (;;) {
            if (stop_threads)
                break;
            __asm__ volatile ("nop");
        }
        return NULL;
    }

    /* Wall-clock time in microseconds since the epoch. */
    long long get_usec() {
        struct timeval now;

        gettimeofday(&now, NULL);
        return (long long)now.tv_sec * 1000000LL + now.tv_usec;
    }

    /*
     * madvise(MADV_DONTNEED) microbenchmark: repeatedly touch one page and
     * discard it, forcing a TLB flush per op, while background threads keep
     * other CPUs busy so the flush path sends IPIs. Prints per-run average
     * latency.
     */
    int main() {
        pthread_t threads[NUM_THREADS];
        char *addr;
        int i, r;

        /* One anonymous page to fault in and discard on every iteration. */
        addr = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE
		| MAP_ANONYMOUS, -1, 0);

        if (addr == MAP_FAILED) {
            perror("mmap");
            exit(1);
        }

        for (i = 0; i < NUM_THREADS; i++) {
            if (pthread_create(&threads[i], NULL, busy_wait_thread, NULL))
                exit(1);
        }

        /*
         * Note: the string is split across source lines only; the newline
         * belongs at the end of the message, not mid-phrase.
         */
        printf("Running benchmark: %d runs, %d ops each, %d background "
               "threads\n", NUM_RUNS, NUM_OPS, NUM_THREADS);

        for (r = 0; r < NUM_RUNS; r++) {
            long long start, end;
            start = get_usec();
            for (i = 0; i < NUM_OPS; i++) {
                addr[0] = 1;  /* fault the page back in */
                if (madvise(addr, PAGE_SIZE, MADV_DONTNEED)) {
                    perror("madvise");
                    exit(1);
                }
            }
            end = get_usec();
            double duration = (double)(end - start);
            double avg_lat = duration / NUM_OPS;
            printf("Run %d: Total time %.2f us, Avg latency %.4f us/op\n",
            r + 1, duration, avg_lat);
        }
        stop_threads = 1;
        for (i = 0; i < NUM_THREADS; i++)
            pthread_join(threads[i], NULL);
        munmap(addr, PAGE_SIZE);
        return 0;
    }

Using the per-cpu flush_tlb_info showed only a very marginal performance
advantage, approximately 1%.

                             base            on-stack
                             ----            ---------
       avg (usec/op)         5.9362           5.9956   (+1%)
       stddev                0.0240           0.0096

And for the mmtest/stress-ng-madvise test, which randomly calls madvise on
pages within a mmap range and triggers a large number of high-frequency TLB
flushes, no significant performance regression was observed.

				 baseline              on-stack

Amean     bops-madvise-1        13.64 (   0.00%)      13.56 (   0.59%)
Amean     bops-madvise-2        27.32 (   0.00%)      27.26 (   0.24%)
Amean     bops-madvise-4        53.35 (   0.00%)      53.54 (  -0.35%)
Amean     bops-madvise-8        103.09 (   0.00%)     103.30 (  -0.20%)
Amean     bops-madvise-16       191.88 (   0.00%)     191.75 (   0.07%)
Amean     bops-madvise-32       287.98 (   0.00%)     291.01 *  -1.05%*
Amean     bops-madvise-64       365.84 (   0.00%)     368.09 *  -0.61%*
Amean     bops-madvise-128      422.72 (   0.00%)     423.47 (  -0.18%)
Amean     bops-madvise-256      435.61 (   0.00%)     435.63 (  -0.01%)

Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
---
 arch/x86/mm/tlb.c | 124 ++++++++++++++++++----------------------------
 1 file changed, 49 insertions(+), 75 deletions(-)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index af43d177087e..4704200de3f0 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1373,71 +1373,30 @@ void flush_tlb_multi(const struct cpumask *cpumask,
  */
 unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
 
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info);
-
-#ifdef CONFIG_DEBUG_VM
-static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
-#endif
-
-static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
-			unsigned long start, unsigned long end,
-			unsigned int stride_shift, bool freed_tables,
-			u64 new_tlb_gen)
+void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
+				unsigned long end, unsigned int stride_shift,
+				bool freed_tables)
 {
-	struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info);
+	int cpu = get_cpu();
 
-#ifdef CONFIG_DEBUG_VM
-	/*
-	 * Ensure that the following code is non-reentrant and flush_tlb_info
-	 * is not overwritten. This means no TLB flushing is initiated by
-	 * interrupt handlers and machine-check exception handlers.
-	 */
-	BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
-#endif
+	struct flush_tlb_info info = {
+		.mm = mm,
+		.stride_shift = stride_shift,
+		.freed_tables = freed_tables,
+		.trim_cpumask = 0,
+		.initiating_cpu = cpu,
+	};
 
-	/*
-	 * If the number of flushes is so large that a full flush
-	 * would be faster, do a full flush.
-	 */
 	if ((end - start) >> stride_shift > tlb_single_page_flush_ceiling) {
 		start = 0;
 		end = TLB_FLUSH_ALL;
 	}
 
-	info->start		= start;
-	info->end		= end;
-	info->mm		= mm;
-	info->stride_shift	= stride_shift;
-	info->freed_tables	= freed_tables;
-	info->new_tlb_gen	= new_tlb_gen;
-	info->initiating_cpu	= smp_processor_id();
-	info->trim_cpumask	= 0;
-
-	return info;
-}
-
-static void put_flush_tlb_info(void)
-{
-#ifdef CONFIG_DEBUG_VM
-	/* Complete reentrancy prevention checks */
-	barrier();
-	this_cpu_dec(flush_tlb_info_idx);
-#endif
-}
-
-void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
-				unsigned long end, unsigned int stride_shift,
-				bool freed_tables)
-{
-	struct flush_tlb_info *info;
-	int cpu = get_cpu();
-	u64 new_tlb_gen;
-
 	/* This is also a barrier that synchronizes with switch_mm(). */
-	new_tlb_gen = inc_mm_tlb_gen(mm);
+	info.new_tlb_gen = inc_mm_tlb_gen(mm);
 
-	info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables,
-				  new_tlb_gen);
+	info.start = start;
+	info.end = end;
 
 	/*
 	 * flush_tlb_multi() is not optimized for the common case in which only
@@ -1445,19 +1404,18 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 	 * flush_tlb_func_local() directly in this case.
 	 */
 	if (mm_global_asid(mm)) {
-		broadcast_tlb_flush(info);
+		broadcast_tlb_flush(&info);
 	} else if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
-		info->trim_cpumask = should_trim_cpumask(mm);
-		flush_tlb_multi(mm_cpumask(mm), info);
+		info.trim_cpumask = should_trim_cpumask(mm);
+		flush_tlb_multi(mm_cpumask(mm), &info);
 		consider_global_asid(mm);
 	} else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
 		lockdep_assert_irqs_enabled();
 		local_irq_disable();
-		flush_tlb_func(info);
+		flush_tlb_func(&info);
 		local_irq_enable();
 	}
 
-	put_flush_tlb_info();
 	put_cpu();
 	mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
 }
@@ -1527,19 +1485,29 @@ static void kernel_tlb_flush_range(struct flush_tlb_info *info)
 
 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
-	struct flush_tlb_info *info;
+	struct flush_tlb_info info = {
+		.mm = NULL,
+		.stride_shift = PAGE_SHIFT,
+		.freed_tables = false,
+		.trim_cpumask = 0,
+		.new_tlb_gen = TLB_GENERATION_INVALID
+	};
 
 	guard(preempt)();
 
-	info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false,
-				  TLB_GENERATION_INVALID);
+	if ((end - start) >> PAGE_SHIFT > tlb_single_page_flush_ceiling) {
+		start = 0;
+		end = TLB_FLUSH_ALL;
+	}
 
-	if (info->end == TLB_FLUSH_ALL)
-		kernel_tlb_flush_all(info);
-	else
-		kernel_tlb_flush_range(info);
+	info.initiating_cpu = smp_processor_id(),
+	info.start = start;
+	info.end = end;
 
-	put_flush_tlb_info();
+	if (info.end == TLB_FLUSH_ALL)
+		kernel_tlb_flush_all(&info);
+	else
+		kernel_tlb_flush_range(&info);
 }
 
 /*
@@ -1707,12 +1675,19 @@ EXPORT_SYMBOL_FOR_KVM(__flush_tlb_all);
 
 void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 {
-	struct flush_tlb_info *info;
-
 	int cpu = get_cpu();
 
-	info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false,
-				  TLB_GENERATION_INVALID);
+	struct flush_tlb_info info = {
+		.start = 0,
+		.end = TLB_FLUSH_ALL,
+		.mm = NULL,
+		.stride_shift = 0,
+		.freed_tables = false,
+		.new_tlb_gen = TLB_GENERATION_INVALID,
+		.initiating_cpu = cpu,
+		.trim_cpumask = 0,
+	};
+
 	/*
 	 * flush_tlb_multi() is not optimized for the common case in which only
 	 * a local TLB flush is needed. Optimize this use-case by calling
@@ -1722,17 +1697,16 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 		invlpgb_flush_all_nonglobals();
 		batch->unmapped_pages = false;
 	} else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
-		flush_tlb_multi(&batch->cpumask, info);
+		flush_tlb_multi(&batch->cpumask, &info);
 	} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
 		lockdep_assert_irqs_enabled();
 		local_irq_disable();
-		flush_tlb_func(info);
+		flush_tlb_func(&info);
 		local_irq_enable();
 	}
 
 	cpumask_clear(&batch->cpumask);
 
-	put_flush_tlb_info();
 	put_cpu();
 }
 
-- 
2.20.1
Re: [PATCH v3 10/12] x86/mm: Move flush_tlb_info back to the stack
Posted by Sebastian Andrzej Siewior 2 weeks, 5 days ago
+Nadav, org post https://lore.kernel.org/all/20260318045638.1572777-11-zhouchuyi@bytedance.com/

On 2026-03-18 12:56:36 [+0800], Chuyi Zhou wrote:
> Commit 3db6d5a5ecaf ("x86/mm/tlb: Remove 'struct flush_tlb_info' from the
> stack") converted flush_tlb_info from stack variable to per-CPU variable.
> This brought about a performance improvement of around 3% in extreme test.
> However, it also required that all flush_tlb* operations keep preemption
> disabled entirely to prevent concurrent modifications of flush_tlb_info.
> flush_tlb* needs to send IPIs to remote CPUs and synchronously wait for
> all remote CPUs to complete their local TLB flushes. The process could
> take tens of milliseconds when interrupts are disabled or with a large
> number of remote CPUs.
…
PeterZ wasn't too happy to reverse this.
The snippet below results in the following assembly:

| 0000000000001ab0 <flush_tlb_kernel_range>:
…
|     1ac9:       48 89 e5                mov    %rsp,%rbp
|     1acc:       48 83 e4 c0             and    $0xffffffffffffffc0,%rsp
|     1ad0:       48 83 ec 40             sub    $0x40,%rsp

so it would align it properly which should result in the same cache-line
movement. I'm not sure about the virtual-to-physical translation of the
variables as in TLB misses since here we have a virtual mapped stack and
there we have virtual mapped per-CPU memory.

Here is my quick hack below. Does this work, or is it still a no? I have
no numbers so…

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 5a3cdc439e38d..4a7f40c7f939a 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -227,7 +227,7 @@ struct flush_tlb_info {
 	u8			stride_shift;
 	u8			freed_tables;
 	u8			trim_cpumask;
-};
+} __aligned(SMP_CACHE_BYTES);
 
 void flush_tlb_local(void);
 void flush_tlb_one_user(unsigned long addr);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 621e09d049cb9..99b70e94ec281 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1394,28 +1394,12 @@ void flush_tlb_multi(const struct cpumask *cpumask,
  */
 unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
 
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info);
-
-#ifdef CONFIG_DEBUG_VM
-static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
-#endif
-
-static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
-			unsigned long start, unsigned long end,
-			unsigned int stride_shift, bool freed_tables,
-			u64 new_tlb_gen)
+static void get_flush_tlb_info(struct flush_tlb_info *info,
+			       struct mm_struct *mm,
+			       unsigned long start, unsigned long end,
+			       unsigned int stride_shift, bool freed_tables,
+			       u64 new_tlb_gen)
 {
-	struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info);
-
-#ifdef CONFIG_DEBUG_VM
-	/*
-	 * Ensure that the following code is non-reentrant and flush_tlb_info
-	 * is not overwritten. This means no TLB flushing is initiated by
-	 * interrupt handlers and machine-check exception handlers.
-	 */
-	BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
-#endif
-
 	/*
 	 * If the number of flushes is so large that a full flush
 	 * would be faster, do a full flush.
@@ -1433,8 +1417,6 @@ static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
 	info->new_tlb_gen	= new_tlb_gen;
 	info->initiating_cpu	= smp_processor_id();
 	info->trim_cpumask	= 0;
-
-	return info;
 }
 
 static void put_flush_tlb_info(void)
@@ -1450,15 +1432,16 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 				unsigned long end, unsigned int stride_shift,
 				bool freed_tables)
 {
-	struct flush_tlb_info *info;
+	struct flush_tlb_info _info;
+	struct flush_tlb_info *info = &_info;
 	int cpu = get_cpu();
 	u64 new_tlb_gen;
 
 	/* This is also a barrier that synchronizes with switch_mm(). */
 	new_tlb_gen = inc_mm_tlb_gen(mm);
 
-	info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables,
-				  new_tlb_gen);
+	get_flush_tlb_info(&_info, mm, start, end, stride_shift, freed_tables,
+			   new_tlb_gen);
 
 	/*
 	 * flush_tlb_multi() is not optimized for the common case in which only
@@ -1548,17 +1531,15 @@ static void kernel_tlb_flush_range(struct flush_tlb_info *info)
 
 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
-	struct flush_tlb_info *info;
+	struct flush_tlb_info info;
 
-	guard(preempt)();
+	get_flush_tlb_info(&info, NULL, start, end, PAGE_SHIFT, false,
+			   TLB_GENERATION_INVALID);
 
-	info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false,
-				  TLB_GENERATION_INVALID);
-
-	if (info->end == TLB_FLUSH_ALL)
-		kernel_tlb_flush_all(info);
+	if (info.end == TLB_FLUSH_ALL)
+		kernel_tlb_flush_all(&info);
 	else
-		kernel_tlb_flush_range(info);
+		kernel_tlb_flush_range(&info);
 
 	put_flush_tlb_info();
 }
@@ -1728,12 +1709,11 @@ EXPORT_SYMBOL_FOR_KVM(__flush_tlb_all);
 
 void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 {
-	struct flush_tlb_info *info;
+	struct flush_tlb_info info;
 
 	int cpu = get_cpu();
-
-	info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false,
-				  TLB_GENERATION_INVALID);
+	get_flush_tlb_info(&info, NULL, 0, TLB_FLUSH_ALL, 0, false,
+			   TLB_GENERATION_INVALID);
 	/*
 	 * flush_tlb_multi() is not optimized for the common case in which only
 	 * a local TLB flush is needed. Optimize this use-case by calling
@@ -1743,11 +1723,11 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 		invlpgb_flush_all_nonglobals();
 		batch->unmapped_pages = false;
 	} else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
-		flush_tlb_multi(&batch->cpumask, info);
+		flush_tlb_multi(&batch->cpumask, &info);
 	} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
 		lockdep_assert_irqs_enabled();
 		local_irq_disable();
-		flush_tlb_func(info);
+		flush_tlb_func(&info);
 		local_irq_enable();
 	}
 
Re: [PATCH v3 10/12] x86/mm: Move flush_tlb_info back to the stack
Posted by Chuyi Zhou 2 weeks, 3 days ago
Hello,

On 2026-03-19 1:21 a.m., Sebastian Andrzej Siewior wrote:
> +Nadav, org post https://lore.kernel.org/all/20260318045638.1572777-11-zhouchuyi@bytedance.com/
> 
> On 2026-03-18 12:56:36 [+0800], Chuyi Zhou wrote:
>> Commit 3db6d5a5ecaf ("x86/mm/tlb: Remove 'struct flush_tlb_info' from the
>> stack") converted flush_tlb_info from stack variable to per-CPU variable.
>> This brought about a performance improvement of around 3% in extreme test.
>> However, it also required that all flush_tlb* operations keep preemption
>> disabled entirely to prevent concurrent modifications of flush_tlb_info.
>> flush_tlb* needs to send IPIs to remote CPUs and synchronously wait for
>> all remote CPUs to complete their local TLB flushes. The process could
>> take tens of milliseconds when interrupts are disabled or with a large
>> number of remote CPUs.
> …
> PeterZ wasn't too happy to reverse this.
> The snippet below results in the following assembly:
> 
> | 0000000000001ab0 <flush_tlb_kernel_range>:
> …
> |     1ac9:       48 89 e5                mov    %rsp,%rbp
> |     1acc:       48 83 e4 c0             and    $0xffffffffffffffc0,%rsp
> |     1ad0:       48 83 ec 40             sub    $0x40,%rsp
> 
> so it would align it properly which should result in the same cache-line
> movement. I'm not sure about the virtual-to-physical translation of the
> variables as in TLB misses since here we have a virtual mapped stack and
> there we have virtual mapped per-CPU memory.
> 
> Here the below is my quick hack. Does this work, or still a now? I have
> no numbers so…
> 

I applied this patch on tip sched/core: fe7171d0d5df ("sched/fair: 
Simplify SIS_UTIL handling in select_idle_cpu()") and retest the 
microbenchmark in [PATCH v3 10/12], pinning the tasks to CPUs using 
cpuset, and disabling pti and mitigations.


                   base       on-stack-aligned  on-stack
                   ----       ---------      -----------
avg (usec/op)     2.5278       2.5261        2.5508
stddev            0.0007       0.0027        0.0023

Based on the data above, there is no performance regression.

Does this make sense? Do you have any other testing suggestions?

Thanks

> diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
> index 5a3cdc439e38d..4a7f40c7f939a 100644
> --- a/arch/x86/include/asm/tlbflush.h
> +++ b/arch/x86/include/asm/tlbflush.h
> @@ -227,7 +227,7 @@ struct flush_tlb_info {
>   	u8			stride_shift;
>   	u8			freed_tables;
>   	u8			trim_cpumask;
> -};
> +} __aligned(SMP_CACHE_BYTES);
>   
>   void flush_tlb_local(void);
>   void flush_tlb_one_user(unsigned long addr);
> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> index 621e09d049cb9..99b70e94ec281 100644
> --- a/arch/x86/mm/tlb.c
> +++ b/arch/x86/mm/tlb.c
> @@ -1394,28 +1394,12 @@ void flush_tlb_multi(const struct cpumask *cpumask,
>    */
>   unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
>   
> -static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info);
> -
> -#ifdef CONFIG_DEBUG_VM
> -static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
> -#endif
> -
> -static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
> -			unsigned long start, unsigned long end,
> -			unsigned int stride_shift, bool freed_tables,
> -			u64 new_tlb_gen)
> +static void get_flush_tlb_info(struct flush_tlb_info *info,
> +			       struct mm_struct *mm,
> +			       unsigned long start, unsigned long end,
> +			       unsigned int stride_shift, bool freed_tables,
> +			       u64 new_tlb_gen)
>   {
> -	struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info);
> -
> -#ifdef CONFIG_DEBUG_VM
> -	/*
> -	 * Ensure that the following code is non-reentrant and flush_tlb_info
> -	 * is not overwritten. This means no TLB flushing is initiated by
> -	 * interrupt handlers and machine-check exception handlers.
> -	 */
> -	BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
> -#endif
> -
>   	/*
>   	 * If the number of flushes is so large that a full flush
>   	 * would be faster, do a full flush.
> @@ -1433,8 +1417,6 @@ static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
>   	info->new_tlb_gen	= new_tlb_gen;
>   	info->initiating_cpu	= smp_processor_id();
>   	info->trim_cpumask	= 0;
> -
> -	return info;
>   }
>   
>   static void put_flush_tlb_info(void)
> @@ -1450,15 +1432,16 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
>   				unsigned long end, unsigned int stride_shift,
>   				bool freed_tables)
>   {
> -	struct flush_tlb_info *info;
> +	struct flush_tlb_info _info;
> +	struct flush_tlb_info *info = &_info;
>   	int cpu = get_cpu();
>   	u64 new_tlb_gen;
>   
>   	/* This is also a barrier that synchronizes with switch_mm(). */
>   	new_tlb_gen = inc_mm_tlb_gen(mm);
>   
> -	info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables,
> -				  new_tlb_gen);
> +	get_flush_tlb_info(&_info, mm, start, end, stride_shift, freed_tables,
> +			   new_tlb_gen);
>   
>   	/*
>   	 * flush_tlb_multi() is not optimized for the common case in which only
> @@ -1548,17 +1531,15 @@ static void kernel_tlb_flush_range(struct flush_tlb_info *info)
>   
>   void flush_tlb_kernel_range(unsigned long start, unsigned long end)
>   {
> -	struct flush_tlb_info *info;
> +	struct flush_tlb_info info;
>   
> -	guard(preempt)();
> +	get_flush_tlb_info(&info, NULL, start, end, PAGE_SHIFT, false,
> +			   TLB_GENERATION_INVALID);
>   
> -	info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false,
> -				  TLB_GENERATION_INVALID);
> -
> -	if (info->end == TLB_FLUSH_ALL)
> -		kernel_tlb_flush_all(info);
> +	if (info.end == TLB_FLUSH_ALL)
> +		kernel_tlb_flush_all(&info);
>   	else
> -		kernel_tlb_flush_range(info);
> +		kernel_tlb_flush_range(&info);
>   
>   	put_flush_tlb_info();
>   }
> @@ -1728,12 +1709,11 @@ EXPORT_SYMBOL_FOR_KVM(__flush_tlb_all);
>   
>   void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
>   {
> -	struct flush_tlb_info *info;
> +	struct flush_tlb_info info;
>   
>   	int cpu = get_cpu();
> -
> -	info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false,
> -				  TLB_GENERATION_INVALID);
> +	get_flush_tlb_info(&info, NULL, 0, TLB_FLUSH_ALL, 0, false,
> +			   TLB_GENERATION_INVALID);
>   	/*
>   	 * flush_tlb_multi() is not optimized for the common case in which only
>   	 * a local TLB flush is needed. Optimize this use-case by calling
> @@ -1743,11 +1723,11 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
>   		invlpgb_flush_all_nonglobals();
>   		batch->unmapped_pages = false;
>   	} else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
> -		flush_tlb_multi(&batch->cpumask, info);
> +		flush_tlb_multi(&batch->cpumask, &info);
>   	} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
>   		lockdep_assert_irqs_enabled();
>   		local_irq_disable();
> -		flush_tlb_func(info);
> +		flush_tlb_func(&info);
>   		local_irq_enable();
>   	}
>
Re: [PATCH v3 10/12] x86/mm: Move flush_tlb_info back to the stack
Posted by Nadav Amit 2 weeks, 4 days ago

> On 18 Mar 2026, at 19:21, Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:
> 
> +Nadav, org post https://lore.kernel.org/all/20260318045638.1572777-11-zhouchuyi@bytedance.com/
> 
> On 2026-03-18 12:56:36 [+0800], Chuyi Zhou wrote:
>> Commit 3db6d5a5ecaf ("x86/mm/tlb: Remove 'struct flush_tlb_info' from the
>> stack") converted flush_tlb_info from stack variable to per-CPU variable.
>> This brought about a performance improvement of around 3% in extreme test.
>> However, it also required that all flush_tlb* operations keep preemption
>> disabled entirely to prevent concurrent modifications of flush_tlb_info.
>> flush_tlb* needs to send IPIs to remote CPUs and synchronously wait for
>> all remote CPUs to complete their local TLB flushes. The process could
>> take tens of milliseconds when interrupts are disabled or with a large
>> number of remote CPUs.
> …
> PeterZ wasn't too happy to reverse this.
> The snippet below results in the following assembly:
> 
> | 0000000000001ab0 <flush_tlb_kernel_range>:
> …
> |     1ac9:       48 89 e5                mov    %rsp,%rbp
> |     1acc:       48 83 e4 c0             and    $0xffffffffffffffc0,%rsp
> |     1ad0:       48 83 ec 40             sub    $0x40,%rsp
> 
> so it would align it properly which should result in the same cache-line
> movement. I'm not sure about the virtual-to-physical translation of the
> variables as in TLB misses since here we have a virtual mapped stack and
> there we have virtual mapped per-CPU memory.
> 
> Here the below is my quick hack. Does this work, or still a now? I have
> no numbers so…
> 
> diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
> index 5a3cdc439e38d..4a7f40c7f939a 100644
> --- a/arch/x86/include/asm/tlbflush.h
> +++ b/arch/x86/include/asm/tlbflush.h
> @@ -227,7 +227,7 @@ struct flush_tlb_info {
> 	u8			stride_shift;
> 	u8			freed_tables;
> 	u8			trim_cpumask;
> -};
> +} __aligned(SMP_CACHE_BYTES);
> 

This would work, but you are likely to encounter the same problem PeterZ hit
when I did something similar: in some configurations SMP_CACHE_BYTES is very
large.

See https://lore.kernel.org/all/tip-780e0106d468a2962b16b52fdf42898f2639e0a0@git.kernel.org/

Maybe cap the alignment somehow? something like:

#if SMP_CACHE_BYTES > 64
#define FLUSH_TLB_INFO_ALIGN 64
#else
#define FLUSH_TLB_INFO_ALIGN SMP_CACHE_BYTES
#endif

And then use __aligned(FLUSH_TLB_INFO_ALIGN) ?
Re: [PATCH v3 10/12] x86/mm: Move flush_tlb_info back to the stack
Posted by Nadav Amit 2 weeks, 4 days ago
> On 18 Mar 2026, at 22:24, Nadav Amit <nadav.amit@gmail.com> wrote:
> 
> 
> 
>> On 18 Mar 2026, at 19:21, Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:
>> 
>> so it would align it properly which should result in the same cache-line
>> movement. I'm not sure about the virtual-to-physical translation of the
>> variables as in TLB misses since here we have a virtual mapped stack and
>> there we have virtual mapped per-CPU memory.
>> 
>> Here the below is my quick hack. Does this work, or still a now? I have
>> no numbers so…
>> 
>> diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
>> index 5a3cdc439e38d..4a7f40c7f939a 100644
>> --- a/arch/x86/include/asm/tlbflush.h
>> +++ b/arch/x86/include/asm/tlbflush.h
>> @@ -227,7 +227,7 @@ struct flush_tlb_info {
>> 	u8 stride_shift;
>> 	u8 freed_tables;
>> 	u8 trim_cpumask;
>> -};
>> +} __aligned(SMP_CACHE_BYTES);
>> 
> 
> This would work, but you are likely to encounter the same problem PeterZ hit
> when I did something similar: in some configurations SMP_CACHE_BYTES is very
> large.

Further thinking about it and looking at the rest of the series: wouldn’t it be
simpler to put flush_tlb_info and smp_call_function_many_cond()’s
cpumask on thread_struct? It would allow to support CONFIG_CPUMASK_OFFSTACK=y
case by preallocating cpumask on thread creation.

I’m not sure whether the memory overhead is prohibitive.
Re: [PATCH v3 10/12] x86/mm: Move flush_tlb_info back to the stack
Posted by Sebastian Andrzej Siewior 2 weeks, 4 days ago
On 2026-03-19 00:28:19 [+0200], Nadav Amit wrote:
> >> diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
> >> index 5a3cdc439e38d..4a7f40c7f939a 100644
> >> --- a/arch/x86/include/asm/tlbflush.h
> >> +++ b/arch/x86/include/asm/tlbflush.h
> >> @@ -227,7 +227,7 @@ struct flush_tlb_info {
> >> 	u8 stride_shift;
> >> 	u8 freed_tables;
> >> 	u8 trim_cpumask;
> >> -};
> >> +} __aligned(SMP_CACHE_BYTES);
> >> 
> > 
> > This would work, but you are likely to encounter the same problem PeterZ hit
> > when I did something similar: in some configurations SMP_CACHE_BYTES is very
> > large.

So if capping it to 64 is an option and does not break performance in a
way one would complain about, why not. But you did it initially so…

> Further thinking about it and looking at the rest of the series: wouldn’t it be
> simpler to put flush_tlb_info and smp_call_function_many_cond()’s
> cpumask on thread_struct? It would allow to support CONFIG_CPUMASK_OFFSTACK=y
> case by preallocating cpumask on thread creation.
> 
> I’m not sure whether the memory overhead is prohibitive.

My Debian config has CONFIG_NR_CPUS=8192 which would add 1KiB if we add
a plain cpumask_t. The allocation based on cpumask_size() would add just
8 bytes/ pointer to the struct which should be fine. We could even stash
the mask in the pointer for CPUs <= 64 on 64bit.
On RT it would be desired to have the memory and not to fallback to
waiting with disabled preemption if the allocation fails.

The flush_tlb_info are around 40 bytes + alignment. Maybe we could try
stack first if this gets us to acceptable performance.

Sebastian
Re: [PATCH v3 10/12] x86/mm: Move flush_tlb_info back to the stack
Posted by Nadav Amit 2 weeks, 4 days ago

> On 19 Mar 2026, at 10:49, Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:
> 
> On 2026-03-19 00:28:19 [+0200], Nadav Amit wrote:
>>>> diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
>>>> index 5a3cdc439e38d..4a7f40c7f939a 100644
>>>> --- a/arch/x86/include/asm/tlbflush.h
>>>> +++ b/arch/x86/include/asm/tlbflush.h
>>>> @@ -227,7 +227,7 @@ struct flush_tlb_info {
>>>> 	u8 stride_shift;
>>>> 	u8 freed_tables;
>>>> 	u8 trim_cpumask;
>>>> -};
>>>> +} __aligned(SMP_CACHE_BYTES);
>>>> 
>>> 
>>> This would work, but you are likely to encounter the same problem PeterZ hit
>>> when I did something similar: in some configurations SMP_CACHE_BYTES is very
>>> large.
> 
> So if capping to 64 is an option does not break performance where one
> would complain, why not. But you did it initially so…

No, I only aligned to SMP_CACHE_BYTES. Then I encountered the described problem,
hence I propose to cap it.

> 
>> Further thinking about it and looking at the rest of the series: wouldn’t it be
>> simpler to put flush_tlb_info and smp_call_function_many_cond()’s
>> cpumask on thread_struct? It would allow to support CONFIG_CPUMASK_OFFSTACK=y
>> case by preallocating cpumask on thread creation.
>> 
>> I’m not sure whether the memory overhead is prohibitive.
> 
> My Debian config has CONFIG_NR_CPUS=8192 which would add 1KiB if we add
> a plain cpumask_t. The allocation based on cpumask_size() would add just
> 8 bytes/ pointer to the struct which should be fine. We could even stash
> the mask in the pointer for CPUs <= 64 on 64bit.
> On RT it would be desired to have the memory and not to fallback to
> waiting with disabled preemption if the allocation fails.
> 
> The flush_tlb_info are around 40 bytes + alignment. Maybe we could try
> stack first if this gets us to acceptable performance.

I know it 1KB sounds a lot, but considering fpu size and other per-task
structures, it’s already almost 6KB. Just raising the option.
Re: [PATCH v3 10/12] x86/mm: Move flush_tlb_info back to the stack
Posted by Sebastian Andrzej Siewior 2 weeks, 4 days ago
On 2026-03-19 12:37:27 [+0200], Nadav Amit wrote:
> >> Further thinking about it and looking at the rest of the series: wouldn’t it be
> >> simpler to put flush_tlb_info and smp_call_function_many_cond()’s
> >> cpumask on thread_struct? It would allow to support CONFIG_CPUMASK_OFFSTACK=y
> >> case by preallocating cpumask on thread creation.
> >> 
> >> I’m not sure whether the memory overhead is prohibitive.
> > 
> > My Debian config has CONFIG_NR_CPUS=8192 which would add 1KiB if we add
> > a plain cpumask_t. The allocation based on cpumask_size() would add just
> > 8 bytes/ pointer to the struct which should be fine. We could even stash
> > the mask in the pointer for CPUs <= 64 on 64bit.
> > On RT it would be desired to have the memory and not to fallback to
> > waiting with disabled preemption if the allocation fails.
> > 
> > The flush_tlb_info are around 40 bytes + alignment. Maybe we could try
> > stack first if this gets us to acceptable performance.
> 
> I know it 1KB sounds a lot, but considering fpu size and other per-task
> structures, it’s already almost 6KB. Just raising the option.

Sure. We might try allocating just the cpumask while allocating the
task_struct so we won't do the whole 1KiB but just what is required and
we might avoid allocation if it fits in the pointer size.

Sebastian
Re: [PATCH v3 10/12] x86/mm: Move flush_tlb_info back to the stack
Posted by Chuyi Zhou 2 weeks, 4 days ago
Hi Sebastian,

On 2026-03-19 6:58 p.m., Sebastian Andrzej Siewior wrote:
> On 2026-03-19 12:37:27 [+0200], Nadav Amit wrote:
>>>> Further thinking about it and looking at the rest of the series: wouldn’t it be
>>>> simpler to put flush_tlb_info and smp_call_function_many_cond()’s
>>>> cpumask on thread_struct? It would allow to support CONFIG_CPUMASK_OFFSTACK=y
>>>> case by preallocating cpumask on thread creation.
>>>>
>>>> I’m not sure whether the memory overhead is prohibitive.
>>>
>>> My Debian config has CONFIG_NR_CPUS=8192 which would add 1KiB if we add
>>> a plain cpumask_t. The allocation based on cpumask_size() would add just
>>> 8 bytes/ pointer to the struct which should be fine. We could even stash
>>> the mask in the pointer for CPUs <= 64 on 64bit.
>>> On RT it would be desired to have the memory and not to fallback to
>>> waiting with disabled preemption if the allocation fails.
>>>
>>> The flush_tlb_info are around 40 bytes + alignment. Maybe we could try
>>> stack first if this gets us to acceptable performance.
>>
>> I know it 1KB sounds a lot, but considering fpu size and other per-task
>> structures, it’s already almost 6KB. Just raising the option.
> 
> Sure. We might try allocating just the cpumask while allocating the
> task_struct so we won't do the whole 1KiB but just what is required and
> we might avoid allocation if it fits in the pointer size.
> 
> Sebastian

IIUC, you mean something like the following?


diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5a5d3dbc9cdf..4222114cd34c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -927,6 +927,8 @@ struct task_struct {
         unsigned short                  migration_disabled;
         unsigned short                  migration_flags;

+       cpumask_t                       *ipi_cpus;
+
  #ifdef CONFIG_PREEMPT_RCU
         int                             rcu_read_lock_nesting;
         union rcu_special               rcu_read_unlock_special;
diff --git a/kernel/smp.c b/kernel/smp.c
index 47c3b057f57f..f2bbd9b87f7f 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -112,6 +112,29 @@ void __init call_function_init(void)
         smpcfd_prepare_cpu(smp_processor_id());
  }

+static inline bool can_inline_cpumask(void)
+{
+       return cpumask_size() <= sizeof(cpumask_t *);
+}
+
+void alloc_ipi_cpumask(struct task_struct *task)
+{
+       if (can_inline_cpumask())
+               return;
+       /*
+        * Fallback to the default smp_call_function_many_cond
+        * logic if the allocation fails.
+        */
+       task->ipi_cpus = kmalloc(cpumask_size(), GFP_KERNEL);
+}
+
+static inline cpumask_t *get_local_cpumask(struct task_struct *cur)
+{
+       if (can_inline_cpumask())
+               return (cpumask_t *)&cur->ipi_cpus;
+       return cur->ipi_cpus;
+}
+
  static __always_inline void
  send_call_function_single_ipi(int cpu)
  {
Re: [PATCH v3 10/12] x86/mm: Move flush_tlb_info back to the stack
Posted by Sebastian Andrzej Siewior 2 weeks, 4 days ago
On 2026-03-19 21:41:28 [+0800], Chuyi Zhou wrote:
> Hi Sebastian,
Hi,

> IIUC, you mean something like the following?

basically yes. Later you might want to add a static_branch to
can_inline_cpumask() instead of the check you have now. The value of
cpumask_size() should be assigned once the number of possible CPUs is
known and never change.

> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 5a5d3dbc9cdf..4222114cd34c 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -927,6 +927,8 @@ struct task_struct {
>          unsigned short                  migration_disabled;
>          unsigned short                  migration_flags;
> 
> +       cpumask_t                       *ipi_cpus;

This is probably not the best spot to stuff it. You will have a 4-byte
gap there. After user_cpus_ptr would be okay from alignment, but I am not
sure if something else shifts too much.

> +
>   #ifdef CONFIG_PREEMPT_RCU
>          int                             rcu_read_lock_nesting;
>          union rcu_special               rcu_read_unlock_special;

Sebastian