[PATCH RESEND v3 1/2] mm/tlb: skip redundant IPI when TLB flush already synchronized

Lance Yang posted 2 patches 1 month ago
[PATCH RESEND v3 1/2] mm/tlb: skip redundant IPI when TLB flush already synchronized
Posted by Lance Yang 1 month ago
From: Lance Yang <lance.yang@linux.dev>

When unsharing hugetlb PMD page tables, we currently send two IPIs: one
for TLB invalidation, and another to synchronize with concurrent GUP-fast
walkers via tlb_remove_table_sync_one().

However, if the TLB flush already sent IPIs to all CPUs (when freed_tables
or unshared_tables is true), the second IPI is redundant. GUP-fast runs
with IRQs disabled, so when the TLB flush IPI completes, any concurrent
GUP-fast must have finished.

To avoid the redundant IPI, we add a flag to mmu_gather to track whether
the TLB flush sent IPIs. We pass the mmu_gather pointer through the TLB
flush path via flush_tlb_info, so native_flush_tlb_multi() can set the
flag when it sends IPIs for freed_tables. We also set the flag for
local-only flushes, since disabling IRQs provides the same guarantee.

Suggested-by: David Hildenbrand (Red Hat) <david@kernel.org>
Suggested-by: Dave Hansen <dave.hansen@intel.com>
Signed-off-by: Lance Yang <lance.yang@linux.dev>
---
 arch/x86/include/asm/tlb.h      |  3 ++-
 arch/x86/include/asm/tlbflush.h |  9 +++++----
 arch/x86/kernel/alternative.c   |  2 +-
 arch/x86/kernel/ldt.c           |  2 +-
 arch/x86/mm/tlb.c               | 22 ++++++++++++++++------
 include/asm-generic/tlb.h       | 14 +++++++++-----
 mm/mmu_gather.c                 | 26 +++++++++++++++++++-------
 7 files changed, 53 insertions(+), 25 deletions(-)

diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index 866ea78ba156..c5950a92058c 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -20,7 +20,8 @@ static inline void tlb_flush(struct mmu_gather *tlb)
 		end = tlb->end;
 	}
 
-	flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables);
+	flush_tlb_mm_range(tlb->mm, start, end, stride_shift,
+			   tlb->freed_tables || tlb->unshared_tables, tlb);
 }
 
 static inline void invlpg(unsigned long addr)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 00daedfefc1b..83c260c88b80 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -220,6 +220,7 @@ struct flush_tlb_info {
 	 *   will be zero.
 	 */
 	struct mm_struct	*mm;
+	struct mmu_gather	*tlb;
 	unsigned long		start;
 	unsigned long		end;
 	u64			new_tlb_gen;
@@ -305,23 +306,23 @@ static inline bool mm_in_asid_transition(struct mm_struct *mm) { return false; }
 #endif
 
 #define flush_tlb_mm(mm)						\
-		flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL, true)
+		flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL, true, NULL)
 
 #define flush_tlb_range(vma, start, end)				\
 	flush_tlb_mm_range((vma)->vm_mm, start, end,			\
 			   ((vma)->vm_flags & VM_HUGETLB)		\
 				? huge_page_shift(hstate_vma(vma))	\
-				: PAGE_SHIFT, true)
+				: PAGE_SHIFT, true, NULL)
 
 extern void flush_tlb_all(void);
 extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 				unsigned long end, unsigned int stride_shift,
-				bool freed_tables);
+				bool freed_tables, struct mmu_gather *tlb);
 extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
 
 static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
 {
-	flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false);
+	flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false, NULL);
 }
 
 static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm)
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 28518371d8bf..006f3705b616 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -2572,7 +2572,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l
 	 */
 	flush_tlb_mm_range(text_poke_mm, text_poke_mm_addr, text_poke_mm_addr +
 			   (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
-			   PAGE_SHIFT, false);
+			   PAGE_SHIFT, false, NULL);
 
 	if (func == text_poke_memcpy) {
 		/*
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 0f19ef355f5f..d8494706fec5 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -374,7 +374,7 @@ static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt)
 	}
 
 	va = (unsigned long)ldt_slot_va(ldt->slot);
-	flush_tlb_mm_range(mm, va, va + nr_pages * PAGE_SIZE, PAGE_SHIFT, false);
+	flush_tlb_mm_range(mm, va, va + nr_pages * PAGE_SIZE, PAGE_SHIFT, false, NULL);
 }
 
 #else /* !CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index f5b93e01e347..be45976c0d16 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1374,6 +1374,9 @@ STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask,
 	else
 		on_each_cpu_cond_mask(should_flush_tlb, flush_tlb_func,
 				(void *)info, 1, cpumask);
+
+	if (info->freed_tables && info->tlb)
+		info->tlb->tlb_flush_sent_ipi = true;
 }
 
 void flush_tlb_multi(const struct cpumask *cpumask,
@@ -1403,7 +1406,7 @@ static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
 static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
 			unsigned long start, unsigned long end,
 			unsigned int stride_shift, bool freed_tables,
-			u64 new_tlb_gen)
+			u64 new_tlb_gen, struct mmu_gather *tlb)
 {
 	struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info);
 
@@ -1433,6 +1436,7 @@ static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
 	info->new_tlb_gen	= new_tlb_gen;
 	info->initiating_cpu	= smp_processor_id();
 	info->trim_cpumask	= 0;
+	info->tlb		= tlb;
 
 	return info;
 }
@@ -1447,8 +1451,8 @@ static void put_flush_tlb_info(void)
 }
 
 void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
-				unsigned long end, unsigned int stride_shift,
-				bool freed_tables)
+			unsigned long end, unsigned int stride_shift,
+			bool freed_tables, struct mmu_gather *tlb)
 {
 	struct flush_tlb_info *info;
 	int cpu = get_cpu();
@@ -1458,7 +1462,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 	new_tlb_gen = inc_mm_tlb_gen(mm);
 
 	info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables,
-				  new_tlb_gen);
+				  new_tlb_gen, tlb);
 
 	/*
 	 * flush_tlb_multi() is not optimized for the common case in which only
@@ -1476,6 +1480,12 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 		local_irq_disable();
 		flush_tlb_func(info);
 		local_irq_enable();
+		/*
+		 * Only current CPU uses this mm, so we can treat this as
+		 * having synchronized with GUP-fast. No sync IPI needed.
+		 */
+		if (tlb && freed_tables)
+			tlb->tlb_flush_sent_ipi = true;
 	}
 
 	put_flush_tlb_info();
@@ -1553,7 +1563,7 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 	guard(preempt)();
 
 	info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false,
-				  TLB_GENERATION_INVALID);
+				  TLB_GENERATION_INVALID, NULL);
 
 	if (info->end == TLB_FLUSH_ALL)
 		kernel_tlb_flush_all(info);
@@ -1733,7 +1743,7 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 	int cpu = get_cpu();
 
 	info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false,
-				  TLB_GENERATION_INVALID);
+				  TLB_GENERATION_INVALID, NULL);
 	/*
 	 * flush_tlb_multi() is not optimized for the common case in which only
 	 * a local TLB flush is needed. Optimize this use-case by calling
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 3975f7d11553..cbbe008590ee 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -249,6 +249,7 @@ static inline void tlb_remove_table(struct mmu_gather *tlb, void *table)
 #define tlb_needs_table_invalidate() (true)
 #endif
 
+void tlb_gather_remove_table_sync_one(struct mmu_gather *tlb);
 void tlb_remove_table_sync_one(void);
 
 #else
@@ -257,6 +258,7 @@ void tlb_remove_table_sync_one(void);
 #error tlb_needs_table_invalidate() requires MMU_GATHER_RCU_TABLE_FREE
 #endif
 
+static inline void tlb_gather_remove_table_sync_one(struct mmu_gather *tlb) { }
 static inline void tlb_remove_table_sync_one(void) { }
 
 #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */
@@ -378,6 +380,12 @@ struct mmu_gather {
 	 */
 	unsigned int		fully_unshared_tables : 1;
 
+	/*
+	 * Did the TLB flush for freed/unshared tables send IPIs to all CPUs?
+	 * If true, we can skip the redundant IPI in tlb_remove_table_sync_one().
+	 */
+	unsigned int		tlb_flush_sent_ipi : 1;
+
 	unsigned int		batch_count;
 
 #ifndef CONFIG_MMU_GATHER_NO_GATHER
@@ -833,13 +841,9 @@ static inline void tlb_flush_unshared_tables(struct mmu_gather *tlb)
 	 *
 	 * We only perform this when we are the last sharer of a page table,
 	 * as the IPI will reach all CPUs: any GUP-fast.
-	 *
-	 * Note that on configs where tlb_remove_table_sync_one() is a NOP,
-	 * the expectation is that the tlb_flush_mmu_tlbonly() would have issued
-	 * required IPIs already for us.
 	 */
 	if (tlb->fully_unshared_tables) {
-		tlb_remove_table_sync_one();
+		tlb_gather_remove_table_sync_one(tlb);
 		tlb->fully_unshared_tables = false;
 	}
 }
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 2faa23d7f8d4..da36de52b281 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -273,8 +273,14 @@ static void tlb_remove_table_smp_sync(void *arg)
 	/* Simply deliver the interrupt */
 }
 
-void tlb_remove_table_sync_one(void)
+void tlb_gather_remove_table_sync_one(struct mmu_gather *tlb)
 {
+	/* Skip the IPI if the TLB flush already synchronized with other CPUs */
+	if (tlb && tlb->tlb_flush_sent_ipi) {
+		tlb->tlb_flush_sent_ipi = false;
+		return;
+	}
+
 	/*
 	 * This isn't an RCU grace period and hence the page-tables cannot be
 	 * assumed to be actually RCU-freed.
@@ -285,6 +291,11 @@ void tlb_remove_table_sync_one(void)
 	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
 }
 
+void tlb_remove_table_sync_one(void)
+{
+	tlb_gather_remove_table_sync_one(NULL);
+}
+
 static void tlb_remove_table_rcu(struct rcu_head *head)
 {
 	__tlb_remove_table_free(container_of(head, struct mmu_table_batch, rcu));
@@ -328,7 +339,7 @@ static inline void __tlb_remove_table_one_rcu(struct rcu_head *head)
 	__tlb_remove_table(ptdesc);
 }
 
-static inline void __tlb_remove_table_one(void *table)
+static inline void __tlb_remove_table_one(void *table, struct mmu_gather *tlb)
 {
 	struct ptdesc *ptdesc;
 
@@ -336,16 +347,16 @@ static inline void __tlb_remove_table_one(void *table)
 	call_rcu(&ptdesc->pt_rcu_head, __tlb_remove_table_one_rcu);
 }
 #else
-static inline void __tlb_remove_table_one(void *table)
+static inline void __tlb_remove_table_one(void *table, struct mmu_gather *tlb)
 {
-	tlb_remove_table_sync_one();
+	tlb_gather_remove_table_sync_one(tlb);
 	__tlb_remove_table(table);
 }
 #endif /* CONFIG_PT_RECLAIM */
 
-static void tlb_remove_table_one(void *table)
+static void tlb_remove_table_one(void *table, struct mmu_gather *tlb)
 {
-	__tlb_remove_table_one(table);
+	__tlb_remove_table_one(table, tlb);
 }
 
 static void tlb_table_flush(struct mmu_gather *tlb)
@@ -367,7 +378,7 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
 		*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT);
 		if (*batch == NULL) {
 			tlb_table_invalidate(tlb);
-			tlb_remove_table_one(table);
+			tlb_remove_table_one(table, tlb);
 			return;
 		}
 		(*batch)->nr = 0;
@@ -427,6 +438,7 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
 	tlb->vma_pfn = 0;
 
 	tlb->fully_unshared_tables = 0;
+	tlb->tlb_flush_sent_ipi = 0;
 	__tlb_reset_range(tlb);
 	inc_tlb_flush_pending(tlb->mm);
 }
-- 
2.49.0
Re: [PATCH RESEND v3 1/2] mm/tlb: skip redundant IPI when TLB flush already synchronized
Posted by Dave Hansen 1 month ago
On 1/6/26 04:03, Lance Yang wrote:
> From: Lance Yang <lance.yang@linux.dev>
> 
> When unsharing hugetlb PMD page tables, we currently send two IPIs: one
> for TLB invalidation, and another to synchronize with concurrent GUP-fast
> walkers via tlb_remove_table_sync_one().
> 
> However, if the TLB flush already sent IPIs to all CPUs (when freed_tables
> or unshared_tables is true), the second IPI is redundant. GUP-fast runs
> with IRQs disabled, so when the TLB flush IPI completes, any concurrent
> GUP-fast must have finished.
> 
> To avoid the redundant IPI, we add a flag to mmu_gather to track whether
> the TLB flush sent IPIs. We pass the mmu_gather pointer through the TLB
> flush path via flush_tlb_info, so native_flush_tlb_multi() can set the
> flag when it sends IPIs for freed_tables. We also set the flag for
> local-only flushes, since disabling IRQs provides the same guarantee.

The lack of imperative voice is killing me. :)

> diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
> index 866ea78ba156..c5950a92058c 100644
> --- a/arch/x86/include/asm/tlb.h
> +++ b/arch/x86/include/asm/tlb.h
> @@ -20,7 +20,8 @@ static inline void tlb_flush(struct mmu_gather *tlb)
>  		end = tlb->end;
>  	}
>  
> -	flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables);
> +	flush_tlb_mm_range(tlb->mm, start, end, stride_shift,
> +			   tlb->freed_tables || tlb->unshared_tables, tlb);
>  }

I think this hunk sums up v3 pretty well. Where there was a single boolean, now there are two. To add to that, the structure that contains the booleans is itself being passed in. The boolean is still named 'freed_tables', and is going from:

	tlb->freed_tables

which is pretty obviously correct to:

	tlb->freed_tables || tlb->unshared_tables

which is _far_ from obviously correct.

I'm at a loss for why the patch wouldn't just do this:

-	flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables);
+	flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb);

I suspect these were sent out in a bit of haste, which isn't the first time I've gotten that feeling with this series.

Could we slow down, please?
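
To be concrete about the shape I mean (untested, and the callee side
only; flush_tlb_mm() and friends have no mmu_gather, so they would still
need some way to ask for the freed_tables behavior):

	void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
				unsigned long end, unsigned int stride_shift,
				struct mmu_gather *tlb)
	{
		/* Derive the table state here, not at every call site. */
		bool freed_tables = tlb && (tlb->freed_tables ||
					    tlb->unshared_tables);
		...
	}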

>  static inline void invlpg(unsigned long addr)
> diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
> index 00daedfefc1b..83c260c88b80 100644
> --- a/arch/x86/include/asm/tlbflush.h
> +++ b/arch/x86/include/asm/tlbflush.h
> @@ -220,6 +220,7 @@ struct flush_tlb_info {
>  	 *   will be zero.
>  	 */
>  	struct mm_struct	*mm;
> +	struct mmu_gather	*tlb;
>  	unsigned long		start;
>  	unsigned long		end;
>  	u64			new_tlb_gen;

This also gives me pause.

There is a *lot* of redundant information between 'struct mmu_gather' and 'struct flush_tlb_info'. There needs to at least be a description of what the relationship is and how these relate to each other. I would have naively thought that the right move here would be to pull the mmu_gather data out at one discrete time rather than store a pointer to it.
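
Untested, just to show the "snapshot" direction (the extra field name is
made up):

	struct flush_tlb_info {
		struct mm_struct	*mm;
		unsigned long		start;
		unsigned long		end;
		u64			new_tlb_gen;
		/* copied out of the mmu_gather when the flush is set up */
		unsigned int		table_flush	: 1;
		...
	};

The wrinkle is that "did this flush send IPIs" then has to be reported
back some other way (a return value, say) instead of being written
through a stored pointer.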

What I see here is, I suspect, the most expedient way to do it. I'd _certainly_ have done this myself if I was just hacking something together to play with as quickly as possible.

So, in the end, I don't hate the approach here (yet). But it is almost impossible to evaluate it because the series is taking some rather egregious shortcuts and is lacking any real semblance of a refactoring effort.
Re: [PATCH RESEND v3 1/2] mm/tlb: skip redundant IPI when TLB flush already synchronized
Posted by Lance Yang 1 month ago

On 2026/1/7 00:24, Dave Hansen wrote:
> On 1/6/26 04:03, Lance Yang wrote:
>> From: Lance Yang <lance.yang@linux.dev>
>>
>> When unsharing hugetlb PMD page tables, we currently send two IPIs: one
>> for TLB invalidation, and another to synchronize with concurrent GUP-fast
>> walkers via tlb_remove_table_sync_one().
>>
>> However, if the TLB flush already sent IPIs to all CPUs (when freed_tables
>> or unshared_tables is true), the second IPI is redundant. GUP-fast runs
>> with IRQs disabled, so when the TLB flush IPI completes, any concurrent
>> GUP-fast must have finished.
>>
>> To avoid the redundant IPI, we add a flag to mmu_gather to track whether
>> the TLB flush sent IPIs. We pass the mmu_gather pointer through the TLB
>> flush path via flush_tlb_info, so native_flush_tlb_multi() can set the
>> flag when it sends IPIs for freed_tables. We also set the flag for
>> local-only flushes, since disabling IRQs provides the same guarantee.
> 
> The lack of imperative voice is killing me. :)

Oops.

> 
>> diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
>> index 866ea78ba156..c5950a92058c 100644
>> --- a/arch/x86/include/asm/tlb.h
>> +++ b/arch/x86/include/asm/tlb.h
>> @@ -20,7 +20,8 @@ static inline void tlb_flush(struct mmu_gather *tlb)
>>   		end = tlb->end;
>>   	}
>>   
>> -	flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables);
>> +	flush_tlb_mm_range(tlb->mm, start, end, stride_shift,
>> +			   tlb->freed_tables || tlb->unshared_tables, tlb);
>>   }
> 
> I think this hunk sums up v3 pretty well. Where there was a single boolean, now there are two. To add to that, the structure that contains the booleans is itself being passed in. The boolean is still named 'freed_tables', and is going from:
> 
> 	tlb->freed_tables
> 
> which is pretty obviously correct to:
> 
> 	tlb->freed_tables || tlb->unshared_tables
> 
> which is _far_ from obviously correct.
> 
> I'm at a loss for why the patch wouldn't just do this:
> 
> -	flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables);
> +	flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb);
> 
> I suspect these were sent out in a bit of haste, which isn't the first time I've gotten that feeling with this series.
> 
> Could we slow down, please?

Sorry, I went too fast ...

> 
>>   static inline void invlpg(unsigned long addr)
>> diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
>> index 00daedfefc1b..83c260c88b80 100644
>> --- a/arch/x86/include/asm/tlbflush.h
>> +++ b/arch/x86/include/asm/tlbflush.h
>> @@ -220,6 +220,7 @@ struct flush_tlb_info {
>>   	 *   will be zero.
>>   	 */
>>   	struct mm_struct	*mm;
>> +	struct mmu_gather	*tlb;
>>   	unsigned long		start;
>>   	unsigned long		end;
>>   	u64			new_tlb_gen;
> 
> This also gives me pause.
> 
> There is a *lot* of redundant information between 'struct mmu_gather' and 'struct flush_tlb_info'. There needs to at least be a description of what the relationship is and how these relate to each other. I would have naively thought that the right move here would be to pull the mmu_gather data out at one discrete time rather than store a pointer to it.
> 
> What I see here is, I suspect, the most expedient way to do it. I'd _certainly_ have done this myself if I was just hacking something together to play with as quickly as possible.
> 
> So, in the end, I don't hate the approach here (yet). But it is almost impossible to evaluate it because the series is taking some rather egregious shortcuts and is lacking any real semblance of a refactoring effort.

The flag lifetime issue David pointed out is real, and you're right
about the messy parameters :)

And, yeah, I need to think more about those. Maybe v3 can be fixed, or maybe
v2 is actually sufficient - it's conservative but safe (no false positives).

Will take more time, thanks!
Re: [PATCH RESEND v3 1/2] mm/tlb: skip redundant IPI when TLB flush already synchronized
Posted by David Hildenbrand (Red Hat) 1 month ago
>   
>   static void tlb_table_flush(struct mmu_gather *tlb)
> @@ -367,7 +378,7 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
>   		*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT);
>   		if (*batch == NULL) {
>   			tlb_table_invalidate(tlb);
> -			tlb_remove_table_one(table);
> +			tlb_remove_table_one(table, tlb);
>   			return;
>   		}
>   		(*batch)->nr = 0;
> @@ -427,6 +438,7 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
>   	tlb->vma_pfn = 0;
>   
>   	tlb->fully_unshared_tables = 0;
> +	tlb->tlb_flush_sent_ipi = 0;
>   	__tlb_reset_range(tlb);
>   	inc_tlb_flush_pending(tlb->mm);
>   }

But when would we have to reset tlb->tlb_flush_sent_ipi = 0 later? 
That's where it gets tricky. Just imagine the MMU gather gets reused later.

Also,

	+	if (info->freed_tables && info->tlb)
	+		info->tlb->tlb_flush_sent_ipi = true;

in native_flush_tlb_multi() misses the fact that we have different 
flushing types for removed/unshared tables vs. other flushes.

So this approach certainly gets more complicated and error-prone.

tlb_table_flush_implies_ipi_broadcast() was clearer in that regard: if 
you flushed the TLB after removing/unsharing tables, the IPI for 
handling page tables can be skipped. It's on the code flow to ensure that.
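
Roughly, at the call site (untested, reusing the v2 name):

	if (tlb->fully_unshared_tables) {
		if (!tlb_table_flush_implies_ipi_broadcast())
			tlb_remove_table_sync_one();
		tlb->fully_unshared_tables = false;
	}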

What could work is tracking "tlb_table_flush_sent_ipi" really when we 
are flushing the TLB for removed/unshared tables, and maybe resetting it 
... I don't know when from the top of my head.

v2 was simpler IMHO.

-- 
Cheers

David
Re: [PATCH RESEND v3 1/2] mm/tlb: skip redundant IPI when TLB flush already synchronized
Posted by Lance Yang 1 month ago

On 2026/1/6 23:19, David Hildenbrand (Red Hat) wrote:
>>   static void tlb_table_flush(struct mmu_gather *tlb)
>> @@ -367,7 +378,7 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
>>           *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT);
>>           if (*batch == NULL) {
>>               tlb_table_invalidate(tlb);
>> -            tlb_remove_table_one(table);
>> +            tlb_remove_table_one(table, tlb);
>>               return;
>>           }
>>           (*batch)->nr = 0;
>> @@ -427,6 +438,7 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
>>       tlb->vma_pfn = 0;
>>       tlb->fully_unshared_tables = 0;
>> +    tlb->tlb_flush_sent_ipi = 0;
>>       __tlb_reset_range(tlb);
>>       inc_tlb_flush_pending(tlb->mm);
>>   }
> 
> But when would we have to reset tlb->tlb_flush_sent_ipi = 0 later? 
> That's where it gets tricky. Just imagine the MMU gather gets reused later.
> 
> Also,
> 
>      +    if (info->freed_tables && info->tlb)
>      +        info->tlb->tlb_flush_sent_ipi = true;
> 
> in native_flush_tlb_multi() misses the fact that we have different 
> flushing types for removed/unshared tables vs. other flushes.
> 
> So this approach certainly gets more complicated and error-prone.

Agreed. Tracking the flag through the mmu_gather lifecycle does get
more complicated and error-prone ...

> 
> tlb_table_flush_implies_ipi_broadcast() was clearer in that regard: if 
> you flushed the TLB after removing/unsharing tables, the IPI for 
> handling page tables can be skipped. It's on the code flow to ensure that.

v2 was definitely simpler.

> 
> What could work is tracking "tlb_table_flush_sent_ipi" really when we 
> are flushing the TLB for removed/unshared tables, and maybe resetting 
> it ... I don't know when from the top of my head.

Not sure what's the best way forward here :(

> 
> v2 was simpler IMHO.

The main concern Dave raised was that with PV hypercalls or when
INVLPGB is available, we can't tell from a static check whether IPIs
were actually sent.

Maybe that's acceptable, or we could find a simpler way to track that ...

Open to suggestions!
Re: [PATCH RESEND v3 1/2] mm/tlb: skip redundant IPI when TLB flush already synchronized
Posted by David Hildenbrand (Red Hat) 4 weeks, 1 day ago
>> What could work is tracking "tlb_table_flush_sent_ipi" really when we
>> are flushing the TLB for removed/unshared tables, and maybe resetting
>> it ... I don't know when from the top of my head.
> 
> Not sure what's the best way forward here :(
> 
>>
>> v2 was simpler IMHO.
> 
> The main concern Dave raised was that with PV hypercalls or when
> INVLPGB is available, we can't tell from a static check whether IPIs
> were actually sent.

Why can't we set the boolean at runtime when initializing the pv_ops 
structure, when we are sure that it is allowed?
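
Untested sketch of what I mean (the helper and its placement are made
up, and the INVLPGB check is an assumption; CONFIG_PARAVIRT case shown):

	/* Decided once at boot, not guessed per flush. */
	static bool tlb_table_flush_broadcasts_ipis __ro_after_init;

	void __init tlb_flush_init_ipi_policy(void)
	{
		/* Native SMP flush path and no INVLPGB broadcast invalidation. */
		if (pv_ops.mmu.flush_tlb_multi == native_flush_tlb_multi &&
		    !cpu_feature_enabled(X86_FEATURE_INVLPGB))
			tlb_table_flush_broadcasts_ipis = true;
	}

	/* The v2-style check then becomes a runtime answer. */
	bool tlb_table_flush_implies_ipi_broadcast(void)
	{
		return tlb_table_flush_broadcasts_ipis;
	}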

-- 
Cheers

David
Re: [PATCH RESEND v3 1/2] mm/tlb: skip redundant IPI when TLB flush already synchronized
Posted by Lance Yang 4 weeks, 1 day ago

On 2026/1/9 22:13, David Hildenbrand (Red Hat) wrote:
> 
>>> What could work is tracking "tlb_table_flush_sent_ipi" really when we
>>> are flushing the TLB for removed/unshared tables, and maybe resetting
>>> it ... I don't know when from the top of my head.
>>
>> Not sure what's the best way forward here :(
>>
>>>
>>> v2 was simpler IMHO.
>>
>> The main concern Dave raised was that with PV hypercalls or when
>> INVLPGB is available, we can't tell from a static check whether IPIs
>> were actually sent.
> 
> Why can't we set the boolean at runtime when initializing the pv_ops 
> structure, when we are sure that it is allowed?

Yes, thanks, that sounds like a reasonable trade-off :)

As you mentioned:

"this lifetime stuff in core-mm ends up getting more complicated than
v2 without a clear benefit".

I totally agree that v3 is too complicated :(

But Dave's concern about v2 was that we can't accurately tell whether
IPIs were actually sent in PV environments or with INVLPGB, which
misses optimization opportunities. The INVLPGB+no_global_asid case
also sends IPIs during TLB flush.

Anyway, yeah, I'd rather start with a simple approach, even if it's
not perfect. We can always improve it later ;)

Any ideas on how to move forward?
Re: [PATCH RESEND v3 1/2] mm/tlb: skip redundant IPI when TLB flush already synchronized
Posted by David Hildenbrand (Red Hat) 4 weeks, 1 day ago
On 1/9/26 16:30, Lance Yang wrote:
> 
> 
> On 2026/1/9 22:13, David Hildenbrand (Red Hat) wrote:
>>
>>>> What could work is tracking "tlb_table_flush_sent_ipi" really when we
>>>> are flushing the TLB for removed/unshared tables, and maybe resetting
>>>> it ... I don't know when from the top of my head.
>>>
>>> Not sure what's the best way forward here :(
>>>
>>>>
>>>> v2 was simpler IMHO.
>>>
>>> The main concern Dave raised was that with PV hypercalls or when
>>> INVLPGB is available, we can't tell from a static check whether IPIs
>>> were actually sent.
>>
>> Why can't we set the boolean at runtime when initializing the pv_ops
>> structure, when we are sure that it is allowed?
> 
> Yes, thanks, that sounds like a reasonable trade-off :)
> 
> As you mentioned:
> 
> "this lifetime stuff in core-mm ends up getting more complicated than
> v2 without a clear benefit".
> 
> I totally agree that v3 is too complicated :(
> 
> But Dave's concern about v2 was that we can't accurately tell whether
> IPIs were actually sent in PV environments or with INVLPGB, which
> misses optimization opportunities. The INVLPGB+no_global_asid case
> also sends IPIs during TLB flush.
> 
> Anyway, yeah, I'd rather start with a simple approach, even if it's
> not perfect. We can always improve it later ;)
> 
> Any ideas on how to move forward?

I'd hope Dave can comment :)

In general, I saw the whole thing as a two step process:

1) Avoid IPIs completely when the TLB flush sent them. We can achieve
    that through v2 or v3, one way or the other, I don't particularly
    care as long as it is clean and simple.

2) For other configs/arch, send IPIs only to CPUs that are actually in
    GUP-fast etc. That would resolve some RT headache with broadcast IPIs.


Regarding 2), it obviously only applies to setups where 1) does not 
apply: like x86 with INVLPGB or arm64.

I once had the idea of letting CPUs that enter/exit GUP-fast (and 
similar) indicate in a global cpumask (or per-CPU variables) that 
they are in that context. Then, we can just collect these CPUs and limit 
the IPIs to them (usually, not a lot ...).

The trick here is to not slow down GUP-fast too much. And one person 
(Yair, in an RT context) who played with that was not able to reduce the 
overhead sufficiently.

I guess the options are

a) Per-MM CPU mask we have to update atomically when entering/leaving 
GUP-fast

b) Global mask we have to update atomically when entering/leaving GUP-fast

c) Per-CPU variable we have to update when entering/leaving GUP-fast. 
Interrupts are disabled, so we don't have to worry about reschedule etc.

Maybe someone reading along has other thoughts.
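
For c), roughly (untested; the ordering between marking the CPU and the
page table walk itself is hand-waved here):

	/* Per-CPU marker, written with IRQs already disabled in GUP-fast. */
	static DEFINE_PER_CPU(bool, in_gup_fast);

	static inline void gup_fast_mark_cpu(bool in)
	{
		this_cpu_write(in_gup_fast, in);
	}

	/* IPI only the CPUs that are currently inside GUP-fast. */
	static void tlb_remove_table_sync_gup_fast(void)
	{
		cpumask_var_t mask;
		int cpu;

		if (!zalloc_cpumask_var(&mask, GFP_ATOMIC)) {
			/* Fall back to today's broadcast. */
			smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
			return;
		}

		for_each_online_cpu(cpu)
			if (per_cpu(in_gup_fast, cpu))
				cpumask_set_cpu(cpu, mask);

		on_each_cpu_mask(mask, tlb_remove_table_smp_sync, NULL, 1);
		free_cpumask_var(mask);
	}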

-- 
Cheers

David
Re: [PATCH RESEND v3 1/2] mm/tlb: skip redundant IPI when TLB flush already synchronized
Posted by Lance Yang 1 month ago
Hi David,

On 2026/1/7 00:10, Lance Yang wrote:
[..]
>> What could work is tracking "tlb_table_flush_sent_ipi" really when we 
>> are flushing the TLB for removed/unshared tables, and maybe resetting 
>> it ... I don't know when from the top of my head.
>

Seems like we could fix the flag lifetime issue (the flag being broken
if the MMU gather gets reused) by splitting the flush and reset. This
ensures the flag stays valid between flush and sync.

Now tlb_flush_unshared_tables() does:
   1) __tlb_flush_mmu_tlbonly() - flush only, keeps flags alive
   2) tlb_gather_remove_table_sync_one() - can check the flag
   3) __tlb_reset_range() - reset everything after sync

Something like this:

---8<---
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 3975f7d11553..a95b054dfcca 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -415,6 +415,7 @@ static inline void __tlb_reset_range(struct mmu_gather *tlb)
  	tlb->cleared_puds = 0;
  	tlb->cleared_p4ds = 0;
  	tlb->unshared_tables = 0;
+	tlb->tlb_flush_sent_ipi = 0;
  	/*
  	 * Do not reset mmu_gather::vma_* fields here, we do not
  	 * call into tlb_start_vma() again to set them if there is an
@@ -492,7 +493,7 @@ tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma)
  	tlb->vma_pfn |= !!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP));
  }

-static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
+static inline void __tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
  {
  	/*
  	 * Anything calling __tlb_adjust_range() also sets at least one of
@@ -503,6 +504,11 @@ static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
  		return;

  	tlb_flush(tlb);
+}
+
+static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
+{
+	__tlb_flush_mmu_tlbonly(tlb);
  	__tlb_reset_range(tlb);
  }

@@ -824,7 +830,7 @@ static inline void tlb_flush_unshared_tables(struct mmu_gather *tlb)
  	 * flush the TLB for the unsharer now.
  	 */
  	if (tlb->unshared_tables)
-		tlb_flush_mmu_tlbonly(tlb);
+		__tlb_flush_mmu_tlbonly(tlb);

  	/*
  	 * Similarly, we must make sure that concurrent GUP-fast will not
@@ -834,14 +840,16 @@ static inline void tlb_flush_unshared_tables(struct mmu_gather *tlb)
  	 * We only perform this when we are the last sharer of a page table,
  	 * as the IPI will reach all CPUs: any GUP-fast.
  	 *
-	 * Note that on configs where tlb_remove_table_sync_one() is a NOP,
-	 * the expectation is that the tlb_flush_mmu_tlbonly() would have issued
-	 * required IPIs already for us.
+	 * Use tlb_gather_remove_table_sync_one() instead of
+	 * tlb_remove_table_sync_one() to skip the redundant IPI if the
+	 * TLB flush above already sent one.
  	 */
  	if (tlb->fully_unshared_tables) {
-		tlb_remove_table_sync_one();
+		tlb_gather_remove_table_sync_one(tlb);
  		tlb->fully_unshared_tables = false;
  	}
+
+	__tlb_reset_range(tlb);
  }
  #endif /* CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */
---

For khugepaged, it should be fine - it uses a local mmu_gather that
doesn't get reused. The lifetime is simply:

   tlb_gather_mmu() → flush → sync → tlb_finish_mmu()

Let me know if this addresses your concern :)

[...]
Re: [PATCH RESEND v3 1/2] mm/tlb: skip redundant IPI when TLB flush already synchronized
Posted by David Hildenbrand (Red Hat) 4 weeks, 1 day ago
On 1/7/26 07:37, Lance Yang wrote:
> Hi David,
> 
> On 2026/1/7 00:10, Lance Yang wrote:
> [..]
>>> What could work is tracking "tlb_table_flush_sent_ipi" really when we
>>> are flushing the TLB for removed/unshared tables, and maybe resetting
>>> it ... I don't know when from the top of my head.
>>
> 
> Seems like we could fix the flag lifetime issue (the flag being broken
> if the MMU gather gets reused) by splitting the flush and reset. This
> ensures the flag stays valid between flush and sync.
> 
> Now tlb_flush_unshared_tables() does:
>     1) __tlb_flush_mmu_tlbonly() - flush only, keeps flags alive
>     2) tlb_gather_remove_table_sync_one() - can check the flag
>     3) __tlb_reset_range() - reset everything after sync
> 
> Something like this:
> 
> ---8<---
> diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
> index 3975f7d11553..a95b054dfcca 100644
> --- a/include/asm-generic/tlb.h
> +++ b/include/asm-generic/tlb.h
> @@ -415,6 +415,7 @@ static inline void __tlb_reset_range(struct mmu_gather *tlb)
>    	tlb->cleared_puds = 0;
>    	tlb->cleared_p4ds = 0;
>    	tlb->unshared_tables = 0;
> +	tlb->tlb_flush_sent_ipi = 0;


As raised, the "tlb_flush_sent_ipi" name is confusing when we send to 
different CPUs based on whether we are removing page tables or not.

I think you would really want to track that explicitly as 
"tlb_table_flush_sent_ipi"?

>    	/*
>    	 * Do not reset mmu_gather::vma_* fields here, we do not
>    	 * call into tlb_start_vma() again to set them if there is an
> @@ -492,7 +493,7 @@ tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma)
>    	tlb->vma_pfn |= !!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP));
>    }
> 
> -static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
> +static inline void __tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
>    {
>    	/*
>    	 * Anything calling __tlb_adjust_range() also sets at least one of
> @@ -503,6 +504,11 @@ static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
>    		return;
> 
>    	tlb_flush(tlb);
> +}
> +
> +static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
> +{
> +	__tlb_flush_mmu_tlbonly(tlb);
>    	__tlb_reset_range(tlb);
>    }
> 
> @@ -824,7 +830,7 @@ static inline void tlb_flush_unshared_tables(struct mmu_gather *tlb)
>    	 * flush the TLB for the unsharer now.
>    	 */
>    	if (tlb->unshared_tables)
> -		tlb_flush_mmu_tlbonly(tlb);
> +		__tlb_flush_mmu_tlbonly(tlb);
> 
>    	/*
>    	 * Similarly, we must make sure that concurrent GUP-fast will not
> @@ -834,14 +840,16 @@ static inline void tlb_flush_unshared_tables(struct mmu_gather *tlb)
>    	 * We only perform this when we are the last sharer of a page table,
>    	 * as the IPI will reach all CPUs: any GUP-fast.
>    	 *
> -	 * Note that on configs where tlb_remove_table_sync_one() is a NOP,
> -	 * the expectation is that the tlb_flush_mmu_tlbonly() would have issued
> -	 * required IPIs already for us.
> +	 * Use tlb_gather_remove_table_sync_one() instead of
> +	 * tlb_remove_table_sync_one() to skip the redundant IPI if the
> +	 * TLB flush above already sent one.
>    	 */
>    	if (tlb->fully_unshared_tables) {
> -		tlb_remove_table_sync_one();
> +		tlb_gather_remove_table_sync_one(tlb);
>    		tlb->fully_unshared_tables = false;
>    	}
> +
> +	__tlb_reset_range(tlb);
>    }
>    #endif /* CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */
> ---
> 
> For khugepaged, it should be fine - it uses a local mmu_gather that
> doesn't get reused. The lifetime is simply:
> 
>     tlb_gather_mmu() → flush → sync → tlb_finish_mmu()
> 
> Let me know if this addresses your concern :)

I'll probably have to see the full picture. But this lifetime stuff in 
core-mm ends up getting more complicated than v2 without a clear benefit 
to me (except maybe handling some x86 oddities better ;) )

-- 
Cheers

David