From: Lance Yang <lance.yang@linux.dev>
When unsharing hugetlb PMD page tables, we currently send two IPIs: one
for TLB invalidation, and another to synchronize with concurrent GUP-fast
walkers via tlb_remove_table_sync_one().
However, if the TLB flush already sent IPIs to all CPUs (when freed_tables
or unshared_tables is true), the second IPI is redundant. GUP-fast runs
with IRQs disabled, so when the TLB flush IPI completes, any concurrent
GUP-fast must have finished.
To avoid the redundant IPI, we add a flag to mmu_gather to track whether
the TLB flush sent IPIs. We pass the mmu_gather pointer through the TLB
flush path via flush_tlb_info, so native_flush_tlb_multi() can set the
flag when it sends IPIs for freed_tables. We also set the flag for
local-only flushes, since disabling IRQs provides the same guarantee.
Suggested-by: David Hildenbrand (Red Hat) <david@kernel.org>
Suggested-by: Dave Hansen <dave.hansen@intel.com>
Signed-off-by: Lance Yang <lance.yang@linux.dev>
---
arch/x86/include/asm/tlb.h | 3 ++-
arch/x86/include/asm/tlbflush.h | 9 +++++----
arch/x86/kernel/alternative.c | 2 +-
arch/x86/kernel/ldt.c | 2 +-
arch/x86/mm/tlb.c | 22 ++++++++++++++++------
include/asm-generic/tlb.h | 14 +++++++++-----
mm/mmu_gather.c | 26 +++++++++++++++++++-------
7 files changed, 53 insertions(+), 25 deletions(-)
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index 866ea78ba156..c5950a92058c 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -20,7 +20,8 @@ static inline void tlb_flush(struct mmu_gather *tlb)
end = tlb->end;
}
- flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables);
+ flush_tlb_mm_range(tlb->mm, start, end, stride_shift,
+ tlb->freed_tables || tlb->unshared_tables, tlb);
}
static inline void invlpg(unsigned long addr)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 00daedfefc1b..83c260c88b80 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -220,6 +220,7 @@ struct flush_tlb_info {
* will be zero.
*/
struct mm_struct *mm;
+ struct mmu_gather *tlb;
unsigned long start;
unsigned long end;
u64 new_tlb_gen;
@@ -305,23 +306,23 @@ static inline bool mm_in_asid_transition(struct mm_struct *mm) { return false; }
#endif
#define flush_tlb_mm(mm) \
- flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL, true)
+ flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL, true, NULL)
#define flush_tlb_range(vma, start, end) \
flush_tlb_mm_range((vma)->vm_mm, start, end, \
((vma)->vm_flags & VM_HUGETLB) \
? huge_page_shift(hstate_vma(vma)) \
- : PAGE_SHIFT, true)
+ : PAGE_SHIFT, true, NULL)
extern void flush_tlb_all(void);
extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned int stride_shift,
- bool freed_tables);
+ bool freed_tables, struct mmu_gather *tlb);
extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
{
- flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false);
+ flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false, NULL);
}
static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm)
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 28518371d8bf..006f3705b616 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -2572,7 +2572,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l
*/
flush_tlb_mm_range(text_poke_mm, text_poke_mm_addr, text_poke_mm_addr +
(cross_page_boundary ? 2 : 1) * PAGE_SIZE,
- PAGE_SHIFT, false);
+ PAGE_SHIFT, false, NULL);
if (func == text_poke_memcpy) {
/*
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 0f19ef355f5f..d8494706fec5 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -374,7 +374,7 @@ static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt)
}
va = (unsigned long)ldt_slot_va(ldt->slot);
- flush_tlb_mm_range(mm, va, va + nr_pages * PAGE_SIZE, PAGE_SHIFT, false);
+ flush_tlb_mm_range(mm, va, va + nr_pages * PAGE_SIZE, PAGE_SHIFT, false, NULL);
}
#else /* !CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index f5b93e01e347..be45976c0d16 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1374,6 +1374,9 @@ STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask,
else
on_each_cpu_cond_mask(should_flush_tlb, flush_tlb_func,
(void *)info, 1, cpumask);
+
+ if (info->freed_tables && info->tlb)
+ info->tlb->tlb_flush_sent_ipi = true;
}
void flush_tlb_multi(const struct cpumask *cpumask,
@@ -1403,7 +1406,7 @@ static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
unsigned long start, unsigned long end,
unsigned int stride_shift, bool freed_tables,
- u64 new_tlb_gen)
+ u64 new_tlb_gen, struct mmu_gather *tlb)
{
struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info);
@@ -1433,6 +1436,7 @@ static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
info->new_tlb_gen = new_tlb_gen;
info->initiating_cpu = smp_processor_id();
info->trim_cpumask = 0;
+ info->tlb = tlb;
return info;
}
@@ -1447,8 +1451,8 @@ static void put_flush_tlb_info(void)
}
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
- unsigned long end, unsigned int stride_shift,
- bool freed_tables)
+ unsigned long end, unsigned int stride_shift,
+ bool freed_tables, struct mmu_gather *tlb)
{
struct flush_tlb_info *info;
int cpu = get_cpu();
@@ -1458,7 +1462,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
new_tlb_gen = inc_mm_tlb_gen(mm);
info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables,
- new_tlb_gen);
+ new_tlb_gen, tlb);
/*
* flush_tlb_multi() is not optimized for the common case in which only
@@ -1476,6 +1480,12 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
local_irq_disable();
flush_tlb_func(info);
local_irq_enable();
+ /*
+ * Only current CPU uses this mm, so we can treat this as
+ * having synchronized with GUP-fast. No sync IPI needed.
+ */
+ if (tlb && freed_tables)
+ tlb->tlb_flush_sent_ipi = true;
}
put_flush_tlb_info();
@@ -1553,7 +1563,7 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
guard(preempt)();
info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false,
- TLB_GENERATION_INVALID);
+ TLB_GENERATION_INVALID, NULL);
if (info->end == TLB_FLUSH_ALL)
kernel_tlb_flush_all(info);
@@ -1733,7 +1743,7 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
int cpu = get_cpu();
info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false,
- TLB_GENERATION_INVALID);
+ TLB_GENERATION_INVALID, NULL);
/*
* flush_tlb_multi() is not optimized for the common case in which only
* a local TLB flush is needed. Optimize this use-case by calling
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 3975f7d11553..cbbe008590ee 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -249,6 +249,7 @@ static inline void tlb_remove_table(struct mmu_gather *tlb, void *table)
#define tlb_needs_table_invalidate() (true)
#endif
+void tlb_gather_remove_table_sync_one(struct mmu_gather *tlb);
void tlb_remove_table_sync_one(void);
#else
@@ -257,6 +258,7 @@ void tlb_remove_table_sync_one(void);
#error tlb_needs_table_invalidate() requires MMU_GATHER_RCU_TABLE_FREE
#endif
+static inline void tlb_gather_remove_table_sync_one(struct mmu_gather *tlb) { }
static inline void tlb_remove_table_sync_one(void) { }
#endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */
@@ -378,6 +380,12 @@ struct mmu_gather {
*/
unsigned int fully_unshared_tables : 1;
+ /*
+ * Did the TLB flush for freed/unshared tables send IPIs to all CPUs?
+ * If true, we can skip the redundant IPI in tlb_remove_table_sync_one().
+ */
+ unsigned int tlb_flush_sent_ipi : 1;
+
unsigned int batch_count;
#ifndef CONFIG_MMU_GATHER_NO_GATHER
@@ -833,13 +841,9 @@ static inline void tlb_flush_unshared_tables(struct mmu_gather *tlb)
*
* We only perform this when we are the last sharer of a page table,
* as the IPI will reach all CPUs: any GUP-fast.
- *
- * Note that on configs where tlb_remove_table_sync_one() is a NOP,
- * the expectation is that the tlb_flush_mmu_tlbonly() would have issued
- * required IPIs already for us.
*/
if (tlb->fully_unshared_tables) {
- tlb_remove_table_sync_one();
+ tlb_gather_remove_table_sync_one(tlb);
tlb->fully_unshared_tables = false;
}
}
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 2faa23d7f8d4..da36de52b281 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -273,8 +273,14 @@ static void tlb_remove_table_smp_sync(void *arg)
/* Simply deliver the interrupt */
}
-void tlb_remove_table_sync_one(void)
+void tlb_gather_remove_table_sync_one(struct mmu_gather *tlb)
{
+ /* Skip the IPI if the TLB flush already synchronized with other CPUs */
+ if (tlb && tlb->tlb_flush_sent_ipi) {
+ tlb->tlb_flush_sent_ipi = false;
+ return;
+ }
+
/*
* This isn't an RCU grace period and hence the page-tables cannot be
* assumed to be actually RCU-freed.
@@ -285,6 +291,11 @@ void tlb_remove_table_sync_one(void)
smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
}
+void tlb_remove_table_sync_one(void)
+{
+ tlb_gather_remove_table_sync_one(NULL);
+}
+
static void tlb_remove_table_rcu(struct rcu_head *head)
{
__tlb_remove_table_free(container_of(head, struct mmu_table_batch, rcu));
@@ -328,7 +339,7 @@ static inline void __tlb_remove_table_one_rcu(struct rcu_head *head)
__tlb_remove_table(ptdesc);
}
-static inline void __tlb_remove_table_one(void *table)
+static inline void __tlb_remove_table_one(void *table, struct mmu_gather *tlb)
{
struct ptdesc *ptdesc;
@@ -336,16 +347,16 @@ static inline void __tlb_remove_table_one(void *table)
call_rcu(&ptdesc->pt_rcu_head, __tlb_remove_table_one_rcu);
}
#else
-static inline void __tlb_remove_table_one(void *table)
+static inline void __tlb_remove_table_one(void *table, struct mmu_gather *tlb)
{
- tlb_remove_table_sync_one();
+ tlb_gather_remove_table_sync_one(tlb);
__tlb_remove_table(table);
}
#endif /* CONFIG_PT_RECLAIM */
-static void tlb_remove_table_one(void *table)
+static void tlb_remove_table_one(void *table, struct mmu_gather *tlb)
{
- __tlb_remove_table_one(table);
+ __tlb_remove_table_one(table, tlb);
}
static void tlb_table_flush(struct mmu_gather *tlb)
@@ -367,7 +378,7 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT);
if (*batch == NULL) {
tlb_table_invalidate(tlb);
- tlb_remove_table_one(table);
+ tlb_remove_table_one(table, tlb);
return;
}
(*batch)->nr = 0;
@@ -427,6 +438,7 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
tlb->vma_pfn = 0;
tlb->fully_unshared_tables = 0;
+ tlb->tlb_flush_sent_ipi = 0;
__tlb_reset_range(tlb);
inc_tlb_flush_pending(tlb->mm);
}
--
2.49.0
On 1/6/26 04:03, Lance Yang wrote:
> From: Lance Yang <lance.yang@linux.dev>
>
> When unsharing hugetlb PMD page tables, we currently send two IPIs: one
> for TLB invalidation, and another to synchronize with concurrent GUP-fast
> walkers via tlb_remove_table_sync_one().
>
> However, if the TLB flush already sent IPIs to all CPUs (when freed_tables
> or unshared_tables is true), the second IPI is redundant. GUP-fast runs
> with IRQs disabled, so when the TLB flush IPI completes, any concurrent
> GUP-fast must have finished.
>
> To avoid the redundant IPI, we add a flag to mmu_gather to track whether
> the TLB flush sent IPIs. We pass the mmu_gather pointer through the TLB
> flush path via flush_tlb_info, so native_flush_tlb_multi() can set the
> flag when it sends IPIs for freed_tables. We also set the flag for
> local-only flushes, since disabling IRQs provides the same guarantee.
The lack of imperative voice is killing me. :)
> diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
> index 866ea78ba156..c5950a92058c 100644
> --- a/arch/x86/include/asm/tlb.h
> +++ b/arch/x86/include/asm/tlb.h
> @@ -20,7 +20,8 @@ static inline void tlb_flush(struct mmu_gather *tlb)
> end = tlb->end;
> }
>
> - flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables);
> + flush_tlb_mm_range(tlb->mm, start, end, stride_shift,
> + tlb->freed_tables || tlb->unshared_tables, tlb);
> }
I think this hunk sums up v3 pretty well. Where there was a single boolean, now there are two. To add to that, the structure that contains the booleans is itself being passed in. The boolean is still named 'freed_tables', and is going from:
tlb->freed_tables
which is pretty obviously correct to:
tlb->freed_tables || tlb->unshared_tables
which is _far_ from obviously correct.
I'm at a loss for why the patch wouldn't just do this:
- flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables);
+ flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb);
I suspect these were sent out in a bit of haste, which isn't the first time I've gotten that feeling with this series.
Could we slow down, please?
> static inline void invlpg(unsigned long addr)
> diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
> index 00daedfefc1b..83c260c88b80 100644
> --- a/arch/x86/include/asm/tlbflush.h
> +++ b/arch/x86/include/asm/tlbflush.h
> @@ -220,6 +220,7 @@ struct flush_tlb_info {
> * will be zero.
> */
> struct mm_struct *mm;
> + struct mmu_gather *tlb;
> unsigned long start;
> unsigned long end;
> u64 new_tlb_gen;
This also gives me pause.
There is a *lot* of redundant information between 'struct mmu_gather' and 'struct tlb_flush_info'. There needs to at least be a description of what the relationship is and how these relate to each other. I would have naively thought that the right move here would be to pull the mmu_gather data out at one discrete time rather than store a pointer to it.
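To make that concrete (field name made up, utterly untested), I'd have expected something shaped more like a snapshot taken in get_flush_tlb_info() once the gather is passed in:
	/* Copy out the one thing the flush path needs: */
	info->tables_freed_or_unshared = tlb &&
			(tlb->freed_tables || tlb->unshared_tables);
rather than a live 'struct mmu_gather *' sitting in flush_tlb_info, with the "did we actually send IPIs" answer coming back some other way (a return value, say) instead of being written through that pointer.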
What I see here is, I suspect, the most expedient way to do it. I'd _certainly_ have done this myself if I was just hacking something together to play with as quickly as possible.
So, in the end, I don't hate the approach here (yet). But it is almost impossible to evaluate it because the series is taking some rather egregious shortcuts and is lacking any real semblance of a refactoring effort.
On 2026/1/7 00:24, Dave Hansen wrote:
> On 1/6/26 04:03, Lance Yang wrote:
>> From: Lance Yang <lance.yang@linux.dev>
>>
>> When unsharing hugetlb PMD page tables, we currently send two IPIs: one
>> for TLB invalidation, and another to synchronize with concurrent GUP-fast
>> walkers via tlb_remove_table_sync_one().
>>
>> However, if the TLB flush already sent IPIs to all CPUs (when freed_tables
>> or unshared_tables is true), the second IPI is redundant. GUP-fast runs
>> with IRQs disabled, so when the TLB flush IPI completes, any concurrent
>> GUP-fast must have finished.
>>
>> To avoid the redundant IPI, we add a flag to mmu_gather to track whether
>> the TLB flush sent IPIs. We pass the mmu_gather pointer through the TLB
>> flush path via flush_tlb_info, so native_flush_tlb_multi() can set the
>> flag when it sends IPIs for freed_tables. We also set the flag for
>> local-only flushes, since disabling IRQs provides the same guarantee.
>
> The lack of imperative voice is killing me. :)
Oops.
>
>> diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
>> index 866ea78ba156..c5950a92058c 100644
>> --- a/arch/x86/include/asm/tlb.h
>> +++ b/arch/x86/include/asm/tlb.h
>> @@ -20,7 +20,8 @@ static inline void tlb_flush(struct mmu_gather *tlb)
>> end = tlb->end;
>> }
>>
>> - flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables);
>> + flush_tlb_mm_range(tlb->mm, start, end, stride_shift,
>> + tlb->freed_tables || tlb->unshared_tables, tlb);
>> }
>
> I think this hunk sums up v3 pretty well. Where there was a single boolean, now there are two. To add to that, the structure that contains the booleans is itself being passed in. The boolean is still named 'freed_tables', and is going from:
>
> tlb->freed_tables
>
> which is pretty obviously correct to:
>
> tlb->freed_tables || tlb->unshared_tables
>
> which is _far_ from obviously correct.
>
> I'm at a loss for why the patch wouldn't just do this:
>
> - flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables);
> + flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb);
>
> I suspect these were sent out in a bit of haste, which isn't the first time I've gotten that feeling with this series.
>
> Could we slow down, please?
Sorry, I went too fast ...
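Just to make sure I read your suggestion right: the callee would derive
the condition from the gather itself, roughly like this (untested sketch)?

void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
			unsigned long end, unsigned int stride_shift,
			struct mmu_gather *tlb)
{
	/*
	 * One place that decides what counts as "freed tables" instead
	 * of every caller open-coding it. Callers without a gather
	 * (flush_tlb_mm(), flush_tlb_page(), ...) would still need a
	 * way to express their case.
	 */
	bool freed_tables = tlb &&
			(tlb->freed_tables || tlb->unshared_tables);

	/* ... rest as today ... */
}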
>
>> static inline void invlpg(unsigned long addr)
>> diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
>> index 00daedfefc1b..83c260c88b80 100644
>> --- a/arch/x86/include/asm/tlbflush.h
>> +++ b/arch/x86/include/asm/tlbflush.h
>> @@ -220,6 +220,7 @@ struct flush_tlb_info {
>> * will be zero.
>> */
>> struct mm_struct *mm;
>> + struct mmu_gather *tlb;
>> unsigned long start;
>> unsigned long end;
>> u64 new_tlb_gen;
>
> This also gives me pause.
>
> There is a *lot* of redundant information between 'struct mmu_gather' and 'struct tlb_flush_info'. There needs to at least be a description of what the relationship is and how these relate to each other. I would have naively thought that the right move here would be to pull the mmu_gather data out at one discrete time rather than store a pointer to it.
>
> What I see here is, I suspect, the most expedient way to do it. I'd _certainly_ have done this myself if I was just hacking something together to play with as quickly as possible.
>
> So, in the end, I don't hate the approach here (yet). But it is almost impossible to evaluate it because the series is taking some rather egregious shortcuts and is lacking any real semblance of a refactoring effort.
The flag lifetime issue David pointed out is real, and you're right
about the messy parameters :)
And, yeah, I need to think more about those. Maybe v3 can be fixed, or maybe
v2 is actually sufficient - it's conservative but safe (no false positives).
Will take more time, thanks!
>
> static void tlb_table_flush(struct mmu_gather *tlb)
> @@ -367,7 +378,7 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
> *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT);
> if (*batch == NULL) {
> tlb_table_invalidate(tlb);
> - tlb_remove_table_one(table);
> + tlb_remove_table_one(table, tlb);
> return;
> }
> (*batch)->nr = 0;
> @@ -427,6 +438,7 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
> tlb->vma_pfn = 0;
>
> tlb->fully_unshared_tables = 0;
> + tlb->tlb_flush_sent_ipi = 0;
> __tlb_reset_range(tlb);
> inc_tlb_flush_pending(tlb->mm);
> }
But when would we have to reset tlb->tlb_flush_sent_ipi = 0 later?
That's where it gets tricky. Just imagine the MMU gather gets reused later.
Also,
+ if (info->freed_tables && info->tlb)
+ info->tlb->tlb_flush_sent_ipi = true;
in native_flush_tlb_multi() misses the fact that we have different
flushing types for removed/unshared tables vs. other flush.
So this approach here certainly gets more complicated and error-prone.
tlb_table_flush_implies_ipi_broadcast() was clearer in that regard: if
you flushed the TLB after removing /unsharing tables, the IPI for
handling page tables can be skipped. It's on the code flow to assure that.
What could work is tracking "tlb_table_flush_sent_ipi" really when we
are flushing the TLB for removed/unshared tables, and maybe resetting it
... I don't know when from the top of my head.
v2 was simpler IMHO.
--
Cheers
David
On 2026/1/6 23:19, David Hildenbrand (Red Hat) wrote:
>> static void tlb_table_flush(struct mmu_gather *tlb)
>> @@ -367,7 +378,7 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
>> *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT);
>> if (*batch == NULL) {
>> tlb_table_invalidate(tlb);
>> - tlb_remove_table_one(table);
>> + tlb_remove_table_one(table, tlb);
>> return;
>> }
>> (*batch)->nr = 0;
>> @@ -427,6 +438,7 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
>> tlb->vma_pfn = 0;
>> tlb->fully_unshared_tables = 0;
>> + tlb->tlb_flush_sent_ipi = 0;
>> __tlb_reset_range(tlb);
>> inc_tlb_flush_pending(tlb->mm);
>> }
>
> But when would we have to reset tlb->tlb_flush_sent_ipi = 0 later?
> That's where it gets tricky. Just imagine the MMU gather gets reused later.
>
> Also,
>
> + if (info->freed_tables && info->tlb)
> + info->tlb->tlb_flush_sent_ipi = true;
>
> in native_flush_tlb_multi() misses the fact that we have different
> flushing types for removed/unshared tables vs. other flush.
>
> So this approach here certainly gets more complicated and error-prone.
Agreed. Tracking the flag through the mmu_gather lifecycle does get
more complicated and error-prone ...
>
> tlb_table_flush_implies_ipi_broadcast() was clearer in that regard: if
> you flushed the TLB after removing /unsharing tables, the IPI for
> handling page tables can be skipped. It's on the code flow to assure that.
v2 was definitely simpler.
>
> What could work is tracking "tlb_table_flush_sent_ipi" really when we
> are flushing the TLB for removed/unshared tables, and maybe resetting
> it ... I don't know when from the top of my head.
Not sure what's the best way forward here :(
>
> v2 was simpler IMHO.
The main concern Dave raised was that with PV hypercalls or when
INVLPGB is available, we can't tell from a static check whether IPIs
were actually sent.
Maybe that's acceptable, or we could find a simpler way to track that ...
Open to suggestions!
>> What could work is tracking "tlb_table_flush_sent_ipi" really when we
>> are flushing the TLB for removed/unshared tables, and maybe resetting
>> it ... I don't know when from the top of my head.
>
> Not sure what's the best way forward here :(
>
>>
>> v2 was simpler IMHO.
>
> The main concern Dave raised was that with PV hypercalls or when
> INVLPGB is available, we can't tell from a static check whether IPIs
> were actually sent.
Why can't we set the boolean at runtime when initializing the pv_ops
structure, when we are sure that it is allowed?
--
Cheers
David
On 2026/1/9 22:13, David Hildenbrand (Red Hat) wrote:
>
>>> What could work is tracking "tlb_table_flush_sent_ipi" really when we
>>> are flushing the TLB for removed/unshared tables, and maybe resetting
>>> it ... I don't know when from the top of my head.
>>
>> Not sure what's the best way forward here :(
>>
>>>
>>> v2 was simpler IMHO.
>>
>> The main concern Dave raised was that with PV hypercalls or when
>> INVLPGB is available, we can't tell from a static check whether IPIs
>> were actually sent.
>
> Why can't we set the boolean at runtime when initializing the pv_ops
> structure, when we are sure that it is allowed?
Yes, thanks, that sounds like a reasonable trade-off :)
As you mentioned:
"this lifetime stuff in core-mm ends up getting more complicated than
v2 without a clear benefit".
I totally agree that v3 is too complicated :(
But Dave's concern about v2 was that we can't accurately tell whether
IPIs were actually sent in PV environments or with INVLPGB, which
misses optimization opportunities. The INVLPGB+no_global_asid case
also sends IPIs during TLB flush.
Anyway, yeah, I'd rather start with a simple approach, even if it's
not perfect. We can always improve it later ;)
Any ideas on how to move forward?
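If I'm reading the runtime idea right, it could be as simple as the
below (rough sketch only, names made up, untested):

/*
 * Default to "the TLB flush IPIs all other CPUs" on bare metal, and
 * let whichever init path installs a non-IPI flush (PV hypercall,
 * INVLPGB) clear it once during boot.
 */
bool tlb_flush_broadcasts_ipi __ro_after_init = true;

void __init tlb_flush_disable_ipi_broadcast(void)
{
	tlb_flush_broadcasts_ipi = false;
}

Then the table-sync path could check that boolean instead of trying to
guess per flush.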
On 1/9/26 16:30, Lance Yang wrote:
>
>
> On 2026/1/9 22:13, David Hildenbrand (Red Hat) wrote:
>>
>>>> What could work is tracking "tlb_table_flush_sent_ipi" really when we
>>>> are flushing the TLB for removed/unshared tables, and maybe resetting
>>>> it ... I don't know when from the top of my head.
>>>
>>> Not sure what's the best way forward here :(
>>>
>>>>
>>>> v2 was simpler IMHO.
>>>
>>> The main concern Dave raised was that with PV hypercalls or when
>>> INVLPGB is available, we can't tell from a static check whether IPIs
>>> were actually sent.
>>
>> Why can't we set the boolean at runtime when initializing the pv_ops
>> structure, when we are sure that it is allowed?
>
> Yes, thanks, that sounds like a reasonable trade-off :)
>
> As you mentioned:
>
> "this lifetime stuff in core-mm ends up getting more complicated than
> v2 without a clear benefit".
>
> I totally agree that v3 is too complicated :(
>
> But Dave's concern about v2 was that we can't accurately tell whether
> IPIs were actually sent in PV environments or with INVLPGB, which
> misses optimization opportunities. The INVLPGB+no_global_asid case
> also sends IPIs during TLB flush.
>
> Anyway, yeah, I'd rather start with a simple approach, even if it's
> not perfect. We can always improve it later ;)
>
> Any ideas on how to move forward?
I'd hope Dave can comment :)
In general, I saw the whole thing as a two-step process:
1) Avoid IPIs completely when the TLB flush sent them. We can achieve
that through v2 or v3, one way or the other, I don't particularly
care as long as it is clean and simple.
2) For other configs/arch, send IPIs only to CPUs that are actually in
GUP-fast etc. That would resolve some RT headache with broadcast IPIs.
Regarding 2), it obviously only applies to setups where 1) does not
apply: like x86 with INVLPGB or arm64.
I once had the idea of letting CPUs that enter/exit GUP-fast (and
similar) indicate in a global cpumask (or per-CPU variables) that
they are in that context. Then, we can just collect these CPUs and limit
the IPIs to them (usually, not a lot ...).
The trick here is to not slow down GUP-fast too much. And one person
(Yair, in an RT context) who played with that was not able to reduce the
overhead sufficiently.
I guess the options are
a) Per-MM CPU mask we have to update atomically when entering/leaving
GUP-fast
b) Global mask we have to update atomically when entering/leaving GUP-fast
c) Per-CPU variable we have to update when entering/leaving GUP-fast.
Interrupts are disabled, so we don't have to worry about reschedule etc.
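For c), very roughly (made-up names, not even compile-tested; the
tricky part is ordering, so that we cannot miss a CPU that is just
entering GUP-fast, while keeping the enter/exit writes cheap):

static DEFINE_PER_CPU(bool, in_gup_fast);

/* GUP-fast side, called with IRQs already disabled. */
static inline void gup_fast_mark_enter(void)
{
	__this_cpu_write(in_gup_fast, true);
	/* Would need ordering against the following page table reads. */
}

static inline void gup_fast_mark_exit(void)
{
	__this_cpu_write(in_gup_fast, false);
}

/* Sender side: only IPI CPUs that claim to be in GUP-fast. */
static void tlb_table_sync_gup_fast_cpus(void)
{
	int cpu;

	for_each_online_cpu(cpu) {
		if (per_cpu(in_gup_fast, cpu))
			smp_call_function_single(cpu,
					tlb_remove_table_smp_sync, NULL, 1);
	}
}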
Maybe someone reading along has other thoughts.
--
Cheers
David
Hi David,
On 2026/1/7 00:10, Lance Yang wrote:
[..]
>> What could work is tracking "tlb_table_flush_sent_ipi" really when we
>> are flushing the TLB for removed/unshared tables, and maybe resetting
>> it ... I don't know when from the top of my head.
>
Seems like we could fix the flag-lifetime issue (when the MMU gather
gets reused) by splitting the flush and the reset. This keeps the flag
valid between the flush and the sync.
Now tlb_flush_unshared_tables() does:
1) __tlb_flush_mmu_tlbonly() - flush only, keeps flags alive
2) tlb_gather_remove_table_sync_one() - can check the flag
3) __tlb_reset_range() - reset everything after sync
Something like this:
---8<---
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 3975f7d11553..a95b054dfcca 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -415,6 +415,7 @@ static inline void __tlb_reset_range(struct mmu_gather *tlb)
tlb->cleared_puds = 0;
tlb->cleared_p4ds = 0;
tlb->unshared_tables = 0;
+ tlb->tlb_flush_sent_ipi = 0;
/*
* Do not reset mmu_gather::vma_* fields here, we do not
* call into tlb_start_vma() again to set them if there is an
@@ -492,7 +493,7 @@ tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma)
tlb->vma_pfn |= !!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP));
}
-static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
+static inline void __tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
{
/*
* Anything calling __tlb_adjust_range() also sets at least one of
@@ -503,6 +504,11 @@ static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
return;
tlb_flush(tlb);
+}
+
+static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
+{
+ __tlb_flush_mmu_tlbonly(tlb);
__tlb_reset_range(tlb);
}
@@ -824,7 +830,7 @@ static inline void tlb_flush_unshared_tables(struct mmu_gather *tlb)
* flush the TLB for the unsharer now.
*/
if (tlb->unshared_tables)
- tlb_flush_mmu_tlbonly(tlb);
+ __tlb_flush_mmu_tlbonly(tlb);
/*
* Similarly, we must make sure that concurrent GUP-fast will not
@@ -834,14 +840,16 @@ static inline void tlb_flush_unshared_tables(struct mmu_gather *tlb)
* We only perform this when we are the last sharer of a page table,
* as the IPI will reach all CPUs: any GUP-fast.
*
- * Note that on configs where tlb_remove_table_sync_one() is a NOP,
- * the expectation is that the tlb_flush_mmu_tlbonly() would have issued
- * required IPIs already for us.
+ * Use tlb_gather_remove_table_sync_one() instead of
+ * tlb_remove_table_sync_one() to skip the redundant IPI if the
+ * TLB flush above already sent one.
*/
if (tlb->fully_unshared_tables) {
- tlb_remove_table_sync_one();
+ tlb_gather_remove_table_sync_one(tlb);
tlb->fully_unshared_tables = false;
}
+
+ __tlb_reset_range(tlb);
}
#endif /* CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */
---
For khugepaged, it should be fine - it uses a local mmu_gather that
doesn't get reused. The lifetime is simply:
tlb_gather_mmu() → flush → sync → tlb_finish_mmu()
Let me know if this addresses your concern :)
[...]
On 1/7/26 07:37, Lance Yang wrote:
> Hi David,
>
> On 2026/1/7 00:10, Lance Yang wrote:
> [..]
>>> What could work is tracking "tlb_table_flush_sent_ipi" really when we
>>> are flushing the TLB for removed/unshared tables, and maybe resetting
>>> it ... I don't know when from the top of my head.
>>
>
> Seems like we could fix the flag-lifetime issue (when the MMU gather
> gets reused) by splitting the flush and the reset. This keeps the flag
> valid between the flush and the sync.
>
> Now tlb_flush_unshared_tables() does:
> 1) __tlb_flush_mmu_tlbonly() - flush only, keeps flags alive
> 2) tlb_gather_remove_table_sync_one() - can check the flag
> 3) __tlb_reset_range() - reset everything after sync
>
> Something like this:
>
> ---8<---
> diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
> index 3975f7d11553..a95b054dfcca 100644
> --- a/include/asm-generic/tlb.h
> +++ b/include/asm-generic/tlb.h
> @@ -415,6 +415,7 @@ static inline void __tlb_reset_range(struct mmu_gather *tlb)
> tlb->cleared_puds = 0;
> tlb->cleared_p4ds = 0;
> tlb->unshared_tables = 0;
> + tlb->tlb_flush_sent_ipi = 0;
As raised, the "tlb_flush_sent_ipi" name is confusing when we send IPIs
to different CPUs based on whether we are removing page tables or not.
I think you would really want to track that explicitly as
"tlb_table_flush_sent_ipi"?
> /*
> * Do not reset mmu_gather::vma_* fields here, we do not
> * call into tlb_start_vma() again to set them if there is an
> @@ -492,7 +493,7 @@ tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma)
> tlb->vma_pfn |= !!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP));
> }
>
> -static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
> +static inline void __tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
> {
> /*
> * Anything calling __tlb_adjust_range() also sets at least one of
> @@ -503,6 +504,11 @@ static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
> return;
>
> tlb_flush(tlb);
> +}
> +
> +static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
> +{
> + __tlb_flush_mmu_tlbonly(tlb);
> __tlb_reset_range(tlb);
> }
>
> @@ -824,7 +830,7 @@ static inline void tlb_flush_unshared_tables(struct mmu_gather *tlb)
> * flush the TLB for the unsharer now.
> */
> if (tlb->unshared_tables)
> - tlb_flush_mmu_tlbonly(tlb);
> + __tlb_flush_mmu_tlbonly(tlb);
>
> /*
> * Similarly, we must make sure that concurrent GUP-fast will not
> @@ -834,14 +840,16 @@ static inline void tlb_flush_unshared_tables(struct mmu_gather *tlb)
> * We only perform this when we are the last sharer of a page table,
> * as the IPI will reach all CPUs: any GUP-fast.
> *
> - * Note that on configs where tlb_remove_table_sync_one() is a NOP,
> - * the expectation is that the tlb_flush_mmu_tlbonly() would have issued
> - * required IPIs already for us.
> + * Use tlb_gather_remove_table_sync_one() instead of
> + * tlb_remove_table_sync_one() to skip the redundant IPI if the
> + * TLB flush above already sent one.
> */
> if (tlb->fully_unshared_tables) {
> - tlb_remove_table_sync_one();
> + tlb_gather_remove_table_sync_one(tlb);
> tlb->fully_unshared_tables = false;
> }
> +
> + __tlb_reset_range(tlb);
> }
> #endif /* CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */
> ---
>
> For khugepaged, it should be fine - it uses a local mmu_gather that
> doesn't get reused. The lifetime is simply:
>
> tlb_gather_mmu() → flush → sync → tlb_finish_mmu()
>
> Let me know if this addresses your concern :)
I'll probably have to see the full picture. But this lifetime stuff in
core-mm ends up getting more complicated than v2 without a clear benefit
to me (except maybe handling some x86 oddities better ;) )
--
Cheers
David