Instead of doing a system-wide TLB flush from arch_tlbbatch_flush,
queue up asynchronous, targeted flushes from arch_tlbbatch_add_pending.
This also allows us to avoid adding the CPUs of processes using broadcast
flushing to the batch->cpumask, and will hopefully further reduce TLB
flushing from the reclaim and compaction paths.
Signed-off-by: Rik van Riel <riel@surriel.com>
---
arch/x86/include/asm/tlbbatch.h | 1 +
arch/x86/include/asm/tlbflush.h | 12 ++------
arch/x86/mm/tlb.c | 54 +++++++++++++++++++++++++++++++--
3 files changed, 55 insertions(+), 12 deletions(-)
diff --git a/arch/x86/include/asm/tlbbatch.h b/arch/x86/include/asm/tlbbatch.h
index 1ad56eb3e8a8..f9a17edf63ad 100644
--- a/arch/x86/include/asm/tlbbatch.h
+++ b/arch/x86/include/asm/tlbbatch.h
@@ -10,6 +10,7 @@ struct arch_tlbflush_unmap_batch {
* the PFNs being flushed..
*/
struct cpumask cpumask;
+ bool used_invlpgb;
};
#endif /* _ARCH_X86_TLBBATCH_H */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 5eae5c1aafa5..e5516afdef7d 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -358,21 +358,15 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
return atomic64_inc_return(&mm->context.tlb_gen);
}
-static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
- struct mm_struct *mm,
- unsigned long uaddr)
-{
- inc_mm_tlb_gen(mm);
- cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
- mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
-}
-
static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm)
{
flush_tlb_mm(mm);
}
extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
+extern void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
+ struct mm_struct *mm,
+ unsigned long uaddr);
static inline bool pte_flags_need_flush(unsigned long oldflags,
unsigned long newflags,
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index bfc69ae4ea40..81f847c94321 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1659,9 +1659,7 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
* a local TLB flush is needed. Optimize this use-case by calling
* flush_tlb_func_local() directly in this case.
*/
- if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
- invlpgb_flush_all_nonglobals();
- } else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
+ if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
flush_tlb_multi(&batch->cpumask, info);
} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
lockdep_assert_irqs_enabled();
@@ -1670,12 +1668,62 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
local_irq_enable();
}
+ /*
+ * If we issued (asynchronous) INVLPGB flushes, wait for them here.
+ * The cpumask above contains only CPUs that were running tasks
+ * not using broadcast TLB flushing.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB) && batch->used_invlpgb) {
+ tlbsync();
+ migrate_enable();
+ batch->used_invlpgb = false;
+ }
+
cpumask_clear(&batch->cpumask);
put_flush_tlb_info();
put_cpu();
}
+void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
+ struct mm_struct *mm,
+ unsigned long uaddr)
+{
+ if (static_cpu_has(X86_FEATURE_INVLPGB) && mm_global_asid(mm)) {
+ u16 asid = mm_global_asid(mm);
+ /*
+ * Queue up an asynchronous invalidation. The corresponding
+ * TLBSYNC is done in arch_tlbbatch_flush(), and must be done
+ * on the same CPU.
+ */
+ if (!batch->used_invlpgb) {
+ batch->used_invlpgb = true;
+ migrate_disable();
+ }
+ invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false);
+ /* Do any CPUs supporting INVLPGB need PTI? */
+ if (static_cpu_has(X86_FEATURE_PTI))
+ invlpgb_flush_user_nr_nosync(user_pcid(asid), uaddr, 1, false);
+
+ /*
+ * Some CPUs might still be using a local ASID for this
+ * process, and require IPIs, while others are using the
+ * global ASID.
+ *
+ * In this corner case we need to do both the broadcast
+ * TLB invalidation, and send IPIs. The IPIs will help
+ * stragglers transition to the broadcast ASID.
+ */
+ if (READ_ONCE(mm->context.asid_transition))
+ goto also_send_ipi;
+ } else {
+also_send_ipi:
+ inc_mm_tlb_gen(mm);
+ cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
+ }
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
+}
+
/*
* Blindly accessing user memory from NMI context can be dangerous
* if we're in the middle of switching the current user task or
--
2.47.1
On 16/01/2025 4:30, Rik van Riel wrote:
> Instead of doing a system-wide TLB flush from arch_tlbbatch_flush,
> queue up asynchronous, targeted flushes from arch_tlbbatch_add_pending.
> [snip]
> --- a/arch/x86/mm/tlb.c
> +++ b/arch/x86/mm/tlb.c
> @@ -1659,9 +1659,7 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
>  	 * a local TLB flush is needed. Optimize this use-case by calling
>  	 * flush_tlb_func_local() directly in this case.
>  	 */
> -	if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
> -		invlpgb_flush_all_nonglobals();
> -	} else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
> +	if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
>  		flush_tlb_multi(&batch->cpumask, info);
>  	} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
>  		lockdep_assert_irqs_enabled();
> @@ -1670,12 +1668,62 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
>  		local_irq_enable();
>  	}
> 
> +	/*
> +	 * If we issued (asynchronous) INVLPGB flushes, wait for them here.
> +	 * The cpumask above contains only CPUs that were running tasks
> +	 * not using broadcast TLB flushing.
> +	 */
> +	if (cpu_feature_enabled(X86_FEATURE_INVLPGB) && batch->used_invlpgb) {
> +		tlbsync();
> +		migrate_enable();

Maybe someone mentioned it before, but I would emphasize that I do not
think that preventing migration for a potentially long time is that great.

One alternative solution would be to set a bit on cpu_tlbstate that, when
set, makes you issue a tlbsync on context switch.

(I can think of other solutions, but I think the one I just mentioned is
the cleanest one.)

> +		batch->used_invlpgb = false;
> +	}
> +
>  	cpumask_clear(&batch->cpumask);
> 
>  	put_flush_tlb_info();
>  	put_cpu();
>  }
> 
> +void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
> +			       struct mm_struct *mm,
> +			       unsigned long uaddr)
> +{
> +	if (static_cpu_has(X86_FEATURE_INVLPGB) && mm_global_asid(mm)) {
> +		u16 asid = mm_global_asid(mm);
> +		/*
> +		 * Queue up an asynchronous invalidation. The corresponding
> +		 * TLBSYNC is done in arch_tlbbatch_flush(), and must be done
> +		 * on the same CPU.
> +		 */
> +		if (!batch->used_invlpgb) {
> +			batch->used_invlpgb = true;
> +			migrate_disable();

See my comment above...

> +		}
> +		invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false);
> +		/* Do any CPUs supporting INVLPGB need PTI? */
> +		if (static_cpu_has(X86_FEATURE_PTI))
> +			invlpgb_flush_user_nr_nosync(user_pcid(asid), uaddr, 1, false);
> +
> +		/*
> +		 * Some CPUs might still be using a local ASID for this
> +		 * process, and require IPIs, while others are using the
> +		 * global ASID.
> +		 *
> +		 * In this corner case we need to do both the broadcast
> +		 * TLB invalidation, and send IPIs. The IPIs will help
> +		 * stragglers transition to the broadcast ASID.
> +		 */
> +		if (READ_ONCE(mm->context.asid_transition))
> +			goto also_send_ipi;
> +	} else {
> +also_send_ipi:

I really think you should avoid such gotos. A simple bool variable like
"need_ipi" would suffice.

> +		inc_mm_tlb_gen(mm);
> +		cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
> +	}
> +	mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
> +}
> +
>  /*
>   * Blindly accessing user memory from NMI context can be dangerous
>   * if we're in the middle of switching the current user task or
On Mon, 2025-01-20 at 11:56 +0200, Nadav Amit wrote:
> 
> > @@ -1670,12 +1668,62 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
> >  		local_irq_enable();
> >  	}
> > 
> > +	/*
> > +	 * If we issued (asynchronous) INVLPGB flushes, wait for them here.
> > +	 * The cpumask above contains only CPUs that were running tasks
> > +	 * not using broadcast TLB flushing.
> > +	 */
> > +	if (cpu_feature_enabled(X86_FEATURE_INVLPGB) && batch->used_invlpgb) {
> > +		tlbsync();
> > +		migrate_enable();
> 
> Maybe someone mentioned it before, but I would emphasize that I do not
> think that preventing migration for a potentially long time is that great.
> 
> One alternative solution would be to set a bit on cpu_tlbstate that, when
> set, makes you issue a tlbsync on context switch.
> 
> (I can think of other solutions, but I think the one I just mentioned is
> the cleanest one.)

It is clean, but I'm not convinced it is good enough.

We need to guarantee that the INVLPGBs have finished
before we free the pages.

Running a TLBSYNC at the next context switch could
mean that TLBSYNC won't run until after the pages
have been freed.

In practice it is probably good enough, since it
would be simpler for TLBSYNC to return once all
pending (older) INVLPGBs have finished, but it's
not architecturally guaranteed.

We could send an IPI to remote CPUs in order for
them to call TLBSYNC, but is that really better?

-- 
All Rights Reversed.
On 20/01/2025 16:02, Rik van Riel wrote:
> On Mon, 2025-01-20 at 11:56 +0200, Nadav Amit wrote:
>>
>>> @@ -1670,12 +1668,62 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
>>>  		local_irq_enable();
>>>  	}
>>>
>>> +	/*
>>> +	 * If we issued (asynchronous) INVLPGB flushes, wait for them here.
>>> +	 * The cpumask above contains only CPUs that were running tasks
>>> +	 * not using broadcast TLB flushing.
>>> +	 */
>>> +	if (cpu_feature_enabled(X86_FEATURE_INVLPGB) && batch->used_invlpgb) {
>>> +		tlbsync();
>>> +		migrate_enable();
>>
>> Maybe someone mentioned it before, but I would emphasize that I do not
>> think that preventing migration for a potentially long time is that great.
>>
>> One alternative solution would be to set a bit on cpu_tlbstate that, when
>> set, makes you issue a tlbsync on context switch.
>>
>> (I can think of other solutions, but I think the one I just mentioned is
>> the cleanest one.)
> 
> It is clean, but I'm not convinced it is good enough.
> 
> We need to guarantee that the INVLPGBs have finished
> before we free the pages.
> 
> Running a TLBSYNC at the next context switch could
> mean that TLBSYNC won't run until after the pages
> have been freed.
> 
> In practice it is probably good enough, since it
> would be simpler for TLBSYNC to return once all
> pending (older) INVLPGBs have finished, but it's
> not architecturally guaranteed.
> 
> We could send an IPI to remote CPUs in order for
> them to call TLBSYNC, but is that really better?

I am not sure we are on the same page. What I suggested is:

1. arch_tlbbatch_flush() would still do tlbsync()
2. No migrate_enable() in arch_tlbbatch_flush()
3. No migrate_disable() in arch_tlbbatch_add_pending()
4. arch_tlbbatch_add_pending() sets cpu_tlbstate.pending_tlb_broadcast
5. switch_mm_irqs_off() checks cpu_tlbstate.pending_tlb_broadcast and,
   if it is set, performs tlbsync and clears it.
On Mon, 2025-01-20 at 16:14 +0200, Nadav Amit wrote:
> 
> I am not sure we are on the same page. What I suggested is:
> 
> 1. arch_tlbbatch_flush() would still do tlbsync()
> 2. No migrate_enable() in arch_tlbbatch_flush()
> 3. No migrate_disable() in arch_tlbbatch_add_pending()
> 4. arch_tlbbatch_add_pending() sets cpu_tlbstate.pending_tlb_broadcast
> 5. switch_mm_irqs_off() checks cpu_tlbstate.pending_tlb_broadcast and,
>    if it is set, performs tlbsync and clears it.
> 
How does that synchronize the page freeing (from page
reclaim) with the TLBSYNCs?

What guarantees that the page reclaim path won't free
the pages until after TLBSYNC has completed on the CPUs
that kicked off asynchronous flushes with INVLPGB?

-- 
All Rights Reversed.
On 20/01/2025 18:11, Rik van Riel wrote:
> On Mon, 2025-01-20 at 16:14 +0200, Nadav Amit wrote:
>>
>> I am not sure we are on the same page. What I suggested is:
>>
>> 1. arch_tlbbatch_flush() would still do tlbsync()
>> 2. No migrate_enable() in arch_tlbbatch_flush()
>> 3. No migrate_disable() in arch_tlbbatch_add_pending()
>> 4. arch_tlbbatch_add_pending() sets cpu_tlbstate.pending_tlb_broadcast
>> 5. switch_mm_irqs_off() checks cpu_tlbstate.pending_tlb_broadcast and,
>>    if it is set, performs tlbsync and clears it.
>>
> How does that synchronize the page freeing (from page
> reclaim) with the TLBSYNCs?
> 
> What guarantees that the page reclaim path won't free
> the pages until after TLBSYNC has completed on the CPUs
> that kicked off asynchronous flushes with INVLPGB?

[ you make me lose my confidence, although I see nothing wrong ]

Freeing the pages must be done after the TLBSYNC. I did not imply that
needs to be changed.

The page freeing (and reclaim) path is only initiated after
arch_tlbbatch_flush() has completed. If no migration happened, since we
did not remove any tlbsync, it should be fine.

If migration did happen, and some INVLPGBs were already issued, then for
correctness we need to issue a tlbsync before the task might be scheduled
on another core. That is exactly why adding tlbsync to
switch_mm_irqs_off() is needed in such a case.
On Mon, 2025-01-20 at 19:09 +0200, Nadav Amit wrote:
> 
> On 20/01/2025 18:11, Rik van Riel wrote:
> > 
> > What guarantees that the page reclaim path won't free
> > the pages until after TLBSYNC has completed on the CPUs
> > that kicked off asynchronous flushes with INVLPGB?
> 
> [ you make me lose my confidence, although I see nothing wrong ]
> 
> Freeing the pages must be done after the TLBSYNC. I did not imply that
> needs to be changed.
> 
> The page freeing (and reclaim) path is only initiated after
> arch_tlbbatch_flush() has completed. If no migration happened, since we
> did not remove any tlbsync, it should be fine.
> 
> If migration did happen, and some INVLPGBs were already issued, then for
> correctness we need to issue a tlbsync before the task might be scheduled
> on another core. That is exactly why adding tlbsync to
> switch_mm_irqs_off() is needed in such a case.

This is the page reclaim code.

The process that has those other pages mapped might be
running on other CPUs simultaneously with the page
reclaim code.

Even if we were invalidating one of our own pages this
way, there could be other threads in the same process,
running while we are in the page reclaim code.

-- 
All Rights Reversed.
[ Thanks for your patience. ]

> On 20 Jan 2025, at 19:11, Rik van Riel <riel@surriel.com> wrote:
> 
> On Mon, 2025-01-20 at 19:09 +0200, Nadav Amit wrote:
> 
> This is the page reclaim code.
> 
> The process that has those other pages mapped might be
> running on other CPUs simultaneously with the page
> reclaim code.
> 
> Even if we were invalidating one of our own pages this
> way, there could be other threads in the same process,
> running while we are in the page reclaim code.

Of course, but there is nothing new here. Let me see where we lose
each other by first stating the goal, what you propose, and what I
suggest.

We are issuing invlpgb and we need to ensure tlbsync on the same core
that initiated invlpgb before arch_tlbbatch_flush() finishes. That’s
all that matters for our discussion (correct me if I miss something).

You solved it by disabling migration and running tlbsync at the end.

I suggest *not* to disable migration, to keep running tlbsync at the
arch_tlbbatch_flush() as you do, and if context-switch happens after
arch_tlbbatch_add_pending() and before arch_tlbbatch_flush(), to run
tlbsync during the context switch.
On Mon, 2025-01-20 at 19:50 +0200, Nadav Amit wrote:
> [ Thanks for your patience. ]
> 
> > On 20 Jan 2025, at 19:11, Rik van Riel <riel@surriel.com> wrote:
> > 
> > On Mon, 2025-01-20 at 19:09 +0200, Nadav Amit wrote:
> > 
> > This is the page reclaim code.
> > 
> > The process that has those other pages mapped might be
> > running on other CPUs simultaneously with the page
> > reclaim code.
> > 
> > Even if we were invalidating one of our own pages this
> > way, there could be other threads in the same process,
> > running while we are in the page reclaim code.
> 
> Of course, but there is nothing new here. Let me see where we lose
> each other by first stating the goal, what you propose, and what I
> suggest.
> 
> We are issuing invlpgb and we need to ensure tlbsync on the same core
> that initiated invlpgb before arch_tlbbatch_flush() finishes. That’s
> all that matters for our discussion (correct me if I miss something).
> 
> You solved it by disabling migration and running tlbsync at the end.
> 
> I suggest *not* to disable migration, to keep running tlbsync at the
> arch_tlbbatch_flush() as you do, and if context-switch happens after
> arch_tlbbatch_add_pending() and before arch_tlbbatch_flush(), to run
> tlbsync during the context switch.
> 
How would you keep track of CPUs where the tlbsync
has NOT happened before arch_tlbbatch_flush()?

That part seems to be missing still.

-- 
All Rights Reversed.
> On 20 Jan 2025, at 19:56, Rik van Riel <riel@surriel.com> wrote:
> 
> How would you keep track of CPUs where the tlbsync
> has NOT happened before arch_tlbbatch_flush()?
> 
> That part seems to be missing still.

You only keep track of whether there is a pending tlbsync on *your* CPU.
There is no need to track whether other CPUs issued a tlbsync during
arch_tlbbatch_add_pending().

If the process that does the reclamation was migrated, a TLBSYNC is
issued during the context switch, before the thread that does the
reclamation has any chance of being scheduled.

I hope these code changes on top of yours make it clearer:

> +void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
> +			       struct mm_struct *mm,
> +			       unsigned long uaddr)
> +{
> +	if (static_cpu_has(X86_FEATURE_INVLPGB) && mm_global_asid(mm)) {
> +		u16 asid = mm_global_asid(mm);
> +		/*
> +		 * Queue up an asynchronous invalidation. The corresponding
> +		 * TLBSYNC is done in arch_tlbbatch_flush(), and must be done
> +		 * on the same CPU.
> +		 */

#if 0 // remove
> +		if (!batch->used_invlpgb) {
> +			batch->used_invlpgb = true;
> +			migrate_disable();
> +		}
#endif
		batch->used_invlpgb = true;
		preempt_disable();

> +		invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false);
> +		/* Do any CPUs supporting INVLPGB need PTI? */
> +		if (static_cpu_has(X86_FEATURE_PTI))
> +			invlpgb_flush_user_nr_nosync(user_pcid(asid), uaddr, 1, false);

		this_cpu_write(cpu_tlbstate.pending_tlbsync, true);
		preempt_enable();

> +
> +		/*
> +		 * Some CPUs might still be using a local ASID for this
> +		 * process, and require IPIs, while others are using the
> +		 * global ASID.
> +		 *
> +		 * In this corner case we need to do both the broadcast
> +		 * TLB invalidation, and send IPIs. The IPIs will help
> +		 * stragglers transition to the broadcast ASID.
> +		 */
> +		if (READ_ONCE(mm->context.asid_transition))
> +			goto also_send_ipi;
> +	} else {
> +also_send_ipi:
> +		inc_mm_tlb_gen(mm);
> +		cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
> +	}
> +	mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
> +}
> +

Then in switch_mm_irqs_off():

	if (this_cpu_read(cpu_tlbstate.pending_tlbsync))
		tlbsync();

Note that when switch_mm_irqs_off() is called due to a context switch from
context_switch(), finish_task_switch() has still not taken place, so the
task cannot be scheduled on other cores.
On Mon, 2025-01-20 at 20:56 +0200, Nadav Amit wrote:
> 
> Then in switch_mm_irqs_off():
> 
> 	if (this_cpu_read(cpu_tlbstate.pending_tlbsync))
> 		tlbsync();
> 
> Note that when switch_mm_irqs_off() is called due to a context switch
> from context_switch(), finish_task_switch() has still not taken place,
> so the task cannot be scheduled on other cores.
> 
That would certainly work. I'll add that as a separate
patch, which the x86 maintainers can either accept or
leave out.

I'm not sure what's worse, making reclaim non-migratable,
or adding an extra branch (we can probably avoid a cache
line miss) to switch_mm_irqs_off.

-- 
All Rights Reversed.
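[Editor's note] To summarize the conclusion for readers skimming the archive: the
thread converges on dropping migrate_disable()/migrate_enable() and instead having
the CPU that issues INVLPGB record a per-CPU flag, which switch_mm_irqs_off() drains
with TLBSYNC before the outgoing task can run elsewhere. The sketch below is
illustrative only; the field name pending_tlbsync and the helper names are
assumptions based on Nadav's sketch above, not the final upstream code.

	/*
	 * Illustrative sketch only: pending_tlbsync and the helper names
	 * below are assumptions taken from the discussion, not the final
	 * upstream implementation.
	 */

	/* Per-CPU state, added to struct tlb_state (cpu_tlbstate). */
	struct tlb_state {
		/* ... existing fields ... */
		bool pending_tlbsync;	/* INVLPGB issued, no TLBSYNC yet */
	};

	/* Called from arch_tlbbatch_add_pending(); no migrate_disable() needed. */
	static void queue_broadcast_invalidation(u16 asid, unsigned long uaddr)
	{
		/* Keep the INVLPGB and the flag update on the same CPU. */
		preempt_disable();
		invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false);
		this_cpu_write(cpu_tlbstate.pending_tlbsync, true);
		preempt_enable();
	}

	/*
	 * Called from switch_mm_irqs_off(): if this CPU still owes a TLBSYNC,
	 * issue it before the outgoing task can be scheduled on another CPU.
	 * arch_tlbbatch_flush() still runs its own tlbsync() before the
	 * reclaimed pages are freed.
	 */
	static void tlbsync_if_pending(void)
	{
		if (this_cpu_read(cpu_tlbstate.pending_tlbsync)) {
			tlbsync();
			this_cpu_write(cpu_tlbstate.pending_tlbsync, false);
		}
	}

The cost is the extra branch Rik mentions; since switch_mm_irqs_off() already
touches cpu_tlbstate, the flag can live on a cache line that path reads anyway.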