Instead of doing a system-wide TLB flush from arch_tlbbatch_flush,
queue up asynchronous, targeted flushes from arch_tlbbatch_add_pending.
This also allows us to avoid adding the CPUs of processes using broadcast
flushing to the batch->cpumask, and will hopefully further reduce TLB
flushing from the reclaim and compaction paths.
Signed-off-by: Rik van Riel <riel@surriel.com>
Tested-by: Manali Shukla <Manali.Shukla@amd.com>
Tested-by: Brendan Jackman <jackmanb@google.com>
Tested-by: Michael Kelley <mhklinux@outlook.com>
---
arch/x86/include/asm/tlb.h | 12 ++---
arch/x86/include/asm/tlbflush.h | 34 ++++++++++----
arch/x86/mm/tlb.c | 79 +++++++++++++++++++++++++++++++--
3 files changed, 107 insertions(+), 18 deletions(-)
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index 91c9a4da3ace..e645884a1877 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -89,16 +89,16 @@ static inline void __tlbsync(void)
#define INVLPGB_FINAL_ONLY BIT(4)
#define INVLPGB_INCLUDE_NESTED BIT(5)
-static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid,
- unsigned long addr,
- u16 nr,
- bool pmd_stride)
+static inline void __invlpgb_flush_user_nr_nosync(unsigned long pcid,
+ unsigned long addr,
+ u16 nr,
+ bool pmd_stride)
{
__invlpgb(0, pcid, addr, nr, pmd_stride, INVLPGB_PCID | INVLPGB_VA);
}
/* Flush all mappings for a given PCID, not including globals. */
-static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid)
+static inline void __invlpgb_flush_single_pcid_nosync(unsigned long pcid)
{
__invlpgb(0, pcid, 0, 1, 0, INVLPGB_PCID);
}
@@ -111,7 +111,7 @@ static inline void invlpgb_flush_all(void)
}
/* Flush addr, including globals, for all PCIDs. */
-static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr)
+static inline void __invlpgb_flush_addr_nosync(unsigned long addr, u16 nr)
{
__invlpgb(0, 0, addr, nr, 0, INVLPGB_INCLUDE_GLOBAL);
}
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 811dd70eb6b8..22462bd4b1ee 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -105,6 +105,9 @@ struct tlb_state {
* need to be invalidated.
*/
bool invalidate_other;
+#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH
+ bool need_tlbsync;
+#endif
#ifdef CONFIG_ADDRESS_MASKING
/*
@@ -284,6 +287,16 @@ static inline bool in_asid_transition(struct mm_struct *mm)
return mm && READ_ONCE(mm->context.asid_transition);
}
+
+static inline bool cpu_need_tlbsync(void)
+{
+ return this_cpu_read(cpu_tlbstate.need_tlbsync);
+}
+
+static inline void cpu_write_tlbsync(bool state)
+{
+ this_cpu_write(cpu_tlbstate.need_tlbsync, state);
+}
#else
static inline u16 mm_global_asid(struct mm_struct *mm)
{
@@ -302,6 +315,15 @@ static inline bool in_asid_transition(struct mm_struct *mm)
{
return false;
}
+
+static inline bool cpu_need_tlbsync(void)
+{
+ return false;
+}
+
+static inline void cpu_write_tlbsync(bool state)
+{
+}
#endif
#ifdef CONFIG_PARAVIRT
@@ -351,21 +373,15 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
return atomic64_inc_return(&mm->context.tlb_gen);
}
-static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
- struct mm_struct *mm,
- unsigned long uaddr)
-{
- inc_mm_tlb_gen(mm);
- cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
- mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
-}
-
static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm)
{
flush_tlb_mm(mm);
}
extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
+extern void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
+ struct mm_struct *mm,
+ unsigned long uaddr);
static inline bool pte_flags_need_flush(unsigned long oldflags,
unsigned long newflags,
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index cd109bdf0dd9..4d56d22b9893 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -487,6 +487,37 @@ static void finish_asid_transition(struct flush_tlb_info *info)
clear_asid_transition(mm);
}
+static inline void tlbsync(void)
+{
+ if (!cpu_need_tlbsync())
+ return;
+ __tlbsync();
+ cpu_write_tlbsync(false);
+}
+
+static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid,
+ unsigned long addr,
+ u16 nr, bool pmd_stride)
+{
+ __invlpgb_flush_user_nr_nosync(pcid, addr, nr, pmd_stride);
+ if (!cpu_need_tlbsync())
+ cpu_write_tlbsync(true);
+}
+
+static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid)
+{
+ __invlpgb_flush_single_pcid_nosync(pcid);
+ if (!cpu_need_tlbsync())
+ cpu_write_tlbsync(true);
+}
+
+static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr)
+{
+ __invlpgb_flush_addr_nosync(addr, nr);
+ if (!cpu_need_tlbsync())
+ cpu_write_tlbsync(true);
+}
+
static void broadcast_tlb_flush(struct flush_tlb_info *info)
{
bool pmd = info->stride_shift == PMD_SHIFT;
@@ -785,6 +816,8 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
if (IS_ENABLED(CONFIG_PROVE_LOCKING))
WARN_ON_ONCE(!irqs_disabled());
+ tlbsync();
+
/*
* Verify that CR3 is what we think it is. This will catch
* hypothetical buggy code that directly switches to swapper_pg_dir
@@ -961,6 +994,8 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
*/
void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
+ tlbsync();
+
if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
return;
@@ -1624,9 +1659,7 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
* a local TLB flush is needed. Optimize this use-case by calling
* flush_tlb_func_local() directly in this case.
*/
- if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
- invlpgb_flush_all_nonglobals();
- } else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
+ if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
flush_tlb_multi(&batch->cpumask, info);
} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
lockdep_assert_irqs_enabled();
@@ -1635,12 +1668,52 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
local_irq_enable();
}
+ /*
+ * If we issued (asynchronous) INVLPGB flushes, wait for them here.
+ * The cpumask above contains only CPUs that were running tasks
+ * not using broadcast TLB flushing.
+ */
+ tlbsync();
+
cpumask_clear(&batch->cpumask);
put_flush_tlb_info();
put_cpu();
}
+void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
+ struct mm_struct *mm,
+ unsigned long uaddr)
+{
+ u16 asid = mm_global_asid(mm);
+
+ if (asid) {
+ invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false);
+ /* Do any CPUs supporting INVLPGB need PTI? */
+ if (static_cpu_has(X86_FEATURE_PTI))
+ invlpgb_flush_user_nr_nosync(user_pcid(asid), uaddr, 1, false);
+
+ /*
+ * Some CPUs might still be using a local ASID for this
+ * process, and require IPIs, while others are using the
+ * global ASID.
+ *
+ * In this corner case we need to do both the broadcast
+ * TLB invalidation, and send IPIs. The IPIs will help
+ * stragglers transition to the broadcast ASID.
+ */
+ if (in_asid_transition(mm))
+ asid = 0;
+ }
+
+ if (!asid) {
+ inc_mm_tlb_gen(mm);
+ cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
+ }
+
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
+}
+
/*
* Blindly accessing user memory from NMI context can be dangerous
* if we're in the middle of switching the current user task or
--
2.47.1
On Tue, Feb 25, 2025 at 10:00:46PM -0500, Rik van Riel wrote:
> +static inline bool cpu_need_tlbsync(void)
> +{
> + return this_cpu_read(cpu_tlbstate.need_tlbsync);
> +}
> +
> +static inline void cpu_write_tlbsync(bool state)
That thing feels better as "cpu_set_tlbsync" in the code...
> +{
> + this_cpu_write(cpu_tlbstate.need_tlbsync, state);
> +}
> #else
> static inline u16 mm_global_asid(struct mm_struct *mm)
> {
...
> +static inline void tlbsync(void)
> +{
> + if (!cpu_need_tlbsync())
> + return;
> + __tlbsync();
> + cpu_write_tlbsync(false);
> +}
Easier to parse visually:
static inline void tlbsync(void)
{
if (cpu_need_tlbsync()) {
__tlbsync();
cpu_write_tlbsync(false);
}
}
Final:
From: Rik van Riel <riel@surriel.com>
Date: Tue, 25 Feb 2025 22:00:46 -0500
Subject: [PATCH] x86/mm: Do targeted broadcast flushing from tlbbatch code
Instead of doing a system-wide TLB flush from arch_tlbbatch_flush(), queue up
asynchronous, targeted flushes from arch_tlbbatch_add_pending().
This also allows us to avoid adding the CPUs of processes using broadcast
flushing to the batch->cpumask, and will hopefully further reduce TLB flushing
from the reclaim and compaction paths.
[ bp:
- Massage
- :%s/\<static_cpu_has\>/cpu_feature_enabled/cgi ]
Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20250226030129.530345-12-riel@surriel.com
---
arch/x86/include/asm/tlb.h | 12 ++---
arch/x86/include/asm/tlbflush.h | 27 +++++++----
arch/x86/mm/tlb.c | 79 +++++++++++++++++++++++++++++++--
3 files changed, 100 insertions(+), 18 deletions(-)
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index 04f2c6f4cee3..b5c2005725cf 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -102,16 +102,16 @@ static inline void __tlbsync(void) { }
#define INVLPGB_FINAL_ONLY BIT(4)
#define INVLPGB_INCLUDE_NESTED BIT(5)
-static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid,
- unsigned long addr,
- u16 nr,
- bool pmd_stride)
+static inline void __invlpgb_flush_user_nr_nosync(unsigned long pcid,
+ unsigned long addr,
+ u16 nr,
+ bool pmd_stride)
{
__invlpgb(0, pcid, addr, nr, pmd_stride, INVLPGB_PCID | INVLPGB_VA);
}
/* Flush all mappings for a given PCID, not including globals. */
-static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid)
+static inline void __invlpgb_flush_single_pcid_nosync(unsigned long pcid)
{
__invlpgb(0, pcid, 0, 1, 0, INVLPGB_PCID);
}
@@ -131,7 +131,7 @@ static inline void invlpgb_flush_all(void)
}
/* Flush addr, including globals, for all PCIDs. */
-static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr)
+static inline void __invlpgb_flush_addr_nosync(unsigned long addr, u16 nr)
{
__invlpgb(0, 0, addr, nr, 0, INVLPGB_INCLUDE_GLOBAL);
}
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 8c21030269ff..cbdb86d58301 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -105,6 +105,9 @@ struct tlb_state {
* need to be invalidated.
*/
bool invalidate_other;
+#ifdef CONFIG_BROADCAST_TLB_FLUSH
+ bool need_tlbsync;
+#endif
#ifdef CONFIG_ADDRESS_MASKING
/*
@@ -292,11 +295,23 @@ static inline bool mm_in_asid_transition(struct mm_struct *mm)
return mm && READ_ONCE(mm->context.asid_transition);
}
+
+static inline bool cpu_need_tlbsync(void)
+{
+ return this_cpu_read(cpu_tlbstate.need_tlbsync);
+}
+
+static inline void cpu_set_tlbsync(bool state)
+{
+ this_cpu_write(cpu_tlbstate.need_tlbsync, state);
+}
#else
static inline u16 mm_global_asid(struct mm_struct *mm) { return 0; }
static inline void mm_init_global_asid(struct mm_struct *mm) { }
static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) { }
static inline bool mm_in_asid_transition(struct mm_struct *mm) { return false; }
+static inline bool cpu_need_tlbsync(void) { return false; }
+static inline void cpu_set_tlbsync(bool state) { }
#endif /* CONFIG_BROADCAST_TLB_FLUSH */
#ifdef CONFIG_PARAVIRT
@@ -346,21 +361,15 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
return atomic64_inc_return(&mm->context.tlb_gen);
}
-static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
- struct mm_struct *mm,
- unsigned long uaddr)
-{
- inc_mm_tlb_gen(mm);
- cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
- mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
-}
-
static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm)
{
flush_tlb_mm(mm);
}
extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
+extern void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
+ struct mm_struct *mm,
+ unsigned long uaddr);
static inline bool pte_flags_need_flush(unsigned long oldflags,
unsigned long newflags,
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 0efd99053c09..83ba6876adbf 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -492,6 +492,37 @@ static void finish_asid_transition(struct flush_tlb_info *info)
mm_clear_asid_transition(mm);
}
+static inline void tlbsync(void)
+{
+ if (cpu_need_tlbsync()) {
+ __tlbsync();
+ cpu_set_tlbsync(false);
+ }
+}
+
+static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid,
+ unsigned long addr,
+ u16 nr, bool pmd_stride)
+{
+ __invlpgb_flush_user_nr_nosync(pcid, addr, nr, pmd_stride);
+ if (!cpu_need_tlbsync())
+ cpu_set_tlbsync(true);
+}
+
+static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid)
+{
+ __invlpgb_flush_single_pcid_nosync(pcid);
+ if (!cpu_need_tlbsync())
+ cpu_set_tlbsync(true);
+}
+
+static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr)
+{
+ __invlpgb_flush_addr_nosync(addr, nr);
+ if (!cpu_need_tlbsync())
+ cpu_set_tlbsync(true);
+}
+
static void broadcast_tlb_flush(struct flush_tlb_info *info)
{
bool pmd = info->stride_shift == PMD_SHIFT;
@@ -790,6 +821,8 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
if (IS_ENABLED(CONFIG_PROVE_LOCKING))
WARN_ON_ONCE(!irqs_disabled());
+ tlbsync();
+
/*
* Verify that CR3 is what we think it is. This will catch
* hypothetical buggy code that directly switches to swapper_pg_dir
@@ -966,6 +999,8 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
*/
void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
+ tlbsync();
+
if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
return;
@@ -1633,9 +1668,7 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
* a local TLB flush is needed. Optimize this use-case by calling
* flush_tlb_func_local() directly in this case.
*/
- if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
- invlpgb_flush_all_nonglobals();
- } else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
+ if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
flush_tlb_multi(&batch->cpumask, info);
} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
lockdep_assert_irqs_enabled();
@@ -1644,12 +1677,52 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
local_irq_enable();
}
+ /*
+ * If (asynchronous) INVLPGB flushes were issued, wait for them here.
+ * The cpumask above contains only CPUs that were running tasks
+ * not using broadcast TLB flushing.
+ */
+ tlbsync();
+
cpumask_clear(&batch->cpumask);
put_flush_tlb_info();
put_cpu();
}
+void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
+ struct mm_struct *mm,
+ unsigned long uaddr)
+{
+ u16 asid = mm_global_asid(mm);
+
+ if (asid) {
+ invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false);
+ /* Do any CPUs supporting INVLPGB need PTI? */
+ if (cpu_feature_enabled(X86_FEATURE_PTI))
+ invlpgb_flush_user_nr_nosync(user_pcid(asid), uaddr, 1, false);
+
+ /*
+ * Some CPUs might still be using a local ASID for this
+ * process, and require IPIs, while others are using the
+ * global ASID.
+ *
+ * In this corner case, both broadcast TLB invalidation
+ * and IPIs need to be sent. The IPIs will help
+ * stragglers transition to the broadcast ASID.
+ */
+ if (mm_in_asid_transition(mm))
+ asid = 0;
+ }
+
+ if (!asid) {
+ inc_mm_tlb_gen(mm);
+ cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
+ }
+
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
+}
+
/*
* Blindly accessing user memory from NMI context can be dangerous
* if we're in the middle of switching the current user task or
--
2.43.0
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On 3/3/25 03:46, Borislav Petkov wrote:
> On Tue, Feb 25, 2025 at 10:00:46PM -0500, Rik van Riel wrote:
>> +static inline bool cpu_need_tlbsync(void)
>> +{
>> + return this_cpu_read(cpu_tlbstate.need_tlbsync);
>> +}
>> +
>> +static inline void cpu_write_tlbsync(bool state)
>
> That thing feels better as "cpu_set_tlbsync" in the code...
>
>> +{
>> + this_cpu_write(cpu_tlbstate.need_tlbsync, state);
>> +}
>> #else
>> static inline u16 mm_global_asid(struct mm_struct *mm)
>> {
>
> ...
>
>> +static inline void tlbsync(void)
>> +{
>> + if (!cpu_need_tlbsync())
>> + return;
>> + __tlbsync();
>> + cpu_write_tlbsync(false);
>> +}
>
> Easier to parse visually:
>
> static inline void tlbsync(void)
> {
> if (cpu_need_tlbsync()) {
> __tlbsync();
> cpu_write_tlbsync(false);
> }
> }
>
> Final:
>
> From: Rik van Riel <riel@surriel.com>
> Date: Tue, 25 Feb 2025 22:00:46 -0500
> Subject: [PATCH] x86/mm: Do targeted broadcast flushing from tlbbatch code
>
> Instead of doing a system-wide TLB flush from arch_tlbbatch_flush(), queue up
> asynchronous, targeted flushes from arch_tlbbatch_add_pending().
>
> This also allows us to avoid adding the CPUs of processes using broadcast
> flushing to the batch->cpumask, and will hopefully further reduce TLB flushing
> from the reclaim and compaction paths.
>
> [ bp:
> - Massage
> - :%s/\<static_cpu_has\>/cpu_feature_enabled/cgi ]
>
> Signed-off-by: Rik van Riel <riel@surriel.com>
> Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
> Link: https://lore.kernel.org/r/20250226030129.530345-12-riel@surriel.com
> ---
> arch/x86/include/asm/tlb.h | 12 ++---
> arch/x86/include/asm/tlbflush.h | 27 +++++++----
> arch/x86/mm/tlb.c | 79 +++++++++++++++++++++++++++++++--
> 3 files changed, 100 insertions(+), 18 deletions(-)
>
> diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
> index 04f2c6f4cee3..b5c2005725cf 100644
> --- a/arch/x86/include/asm/tlb.h
> +++ b/arch/x86/include/asm/tlb.h
> @@ -102,16 +102,16 @@ static inline void __tlbsync(void) { }
> #define INVLPGB_FINAL_ONLY BIT(4)
> #define INVLPGB_INCLUDE_NESTED BIT(5)
>
> -static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid,
> - unsigned long addr,
> - u16 nr,
> - bool pmd_stride)
> +static inline void __invlpgb_flush_user_nr_nosync(unsigned long pcid,
> + unsigned long addr,
> + u16 nr,
> + bool pmd_stride)
> {
> __invlpgb(0, pcid, addr, nr, pmd_stride, INVLPGB_PCID | INVLPGB_VA);
> }
>
> /* Flush all mappings for a given PCID, not including globals. */
> -static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid)
> +static inline void __invlpgb_flush_single_pcid_nosync(unsigned long pcid)
> {
> __invlpgb(0, pcid, 0, 1, 0, INVLPGB_PCID);
> }
> @@ -131,7 +131,7 @@ static inline void invlpgb_flush_all(void)
> }
>
> /* Flush addr, including globals, for all PCIDs. */
> -static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr)
> +static inline void __invlpgb_flush_addr_nosync(unsigned long addr, u16 nr)
> {
> __invlpgb(0, 0, addr, nr, 0, INVLPGB_INCLUDE_GLOBAL);
> }
> diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
> index 8c21030269ff..cbdb86d58301 100644
> --- a/arch/x86/include/asm/tlbflush.h
> +++ b/arch/x86/include/asm/tlbflush.h
> @@ -105,6 +105,9 @@ struct tlb_state {
> * need to be invalidated.
> */
> bool invalidate_other;
> +#ifdef CONFIG_BROADCAST_TLB_FLUSH
> + bool need_tlbsync;
> +#endif
>
> #ifdef CONFIG_ADDRESS_MASKING
> /*
> @@ -292,11 +295,23 @@ static inline bool mm_in_asid_transition(struct mm_struct *mm)
>
> return mm && READ_ONCE(mm->context.asid_transition);
> }
> +
> +static inline bool cpu_need_tlbsync(void)
> +{
> + return this_cpu_read(cpu_tlbstate.need_tlbsync);
> +}
> +
> +static inline void cpu_set_tlbsync(bool state)
> +{
> + this_cpu_write(cpu_tlbstate.need_tlbsync, state);
> +}
> #else
> static inline u16 mm_global_asid(struct mm_struct *mm) { return 0; }
> static inline void mm_init_global_asid(struct mm_struct *mm) { }
> static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) { }
> static inline bool mm_in_asid_transition(struct mm_struct *mm) { return false; }
> +static inline bool cpu_need_tlbsync(void) { return false; }
> +static inline void cpu_set_tlbsync(bool state) { }
> #endif /* CONFIG_BROADCAST_TLB_FLUSH */
>
> #ifdef CONFIG_PARAVIRT
> @@ -346,21 +361,15 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
> return atomic64_inc_return(&mm->context.tlb_gen);
> }
>
> -static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
> - struct mm_struct *mm,
> - unsigned long uaddr)
> -{
> - inc_mm_tlb_gen(mm);
> - cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
> - mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
> -}
> -
> static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm)
> {
> flush_tlb_mm(mm);
> }
>
> extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
> +extern void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
> + struct mm_struct *mm,
> + unsigned long uaddr);
>
> static inline bool pte_flags_need_flush(unsigned long oldflags,
> unsigned long newflags,
> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> index 0efd99053c09..83ba6876adbf 100644
> --- a/arch/x86/mm/tlb.c
> +++ b/arch/x86/mm/tlb.c
> @@ -492,6 +492,37 @@ static void finish_asid_transition(struct flush_tlb_info *info)
> mm_clear_asid_transition(mm);
> }
>
> +static inline void tlbsync(void)
> +{
> + if (cpu_need_tlbsync()) {
> + __tlbsync();
> + cpu_set_tlbsync(false);
> + }
> +}
> +
> +static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid,
> + unsigned long addr,
> + u16 nr, bool pmd_stride)
> +{
> + __invlpgb_flush_user_nr_nosync(pcid, addr, nr, pmd_stride);
> + if (!cpu_need_tlbsync())
> + cpu_set_tlbsync(true);
> +}
> +
> +static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid)
> +{
> + __invlpgb_flush_single_pcid_nosync(pcid);
> + if (!cpu_need_tlbsync())
> + cpu_set_tlbsync(true);
> +}
> +
> +static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr)
> +{
> + __invlpgb_flush_addr_nosync(addr, nr);
> + if (!cpu_need_tlbsync())
> + cpu_set_tlbsync(true);
> +}
One thought on these:
Instead of having three functions:
1. A raw __invlpgb_*_nosync()
2. A wrapper invlpgb_*_nosync() that flips cpu_set_tlbsync()
3. A wrapper invlpgb_*()
Could we get away with just two? For instance, what if we had *ALL*
__invlpgb()'s do cpu_set_tlbsync()? Then we'd universally call tlbsync().
static inline void invlpgb_flush_all_nonglobals(void)
{
guard(preempt)();
__invlpgb(0, 0, 0, 1, NO_STRIDE, INVLPGB_MODE_ALL_NONGLOBALS);
tlbsync();
}
Then we wouldn't need any of those three new wrappers. The only downside
is that we'd end up with paths that logically do:
__invlpgb()
cpu_set_tlbsync(true);
if (cpu_need_tlbsync()) { // always true
__tlbsync();
cpu_set_tlbsync(false);
}
In other words, a possibly superfluous set/check/clear of the
"need_tlbsync" state. But I'd expect that to be a pittance compared to
the actual cost of INVLPGB/TLBSYNC.
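For concreteness, a minimal sketch of one consolidated helper, assuming the
cpu_set_tlbsync() call simply moves into the __invlpgb_*_nosync() variant
(the shape the thread converges on below):

/* Flush all mappings for a given PCID, not including globals. */
static inline void __invlpgb_flush_single_pcid_nosync(unsigned long pcid)
{
	__invlpgb(0, pcid, 0, 1, 0, INVLPGB_PCID);
	/* This CPU now owes a TLBSYNC before the flush can be relied upon. */
	cpu_set_tlbsync(true);
}

Callers then pair it with a plain tlbsync(), which is a no-op when nothing
is pending.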
> static void broadcast_tlb_flush(struct flush_tlb_info *info)
> {
> bool pmd = info->stride_shift == PMD_SHIFT;
> @@ -790,6 +821,8 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
> if (IS_ENABLED(CONFIG_PROVE_LOCKING))
> WARN_ON_ONCE(!irqs_disabled());
>
> + tlbsync();
This one is in dire need of comments.
> /*
> * Verify that CR3 is what we think it is. This will catch
> * hypothetical buggy code that directly switches to swapper_pg_dir
> @@ -966,6 +999,8 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
> */
> void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
> {
> + tlbsync();
Ditto, *especially* if this hits the init_mm state. There really
shouldn't be any deferred flushes for the init_mm.
> if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
> return;
>
> @@ -1633,9 +1668,7 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
> * a local TLB flush is needed. Optimize this use-case by calling
> * flush_tlb_func_local() directly in this case.
> */
> - if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
> - invlpgb_flush_all_nonglobals();
> - } else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
> + if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
> flush_tlb_multi(&batch->cpumask, info);
> } else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
> lockdep_assert_irqs_enabled();
> @@ -1644,12 +1677,52 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
> local_irq_enable();
> }
>
> + /*
> + * If (asynchronous) INVLPGB flushes were issued, wait for them here.
> + * The cpumask above contains only CPUs that were running tasks
> + * not using broadcast TLB flushing.
> + */
> + tlbsync();
I have a suggested comment in the attached fixups. This makes it sound
like tasks are either INVLPGB *or* in ->cpumask. Transitioning mms can
be both, I think.
> cpumask_clear(&batch->cpumask);
>
> put_flush_tlb_info();
> put_cpu();
> }
>
> +void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
> + struct mm_struct *mm,
> + unsigned long uaddr)
> +{
> + u16 asid = mm_global_asid(mm);
> +
> + if (asid) {
> + invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false);
> + /* Do any CPUs supporting INVLPGB need PTI? */
> + if (cpu_feature_enabled(X86_FEATURE_PTI))
> + invlpgb_flush_user_nr_nosync(user_pcid(asid), uaddr, 1, false);
> +
> + /*
> + * Some CPUs might still be using a local ASID for this
> + * process, and require IPIs, while others are using the
> + * global ASID.
> + *
> + * In this corner case, both broadcast TLB invalidation
> + * and IPIs need to be sent. The IPIs will help
> + * stragglers transition to the broadcast ASID.
> + */
> + if (mm_in_asid_transition(mm))
> + asid = 0;
> + }
> +
> + if (!asid) {
> + inc_mm_tlb_gen(mm);
> + cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
> + }
> +
> + mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
> +}
I stuck some better comments in the patch. I think this should both
mention that a later tlbsync() is required and *also* match the two
cases better.
Also, the "if (asid)" isn't super nice naming. Let's please call it
"global_asid" because it's either a global ASID or 0.
On Mon, Mar 03, 2025 at 01:47:42PM -0800, Dave Hansen wrote:
> > static void broadcast_tlb_flush(struct flush_tlb_info *info)
> > {
> > bool pmd = info->stride_shift == PMD_SHIFT;
> > @@ -790,6 +821,8 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
> > if (IS_ENABLED(CONFIG_PROVE_LOCKING))
> > WARN_ON_ONCE(!irqs_disabled());
> >
> > + tlbsync();
>
> This one is in dire need of comments.
I have been advocating moving this to a place where it's clearer. I
also think it needs comments:
https://lore.kernel.org/all/CA+i-1C31TrceZiizC_tng_cc-zcvKsfXLAZD_XDftXnp9B2Tdw@mail.gmail.com/
On Tue, Mar 04, 2025 at 12:52:47PM +0000, Brendan Jackman wrote:
> https://lore.kernel.org/all/CA+i-1C31TrceZiizC_tng_cc-zcvKsfXLAZD_XDftXnp9B2Tdw@mail.gmail.com/
Lemme try to understand what you're suggesting on that subthread:
> static inline void arch_start_context_switch(struct task_struct *prev)
> {
> arch_paravirt_start_context_switch(prev);
> tlb_start_context_switch(prev);
> }
This kinda makes sense to me...
> Now I think about it... if we always tlbsync() before a context switch, is the
> cant_migrate() above actually required? I think with that, even if we migrated
> in the middle of e.g. broadcast_kernel_range_flush(), we'd be fine? (At
> least, from the specific perspective of the invplgb code, presumably having
> preemption on there would break things horribly in other ways).
I think we still need it because you need to TLBSYNC on the same CPU you've
issued the INVLPGB and actually, you want all TLBs to have been synched
system-wide.
Or am I misunderstanding it?
Anything else I missed?
Btw, I just sent v15 - if you wanna continue commenting there...
https://lore.kernel.org/r/20250304135816.12356-1-bp@kernel.org
Thx.
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On Tue, Mar 04, 2025 at 03:11:34PM +0100, Borislav Petkov wrote:
> On Tue, Mar 04, 2025 at 12:52:47PM +0000, Brendan Jackman wrote:
> > https://lore.kernel.org/all/CA+i-1C31TrceZiizC_tng_cc-zcvKsfXLAZD_XDftXnp9B2Tdw@mail.gmail.com/
>
> Lemme try to understand what you're suggesting on that subthread:
>
> > static inline void arch_start_context_switch(struct task_struct *prev)
> > {
> > arch_paravirt_start_context_switch(prev);
> > tlb_start_context_switch(prev);
> > }
>
> This kinda makes sense to me...
Yeah so basically my concern here is that we are doing something
that's about context switching, but we're doing it in mm-switching
code, entangling an assumption that "context_switch() must either call
this function or that function". Whereas if we just call it explicitly
from context_switch() it will be much clearer.
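As a rough illustration, the tlb_start_context_switch() helper hypothesized
above would only need to drain this CPU's pending broadcast flushes:

static inline void tlb_start_context_switch(struct task_struct *prev)
{
	/*
	 * Hypothetical helper, not part of the posted patch: before
	 * switching away from @prev, wait for any INVLPGB flushes this CPU
	 * issued so none are left pending across a migration.
	 */
	tlbsync();
}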
> > Now I think about it... if we always tlbsync() before a context switch, is the
> > cant_migrate() above actually required? I think with that, even if we migrated
> > in the middle of e.g. broadcast_kernel_range_flush(), we'd be fine? (At
> > least, from the specific perspective of the invlpgb code, presumably having
> > preemption on there would break things horribly in other ways).
>
> I think we still need it because you need to TLBSYNC on the same CPU you've
> issued the INVLPGB and actually, you want all TLBs to have been synched
> system-wide.
>
> Or am I misunderstanding it?
Er, I might be exposing my own ignorance here. I was thinking that you
always go through context_switch() before you get migrated, but I
might not understand how migration happens.
On 3/4/25 07:33, Brendan Jackman wrote:
> On Tue, Mar 04, 2025 at 03:11:34PM +0100, Borislav Petkov wrote:
>> On Tue, Mar 04, 2025 at 12:52:47PM +0000, Brendan Jackman wrote:
>>> https://lore.kernel.org/all/CA+i-1C31TrceZiizC_tng_cc-zcvKsfXLAZD_XDftXnp9B2Tdw@mail.gmail.com/
>>
>> Lemme try to understand what you're suggesting on that subthread:
>>
>>> static inline void arch_start_context_switch(struct task_struct *prev)
>>> {
>>> arch_paravirt_start_context_switch(prev);
>>> tlb_start_context_switch(prev);
>>> }
>>
>> This kinda makes sense to me...
>
> Yeah so basically my concern here is that we are doing something
> that's about context switching, but we're doing it in mm-switching
> code, entangling an assumption that "context_switch() must either call
> this function or that function". Whereas if we just call it explicitly
> from context_switch() it will be much clearer.
I was coming to a similar conclusion. All of the nastiness here would
come from an operation like:
INVLPGB
=> get scheduled on another CPU
TLBSYNC
But there's no nastiness with:
INVLPGB
=> switch to init_mm
TLBSYNC
at *all*. Because the TLBSYNC still works just fine. In fact, it *has*
to work just fine because you can get a TLB flush IPI in that window
already.
>>> Now I think about it... if we always tlbsync() before a context switch, is the
>>> cant_migrate() above actually required? I think with that, even if we migrated
>>> in the middle of e.g. broadcast_kernel_range_flush(), we'd be fine? (At
>>> least, from the specific perspective of the invlpgb code, presumably having
>>> preemption on there would break things horribly in other ways).
>>
>> I think we still need it because you need to TLBSYNC on the same CPU you've
>> issued the INVLPGB and actually, you want all TLBs to have been synched
>> system-wide.
>>
>> Or am I misunderstanding it?
>
> Er, I might be exposing my own ignorance here. I was thinking that you
> always go through context_switch() before you get migrated, but I
> might not understand how migration happens.
Let's take a step back. Most of our IPI-based TLB flushes end up in this
code:
preempt_disable();
smp_call_function_many_cond(...);
preempt_enable();
We don't have any fanciness around to keep the initiating thread on the
same CPU or check for pending TLB flushes at context switch time or lazy
tlb entry. We don't send asynchronous IPIs from the tlbbatch code and
then check for completion at batch finish time.
There's a lot of complexity to stuffing these TLB flushes into a
microarchitectural buffer and making *sure* they are flushed out.
INVLPGB is not free. It's not clear at all to me that doing loads of
them in reclaim code is superior to doing loads of:
inc_mm_tlb_gen(mm);
cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
and then just zapping the whole TLB on the next context switch.
I think we should defer this for now. Let's look at it again when there
are some performance numbers to justify it.
On Mon, Mar 03, 2025 at 01:47:42PM -0800, Dave Hansen wrote:
> > +static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr)
> > +{
> > + __invlpgb_flush_addr_nosync(addr, nr);
> > + if (!cpu_need_tlbsync())
> > + cpu_set_tlbsync(true);
> > +}
>
> One thought on these:
>
> Instead of having three functions:
>
> 1. A raw __invlpgb_*_nosync()
> 2. A wrapper invlpgb_*_nosync() that flips cpu_set_tlbsync()
> 3. A wrapper invlpgb_*()
>
> Could we get away with just two? For instance, what if we had *ALL*
> __invlpgb()'s do cpu_set_tlbsync()? Then we'd universally call tlbsync().
>
> static inline void invlpgb_flush_all_nonglobals(void)
> {
> guard(preempt)();
> __invlpgb(0, 0, 0, 1, NO_STRIDE, INVLPGB_MODE_ALL_NONGLOBALS);
> tlbsync();
> }
>
> Then we wouldn't need any of those three new wrappers. The only downside
> is that we'd end up with paths that logically do:
>
> __invlpgb()
> cpu_set_tlbsync(true);
> if (cpu_need_tlbsync()) { // always true
> __tlbsync();
> cpu_set_tlbsync(false);
> }
>
> In other words, a possibly superfluous set/check/clear of the
> "need_tlbsync" state. But I'd expect that to be a pittance compared to
> the actual cost of INVLPGB/TLBSYNC.
Lemme see whether I can grasp properly what you mean:
What you really want is for the _nosync() variants to set need_tlbsync, right?
Because looking at all the call sites which do set tlbsync, the flow is this:
__invlpgb_flush_user_nr_nosync(pcid, addr, nr, pmd_stride, freed_tables);
if (!cpu_need_tlbsync())
cpu_set_tlbsync(true);
So we should move that
cpu_set_tlbsync(true);
into the __invlpgb_*_nosync() variant.
And in order to make it even simpler, we should drop the testing too:
IOW, this:
/* Flush all mappings for a given PCID, not including globals. */
static inline void __invlpgb_flush_single_pcid_nosync(unsigned long pcid)
{
__invlpgb(0, pcid, 0, 1, 0, INVLPGB_PCID);
cpu_set_tlbsync(true);
}
Right?
> > static void broadcast_tlb_flush(struct flush_tlb_info *info)
> > {
> > bool pmd = info->stride_shift == PMD_SHIFT;
> > @@ -790,6 +821,8 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
> > if (IS_ENABLED(CONFIG_PROVE_LOCKING))
> > WARN_ON_ONCE(!irqs_disabled());
> >
> > + tlbsync();
>
> This one is in dire need of comments.
Maybe this:
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 08672350536f..b97249ffff1f 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -822,6 +822,9 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
if (IS_ENABLED(CONFIG_PROVE_LOCKING))
WARN_ON_ONCE(!irqs_disabled());
+ /*
+ * Finish any remote TLB flushes pending from this CPU:
+ */
tlbsync();
/*
> Ditto, *especially* if this hits the init_mm state. There really
> shouldn't be any deferred flushes for the init_mm.
Basically what you said but as a comment. :-P
Merged in the rest.
Thx.
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On 3/4/25 03:52, Borislav Petkov wrote:
> On Mon, Mar 03, 2025 at 01:47:42PM -0800, Dave Hansen wrote:
...
> IOW, this:
>
> /* Flush all mappings for a given PCID, not including globals. */
> static inline void __invlpgb_flush_single_pcid_nosync(unsigned long pcid)
> {
> __invlpgb(0, pcid, 0, 1, 0, INVLPGB_PCID);
> cpu_set_tlbsync(true);
> }
>
> Right?
Yep, that works.
Optimizing out the writes like the old code did is certainly a good
thought. But I suspect the cacheline is hot the majority of the time.
>>> static void broadcast_tlb_flush(struct flush_tlb_info *info)
>>> {
>>> bool pmd = info->stride_shift == PMD_SHIFT;
>>> @@ -790,6 +821,8 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
>>> if (IS_ENABLED(CONFIG_PROVE_LOCKING))
>>> WARN_ON_ONCE(!irqs_disabled());
>>>
>>> + tlbsync();
>>
>> This one is in dire need of comments.
>
> Maybe this:
>
> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> index 08672350536f..b97249ffff1f 100644
> --- a/arch/x86/mm/tlb.c
> +++ b/arch/x86/mm/tlb.c
> @@ -822,6 +822,9 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
> if (IS_ENABLED(CONFIG_PROVE_LOCKING))
> WARN_ON_ONCE(!irqs_disabled());
>
> + /*
> + * Finish any remote TLB flushes pending from this CPU:
> + */
> tlbsync();
That's a prototypical "what" comment and not "why", though. It makes a
lot of sense that any flushes that the old task did should complete
before a new one gets activated. But I honestly can't think of a _specific_
problem that it causes.
I don't doubt that this does _some_ good, but I just don't know what
good it does. ;)