Instead of doing a system-wide TLB flush from arch_tlbbatch_flush,
queue up asynchronous, targeted flushes from arch_tlbbatch_add_pending.
This also allows us to avoid adding the CPUs of processes using broadcast
flushing to the batch->cpumask, and will hopefully further reduce TLB
flushing from the reclaim and compaction paths.
Signed-off-by: Rik van Riel <riel@surriel.com>
---
arch/x86/include/asm/tlbbatch.h | 1 +
arch/x86/include/asm/tlbflush.h | 12 ++------
arch/x86/mm/tlb.c | 54 +++++++++++++++++++++++++++++++--
3 files changed, 55 insertions(+), 12 deletions(-)
diff --git a/arch/x86/include/asm/tlbbatch.h b/arch/x86/include/asm/tlbbatch.h
index 1ad56eb3e8a8..f9a17edf63ad 100644
--- a/arch/x86/include/asm/tlbbatch.h
+++ b/arch/x86/include/asm/tlbbatch.h
@@ -10,6 +10,7 @@ struct arch_tlbflush_unmap_batch {
* the PFNs being flushed..
*/
struct cpumask cpumask;
+ bool used_invlpgb;
};
#endif /* _ARCH_X86_TLBBATCH_H */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 5eae5c1aafa5..e5516afdef7d 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -358,21 +358,15 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
return atomic64_inc_return(&mm->context.tlb_gen);
}
-static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
- struct mm_struct *mm,
- unsigned long uaddr)
-{
- inc_mm_tlb_gen(mm);
- cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
- mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
-}
-
static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm)
{
flush_tlb_mm(mm);
}
extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
+extern void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
+ struct mm_struct *mm,
+ unsigned long uaddr);
static inline bool pte_flags_need_flush(unsigned long oldflags,
unsigned long newflags,
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index bfc69ae4ea40..81f847c94321 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1659,9 +1659,7 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
* a local TLB flush is needed. Optimize this use-case by calling
* flush_tlb_func_local() directly in this case.
*/
- if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
- invlpgb_flush_all_nonglobals();
- } else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
+ if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
flush_tlb_multi(&batch->cpumask, info);
} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
lockdep_assert_irqs_enabled();
@@ -1670,12 +1668,62 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
local_irq_enable();
}
+ /*
+ * If we issued (asynchronous) INVLPGB flushes, wait for them here.
+ * The cpumask above contains only CPUs that were running tasks
+ * not using broadcast TLB flushing.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB) && batch->used_invlpgb) {
+ tlbsync();
+ migrate_enable();
+ batch->used_invlpgb = false;
+ }
+
cpumask_clear(&batch->cpumask);
put_flush_tlb_info();
put_cpu();
}
+void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
+ struct mm_struct *mm,
+ unsigned long uaddr)
+{
+ if (static_cpu_has(X86_FEATURE_INVLPGB) && mm_global_asid(mm)) {
+ u16 asid = mm_global_asid(mm);
+ /*
+ * Queue up an asynchronous invalidation. The corresponding
+ * TLBSYNC is done in arch_tlbbatch_flush(), and must be done
+ * on the same CPU.
+ */
+ if (!batch->used_invlpgb) {
+ batch->used_invlpgb = true;
+ migrate_disable();
+ }
+ invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false);
+ /* Do any CPUs supporting INVLPGB need PTI? */
+ if (static_cpu_has(X86_FEATURE_PTI))
+ invlpgb_flush_user_nr_nosync(user_pcid(asid), uaddr, 1, false);
+
+ /*
+ * Some CPUs might still be using a local ASID for this
+ * process, and require IPIs, while others are using the
+ * global ASID.
+ *
+ * In this corner case we need to do both the broadcast
+ * TLB invalidation, and send IPIs. The IPIs will help
+ * stragglers transition to the broadcast ASID.
+ */
+ if (READ_ONCE(mm->context.asid_transition))
+ goto also_send_ipi;
+ } else {
+also_send_ipi:
+ inc_mm_tlb_gen(mm);
+ cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
+ }
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
+}
+
/*
* Blindly accessing user memory from NMI context can be dangerous
* if we're in the middle of switching the current user task or
--
2.47.1
On 16/01/2025 4:30, Rik van Riel wrote:
> Instead of doing a system-wide TLB flush from arch_tlbbatch_flush,
> queue up asynchronous, targeted flushes from arch_tlbbatch_add_pending.
> [snip]
> --- a/arch/x86/mm/tlb.c
> +++ b/arch/x86/mm/tlb.c
> @@ -1659,9 +1659,7 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
>  	 * a local TLB flush is needed. Optimize this use-case by calling
>  	 * flush_tlb_func_local() directly in this case.
>  	 */
> -	if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
> -		invlpgb_flush_all_nonglobals();
> -	} else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
> +	if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
>  		flush_tlb_multi(&batch->cpumask, info);
>  	} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
>  		lockdep_assert_irqs_enabled();
> @@ -1670,12 +1668,62 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
>  		local_irq_enable();
>  	}
> 
> +	/*
> +	 * If we issued (asynchronous) INVLPGB flushes, wait for them here.
> +	 * The cpumask above contains only CPUs that were running tasks
> +	 * not using broadcast TLB flushing.
> +	 */
> +	if (cpu_feature_enabled(X86_FEATURE_INVLPGB) && batch->used_invlpgb) {
> +		tlbsync();
> +		migrate_enable();

Maybe someone mentioned it before, but I would emphasize that I do not
think that preventing migration for a potentially long time is that great.

One alternative solution would be to set a bit on cpu_tlbstate that, when
set, makes you issue a tlbsync on context switch.

(I can think of other solutions, but I think the one I just mentioned is
the cleanest one.)

> +		batch->used_invlpgb = false;
> +	}
> +
>  	cpumask_clear(&batch->cpumask);
> 
>  	put_flush_tlb_info();
>  	put_cpu();
>  }
> 
> +void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
> +			       struct mm_struct *mm,
> +			       unsigned long uaddr)
> +{
> +	if (static_cpu_has(X86_FEATURE_INVLPGB) && mm_global_asid(mm)) {
> +		u16 asid = mm_global_asid(mm);
> +		/*
> +		 * Queue up an asynchronous invalidation. The corresponding
> +		 * TLBSYNC is done in arch_tlbbatch_flush(), and must be done
> +		 * on the same CPU.
> +		 */
> +		if (!batch->used_invlpgb) {
> +			batch->used_invlpgb = true;
> +			migrate_disable();

See my comment above...

> +		}
> +		invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false);
> +		/* Do any CPUs supporting INVLPGB need PTI? */
> +		if (static_cpu_has(X86_FEATURE_PTI))
> +			invlpgb_flush_user_nr_nosync(user_pcid(asid), uaddr, 1, false);
> +
> +		/*
> +		 * Some CPUs might still be using a local ASID for this
> +		 * process, and require IPIs, while others are using the
> +		 * global ASID.
> +		 *
> +		 * In this corner case we need to do both the broadcast
> +		 * TLB invalidation, and send IPIs. The IPIs will help
> +		 * stragglers transition to the broadcast ASID.
> +		 */
> +		if (READ_ONCE(mm->context.asid_transition))
> +			goto also_send_ipi;
> +	} else {
> +also_send_ipi:

I really think you should avoid such gotos. A simple bool variable like
"need_ipi" would suffice.

> +		inc_mm_tlb_gen(mm);
> +		cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
> +	}
> +	mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
> +}
> +
>  /*
>   * Blindly accessing user memory from NMI context can be dangerous
>   * if we're in the middle of switching the current user task or
On Mon, 2025-01-20 at 11:56 +0200, Nadav Amit wrote:
> 
> > @@ -1670,12 +1668,62 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
> >  		local_irq_enable();
> >  	}
> > 
> > +	/*
> > +	 * If we issued (asynchronous) INVLPGB flushes, wait for them here.
> > +	 * The cpumask above contains only CPUs that were running tasks
> > +	 * not using broadcast TLB flushing.
> > +	 */
> > +	if (cpu_feature_enabled(X86_FEATURE_INVLPGB) && batch->used_invlpgb) {
> > +		tlbsync();
> > +		migrate_enable();
> 
> Maybe someone mentioned it before, but I would emphasize that I do not
> think that preventing migration for a potentially long time is that great.
> 
> One alternative solution would be to set a bit on cpu_tlbstate that, when
> set, makes you issue a tlbsync on context switch.
> 
> (I can think of other solutions, but I think the one I just mentioned is
> the cleanest one.)

It is clean, but I'm not convinced it is good enough.

We need to guarantee that the INVLPGBs have finished
before we free the pages.

Running a TLBSYNC at the next context switch could
mean that TLBSYNC won't run until after the pages
have been freed.

In practice it is probably good enough, since it
would be simpler for TLBSYNC to return once all
pending (older) INVLPGBs have finished, but it's
not architecturally guaranteed.

We could send an IPI to remote CPUs in order for
them to call TLBSYNC, but is that really better?

-- 
All Rights Reversed.
On 20/01/2025 16:02, Rik van Riel wrote:
> On Mon, 2025-01-20 at 11:56 +0200, Nadav Amit wrote:
>>
>>> @@ -1670,12 +1668,62 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
>>>  		local_irq_enable();
>>>  	}
>>>
>>> +	/*
>>> +	 * If we issued (asynchronous) INVLPGB flushes, wait for them here.
>>> +	 * The cpumask above contains only CPUs that were running tasks
>>> +	 * not using broadcast TLB flushing.
>>> +	 */
>>> +	if (cpu_feature_enabled(X86_FEATURE_INVLPGB) && batch->used_invlpgb) {
>>> +		tlbsync();
>>> +		migrate_enable();
>>
>> Maybe someone mentioned it before, but I would emphasize that I do not
>> think that preventing migration for a potentially long time is that great.
>>
>> One alternative solution would be to set a bit on cpu_tlbstate that, when
>> set, makes you issue a tlbsync on context switch.
>>
>> (I can think of other solutions, but I think the one I just mentioned is
>> the cleanest one.)
> 
> It is clean, but I'm not convinced it is good enough.
> 
> We need to guarantee that the INVLPGBs have finished
> before we free the pages.
> 
> Running a TLBSYNC at the next context switch could
> mean that TLBSYNC won't run until after the pages
> have been freed.
> 
> In practice it is probably good enough, since it
> would be simpler for TLBSYNC to return once all
> pending (older) INVLPGBs have finished, but it's
> not architecturally guaranteed.
> 
> We could send an IPI to remote CPUs in order for
> them to call TLBSYNC, but is that really better?

I am not sure we are on the same page. What I suggested is:

1. arch_tlbbatch_flush() would still do tlbsync()
2. No migrate_enable() in arch_tlbbatch_flush()
3. No migrate_disable() in arch_tlbbatch_add_pending()
4. arch_tlbbatch_add_pending() sets cpu_tlbstate.pending_tlb_broadcast
5. switch_mm_irqs_off() checks cpu_tlbstate.pending_tlb_broadcast and,
   if it is set, performs tlbsync and clears it.
On Mon, 2025-01-20 at 16:14 +0200, Nadav Amit wrote:
> 
> I am not sure we are on the same page. What I suggested is:
> 
> 1. arch_tlbbatch_flush() would still do tlbsync()
> 2. No migrate_enable() in arch_tlbbatch_flush()
> 3. No migrate_disable() in arch_tlbbatch_add_pending()
> 4. arch_tlbbatch_add_pending() sets cpu_tlbstate.pending_tlb_broadcast
> 5. switch_mm_irqs_off() checks cpu_tlbstate.pending_tlb_broadcast and,
>    if it is set, performs tlbsync and clears it.
> 
How does that synchronize the page freeing (from page
reclaim) with the TLBSYNCs?

What guarantees that the page reclaim path won't free
the pages until after TLBSYNC has completed on the CPUs
that kicked off asynchronous flushes with INVLPGB?

-- 
All Rights Reversed.
On 20/01/2025 18:11, Rik van Riel wrote:
> On Mon, 2025-01-20 at 16:14 +0200, Nadav Amit wrote:
>>
>> I am not sure we are on the same page. What I suggested is:
>>
>> 1. arch_tlbbatch_flush() would still do tlbsync()
>> 2. No migrate_enable() in arch_tlbbatch_flush()
>> 3. No migrate_disable() in arch_tlbbatch_add_pending()
>> 4. arch_tlbbatch_add_pending() sets cpu_tlbstate.pending_tlb_broadcast
>> 5. switch_mm_irqs_off() checks cpu_tlbstate.pending_tlb_broadcast and,
>>    if it is set, performs tlbsync and clears it.
>>
> How does that synchronize the page freeing (from page
> reclaim) with the TLBSYNCs?
> 
> What guarantees that the page reclaim path won't free
> the pages until after TLBSYNC has completed on the CPUs
> that kicked off asynchronous flushes with INVLPGB?

[ you make me lose my confidence, although I see nothing wrong ]

Freeing the pages must be done after the TLBSYNC. I did not imply that
needs to be changed.

The page freeing (and reclaim) path is only initiated after
arch_tlbbatch_flush() has completed. If no migration happened, since we
did not remove any tlbsync, it should be fine.

If migration did happen, and some INVLPGBs were already issued, then for
correctness we need to issue a tlbsync before the task might be scheduled
on another core. That is exactly why adding tlbsync to
switch_mm_irqs_off() is needed in such a case.
On Mon, 2025-01-20 at 19:09 +0200, Nadav Amit wrote:
> 
> On 20/01/2025 18:11, Rik van Riel wrote:
> > 
> > What guarantees that the page reclaim path won't free
> > the pages until after TLBSYNC has completed on the CPUs
> > that kicked off asynchronous flushes with INVLPGB?
> 
> [ you make me lose my confidence, although I see nothing wrong ]
> 
> Freeing the pages must be done after the TLBSYNC. I did not imply that
> needs to be changed.
> 
> The page freeing (and reclaim) path is only initiated after
> arch_tlbbatch_flush() has completed. If no migration happened, since we
> did not remove any tlbsync, it should be fine.
> 
> If migration did happen, and some INVLPGBs were already issued, then for
> correctness we need to issue a tlbsync before the task might be scheduled
> on another core. That is exactly why adding tlbsync to
> switch_mm_irqs_off() is needed in such a case.

This is the page reclaim code.

The process that has those other pages mapped might be
running on other CPUs simultaneously with the page
reclaim code.

Even if we were invalidating one of our own pages this
way, there could be other threads in the same process,
running while we are in the page reclaim code.

-- 
All Rights Reversed.
[ Thanks for your patience. ]

> On 20 Jan 2025, at 19:11, Rik van Riel <riel@surriel.com> wrote:
> 
> On Mon, 2025-01-20 at 19:09 +0200, Nadav Amit wrote:
> 
> This is the page reclaim code.
> 
> The process that has those other pages mapped might be
> running on other CPUs simultaneously with the page
> reclaim code.
> 
> Even if we were invalidating one of our own pages this
> way, there could be other threads in the same process,
> running while we are in the page reclaim code.

Of course, but there is nothing new here. Let me see where we lose
each other by first stating the goal, what you propose, and what I
suggest.

We are issuing invlpgb and we need to ensure tlbsync on the same core
that initiated invlpgb before arch_tlbbatch_flush() finishes. That’s
all that matters for our discussion (correct me if I miss something).

You solved it by disabling migration and running tlbsync at the end.

I suggest *not* to disable migration, to keep running tlbsync at the
arch_tlbbatch_flush() as you do, and if context-switch happens after
arch_tlbbatch_add_pending() and before arch_tlbbatch_flush(), to run
tlbsync during the context switch.
On Mon, 2025-01-20 at 19:50 +0200, Nadav Amit wrote:
> [ Thanks for your patience. ]
> 
> > On 20 Jan 2025, at 19:11, Rik van Riel <riel@surriel.com> wrote:
> > 
> > On Mon, 2025-01-20 at 19:09 +0200, Nadav Amit wrote:
> > 
> > This is the page reclaim code.
> > 
> > The process that has those other pages mapped might be
> > running on other CPUs simultaneously with the page
> > reclaim code.
> > 
> > Even if we were invalidating one of our own pages this
> > way, there could be other threads in the same process,
> > running while we are in the page reclaim code.
> 
> Of course, but there is nothing new here. Let me see where we lose
> each other by first stating the goal, what you propose, and what I
> suggest.
> 
> We are issuing invlpgb and we need to ensure tlbsync on the same core
> that initiated invlpgb before arch_tlbbatch_flush() finishes. That’s
> all that matters for our discussion (correct me if I miss something).
> 
> You solved it by disabling migration and running tlbsync at the end.
> 
> I suggest *not* to disable migration, to keep running tlbsync at the
> arch_tlbbatch_flush() as you do, and if context-switch happens after
> arch_tlbbatch_add_pending() and before arch_tlbbatch_flush(), to run
> tlbsync during the context switch.
> 
How would you keep track of CPUs where the tlbsync
has NOT happened before arch_tlbbatch_flush()?

That part seems to be missing still.

-- 
All Rights Reversed.
> On 20 Jan 2025, at 19:56, Rik van Riel <riel@surriel.com> wrote:
> 
> How would you keep track of CPUs where the tlbsync
> has NOT happened before arch_tlbbatch_flush()?
> 
> That part seems to be missing still.

You only keep track of whether there is a pending tlbsync on *your* CPU.
There is no need to track whether other CPUs issued a tlbsync during
arch_tlbbatch_add_pending().

If the process that does the reclamation was migrated, a TLBSYNC is
issued during the context switch, before the thread that does the
reclamation has any chance of being scheduled.

I hope these code changes on top of yours make it clearer:

> +void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
> +			       struct mm_struct *mm,
> +			       unsigned long uaddr)
> +{
> +	if (static_cpu_has(X86_FEATURE_INVLPGB) && mm_global_asid(mm)) {
> +		u16 asid = mm_global_asid(mm);
> +		/*
> +		 * Queue up an asynchronous invalidation. The corresponding
> +		 * TLBSYNC is done in arch_tlbbatch_flush(), and must be done
> +		 * on the same CPU.
> +		 */

#if 0 // remove
> +		if (!batch->used_invlpgb) {
> +			batch->used_invlpgb = true;
> +			migrate_disable();
> +		}
#endif
		batch->used_invlpgb = true;
		preempt_disable();

> +		invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false);
> +		/* Do any CPUs supporting INVLPGB need PTI? */
> +		if (static_cpu_has(X86_FEATURE_PTI))
> +			invlpgb_flush_user_nr_nosync(user_pcid(asid), uaddr, 1, false);

		this_cpu_write(cpu_tlbstate.pending_tlbsync, true);
		preempt_enable();

> +
> +		/*
> +		 * Some CPUs might still be using a local ASID for this
> +		 * process, and require IPIs, while others are using the
> +		 * global ASID.
> +		 *
> +		 * In this corner case we need to do both the broadcast
> +		 * TLB invalidation, and send IPIs. The IPIs will help
> +		 * stragglers transition to the broadcast ASID.
> +		 */
> +		if (READ_ONCE(mm->context.asid_transition))
> +			goto also_send_ipi;
> +	} else {
> +also_send_ipi:
> +		inc_mm_tlb_gen(mm);
> +		cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
> +	}
> +	mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
> +}
> +

Then in switch_mm_irqs_off():

	if (this_cpu_read(cpu_tlbstate.pending_tlbsync))
		tlbsync();

Note that when switch_mm_irqs_off() is called due to a context switch from
context_switch(), finish_task_switch() has still not taken place, so the
task cannot be scheduled on other cores.
On Mon, 2025-01-20 at 20:56 +0200, Nadav Amit wrote:
> 
> Then in switch_mm_irqs_off():
> 
> 	if (this_cpu_read(cpu_tlbstate.pending_tlbsync))
> 		tlbsync();
> 
> Note that when switch_mm_irqs_off() is called due to a context switch
> from context_switch(), finish_task_switch() has still not taken place,
> so the task cannot be scheduled on other cores.
> 
That would certainly work. I'll add that as a separate
patch, which the x86 maintainers can either accept or
leave out.

I'm not sure what's worse, making reclaim non-migratable,
or adding an extra branch (we can probably avoid a cache
line miss) to switch_mm_irqs_off.

-- 
All Rights Reversed.
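[Editor's note] To summarize the conclusion for readers skimming the archive: the
thread converges on dropping migrate_disable()/migrate_enable() and instead having
the CPU that issues INVLPGB record a per-CPU flag, which switch_mm_irqs_off() drains
with TLBSYNC before the outgoing task can run elsewhere. The sketch below is
illustrative only; the field name pending_tlbsync and the helper names are
assumptions based on Nadav's sketch above, not the final upstream code.

	/*
	 * Illustrative sketch only: pending_tlbsync and the helper names
	 * below are assumptions taken from the discussion, not the final
	 * upstream implementation.
	 */

	/* Per-CPU state, added to struct tlb_state (cpu_tlbstate). */
	struct tlb_state {
		/* ... existing fields ... */
		bool pending_tlbsync;	/* INVLPGB issued, no TLBSYNC yet */
	};

	/* Called from arch_tlbbatch_add_pending(); no migrate_disable() needed. */
	static void queue_broadcast_invalidation(u16 asid, unsigned long uaddr)
	{
		/* Keep the INVLPGB and the flag update on the same CPU. */
		preempt_disable();
		invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false);
		this_cpu_write(cpu_tlbstate.pending_tlbsync, true);
		preempt_enable();
	}

	/*
	 * Called from switch_mm_irqs_off(): if this CPU still owes a TLBSYNC,
	 * issue it before the outgoing task can be scheduled on another CPU.
	 * arch_tlbbatch_flush() still runs its own tlbsync() before the
	 * reclaimed pages are freed.
	 */
	static void tlbsync_if_pending(void)
	{
		if (this_cpu_read(cpu_tlbstate.pending_tlbsync)) {
			tlbsync();
			this_cpu_write(cpu_tlbstate.pending_tlbsync, false);
		}
	}

The cost is the extra branch Rik mentions; since switch_mm_irqs_off() already
touches cpu_tlbstate, the flag can live on a cache line that path reads anyway.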