targeted TLB sync IPIs for lockless page table walkers

[PATCH v4 1/3] mm: use targeted IPIs for TLB sync with lockless page table walkers

Posted by Lance Yang 5 days, 13 hours ago

From: Lance Yang <lance.yang@linux.dev>

Currently, tlb_remove_table_sync_one() broadcasts IPIs to all CPUs to wait
for any concurrent lockless page table walkers (e.g., GUP-fast). This is
inefficient on systems with many CPUs, especially for RT workloads[1].

This patch introduces a per-CPU tracking mechanism to record which CPUs are
actively performing lockless page table walks for a specific mm_struct.
When freeing/unsharing page tables, we can now send IPIs only to the CPUs
that are actually walking that mm, instead of broadcasting to all CPUs.

In preparation for targeted IPIs; a follow-up will switch callers to
tlb_remove_table_sync_mm().

Note that the tracking adds ~3% latency to GUP-fast, as measured on a
64-core system.

[1] https://lore.kernel.org/linux-mm/1b27a3fa-359a-43d0-bdeb-c31341749367@kernel.org/

Suggested-by: David Hildenbrand (Red Hat) <david@kernel.org>
Signed-off-by: Lance Yang <lance.yang@linux.dev>
---
 include/asm-generic/tlb.h |  2 ++
 include/linux/mm.h        | 34 ++++++++++++++++++++++++++
 kernel/events/core.c      |  2 ++
 mm/gup.c                  |  2 ++
 mm/mmu_gather.c           | 50 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 90 insertions(+)

diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 4aeac0c3d3f0..b6b06e6b879f 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -250,6 +250,7 @@ static inline void tlb_remove_table(struct mmu_gather *tlb, void *table)
 #endif
 
 void tlb_remove_table_sync_one(void);
+void tlb_remove_table_sync_mm(struct mm_struct *mm);
 
 #else
 
@@ -258,6 +259,7 @@ void tlb_remove_table_sync_one(void);
 #endif
 
 static inline void tlb_remove_table_sync_one(void) { }
+static inline void tlb_remove_table_sync_mm(struct mm_struct *mm) { }
 
 #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f8a8fd47399c..d92df995fcd1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2995,6 +2995,40 @@ long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end,
 		      pgoff_t *offset);
 int folio_add_pins(struct folio *folio, unsigned int pins);
 
+/*
+ * Track CPUs doing lockless page table walks to avoid broadcast IPIs
+ * during TLB flushes.
+ */
+DECLARE_PER_CPU(struct mm_struct *, active_lockless_pt_walk_mm);
+
+static inline void pt_walk_lockless_start(struct mm_struct *mm)
+{
+	lockdep_assert_irqs_disabled();
+
+	/*
+	 * Tell other CPUs we're doing lockless page table walk.
+	 *
+	 * Full barrier needed to prevent page table reads from being
+	 * reordered before this write.
+	 *
+	 * Pairs with smp_rmb() in tlb_remove_table_sync_mm().
+	 */
+	this_cpu_write(active_lockless_pt_walk_mm, mm);
+	smp_mb();
+}
+
+static inline void pt_walk_lockless_end(void)
+{
+	lockdep_assert_irqs_disabled();
+
+	/*
+	 * Clear the pointer so other CPUs no longer see this CPU as walking
+	 * the mm. Use smp_store_release to ensure page table reads complete
+	 * before the clear is visible to other CPUs.
+	 */
+	smp_store_release(this_cpu_ptr(&active_lockless_pt_walk_mm), NULL);
+}
+
 int get_user_pages_fast(unsigned long start, int nr_pages,
 			unsigned int gup_flags, struct page **pages);
 int pin_user_pages_fast(unsigned long start, int nr_pages,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5b5cb620499e..6539112c28ff 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8190,7 +8190,9 @@ static u64 perf_get_page_size(unsigned long addr)
 		mm = &init_mm;
 	}
 
+	pt_walk_lockless_start(mm);
 	size = perf_get_pgtable_size(mm, addr);
+	pt_walk_lockless_end();
 
 	local_irq_restore(flags);
 
diff --git a/mm/gup.c b/mm/gup.c
index 8e7dc2c6ee73..6748e28b27f2 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -3154,7 +3154,9 @@ static unsigned long gup_fast(unsigned long start, unsigned long end,
 	 * that come from callers of tlb_remove_table_sync_one().
 	 */
 	local_irq_save(flags);
+	pt_walk_lockless_start(current->mm);
 	gup_fast_pgd_range(start, end, gup_flags, pages, &nr_pinned);
+	pt_walk_lockless_end();
 	local_irq_restore(flags);
 
 	/*
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 2faa23d7f8d4..35c89e4b6230 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -285,6 +285,56 @@ void tlb_remove_table_sync_one(void)
 	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
 }
 
+DEFINE_PER_CPU(struct mm_struct *, active_lockless_pt_walk_mm);
+EXPORT_PER_CPU_SYMBOL_GPL(active_lockless_pt_walk_mm);
+
+/**
+ * tlb_remove_table_sync_mm - send IPIs to CPUs doing lockless page table
+ * walk for @mm
+ *
+ * @mm: target mm; only CPUs walking this mm get an IPI.
+ *
+ * Like tlb_remove_table_sync_one() but only targets CPUs in
+ * active_lockless_pt_walk_mm.
+ */
+void tlb_remove_table_sync_mm(struct mm_struct *mm)
+{
+	cpumask_var_t target_cpus;
+	bool found_any = false;
+	int cpu;
+
+	if (WARN_ONCE(!mm, "NULL mm in %s\n", __func__)) {
+		tlb_remove_table_sync_one();
+		return;
+	}
+
+	/* If we can't, fall back to broadcast. */
+	if (!alloc_cpumask_var(&target_cpus, GFP_ATOMIC)) {
+		tlb_remove_table_sync_one();
+		return;
+	}
+
+	cpumask_clear(target_cpus);
+
+	/* Pairs with smp_mb() in pt_walk_lockless_start(). */
+	smp_rmb();
+
+	/* Find CPUs doing lockless page table walks for this mm */
+	for_each_online_cpu(cpu) {
+		if (per_cpu(active_lockless_pt_walk_mm, cpu) == mm) {
+			cpumask_set_cpu(cpu, target_cpus);
+			found_any = true;
+		}
+	}
+
+	/* Only send IPIs to CPUs actually doing lockless walks */
+	if (found_any)
+		smp_call_function_many(target_cpus, tlb_remove_table_smp_sync,
+				       NULL, 1);
+
+	free_cpumask_var(target_cpus);
+}
+
 static void tlb_remove_table_rcu(struct rcu_head *head)
 {
 	__tlb_remove_table_free(container_of(head, struct mmu_table_batch, rcu));
-- 
2.49.0

Re: [PATCH v4 1/3] mm: use targeted IPIs for TLB sync with lockless page table walkers

Posted by Peter Zijlstra 5 days, 11 hours ago

On Mon, Feb 02, 2026 at 03:45:55PM +0800, Lance Yang wrote:
> From: Lance Yang <lance.yang@linux.dev>
> 
> Currently, tlb_remove_table_sync_one() broadcasts IPIs to all CPUs to wait
> for any concurrent lockless page table walkers (e.g., GUP-fast). This is
> inefficient on systems with many CPUs, especially for RT workloads[1].
> 
> This patch introduces a per-CPU tracking mechanism to record which CPUs are
> actively performing lockless page table walks for a specific mm_struct.
> When freeing/unsharing page tables, we can now send IPIs only to the CPUs
> that are actually walking that mm, instead of broadcasting to all CPUs.
> 
> In preparation for targeted IPIs; a follow-up will switch callers to
> tlb_remove_table_sync_mm().
> 
> Note that the tracking adds ~3% latency to GUP-fast, as measured on a
> 64-core system.

What architecture, and that is acceptable?

> +/*
> + * Track CPUs doing lockless page table walks to avoid broadcast IPIs
> + * during TLB flushes.
> + */
> +DECLARE_PER_CPU(struct mm_struct *, active_lockless_pt_walk_mm);
> +
> +static inline void pt_walk_lockless_start(struct mm_struct *mm)
> +{
> +	lockdep_assert_irqs_disabled();
> +
> +	/*
> +	 * Tell other CPUs we're doing lockless page table walk.
> +	 *
> +	 * Full barrier needed to prevent page table reads from being
> +	 * reordered before this write.
> +	 *
> +	 * Pairs with smp_rmb() in tlb_remove_table_sync_mm().
> +	 */
> +	this_cpu_write(active_lockless_pt_walk_mm, mm);
> +	smp_mb();

One thing to try is something like:

	xchg(this_cpu_ptr(&active_lockless_pt_walk_mm), mm);

That *might* be a little better on x86_64, on anything else you really
don't want to use this_cpu_() ops when you *know* IRQs are already
disabled.

> +}
> +
> +static inline void pt_walk_lockless_end(void)
> +{
> +	lockdep_assert_irqs_disabled();
> +
> +	/*
> +	 * Clear the pointer so other CPUs no longer see this CPU as walking
> +	 * the mm. Use smp_store_release to ensure page table reads complete
> +	 * before the clear is visible to other CPUs.
> +	 */
> +	smp_store_release(this_cpu_ptr(&active_lockless_pt_walk_mm), NULL);
> +}
> +
>  int get_user_pages_fast(unsigned long start, int nr_pages,
>  			unsigned int gup_flags, struct page **pages);
>  int pin_user_pages_fast(unsigned long start, int nr_pages,

> diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
> index 2faa23d7f8d4..35c89e4b6230 100644
> --- a/mm/mmu_gather.c
> +++ b/mm/mmu_gather.c
> @@ -285,6 +285,56 @@ void tlb_remove_table_sync_one(void)
>  	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
>  }
>  
> +DEFINE_PER_CPU(struct mm_struct *, active_lockless_pt_walk_mm);
> +EXPORT_PER_CPU_SYMBOL_GPL(active_lockless_pt_walk_mm);

Why the heck is this exported? Both users are firmly core code.

> +/**
> + * tlb_remove_table_sync_mm - send IPIs to CPUs doing lockless page table
> + * walk for @mm
> + *
> + * @mm: target mm; only CPUs walking this mm get an IPI.
> + *
> + * Like tlb_remove_table_sync_one() but only targets CPUs in
> + * active_lockless_pt_walk_mm.
> + */
> +void tlb_remove_table_sync_mm(struct mm_struct *mm)
> +{
> +	cpumask_var_t target_cpus;
> +	bool found_any = false;
> +	int cpu;
> +
> +	if (WARN_ONCE(!mm, "NULL mm in %s\n", __func__)) {
> +		tlb_remove_table_sync_one();
> +		return;
> +	}
> +
> +	/* If we can't, fall back to broadcast. */
> +	if (!alloc_cpumask_var(&target_cpus, GFP_ATOMIC)) {
> +		tlb_remove_table_sync_one();
> +		return;
> +	}
> +
> +	cpumask_clear(target_cpus);
> +
> +	/* Pairs with smp_mb() in pt_walk_lockless_start(). */

Pairs how? The start thing does something like:

	[W] active_lockless_pt_walk_mm = mm
	MB
	[L] page-tables

So this is:

	[L] page-tables
	RMB
	[L] active_lockless_pt_walk_mm

?

> +	smp_rmb();
> +
> +	/* Find CPUs doing lockless page table walks for this mm */
> +	for_each_online_cpu(cpu) {
> +		if (per_cpu(active_lockless_pt_walk_mm, cpu) == mm) {
> +			cpumask_set_cpu(cpu, target_cpus);

You really don't need this to be atomic.

> +			found_any = true;
> +		}
> +	}
> +
> +	/* Only send IPIs to CPUs actually doing lockless walks */
> +	if (found_any)
> +		smp_call_function_many(target_cpus, tlb_remove_table_smp_sync,
> +				       NULL, 1);

Coding style wants { } here. Also, isn't this what we have
smp_call_function_many_cond() for?

> +	free_cpumask_var(target_cpus);
> +}
> +
>  static void tlb_remove_table_rcu(struct rcu_head *head)
>  {
>  	__tlb_remove_table_free(container_of(head, struct mmu_table_batch, rcu));
> -- 
> 2.49.0
>

Re: [PATCH v4 1/3] mm: use targeted IPIs for TLB sync with lockless page table walkers

Posted by Lance Yang 5 days, 9 hours ago

Hi Peter,

Thanks for taking time to review!

On 2026/2/2 17:42, Peter Zijlstra wrote:
> On Mon, Feb 02, 2026 at 03:45:55PM +0800, Lance Yang wrote:
>> From: Lance Yang <lance.yang@linux.dev>
>>
>> Currently, tlb_remove_table_sync_one() broadcasts IPIs to all CPUs to wait
>> for any concurrent lockless page table walkers (e.g., GUP-fast). This is
>> inefficient on systems with many CPUs, especially for RT workloads[1].
>>
>> This patch introduces a per-CPU tracking mechanism to record which CPUs are
>> actively performing lockless page table walks for a specific mm_struct.
>> When freeing/unsharing page tables, we can now send IPIs only to the CPUs
>> that are actually walking that mm, instead of broadcasting to all CPUs.
>>
>> In preparation for targeted IPIs; a follow-up will switch callers to
>> tlb_remove_table_sync_mm().
>>
>> Note that the tracking adds ~3% latency to GUP-fast, as measured on a
>> 64-core system.
> 
> What architecture, and that is acceptable?

x86-64.

I ran ./gup_bench which spawns 60 threads, each doing 500k GUP-fast
operations (pinning 8 pages per call) via the gup_test ioctl.

Results for pin pages:
- Before: avg 1.489s (10 runs)
- After:  avg 1.533s (10 runs)

Given we avoid broadcast IPIs on large systems, I think this is a
reasonable trade-off :)

> 
>> +/*
>> + * Track CPUs doing lockless page table walks to avoid broadcast IPIs
>> + * during TLB flushes.
>> + */
>> +DECLARE_PER_CPU(struct mm_struct *, active_lockless_pt_walk_mm);
>> +
>> +static inline void pt_walk_lockless_start(struct mm_struct *mm)
>> +{
>> +	lockdep_assert_irqs_disabled();
>> +
>> +	/*
>> +	 * Tell other CPUs we're doing lockless page table walk.
>> +	 *
>> +	 * Full barrier needed to prevent page table reads from being
>> +	 * reordered before this write.
>> +	 *
>> +	 * Pairs with smp_rmb() in tlb_remove_table_sync_mm().
>> +	 */
>> +	this_cpu_write(active_lockless_pt_walk_mm, mm);
>> +	smp_mb();
> 
> One thing to try is something like:
> 
> 	xchg(this_cpu_ptr(&active_lockless_pt_walk_mm), mm);
> 
> That *might* be a little better on x86_64, on anything else you really
> don't want to use this_cpu_() ops when you *know* IRQs are already
> disabled.

Ah, good to know that. Thanks!

IIUC, xchg() provides the full barrier we need ;)

> 
>> +}
>> +
>> +static inline void pt_walk_lockless_end(void)
>> +{
>> +	lockdep_assert_irqs_disabled();
>> +
>> +	/*
>> +	 * Clear the pointer so other CPUs no longer see this CPU as walking
>> +	 * the mm. Use smp_store_release to ensure page table reads complete
>> +	 * before the clear is visible to other CPUs.
>> +	 */
>> +	smp_store_release(this_cpu_ptr(&active_lockless_pt_walk_mm), NULL);
>> +}
>> +
>>   int get_user_pages_fast(unsigned long start, int nr_pages,
>>   			unsigned int gup_flags, struct page **pages);
>>   int pin_user_pages_fast(unsigned long start, int nr_pages,
> 
>> diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
>> index 2faa23d7f8d4..35c89e4b6230 100644
>> --- a/mm/mmu_gather.c
>> +++ b/mm/mmu_gather.c
>> @@ -285,6 +285,56 @@ void tlb_remove_table_sync_one(void)
>>   	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
>>   }
>>   
>> +DEFINE_PER_CPU(struct mm_struct *, active_lockless_pt_walk_mm);
>> +EXPORT_PER_CPU_SYMBOL_GPL(active_lockless_pt_walk_mm);
> 
> Why the heck is this exported? Both users are firmly core code.

OK. Will drop this export.

> 
>> +/**
>> + * tlb_remove_table_sync_mm - send IPIs to CPUs doing lockless page table
>> + * walk for @mm
>> + *
>> + * @mm: target mm; only CPUs walking this mm get an IPI.
>> + *
>> + * Like tlb_remove_table_sync_one() but only targets CPUs in
>> + * active_lockless_pt_walk_mm.
>> + */
>> +void tlb_remove_table_sync_mm(struct mm_struct *mm)
>> +{
>> +	cpumask_var_t target_cpus;
>> +	bool found_any = false;
>> +	int cpu;
>> +
>> +	if (WARN_ONCE(!mm, "NULL mm in %s\n", __func__)) {
>> +		tlb_remove_table_sync_one();
>> +		return;
>> +	}
>> +
>> +	/* If we can't, fall back to broadcast. */
>> +	if (!alloc_cpumask_var(&target_cpus, GFP_ATOMIC)) {
>> +		tlb_remove_table_sync_one();
>> +		return;
>> +	}
>> +
>> +	cpumask_clear(target_cpus);
>> +
>> +	/* Pairs with smp_mb() in pt_walk_lockless_start(). */
> 
> Pairs how? The start thing does something like:
> 
> 	[W] active_lockless_pt_walk_mm = mm
> 	MB
> 	[L] page-tables
> 
> So this is:
> 
> 	[L] page-tables
> 	RMB
> 	[L] active_lockless_pt_walk_mm
> 
> ?

On the walker side (pt_walk_lockless_start):

  [W]  active_lockless_pt_walk_mm = mm
  MB
  [L] page-tables (walker reads page tables)

So the walker publishes "I'm walking this mm" before reading page tables.

On the sync side we don't read page-tables. We do:

  RMB
  [L] active_lockless_pt_walk_mm (we read the per-CPU pointer below)

We need to observe the walker's store of active_lockless_pt_walk_mm before
we decide which CPUs to IPI.

So on the sync side we do smp_rmb(), then read active_lockless_pt_walk_mm.

That pairs with the full barrier in pt_walk_lockless_start().

> 
>> +	smp_rmb();
>> +
>> +	/* Find CPUs doing lockless page table walks for this mm */
>> +	for_each_online_cpu(cpu) {
>> +		if (per_cpu(active_lockless_pt_walk_mm, cpu) == mm) {
>> +			cpumask_set_cpu(cpu, target_cpus);
> 
> You really don't need this to be atomic.
> 
>> +			found_any = true;
>> +		}
>> +	}
>> +
>> +	/* Only send IPIs to CPUs actually doing lockless walks */
>> +	if (found_any)
>> +		smp_call_function_many(target_cpus, tlb_remove_table_smp_sync,
>> +				       NULL, 1);
> 
> Coding style wants { } here. Also, isn't this what we have
> smp_call_function_many_cond() for?

Right! That would be better, something like:

static bool tlb_remove_table_sync_mm_cond(int cpu, void *mm)
{
	return per_cpu(active_lockless_pt_walk_mm, cpu) == (struct mm_struct *)mm;
}

on_each_cpu_cond_mask(tlb_remove_table_sync_mm_cond,
			tlb_remove_table_smp_sync,
			(void *)mm, true, cpu_online_mask);

> 
>> +	free_cpumask_var(target_cpus);
>> +}
>> +
>>   static void tlb_remove_table_rcu(struct rcu_head *head)
>>   {
>>   	__tlb_remove_table_free(container_of(head, struct mmu_table_batch, rcu));
>> -- 
>> 2.49.0
>>

Thanks,
Lance

Re: [PATCH v4 1/3] mm: use targeted IPIs for TLB sync with lockless page table walkers

Posted by Dave Hansen 5 days, 4 hours ago

On 2/2/26 04:14, Lance Yang wrote:
>>> Note that the tracking adds ~3% latency to GUP-fast, as measured on a
>>> 64-core system.
>>
>> What architecture, and that is acceptable?
> 
> x86-64.
> 
> I ran ./gup_bench which spawns 60 threads, each doing 500k GUP-fast
> operations (pinning 8 pages per call) via the gup_test ioctl.
> 
> Results for pin pages:
> - Before: avg 1.489s (10 runs)
> - After:  avg 1.533s (10 runs)
> 
> Given we avoid broadcast IPIs on large systems, I think this is a
> reasonable trade-off 🙂

I thought the big databases were really sensitive to GUP-fast latency.
They like big systems, too. Won't they howl when this finally hits their
testing?

Also, two of the "write" side here are:

 * collapse_huge_page() (khugepaged)
 * tlb_remove_table() (in an "-ENOMEM" path)

Those are quite slow paths, right? Shouldn't the design here favor
keeping gup-fast as fast as possible as opposed to impacting those?

Re: [PATCH v4 1/3] mm: use targeted IPIs for TLB sync with lockless page table walkers

Posted by Peter Zijlstra 5 days, 8 hours ago

On Mon, Feb 02, 2026 at 08:14:32PM +0800, Lance Yang wrote:

> > > +	/* Pairs with smp_mb() in pt_walk_lockless_start(). */
> > 
> > Pairs how? The start thing does something like:
> > 
> > 	[W] active_lockless_pt_walk_mm = mm
> > 	MB
> > 	[L] page-tables
> > 
> > So this is:
> > 
> > 	[L] page-tables
> > 	RMB
> > 	[L] active_lockless_pt_walk_mm
> > 
> > ?
> 
> On the walker side (pt_walk_lockless_start):
> 
>  [W]  active_lockless_pt_walk_mm = mm
>  MB
>  [L] page-tables (walker reads page tables)
> 
> So the walker publishes "I'm walking this mm" before reading page tables.
> 
> On the sync side we don't read page-tables. We do:
> 
>  RMB
>  [L] active_lockless_pt_walk_mm (we read the per-CPU pointer below)
> 
> We need to observe the walker's store of active_lockless_pt_walk_mm before
> we decide which CPUs to IPI.
> 
> So on the sync side we do smp_rmb(), then read active_lockless_pt_walk_mm.
> 
> That pairs with the full barrier in pt_walk_lockless_start().

No it doesn't; this is not how memory barriers work.

Re: [PATCH v4 1/3] mm: use targeted IPIs for TLB sync with lockless page table walkers

Posted by Lance Yang 5 days, 7 hours ago


On 2026/2/2 20:51, Peter Zijlstra wrote:
> On Mon, Feb 02, 2026 at 08:14:32PM +0800, Lance Yang wrote:
> 
>>>> +	/* Pairs with smp_mb() in pt_walk_lockless_start(). */
>>>
>>> Pairs how? The start thing does something like:
>>>
>>> 	[W] active_lockless_pt_walk_mm = mm
>>> 	MB
>>> 	[L] page-tables
>>>
>>> So this is:
>>>
>>> 	[L] page-tables
>>> 	RMB
>>> 	[L] active_lockless_pt_walk_mm
>>>
>>> ?
>>
>> On the walker side (pt_walk_lockless_start):
>>
>>   [W]  active_lockless_pt_walk_mm = mm
>>   MB
>>   [L] page-tables (walker reads page tables)
>>
>> So the walker publishes "I'm walking this mm" before reading page tables.
>>
>> On the sync side we don't read page-tables. We do:
>>
>>   RMB
>>   [L] active_lockless_pt_walk_mm (we read the per-CPU pointer below)
>>
>> We need to observe the walker's store of active_lockless_pt_walk_mm before
>> we decide which CPUs to IPI.
>>
>> So on the sync side we do smp_rmb(), then read active_lockless_pt_walk_mm.
>>
>> That pairs with the full barrier in pt_walk_lockless_start().
> 
> No it doesn't; this is not how memory barriers work.

Hmm... we need MB rather than RMB on the sync side. Is that correct?

Walker:
[W]active_lockless_pt_walk_mm = mm -> MB -> [L]page-tables

Sync:
[W]page-tables -> MB -> [L]active_lockless_pt_walk_mm


Thanks,
Lance

Re: [PATCH v4 1/3] mm: use targeted IPIs for TLB sync with lockless page table walkers

Posted by Peter Zijlstra 5 days, 7 hours ago

On Mon, Feb 02, 2026 at 09:23:07PM +0800, Lance Yang wrote:

> Hmm... we need MB rather than RMB on the sync side. Is that correct?
> 
> Walker:
> [W]active_lockless_pt_walk_mm = mm -> MB -> [L]page-tables
> 
> Sync:
> [W]page-tables -> MB -> [L]active_lockless_pt_walk_mm
> 

This can work -- but only if the walker and sync touch the same
page-table address.

Now, typically I would imagine they both share the p4d/pud address at
the very least, right?

Re: [PATCH v4 1/3] mm: use targeted IPIs for TLB sync with lockless page table walkers

Posted by Lance Yang 5 days, 6 hours ago


On 2026/2/2 21:42, Peter Zijlstra wrote:
> On Mon, Feb 02, 2026 at 09:23:07PM +0800, Lance Yang wrote:
> 
>> Hmm... we need MB rather than RMB on the sync side. Is that correct?
>>
>> Walker:
>> [W]active_lockless_pt_walk_mm = mm -> MB -> [L]page-tables
>>
>> Sync:
>> [W]page-tables -> MB -> [L]active_lockless_pt_walk_mm
>>
> 
> This can work -- but only if the walker and sync touch the same
> page-table address.
> 
> Now, typically I would imagine they both share the p4d/pud address at
> the very least, right?

Thanks. I think I see the confusion ...

To be clear, the goal is not to make the walker see page-table writes 
through the
MB pairing, but to wait for any concurrent lockless page table walkers 
to finish.

The flow is:

1) Page tables are modified
2) TLB flush is done
3) Read active_lockless_pt_walk_mm (with MB to order page-table writes 
before
    this read) to find which CPUs are locklessly walking this mm
4) IPI those CPUs
5) The IPI forces them to sync, so after the IPI returns, any in-flight 
lockless
    page table walk has finished (or will restart and see the new page 
tables)

The synchronization relies on the IPI to ensure walkers stop before 
continuing.

I would assume the TLB flush (step 2) should imply some barrier.

Does that clarify?