[RFC PATCH 7/9] x86/mm: Introduce Remote Action Request

Posted by Rik van Riel 7 months, 2 weeks ago
From: Yu-cheng Yu <yu-cheng.yu@intel.com>

Remote Action Request (RAR) is a TLB flushing broadcast facility.
To start a TLB flush, the initiator CPU creates a RAR payload and
sends a command to the APIC.  The receiving CPUs automatically flush
TLBs as specified in the payload without the kernel's involvement.

[ riel:
  - add pcid parameter to smp_call_rar_many so other mms can be flushed
  - ensure get_payload only allocates valid indices
  - make sure rar_cpu_init does not write reserved bits
  - fix overflow in range vs full flush decision ]

Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
Signed-off-by: Rik van Riel <riel@surriel.com>
---
 arch/x86/include/asm/rar.h   |  69 +++++++++++
 arch/x86/kernel/cpu/common.c |   4 +
 arch/x86/mm/Makefile         |   1 +
 arch/x86/mm/rar.c            | 226 +++++++++++++++++++++++++++++++++++
 4 files changed, 300 insertions(+)
 create mode 100644 arch/x86/include/asm/rar.h
 create mode 100644 arch/x86/mm/rar.c

diff --git a/arch/x86/include/asm/rar.h b/arch/x86/include/asm/rar.h
new file mode 100644
index 000000000000..b5ba856fcaa8
--- /dev/null
+++ b/arch/x86/include/asm/rar.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_RAR_H
+#define _ASM_X86_RAR_H
+
+/*
+ * RAR payload types
+ */
+#define RAR_TYPE_INVPG		0
+#define RAR_TYPE_INVPG_NO_CR3	1
+#define RAR_TYPE_INVPCID	2
+#define RAR_TYPE_INVEPT		3
+#define RAR_TYPE_INVVPID	4
+#define RAR_TYPE_WRMSR		5
+
+/*
+ * Subtypes for RAR_TYPE_INVLPG
+ */
+#define RAR_INVPG_ADDR			0 /* address specific */
+#define RAR_INVPG_ALL			2 /* all, include global */
+#define RAR_INVPG_ALL_NO_GLOBAL		3 /* all, exclude global */
+
+/*
+ * Subtypes for RAR_TYPE_INVPCID
+ */
+#define RAR_INVPCID_ADDR		0 /* address specific */
+#define RAR_INVPCID_PCID		1 /* all of PCID */
+#define RAR_INVPCID_ALL			2 /* all, include global */
+#define RAR_INVPCID_ALL_NO_GLOBAL	3 /* all, exclude global */
+
+/*
+ * Page size for RAR_TYPE_INVLPG
+ */
+#define RAR_INVLPG_PAGE_SIZE_4K		0
+#define RAR_INVLPG_PAGE_SIZE_2M		1
+#define RAR_INVLPG_PAGE_SIZE_1G		2
+
+/*
+ * Max number of pages per payload
+ */
+#define RAR_INVLPG_MAX_PAGES 63
+
+typedef struct {
+	uint64_t for_sw : 8;
+	uint64_t type : 8;
+	uint64_t must_be_zero_1 : 16;
+	uint64_t subtype : 3;
+	uint64_t page_size: 2;
+	uint64_t num_pages : 6;
+	uint64_t must_be_zero_2 : 21;
+
+	uint64_t must_be_zero_3;
+
+	/*
+	 * Starting address
+	 */
+	uint64_t initiator_cr3;
+	uint64_t linear_address;
+
+	/*
+	 * Padding
+	 */
+	uint64_t padding[4];
+} rar_payload_t;
+
+void rar_cpu_init(void);
+void smp_call_rar_many(const struct cpumask *mask, u16 pcid,
+		       unsigned long start, unsigned long end);
+
+#endif /* _ASM_X86_RAR_H */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 5666620e7153..75b43db0b129 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -71,6 +71,7 @@
 #include <asm/tdx.h>
 #include <asm/posted_intr.h>
 #include <asm/runtime-const.h>
+#include <asm/rar.h>
 
 #include "cpu.h"
 
@@ -2395,6 +2396,9 @@ void cpu_init(void)
 	if (is_uv_system())
 		uv_cpu_init();
 
+	if (cpu_feature_enabled(X86_FEATURE_RAR))
+		rar_cpu_init();
+
 	load_fixmap_gdt(cpu);
 }
 
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index cebe5812d78d..d49d16412569 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -54,6 +54,7 @@ obj-$(CONFIG_ACPI_NUMA)		+= srat.o
 obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS)	+= pkeys.o
 obj-$(CONFIG_RANDOMIZE_MEMORY)			+= kaslr.o
 obj-$(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)	+= pti.o
+obj-$(CONFIG_BROADCAST_TLB_FLUSH)		+= rar.o
 
 obj-$(CONFIG_X86_MEM_ENCRYPT)	+= mem_encrypt.o
 obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt_amd.o
diff --git a/arch/x86/mm/rar.c b/arch/x86/mm/rar.c
new file mode 100644
index 000000000000..77a334f1e212
--- /dev/null
+++ b/arch/x86/mm/rar.c
@@ -0,0 +1,226 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * RAR Tlb shootdown
+ */
+
+#include <linux/kgdb.h>
+#include <linux/sched.h>
+#include <linux/bug.h>
+#include <asm/current.h>
+#include <asm/io.h>
+#include <asm/sync_bitops.h>
+#include <asm/rar.h>
+#include <asm/tlbflush.h>
+
+static DEFINE_PER_CPU(int, rar_lock);
+static DEFINE_PER_CPU(struct cpumask, rar_cpu_mask);
+
+#define RAR_ACTION_OK		0x00
+#define RAR_ACTION_START	0x01
+#define RAR_ACTION_ACKED	0x02
+#define RAR_ACTION_FAIL		0x80
+
+#define RAR_MAX_PAYLOADS 32UL
+
+static unsigned long rar_in_use = ~(RAR_MAX_PAYLOADS - 1);
+static rar_payload_t rar_payload[RAR_MAX_PAYLOADS] __page_aligned_bss;
+static DEFINE_PER_CPU_ALIGNED(u64[(RAR_MAX_PAYLOADS + 8) / 8], rar_action);
+
+static __always_inline void lock(int *lock)
+{
+	smp_cond_load_acquire(lock, !(VAL & 1));
+	*lock |= 1;
+
+	/*
+	 * prevent CPU from reordering the above assignment
+	 * to ->flags with any subsequent assignments to other
+	 * fields of the specified call_single_data structure:
+	 */
+	smp_wmb();
+}
+
+static __always_inline void unlock(int *lock)
+{
+	WARN_ON(!(*lock & 1));
+
+	/*
+	 * ensure we're all done before releasing data:
+	 */
+	smp_store_release(lock, 0);
+}
+
+static unsigned long get_payload(void)
+{
+	while (1) {
+		unsigned long bit;
+
+		/*
+		 * Find a free bit and confirm it with
+		 * test_and_set_bit() below.
+		 */
+		bit = ffz(READ_ONCE(rar_in_use));
+
+		if (bit >= RAR_MAX_PAYLOADS)
+			continue;
+
+		if (!test_and_set_bit((long)bit, &rar_in_use))
+			return bit;
+	}
+}
+
+static void free_payload(unsigned long idx)
+{
+	clear_bit(idx, &rar_in_use);
+}
+
+static void set_payload(unsigned long idx, u16 pcid, unsigned long start,
+			uint32_t pages)
+{
+	rar_payload_t *p = &rar_payload[idx];
+
+	p->must_be_zero_1	= 0;
+	p->must_be_zero_2	= 0;
+	p->must_be_zero_3	= 0;
+	p->page_size		= RAR_INVLPG_PAGE_SIZE_4K;
+	p->type			= RAR_TYPE_INVPCID;
+	p->num_pages		= pages;
+	p->initiator_cr3	= pcid;
+	p->linear_address	= start;
+
+	if (pcid) {
+		/* RAR invalidation of the mapping of a specific process. */
+		if (pages >= RAR_INVLPG_MAX_PAGES)
+			p->subtype = RAR_INVPCID_PCID;
+		else
+			p->subtype = RAR_INVPCID_ADDR;
+	} else {
+		/*
+		 * Unfortunately RAR_INVPCID_ADDR excludes global translations.
+		 * Always do a full flush for kernel invalidations.
+		 */
+		p->subtype = RAR_INVPCID_ALL;
+	}
+
+	smp_wmb();
+}
+
+static void set_action_entry(unsigned long idx, int target_cpu)
+{
+	u8 *bitmap = (u8 *)per_cpu(rar_action, target_cpu);
+
+	WRITE_ONCE(bitmap[idx], RAR_ACTION_START);
+}
+
+static void wait_for_done(unsigned long idx, int target_cpu)
+{
+	u8 status;
+	u8 *bitmap = (u8 *)per_cpu(rar_action, target_cpu);
+
+	status = READ_ONCE(bitmap[idx]);
+
+	while ((status != RAR_ACTION_OK) && (status != RAR_ACTION_FAIL)) {
+		cpu_relax();
+		status = READ_ONCE(bitmap[idx]);
+	}
+
+	WARN_ON_ONCE(bitmap[idx] == RAR_ACTION_FAIL);
+}
+
+void rar_cpu_init(void)
+{
+	u64 r;
+	u8 *bitmap;
+	int this_cpu = smp_processor_id();
+
+	per_cpu(rar_lock, this_cpu) = 0;
+	cpumask_clear(&per_cpu(rar_cpu_mask, this_cpu));
+
+	rdmsrl(MSR_IA32_RAR_INFO, r);
+	pr_info_once("RAR: support %lld payloads\n", r >> 32);
+
+	bitmap = (u8 *)per_cpu(rar_action, this_cpu);
+	memset(bitmap, 0, RAR_MAX_PAYLOADS);
+	wrmsrl(MSR_IA32_RAR_ACT_VEC, (u64)virt_to_phys(bitmap));
+	wrmsrl(MSR_IA32_RAR_PAYLOAD_BASE, (u64)virt_to_phys(rar_payload));
+
+	r = RAR_CTRL_ENABLE | RAR_CTRL_IGNORE_IF;
+	// reserved bits!!! r |= (RAR_VECTOR & 0xff);
+	wrmsrl(MSR_IA32_RAR_CTRL, r);
+}
+
+/*
+ * This is a modified version of smp_call_function_many() of kernel/smp.c,
+ * without a function pointer, because the RAR handler is the ucode.
+ */
+void smp_call_rar_many(const struct cpumask *mask, u16 pcid,
+		       unsigned long start, unsigned long end)
+{
+	unsigned long pages = (end - start + PAGE_SIZE) / PAGE_SIZE;
+	int cpu, next_cpu, this_cpu = smp_processor_id();
+	cpumask_t *dest_mask;
+	unsigned long idx;
+
+	if (pages > RAR_INVLPG_MAX_PAGES || end == TLB_FLUSH_ALL)
+		pages = RAR_INVLPG_MAX_PAGES;
+
+	/*
+	 * Can deadlock when called with interrupts disabled.
+	 * We allow cpu's that are not yet online though, as no one else can
+	 * send smp call function interrupt to this cpu and as such deadlocks
+	 * can't happen.
+	 */
+	WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
+		     && !oops_in_progress && !early_boot_irqs_disabled);
+
+	/* Try to fastpath.  So, what's a CPU they want?  Ignoring this one. */
+	cpu = cpumask_first_and(mask, cpu_online_mask);
+	if (cpu == this_cpu)
+		cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
+
+	/* No online cpus?  We're done. */
+	if (cpu >= nr_cpu_ids)
+		return;
+
+	/* Do we have another CPU which isn't us? */
+	next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
+	if (next_cpu == this_cpu)
+		next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask);
+
+	/* Fastpath: do that cpu by itself. */
+	if (next_cpu >= nr_cpu_ids) {
+		lock(this_cpu_ptr(&rar_lock));
+		idx = get_payload();
+		set_payload(idx, pcid, start, pages);
+		set_action_entry(idx, cpu);
+		arch_send_rar_single_ipi(cpu);
+		wait_for_done(idx, cpu);
+		free_payload(idx);
+		unlock(this_cpu_ptr(&rar_lock));
+		return;
+	}
+
+	dest_mask = this_cpu_ptr(&rar_cpu_mask);
+	cpumask_and(dest_mask, mask, cpu_online_mask);
+	cpumask_clear_cpu(this_cpu, dest_mask);
+
+	/* Some callers race with other cpus changing the passed mask */
+	if (unlikely(!cpumask_weight(dest_mask)))
+		return;
+
+	lock(this_cpu_ptr(&rar_lock));
+	idx = get_payload();
+	set_payload(idx, pcid, start, pages);
+
+	for_each_cpu(cpu, dest_mask)
+		set_action_entry(idx, cpu);
+
+	/* Send a message to all CPUs in the map */
+	arch_send_rar_ipi_mask(dest_mask);
+
+	for_each_cpu(cpu, dest_mask)
+		wait_for_done(idx, cpu);
+
+	free_payload(idx);
+	unlock(this_cpu_ptr(&rar_lock));
+}
+EXPORT_SYMBOL(smp_call_rar_many);
-- 
2.49.0
Re: [RFC PATCH 7/9] x86/mm: Introduce Remote Action Request
Posted by Ingo Molnar 7 months, 1 week ago
* Rik van Riel <riel@surriel.com> wrote:

> +++ b/arch/x86/include/asm/rar.h
> @@ -0,0 +1,69 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_X86_RAR_H
> +#define _ASM_X86_RAR_H
> +
> +/*
> + * RAR payload types
> + */
> +#define RAR_TYPE_INVPG		0
> +#define RAR_TYPE_INVPG_NO_CR3	1
> +#define RAR_TYPE_INVPCID	2
> +#define RAR_TYPE_INVEPT		3
> +#define RAR_TYPE_INVVPID	4
> +#define RAR_TYPE_WRMSR		5
> +
> +/*
> + * Subtypes for RAR_TYPE_INVLPG
> + */
> +#define RAR_INVPG_ADDR			0 /* address specific */
> +#define RAR_INVPG_ALL			2 /* all, include global */
> +#define RAR_INVPG_ALL_NO_GLOBAL		3 /* all, exclude global */
> +
> +/*
> + * Subtypes for RAR_TYPE_INVPCID
> + */
> +#define RAR_INVPCID_ADDR		0 /* address specific */
> +#define RAR_INVPCID_PCID		1 /* all of PCID */
> +#define RAR_INVPCID_ALL			2 /* all, include global */
> +#define RAR_INVPCID_ALL_NO_GLOBAL	3 /* all, exclude global */
> +
> +/*
> + * Page size for RAR_TYPE_INVLPG
> + */
> +#define RAR_INVLPG_PAGE_SIZE_4K		0
> +#define RAR_INVLPG_PAGE_SIZE_2M		1
> +#define RAR_INVLPG_PAGE_SIZE_1G		2
> +
> +/*
> + * Max number of pages per payload
> + */
> +#define RAR_INVLPG_MAX_PAGES 63
> +
> +typedef struct {
> +	uint64_t for_sw : 8;
> +	uint64_t type : 8;
> +	uint64_t must_be_zero_1 : 16;
> +	uint64_t subtype : 3;
> +	uint64_t page_size: 2;
> +	uint64_t num_pages : 6;
> +	uint64_t must_be_zero_2 : 21;
> +
> +	uint64_t must_be_zero_3;
> +
> +	/*
> +	 * Starting address
> +	 */
> +	uint64_t initiator_cr3;
> +	uint64_t linear_address;
> +
> +	/*
> +	 * Padding
> +	 */
> +	uint64_t padding[4];
> +} rar_payload_t;

- Please don't use _t typedefs for complex types. 'struct rar_payload' 
  should be good enough.

- Please use u64/u32 for HW ABI definitions.

- Please align bitfield definitions vertically, for better readability:

	u64 for_sw		:  8;
	u64 type		:  8;
	u64 must_be_zero_1	: 16;
	u64 subtype		:  3;
	u64 page_size		:  2;
	u64 num_pages		:  6;
	u64 must_be_zero_2	: 21;
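
Taken together, an untested sketch of what I mean (same fields as the
patch has, only the type names and the alignment change):

	struct rar_payload {
		u64 for_sw		:  8;
		u64 type		:  8;
		u64 must_be_zero_1	: 16;
		u64 subtype		:  3;
		u64 page_size		:  2;
		u64 num_pages		:  6;
		u64 must_be_zero_2	: 21;

		u64 must_be_zero_3;

		u64 initiator_cr3;
		u64 linear_address;

		/* pad the payload out to 64 bytes */
		u64 padding[4];
	};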

> +++ b/arch/x86/mm/rar.c
> @@ -0,0 +1,226 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * RAR Tlb shootdown

s/Tlb
 /TLB

> +#include <linux/kgdb.h>

Is this really needed? There's nothing KGDB specific in here AFAICS.

> +static DEFINE_PER_CPU_ALIGNED(u64[(RAR_MAX_PAYLOADS + 8) / 8], rar_action);

> +static void set_action_entry(unsigned long idx, int target_cpu)
> +{
> +	u8 *bitmap = (u8 *)per_cpu(rar_action, target_cpu);

> +	u8 *bitmap = (u8 *)per_cpu(rar_action, target_cpu);

> +	bitmap = (u8 *)per_cpu(rar_action, this_cpu);


So AFAICS all these ugly, forced type casts to (u8 *) are needed only 
because rar_action has the wrong type: if it were a u8[], then these 
lines could be:

	static DEFINE_PER_CPU_ALIGNED(u8[RAR_MAX_PAYLOADS], rar_action);

	...

	u8 *bitmap = per_cpu(rar_action, target_cpu);

right?

Thanks,

	Ingo
Re: [RFC PATCH 7/9] x86/mm: Introduce Remote Action Request
Posted by Nadav Amit 7 months, 2 weeks ago

> On 6 May 2025, at 3:37, Rik van Riel <riel@surriel.com> wrote:
> 
> +void smp_call_rar_many(const struct cpumask *mask, u16 pcid,
> +		       unsigned long start, unsigned long end)
> +{
> +	unsigned long pages = (end - start + PAGE_SIZE) / PAGE_SIZE;
> +	int cpu, next_cpu, this_cpu = smp_processor_id();
> +	cpumask_t *dest_mask;
> +	unsigned long idx;
> +
> +	if (pages > RAR_INVLPG_MAX_PAGES || end == TLB_FLUSH_ALL)
> +		pages = RAR_INVLPG_MAX_PAGES;
> +
> +	/*
> +	 * Can deadlock when called with interrupts disabled.
> +	 * We allow cpu's that are not yet online though, as no one else can
> +	 * send smp call function interrupt to this cpu and as such deadlocks
> +	 * can't happen.
> +	 */
> +	WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
> +		     && !oops_in_progress && !early_boot_irqs_disabled);

To ease it for the reader, consider using the updated version from smp.c
(or - even better - refactor into common inline function):

	if (cpu_online(this_cpu) && !oops_in_progress &&
	    !early_boot_irqs_disabled)
		lockdep_assert_irqs_enabled();
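
If it gets refactored, a shared helper could look roughly like this (the
name and location are made up; it just has to be visible to both smp.c
and the RAR code):

	/* hypothetical common helper for smp.c and rar.c */
	static inline void smp_call_check_deadlock(int this_cpu)
	{
		if (cpu_online(this_cpu) && !oops_in_progress &&
		    !early_boot_irqs_disabled)
			lockdep_assert_irqs_enabled();
	}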


> +
> +	/* Try to fastpath.  So, what's a CPU they want?  Ignoring this one. */
> +	cpu = cpumask_first_and(mask, cpu_online_mask);
> +	if (cpu == this_cpu)
> +		cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
> +
> +	/* No online cpus?  We're done. */
> +	if (cpu >= nr_cpu_ids)
> +		return;
> +
> +	/* Do we have another CPU which isn't us? */
> +	next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
> +	if (next_cpu == this_cpu)
> +		next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask);
> +
> +	/* Fastpath: do that cpu by itself. */

If you follow my comment (suggestion) about the concurrent flushes, then
this part should be restructured in the same way as is done in the updated
smp_call_function_many_cond().

IOW, the main difference between this path and the “slow path” is 
arch_send_rar_ipi_mask() vs arch_send_rar_single_ipi() (and maybe
“and” with cpu_online_mask).
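
Roughly (untested, and glossing over how the local CPU gets handled):

	idx = get_payload();
	set_payload(idx, pcid, start, pages);

	for_each_cpu(cpu, dest_mask)
		set_action_entry(idx, cpu);

	/* only the IPI primitive differs between one and many targets */
	if (cpumask_weight(dest_mask) == 1)
		arch_send_rar_single_ipi(cpumask_first(dest_mask));
	else
		arch_send_rar_ipi_mask(dest_mask);

	for_each_cpu(cpu, dest_mask)
		wait_for_done(idx, cpu);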


> +	if (next_cpu >= nr_cpu_ids) {
> +		lock(this_cpu_ptr(&rar_lock));
> +		idx = get_payload();
> +		set_payload(idx, pcid, start, pages);
> +		set_action_entry(idx, cpu);
> +		arch_send_rar_single_ipi(cpu);
> +		wait_for_done(idx, cpu);
> +		free_payload(idx);
> +		unlock(this_cpu_ptr(&rar_lock));
> +		return;
> +	}
> +
> +	dest_mask = this_cpu_ptr(&rar_cpu_mask);
> +	cpumask_and(dest_mask, mask, cpu_online_mask);
> +	cpumask_clear_cpu(this_cpu, dest_mask);
> +
> +	/* Some callers race with other cpus changing the passed mask */
> +	if (unlikely(!cpumask_weight(dest_mask)))
> +		return;
> +
> +	lock(this_cpu_ptr(&rar_lock));
> +	idx = get_payload();
> +	set_payload(idx, pcid, start, pages);
> +
> +	for_each_cpu(cpu, dest_mask)
> +		set_action_entry(idx, cpu);
> +
> +	/* Send a message to all CPUs in the map */
> +	arch_send_rar_ipi_mask(dest_mask);


Since 2019 we have moved to “multi” TLB flushes instead of “many”.

This means that we try to take advantage of the time between sending the IPI
and the indication that it was completed to do the local TLB flush. For both
consistency and performance, I recommend you follow this approach and
do the local TLB flush (if needed) here, instead of doing it in the
caller.
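
That is, send first, flush the local TLB while the other CPUs process
their RAR requests, and only then wait (sketch; the local flush call is
a placeholder):

	arch_send_rar_ipi_mask(dest_mask);

	if (flush_local)
		local_tlb_flush_range(pcid, start, end);	/* placeholder */

	for_each_cpu(cpu, dest_mask)
		wait_for_done(idx, cpu);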

> +
> +	for_each_cpu(cpu, dest_mask)
> +		wait_for_done(idx, cpu);
> +
> +	free_payload(idx);
> +	unlock(this_cpu_ptr(&rar_lock));

We don’t do lock/unlock in kernel/smp.c, so I would expect at least a
comment as to why it is required.

> +}
> +EXPORT_SYMBOL(smp_call_rar_many);
Re: [RFC PATCH 7/9] x86/mm: Introduce Remote Action Request
Posted by Rik van Riel 7 months, 1 week ago
On Tue, 2025-05-06 at 09:59 +0300, Nadav Amit wrote:
> 
> 
> > On 6 May 2025, at 3:37, Rik van Riel <riel@surriel.com> wrote:
> > 
> > +void smp_call_rar_many(const struct cpumask *mask, u16 pcid,
> > +		       unsigned long start, unsigned long end)
> > +{
> > +	unsigned long pages = (end - start + PAGE_SIZE) / PAGE_SIZE;
> > +	int cpu, next_cpu, this_cpu = smp_processor_id();
> > +	cpumask_t *dest_mask;
> > +	unsigned long idx;
> > +
> > +	if (pages > RAR_INVLPG_MAX_PAGES || end == TLB_FLUSH_ALL)
> > +		pages = RAR_INVLPG_MAX_PAGES;
> > +
> > +	/*
> > +	 * Can deadlock when called with interrupts disabled.
> > +	 * We allow cpu's that are not yet online though, as no one else can
> > +	 * send smp call function interrupt to this cpu and as such deadlocks
> > +	 * can't happen.
> > +	 */
> > +	WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
> > +		     && !oops_in_progress && !early_boot_irqs_disabled);
> 
> To ease it for the reader, consider using the updated version from smp.c
> (or - even better - refactor into common inline function):
> 
> 	if (cpu_online(this_cpu) && !oops_in_progress &&
> 	    !early_boot_irqs_disabled)
> 		lockdep_assert_irqs_enabled();

Nice cleanup. I will change this. Thank you.

> 
> 
> > +
> > +	/* Try to fastpath.  So, what's a CPU they want?  Ignoring this one. */
> > +	cpu = cpumask_first_and(mask, cpu_online_mask);
> > +	if (cpu == this_cpu)
> > +		cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
> > +
> > +	/* No online cpus?  We're done. */
> > +	if (cpu >= nr_cpu_ids)
> > +		return;
> > +
> > +	/* Do we have another CPU which isn't us? */
> > +	next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
> > +	if (next_cpu == this_cpu)
> > +		next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask);
> > +
> > +	/* Fastpath: do that cpu by itself. */
> 
> If you follow my comment (suggestion) about the concurrent flushes, then
> this part should be restructured in the same way as is done in the updated
> smp_call_function_many_cond().
> 
> IOW, the main difference between this path and the “slow path” is 
> arch_send_rar_ipi_mask() vs arch_send_rar_single_ipi() (and maybe
> “and” with cpu_online_mask).

It gets better. Page 8 of the RAR whitepaper tells
us that we can simply use RAR to have a CPU send
itself TLB flush instructions, and the microcode
will do the flush at the same time the other CPUs
handle theirs.

"At this point, the ILP may invalidate its own TLB by 
 signaling RAR to itself in order to invoke the RAR handler
 locally as well"

I tried this, but things blew up very early in
boot, presumably due to the CPU trying to send
itself a RAR before it was fully configured to
handle them.

The code may need a better decision point than
cpu_feature_enabled(X86_FEATURE_RAR) to decide
whether or not to use RAR.

Probably something that indicates RAR is actually
ready to use on all CPUs.
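
Maybe a separate "all CPUs are set up" flag, checked next to the CPU
feature bit, would do it (completely untested, names invented):

	/* set from a late initcall, after every CPU has run rar_cpu_init() */
	static bool rar_fully_initialized __read_mostly;

	static inline bool rar_usable(void)
	{
		return cpu_feature_enabled(X86_FEATURE_RAR) &&
		       READ_ONCE(rar_fully_initialized);
	}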

> 
> Since 2019 we have moved to “multi” TLB flushes instead of “many”.
> 
> This means that we try to take advantage of the time between sending the
> IPI and the indication that it was completed to do the local TLB flush.

I think we have 3 cases here:

1) Only the local TLB needs to be flushed.
   In this case we can INVPCID locally, and skip any
   potential contention on the RAR payload table.

2) Only one remote CPU needs to be flushed (no local).
   This can use the arch_rar_send_single_ipi() thing.

3) Multiple CPUs need to be flushed. This could include
   the local CPU, or be only multiple remote CPUs.
   For this case we could just use arch_send_rar_ipi_mask(),
   including sending a RAR request to the local CPU, which
   should handle it concurrently with the other CPUs.

Does that seem like a reasonable way to handle things?
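
In pseudo-code, with invented names just to illustrate the split:

	if (local_only) {
		/* 1) plain INVLPG/INVPCID on this CPU, no RAR payload needed */
		flush_tlb_local_range(pcid, start, end);
	} else if (nr_remote == 1 && !flush_local) {
		/* 2) exactly one remote CPU */
		arch_send_rar_single_ipi(cpu);
	} else {
		/* 3) several CPUs, possibly including this one */
		arch_send_rar_ipi_mask(dest_mask);
	}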

> > +
> > +	for_each_cpu(cpu, dest_mask)
> > +		wait_for_done(idx, cpu);
> > +
> > +	free_payload(idx);
> > +	unlock(this_cpu_ptr(&rar_lock));
> 
> We don’t do lock/unlock in kernel/smp.c, so I would expect at least a
> comment as to why it is required.
> 
That is a very good question!

It is locking a per-cpu lock, which no other code
path takes.

It looks like it could protect against preemption,
on a kernel with full preemption enabled, but that
should not be needed since the code in arch/x86/mm/tlb.c
disables preemption around every call to the RAR code.

I suspect that lock is no longer needed, but maybe
somebody at Intel has a reason why we still do?

-- 
All Rights Reversed.
Re: [RFC PATCH 7/9] x86/mm: Introduce Remote Action Request
Posted by Nadav Amit 7 months, 1 week ago

> On 6 May 2025, at 18:16, Rik van Riel <riel@surriel.com> wrote:
> 
> It gets better. Page 8 of the RAR whitepaper tells
> us that we can simply use RAR to have a CPU send
> itself TLB flush instructions, and the microcode
> will do the flush at the same time the other CPUs
> handle theirs.
> 
> "At this point, the ILP may invalidate its own TLB by 
> signaling RAR to itself in order to invoke the RAR handler
> locally as well"
> 
> I tried this, but things blew up very early in
> boot, presumably due to the CPU trying to send
> itself a RAR before it was fully configured to
> handle them.
> 
> The code may need a better decision point than
> cpu_feature_enabled(X86_FEATURE_RAR) to decide
> whether or not to use RAR.
> 
> Probably something that indicates RAR is actually
> ready to use on all CPUs.
> 

Once you get something working (perhaps with a branch for
now) you can take the static-key/static-call path, presumably.
I would first try to get something working properly.

BTW: I suspect that the RAR approach might not handle TLB flush
storms as well as the IPI approach, in which, once the handler
sees such a storm, it does a full TLB flush and skips flushes of
“older” generations. You may want to benchmark this scenario
(IIRC one of the will-it-scale tests does something similar).

> I think we have 3 cases here:
> 
> 1) Only the local TLB needs to be flushed.
>   In this case we can INVPCID locally, and skip any
>   potential contention on the RAR payload table.

More like INVLPG (and INVPCID for the user PTI page tables). AFAIK, Andy said
INVLPG performs better than INVPCID for a single entry. But yes,
this is a simple and hot scenario that should have a separate
code-path.

> 
> 2) Only one remote CPU needs to be flushed (no local).
>   This can use the arch_rar_send_single_ipi() thing.
> 
> 3) Multiple CPUs need to be flushed. This could include
>   the local CPU, or be only multiple remote CPUs.
>   For this case we could just use arch_send_rar_ipi_mask(),
>   including sending a RAR request to the local CPU, which
>   should handle it concurrently with the other CPUs.
> 
> Does that seem like a reasonable way to handle things?

It is. It is just that, code-wise, I think the 2nd and 3rd cases
are similar, and it can be better to distinguish the differences
between them without creating two completely separate code-paths.
This makes maintenance and reasoning simpler, I think.

Consider having a look at smp_call_function_many_cond(). I think
it handles the 2nd and 3rd cases nicely in the manner I just
described. Admittedly, I am a bit biased…
Re: [RFC PATCH 7/9] x86/mm: Introduce Remote Action Request
Posted by Rik van Riel 7 months, 1 week ago
On Tue, 2025-05-06 at 18:50 +0300, Nadav Amit wrote:
> 
> 
> > On 6 May 2025, at 18:16, Rik van Riel <riel@surriel.com> wrote:
> > 
> > It gets better. Page 8 of the RAR whitepaper tells
> > us that we can simply use RAR to have a CPU send
> > itself TLB flush instructions, and the microcode
> > will do the flush at the same time the other CPUs
> > handle theirs.
> > 
> > "At this point, the ILP may invalidate its own TLB by 
> > signaling RAR to itself in order to invoke the RAR handler
> > locally as well"
> > 
> > I tried this, but things blew up very early in
> > boot, presumably due to the CPU trying to send
> > itself a RAR before it was fully configured to
> > handle them.
> > 
> > The code may need a better decision point than
> > cpu_feature_enabled(X86_FEATURE_RAR) to decide
> > whether or not to use RAR.
> > 
> > Probably something that indicates RAR is actually
> > ready to use on all CPUs.
> > 
> 
> Once you get something working (perhaps with a branch for
> now) you can take the static-key/static-call path, presumably.
> I would first try to get something working properly.
> 
The static-key code is implemented with alternatives,
which call flush_tlb_mm_range.

I've not spent the time digging into whether that
creates any chicken-egg scenarios yet :)

> > I think we have 3 cases here:
> > 
> > 1) Only the local TLB needs to be flushed.
> >   In this case we can INVPCID locally, and skip any
> >   potential contention on the RAR payload table.
> 
> More like INVLPG (and INVPCID to the user PTI). AFAIK, Andy said
> INVLPG performs better than INVPCID for a single entry. But yes,
> this is a simple and hot scenario that should have a separate
> code-path.

I think this can probably be handled in flush_tlb_mm_range(),
so the RAR code is only called for cases (2) and (3) to
begin with.
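
Something like this shape inside flush_tlb_mm_range(), hand-waving over
the details (helper names invented):

	/* only this CPU has the mm loaded: flush locally, no RAR or IPI */
	if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
		flush_tlb_local_range(info);
	else
		broadcast_tlb_flush(info);	/* RAR when available, else IPI */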

> 
> > 
> > 2) Only one remote CPU needs to be flushed (no local).
> >   This can use the arch_rar_send_single_ipi() thing.
> > 
> > 3) Multiple CPUs need to be flushed. This could include
> >   the local CPU, or be only multiple remote CPUs.
> >   For this case we could just use arch_send_rar_ipi_mask(),
> >   including sending a RAR request to the local CPU, which
> >   should handle it concurrently with the other CPUs.
> > 
> > Does that seem like a reasonable way to handle things?
> 
> It it. It is just that code-wise, I think the 2nd and 3rd cases
> are similar, and it can be better to distinguish the differences
> between them without creating two completely separate code-paths.
> This makes maintenance and reasoning more simple, I think.
> 
> Consider having a look at smp_call_function_many_cond(). I think
> it handles the 2nd and 3rd cases nicely in the manner I just
> described. Admittedly, I am a bit biased…

I need to use smp_call_function_many_cond() anyway,
to prevent sending RARs to CPUs that are in lazy
TLB mode (and possibly in a power saving idle state).

IPI TLB flushing and RAR can probably both use the
same should_flush_tlb() helper function.
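
For RAR that presumably means building dest_mask with the same condition,
roughly (exact arguments aside):

	for_each_cpu(cpu, mm_cpumask(mm))
		if (should_flush_tlb(cpu, info))
			cpumask_set_cpu(cpu, dest_mask);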

-- 
All Rights Reversed.
Re: [RFC PATCH 7/9] x86/mm: Introduce Remote Action Request
Posted by Dave Hansen 7 months, 1 week ago
On 5/6/25 08:16, Rik van Riel wrote:
> I suspect that lock is no longer needed, but maybe
> somebody at Intel has a reason why we still do?

I just took a quick look at the locking. It doesn't make any sense to me
either.

I suspect it's just plain not needed.