From: Yu-cheng Yu <yu-cheng.yu@intel.com>
Remote Action Request (RAR) is a TLB flushing broadcast facility.
To start a TLB flush, the initiator CPU creates a RAR payload and
sends a command to the APIC. The receiving CPUs automatically flush
TLBs as specified in the payload without the kernel's involvement.
[ riel: add pcid parameter to smp_call_rar_many so other mms can be flushed ]
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
Signed-off-by: Rik van Riel <riel@surriel.com>
---
arch/x86/include/asm/rar.h | 69 +++++++++++++
arch/x86/kernel/cpu/common.c | 4 +
arch/x86/mm/Makefile | 1 +
arch/x86/mm/rar.c | 195 +++++++++++++++++++++++++++++++++++
4 files changed, 269 insertions(+)
create mode 100644 arch/x86/include/asm/rar.h
create mode 100644 arch/x86/mm/rar.c
diff --git a/arch/x86/include/asm/rar.h b/arch/x86/include/asm/rar.h
new file mode 100644
index 000000000000..78c039e40e81
--- /dev/null
+++ b/arch/x86/include/asm/rar.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_RAR_H
+#define _ASM_X86_RAR_H
+
+/*
+ * RAR payload types
+ */
+#define RAR_TYPE_INVPG 0
+#define RAR_TYPE_INVPG_NO_CR3 1
+#define RAR_TYPE_INVPCID 2
+#define RAR_TYPE_INVEPT 3
+#define RAR_TYPE_INVVPID 4
+#define RAR_TYPE_WRMSR 5
+
+/*
+ * Subtypes for RAR_TYPE_INVLPG
+ */
+#define RAR_INVPG_ADDR 0 /* address specific */
+#define RAR_INVPG_ALL 2 /* all, include global */
+#define RAR_INVPG_ALL_NO_GLOBAL 3 /* all, exclude global */
+
+/*
+ * Subtypes for RAR_TYPE_INVPCID
+ */
+#define RAR_INVPCID_ADDR 0 /* address specific */
+#define RAR_INVPCID_PCID 1 /* all of PCID */
+#define RAR_INVPCID_ALL 2 /* all, include global */
+#define RAR_INVPCID_ALL_NO_GLOBAL 3 /* all, exclude global */
+
+/*
+ * Page size for RAR_TYPE_INVLPG
+ */
+#define RAR_INVLPG_PAGE_SIZE_4K 0
+#define RAR_INVLPG_PAGE_SIZE_2M 1
+#define RAR_INVLPG_PAGE_SIZE_1G 2
+
+/*
+ * Max number of pages per payload
+ */
+#define RAR_INVLPG_MAX_PAGES 63
+
+struct rar_payload {
+ u64 for_sw : 8;
+ u64 type : 8;
+ u64 must_be_zero_1 : 16;
+ u64 subtype : 3;
+ u64 page_size : 2;
+ u64 num_pages : 6;
+ u64 must_be_zero_2 : 21;
+
+ u64 must_be_zero_3;
+
+ /*
+ * Starting address
+ */
+ u64 initiator_cr3;
+ u64 linear_address;
+
+ /*
+ * Padding
+ */
+ u64 padding[4];
+};
+
+void rar_cpu_init(void);
+void smp_call_rar_many(const struct cpumask *mask, u16 pcid,
+ unsigned long start, unsigned long end);
+
+#endif /* _ASM_X86_RAR_H */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index dd662c42f510..b1e1b9afb2ac 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -71,6 +71,7 @@
#include <asm/tdx.h>
#include <asm/posted_intr.h>
#include <asm/runtime-const.h>
+#include <asm/rar.h>
#include "cpu.h"
@@ -2438,6 +2439,9 @@ void cpu_init(void)
if (is_uv_system())
uv_cpu_init();
+ if (cpu_feature_enabled(X86_FEATURE_RAR))
+ rar_cpu_init();
+
load_fixmap_gdt(cpu);
}
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 5b9908f13dcf..f36fc99e8b10 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -52,6 +52,7 @@ obj-$(CONFIG_ACPI_NUMA) += srat.o
obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
obj-$(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION) += pti.o
+obj-$(CONFIG_BROADCAST_TLB_FLUSH) += rar.o
obj-$(CONFIG_X86_MEM_ENCRYPT) += mem_encrypt.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_amd.o
diff --git a/arch/x86/mm/rar.c b/arch/x86/mm/rar.c
new file mode 100644
index 000000000000..16dc9b889cbd
--- /dev/null
+++ b/arch/x86/mm/rar.c
@@ -0,0 +1,195 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * RAR TLB shootdown
+ */
+#include <linux/sched.h>
+#include <linux/bug.h>
+#include <asm/current.h>
+#include <asm/io.h>
+#include <asm/sync_bitops.h>
+#include <asm/rar.h>
+#include <asm/tlbflush.h>
+
+static DEFINE_PER_CPU(struct cpumask, rar_cpu_mask);
+
+#define RAR_ACTION_OK 0x00
+#define RAR_ACTION_START 0x01
+#define RAR_ACTION_ACKED 0x02
+#define RAR_ACTION_FAIL 0x80
+
+#define RAR_MAX_PAYLOADS 32UL
+
+static unsigned long rar_in_use = ~(RAR_MAX_PAYLOADS - 1);
+static struct rar_payload rar_payload[RAR_MAX_PAYLOADS] __page_aligned_bss;
+static DEFINE_PER_CPU_ALIGNED(u8[RAR_MAX_PAYLOADS], rar_action);
+
+static unsigned long get_payload(void)
+{
+ while (1) {
+ unsigned long bit;
+
+ /*
+ * Find a free bit and confirm it with
+ * test_and_set_bit() below.
+ */
+ bit = ffz(READ_ONCE(rar_in_use));
+
+ if (bit >= RAR_MAX_PAYLOADS)
+ continue;
+
+ if (!test_and_set_bit((long)bit, &rar_in_use))
+ return bit;
+ }
+}
+
+static void free_payload(unsigned long idx)
+{
+ clear_bit(idx, &rar_in_use);
+}
+
+static void set_payload(unsigned long idx, u16 pcid, unsigned long start,
+ uint32_t pages)
+{
+ struct rar_payload *p = &rar_payload[idx];
+
+ p->must_be_zero_1 = 0;
+ p->must_be_zero_2 = 0;
+ p->must_be_zero_3 = 0;
+ p->page_size = RAR_INVLPG_PAGE_SIZE_4K;
+ p->type = RAR_TYPE_INVPCID;
+ p->num_pages = pages;
+ p->initiator_cr3 = pcid;
+ p->linear_address = start;
+
+ if (pcid) {
+ /* RAR invalidation of the mapping of a specific process. */
+ if (pages >= RAR_INVLPG_MAX_PAGES)
+ p->subtype = RAR_INVPCID_PCID;
+ else
+ p->subtype = RAR_INVPCID_ADDR;
+ } else {
+ /*
+ * Unfortunately RAR_INVPCID_ADDR excludes global translations.
+ * Always do a full flush for kernel invalidations.
+ */
+ p->subtype = RAR_INVPCID_ALL;
+ }
+
+ smp_wmb();
+}
+
+static void set_action_entry(unsigned long idx, int target_cpu)
+{
+ u8 *bitmap = per_cpu(rar_action, target_cpu);
+
+ WRITE_ONCE(bitmap[idx], RAR_ACTION_START);
+}
+
+static void wait_for_done(unsigned long idx, int target_cpu)
+{
+ u8 status;
+ u8 *rar_actions = per_cpu(rar_action, target_cpu);
+
+ status = READ_ONCE(rar_actions[idx]);
+
+ while ((status != RAR_ACTION_OK) && (status != RAR_ACTION_FAIL)) {
+ cpu_relax();
+ status = READ_ONCE(rar_actions[idx]);
+ }
+
+ WARN_ON_ONCE(rar_actions[idx] == RAR_ACTION_FAIL);
+}
+
+void rar_cpu_init(void)
+{
+ u64 r;
+ u8 *bitmap;
+ int this_cpu = smp_processor_id();
+
+ cpumask_clear(&per_cpu(rar_cpu_mask, this_cpu));
+
+ rdmsrl(MSR_IA32_RAR_INFO, r);
+ pr_info_once("RAR: support %lld payloads\n", r >> 32);
+
+ bitmap = (u8 *)per_cpu(rar_action, this_cpu);
+ memset(bitmap, 0, RAR_MAX_PAYLOADS);
+ wrmsrl(MSR_IA32_RAR_ACT_VEC, (u64)virt_to_phys(bitmap));
+ wrmsrl(MSR_IA32_RAR_PAYLOAD_BASE, (u64)virt_to_phys(rar_payload));
+
+ r = RAR_CTRL_ENABLE | RAR_CTRL_IGNORE_IF;
+ // reserved bits!!! r |= (RAR_VECTOR & 0xff);
+ wrmsrl(MSR_IA32_RAR_CTRL, r);
+}
+
+/*
+ * This is a modified version of smp_call_function_many() of kernel/smp.c,
+ * without a function pointer, because the RAR handler is the ucode.
+ */
+void smp_call_rar_many(const struct cpumask *mask, u16 pcid,
+ unsigned long start, unsigned long end)
+{
+ unsigned long pages = (end - start + PAGE_SIZE) / PAGE_SIZE;
+ int cpu, next_cpu, this_cpu = smp_processor_id();
+ cpumask_t *dest_mask;
+ unsigned long idx;
+
+ if (pages > RAR_INVLPG_MAX_PAGES || end == TLB_FLUSH_ALL)
+ pages = RAR_INVLPG_MAX_PAGES;
+
+ /*
+ * Can deadlock when called with interrupts disabled.
+ * We allow cpu's that are not yet online though, as no one else can
+ * send smp call function interrupt to this cpu and as such deadlocks
+ * can't happen.
+ */
+ WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
+ && !oops_in_progress && !early_boot_irqs_disabled);
+
+ /* Try to fastpath. So, what's a CPU they want? Ignoring this one. */
+ cpu = cpumask_first_and(mask, cpu_online_mask);
+ if (cpu == this_cpu)
+ cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
+
+ /* No online cpus? We're done. */
+ if (cpu >= nr_cpu_ids)
+ return;
+
+ /* Do we have another CPU which isn't us? */
+ next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
+ if (next_cpu == this_cpu)
+ next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask);
+
+ /* Fastpath: do that cpu by itself. */
+ if (next_cpu >= nr_cpu_ids) {
+ idx = get_payload();
+ set_payload(idx, pcid, start, pages);
+ set_action_entry(idx, cpu);
+ arch_send_rar_single_ipi(cpu);
+ wait_for_done(idx, cpu);
+ free_payload(idx);
+ return;
+ }
+
+ dest_mask = this_cpu_ptr(&rar_cpu_mask);
+ cpumask_and(dest_mask, mask, cpu_online_mask);
+ cpumask_clear_cpu(this_cpu, dest_mask);
+
+ /* Some callers race with other cpus changing the passed mask */
+ if (unlikely(!cpumask_weight(dest_mask)))
+ return;
+
+ idx = get_payload();
+ set_payload(idx, pcid, start, pages);
+
+ for_each_cpu(cpu, dest_mask)
+ set_action_entry(idx, cpu);
+
+ /* Send a message to all CPUs in the map */
+ arch_send_rar_ipi_mask(dest_mask);
+
+ for_each_cpu(cpu, dest_mask)
+ wait_for_done(idx, cpu);
+
+ free_payload(idx);
+}
+EXPORT_SYMBOL(smp_call_rar_many);
--
2.49.0
On 5/19/25 18:02, Rik van Riel wrote:
> From: Yu-cheng Yu <yu-cheng.yu@intel.com>
>
> Remote Action Request (RAR) is a TLB flushing broadcast facility.
> To start a TLB flush, the initiator CPU creates a RAR payload and
> sends a command to the APIC. The receiving CPUs automatically flush
> TLBs as specified in the payload without the kernel's involvement.
>
> [ riel: add pcid parameter to smp_call_rar_many so other mms can be flushed ]
>
> Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
> Signed-off-by: Rik van Riel <riel@surriel.com>
> ---
> arch/x86/include/asm/rar.h | 69 +++++++++++++
> arch/x86/kernel/cpu/common.c | 4 +
> arch/x86/mm/Makefile | 1 +
> arch/x86/mm/rar.c | 195 +++++++++++++++++++++++++++++++++++
> 4 files changed, 269 insertions(+)
> create mode 100644 arch/x86/include/asm/rar.h
> create mode 100644 arch/x86/mm/rar.c
>
> diff --git a/arch/x86/include/asm/rar.h b/arch/x86/include/asm/rar.h
> new file mode 100644
> index 000000000000..78c039e40e81
> --- /dev/null
> +++ b/arch/x86/include/asm/rar.h
> @@ -0,0 +1,69 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_X86_RAR_H
> +#define _ASM_X86_RAR_H
> +
> +/*
> + * RAR payload types
> + */
> +#define RAR_TYPE_INVPG 0
> +#define RAR_TYPE_INVPG_NO_CR3 1
> +#define RAR_TYPE_INVPCID 2
> +#define RAR_TYPE_INVEPT 3
> +#define RAR_TYPE_INVVPID 4
> +#define RAR_TYPE_WRMSR 5
> +
> +/*
> + * Subtypes for RAR_TYPE_INVLPG
> + */
> +#define RAR_INVPG_ADDR 0 /* address specific */
> +#define RAR_INVPG_ALL 2 /* all, include global */
> +#define RAR_INVPG_ALL_NO_GLOBAL 3 /* all, exclude global */
> +
> +/*
> + * Subtypes for RAR_TYPE_INVPCID
> + */
> +#define RAR_INVPCID_ADDR 0 /* address specific */
> +#define RAR_INVPCID_PCID 1 /* all of PCID */
> +#define RAR_INVPCID_ALL 2 /* all, include global */
> +#define RAR_INVPCID_ALL_NO_GLOBAL 3 /* all, exclude global */
> +
> +/*
> + * Page size for RAR_TYPE_INVLPG
> + */
> +#define RAR_INVLPG_PAGE_SIZE_4K 0
> +#define RAR_INVLPG_PAGE_SIZE_2M 1
> +#define RAR_INVLPG_PAGE_SIZE_1G 2
> +
> +/*
> + * Max number of pages per payload
> + */
> +#define RAR_INVLPG_MAX_PAGES 63
> +
> +struct rar_payload {
> + u64 for_sw : 8;
> + u64 type : 8;
> + u64 must_be_zero_1 : 16;
> + u64 subtype : 3;
> + u64 page_size : 2;
> + u64 num_pages : 6;
> + u64 must_be_zero_2 : 21;
> +
> + u64 must_be_zero_3;
> +
> + /*
> + * Starting address
> + */
> + u64 initiator_cr3;
> + u64 linear_address;
> +
> + /*
> + * Padding
> + */
> + u64 padding[4];
> +};
> +
> +void rar_cpu_init(void);
> +void smp_call_rar_many(const struct cpumask *mask, u16 pcid,
> + unsigned long start, unsigned long end);
> +
> +#endif /* _ASM_X86_RAR_H */
> diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
> index dd662c42f510..b1e1b9afb2ac 100644
> --- a/arch/x86/kernel/cpu/common.c
> +++ b/arch/x86/kernel/cpu/common.c
> @@ -71,6 +71,7 @@
> #include <asm/tdx.h>
> #include <asm/posted_intr.h>
> #include <asm/runtime-const.h>
> +#include <asm/rar.h>
>
> #include "cpu.h"
>
> @@ -2438,6 +2439,9 @@ void cpu_init(void)
> if (is_uv_system())
> uv_cpu_init();
>
> + if (cpu_feature_enabled(X86_FEATURE_RAR))
> + rar_cpu_init();
> +
> load_fixmap_gdt(cpu);
> }
>
> diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
> index 5b9908f13dcf..f36fc99e8b10 100644
> --- a/arch/x86/mm/Makefile
> +++ b/arch/x86/mm/Makefile
> @@ -52,6 +52,7 @@ obj-$(CONFIG_ACPI_NUMA) += srat.o
> obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
> obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
> obj-$(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION) += pti.o
> +obj-$(CONFIG_BROADCAST_TLB_FLUSH) += rar.o
>
> obj-$(CONFIG_X86_MEM_ENCRYPT) += mem_encrypt.o
> obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_amd.o
> diff --git a/arch/x86/mm/rar.c b/arch/x86/mm/rar.c
> new file mode 100644
> index 000000000000..16dc9b889cbd
> --- /dev/null
> +++ b/arch/x86/mm/rar.c
> @@ -0,0 +1,195 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * RAR TLB shootdown
> + */
> +#include <linux/sched.h>
> +#include <linux/bug.h>
> +#include <asm/current.h>
> +#include <asm/io.h>
> +#include <asm/sync_bitops.h>
> +#include <asm/rar.h>
> +#include <asm/tlbflush.h>
> +
> +static DEFINE_PER_CPU(struct cpumask, rar_cpu_mask);
> +
> +#define RAR_ACTION_OK 0x00
> +#define RAR_ACTION_START 0x01
> +#define RAR_ACTION_ACKED 0x02
> +#define RAR_ACTION_FAIL 0x80
These don't match up with the names that ended up in the public
documentation. Could we realign them, please?
> +#define RAR_MAX_PAYLOADS 32UL
> +
> +static unsigned long rar_in_use = ~(RAR_MAX_PAYLOADS - 1);
> +static struct rar_payload rar_payload[RAR_MAX_PAYLOADS] __page_aligned_bss;
> +static DEFINE_PER_CPU_ALIGNED(u8[RAR_MAX_PAYLOADS], rar_action);
At some point, there needs to be a description of the data structures.
For instance, there's nothing architecturally requiring all CPUs to
share a payload table. But this implementation chooses to have them
share. We need a discussion somewhere of those design decisions.
One thing that also needs discussion: 'rar_in_use' isn't really about
RAR itself. It's a bitmap of which payload slots are allocated.
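Something along these lines is roughly what I have in mind; the wording
is just a sketch, not text from the patch:

        /*
         * Design notes (illustrative only):
         *
         *  - rar_payload[] is a single, system-wide table of payload
         *    slots shared by all CPUs.  Nothing in the architecture
         *    requires sharing; per-CPU tables would also work, but a
         *    shared table keeps the MSR setup identical on every CPU
         *    and bounds the memory footprint.
         *
         *  - rar_in_use is purely a software allocation bitmap: a set
         *    bit means the corresponding payload slot is claimed by an
         *    initiator.  It says nothing about RAR hardware state.
         */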
> +static unsigned long get_payload(void)
> +{
This is more like "allocate a payload slot" than a "get payload"
operation, IMNHO.
> + while (1) {
> + unsigned long bit;
> +
> + /*
> + * Find a free bit and confirm it with
> + * test_and_set_bit() below.
> + */
> + bit = ffz(READ_ONCE(rar_in_use));
> +
> + if (bit >= RAR_MAX_PAYLOADS)
> + continue;
> +
> + if (!test_and_set_bit((long)bit, &rar_in_use))
> + return bit;
> + }
> +}
This also serves as a kind of spinlock to wait for a payload slot to
become free.
> +static void free_payload(unsigned long idx)
> +{
> + clear_bit(idx, &rar_in_use);
> +}
> +
> +static void set_payload(unsigned long idx, u16 pcid, unsigned long start,
> + uint32_t pages)
> +{
> + struct rar_payload *p = &rar_payload[idx];
I'd _probably_ just pass the 'struct rar_payload *' instead of an index.
It's harder to screw up a pointer.
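Something like this is the shape I mean (untested sketch; only the
signatures change, the bodies stay as they are):

        static struct rar_payload *get_payload(void);
        static void free_payload(struct rar_payload *p);
        static void set_payload(struct rar_payload *p, u16 pcid,
                                unsigned long start, u32 pages);
        static void set_action_entry(struct rar_payload *p, int target_cpu);
        static void wait_for_done(struct rar_payload *p, int target_cpu);

The action-vector helpers can recover the index with 'p - rar_payload'
where they still need it.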
> + p->must_be_zero_1 = 0;
> + p->must_be_zero_2 = 0;
> + p->must_be_zero_3 = 0;
> + p->page_size = RAR_INVLPG_PAGE_SIZE_4K;
> + p->type = RAR_TYPE_INVPCID;
> + p->num_pages = pages;
> + p->initiator_cr3 = pcid;
> + p->linear_address = start;
> +
> + if (pcid) {
> + /* RAR invalidation of the mapping of a specific process. */
> + if (pages >= RAR_INVLPG_MAX_PAGES)
> + p->subtype = RAR_INVPCID_PCID;
> + else
> + p->subtype = RAR_INVPCID_ADDR;
> + } else {
> + /*
> + * Unfortunately RAR_INVPCID_ADDR excludes global translations.
> + * Always do a full flush for kernel invalidations.
> + */
> + p->subtype = RAR_INVPCID_ALL;
> + }
> +
> + smp_wmb();
> +}
The barrier could use a comment too.
> +static void set_action_entry(unsigned long idx, int target_cpu)
Just trying to read this, I think we probably should remove the 'idx'
nomenclature and call them "payload_nr"'s or something more descriptive.
> +{
> + u8 *bitmap = per_cpu(rar_action, target_cpu);
> +
> + WRITE_ONCE(bitmap[idx], RAR_ACTION_START);
> +}
Maybe a comment like this for set_action_entry() would be helpful:
/*
* Given a remote CPU, "arm" its action vector to ensure it
* handles payload number 'idx' when it receives the RAR signal.
* The remote CPU will overwrite RAR_ACTION_START when it handles
* the request.
*/
> +static void wait_for_done(unsigned long idx, int target_cpu)
> +{
> + u8 status;
> + u8 *rar_actions = per_cpu(rar_action, target_cpu);
> +
> + status = READ_ONCE(rar_actions[idx]);
> +
> + while ((status != RAR_ACTION_OK) && (status != RAR_ACTION_FAIL)) {
Should this be:
while (status == RAR_ACTION_START) {
...
? That would more clearly link it to set_action_entry() and would also
be shorter.
> + cpu_relax();
> + status = READ_ONCE(rar_actions[idx]);
> + }
> +
> + WARN_ON_ONCE(rar_actions[idx] == RAR_ACTION_FAIL);
> +}
> +
> +void rar_cpu_init(void)
> +{
> + u64 r;
> + u8 *bitmap;
> + int this_cpu = smp_processor_id();
> +
> + cpumask_clear(&per_cpu(rar_cpu_mask, this_cpu));
> +
> + rdmsrl(MSR_IA32_RAR_INFO, r);
> + pr_info_once("RAR: support %lld payloads\n", r >> 32);
Doesn't this need to get coordinated or checked against RAR_MAX_PAYLOADS?
It might also be nice to use one of the mask functions for this. It's
nice when you see a spec say "37:32" and then actually see a
GENMASK(37, 32) somewhere in the code to match it.
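Roughly like this (untested, and the 37:32 placement is only the
example above; I have not checked the real field position in
MSR_IA32_RAR_INFO):

        #define RAR_INFO_PAYLOAD_COUNT  GENMASK_ULL(37, 32)

        rdmsrl(MSR_IA32_RAR_INFO, r);
        if (WARN_ON_ONCE(FIELD_GET(RAR_INFO_PAYLOAD_COUNT, r) <
                         RAR_MAX_PAYLOADS))
                return;

(FIELD_GET() needs <linux/bitfield.h>.)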
> + bitmap = (u8 *)per_cpu(rar_action, this_cpu);
> + memset(bitmap, 0, RAR_MAX_PAYLOADS);
> + wrmsrl(MSR_IA32_RAR_ACT_VEC, (u64)virt_to_phys(bitmap));
> + wrmsrl(MSR_IA32_RAR_PAYLOAD_BASE, (u64)virt_to_phys(rar_payload));
please vertically align the virt_to_phys() ^
> +
> + r = RAR_CTRL_ENABLE | RAR_CTRL_IGNORE_IF;
Setting RAR_CTRL_IGNORE_IF is probably worth a _little_ discussion in
the changelog.
> + // reserved bits!!! r |= (RAR_VECTOR & 0xff);
Is this just some cruft from testing?
> + wrmsrl(MSR_IA32_RAR_CTRL, r);
> +}
> +
> +/*
> + * This is a modified version of smp_call_function_many() of kernel/smp.c,
> + * without a function pointer, because the RAR handler is the ucode.
> + */
It doesn't look _that_ much like smp_call_function_many(). I don't see
much that can be consolidated.
> +void smp_call_rar_many(const struct cpumask *mask, u16 pcid,
> + unsigned long start, unsigned long end)
> +{
> + unsigned long pages = (end - start + PAGE_SIZE) / PAGE_SIZE;
> + int cpu, next_cpu, this_cpu = smp_processor_id();
> + cpumask_t *dest_mask;
> + unsigned long idx;
> +
> + if (pages > RAR_INVLPG_MAX_PAGES || end == TLB_FLUSH_ALL)
> + pages = RAR_INVLPG_MAX_PAGES;
> +
> + /*
> + * Can deadlock when called with interrupts disabled.
> + * We allow cpu's that are not yet online though, as no one else can
Nit: at some point all of the "we's" need to be excised and moved over
to imperative voice.
> + * send smp call function interrupt to this cpu and as such deadlocks
> + * can't happen.
> + */
> + WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
> + && !oops_in_progress && !early_boot_irqs_disabled);
> +
> + /* Try to fastpath. So, what's a CPU they want? Ignoring this one. */
> + cpu = cpumask_first_and(mask, cpu_online_mask);
> + if (cpu == this_cpu)
> + cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
> +
> + /* No online cpus? We're done. */
> + if (cpu >= nr_cpu_ids)
> + return;
This little idiom _is_ in smp_call_function_many_cond(). I wonder if it
can be refactored out.
> + /* Do we have another CPU which isn't us? */
> + next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
> + if (next_cpu == this_cpu)
> + next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask);
> +
> + /* Fastpath: do that cpu by itself. */
> + if (next_cpu >= nr_cpu_ids) {
> + idx = get_payload();
> + set_payload(idx, pcid, start, pages);
> + set_action_entry(idx, cpu);
> + arch_send_rar_single_ipi(cpu);
> + wait_for_done(idx, cpu);
> + free_payload(idx);
> + return;
> + }
FWIW, I'm not sure this is that much of a fast path. I wouldn't be
shocked if _some_ hardware has a much faster way of IPI'ing a single CPU
versus a bunch. But I think arch_send_rar_single_ipi() and
arch_send_rar_ipi_mask() end up frobbing the hardware in pretty similar
ways.
I'd probably just axe this in the name of simplification unless there
are numbers behind it.
> + dest_mask = this_cpu_ptr(&rar_cpu_mask);
> + cpumask_and(dest_mask, mask, cpu_online_mask);
> + cpumask_clear_cpu(this_cpu, dest_mask);
> +
> + /* Some callers race with other cpus changing the passed mask */
> + if (unlikely(!cpumask_weight(dest_mask)))
> + return;
> +
> + idx = get_payload();
> + set_payload(idx, pcid, start, pages);
> +
> + for_each_cpu(cpu, dest_mask)
> + set_action_entry(idx, cpu);
> +
> + /* Send a message to all CPUs in the map */
> + arch_send_rar_ipi_mask(dest_mask);
> +
> + for_each_cpu(cpu, dest_mask)
> + wait_for_done(idx, cpu);
Naming nit: Let's give wait_for_done() a more RAR-specific name. It'll
make it clear that this is a RAR operation and not something generic.
> + free_payload(idx);
> +}
> +EXPORT_SYMBOL(smp_call_rar_many);
On Wed, 2025-05-21 at 09:38 -0700, Dave Hansen wrote:
>
> > +static void wait_for_done(unsigned long idx, int target_cpu)
> > +{
> > + u8 status;
> > + u8 *rar_actions = per_cpu(rar_action, target_cpu);
> > +
> > + status = READ_ONCE(rar_actions[idx]);
> > +
> > + while ((status != RAR_ACTION_OK) && (status !=
> > RAR_ACTION_FAIL)) {
>
> Should this be:
>
> while (status == RAR_ACTION_START) {
> ...
>
> ? That would more clearly link it to set_action_entry() and would
> also
> be shorter.
>
That is a very good question. The old RAR code
suggests there might be some intermediate state
when the target CPU works on processing the
RAR entry, but the current documentation only
shows RAR_SUCCESS, RAR_PENDING, and RAR_FAILURE
as possible values.
Let's try with status == RAR_ACTION_PENDING.
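Something like this, assuming the values get renamed to match the
documented SUCCESS/PENDING/FAILURE naming (sketch only, not what v2
does):

        while (READ_ONCE(rar_actions[idx]) == RAR_ACTION_PENDING)
                cpu_relax();

        WARN_ON_ONCE(READ_ONCE(rar_actions[idx]) == RAR_ACTION_FAILURE);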
> >
> > +void rar_cpu_init(void)
> > +{
> > + u64 r;
> > + u8 *bitmap;
> > + int this_cpu = smp_processor_id();
> > +
> > + cpumask_clear(&per_cpu(rar_cpu_mask, this_cpu));
> > +
> > + rdmsrl(MSR_IA32_RAR_INFO, r);
> > + pr_info_once("RAR: support %lld payloads\n", r >> 32);
>
> Doesn't this need to get coordinated or checked against
> RAR_MAX_PAYLOADS?
I just added that in, and also applied all the cleanups
from your email.
>
> > + // reserved bits!!! r |= (RAR_VECTOR & 0xff);
>
> Is this just some cruft from testing?
>
I'm kind of guessing the old code might have used this
value to specify which IRQ vector to use for RAR, but
modern microcode hardcodes the RAR_VECTOR value.
> > + wrmsrl(MSR_IA32_RAR_CTRL, r);
> > +}
> > +
> > +/*
> > + * This is a modified version of smp_call_function_many() of
> > kernel/smp.c,
> > + * without a function pointer, because the RAR handler is the
> > ucode.
> > + */
>
> It doesn't look _that_ much like smp_call_function_many(). I don't
> see
> much that can be consolidated.
Agreed. It looks even less like it after some more
simplifications.
>
> > + /* No online cpus? We're done. */
> > + if (cpu >= nr_cpu_ids)
> > + return;
>
> This little idiom _is_ in smp_call_function_many_cond(). I wonder if
> it
> can be refactored out.
Removing the arch_send_rar_single_ipi fast path
gets rid of this code completely.
Once we cpumask_and with the cpu_online_mask,
the cpumask_weight should end up as 0 if no
online CPUs are in the mask.
Thank you for all the cleanup suggestions.
I've tried to address them all for v3.
--
All Rights Reversed.
On Wed, May 21 2025 at 09:38, Dave Hansen wrote:
> On 5/19/25 18:02, Rik van Riel wrote:
>> +/*
>> + * This is a modified version of smp_call_function_many() of kernel/smp.c,
>> + * without a function pointer, because the RAR handler is the ucode.
>> + */
>
> It doesn't look _that_ much like smp_call_function_many(). I don't see
> much that can be consolidated.
It does not look like it because it has a gazillion function
arguments, which can all be packed into a data structure, i.e. the
function argument of smp_call_function_many().
There is zero justification to reinvent the wheel and create another
source of hard to debug problems.
IMNSHO it's absolutely not rocket science to reuse
smp_call_function_many() for this, but I might be missing something as
always and I'm happy to be enlightened.
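Just to illustrate the argument packing (names are irrelevant and made
up here):

        struct rar_flush_info {
                u16             pcid;
                unsigned long   start;
                unsigned long   end;
        };

which is exactly the kind of cookie the common SMP code already hands
around on behalf of smp_call_function_many() callers.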
Just for the record: The changelog contains an utter void of information
why this modified version of well established common code is required
and desired.
Thanks,
tglx
Not a full review, but..
> On 20 May 2025, at 4:02, Rik van Riel <riel@surriel.com> wrote:
>
> +/*
> + * This is a modified version of smp_call_function_many() of kernel/smp.c,
The updated function name is smp_call_function_many_cond() and it is
not aligned with smp_call_rar_many. I think the new version is (surprisingly)
better, so it’d be beneficial to bring smp_call_rar_many() in line with the
updated one in smp.c.
> + * without a function pointer, because the RAR handler is the ucode.
> + */
> +void smp_call_rar_many(const struct cpumask *mask, u16 pcid,
> + unsigned long start, unsigned long end)
> +{
> + unsigned long pages = (end - start + PAGE_SIZE) / PAGE_SIZE;
> + int cpu, next_cpu, this_cpu = smp_processor_id();
> + cpumask_t *dest_mask;
> + unsigned long idx;
> +
> + if (pages > RAR_INVLPG_MAX_PAGES || end == TLB_FLUSH_ALL)
> + pages = RAR_INVLPG_MAX_PAGES;
> +
> + /*
> + * Can deadlock when called with interrupts disabled.
> + * We allow cpu's that are not yet online though, as no one else can
> + * send smp call function interrupt to this cpu and as such deadlocks
> + * can't happen.
> + */
> + WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
> + && !oops_in_progress && !early_boot_irqs_disabled);
I thought you agreed to change it to make it use lockdep instead (so it will
be compiled out without LOCKDEP), like done in smp_call_function_many_cond()
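i.e., something along the lines of what smp_call_function_many_cond()
does (just a sketch):

        if (cpu_online(this_cpu) && !oops_in_progress &&
            !early_boot_irqs_disabled)
                lockdep_assert_irqs_enabled();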
> +
> + /* Try to fastpath. So, what's a CPU they want? Ignoring this one. */
> + cpu = cpumask_first_and(mask, cpu_online_mask);
> + if (cpu == this_cpu)
> + cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
> +
Putting aside the rest of the code, I see you don’t call should_flush_tlb().
I think it is worth mentioning in commit log or comment the rationale behind
it (and maybe benchmarks to justify it).
On Tue, 2025-05-20 at 14:29 +0300, Nadav Amit wrote:
> Not a full review, but..
> 
> > On 20 May 2025, at 4:02, Rik van Riel <riel@surriel.com> wrote:
> > 
> > +/*
> > + * This is a modified version of smp_call_function_many() of kernel/smp.c,
> 
> The updated function name is smp_call_function_many_cond() and it is
> not aligned with smp_call_rar_many. I think the new version is (surprisingly)
> better, so it’d be beneficial to bring smp_call_rar_many() in line with the
> updated one in smp.c.
> 
Agreed, it will be good to conditionally not send the RAR
vector to some CPUs, especially ones that are in deeper
idle states.

That means structuring the code more like
smp_call_function_many_cond().

> > + /*
> > + * Can deadlock when called with interrupts disabled.
> > + * We allow cpu's that are not yet online though, as no one else can
> > + * send smp call function interrupt to this cpu and as such deadlocks
> > + * can't happen.
> > + */
> > + WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
> > + && !oops_in_progress && !early_boot_irqs_disabled);
> 
> I thought you agreed to change it to make it use lockdep instead (so it will
> be compiled out without LOCKDEP), like done in smp_call_function_many_cond()
> 
I thought I had made that change in my tree.
I guess I lost it in a rebase :(

> > +
> > + /* Try to fastpath. So, what's a CPU they want? Ignoring this one. */
> > + cpu = cpumask_first_and(mask, cpu_online_mask);
> > + if (cpu == this_cpu)
> > + cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
> > +
> 
> Putting aside the rest of the code, I see you don’t call should_flush_tlb().
> I think it is worth mentioning in commit log or comment the rationale behind
> it (and maybe benchmarks to justify it).
> 
The long term plan here is to simply have the originating
CPU included in the cpumask, and have it send a RAR
request to itself.

That way all the CPUs can invalidate their entries in
parallel, without any extra code.

-- 
All Rights Reversed.
> On 20 May 2025, at 16:00, Rik van Riel <riel@surriel.com> wrote:
> 
>> Putting aside the rest of the code, I see you don’t call
>> should_flush_tlb().
>> I think it is worth mentioning in commit log or comment the rationale
>> behind it (and maybe benchmarks to justify it).
>> 
> The long term plan here is to simply have the originating
> CPU included in the cpumask, and have it send a RAR
> request to itself.

That’s unrelated. I was referring to considering supporting
some sort of lazy TLB to eliminate sending RAR to cores that
do not care about it.

Is there a cost of RAR to more cores than needed? My guess is
that there is one, and maybe in such cases you would want
actual IPI and special handling.
On Tue, 2025-05-20 at 23:26 +0300, Nadav Amit wrote:
> 
> > On 20 May 2025, at 16:00, Rik van Riel <riel@surriel.com> wrote:
> > 
> > > Putting aside the rest of the code, I see you don’t call
> > > should_flush_tlb().
> > > I think it is worth mentioning in commit log or comment the
> > > rationale behind it (and maybe benchmarks to justify it).
> > > 
> > The long term plan here is to simply have the originating
> > CPU included in the cpumask, and have it send a RAR
> > request to itself.
> 
> That’s unrelated. I was referring to considering supporting
> some sort of lazy TLB to eliminate sending RAR to cores that
> do not care about it. Is there a cost of RAR to more cores than
> needed? My guess is that there is one, and maybe in such cases
> you would want actual IPI and special handling.

For RAR, I suspect the big cost is waking up CPUs in idle
states, and waiting for them to wake up.

One possibility may be to change leave_mm() to have an
argument to set some flag that the RAR code can read to see
whether or not to send a RAR interrupt to that CPU, even if
it is in the cpumask.

I don't think we can use the exact same should_flush_tlb()
logic, because the tlb_gen is not updated by a RAR flush,
and the should_flush_tlb() logic is somewhat intertwined
with the tlb_gen logic.

-- 
All Rights Reversed.
* Rik van Riel <riel@surriel.com> wrote:

> From: Yu-cheng Yu <yu-cheng.yu@intel.com>
> 
> Remote Action Request (RAR) is a TLB flushing broadcast facility.
> To start a TLB flush, the initiator CPU creates a RAR payload and
> sends a command to the APIC. The receiving CPUs automatically flush
> TLBs as specified in the payload without the kernel's involvement.
> 
> [ riel: add pcid parameter to smp_call_rar_many so other mms can be flushed ]

Please actually review & tidy up patches that you pass through, don't 
just hack them up minimally and slap your tag and SOB on top of it.

One example, of many:

> + * We allow cpu's that are not yet online though, as no one else can

Here the comment has 'CPU' in lowercase, and with a grammar mistake.

> + * send smp call function interrupt to this cpu and as such deadlocks

Here 'CPU' is in lowercase.

> + /* Try to fastpath. So, what's a CPU they want? Ignoring this one. */

Oh, here 'CPU' is uppercase again! What happened?

> + /* No online cpus? We're done. */

Lowercase again. Damn, I thought we settled on a way to spell this 
thing already.

> + /* Do we have another CPU which isn't us? */

And uppercase. What a roller-coaster.

> + /* Fastpath: do that cpu by itself. */

> + /* Some callers race with other cpus changing the passed mask */

And lowercase.

> + /* Send a message to all CPUs in the map */

And uppercase.

It's almost as if nobody has ever read these comments after writing 
them.

There's like a zillion small random-noise details through the entire 
series that insert unnecessary extra white noise in critical system 
code that should be a lot more carefully written, which emits a foul 
aura of carelessness. Reviewers should not be forced to point these 
out to you, in fact reviewers should not be exposed to such noise at 
all.

Please review the entire thing *much* more carefully before 
submitting -v3.

Thanks,

	Ingo
On Tue, 2025-05-20 at 11:28 +0200, Ingo Molnar wrote:
> 
> * Rik van Riel <riel@surriel.com> wrote:
> 
> > From: Yu-cheng Yu <yu-cheng.yu@intel.com>
> > 
> > Remote Action Request (RAR) is a TLB flushing broadcast facility.
> > To start a TLB flush, the initiator CPU creates a RAR payload and
> > sends a command to the APIC. The receiving CPUs automatically flush
> > TLBs as specified in the payload without the kernel's involvement.
> > 
> > [ riel: add pcid parameter to smp_call_rar_many so other mms can be
> > flushed ]
> 
> Please actually review & tidy up patches that you pass through, don't
> just hack them up minimally and slap your tag and SOB on top of it.

I'm happy to do that now that the code is finally working.

v3 will have many more cleanups, and hopefully a few
optimizations.

-- 
All Rights Reversed.