From: Yu-cheng Yu <yu-cheng.yu@intel.com>
Remote Action Request (RAR) is a TLB flushing broadcast facility.
To start a TLB flush, the initiator CPU creates a RAR payload and
sends a command to the APIC. The receiving CPUs automatically flush
TLBs as specified in the payload without the kernel's involvement.
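The flow on the initiator side is roughly (see smp_call_rar_many()
below):

	payload_nr = get_payload_slot();
	set_payload(&rar_payload[payload_nr], pcid, start, pages);

	for_each_cpu(cpu, dest_mask)
		set_action_entry(payload_nr, cpu);	/* RAR_ACTION_PENDING */

	/* The actual invalidation is handled by microcode on the targets. */
	native_send_rar_ipi(dest_mask);

	for_each_cpu(cpu, dest_mask)
		wait_for_action_done(payload_nr, cpu);	/* RAR_ACTION_SUCCESS */

	free_payload_slot(payload_nr);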
[ riel: add pcid parameter to smp_call_rar_many so other mms can be flushed ]
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
Signed-off-by: Rik van Riel <riel@surriel.com>
---
arch/x86/include/asm/rar.h | 69 +++++++++++
arch/x86/kernel/cpu/common.c | 4 +
arch/x86/mm/Makefile | 1 +
arch/x86/mm/rar.c | 217 +++++++++++++++++++++++++++++++++++
4 files changed, 291 insertions(+)
create mode 100644 arch/x86/include/asm/rar.h
create mode 100644 arch/x86/mm/rar.c
diff --git a/arch/x86/include/asm/rar.h b/arch/x86/include/asm/rar.h
new file mode 100644
index 000000000000..78c039e40e81
--- /dev/null
+++ b/arch/x86/include/asm/rar.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_RAR_H
+#define _ASM_X86_RAR_H
+
+/*
+ * RAR payload types
+ */
+#define RAR_TYPE_INVPG 0
+#define RAR_TYPE_INVPG_NO_CR3 1
+#define RAR_TYPE_INVPCID 2
+#define RAR_TYPE_INVEPT 3
+#define RAR_TYPE_INVVPID 4
+#define RAR_TYPE_WRMSR 5
+
+/*
+ * Subtypes for RAR_TYPE_INVPG
+ */
+#define RAR_INVPG_ADDR 0 /* address specific */
+#define RAR_INVPG_ALL 2 /* all, include global */
+#define RAR_INVPG_ALL_NO_GLOBAL 3 /* all, exclude global */
+
+/*
+ * Subtypes for RAR_TYPE_INVPCID
+ */
+#define RAR_INVPCID_ADDR 0 /* address specific */
+#define RAR_INVPCID_PCID 1 /* all of PCID */
+#define RAR_INVPCID_ALL 2 /* all, include global */
+#define RAR_INVPCID_ALL_NO_GLOBAL 3 /* all, exclude global */
+
+/*
+ * Page size for RAR_TYPE_INVPG
+ */
+#define RAR_INVLPG_PAGE_SIZE_4K 0
+#define RAR_INVLPG_PAGE_SIZE_2M 1
+#define RAR_INVLPG_PAGE_SIZE_1G 2
+
+/*
+ * Max number of pages per payload
+ */
+#define RAR_INVLPG_MAX_PAGES 63
+
+struct rar_payload {
+ u64 for_sw : 8;
+ u64 type : 8;
+ u64 must_be_zero_1 : 16;
+ u64 subtype : 3;
+ u64 page_size : 2;
+ u64 num_pages : 6;
+ u64 must_be_zero_2 : 21;
+
+ u64 must_be_zero_3;
+
+ /*
+ * Initiator CR3/PCID and starting linear address
+ */
+ u64 initiator_cr3;
+ u64 linear_address;
+
+ /*
+ * Padding
+ */
+ u64 padding[4];
+};
+
+void rar_cpu_init(void);
+void smp_call_rar_many(const struct cpumask *mask, u16 pcid,
+ unsigned long start, unsigned long end);
+
+#endif /* _ASM_X86_RAR_H */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 8feb8fd2957a..d68a0a9b2aa2 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -71,6 +71,7 @@
#include <asm/tdx.h>
#include <asm/posted_intr.h>
#include <asm/runtime-const.h>
+#include <asm/rar.h>
#include "cpu.h"
@@ -2425,6 +2426,9 @@ void cpu_init(void)
if (is_uv_system())
uv_cpu_init();
+ if (cpu_feature_enabled(X86_FEATURE_RAR))
+ rar_cpu_init();
+
load_fixmap_gdt(cpu);
}
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 5b9908f13dcf..f36fc99e8b10 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -52,6 +52,7 @@ obj-$(CONFIG_ACPI_NUMA) += srat.o
obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
obj-$(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION) += pti.o
+obj-$(CONFIG_BROADCAST_TLB_FLUSH) += rar.o
obj-$(CONFIG_X86_MEM_ENCRYPT) += mem_encrypt.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_amd.o
diff --git a/arch/x86/mm/rar.c b/arch/x86/mm/rar.c
new file mode 100644
index 000000000000..f63e68b412de
--- /dev/null
+++ b/arch/x86/mm/rar.c
@@ -0,0 +1,217 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * RAR TLB shootdown
+ */
+#include <linux/sched.h>
+#include <linux/bug.h>
+#include <asm/current.h>
+#include <asm/io.h>
+#include <asm/sync_bitops.h>
+#include <asm/rar.h>
+#include <asm/tlbflush.h>
+
+static DEFINE_PER_CPU(struct cpumask, rar_cpu_mask);
+
+#define RAR_ACTION_SUCCESS 0x00
+#define RAR_ACTION_PENDING 0x01
+#define RAR_ACTION_FAILURE 0x80
+
+#define RAR_MAX_PAYLOADS 64UL
+
+/* How many RAR payload slots are supported by the CPUs in this system */
+static int rar_max_payloads = RAR_MAX_PAYLOADS;
+
+/* Bitmap of RAR payload slots in use; all slots start out free. */
+static unsigned long rar_in_use;
+
+/*
+ * RAR payloads telling CPUs what to do. This table is shared between
+ * all CPUs; it is possible to have multiple payload tables shared between
+ * different subsets of CPUs, but that adds a lot of complexity.
+ */
+static struct rar_payload rar_payload[RAR_MAX_PAYLOADS] __page_aligned_bss;
+
+/*
+ * The action vector tells each CPU which payload table entries
+ * have work for that CPU.
+ */
+static DEFINE_PER_CPU_ALIGNED(u8[RAR_MAX_PAYLOADS], rar_action);
+
+static unsigned long get_payload_slot(void)
+{
+ while (1) {
+ unsigned long bit;
+
+ /*
+ * Find a free bit and confirm it with test_and_set_bit()
+ * below. If no slot is free, spin until one becomes free.
+ */
+ bit = ffz(READ_ONCE(rar_in_use));
+
+ if (bit >= rar_max_payloads)
+ continue;
+
+ if (!test_and_set_bit((long)bit, &rar_in_use))
+ return bit;
+ }
+}
+
+static void free_payload_slot(unsigned long payload_nr)
+{
+ clear_bit(payload_nr, &rar_in_use);
+}
+
+static void set_payload(struct rar_payload *p, u16 pcid, unsigned long start,
+ u32 pages)
+{
+ p->must_be_zero_1 = 0;
+ p->must_be_zero_2 = 0;
+ p->must_be_zero_3 = 0;
+ p->page_size = RAR_INVLPG_PAGE_SIZE_4K;
+ p->type = RAR_TYPE_INVPCID;
+ p->num_pages = pages;
+ p->initiator_cr3 = pcid;
+ p->linear_address = start;
+
+ if (pcid) {
+ /* RAR invalidation of the mapping of a specific process. */
+ if (pages >= RAR_INVLPG_MAX_PAGES)
+ p->subtype = RAR_INVPCID_PCID;
+ else
+ p->subtype = RAR_INVPCID_ADDR;
+ } else {
+ /*
+ * Unfortunately RAR_INVPCID_ADDR excludes global translations.
+ * Always do a full flush for kernel invalidations.
+ */
+ p->subtype = RAR_INVPCID_ALL;
+ }
+
+ /* Ensure all writes are visible before the action entry is set. */
+ smp_wmb();
+}
+
+static void set_action_entry(unsigned long payload_nr, int target_cpu)
+{
+ u8 *bitmap = per_cpu(rar_action, target_cpu);
+
+ /*
+ * Given a remote CPU, "arm" its action vector to ensure it handles
+ * the request at payload_nr when it receives a RAR signal.
+ * The remote CPU will overwrite RAR_ACTION_PENDING when it handles
+ * the request.
+ */
+ WRITE_ONCE(bitmap[payload_nr], RAR_ACTION_PENDING);
+}
+
+static void wait_for_action_done(unsigned long payload_nr, int target_cpu)
+{
+ u8 status;
+ u8 *rar_actions = per_cpu(rar_action, target_cpu);
+
+ status = READ_ONCE(rar_actions[payload_nr]);
+
+ while (status == RAR_ACTION_PENDING) {
+ cpu_relax();
+ status = READ_ONCE(rar_actions[payload_nr]);
+ }
+
+ WARN_ON_ONCE(status != RAR_ACTION_SUCCESS);
+}
+
+void rar_cpu_init(void)
+{
+ u64 r;
+ u8 *bitmap;
+ int max_payloads;
+ int this_cpu = smp_processor_id();
+
+ cpumask_clear(&per_cpu(rar_cpu_mask, this_cpu));
+
+ /* The MSR contains N defining the max [0-N] rar payload slots. */
+ rdmsrl(MSR_IA32_RAR_INFO, r);
+ max_payloads = (r >> 32) + 1;
+
+ /* If this CPU supports less than RAR_MAX_PAYLOADS, lower our limit. */
+ if (max_payloads < rar_max_payloads)
+ rar_max_payloads = max_payloads;
+ pr_info_once("RAR: support %d payloads\n", max_payloads);
+
+ bitmap = (u8 *)per_cpu(rar_action, this_cpu);
+ memset(bitmap, 0, RAR_MAX_PAYLOADS);
+ wrmsrl(MSR_IA32_RAR_ACT_VEC, (u64)virt_to_phys(bitmap));
+ wrmsrl(MSR_IA32_RAR_PAYLOAD_BASE, (u64)virt_to_phys(rar_payload));
+
+ /*
+ * Allow RAR events to be processed while interrupts are disabled on
+ * a target CPU. This prevents "pileups" where many CPUs are waiting
+ * on one CPU that has IRQs blocked for too long, and should reduce
+ * contention on the rar_payload table.
+ */
+ r = RAR_CTRL_ENABLE | RAR_CTRL_IGNORE_IF;
+ wrmsrl(MSR_IA32_RAR_CTRL, r);
+}
+
+/*
+ * Inspired by smp_call_function_many(), but RAR requires a global payload
+ * table rather than per-CPU payloads in the CSD table, because the action
+ * handler is microcode rather than software.
+ */
+void smp_call_rar_many(const struct cpumask *mask, u16 pcid,
+ unsigned long start, unsigned long end)
+{
+ unsigned long pages = (end - start + PAGE_SIZE) / PAGE_SIZE;
+ int cpu, this_cpu = smp_processor_id();
+ cpumask_t *dest_mask;
+ unsigned long payload_nr;
+
+ if (pages > RAR_INVLPG_MAX_PAGES || end == TLB_FLUSH_ALL)
+ pages = RAR_INVLPG_MAX_PAGES;
+
+ /*
+ * Can deadlock when called with interrupts disabled.
+ * Allow CPUs that are not yet online though, as no one else can
+ * send smp call function interrupt to this CPU and as such deadlocks
+ * can't happen.
+ */
+ if (cpu_online(this_cpu) && !oops_in_progress && !early_boot_irqs_disabled) {
+ lockdep_assert_irqs_enabled();
+ lockdep_assert_preemption_disabled();
+ }
+
+ /*
+ * A CPU needs to be initialized in order to process RARs.
+ * Skip offline CPUs.
+ *
+ * TODO:
+ * - Use RAR to flush our own TLB so it can all happen in parallel
+ * (need to resolve a chicken-egg issue with the boot CPU)
+ * - Skip RAR to CPUs that are in a deeper C-state, with an empty TLB
+ *
+ * This code cannot use the should_flush_tlb() logic here because
+ * RAR flushes do not update the tlb_gen, resulting in unnecessary
+ * flushes at context switch time.
+ */
+ dest_mask = this_cpu_ptr(&rar_cpu_mask);
+ cpumask_and(dest_mask, mask, cpu_online_mask);
+ __cpumask_clear_cpu(this_cpu, dest_mask);
+
+ /* Some callers race with other CPUs changing the passed mask */
+ if (unlikely(cpumask_empty(dest_mask)))
+ return;
+
+ payload_nr = get_payload_slot();
+ set_payload(&rar_payload[payload_nr], pcid, start, pages);
+
+ for_each_cpu(cpu, dest_mask)
+ set_action_entry(payload_nr, cpu);
+
+ /* Send a message to all CPUs in the map */
+ native_send_rar_ipi(dest_mask);
+
+ for_each_cpu(cpu, dest_mask)
+ wait_for_action_done(payload_nr, cpu);
+
+ free_payload_slot(payload_nr);
+}
+EXPORT_SYMBOL(smp_call_rar_many);
--
2.49.0
Just a few small things that jump out…
> On 5 Jun 2025, at 19:35, Rik van Riel <riel@surriel.com> wrote:
>
> +void rar_cpu_init(void)
> +{
> + u64 r;
> + u8 *bitmap;
> + int max_payloads;
> + int this_cpu = smp_processor_id();
> +
> + cpumask_clear(&per_cpu(rar_cpu_mask, this_cpu));
> +
> + /* The MSR contains N defining the max [0-N] rar payload slots. */
> + rdmsrl(MSR_IA32_RAR_INFO, r);
> + max_payloads = (r >> 32) + 1;
> +
> + /* If this CPU supports less than RAR_MAX_PAYLOADS, lower our limit. */
> + if (max_payloads < rar_max_payloads)
> + rar_max_payloads = max_payloads;
Unless I am missing something, this looks very racy.
BTW: should rar_max_payloads be ro_after_init?
> + pr_info_once("RAR: support %d payloads\n", max_payloads);
> +
> + bitmap = (u8 *)per_cpu(rar_action, this_cpu);
this_cpu_ptr() would be cleaner (here and when using rar_cpu_mask).
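I.e., something like:

	bitmap = this_cpu_ptr(rar_action);
	cpumask_clear(this_cpu_ptr(&rar_cpu_mask));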
> + memset(bitmap, 0, RAR_MAX_PAYLOADS);
> + wrmsrl(MSR_IA32_RAR_ACT_VEC, (u64)virt_to_phys(bitmap));
> + wrmsrl(MSR_IA32_RAR_PAYLOAD_BASE, (u64)virt_to_phys(rar_payload));
> +
> + /*
> + * Allow RAR events to be processed while interrupts are disabled on
> + * a target CPU. This prevents "pileups" where many CPUs are waiting
> + * on one CPU that has IRQs blocked for too long, and should reduce
> + * contention on the rar_payload table.
> + */
> + r = RAR_CTRL_ENABLE | RAR_CTRL_IGNORE_IF;
Do we really need r?
> + wrmsrl(MSR_IA32_RAR_CTRL, r);
> +}
On Thu, 2025-06-05 at 21:54 +0300, Nadav Amit wrote:
> Just a few small things that jump out…
>
> > On 5 Jun 2025, at 19:35, Rik van Riel <riel@surriel.com> wrote:
> >
> > +void rar_cpu_init(void)
> > +{
> > + u64 r;
> > + u8 *bitmap;
> > + int max_payloads;
> > + int this_cpu = smp_processor_id();
> > +
> > + cpumask_clear(&per_cpu(rar_cpu_mask, this_cpu));
> > +
> > + /* The MSR contains N defining the max [0-N] rar payload
> > slots. */
> > + rdmsrl(MSR_IA32_RAR_INFO, r);
> > + max_payloads = (r >> 32) + 1;
> > +
> > + /* If this CPU supports less than RAR_MAX_PAYLOADS, lower
> > our limit. */
> > + if (max_payloads < rar_max_payloads)
> > + rar_max_payloads = max_payloads;
>
> Unless I am missing something, this looks very racy.
>
All the CPUs in the system should support the same
number of payloads (rar_max_payloads), since they
share the same rar_action table.
> BTW: should rar_max_payloads be ro_after_init?
>
> > + pr_info_once("RAR: support %d payloads\n", max_payloads);
> > +
> > + bitmap = (u8 *)per_cpu(rar_action, this_cpu);
>
> this_cpu_ptr() would be cleaner (here and when using rar_cpu_mask).
A CPU cannot start using the rar_action table until
the wrmsrl below.
That should ensure there is no race here.
Thank you for the cleanup ideas. I'll apply those for v4.
> > + /*
> > + * Allow RAR events to be processed while interrupts are
> > disabled on
> > + * a target CPU. This prevents "pileups" where many CPUs
> > are waiting
> > + * on one CPU that has IRQs blocked for too long, and
> > should reduce
> > + * contention on the rar_payload table.
> > + */
> > + r = RAR_CTRL_ENABLE | RAR_CTRL_IGNORE_IF;
>
> Do we really need r?
I suppose not. The original code added another
value into r, but that hangs hard with today's
microcode :)
>
> > + wrmsrl(MSR_IA32_RAR_CTRL, r);
> > +}
>
>
--
All Rights Reversed.
> On 5 Jun 2025, at 22:40, Rik van Riel <riel@surriel.com> wrote:
>
> On Thu, 2025-06-05 at 21:54 +0300, Nadav Amit wrote:
>> Just a few small things that jump out…
>>
>>> On 5 Jun 2025, at 19:35, Rik van Riel <riel@surriel.com> wrote:
>>>
>>> +void rar_cpu_init(void)
>>> +{
>>> + u64 r;
>>> + u8 *bitmap;
>>> + int max_payloads;
>>> + int this_cpu = smp_processor_id();
>>> +
>>> + cpumask_clear(&per_cpu(rar_cpu_mask, this_cpu));
>>> +
>>> + /* The MSR contains N defining the max [0-N] rar payload
>>> slots. */
>>> + rdmsrl(MSR_IA32_RAR_INFO, r);
>>> + max_payloads = (r >> 32) + 1;
>>> +
>>> + /* If this CPU supports less than RAR_MAX_PAYLOADS, lower
>>> our limit. */
>>> + if (max_payloads < rar_max_payloads)
>>> + rar_max_payloads = max_payloads;
>>
>> Unless I am missing something, this looks very racy.
>>
> All the CPUs in the system should support the same
> number of payloads (rar_max_payloads), since they
> share the same rar_action table.
>
Usually you don’t want even benign data races, because they might
cause tools to shout for no reason. So you would want to assist both
other people and tools such as KCSAN by marking such accesses with
data_race().
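E.g., something like (untested):

	/* Benign race: every CPU is expected to write the same value. */
	if (max_payloads < data_race(rar_max_payloads))
		data_race(rar_max_payloads = max_payloads);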
However, I think perhaps the bigger issue is that you want to assume
all cores have the same RAR settings, and right now it might be a bit
inconsistent.
So you may want to do some initial checks on the BSP as to whether
RAR is supported and what rar_max_payloads is (e.g., in
bsp_init_intel()). And then on each AP, in something like
init_intel(), you’d call setup_clear_cpu_cap() to disable RAR if any
CPU's max_payloads is different from the BSP's.
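Roughly the following (just a sketch; the MSR_IA32_RAR_INFO layout is
taken from the patch, and bsp_rar_max_payloads is a made-up name):

	static int bsp_rar_max_payloads __ro_after_init;

	static int rar_read_max_payloads(void)
	{
		u64 info;

		/* Bits 63:32 hold the max payload index N, i.e. N+1 slots. */
		rdmsrl(MSR_IA32_RAR_INFO, info);
		return (info >> 32) + 1;
	}

	/* In bsp_init_intel(), which runs on the boot CPU only: */
	if (cpu_feature_enabled(X86_FEATURE_RAR))
		bsp_rar_max_payloads = rar_read_max_payloads();

	/* In init_intel(), which runs on every CPU: */
	if (cpu_feature_enabled(X86_FEATURE_RAR) &&
	    rar_read_max_payloads() != bsp_rar_max_payloads)
		setup_clear_cpu_cap(X86_FEATURE_RAR);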
[ BTW: further regarding patch 4, it seems cleaner to call
rar_cpu_init() from Intel-specific code like init_intel()? ]
Just sharing my thoughts (and further clarifying them),
Nadav
On Fri, 2025-06-06 at 01:45 +0300, Nadav Amit wrote:
>
> However, I think perhaps the bigger issue is that you want to assume
> all cores have the same RAR settings, and right now it might be a bit
> inconsistent.

The documentation strongly suggests that RAR_INFO.TableMaxIndex
is architecturally determined, and always 64 on Sapphire Rapids.

I'm not sure we want code to handle a case the Intel documentation
seems to suggest cannot exist.

Maybe somebody from Intel has some firmer ideas here?

>
> So you may want to do some initial checks on the BSP as to whether
> RAR is supported and what rar_max_payloads is (e.g., in
> bsp_init_intel()). And then on each AP, in something like
> init_intel(), you’d call setup_clear_cpu_cap() to disable RAR if any
> CPU's max_payloads is different from the BSP's.

Grabbing the value of rar_max_payloads from early_init_intel()
might make sense, since it is supposed to be a system-wide
value, and not a per-CPU thing.

That seems like it would solve the data race issues?

>
> [ BTW: further regarding patch 4, it seems cleaner to call
> rar_cpu_init() from Intel-specific code like init_intel()? ]
>
Good idea, I'll move the call there.

> Just sharing my thoughts (and further clarifying them),
>
I appreciate the comments and suggestions!

Now that the code seems to (finally) work reliably, I should have a
faster turnaround time incorporating people's suggestions, too.

--
All Rights Reversed.
> On 6 Jun 2025, at 3:03, Rik van Riel <riel@surriel.com> wrote:
>
> On Fri, 2025-06-06 at 01:45 +0300, Nadav Amit wrote:
>
>>
>> So you may want to do some initial checks on the BSP as to whether
>> RAR is supported and what rar_max_payloads is (e.g., in
>> bsp_init_intel()). And then on each AP, in something like
>> init_intel(), you’d call setup_clear_cpu_cap() to disable RAR if any
>> CPU's max_payloads is different from the BSP's.
>
> Grabbing the value of rar_max_payloads from early_init_intel()
> might make sense, since it is supposed to be a system-wide
> value, and not a per-CPU thing.
>
> That seems like it would solve the data race issues?

early_init_intel() is called from init_intel(), so it would be called
both for the BSP and the APs; IOW, it won’t solve the race.

Looking again at the code, I think that you should be able to assume
this data is the same on all cores, and move the X86_FEATURE_RAR
enabling and rar_max_payloads setting into intel_detect_tlb(), which
appears to be the most logical place to put it, and which only runs
on the BSP.
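Something along these lines (sketch only; assumes rar_max_payloads
stops being static so that rar.c and intel.c can share it):

	/* In intel_detect_tlb(), which only runs on the boot CPU: */
	if (cpu_has(c, X86_FEATURE_RAR)) {
		u64 info;

		rdmsrl(MSR_IA32_RAR_INFO, info);
		rar_max_payloads = min_t(int, RAR_MAX_PAYLOADS,
					 (info >> 32) + 1);
	}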