[v4] Intel RAR TLB invalidation

[RFC PATCH v4 5/8] x86/mm: Introduce Remote Action Request

Posted by Rik van Riel 3 months, 3 weeks ago

From: Yu-cheng Yu <yu-cheng.yu@intel.com>

Remote Action Request (RAR) is a TLB flushing broadcast facility.
To start a TLB flush, the initiator CPU creates a RAR payload and
sends a command to the APIC.  The receiving CPUs automatically flush
TLBs as specified in the payload without the kernel's involement.

[ riel: add pcid parameter to smp_call_rar_many so other mms can be flushed ]

Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
Signed-off-by: Rik van Riel <riel@surriel.com>
---
 arch/x86/include/asm/rar.h  |  76 ++++++++++++
 arch/x86/kernel/cpu/intel.c |   8 +-
 arch/x86/mm/Makefile        |   1 +
 arch/x86/mm/rar.c           | 236 ++++++++++++++++++++++++++++++++++++
 4 files changed, 320 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/include/asm/rar.h
 create mode 100644 arch/x86/mm/rar.c

diff --git a/arch/x86/include/asm/rar.h b/arch/x86/include/asm/rar.h
new file mode 100644
index 000000000000..c875b9e9c509
--- /dev/null
+++ b/arch/x86/include/asm/rar.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_RAR_H
+#define _ASM_X86_RAR_H
+
+/*
+ * RAR payload types
+ */
+#define RAR_TYPE_INVPG		0
+#define RAR_TYPE_INVPG_NO_CR3	1
+#define RAR_TYPE_INVPCID	2
+#define RAR_TYPE_INVEPT		3
+#define RAR_TYPE_INVVPID	4
+#define RAR_TYPE_WRMSR		5
+
+/*
+ * Subtypes for RAR_TYPE_INVLPG
+ */
+#define RAR_INVPG_ADDR			0 /* address specific */
+#define RAR_INVPG_ALL			2 /* all, include global */
+#define RAR_INVPG_ALL_NO_GLOBAL		3 /* all, exclude global */
+
+/*
+ * Subtypes for RAR_TYPE_INVPCID
+ */
+#define RAR_INVPCID_ADDR		0 /* address specific */
+#define RAR_INVPCID_PCID		1 /* all of PCID */
+#define RAR_INVPCID_ALL			2 /* all, include global */
+#define RAR_INVPCID_ALL_NO_GLOBAL	3 /* all, exclude global */
+
+/*
+ * Page size for RAR_TYPE_INVLPG
+ */
+#define RAR_INVLPG_PAGE_SIZE_4K		0
+#define RAR_INVLPG_PAGE_SIZE_2M		1
+#define RAR_INVLPG_PAGE_SIZE_1G		2
+
+/*
+ * Max number of pages per payload
+ */
+#define RAR_INVLPG_MAX_PAGES 63
+
+struct rar_payload {
+	u64 for_sw		: 8;
+	u64 type		: 8;
+	u64 must_be_zero_1	: 16;
+	u64 subtype		: 3;
+	u64 page_size		: 2;
+	u64 num_pages		: 6;
+	u64 must_be_zero_2	: 21;
+
+	u64 must_be_zero_3;
+
+	/*
+	 * Starting address
+	 */
+	union {
+		u64 initiator_cr3;
+		struct {
+			u64 pcid	: 12;
+			u64 ignored	: 52;
+		};
+	};
+	u64 linear_address;
+
+	/*
+	 * Padding
+	 */
+	u64 padding[4];
+};
+
+void rar_cpu_init(void);
+void rar_boot_cpu_init(void);
+void smp_call_rar_many(const struct cpumask *mask, u16 pcid,
+		       unsigned long start, unsigned long end);
+
+#endif /* _ASM_X86_RAR_H */
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 0cc4ae27127c..ddc5e7d81077 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -22,6 +22,7 @@
 #include <asm/microcode.h>
 #include <asm/msr.h>
 #include <asm/numa.h>
+#include <asm/rar.h>
 #include <asm/resctrl.h>
 #include <asm/thermal.h>
 #include <asm/uaccess.h>
@@ -624,6 +625,9 @@ static void init_intel(struct cpuinfo_x86 *c)
 	split_lock_init();
 
 	intel_init_thermal(c);
+
+	if (cpu_feature_enabled(X86_FEATURE_RAR))
+		rar_cpu_init();
 }
 
 #ifdef CONFIG_X86_32
@@ -725,8 +729,10 @@ static void intel_detect_tlb(struct cpuinfo_x86 *c)
 
 		rdmsrl(MSR_IA32_CORE_CAPS, msr);
 
-		if (msr & MSR_IA32_CORE_CAPS_RAR)
+		if (msr & MSR_IA32_CORE_CAPS_RAR) {
 			setup_force_cpu_cap(X86_FEATURE_RAR);
+			rar_boot_cpu_init();
+		}
 	}
 }
 
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 5b9908f13dcf..f36fc99e8b10 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -52,6 +52,7 @@ obj-$(CONFIG_ACPI_NUMA)		+= srat.o
 obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS)	+= pkeys.o
 obj-$(CONFIG_RANDOMIZE_MEMORY)			+= kaslr.o
 obj-$(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)	+= pti.o
+obj-$(CONFIG_BROADCAST_TLB_FLUSH)		+= rar.o
 
 obj-$(CONFIG_X86_MEM_ENCRYPT)	+= mem_encrypt.o
 obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt_amd.o
diff --git a/arch/x86/mm/rar.c b/arch/x86/mm/rar.c
new file mode 100644
index 000000000000..76959782fb03
--- /dev/null
+++ b/arch/x86/mm/rar.c
@@ -0,0 +1,236 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * RAR TLB shootdown
+ */
+#include <linux/sched.h>
+#include <linux/bug.h>
+#include <asm/current.h>
+#include <asm/io.h>
+#include <asm/sync_bitops.h>
+#include <asm/rar.h>
+#include <asm/tlbflush.h>
+
+static DEFINE_PER_CPU(struct cpumask, rar_cpu_mask);
+
+#define RAR_SUCCESS	0x00
+#define RAR_PENDING	0x01
+#define RAR_FAILURE	0x80
+
+#define RAR_MAX_PAYLOADS 64UL
+
+/* How many RAR payloads are supported by this CPU */
+static int rar_max_payloads __ro_after_init = RAR_MAX_PAYLOADS;
+
+/*
+ * RAR payloads telling CPUs what to do. This table is shared between
+ * all CPUs; it is possible to have multiple payload tables shared between
+ * different subsets of CPUs, but that adds a lot of complexity.
+ */
+static struct rar_payload rar_payload[RAR_MAX_PAYLOADS] __page_aligned_bss;
+
+/*
+ * Reduce contention for the RAR payloads by having a small number of
+ * CPUs share a RAR payload entry, instead of a free for all with all CPUs.
+ */
+struct rar_lock {
+	union {
+		raw_spinlock_t lock;
+		char __padding[SMP_CACHE_BYTES];
+	};
+};
+
+static struct rar_lock rar_locks[RAR_MAX_PAYLOADS] __cacheline_aligned;
+
+/*
+ * The action vector tells each CPU which payload table entries
+ * have work for that CPU.
+ */
+static DEFINE_PER_CPU_ALIGNED(u8[RAR_MAX_PAYLOADS], rar_action);
+
+/*
+ * TODO: group CPUs together based on locality in the system instead
+ * of CPU number, to further reduce the cost of contention.
+ */
+static int cpu_rar_payload_number(void)
+{
+	int cpu = raw_smp_processor_id();
+	return cpu % rar_max_payloads;
+}
+
+static int get_payload_slot(void)
+{
+	int payload_nr = cpu_rar_payload_number();
+	raw_spin_lock(&rar_locks[payload_nr].lock);
+	return payload_nr;
+}
+
+static void free_payload_slot(unsigned long payload_nr)
+{
+	raw_spin_unlock(&rar_locks[payload_nr].lock);
+}
+
+static void set_payload(struct rar_payload *p, u16 pcid, unsigned long start,
+			long pages)
+{
+	p->must_be_zero_1	= 0;
+	p->must_be_zero_2	= 0;
+	p->must_be_zero_3	= 0;
+	p->page_size		= RAR_INVLPG_PAGE_SIZE_4K;
+	p->type			= RAR_TYPE_INVPCID;
+	p->pcid			= pcid;
+	p->linear_address	= start;
+
+	if (pcid) {
+		/* RAR invalidation of the mapping of a specific process. */
+		if (pages < RAR_INVLPG_MAX_PAGES) {
+			p->num_pages = pages;
+			p->subtype = RAR_INVPCID_ADDR;
+		} else {
+			p->subtype = RAR_INVPCID_PCID;
+		}
+	} else {
+		/*
+		 * Unfortunately RAR_INVPCID_ADDR excludes global translations.
+		 * Always do a full flush for kernel invalidations.
+		 */
+		p->subtype = RAR_INVPCID_ALL;
+	}
+
+	/* Ensure all writes are visible before the action entry is set. */
+	smp_wmb();
+}
+
+static void set_action_entry(unsigned long payload_nr, int target_cpu)
+{
+	u8 *bitmap = per_cpu(rar_action, target_cpu);
+
+	/*
+	 * Given a remote CPU, "arm" its action vector to ensure it handles
+	 * the request at payload_nr when it receives a RAR signal.
+	 * The remote CPU will overwrite RAR_PENDING when it handles
+	 * the request.
+	 */
+	WRITE_ONCE(bitmap[payload_nr], RAR_PENDING);
+}
+
+static void wait_for_action_done(unsigned long payload_nr, int target_cpu)
+{
+	u8 status;
+	u8 *rar_actions = per_cpu(rar_action, target_cpu);
+
+	status = READ_ONCE(rar_actions[payload_nr]);
+
+	while (status == RAR_PENDING) {
+		cpu_relax();
+		status = READ_ONCE(rar_actions[payload_nr]);
+	}
+
+	WARN_ON_ONCE(rar_actions[payload_nr] != RAR_SUCCESS);
+}
+
+void rar_cpu_init(void)
+{
+	u8 *bitmap;
+	u64 r;
+
+	/* Check if this CPU was already initialized. */
+	rdmsrl(MSR_IA32_RAR_PAYLOAD_BASE, r);
+	if (r == (u64)virt_to_phys(rar_payload))
+		return;
+
+	bitmap = this_cpu_ptr(rar_action);
+	memset(bitmap, 0, RAR_MAX_PAYLOADS);
+	wrmsrl(MSR_IA32_RAR_ACT_VEC, (u64)virt_to_phys(bitmap));
+	wrmsrl(MSR_IA32_RAR_PAYLOAD_BASE, (u64)virt_to_phys(rar_payload));
+
+	/*
+	 * Allow RAR events to be processed while interrupts are disabled on
+	 * a target CPU. This prevents "pileups" where many CPUs are waiting
+	 * on one CPU that has IRQs blocked for too long, and should reduce
+	 * contention on the rar_payload table.
+	 */
+	wrmsrl(MSR_IA32_RAR_CTRL, RAR_CTRL_ENABLE | RAR_CTRL_IGNORE_IF);
+}
+
+void rar_boot_cpu_init(void)
+{
+	int max_payloads;
+	u64 r;
+
+	/* The MSR contains N defining the max [0-N] rar payload slots. */
+	rdmsrl(MSR_IA32_RAR_INFO, r);
+	max_payloads = (r >> 32) + 1;
+
+	/* If this CPU supports less than RAR_MAX_PAYLOADS, lower our limit. */
+	if (max_payloads < rar_max_payloads)
+		rar_max_payloads = max_payloads;
+	pr_info("RAR: support %d payloads\n", max_payloads);
+
+	for (r = 0; r < rar_max_payloads; r++)
+		rar_locks[r].lock = __RAW_SPIN_LOCK_UNLOCKED(rar_lock);
+
+	/* Initialize the boot CPU early to handle early boot flushes. */
+	rar_cpu_init();
+}
+
+/*
+ * Inspired by smp_call_function_many(), but RAR requires a global payload
+ * table rather than per-CPU payloads in the CSD table, because the action
+ * handler is microcode rather than software.
+ */
+void smp_call_rar_many(const struct cpumask *mask, u16 pcid,
+		       unsigned long start, unsigned long end)
+{
+	unsigned long pages = (end - start + PAGE_SIZE) / PAGE_SIZE;
+	int cpu, this_cpu = smp_processor_id();
+	cpumask_t *dest_mask;
+	unsigned long payload_nr;
+
+	/* Catch the "end - start + PAGE_SIZE" overflow above. */
+	if (end == TLB_FLUSH_ALL)
+		pages = RAR_INVLPG_MAX_PAGES + 1;
+
+	/*
+	 * Can deadlock when called with interrupts disabled.
+	 * Allow CPUs that are not yet online though, as no one else can
+	 * send smp call function interrupt to this CPU and as such deadlocks
+	 * can't happen.
+	 */
+	if (cpu_online(this_cpu) && !oops_in_progress && !early_boot_irqs_disabled) {
+		lockdep_assert_irqs_enabled();
+		lockdep_assert_preemption_disabled();
+	}
+
+	/*
+	 * A CPU needs to be initialized in order to process RARs.
+	 * Skip offline CPUs.
+	 *
+	 * TODO:
+	 * - Skip RAR to CPUs that are in a deeper C-state, with an empty TLB
+	 *
+	 * This code cannot use the should_flush_tlb() logic here because
+	 * RAR flushes do not update the tlb_gen, resulting in unnecessary
+	 * flushes at context switch time.
+	 */
+	dest_mask = this_cpu_ptr(&rar_cpu_mask);
+	cpumask_and(dest_mask, mask, cpu_online_mask);
+
+	/* Some callers race with other CPUs changing the passed mask */
+	if (unlikely(!cpumask_weight(dest_mask)))
+		return;
+
+	payload_nr = get_payload_slot();
+	set_payload(&rar_payload[payload_nr], pcid, start, pages);
+
+	for_each_cpu(cpu, dest_mask)
+		set_action_entry(payload_nr, cpu);
+
+	/* Send a message to all CPUs in the map */
+	native_send_rar_ipi(dest_mask);
+
+	for_each_cpu(cpu, dest_mask)
+		wait_for_action_done(payload_nr, cpu);
+
+	free_payload_slot(payload_nr);
+}
+EXPORT_SYMBOL(smp_call_rar_many);
-- 
2.49.0

Re: [RFC PATCH v4 5/8] x86/mm: Introduce Remote Action Request

Posted by Kirill A. Shutemov 3 months, 2 weeks ago

On Thu, Jun 19, 2025 at 04:03:57PM -0400, Rik van Riel wrote:
> From: Yu-cheng Yu <yu-cheng.yu@intel.com>
> 
> Remote Action Request (RAR) is a TLB flushing broadcast facility.
> To start a TLB flush, the initiator CPU creates a RAR payload and
> sends a command to the APIC.  The receiving CPUs automatically flush
> TLBs as specified in the payload without the kernel's involement.
> 
> [ riel: add pcid parameter to smp_call_rar_many so other mms can be flushed ]
> 
> Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
> Signed-off-by: Rik van Riel <riel@surriel.com>
> ---
>  arch/x86/include/asm/rar.h  |  76 ++++++++++++
>  arch/x86/kernel/cpu/intel.c |   8 +-
>  arch/x86/mm/Makefile        |   1 +
>  arch/x86/mm/rar.c           | 236 ++++++++++++++++++++++++++++++++++++
>  4 files changed, 320 insertions(+), 1 deletion(-)
>  create mode 100644 arch/x86/include/asm/rar.h
>  create mode 100644 arch/x86/mm/rar.c
> 
> diff --git a/arch/x86/include/asm/rar.h b/arch/x86/include/asm/rar.h
> new file mode 100644
> index 000000000000..c875b9e9c509
> --- /dev/null
> +++ b/arch/x86/include/asm/rar.h
> @@ -0,0 +1,76 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_X86_RAR_H
> +#define _ASM_X86_RAR_H
> +
> +/*
> + * RAR payload types
> + */
> +#define RAR_TYPE_INVPG		0
> +#define RAR_TYPE_INVPG_NO_CR3	1
> +#define RAR_TYPE_INVPCID	2
> +#define RAR_TYPE_INVEPT		3
> +#define RAR_TYPE_INVVPID	4
> +#define RAR_TYPE_WRMSR		5
> +
> +/*
> + * Subtypes for RAR_TYPE_INVLPG
> + */
> +#define RAR_INVPG_ADDR			0 /* address specific */
> +#define RAR_INVPG_ALL			2 /* all, include global */
> +#define RAR_INVPG_ALL_NO_GLOBAL		3 /* all, exclude global */
> +
> +/*
> + * Subtypes for RAR_TYPE_INVPCID
> + */
> +#define RAR_INVPCID_ADDR		0 /* address specific */
> +#define RAR_INVPCID_PCID		1 /* all of PCID */
> +#define RAR_INVPCID_ALL			2 /* all, include global */
> +#define RAR_INVPCID_ALL_NO_GLOBAL	3 /* all, exclude global */
> +
> +/*
> + * Page size for RAR_TYPE_INVLPG
> + */
> +#define RAR_INVLPG_PAGE_SIZE_4K		0
> +#define RAR_INVLPG_PAGE_SIZE_2M		1
> +#define RAR_INVLPG_PAGE_SIZE_1G		2
> +
> +/*
> + * Max number of pages per payload
> + */
> +#define RAR_INVLPG_MAX_PAGES 63
> +
> +struct rar_payload {
> +	u64 for_sw		: 8;

Bitfield of 8 bit? Why not just u8?

> +	u64 type		: 8;
> +	u64 must_be_zero_1	: 16;
> +	u64 subtype		: 3;
> +	u64 page_size		: 2;
> +	u64 num_pages		: 6;
> +	u64 must_be_zero_2	: 21;
> +
> +	u64 must_be_zero_3;
> +
> +	/*
> +	 * Starting address
> +	 */
> +	union {
> +		u64 initiator_cr3;

Initiator? It is CR3 to flush, not CR3 of the initiator.

> +		struct {
> +			u64 pcid	: 12;
> +			u64 ignored	: 52;
> +		};
> +	};
> +	u64 linear_address;
> +
> +	/*
> +	 * Padding
> +	 */
> +	u64 padding[4];

But it is not padding. It is available for SW, according to spec.

> +};

As far as I can see, only RAR_TYPE_INVPCID is used. Maybe it worth
defining payload struct specifically for this type and get rid of union.

> +
> +void rar_cpu_init(void);
> +void rar_boot_cpu_init(void);
> +void smp_call_rar_many(const struct cpumask *mask, u16 pcid,
> +		       unsigned long start, unsigned long end);
> +
> +#endif /* _ASM_X86_RAR_H */
> diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
> index 0cc4ae27127c..ddc5e7d81077 100644
> --- a/arch/x86/kernel/cpu/intel.c
> +++ b/arch/x86/kernel/cpu/intel.c
> @@ -22,6 +22,7 @@
>  #include <asm/microcode.h>
>  #include <asm/msr.h>
>  #include <asm/numa.h>
> +#include <asm/rar.h>
>  #include <asm/resctrl.h>
>  #include <asm/thermal.h>
>  #include <asm/uaccess.h>
> @@ -624,6 +625,9 @@ static void init_intel(struct cpuinfo_x86 *c)
>  	split_lock_init();
>  
>  	intel_init_thermal(c);
> +
> +	if (cpu_feature_enabled(X86_FEATURE_RAR))
> +		rar_cpu_init();

So, boot CPU gets initialized twice right? Once via rar_boot_cpu_init()
in intel_detect_tlb() and the second time here.

>  }
>  
>  #ifdef CONFIG_X86_32
> @@ -725,8 +729,10 @@ static void intel_detect_tlb(struct cpuinfo_x86 *c)
>  
>  		rdmsrl(MSR_IA32_CORE_CAPS, msr);
>  
> -		if (msr & MSR_IA32_CORE_CAPS_RAR)
> +		if (msr & MSR_IA32_CORE_CAPS_RAR) {
>  			setup_force_cpu_cap(X86_FEATURE_RAR);
> +			rar_boot_cpu_init();
> +		}
>  	}
>  }
>  
> diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
> index 5b9908f13dcf..f36fc99e8b10 100644
> --- a/arch/x86/mm/Makefile
> +++ b/arch/x86/mm/Makefile
> @@ -52,6 +52,7 @@ obj-$(CONFIG_ACPI_NUMA)		+= srat.o
>  obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS)	+= pkeys.o
>  obj-$(CONFIG_RANDOMIZE_MEMORY)			+= kaslr.o
>  obj-$(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)	+= pti.o
> +obj-$(CONFIG_BROADCAST_TLB_FLUSH)		+= rar.o
>  
>  obj-$(CONFIG_X86_MEM_ENCRYPT)	+= mem_encrypt.o
>  obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt_amd.o
> diff --git a/arch/x86/mm/rar.c b/arch/x86/mm/rar.c
> new file mode 100644
> index 000000000000..76959782fb03
> --- /dev/null
> +++ b/arch/x86/mm/rar.c
> @@ -0,0 +1,236 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * RAR TLB shootdown
> + */
> +#include <linux/sched.h>
> +#include <linux/bug.h>
> +#include <asm/current.h>
> +#include <asm/io.h>
> +#include <asm/sync_bitops.h>
> +#include <asm/rar.h>
> +#include <asm/tlbflush.h>
> +
> +static DEFINE_PER_CPU(struct cpumask, rar_cpu_mask);
> +
> +#define RAR_SUCCESS	0x00
> +#define RAR_PENDING	0x01
> +#define RAR_FAILURE	0x80
> +
> +#define RAR_MAX_PAYLOADS 64UL
> +
> +/* How many RAR payloads are supported by this CPU */
> +static int rar_max_payloads __ro_after_init = RAR_MAX_PAYLOADS;
> +
> +/*
> + * RAR payloads telling CPUs what to do. This table is shared between
> + * all CPUs; it is possible to have multiple payload tables shared between
> + * different subsets of CPUs, but that adds a lot of complexity.
> + */
> +static struct rar_payload rar_payload[RAR_MAX_PAYLOADS] __page_aligned_bss;

On machines without RAR it would waste 4k. Not a big deal, I guess. But it
would be neat to reclaim it if unused.

> +/*
> + * Reduce contention for the RAR payloads by having a small number of
> + * CPUs share a RAR payload entry, instead of a free for all with all CPUs.
> + */
> +struct rar_lock {
> +	union {
> +		raw_spinlock_t lock;
> +		char __padding[SMP_CACHE_BYTES];
> +	};
> +};
> +
> +static struct rar_lock rar_locks[RAR_MAX_PAYLOADS] __cacheline_aligned;

One more 4k.

> +/*
> + * The action vector tells each CPU which payload table entries
> + * have work for that CPU.
> + */
> +static DEFINE_PER_CPU_ALIGNED(u8[RAR_MAX_PAYLOADS], rar_action);
> +
> +/*
> + * TODO: group CPUs together based on locality in the system instead
> + * of CPU number, to further reduce the cost of contention.
> + */
> +static int cpu_rar_payload_number(void)
> +{
> +	int cpu = raw_smp_processor_id();
> +	return cpu % rar_max_payloads;
> +}
> +
> +static int get_payload_slot(void)
> +{
> +	int payload_nr = cpu_rar_payload_number();
> +	raw_spin_lock(&rar_locks[payload_nr].lock);
> +	return payload_nr;
> +}
> +
> +static void free_payload_slot(unsigned long payload_nr)
> +{
> +	raw_spin_unlock(&rar_locks[payload_nr].lock);
> +}
> +
> +static void set_payload(struct rar_payload *p, u16 pcid, unsigned long start,
> +			long pages)
> +{
> +	p->must_be_zero_1	= 0;
> +	p->must_be_zero_2	= 0;
> +	p->must_be_zero_3	= 0;
> +	p->page_size		= RAR_INVLPG_PAGE_SIZE_4K;
> +	p->type			= RAR_TYPE_INVPCID;
> +	p->pcid			= pcid;
> +	p->linear_address	= start;
> +
> +	if (pcid) {
> +		/* RAR invalidation of the mapping of a specific process. */
> +		if (pages < RAR_INVLPG_MAX_PAGES) {
> +			p->num_pages = pages;
> +			p->subtype = RAR_INVPCID_ADDR;
> +		} else {
> +			p->subtype = RAR_INVPCID_PCID;
> +		}
> +	} else {
> +		/*
> +		 * Unfortunately RAR_INVPCID_ADDR excludes global translations.
> +		 * Always do a full flush for kernel invalidations.
> +		 */
> +		p->subtype = RAR_INVPCID_ALL;
> +	}
> +
> +	/* Ensure all writes are visible before the action entry is set. */
> +	smp_wmb();
> +}
> +
> +static void set_action_entry(unsigned long payload_nr, int target_cpu)
> +{
> +	u8 *bitmap = per_cpu(rar_action, target_cpu);
> +
> +	/*
> +	 * Given a remote CPU, "arm" its action vector to ensure it handles
> +	 * the request at payload_nr when it receives a RAR signal.
> +	 * The remote CPU will overwrite RAR_PENDING when it handles
> +	 * the request.
> +	 */
> +	WRITE_ONCE(bitmap[payload_nr], RAR_PENDING);
> +}
> +
> +static void wait_for_action_done(unsigned long payload_nr, int target_cpu)
> +{
> +	u8 status;
> +	u8 *rar_actions = per_cpu(rar_action, target_cpu);
> +
> +	status = READ_ONCE(rar_actions[payload_nr]);
> +
> +	while (status == RAR_PENDING) {
> +		cpu_relax();
> +		status = READ_ONCE(rar_actions[payload_nr]);
> +	}
> +
> +	WARN_ON_ONCE(rar_actions[payload_nr] != RAR_SUCCESS);
> +}
> +
> +void rar_cpu_init(void)
> +{
> +	u8 *bitmap;
> +	u64 r;
> +
> +	/* Check if this CPU was already initialized. */
> +	rdmsrl(MSR_IA32_RAR_PAYLOAD_BASE, r);
> +	if (r == (u64)virt_to_phys(rar_payload))
> +		return;
> +
> +	bitmap = this_cpu_ptr(rar_action);
> +	memset(bitmap, 0, RAR_MAX_PAYLOADS);
> +	wrmsrl(MSR_IA32_RAR_ACT_VEC, (u64)virt_to_phys(bitmap));
> +	wrmsrl(MSR_IA32_RAR_PAYLOAD_BASE, (u64)virt_to_phys(rar_payload));
> +
> +	/*
> +	 * Allow RAR events to be processed while interrupts are disabled on
> +	 * a target CPU. This prevents "pileups" where many CPUs are waiting
> +	 * on one CPU that has IRQs blocked for too long, and should reduce
> +	 * contention on the rar_payload table.
> +	 */
> +	wrmsrl(MSR_IA32_RAR_CTRL, RAR_CTRL_ENABLE | RAR_CTRL_IGNORE_IF);

Hmm. How is RAR_CTRL_IGNORE_IF safe? Wouldn't it break GUP_fast() which
relies on disabling interrupts to block TLB flush and page table freeing?

> +}
> +
> +void rar_boot_cpu_init(void)
> +{
> +	int max_payloads;
> +	u64 r;
> +
> +	/* The MSR contains N defining the max [0-N] rar payload slots. */
> +	rdmsrl(MSR_IA32_RAR_INFO, r);
> +	max_payloads = (r >> 32) + 1;
> +
> +	/* If this CPU supports less than RAR_MAX_PAYLOADS, lower our limit. */
> +	if (max_payloads < rar_max_payloads)
> +		rar_max_payloads = max_payloads;
> +	pr_info("RAR: support %d payloads\n", max_payloads);
> +
> +	for (r = 0; r < rar_max_payloads; r++)
> +		rar_locks[r].lock = __RAW_SPIN_LOCK_UNLOCKED(rar_lock);
> +
> +	/* Initialize the boot CPU early to handle early boot flushes. */
> +	rar_cpu_init();
> +}
> +
> +/*
> + * Inspired by smp_call_function_many(), but RAR requires a global payload
> + * table rather than per-CPU payloads in the CSD table, because the action
> + * handler is microcode rather than software.
> + */
> +void smp_call_rar_many(const struct cpumask *mask, u16 pcid,
> +		       unsigned long start, unsigned long end)
> +{
> +	unsigned long pages = (end - start + PAGE_SIZE) / PAGE_SIZE;
> +	int cpu, this_cpu = smp_processor_id();
> +	cpumask_t *dest_mask;
> +	unsigned long payload_nr;
> +
> +	/* Catch the "end - start + PAGE_SIZE" overflow above. */
> +	if (end == TLB_FLUSH_ALL)
> +		pages = RAR_INVLPG_MAX_PAGES + 1;
> +
> +	/*
> +	 * Can deadlock when called with interrupts disabled.
> +	 * Allow CPUs that are not yet online though, as no one else can
> +	 * send smp call function interrupt to this CPU and as such deadlocks
> +	 * can't happen.
> +	 */
> +	if (cpu_online(this_cpu) && !oops_in_progress && !early_boot_irqs_disabled) {
> +		lockdep_assert_irqs_enabled();
> +		lockdep_assert_preemption_disabled();
> +	}
> +
> +	/*
> +	 * A CPU needs to be initialized in order to process RARs.
> +	 * Skip offline CPUs.
> +	 *
> +	 * TODO:
> +	 * - Skip RAR to CPUs that are in a deeper C-state, with an empty TLB
> +	 *
> +	 * This code cannot use the should_flush_tlb() logic here because
> +	 * RAR flushes do not update the tlb_gen, resulting in unnecessary
> +	 * flushes at context switch time.
> +	 */
> +	dest_mask = this_cpu_ptr(&rar_cpu_mask);
> +	cpumask_and(dest_mask, mask, cpu_online_mask);
> +
> +	/* Some callers race with other CPUs changing the passed mask */
> +	if (unlikely(!cpumask_weight(dest_mask)))
> +		return;
> +
> +	payload_nr = get_payload_slot();
> +	set_payload(&rar_payload[payload_nr], pcid, start, pages);
> +
> +	for_each_cpu(cpu, dest_mask)
> +		set_action_entry(payload_nr, cpu);
> +
> +	/* Send a message to all CPUs in the map */
> +	native_send_rar_ipi(dest_mask);
> +
> +	for_each_cpu(cpu, dest_mask)
> +		wait_for_action_done(payload_nr, cpu);
> +
> +	free_payload_slot(payload_nr);
> +}
> +EXPORT_SYMBOL(smp_call_rar_many);
> -- 
> 2.49.0
> 

-- 
  Kiryl Shutsemau / Kirill A. Shutemov

Re: [RFC PATCH v4 5/8] x86/mm: Introduce Remote Action Request

Posted by Kirill A. Shutemov 3 months, 2 weeks ago

On Thu, Jun 26, 2025 at 06:41:09PM +0300, Kirill A. Shutemov wrote:
> > +	/*
> > +	 * Allow RAR events to be processed while interrupts are disabled on
> > +	 * a target CPU. This prevents "pileups" where many CPUs are waiting
> > +	 * on one CPU that has IRQs blocked for too long, and should reduce
> > +	 * contention on the rar_payload table.
> > +	 */
> > +	wrmsrl(MSR_IA32_RAR_CTRL, RAR_CTRL_ENABLE | RAR_CTRL_IGNORE_IF);
> 
> Hmm. How is RAR_CTRL_IGNORE_IF safe? Wouldn't it break GUP_fast() which
> relies on disabling interrupts to block TLB flush and page table freeing?

Ah. I missed that x86 switched to MMU_GATHER_RCU_TABLE_FREE.

-- 
  Kiryl Shutsemau / Kirill A. Shutemov

Re: [RFC PATCH v4 5/8] x86/mm: Introduce Remote Action Request

Posted by Nadav Amit 3 months, 3 weeks ago


On 19/06/2025 23:03, Rik van Riel wrote:
> From: Yu-cheng Yu <yu-cheng.yu@intel.com>
> 
> Remote Action Request (RAR) is a TLB flushing broadcast facility.
> To start a TLB flush, the initiator CPU creates a RAR payload and
> sends a command to the APIC.  The receiving CPUs automatically flush
> TLBs as specified in the payload without the kernel's involement.
> 
> [ riel: add pcid parameter to smp_call_rar_many so other mms can be flushed ]
> 
> Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
> Signed-off-by: Rik van Riel <riel@surriel.com>
> ---
>   arch/x86/include/asm/rar.h  |  76 ++++++++++++
>   arch/x86/kernel/cpu/intel.c |   8 +-
>   arch/x86/mm/Makefile        |   1 +
>   arch/x86/mm/rar.c           | 236 ++++++++++++++++++++++++++++++++++++
>   4 files changed, 320 insertions(+), 1 deletion(-)
>   create mode 100644 arch/x86/include/asm/rar.h
>   create mode 100644 arch/x86/mm/rar.c
> 
> diff --git a/arch/x86/include/asm/rar.h b/arch/x86/include/asm/rar.h
> new file mode 100644
> index 000000000000..c875b9e9c509
> --- /dev/null
> +++ b/arch/x86/include/asm/rar.h
> @@ -0,0 +1,76 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_X86_RAR_H
> +#define _ASM_X86_RAR_H
> +
> +/*
> + * RAR payload types
> + */
> +#define RAR_TYPE_INVPG		0
> +#define RAR_TYPE_INVPG_NO_CR3	1
> +#define RAR_TYPE_INVPCID	2
> +#define RAR_TYPE_INVEPT		3
> +#define RAR_TYPE_INVVPID	4
> +#define RAR_TYPE_WRMSR		5
> +
> +/*
> + * Subtypes for RAR_TYPE_INVLPG
> + */
> +#define RAR_INVPG_ADDR			0 /* address specific */
> +#define RAR_INVPG_ALL			2 /* all, include global */
> +#define RAR_INVPG_ALL_NO_GLOBAL		3 /* all, exclude global */
> +
> +/*
> + * Subtypes for RAR_TYPE_INVPCID
> + */
> +#define RAR_INVPCID_ADDR		0 /* address specific */
> +#define RAR_INVPCID_PCID		1 /* all of PCID */
> +#define RAR_INVPCID_ALL			2 /* all, include global */
> +#define RAR_INVPCID_ALL_NO_GLOBAL	3 /* all, exclude global */
> +
> +/*
> + * Page size for RAR_TYPE_INVLPG
> + */
> +#define RAR_INVLPG_PAGE_SIZE_4K		0
> +#define RAR_INVLPG_PAGE_SIZE_2M		1
> +#define RAR_INVLPG_PAGE_SIZE_1G		2
> +
> +/*
> + * Max number of pages per payload
> + */
> +#define RAR_INVLPG_MAX_PAGES 63
> +
> +struct rar_payload {
> +	u64 for_sw		: 8;
> +	u64 type		: 8;
> +	u64 must_be_zero_1	: 16;
> +	u64 subtype		: 3;
> +	u64 page_size		: 2;
> +	u64 num_pages		: 6;
> +	u64 must_be_zero_2	: 21;
> +
> +	u64 must_be_zero_3;
> +
> +	/*
> +	 * Starting address
> +	 */
> +	union {
> +		u64 initiator_cr3;
> +		struct {
> +			u64 pcid	: 12;
> +			u64 ignored	: 52;
> +		};
> +	};
> +	u64 linear_address;
> +
> +	/*
> +	 * Padding
> +	 */
> +	u64 padding[4];
> +};

I think __aligned(64) should allow you to get rid of the padding.

> +
> +void rar_cpu_init(void);
> +void rar_boot_cpu_init(void);
> +void smp_call_rar_many(const struct cpumask *mask, u16 pcid,
> +		       unsigned long start, unsigned long end);
> +
> +#endif /* _ASM_X86_RAR_H */
> diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
> index 0cc4ae27127c..ddc5e7d81077 100644
> --- a/arch/x86/kernel/cpu/intel.c
> +++ b/arch/x86/kernel/cpu/intel.c
> @@ -22,6 +22,7 @@
>   #include <asm/microcode.h>
>   #include <asm/msr.h>
>   #include <asm/numa.h>
> +#include <asm/rar.h>
>   #include <asm/resctrl.h>
>   #include <asm/thermal.h>
>   #include <asm/uaccess.h>
> @@ -624,6 +625,9 @@ static void init_intel(struct cpuinfo_x86 *c)
>   	split_lock_init();
>   
>   	intel_init_thermal(c);
> +
> +	if (cpu_feature_enabled(X86_FEATURE_RAR))
> +		rar_cpu_init();
>   }
>   
>   #ifdef CONFIG_X86_32
> @@ -725,8 +729,10 @@ static void intel_detect_tlb(struct cpuinfo_x86 *c)
>   
>   		rdmsrl(MSR_IA32_CORE_CAPS, msr);
>   
> -		if (msr & MSR_IA32_CORE_CAPS_RAR)
> +		if (msr & MSR_IA32_CORE_CAPS_RAR) {
>   			setup_force_cpu_cap(X86_FEATURE_RAR);
> +			rar_boot_cpu_init();
> +		}
>   	}
>   }
>   
> diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
> index 5b9908f13dcf..f36fc99e8b10 100644
> --- a/arch/x86/mm/Makefile
> +++ b/arch/x86/mm/Makefile
> @@ -52,6 +52,7 @@ obj-$(CONFIG_ACPI_NUMA)		+= srat.o
>   obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS)	+= pkeys.o
>   obj-$(CONFIG_RANDOMIZE_MEMORY)			+= kaslr.o
>   obj-$(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)	+= pti.o
> +obj-$(CONFIG_BROADCAST_TLB_FLUSH)		+= rar.o
>   
>   obj-$(CONFIG_X86_MEM_ENCRYPT)	+= mem_encrypt.o
>   obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt_amd.o
> diff --git a/arch/x86/mm/rar.c b/arch/x86/mm/rar.c
> new file mode 100644
> index 000000000000..76959782fb03
> --- /dev/null
> +++ b/arch/x86/mm/rar.c
> @@ -0,0 +1,236 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * RAR TLB shootdown
> + */
> +#include <linux/sched.h>
> +#include <linux/bug.h>
> +#include <asm/current.h>
> +#include <asm/io.h>
> +#include <asm/sync_bitops.h>
> +#include <asm/rar.h>
> +#include <asm/tlbflush.h>
> +
> +static DEFINE_PER_CPU(struct cpumask, rar_cpu_mask);
> +
> +#define RAR_SUCCESS	0x00
> +#define RAR_PENDING	0x01
> +#define RAR_FAILURE	0x80
> +
> +#define RAR_MAX_PAYLOADS 64UL
> +
> +/* How many RAR payloads are supported by this CPU */
> +static int rar_max_payloads __ro_after_init = RAR_MAX_PAYLOADS;
> +
> +/*
> + * RAR payloads telling CPUs what to do. This table is shared between
> + * all CPUs; it is possible to have multiple payload tables shared between
> + * different subsets of CPUs, but that adds a lot of complexity.
> + */
> +static struct rar_payload rar_payload[RAR_MAX_PAYLOADS] __page_aligned_bss;
> +
> +/*
> + * Reduce contention for the RAR payloads by having a small number of
> + * CPUs share a RAR payload entry, instead of a free for all with all CPUs.
> + */
> +struct rar_lock {
> +	union {
> +		raw_spinlock_t lock;
> +		char __padding[SMP_CACHE_BYTES];
> +	};
> +};

I think you can lose the __padding and instead have 
____cacheline_aligned (and then you won't need union).

> +
> +static struct rar_lock rar_locks[RAR_MAX_PAYLOADS] __cacheline_aligned;
> +
> +/*
> + * The action vector tells each CPU which payload table entries
> + * have work for that CPU.
> + */
> +static DEFINE_PER_CPU_ALIGNED(u8[RAR_MAX_PAYLOADS], rar_action);
> +
> +/*
> + * TODO: group CPUs together based on locality in the system instead
> + * of CPU number, to further reduce the cost of contention.
> + */
> +static int cpu_rar_payload_number(void)
> +{
> +	int cpu = raw_smp_processor_id();

Why raw_* ?

> +	return cpu % rar_max_payloads;
> +}
> +
> +static int get_payload_slot(void)
> +{
> +	int payload_nr = cpu_rar_payload_number();
> +	raw_spin_lock(&rar_locks[payload_nr].lock);
> +	return payload_nr;
> +}

I think it would be better to open-code it to improve readability. If 
you choose not to, I think you should use sparse indications (e.g., 
__acquires() ).

> +
> +static void free_payload_slot(unsigned long payload_nr)
> +{
> +	raw_spin_unlock(&rar_locks[payload_nr].lock);
> +}
> +
> +static void set_payload(struct rar_payload *p, u16 pcid, unsigned long start,
> +			long pages)
> +{
> +	p->must_be_zero_1	= 0;
> +	p->must_be_zero_2	= 0;
> +	p->must_be_zero_3	= 0;
> +	p->page_size		= RAR_INVLPG_PAGE_SIZE_4K;

I think you can propagate the stride to this point instead of using 
fixed 4KB stride.

> +	p->type			= RAR_TYPE_INVPCID;
> +	p->pcid			= pcid;
> +	p->linear_address	= start;
> +
> +	if (pcid) {
> +		/* RAR invalidation of the mapping of a specific process. */
> +		if (pages < RAR_INVLPG_MAX_PAGES) {
> +			p->num_pages = pages;
> +			p->subtype = RAR_INVPCID_ADDR;
> +		} else {
> +			p->subtype = RAR_INVPCID_PCID;

I wonder whether it would be safer to set something to p->num_pages.
(then we can do it unconditionally)

> +		}
> +	} else {
> +		/*
> +		 * Unfortunately RAR_INVPCID_ADDR excludes global translations.
> +		 * Always do a full flush for kernel invalidations.
> +		 */
> +		p->subtype = RAR_INVPCID_ALL;
> +	}
> +
> +	/* Ensure all writes are visible before the action entry is set. */
> +	smp_wmb();

Maybe you can drop the smp_wmb() here and instead change the 
WRITE_ONCE() in set_action_entry() to smp_store_release() ? It should 
have the same effect and I think would be cleaner and convey your intent 
better.

> +}
> +
> +static void set_action_entry(unsigned long payload_nr, int target_cpu)
> +{
> +	u8 *bitmap = per_cpu(rar_action, target_cpu);

bitmap? It doesn't look like one...

> +
> +	/*
> +	 * Given a remote CPU, "arm" its action vector to ensure it handles
> +	 * the request at payload_nr when it receives a RAR signal.
> +	 * The remote CPU will overwrite RAR_PENDING when it handles
> +	 * the request.
> +	 */
> +	WRITE_ONCE(bitmap[payload_nr], RAR_PENDING);
> +}
> +
> +static void wait_for_action_done(unsigned long payload_nr, int target_cpu)
> +{
> +	u8 status;
> +	u8 *rar_actions = per_cpu(rar_action, target_cpu);
> +
> +	status = READ_ONCE(rar_actions[payload_nr]);
> +
> +	while (status == RAR_PENDING) {
> +		cpu_relax();
> +		status = READ_ONCE(rar_actions[payload_nr]);
> +	}
> +
> +	WARN_ON_ONCE(rar_actions[payload_nr] != RAR_SUCCESS);

WARN_ON_ONCE(status != RAR_SUCCESS)

> +}
> +
> +void rar_cpu_init(void)
> +{
> +	u8 *bitmap;
> +	u64 r;
> +
> +	/* Check if this CPU was already initialized. */
> +	rdmsrl(MSR_IA32_RAR_PAYLOAD_BASE, r);
> +	if (r == (u64)virt_to_phys(rar_payload))
> +		return;

Seems a bit risky test. If anything, I would check that the MSR that is 
supposed to be set *last* (MSR_IA32_RAR_CTRL) have the expected value. 
But it would still be best to initialize the MSRs unconditionally or to 
avoid repeated initialization using a different scheme.

> +
> +	bitmap = this_cpu_ptr(rar_action);
> +	memset(bitmap, 0, RAR_MAX_PAYLOADS);
> +	wrmsrl(MSR_IA32_RAR_ACT_VEC, (u64)virt_to_phys(bitmap));
> +	wrmsrl(MSR_IA32_RAR_PAYLOAD_BASE, (u64)virt_to_phys(rar_payload));
> +
> +	/*
> +	 * Allow RAR events to be processed while interrupts are disabled on
> +	 * a target CPU. This prevents "pileups" where many CPUs are waiting
> +	 * on one CPU that has IRQs blocked for too long, and should reduce
> +	 * contention on the rar_payload table.
> +	 */
> +	wrmsrl(MSR_IA32_RAR_CTRL, RAR_CTRL_ENABLE | RAR_CTRL_IGNORE_IF);
> +}
> +
> +void rar_boot_cpu_init(void)
> +{
> +	int max_payloads;
> +	u64 r;
> +
> +	/* The MSR contains N defining the max [0-N] rar payload slots. */
> +	rdmsrl(MSR_IA32_RAR_INFO, r);
> +	max_payloads = (r >> 32) + 1;
> +
> +	/* If this CPU supports less than RAR_MAX_PAYLOADS, lower our limit. */
> +	if (max_payloads < rar_max_payloads)
> +		rar_max_payloads = max_payloads;
> +	pr_info("RAR: support %d payloads\n", max_payloads);
> +
> +	for (r = 0; r < rar_max_payloads; r++)
> +		rar_locks[r].lock = __RAW_SPIN_LOCK_UNLOCKED(rar_lock);

Not a fan of the reuse of r for different purposes.

> +
> +	/* Initialize the boot CPU early to handle early boot flushes. */
> +	rar_cpu_init();
> +}
> +
> +/*
> + * Inspired by smp_call_function_many(), but RAR requires a global payload
> + * table rather than per-CPU payloads in the CSD table, because the action
> + * handler is microcode rather than software.
> + */
> +void smp_call_rar_many(const struct cpumask *mask, u16 pcid,
> +		       unsigned long start, unsigned long end)
> +{
> +	unsigned long pages = (end - start + PAGE_SIZE) / PAGE_SIZE;

I think "end" is not inclusive. See for instance flush_tlb_page() where 
"end" is set to "a + PAGE_SIZE". So this would flush one extra page.

> +	int cpu, this_cpu = smp_processor_id();
> +	cpumask_t *dest_mask;
> +	unsigned long payload_nr;
> +
> +	/* Catch the "end - start + PAGE_SIZE" overflow above. */
> +	if (end == TLB_FLUSH_ALL)
> +		pages = RAR_INVLPG_MAX_PAGES + 1;
> +
> +	/*
> +	 * Can deadlock when called with interrupts disabled.
> +	 * Allow CPUs that are not yet online though, as no one else can
> +	 * send smp call function interrupt to this CPU and as such deadlocks
> +	 * can't happen.
> +	 */
> +	if (cpu_online(this_cpu) && !oops_in_progress && !early_boot_irqs_disabled) {
> +		lockdep_assert_irqs_enabled();
> +		lockdep_assert_preemption_disabled();
> +	}
> +
> +	/*
> +	 * A CPU needs to be initialized in order to process RARs.
> +	 * Skip offline CPUs.
> +	 *
> +	 * TODO:
> +	 * - Skip RAR to CPUs that are in a deeper C-state, with an empty TLB
> +	 *
> +	 * This code cannot use the should_flush_tlb() logic here because
> +	 * RAR flushes do not update the tlb_gen, resulting in unnecessary
> +	 * flushes at context switch time.
> +	 */
> +	dest_mask = this_cpu_ptr(&rar_cpu_mask);
> +	cpumask_and(dest_mask, mask, cpu_online_mask);
> +
> +	/* Some callers race with other CPUs changing the passed mask */
> +	if (unlikely(!cpumask_weight(dest_mask)))

cpumask_and() returns "false if *@dstp is empty, else returns true". So 
you can use his value instead of calling cpumask_weight().

> +		return;
> +
> +	payload_nr = get_payload_slot();
> +	set_payload(&rar_payload[payload_nr], pcid, start, pages);
> +
> +	for_each_cpu(cpu, dest_mask)
> +		set_action_entry(payload_nr, cpu);
> +
> +	/* Send a message to all CPUs in the map */
> +	native_send_rar_ipi(dest_mask);
> +
> +	for_each_cpu(cpu, dest_mask)
> +		wait_for_action_done(payload_nr, cpu);
> +
> +	free_payload_slot(payload_nr);
> +}
> +EXPORT_SYMBOL(smp_call_rar_many);

Re: [RFC PATCH v4 5/8] x86/mm: Introduce Remote Action Request

Posted by Rik van Riel 3 months, 3 weeks ago

On Fri, 2025-06-20 at 02:01 +0300, Nadav Amit wrote:
> 
> > +/*
> > + * Reduce contention for the RAR payloads by having a small number
> > of
> > + * CPUs share a RAR payload entry, instead of a free for all with
> > all CPUs.
> > + */
> > +struct rar_lock {
> > +	union {
> > +		raw_spinlock_t lock;
> > +		char __padding[SMP_CACHE_BYTES];
> > +	};
> > +};
> 
> I think you can lose the __padding and instead have 
> ____cacheline_aligned (and then you won't need union).
> 
I tried that initially, but the compiler was unhappy
to have __cacheline_aligned in the definition of a
struct.

> > +
> > +static struct rar_lock rar_locks[RAR_MAX_PAYLOADS]
> > __cacheline_aligned;
> > +
> > +/*
> > + * The action vector tells each CPU which payload table entries
> > + * have work for that CPU.
> > + */
> > +static DEFINE_PER_CPU_ALIGNED(u8[RAR_MAX_PAYLOADS], rar_action);
> > +
> > +/*
> > + * TODO: group CPUs together based on locality in the system
> > instead
> > + * of CPU number, to further reduce the cost of contention.
> > + */
> > +static int cpu_rar_payload_number(void)
> > +{
> > +	int cpu = raw_smp_processor_id();
> 
> Why raw_* ?

I'll change that to regular smp_processor_id()
for the next version.

> 
> > +	return cpu % rar_max_payloads;
> > +}
> > +
> > +static int get_payload_slot(void)
> > +{
> > +	int payload_nr = cpu_rar_payload_number();
> > +	raw_spin_lock(&rar_locks[payload_nr].lock);
> > +	return payload_nr;
> > +}
> 
> I think it would be better to open-code it to improve readability. If
> you choose not to, I think you should use sparse indications (e.g., 
> __acquires() ).

Good point about the annotations. This can indeed
be open coded, since any future improvements here,
for example to have cpu_rar_payload_number() take
topology into account to reduce the cost of contention,
will be in that helper function.

> 
> > +
> > +static void free_payload_slot(unsigned long payload_nr)
> > +{
> > +	raw_spin_unlock(&rar_locks[payload_nr].lock);
> > +}
> > +
> > +static void set_payload(struct rar_payload *p, u16 pcid, unsigned
> > long start,
> > +			long pages)
> > +{
> > +	p->must_be_zero_1	= 0;
> > +	p->must_be_zero_2	= 0;
> > +	p->must_be_zero_3	= 0;
> > +	p->page_size		= RAR_INVLPG_PAGE_SIZE_4K;
> 
> I think you can propagate the stride to this point instead of using 
> fixed 4KB stride.

Agreed. That's another future optimization, for
once this code all works right.

Currently I am in a situation where the receiving
CPU clears the action vector from RAR_PENDING to
RAR_SUCCESS, but the TLB does not appear to always
be correctly flushed.

> 
> > +	p->type			= RAR_TYPE_INVPCID;
> > +	p->pcid			= pcid;
> > +	p->linear_address	= start;
> > +
> > +	if (pcid) {
> > +		/* RAR invalidation of the mapping of a specific
> > process. */
> > +		if (pages < RAR_INVLPG_MAX_PAGES) {
> > +			p->num_pages = pages;
> > +			p->subtype = RAR_INVPCID_ADDR;
> > +		} else {
> > +			p->subtype = RAR_INVPCID_PCID;
> 
> I wonder whether it would be safer to set something to p->num_pages.
> (then we can do it unconditionally)

We have a limited number of bits available for
p->num_pages. I'm not sure we want to try
writing a larger number than what fits in those
bits.

> 
> > +		}
> > +	} else {
> > +		/*
> > +		 * Unfortunately RAR_INVPCID_ADDR excludes global
> > translations.
> > +		 * Always do a full flush for kernel
> > invalidations.
> > +		 */
> > +		p->subtype = RAR_INVPCID_ALL;
> > +	}
> > +
> > +	/* Ensure all writes are visible before the action entry
> > is set. */
> > +	smp_wmb();
> 
> Maybe you can drop the smp_wmb() here and instead change the 
> WRITE_ONCE() in set_action_entry() to smp_store_release() ? It should
> have the same effect and I think would be cleaner and convey your
> intent 
> better.
> 
We need protection against two different things here.

1) Receiving CPUs must see all the writes done by
   the originating CPU before we send the RAR IPI.

2) Receiving CPUs must see all the writes done by
   set_payload() before the write done by
   set_action_entry(), in case another CPU sends
   the RAR IPI before we do.

   That other RAR IPI could even be sent between
   when we write the payload, and when we write
   the action entry. The receiving CPU could take
   long enough processing other RAR payloads that
   it can see our action entry after we write it.

Does removing the smp_wmb() still leave everything
safe in that scenario?


> > +}
> > +
> > +static void set_action_entry(unsigned long payload_nr, int
> > target_cpu)
> > +{
> > +	u8 *bitmap = per_cpu(rar_action, target_cpu);
> 
> bitmap? It doesn't look like one...

I'll rename this one to rar_actions like I did in
wait_for_action_done()

> 
> > +static void wait_for_action_done(unsigned long payload_nr, int
> > target_cpu)
> > +{
> > +	u8 status;
> > +	u8 *rar_actions = per_cpu(rar_action, target_cpu);
> > +
> > +	status = READ_ONCE(rar_actions[payload_nr]);
> > +
> > +	while (status == RAR_PENDING) {
> > +		cpu_relax();
> > +		status = READ_ONCE(rar_actions[payload_nr]);
> > +	}
> > +
> > +	WARN_ON_ONCE(rar_actions[payload_nr] != RAR_SUCCESS);
> 
> WARN_ON_ONCE(status != RAR_SUCCESS)

I'll add that cleanup for v5, too.

> 
> > +}
> > +
> > +void rar_cpu_init(void)
> > +{
> > +	u8 *bitmap;
> > +	u64 r;
> > +
> > +	/* Check if this CPU was already initialized. */
> > +	rdmsrl(MSR_IA32_RAR_PAYLOAD_BASE, r);
> > +	if (r == (u64)virt_to_phys(rar_payload))
> > +		return;
> 
> Seems a bit risky test. If anything, I would check that the MSR that
> is 
> supposed to be set *last* (MSR_IA32_RAR_CTRL) have the expected
> value. 
> But it would still be best to initialize the MSRs unconditionally or
> to 
> avoid repeated initialization using a different scheme.
> 
Whatever different scheme we use must be able to deal
with CPU hotplug and suspend/resume. There are legitimate
cases where rar_cpu_init() is called, and the in-MSR
state does not match the in-memory state.

You are right that we could always unconditionally
write the MSRs.

However, I am not entirely convinced that overwriting
the per-CPU rar_action array with all zeroes (RAR_SUCCESS)
is always safe to do without some sort of guard.

I suppose it might be, since if we are in rar_cpu_init()
current->mm should be init_mm, the CPU bit in cpu_online_mask
is not set, and we don't have to worry about flushing memory 
all that much?

> > 
> > +	/* If this CPU supports less than RAR_MAX_PAYLOADS, lower
> > our limit. */
> > +	if (max_payloads < rar_max_payloads)
> > +		rar_max_payloads = max_payloads;
> > +	pr_info("RAR: support %d payloads\n", max_payloads);
> > +
> > +	for (r = 0; r < rar_max_payloads; r++)
> > +		rar_locks[r].lock =
> > __RAW_SPIN_LOCK_UNLOCKED(rar_lock);
> 
> Not a fan of the reuse of r for different purposes.

Fair enough, I'll add another variable name.

> 
> > +/*
> > + * Inspired by smp_call_function_many(), but RAR requires a global
> > payload
> > + * table rather than per-CPU payloads in the CSD table, because
> > the action
> > + * handler is microcode rather than software.
> > + */
> > +void smp_call_rar_many(const struct cpumask *mask, u16 pcid,
> > +		       unsigned long start, unsigned long end)
> > +{
> > +	unsigned long pages = (end - start + PAGE_SIZE) /
> > PAGE_SIZE;
> 
> I think "end" is not inclusive. See for instance flush_tlb_page()
> where 
> "end" is set to "a + PAGE_SIZE". So this would flush one extra page.
> 
It gets better. Once we add in a "stride" argument, we
may end up with a range that covers only the first
4kB of one of the 2MB entries the calling code wanted
to invalidate. I fell into that trap already with the
INVLPGB code :)

I'll look into simplifying this for the next version,
probably with only 4k PAGE_SIZE at first. We can think
about stride later.

> > 
> > +	 * This code cannot use the should_flush_tlb() logic here
> > because
> > +	 * RAR flushes do not update the tlb_gen, resulting in
> > unnecessary
> > +	 * flushes at context switch time.
> > +	 */
> > +	dest_mask = this_cpu_ptr(&rar_cpu_mask);
> > +	cpumask_and(dest_mask, mask, cpu_online_mask);
> > +
> > +	/* Some callers race with other CPUs changing the passed
> > mask */
> > +	if (unlikely(!cpumask_weight(dest_mask)))
> 
> cpumask_and() returns "false if *@dstp is empty, else returns true".
> So 
> you can use his value instead of calling cpumask_weight().

Will do, thank you.



-- 
All Rights Reversed.

Re: [RFC PATCH v4 5/8] x86/mm: Introduce Remote Action Request

Posted by David Laight 3 months, 2 weeks ago

On Thu, 19 Jun 2025 21:10:47 -0400
Rik van Riel <riel@surriel.com> wrote:

> On Fri, 2025-06-20 at 02:01 +0300, Nadav Amit wrote:
> >   
> > > +/*
> > > + * Reduce contention for the RAR payloads by having a small number
> > > of
> > > + * CPUs share a RAR payload entry, instead of a free for all with
> > > all CPUs.
> > > + */
> > > +struct rar_lock {
> > > +	union {
> > > +		raw_spinlock_t lock;
> > > +		char __padding[SMP_CACHE_BYTES];
> > > +	};
> > > +};  
> > 
> > I think you can lose the __padding and instead have 
> > ____cacheline_aligned (and then you won't need union).
> >   
> I tried that initially, but the compiler was unhappy
> to have __cacheline_aligned in the definition of a
> struct.

You should be able to put it onto the first structure member.
(Which would match the normal use.)

The padding doesn't have the same effect.
Even for rar_lock[] the first entry will share with whatever
comes before, and the padding on the last entry just isn't used.

	David

Re: [RFC PATCH v4 5/8] x86/mm: Introduce Remote Action Request

Posted by Nadav Amit 3 months, 2 weeks ago

On 20/06/2025 4:10, Rik van Riel wrote:
> On Fri, 2025-06-20 at 02:01 +0300, Nadav Amit wrote:
>>> +
>>> +static void set_payload(struct rar_payload *p, u16 pcid, unsigned
>>> long start,
>>> +			long pages)
>>> +{
>>> +	p->must_be_zero_1	= 0;
>>> +	p->must_be_zero_2	= 0;
>>> +	p->must_be_zero_3	= 0;
>>> +	p->page_size		= RAR_INVLPG_PAGE_SIZE_4K;
>>
>> I think you can propagate the stride to this point instead of using
>> fixed 4KB stride.
> 
> Agreed. That's another future optimization, for
> once this code all works right. 

It might not be an optimization if it leads to performance regression 
relatively to the current software-based flush that does take the stride 
into account. IOW: flushing 2MB huge-page would end up doing a full TLB 
flush with RAR in contrast to selectively flushing the specific entry 
with the software-based shootdown. Correct me if I'm wrong.

> 
> Currently I am in a situation where the receiving
> CPU clears the action vector from RAR_PENDING to
> RAR_SUCCESS, but the TLB does not appear to always
> be correctly flushed.

Interesting. I really do not know anything about it, but you may want to 
look on performance counters to see whether the flush is at least 
reported to take place.

> 
>>
>>> +	p->type			= RAR_TYPE_INVPCID;
>>> +	p->pcid			= pcid;
>>> +	p->linear_address	= start;
>>> +
>>> +	if (pcid) {
>>> +		/* RAR invalidation of the mapping of a specific
>>> process. */
>>> +		if (pages < RAR_INVLPG_MAX_PAGES) {
>>> +			p->num_pages = pages;
>>> +			p->subtype = RAR_INVPCID_ADDR;
>>> +		} else {
>>> +			p->subtype = RAR_INVPCID_PCID;
>>
>> I wonder whether it would be safer to set something to p->num_pages.
>> (then we can do it unconditionally)
> 
> We have a limited number of bits available for
> p->num_pages. I'm not sure we want to try
> writing a larger number than what fits in those
> bits.

I was just looking for a way to simplify the code and at the same time 
avoid "undefined" state. History proved the all kind of unintended 
consequences happen when some state is uninitialized (even if the spec 
does not require it).

> 
>>
>>> +		}
>>> +	} else {
>>> +		/*
>>> +		 * Unfortunately RAR_INVPCID_ADDR excludes global
>>> translations.
>>> +		 * Always do a full flush for kernel
>>> invalidations.
>>> +		 */
>>> +		p->subtype = RAR_INVPCID_ALL;
>>> +	}
>>> +
>>> +	/* Ensure all writes are visible before the action entry
>>> is set. */
>>> +	smp_wmb();
>>
>> Maybe you can drop the smp_wmb() here and instead change the
>> WRITE_ONCE() in set_action_entry() to smp_store_release() ? It should
>> have the same effect and I think would be cleaner and convey your
>> intent
>> better.
>>
> We need protection against two different things here.
> 
> 1) Receiving CPUs must see all the writes done by
>     the originating CPU before we send the RAR IPI.
> 
> 2) Receiving CPUs must see all the writes done by
>     set_payload() before the write done by
>     set_action_entry(), in case another CPU sends
>     the RAR IPI before we do.
> 
>     That other RAR IPI could even be sent between
>     when we write the payload, and when we write
>     the action entry. The receiving CPU could take
>     long enough processing other RAR payloads that
>     it can see our action entry after we write it.
> 
> Does removing the smp_wmb() still leave everything
> safe in that scenario?

Admittedly, I do not understand your concern. IIUC until 
set_action_entry() is called, and until RAR_PENDING is set, IPIs are 
irrelevant and the RAR entry would not be processed. Once it is set, you 
want all the prior writes to be visible. Semantically, that's exactly 
what smp_store_release() means.

Technically, smp_wmb() + WRITE_ONCE() are equivalent to 
smp_store_release() on x86. smp_wmb() is actually a compiler barrier as 
x86 follows the TSO memory model, and smp_store_release() is actually a 
compiler-barrier + WRITE_ONCE(). So I think it's all safe and at the 
same time clearer.

Re: [RFC PATCH v4 5/8] x86/mm: Introduce Remote Action Request

Posted by Sean Christopherson 3 months, 2 weeks ago

On Thu, Jun 19, 2025, Rik van Riel wrote:
> On Fri, 2025-06-20 at 02:01 +0300, Nadav Amit wrote:
> > 
> > > +/*
> > > + * Reduce contention for the RAR payloads by having a small number
> > > of
> > > + * CPUs share a RAR payload entry, instead of a free for all with
> > > all CPUs.
> > > + */
> > > +struct rar_lock {
> > > +	union {
> > > +		raw_spinlock_t lock;
> > > +		char __padding[SMP_CACHE_BYTES];
> > > +	};
> > > +};
> > 
> > I think you can lose the __padding and instead have 
> > ____cacheline_aligned (and then you won't need union).
> > 
> I tried that initially, but the compiler was unhappy
> to have __cacheline_aligned in the definition of a
> struct.

____cacheline_aligned_in_smp, a.k.a. ____cacheline_aligned, should work in a
struct or "on" a struct.  There are multiple instances of both throughout the
kernel.