[RFC v5 8/8] x86/mm: make RAR invalidation scalable by skipping duplicate APIC pokes

From: Rik van Riel <riel@meta.com>

The naive RAR implementation suffers from heavy contention in
apic_mem_wait_icr_idle() when multiple CPUs send out RAR
interrupts simultaneously.

When a CPU receives a RAR interrupt, it scans its action vector and
processes every rar_payload entry whose action vector entry is set to
RAR_PENDING. After processing a payload, it sets the corresponding
action vector entry to RAR_SUCCESS.
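
For illustration, the receive side conceptually does something like
the sketch below (the function name and the payload-processing helper
are made up for this example; they are not the names used in this
series):

  /*
   * Illustrative receive-side sketch: walk this CPU's action vector,
   * handle every payload a sender marked RAR_PENDING, and report
   * completion by flipping the entry to RAR_SUCCESS.
   */
  static void handle_pending_rars(void)
  {
          u8 *rar_actions = per_cpu(rar_action, smp_processor_id());
          int i;

          for (i = 0; i < RAR_MAX_PAYLOADS; i++) {
                  if (READ_ONCE(rar_actions[i]) != RAR_PENDING)
                          continue;
                  /* Hypothetical helper standing in for the real payload handling. */
                  process_rar_payload(i);
                  WRITE_ONCE(rar_actions[i], RAR_SUCCESS);
          }
  }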

That means sending a single RAR to a CPU is enough for that CPU to
process all of its pending RAR payloads, and other CPUs usually do
not need to send additional RARs to that CPU.

Optimistically avoid sending RAR interrupts to CPUs that are already
processing a RAR, and loop back only if our request went unprocessed
while the remote CPU is no longer processing any RARs.
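
Condensed from smp_call_rar_many() in the diff below (payload setup
and edge cases omitted), the send path becomes roughly:

  do {
          /* Only CPUs that no sender has claimed yet get an APIC poke. */
          for_each_cpu(cpu, dest_mask)
                  if (get_rar_pending(cpu, this_cpu))
                          __cpumask_set_cpu(cpu, apic_mask);

          native_send_rar_ipi(apic_mask);

          for_each_cpu(cpu, dest_mask) {
                  /*
                   * The wait bails out early if the target stops
                   * processing RARs with our payload still pending;
                   * such CPUs stay in dest_mask and are retried.
                   */
                  if (wait_for_action_done(payload_nr, cpu) != RAR_SUCCESS)
                          continue;
                  release_rar_pending(cpu, this_cpu);
                  __cpumask_clear_cpu(cpu, dest_mask);
                  __cpumask_clear_cpu(cpu, apic_mask);
          }
  } while (!cpumask_empty(dest_mask));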

This change affects the will-it-scale tlb_flush2_threads numbers
(loops/sec) as follows:

threads    IPI flush    naive RAR     optimized RAR
   1         175k          174k           170k
   5         337k          345k           321k
  10         530k          469k           497k
  20         752k          363k           616k
  30         922k          259k           754k
  40        1005k          205k           779k
  50        1073k          164k           883k
  60        1040k          141k           813k

The numbers above are from a 30 core / 60 thread, single socket
Sapphire Rapids system, averaged over 4 runs.

This exact same code reached up to 1200k loops/second on a
-tip kernel from a few weeks ago, and did so reliably across
several reboots. I have no good explanation for the difference.

Signed-off-by: Rik van Riel <riel@surriel.com>
---
 arch/x86/mm/rar.c | 60 ++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 54 insertions(+), 6 deletions(-)

diff --git a/arch/x86/mm/rar.c b/arch/x86/mm/rar.c
index 76959782fb03..fd89eaaf4fc1 100644
--- a/arch/x86/mm/rar.c
+++ b/arch/x86/mm/rar.c
@@ -11,6 +11,7 @@
 #include <asm/tlbflush.h>
 
 static DEFINE_PER_CPU(struct cpumask, rar_cpu_mask);
+static DEFINE_PER_CPU(struct cpumask, apic_cpu_mask);
 
 #define RAR_SUCCESS	0x00
 #define RAR_PENDING	0x01
@@ -47,6 +48,32 @@ static struct rar_lock rar_locks[RAR_MAX_PAYLOADS] __cacheline_aligned;
  */
 static DEFINE_PER_CPU_ALIGNED(u8[RAR_MAX_PAYLOADS], rar_action);
 
+/*
+ * Tracks whether a RAR is in flight to this CPU. This is used
+ * to avoid sending another RAR (waiting on the APIC) when the
+ * target CPU is already handling RARs.
+ */
+static DEFINE_PER_CPU(int, rar_pending) = -1;
+
+static bool get_rar_pending(int target_cpu, int this_cpu)
+{
+	int *this_rar_pending = &per_cpu(rar_pending, target_cpu);
+
+	/* Another CPU is flushing this CPU already. */
+	if (*this_rar_pending != -1)
+		return false;
+
+	/* Is this_cpu the one that needs to send a RAR to target_cpu? */
+	return cmpxchg(this_rar_pending, -1, this_cpu) == -1;
+}
+
+static void release_rar_pending(int target_cpu, int this_cpu)
+{
+	/* If this_cpu sent the RAR to target_cpu, clear rar_pending */
+	if (READ_ONCE(per_cpu(rar_pending, target_cpu)) == this_cpu)
+		WRITE_ONCE(per_cpu(rar_pending, target_cpu), -1);
+}
+
 /*
  * TODO: group CPUs together based on locality in the system instead
  * of CPU number, to further reduce the cost of contention.
@@ -113,7 +140,7 @@ static void set_action_entry(unsigned long payload_nr, int target_cpu)
 	WRITE_ONCE(bitmap[payload_nr], RAR_PENDING);
 }
 
-static void wait_for_action_done(unsigned long payload_nr, int target_cpu)
+static u8 wait_for_action_done(unsigned long payload_nr, int target_cpu)
 {
 	u8 status;
 	u8 *rar_actions = per_cpu(rar_action, target_cpu);
@@ -123,9 +150,14 @@ static void wait_for_action_done(unsigned long payload_nr, int target_cpu)
 	while (status == RAR_PENDING) {
 		cpu_relax();
 		status = READ_ONCE(rar_actions[payload_nr]);
+		/* Target CPU is not processing RARs right now. */
+		if (READ_ONCE(per_cpu(rar_pending, target_cpu)) == -1)
+			return status;
 	}
 
 	WARN_ON_ONCE(rar_actions[payload_nr] != RAR_SUCCESS);
+
+	return status;
 }
 
 void rar_cpu_init(void)
@@ -183,7 +215,7 @@ void smp_call_rar_many(const struct cpumask *mask, u16 pcid,
 {
 	unsigned long pages = (end - start + PAGE_SIZE) / PAGE_SIZE;
 	int cpu, this_cpu = smp_processor_id();
-	cpumask_t *dest_mask;
+	cpumask_t *dest_mask, *apic_mask;
 	unsigned long payload_nr;
 
 	/* Catch the "end - start + PAGE_SIZE" overflow above. */
@@ -213,7 +245,9 @@ void smp_call_rar_many(const struct cpumask *mask, u16 pcid,
 	 * flushes at context switch time.
 	 */
 	dest_mask = this_cpu_ptr(&rar_cpu_mask);
+	apic_mask = this_cpu_ptr(&apic_cpu_mask);
 	cpumask_and(dest_mask, mask, cpu_online_mask);
+	cpumask_clear(apic_mask);
 
 	/* Some callers race with other CPUs changing the passed mask */
 	if (unlikely(!cpumask_weight(dest_mask)))
@@ -225,11 +259,25 @@ void smp_call_rar_many(const struct cpumask *mask, u16 pcid,
 	for_each_cpu(cpu, dest_mask)
 		set_action_entry(payload_nr, cpu);
 
-	/* Send a message to all CPUs in the map */
-	native_send_rar_ipi(dest_mask);
+	do {
+		for_each_cpu(cpu, dest_mask) {
+			/* Track the CPUs that have no RAR pending (yet). */
+			if (get_rar_pending(cpu, this_cpu))
+				__cpumask_set_cpu(cpu, apic_mask);
+		}
 
-	for_each_cpu(cpu, dest_mask)
-		wait_for_action_done(payload_nr, cpu);
+		/* Send a message to the CPUs not processing RARs yet */
+		native_send_rar_ipi(apic_mask);
+
+		for_each_cpu(cpu, dest_mask) {
+			u8 status = wait_for_action_done(payload_nr, cpu);
+			if (status == RAR_SUCCESS) {
+				release_rar_pending(cpu, this_cpu);
+				__cpumask_clear_cpu(cpu, dest_mask);
+				__cpumask_clear_cpu(cpu, apic_mask);
+			}
+		}
+	} while (unlikely(cpumask_weight(dest_mask)));
 
 	free_payload_slot(payload_nr);
 }
-- 
2.51.1