From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: linux-mm@kvack.org, x86@kernel.org, dave.hansen@linux.intel.com,
    peterz@infradead.org, kernel-team@meta.com, bp@alien8.de,
    Rik van Riel
Subject: [RFC v5 8/8] x86/mm: make RAR invalidation scalable by skipping
 duplicate APIC pokes
Date: Fri, 21 Nov 2025 13:54:29 -0500
Message-ID: <20251121185530.21876-9-riel@surriel.com>
In-Reply-To: <20251121185530.21876-1-riel@surriel.com>
References: <20251121185530.21876-1-riel@surriel.com>

From: Rik van Riel

The naive RAR implementation suffers from heavy contention in
apic_mem_wait_icr_idle() when multiple CPUs send out RAR interrupts
simultaneously.

When a CPU receives a RAR, it will scan its action vector and process
all the rar_payload entries whose corresponding action vector entry is
set to RAR_PENDING. After processing each payload, it sets that action
vector entry to RAR_SUCCESS.

That means a single RAR sent to a CPU is enough for that CPU to
process all the pending RAR payloads; other CPUs do not usually need
to send additional RARs to that CPU.
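(Not part of the patch: below is a minimal userspace model of that
receive side, sketched with C11 atomics. The RAR_* constants, the
rar_action array and struct rar_payload follow the patch; the array
size, the rar_payloads array, process_payload() and handle_rar() are
invented for illustration.)

#include <stdatomic.h>
#include <stdint.h>

#define RAR_MAX_PAYLOADS 32		/* illustrative size */
#define RAR_SUCCESS	0x00
#define RAR_PENDING	0x01

struct rar_payload {
	uint64_t start, end;		/* range to invalidate */
	uint16_t pcid;
};

static struct rar_payload rar_payloads[RAR_MAX_PAYLOADS];
/* A per-CPU u8 array in the kernel; one flat copy in this model. */
static _Atomic uint8_t rar_action[RAR_MAX_PAYLOADS];

static void process_payload(struct rar_payload *p)
{
	(void)p;	/* stand-in for the actual TLB invalidation */
}

/*
 * One RAR interrupt drains every pending payload, not just the
 * sender's, which is why duplicate IPIs are usually unnecessary.
 */
static void handle_rar(void)
{
	for (int i = 0; i < RAR_MAX_PAYLOADS; i++) {
		if (atomic_load(&rar_action[i]) != RAR_PENDING)
			continue;
		process_payload(&rar_payloads[i]);
		atomic_store(&rar_action[i], RAR_SUCCESS);
	}
}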
Optimistically avoid sending RAR interrupts to CPUs that are already
processing a RAR, looping back only if our request went unprocessed
but the remote CPU is no longer processing any RARs.
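(Again not part of the patch: a standalone C11-atomics model of the
claim/release handshake used to skip the duplicate APIC pokes. NR_CPUS
and the flat array stand in for the kernel's per-CPU rar_pending
variable; init_rar_pending() replaces the kernel's static initializer.)

#include <stdatomic.h>
#include <stdbool.h>

#define NR_CPUS 64

/* -1: no RAR in flight to this CPU; otherwise the id of the sender. */
static _Atomic int rar_pending[NR_CPUS];

static void init_rar_pending(void)
{
	for (int i = 0; i < NR_CPUS; i++)
		atomic_store(&rar_pending[i], -1);
}

static bool get_rar_pending(int target_cpu, int this_cpu)
{
	int expected = -1;

	/* Cheap read first: someone else is already poking target_cpu. */
	if (atomic_load(&rar_pending[target_cpu]) != -1)
		return false;

	/* Exactly one sender wins the right to send the IPI. */
	return atomic_compare_exchange_strong(&rar_pending[target_cpu],
					      &expected, this_cpu);
}

static void release_rar_pending(int target_cpu, int this_cpu)
{
	int expected = this_cpu;

	/* Only the sender that won the claim may clear it. */
	atomic_compare_exchange_strong(&rar_pending[target_cpu],
				       &expected, -1);
}

A sender that loses the claim simply waits for its payload slot to
flip to RAR_SUCCESS; if rar_pending drops back to -1 while the slot is
still RAR_PENDING, wait_for_action_done() below returns early and the
do/while loop in smp_call_rar_many() claims and sends again. The patch
itself releases with a READ_ONCE()/WRITE_ONCE() pair instead of a CAS,
which suffices because only the winning sender ever sees its own CPU
number in rar_pending.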
This changes will-it-scale tlb_flush2_threads numbers like this:

                  loops/sec
          IPI flush   naive RAR   optimized RAR
threads
 1           175k        174k          170k
 5           337k        345k          321k
10           530k        469k          497k
20           752k        363k          616k
30           922k        259k          754k
40          1005k        205k          779k
50          1073k        164k          883k
60          1040k        141k          813k

The numbers above are on a 30 core / 60 thread, single socket
Sapphire Rapids system. Average of 4 runs.

This exact same code reached up to 1200k loops/second on a -tip
kernel from a few weeks ago, and did so reliably across several
reboots. I have no good explanation for the difference.

Signed-off-by: Rik van Riel
---
 arch/x86/mm/rar.c | 60 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 54 insertions(+), 6 deletions(-)

diff --git a/arch/x86/mm/rar.c b/arch/x86/mm/rar.c
index 76959782fb03..fd89eaaf4fc1 100644
--- a/arch/x86/mm/rar.c
+++ b/arch/x86/mm/rar.c
@@ -11,6 +11,7 @@
 #include
 
 static DEFINE_PER_CPU(struct cpumask, rar_cpu_mask);
+static DEFINE_PER_CPU(struct cpumask, apic_cpu_mask);
 
 #define RAR_SUCCESS 0x00
 #define RAR_PENDING 0x01
@@ -47,6 +48,32 @@ static struct rar_lock rar_locks[RAR_MAX_PAYLOADS] __cacheline_aligned;
  */
 static DEFINE_PER_CPU_ALIGNED(u8[RAR_MAX_PAYLOADS], rar_action);
 
+/*
+ * Tracks whether a RAR is in flight to this CPU. This is used
+ * to avoid sending another RAR (waiting on the APIC) when the
+ * target CPU is already handling RARs.
+ */
+static DEFINE_PER_CPU(int, rar_pending) = -1;
+
+static bool get_rar_pending(int target_cpu, int this_cpu)
+{
+	int *this_rar_pending = &per_cpu(rar_pending, target_cpu);
+
+	/* Another CPU is flushing this CPU already. */
+	if (*this_rar_pending != -1)
+		return false;
+
+	/* Is this_cpu the one that needs to send a RAR to target_cpu? */
+	return cmpxchg(this_rar_pending, -1, this_cpu) == -1;
+}
+
+static void release_rar_pending(int target_cpu, int this_cpu)
+{
+	/* If this_cpu sent the RAR to target_cpu, clear rar_pending */
+	if (READ_ONCE(per_cpu(rar_pending, target_cpu)) == this_cpu)
+		WRITE_ONCE(per_cpu(rar_pending, target_cpu), -1);
+}
+
 /*
  * TODO: group CPUs together based on locality in the system instead
  * of CPU number, to further reduce the cost of contention.
@@ -113,7 +140,7 @@ static void set_action_entry(unsigned long payload_nr, int target_cpu)
 	WRITE_ONCE(bitmap[payload_nr], RAR_PENDING);
 }
 
-static void wait_for_action_done(unsigned long payload_nr, int target_cpu)
+static u8 wait_for_action_done(unsigned long payload_nr, int target_cpu)
 {
 	u8 status;
 	u8 *rar_actions = per_cpu(rar_action, target_cpu);
@@ -123,9 +150,14 @@ static void wait_for_action_done(unsigned long payload_nr, int target_cpu)
 	while (status == RAR_PENDING) {
 		cpu_relax();
 		status = READ_ONCE(rar_actions[payload_nr]);
+		/* Target CPU is not processing RARs right now. */
+		if (READ_ONCE(per_cpu(rar_pending, target_cpu)) == -1)
+			return status;
 	}
 
 	WARN_ON_ONCE(rar_actions[payload_nr] != RAR_SUCCESS);
+
+	return status;
 }
 
 void rar_cpu_init(void)
@@ -183,7 +215,7 @@ void smp_call_rar_many(const struct cpumask *mask, u16 pcid,
 {
 	unsigned long pages = (end - start + PAGE_SIZE) / PAGE_SIZE;
 	int cpu, this_cpu = smp_processor_id();
-	cpumask_t *dest_mask;
+	cpumask_t *dest_mask, *apic_mask;
 	unsigned long payload_nr;
 
 	/* Catch the "end - start + PAGE_SIZE" overflow above. */
@@ -213,7 +245,9 @@ void smp_call_rar_many(const struct cpumask *mask, u16 pcid,
 	 * flushes at context switch time.
 	 */
 	dest_mask = this_cpu_ptr(&rar_cpu_mask);
+	apic_mask = this_cpu_ptr(&apic_cpu_mask);
 	cpumask_and(dest_mask, mask, cpu_online_mask);
+	cpumask_clear(apic_mask);
 
 	/* Some callers race with other CPUs changing the passed mask */
 	if (unlikely(!cpumask_weight(dest_mask)))
@@ -225,11 +259,25 @@ void smp_call_rar_many(const struct cpumask *mask, u16 pcid,
 	for_each_cpu(cpu, dest_mask)
 		set_action_entry(payload_nr, cpu);
 
-	/* Send a message to all CPUs in the map */
-	native_send_rar_ipi(dest_mask);
+	do {
+		for_each_cpu(cpu, dest_mask) {
+			/* Track the CPUs that have no RAR pending (yet). */
+			if (get_rar_pending(cpu, this_cpu))
+				__cpumask_set_cpu(cpu, apic_mask);
+		}
 
-	for_each_cpu(cpu, dest_mask)
-		wait_for_action_done(payload_nr, cpu);
+		/* Send a message to the CPUs not processing RARs yet */
+		native_send_rar_ipi(apic_mask);
+
+		for_each_cpu(cpu, dest_mask) {
+			u8 status = wait_for_action_done(payload_nr, cpu);
+			if (status == RAR_SUCCESS) {
+				release_rar_pending(cpu, this_cpu);
+				__cpumask_clear_cpu(cpu, dest_mask);
+				__cpumask_clear_cpu(cpu, apic_mask);
+			}
+		}
+	} while (unlikely(cpumask_weight(dest_mask)));
 
 	free_payload_slot(payload_nr);
 }
-- 
2.51.1