From nobody Wed Jul  1 04:37:00 2026
Received: from va-1-112.ptr.blmpb.com (va-1-112.ptr.blmpb.com
 [209.127.230.112])
	(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 5A793382298
	for <linux-kernel@vger.kernel.org>; Tue, 30 Jun 2026 11:11:01 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=209.127.230.112
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1782817863; cv=none;
 b=da3J7MptnasutrRtOmYd2964o0enaJRC97Pq8NYvDSJnY76L5c0DzMAbb4PJQjnvkZO5u5TJDL4p4RohbA9qu9vdtw+2QUrutQEi8J1U6VlmScx+Ps2egP9HpjBGzruDGLr6W6CNZDxayUX79vR+2LgYyu4PuxcE21Wv6LjarG4=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1782817863; c=relaxed/simple;
	bh=uC6JuvPHHiv16iM6gQUufwzeMB2xsKifWPv7SKWL1fY=;
	h=Message-Id:In-Reply-To:References:To:Subject:Content-Type:Cc:From:
	 Date:Mime-Version;
 b=obd5HMqFXI67TbpLLfzVQeGFoNb1aVa8eftYZ1Rji4TBBg1PxgIsx6xf7ECtYDDlUrsSoxqPeeHbrNmv9a7+ejY1WXQFMNw5m5Qcsdmx5kruPVy1VUz6TkGh4zsyY8fW6d3BZ5KVTsBlbesfiXc+99uV7BLKu2h4a+fx4Fs2jbM=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=bytedance.com;
 spf=pass smtp.mailfrom=bytedance.com;
 dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com
 header.b=Ffb+375t; arc=none smtp.client-ip=209.127.230.112
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=bytedance.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=bytedance.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com
 header.b="Ffb+375t"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed;
 s=2212171451; d=bytedance.com; t=1782817854; h=from:subject:
 mime-version:from:date:message-id:subject:to:cc:reply-to:content-type:
 mime-version:in-reply-to:message-id;
 bh=PTis8wF35iuucNzcojqxgDe88w0r7l5E8SRoiLTj548=;
 b=Ffb+375t2NVTmHMiUvrmBzBVH2+9+pAt8Ee5gashxPpP4tv7LeZ1yvHXy3DHRFENLskM5F
 Lzdw3nnsO7kRiBayB6AY/js/dUkotR25wYMjg4HIrsJpd6KhXcS1LZ7EXCFeURW/S25Ngk
 QVuRgGBqINXY9pyS01Ltt1zNBD8MMmE/2juwaHFDR3IZH3uWCgQ6uOiacqzfIjLRvRoMK1
 Dd+ov25Ip2FO0mmqSpON89/snDJxmuq35svbSAbwyoGWs16q6cVNo7uOP+MtRCTeZcKJUU
 +SsUPycqKvB7mm4Rib/zON8jLsUMDz7eSbiKvnA9ABl6iqC29qjuIVlvRsHMIw==
Message-Id: <20260630111008.2034376-2-zhouchuyi@bytedance.com>
X-Original-From: Chuyi Zhou <zhouchuyi@bytedance.com>
X-Mailer: git-send-email 2.20.1
In-Reply-To: <20260630111008.2034376-1-zhouchuyi@bytedance.com>
References: <20260630111008.2034376-1-zhouchuyi@bytedance.com>
To: <tglx@kernel.org>, <mingo@redhat.com>, <luto@kernel.org>,
	<peterz@infradead.org>, <paulmck@kernel.org>, <muchun.song@linux.dev>,
	<bp@alien8.de>, <dave.hansen@linux.intel.com>, <pbonzini@redhat.com>,
	<bigeasy@linutronix.de>, <clrkwllms@kernel.org>, <rostedt@goodmis.org>,
	<nadav.amit@gmail.com>, <vkuznets@redhat.com>
Subject: [PATCH v9 01/14] smp: Disable preemption explicitly in
 __csd_lock_wait()
Cc: <linux-kernel@vger.kernel.org>, "Chuyi Zhou" <zhouchuyi@bytedance.com>
From: "Chuyi Zhou" <zhouchuyi@bytedance.com>
Content-Transfer-Encoding: quoted-printable
X-Lms-Return-Path: 
 <lba+26a43a43c+341e7c+vger.kernel.org+zhouchuyi@bytedance.com>
Date: Tue, 30 Jun 2026 19:09:55 +0800
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
Mime-Version: 1.0
Content-Type: text/plain; charset="utf-8"

The CSD lock wait debugging code in __csd_lock_wait() must run with
preemption disabled. The smp function call mechanisms which invoke it
currently keep preemption disabled across the wait, so the debugging code
inherits that guarantee from its callers.

Keeping preemption disabled across the whole smp function call operation
can induce large scheduling latencies. Shortening the caller-side
preemption-disabled region would cause __csd_lock_wait() to be invoked
with preemption enabled.

Prepare for that by disabling preemption explicitly around the CSD lock
wait debugging code in __csd_lock_wait().

Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
Tested-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Muchun Song <muchun.song@linux.dev>
---
 kernel/smp.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/smp.c b/kernel/smp.c
index a0bb56bd8dda..b58975480e11 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -323,6 +323,8 @@ static void __csd_lock_wait(call_single_data_t *csd)
 	int bug_id =3D 0;
 	u64 ts0, ts1;
=20
+	guard(preempt)();
+
 	ts1 =3D ts0 =3D ktime_get_mono_fast_ns();
 	for (;;) {
 		if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id, &nmessages))
--=20
2.20.1
From nobody Wed Jul  1 04:37:00 2026
Received: from va-1-111.ptr.blmpb.com (va-1-111.ptr.blmpb.com
 [209.127.230.111])
	(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id D23233F9A1A
	for <linux-kernel@vger.kernel.org>; Tue, 30 Jun 2026 11:11:13 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=209.127.230.111
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1782817875; cv=none;
 b=BVGwUGc5H8RF6PBKiWcELN8mfFbNgdouAqSVlVWBwqv62n6HAwKjNLr9uL26um8zR7u3FOYDAMobz4WteN9EZ1T4T0qNnrmSxFoKAb+AY/5s0vKKI5tX0/bnzkWNnKM8K//hFfFza+QLwMxI+XbTL35LbzKhAk28vVcPTg2NsvA=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1782817875; c=relaxed/simple;
	bh=XKP8JYf8Z2wYHNaLfHxvJWRj1U1bt5XLc5pXrpSoBK0=;
	h=Date:References:Cc:In-Reply-To:From:Content-Type:Subject:
	 Message-Id:Mime-Version:To;
 b=EKD6ufKf1GNQuxYPfuuPesQ3KGb3LvyZXt01WT7ApWSUyJHs94aYqrPPnr+aA2Zb5QXF9QtrThZSrYvr3yil7qf9J2DOeqI73tHmS4lhJL+unNPftjzLGB8Im5WDJpzYUniMWH5w/+gXYLLPQ4Ch7NGiDs9yvjKf4vYF79gfrKc=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=bytedance.com;
 spf=pass smtp.mailfrom=bytedance.com;
 dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com
 header.b=OCZRUgiq; arc=none smtp.client-ip=209.127.230.111
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=bytedance.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=bytedance.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com
 header.b="OCZRUgiq"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed;
 s=2212171451; d=bytedance.com; t=1782817869; h=from:subject:
 mime-version:from:date:message-id:subject:to:cc:reply-to:content-type:
 mime-version:in-reply-to:message-id;
 bh=e7Yxv6JQTj1GPtG9bpXo1c9chLOxpdLWlb5AGeLtS2s=;
 b=OCZRUgiq/VesI1Ha/wquJjXjnhC8GmuiktfUjcvNIklVjnIcPln8r/zmfKxRGtveYmQpJG
 j8C3oRcatSvIfxJYuwRCe4GaWIdNbyiWzJ9JQTVmU2DInAe3hmUMMCkRZTIQHdPfOGjv9s
 Vzwv/KjPEXqr528kBLB5lUxhyfmA3ukw4dX+QEqBqLB/iY5XxvzH5cXzNyY32Iq17KEgYn
 XLF0FJXae+UEq1p02k+OWeMajDBlzD/CPXZZyawACseUPlBQMfTyAwubItnRYePtjEdI/M
 zuGjpTEKqsgD5nu6F6Nde+DCx6fMC+xmQmPrF4tugZOY0mlbSkkIvC7M1otXrQ==
Date: Tue, 30 Jun 2026 19:09:56 +0800
References: <20260630111008.2034376-1-zhouchuyi@bytedance.com>
Cc: <linux-kernel@vger.kernel.org>, "Chuyi Zhou" <zhouchuyi@bytedance.com>
X-Mailer: git-send-email 2.20.1
X-Original-From: Chuyi Zhou <zhouchuyi@bytedance.com>
In-Reply-To: <20260630111008.2034376-1-zhouchuyi@bytedance.com>
Content-Transfer-Encoding: quoted-printable
From: "Chuyi Zhou" <zhouchuyi@bytedance.com>
Subject: [PATCH v9 02/14] smp: Enable preemption early in
 smp_call_function_single()
Message-Id: <20260630111008.2034376-3-zhouchuyi@bytedance.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
Mime-Version: 1.0
X-Lms-Return-Path: 
 <lba+26a43a44b+b1a55d+vger.kernel.org+zhouchuyi@bytedance.com>
To: <tglx@kernel.org>, <mingo@redhat.com>, <luto@kernel.org>,
	<peterz@infradead.org>, <paulmck@kernel.org>, <muchun.song@linux.dev>,
	<bp@alien8.de>, <dave.hansen@linux.intel.com>, <pbonzini@redhat.com>,
	<bigeasy@linutronix.de>, <clrkwllms@kernel.org>, <rostedt@goodmis.org>,
	<nadav.amit@gmail.com>, <vkuznets@redhat.com>
Content-Type: text/plain; charset="utf-8"

smp_call_function_single() disables preemption while it validates the
target CPU, prepares the call single data, queues the callback and sends
the IPI.

For the !wait case, disabling preemption protects the per-CPU csd_data
from concurrent modification by another task on the same CPU. For the
wait case, the CSD is stack allocated and no other task can reuse it. CPU
pinning is still required until the callback has been queued and the IPI
has been sent, to ensure that the target CPU cannot be offlined after the
online check but before dispatch.

After generic_exec_single() has queued the callback, the synchronous
csd_lock_wait() invocation at the end of the execution does not require
the caller to remain pinned to the current CPU.

Enable preemption before csd_lock_wait() to shorten the
preemption-disabled section.

Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
Tested-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Muchun Song <muchun.song@linux.dev>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 kernel/smp.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/kernel/smp.c b/kernel/smp.c
index b58975480e11..292eefadddbc 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -700,11 +700,16 @@ int smp_call_function_single(int cpu, smp_call_func_t=
 func, void *info,
=20
 	err =3D generic_exec_single(cpu, csd);
=20
+	/*
+	 * @csd is stack-allocated when @wait is true. No concurrent access
+	 * except from the IPI completion path, so we can re-enable preemption
+	 * early to reduce latency.
+	 */
+	put_cpu();
+
 	if (wait)
 		csd_lock_wait(csd);
=20
-	put_cpu();
-
 	return err;
 }
 EXPORT_SYMBOL(smp_call_function_single);
--=20
2.20.1
From nobody Wed Jul  1 04:37:00 2026
Received: from va-1-115.ptr.blmpb.com (va-1-115.ptr.blmpb.com
 [209.127.230.115])
	(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 20AC83F54AB
	for <linux-kernel@vger.kernel.org>; Tue, 30 Jun 2026 11:11:28 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=209.127.230.115
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1782817890; cv=none;
 b=clI+eurYH8g6zuyTGLHCfjGj7Xhfm56sK80NPAxwqT9jsY02jSNzVvpRE8dNs3ZaRzVmDFJ2cobgrbpm2x5V0lKnDjbEWM8uuG3H2wA/f5k8AnInKSruVTLDDPVKuTExrrzU8vABPk1EERYPCwB1+aKpePpD3NXWdEIxJTvH7no=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1782817890; c=relaxed/simple;
	bh=R7faOJ929c2PCJcFq+Dk6kkaNQw2kQCLns9yUo6xhDY=;
	h=Mime-Version:Subject:References:In-Reply-To:To:Date:Content-Type:
	 From:Message-Id:Cc;
 b=t3RukI1GIKn/pzPX1kIxKk9h6x1O1+pYXUuG9SQV3aNtDrjoHXiTZR5nbSjAMyxu+b895NftEG9QhoNWYJZzEpYcBDbV7aonZDIiMBq2CkRtOcDwdO2vdvjOMm9vzfmTtKpoL2JO26jIvss9aLl596DUyxWSGz6EWJAcK6URkWU=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=bytedance.com;
 spf=pass smtp.mailfrom=bytedance.com;
 dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com
 header.b=VxJpktrk; arc=none smtp.client-ip=209.127.230.115
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=bytedance.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=bytedance.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com
 header.b="VxJpktrk"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed;
 s=2212171451; d=bytedance.com; t=1782817884; h=from:subject:
 mime-version:from:date:message-id:subject:to:cc:reply-to:content-type:
 mime-version:in-reply-to:message-id;
 bh=tBoFytz6cA8ILrpJXluLdSOQPOneediXsnQek8X+OpU=;
 b=VxJpktrkThxu+lnS5kWU5t/jDQvilLRShq/F8GvqISWasBBEqMcZOnEh+bVZ9YaZvK8z3h
 b5N2MDEh6vCdNmYQNeI1sVQcUvS408ia2HdMVXMFC6Ckg4NioirHmP3LOIJm3PRp1BBYa1
 14IrKwIgqozITmayKlp9/AYBwrpupoOGeoDuFG6kagqxgFVYsa5sAzNWhyOQnMeMjsMesH
 4JUZLU978bZp70NjvKW9Y0dEIPmSbE+9nwjjl/h8Dq6Zu+yItKLjimeMzLVIUkW9aa17yo
 9Oh1bEK7jGogekPy8+7PbwuV4k5lJxhtjG11MdPpRRQluLMjLXPS9cF/NS3gWQ==
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
Mime-Version: 1.0
Subject: [PATCH v9 03/14] smp: Refactor remote CPU selection in
 smp_call_function_any()
References: <20260630111008.2034376-1-zhouchuyi@bytedance.com>
In-Reply-To: <20260630111008.2034376-1-zhouchuyi@bytedance.com>
To: <tglx@kernel.org>, <mingo@redhat.com>, <luto@kernel.org>,
	<peterz@infradead.org>, <paulmck@kernel.org>, <muchun.song@linux.dev>,
	<bp@alien8.de>, <dave.hansen@linux.intel.com>, <pbonzini@redhat.com>,
	<bigeasy@linutronix.de>, <clrkwllms@kernel.org>, <rostedt@goodmis.org>,
	<nadav.amit@gmail.com>, <vkuznets@redhat.com>
Date: Tue, 30 Jun 2026 19:09:57 +0800
X-Lms-Return-Path: 
 <lba+26a43a45a+e52cff+vger.kernel.org+zhouchuyi@bytedance.com>
X-Mailer: git-send-email 2.20.1
From: "Chuyi Zhou" <zhouchuyi@bytedance.com>
Message-Id: <20260630111008.2034376-4-zhouchuyi@bytedance.com>
X-Original-From: Chuyi Zhou <zhouchuyi@bytedance.com>
Content-Transfer-Encoding: quoted-printable
Cc: <linux-kernel@vger.kernel.org>, "Chuyi Zhou" <zhouchuyi@bytedance.com>
Content-Type: text/plain; charset="utf-8"

smp_call_function_any() disables preemption across the entire operation:
selecting a target CPU, enqueueing the IPI, and synchronously waiting for
the remote CPU. smp_call_function_single() already re-enables preemption
before the synchronous csd_lock_wait(), so callers of
smp_call_function_any() should benefit from the same shorter
preemption-disabled section.

Simply removing get_cpu() and put_cpu() from smp_call_function_any()
would leave the preemption disablement entirely to
smp_call_function_single(). That opens a preemption window between
selecting the remote CPU, for example via sched_numa_find_nth_cpu(), and
dispatching the IPI in smp_call_function_single(). If the selected CPU is
fully offlined in that window, smp_call_function_single() fails its
cpu_online() check and returns -ENXIO to the caller, violating the
guarantee that smp_call_function_any() executes on any online CPU in the
mask.

Move the remote CPU selection into a common
__smp_call_function_single() helper. Keep the target CPU selection and
IPI dispatch within the same preemption-disabled region, while still
allowing the wait path to use the shorter preemption-disabled section
provided by smp_call_function_single().

Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
Tested-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 include/linux/smp.h |  3 +--
 kernel/smp.c        | 56 ++++++++++++++++++++++++---------------------
 kernel/up.c         |  3 +--
 3 files changed, 32 insertions(+), 30 deletions(-)

diff --git a/include/linux/smp.h b/include/linux/smp.h
index 6925d15ccaa7..11e36c7bc4d6 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -47,8 +47,7 @@ extern void __smp_call_single_queue(int cpu, struct llist=
_node *node);
 /* total number of cpus in this system (may exceed NR_CPUS) */
 extern unsigned int total_cpus;
=20
-int smp_call_function_single(int cpuid, smp_call_func_t func, void *info,
-			     int wait);
+int smp_call_function_single(int cpuid, smp_call_func_t func, void *info, =
bool wait);
=20
 void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
 			   void *info, bool wait, const struct cpumask *mask);
diff --git a/kernel/smp.c b/kernel/smp.c
index 292eefadddbc..92e1dffe4589 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -641,17 +641,9 @@ void flush_smp_call_function_queue(void)
 	local_irq_restore(flags);
 }
=20
-/**
- * smp_call_function_single - Run a function on a specific CPU
- * @cpu: Specific target CPU for this function.
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @wait: If true, wait until function has completed on other CPUs.
- *
- * Returns: %0 on success, else a negative status code.
- */
-int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
-			     int wait)
+static int __smp_call_function_single(int cpu, smp_call_func_t func,
+				      void *info, const struct cpumask *mask,
+				      bool wait)
 {
 	call_single_data_t *csd;
 	call_single_data_t csd_stack =3D {
@@ -668,6 +660,14 @@ int smp_call_function_single(int cpu, smp_call_func_t =
func, void *info,
 	 */
 	this_cpu =3D get_cpu();
=20
+	if (mask) {
+		/* Try for same CPU (cheapest) */
+		if (!cpumask_test_cpu(this_cpu, mask))
+			cpu =3D sched_numa_find_nth_cpu(mask, 0, cpu_to_node(this_cpu));
+		else
+			cpu =3D this_cpu;
+	}
+
 	/*
 	 * Can deadlock when called with interrupts disabled.
 	 * We allow cpu's that are not yet online though, as no one else can
@@ -712,6 +712,20 @@ int smp_call_function_single(int cpu, smp_call_func_t =
func, void *info,
=20
 	return err;
 }
+
+/**
+ * smp_call_function_single - Run a function on a specific CPU
+ * @cpu:	Specific target CPU for this function.
+ * @func:	The function to run. This must be fast and non-blocking.
+ * @info:	An arbitrary pointer to pass to the function.
+ * @wait:	If true, wait until function has completed on other CPUs.
+ *
+ * Returns: %0 on success, else a negative status code.
+ */
+int smp_call_function_single(int cpu, smp_call_func_t func, void *info, bo=
ol wait)
+{
+	return __smp_call_function_single(cpu, func, info, NULL, wait);
+}
 EXPORT_SYMBOL(smp_call_function_single);
=20
 /**
@@ -762,10 +776,10 @@ EXPORT_SYMBOL_GPL(smp_call_function_single_async);
=20
 /**
  * smp_call_function_any - Run a function on any of the given cpus
- * @mask: The mask of cpus it can run on.
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @wait: If true, wait until function has completed.
+ * @mask:	The mask of cpus it can run on.
+ * @func:	The function to run. This must be fast and non-blocking.
+ * @info:	An arbitrary pointer to pass to the function.
+ * @wait:	If true, wait until function has completed.
  *
  * Selection preference:
  *	1) current cpu if in @mask
@@ -776,17 +790,7 @@ EXPORT_SYMBOL_GPL(smp_call_function_single_async);
 int smp_call_function_any(const struct cpumask *mask,
 			  smp_call_func_t func, void *info, int wait)
 {
-	unsigned int cpu;
-	int ret;
-
-	/* Try for same CPU (cheapest) */
-	cpu =3D get_cpu();
-	if (!cpumask_test_cpu(cpu, mask))
-		cpu =3D sched_numa_find_nth_cpu(mask, 0, cpu_to_node(cpu));
-
-	ret =3D smp_call_function_single(cpu, func, info, wait);
-	put_cpu();
-	return ret;
+	return __smp_call_function_single(-1, func, info, mask, wait);
 }
 EXPORT_SYMBOL_GPL(smp_call_function_any);
=20
diff --git a/kernel/up.c b/kernel/up.c
index df50828cc2f0..6d4ac9502e8b 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -9,8 +9,7 @@
 #include <linux/smp.h>
 #include <linux/hypervisor.h>
=20
-int smp_call_function_single(int cpu, void (*func) (void *info), void *inf=
o,
-				int wait)
+int smp_call_function_single(int cpu, void (*func)(void *info), void *info=
, bool wait)
 {
 	unsigned long flags;
=20
--=20
2.20.1
From nobody Wed Jul  1 04:37:00 2026
Received: from va-1-113.ptr.blmpb.com (va-1-113.ptr.blmpb.com
 [209.127.230.113])
	(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id B64AA40802B
	for <linux-kernel@vger.kernel.org>; Tue, 30 Jun 2026 11:11:43 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=209.127.230.113
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1782817905; cv=none;
 b=trOa5PUF9OpUA9LoWEnJMXaTHjsYkx6RNMZs1bru1QdAaDjWcKySn+KlfwqmE7jmUQV9cTMkd/GquWH4eymWMZImjiGYYcUlgE6LzW7yrM7Q+xvpP8u9DhJMYZAvC/Ebhim0P2DPRlE2NU6T3xDmFn91pY+4kfobuTIa5PYR5Ns=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1782817905; c=relaxed/simple;
	bh=hj7QVCTbs5cSbhaTiPPYGS/fHTIzJgkyrREZ5EPer1Y=;
	h=Mime-Version:To:Cc:Subject:Message-Id:From:Date:References:
	 Content-Type:In-Reply-To;
 b=PhBDsDU+KH4cgul4K7Ol/3lyZjgMiH2BH6Wk/cGpwLJGD433A2PbWknp4s/Y2K3i5AdnsSiM83K9ilBhx84+LOGwA5TpvqjIEOMuWqJknVivzZrFyWxY9Hze9wCf0SEkV4cS68coEzE1LiSbGWRtceXHAmAxbu5yjDhHzbcw6K0=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=bytedance.com;
 spf=pass smtp.mailfrom=bytedance.com;
 dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com
 header.b=P+K6w1dm; arc=none smtp.client-ip=209.127.230.113
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=bytedance.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=bytedance.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com
 header.b="P+K6w1dm"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed;
 s=2212171451; d=bytedance.com; t=1782817898; h=from:subject:
 mime-version:from:date:message-id:subject:to:cc:reply-to:content-type:
 mime-version:in-reply-to:message-id;
 bh=Kyde7jQTygJb14k2YNuCLab/knC4Bz/j8Yl2PPZIyQs=;
 b=P+K6w1dmDn4mbfv31wpuGsl/pxFHus/PapZJ6/d+Ayi2pc5JZxg+shaXH0fDLmoJ9PYPZV
 TcwUMBNMJaoZt3Y4AvzZh/wAdXSqavmCh0BhPttGh5g6FL2ZuxnCUVe8HWrzLr61tGER4b
 AqpsBxEFUqUeqeGLVaAnWQhdisNlq5A3vsXN7yuZuPkwL9Hpi48ajyrA9RkBae1OWL8Fy/
 r1C4I/yiXJfdPyfWZ0ZPUbyOkqpQJXQ/yheuyk85Cpd1BfvxVnVsGtlu/dFRXa637AJmVq
 vwLBMW6764NI/wLZzohHo6uhLMY9ebeNlFdRe4MzsXdTsx2J0/e+flsOcxWQVw==
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
Mime-Version: 1.0
Content-Transfer-Encoding: quoted-printable
To: <tglx@kernel.org>, <mingo@redhat.com>, <luto@kernel.org>,
	<peterz@infradead.org>, <paulmck@kernel.org>, <muchun.song@linux.dev>,
	<bp@alien8.de>, <dave.hansen@linux.intel.com>, <pbonzini@redhat.com>,
	<bigeasy@linutronix.de>, <clrkwllms@kernel.org>, <rostedt@goodmis.org>,
	<nadav.amit@gmail.com>, <vkuznets@redhat.com>
Cc: <linux-kernel@vger.kernel.org>, "Chuyi Zhou" <zhouchuyi@bytedance.com>
X-Mailer: git-send-email 2.20.1
Subject: [PATCH v9 04/14] smp: Use task-local IPI cpumask in
 smp_call_function_many_cond()
X-Original-From: Chuyi Zhou <zhouchuyi@bytedance.com>
Message-Id: <20260630111008.2034376-5-zhouchuyi@bytedance.com>
From: "Chuyi Zhou" <zhouchuyi@bytedance.com>
Date: Tue, 30 Jun 2026 19:09:58 +0800
References: <20260630111008.2034376-1-zhouchuyi@bytedance.com>
In-Reply-To: <20260630111008.2034376-1-zhouchuyi@bytedance.com>
X-Lms-Return-Path: 
 <lba+26a43a468+a22f42+vger.kernel.org+zhouchuyi@bytedance.com>
Content-Type: text/plain; charset="utf-8"

smp_call_function_many_cond() uses the per-CPU cfd->cpumask as the list
of remote CPUs to wait for. That is safe while the caller remains pinned
to the current CPU for the whole operation, because another task cannot
run on the same CPU and reuse the per-CPU mask.

The synchronous wait is the long-latency part of the operation. To make
that wait preemptible, the mask iterated by the wait loop must remain
stable even if the task is preempted or migrates. If the wait used the
per-CPU cfd->cpumask after dropping CPU pinning, another task scheduled
on the original CPU could enter smp_call_function_many_cond() and
overwrite the mask while the first task is still iterating it.

Give each task private IPI cpumask storage and use it as the wait mask in
smp_call_function_many_cond(). Other cpumask storage choices do not fit
this use case:

 - Per-CPU storage is the state that becomes unsafe once the wait is
   made preemptible. After the caller drops CPU pinning, another task
   scheduled on the original CPU can enter smp_call_function_many_cond()
   and reuse the same per-CPU mask.

 - Stack storage is not suitable for large NR_CPUS or
   CONFIG_CPUMASK_OFFSTACK=3Dy configurations. The wait mask needs to
   scale with cpumask_size(), and putting that storage on the stack is
   not acceptable on large systems.

 - Allocating the mask inside smp_call_function_many_cond() would put an
   allocation and a failure path in the generic IPI path. A sleeping
   allocation is not suitable because callers have historically only
   provided a preempt-disabled context, not a sleepable one. GFP_ATOMIC
   would avoid sleeping, but a failure fallback would make the latency
   improvement opportunistic instead of guaranteed.

The users are not limited to a small, pre-identifiable class of tasks. On
x86, ordinary tasks can reach this path through TLB flushes during exit,
unmap and reclaim, so allocating the mask only for a known subset of
tasks is not straightforward.

The memory cost is explicit: one word is added to task_struct. When
cpumask_size() fits in that word, the mask is stored inline and no
separate allocation is needed. Larger systems allocate cpumask_size() per
task; on x86-64 with NR_CPUS=3D8192 this is about 1 KiB per task. For
context, x86 already carries several KiB of per-task architecture and FPU
state, depending on the enabled features and configuration. That does not
make the extra cpumask free, but it puts the large-NR_CPUS case in
perspective.

Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
Tested-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 include/linux/sched.h |  6 ++++
 include/linux/smp.h   | 12 ++++++++
 kernel/fork.c         |  9 +++++-
 kernel/smp.c          | 70 ++++++++++++++++++++++++++++++++++++++-----
 4 files changed, 88 insertions(+), 9 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 35e6183ef615..695a2d21a374 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1364,6 +1364,12 @@ struct task_struct {
 	struct list_head		perf_event_list;
 	struct perf_ctx_data __rcu	*perf_ctx_data;
 #endif
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPTION)
+	union {
+		cpumask_t			*ipi_mask_ptr;
+		unsigned long			ipi_mask_val;
+	};
+#endif
 #ifdef CONFIG_DEBUG_PREEMPT
 	unsigned long			preempt_disable_ip;
 #endif
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 11e36c7bc4d6..2dfa7390717a 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -238,6 +238,18 @@ static inline int get_boot_cpu_id(void)
=20
 #endif /* !SMP */
=20
+#if defined(CONFIG_PREEMPTION) && defined(CONFIG_SMP)
+int smp_task_ipi_mask_alloc(struct task_struct *task);
+void smp_task_ipi_mask_free(struct task_struct *task);
+#else
+static inline int smp_task_ipi_mask_alloc(struct task_struct *task)
+{
+	return 0;
+}
+
+static inline void smp_task_ipi_mask_free(struct task_struct *task) { }
+#endif
+
 /*
  * raw_smp_processor_id() - get the current (unstable) CPU id
  *
diff --git a/kernel/fork.c b/kernel/fork.c
index 6fcca1db0af3..37f8343a3b74 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -535,6 +535,7 @@ void free_task(struct task_struct *tsk)
 #endif
 	release_user_cpus_ptr(tsk);
 	scs_release(tsk);
+	smp_task_ipi_mask_free(tsk);
=20
 #ifndef CONFIG_THREAD_INFO_IN_TASK
 	/*
@@ -933,10 +934,14 @@ static struct task_struct *dup_task_struct(struct tas=
k_struct *orig, int node)
 #endif
 	account_kernel_stack(tsk, 1);
=20
-	err =3D scs_prepare(tsk, node);
+	err =3D smp_task_ipi_mask_alloc(tsk);
 	if (err)
 		goto free_stack;
=20
+	err =3D scs_prepare(tsk, node);
+	if (err)
+		goto free_ipi_mask;
+
 #ifdef CONFIG_SECCOMP
 	/*
 	 * We must handle setting up seccomp filters once we're under
@@ -1007,6 +1012,8 @@ static struct task_struct *dup_task_struct(struct tas=
k_struct *orig, int node)
 #endif
 	return tsk;
=20
+free_ipi_mask:
+	smp_task_ipi_mask_free(tsk);
 free_stack:
 	exit_task_stack_account(tsk);
 	free_thread_stack(tsk);
diff --git a/kernel/smp.c b/kernel/smp.c
index 92e1dffe4589..e9d647385df1 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -16,6 +16,7 @@
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/gfp.h>
+#include <linux/slab.h>
 #include <linux/smp.h>
 #include <linux/cpu.h>
 #include <linux/sched.h>
@@ -794,6 +795,49 @@ int smp_call_function_any(const struct cpumask *mask,
 }
 EXPORT_SYMBOL_GPL(smp_call_function_any);
=20
+static DEFINE_STATIC_KEY_FALSE(ipi_mask_inlined);
+
+#ifdef CONFIG_PREEMPTION
+
+int smp_task_ipi_mask_alloc(struct task_struct *task)
+{
+	if (static_branch_unlikely(&ipi_mask_inlined))
+		return 0;
+
+	task->ipi_mask_ptr =3D kmalloc(cpumask_size(), GFP_KERNEL);
+	if (!task->ipi_mask_ptr)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void smp_task_ipi_mask_free(struct task_struct *task)
+{
+	if (static_branch_unlikely(&ipi_mask_inlined))
+		return;
+
+	kfree(task->ipi_mask_ptr);
+}
+
+static cpumask_t *smp_task_ipi_mask(struct task_struct *cur)
+{
+	/*
+	 * If cpumask_size() is smaller than or equal to the pointer
+	 * size, it stashes the cpumask in the pointer itself to
+	 * avoid extra memory allocations.
+	 */
+	if (static_branch_unlikely(&ipi_mask_inlined))
+		return (cpumask_t *)&cur->ipi_mask_val;
+
+	return cur->ipi_mask_ptr;
+}
+#else
+static cpumask_t *smp_task_ipi_mask(struct task_struct *cur)
+{
+	return NULL;
+}
+#endif
+
 /*
  * Flags to be used as scf_flags argument of smp_call_function_many_cond().
  *
@@ -809,13 +853,21 @@ static void smp_call_function_many_cond(const struct =
cpumask *mask,
 					smp_cond_func_t cond_func)
 {
 	int cpu, last_cpu, this_cpu =3D smp_processor_id();
-	struct call_function_data *cfd;
+	struct cpumask *cpumask, *task_mask;
 	bool wait =3D scf_flags & SCF_WAIT;
-	int nr_cpus =3D 0;
+	struct call_function_data *cfd;
 	bool run_remote =3D false;
+	int nr_cpus =3D 0;
=20
 	lockdep_assert_preemption_disabled();
=20
+	cfd =3D this_cpu_ptr(&cfd_data);
+	task_mask =3D smp_task_ipi_mask(current);
+	if (task_mask)
+		cpumask =3D task_mask;
+	else
+		cpumask =3D cfd->cpumask;
+
 	/*
 	 * Can deadlock when called with interrupts disabled.
 	 * We allow cpu's that are not yet online though, as no one else can
@@ -836,16 +888,15 @@ static void smp_call_function_many_cond(const struct =
cpumask *mask,
=20
 	/* Check if we need remote execution, i.e., any CPU excluding this one. */
 	if (cpumask_any_and_but(mask, cpu_online_mask, this_cpu) < nr_cpu_ids) {
-		cfd =3D this_cpu_ptr(&cfd_data);
-		cpumask_and(cfd->cpumask, mask, cpu_online_mask);
-		__cpumask_clear_cpu(this_cpu, cfd->cpumask);
+		cpumask_and(cpumask, mask, cpu_online_mask);
+		__cpumask_clear_cpu(this_cpu, cpumask);
=20
 		cpumask_clear(cfd->cpumask_ipi);
-		for_each_cpu(cpu, cfd->cpumask) {
+		for_each_cpu(cpu, cpumask) {
 			call_single_data_t *csd =3D per_cpu_ptr(cfd->csd, cpu);
=20
 			if (cond_func && !cond_func(cpu, info)) {
-				__cpumask_clear_cpu(cpu, cfd->cpumask);
+				__cpumask_clear_cpu(cpu, cpumask);
 				continue;
 			}
=20
@@ -896,7 +947,7 @@ static void smp_call_function_many_cond(const struct cp=
umask *mask,
 	}
=20
 	if (run_remote && wait) {
-		for_each_cpu(cpu, cfd->cpumask) {
+		for_each_cpu(cpu, cpumask) {
 			call_single_data_t *csd;
=20
 			csd =3D per_cpu_ptr(cfd->csd, cpu);
@@ -1010,6 +1061,9 @@ EXPORT_SYMBOL(nr_cpu_ids);
 void __init setup_nr_cpu_ids(void)
 {
 	set_nr_cpu_ids(find_last_bit(cpumask_bits(cpu_possible_mask), NR_CPUS) + =
1);
+
+	if (IS_ENABLED(CONFIG_PREEMPTION) && cpumask_size() <=3D sizeof(unsigned =
long))
+		static_branch_enable(&ipi_mask_inlined);
 }
=20
 /* Called by boot processor to activate the rest. */
--=20
2.20.1
From nobody Wed Jul  1 04:37:00 2026
Received: from va-1-111.ptr.blmpb.com (va-1-111.ptr.blmpb.com
 [209.127.230.111])
	(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 36CF73CBE84
	for <linux-kernel@vger.kernel.org>; Tue, 30 Jun 2026 11:11:59 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=209.127.230.111
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1782817921; cv=none;
 b=juxVamgtXWsbKzWvzJEOELEq5j8mHPsF2GaniWThcduq2B4Eg1jXBqoODgxeeEAEF4YAhzlh6KpXbxzDHxomNYjx+9l1LbgkGNnHHAij3Gs7a/ByWQye9ZdeZHncfsCvJqWTCFdXHogzesLMZlAR7E/V5hfTc3suGuVnjL5YW/A=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1782817921; c=relaxed/simple;
	bh=UuzD5lBfPdxFNGHWsK+9HRGFJH+k1lSCC9doa8Jpsg0=;
	h=To:From:Message-Id:Subject:Date:Mime-Version:Cc:In-Reply-To:
	 References:Content-Type;
 b=USTrHuc7EQjlo5g+oYoQAtd7G8Hus6/tzYBtoVupsL8leDE9BVAbXiKROl+a01YQlIQVk6fUkk62TeqVzbZc2YxCWxOvYPKfSdhdXmt+3+Up2r0L3Ql1XJIP2J3a8jAw0C0xrR+sPEhDC6tflE05G7xtIIbK+KamuMs3pucpcNc=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=bytedance.com;
 spf=pass smtp.mailfrom=bytedance.com;
 dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com
 header.b=gI9XRi0x; arc=none smtp.client-ip=209.127.230.111
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=bytedance.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=bytedance.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com
 header.b="gI9XRi0x"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed;
 s=2212171451; d=bytedance.com; t=1782817915; h=from:subject:
 mime-version:from:date:message-id:subject:to:cc:reply-to:content-type:
 mime-version:in-reply-to:message-id;
 bh=AQhQxaN5vVzOqFkGjqPPHyqyA9cTYPuVzBaMAv1bnd8=;
 b=gI9XRi0xW6DrpdKrLYPfQSjgX75pkUvHSTsTyfwNjHquBM7D2iphv6YvgBfwhjo9PyeBOC
 yy+MpbjbpZtNxePHc5OzIldLuEiMo0D/i2jFsSbEu6/LPTDWCz1iymWRPmMkiK3PuLhtil
 voLVVZr5fhk6DANDXfcei7S98E8hfVtteVrW678B+l5uC/hdI+az0VhGLSHdrGVquVktxF
 38X1hAJEzVR+Ub4FTpz+owJN6JhuLecZ5ltoZs0p0bPb0FBYyNzq6Z0Q3hQ7pcYh4eESbK
 KLtygTHZR6c/xv6RB28Inu88kYu1VCY+ZfnLibwufC6/bHzwjVwZFMtsSp13/A==
To: <tglx@kernel.org>, <mingo@redhat.com>, <luto@kernel.org>,
	<peterz@infradead.org>, <paulmck@kernel.org>, <muchun.song@linux.dev>,
	<bp@alien8.de>, <dave.hansen@linux.intel.com>, <pbonzini@redhat.com>,
	<bigeasy@linutronix.de>, <clrkwllms@kernel.org>, <rostedt@goodmis.org>,
	<nadav.amit@gmail.com>, <vkuznets@redhat.com>
Content-Transfer-Encoding: quoted-printable
X-Lms-Return-Path: 
 <lba+26a43a479+c66b62+vger.kernel.org+zhouchuyi@bytedance.com>
From: "Chuyi Zhou" <zhouchuyi@bytedance.com>
Message-Id: <20260630111008.2034376-6-zhouchuyi@bytedance.com>
X-Mailer: git-send-email 2.20.1
Subject: [PATCH v9 05/14] smp: Alloc percpu csd data in smpcfd_prepare_cpu()
 only once
Date: Tue, 30 Jun 2026 19:09:59 +0800
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
Mime-Version: 1.0
X-Original-From: Chuyi Zhou <zhouchuyi@bytedance.com>
Cc: <linux-kernel@vger.kernel.org>, "Chuyi Zhou" <zhouchuyi@bytedance.com>
In-Reply-To: <20260630111008.2034376-1-zhouchuyi@bytedance.com>
References: <20260630111008.2034376-1-zhouchuyi@bytedance.com>
Content-Type: text/plain; charset="utf-8"

smp_call_function_many_cond() uses per-CPU CSD objects when queueing
callbacks to remote CPUs, and the wait path later dereferences those CSDs
from csd_lock_wait().

Making the wait path preemptible allows the initiating task to be
preempted or migrated before it waits for completion. A target CPU can be
offlined in that window. If smpcfd_dead_cpu() frees the target CPU's
per-CPU CSD storage, csd_lock_wait() can later dereference freed memory.

One way to protect the CSD storage is to free it via RCU or after a
synchronization step in the CPU offline path, but that would add
unnecessary complexity and can delay CPU shutdown.

Allocate the per-CPU CSD storage the first time a CPU comes up and keep
it allocated when the CPU is offlined. This allows csd_lock_wait() to
access the CSD even when the target CPU is offlined after preemption is
re-enabled and before the wait is invoked.

Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
Tested-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Muchun Song <muchun.song@linux.dev>
---
 kernel/smp.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/kernel/smp.c b/kernel/smp.c
index e9d647385df1..e76de3010b30 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -64,7 +64,14 @@ int smpcfd_prepare_cpu(unsigned int cpu)
 		free_cpumask_var(cfd->cpumask);
 		return -ENOMEM;
 	}
-	cfd->csd =3D alloc_percpu(call_single_data_t);
+
+	/*
+	 * Allocate the per-CPU CSD the first time a CPU comes up. It is
+	 * not freed when the CPU is offlined, so csd_lock_wait() can access
+	 * it even when the CPU was offlined after preemption was re-enabled.
+	 */
+	if (!cfd->csd)
+		cfd->csd =3D alloc_percpu(call_single_data_t);
 	if (!cfd->csd) {
 		free_cpumask_var(cfd->cpumask);
 		free_cpumask_var(cfd->cpumask_ipi);
@@ -80,7 +87,6 @@ int smpcfd_dead_cpu(unsigned int cpu)
=20
 	free_cpumask_var(cfd->cpumask);
 	free_cpumask_var(cfd->cpumask_ipi);
-	free_percpu(cfd->csd);
 	return 0;
 }
=20
--=20
2.20.1
From nobody Wed Jul  1 04:37:00 2026
Received: from va-1-114.ptr.blmpb.com (va-1-114.ptr.blmpb.com
 [209.127.230.114])
	(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 15D4B3CCFD2
	for <linux-kernel@vger.kernel.org>; Tue, 30 Jun 2026 11:12:14 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=209.127.230.114
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1782817936; cv=none;
 b=mLHDCJ+tRqS5TQlGSbQGhTk+cQNIw0jseMtwpHSVia9Ggp53klZyZtg+qkvy4Yzg++ayRItP+rRtOMb57bOSMxAfvIvdJC6RS2co9f0T4HveLY8MLYlzDFCKMZc+9KsU24LH/V7YoaxjVWFwRTwDWffM9s6qOklZ5vofCNfWJVs=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1782817936; c=relaxed/simple;
	bh=hFk0ITRIN9umy/InlnTJ8avZhxg5lKyh8I/K1CJnr1E=;
	h=Cc:Date:Mime-Version:Content-Type:Subject:In-Reply-To:Message-Id:
	 References:To:From;
 b=TKAcBUjVGpTasNAzu3xKXMN5e+pAoGgAZds1WkLSXqXAglkqY/4atmN34TwlLoiSz7Fon8c5WzHTqDEZgDzEvEskdEM9KMCBUrO4VcI6GTown9Xbv+0J6ix8GnBYGD1FXRDEAQHYYHzADruVJdO9nKgaT8y89hu/kSYcXf6ht8o=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=bytedance.com;
 spf=pass smtp.mailfrom=bytedance.com;
 dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com
 header.b=PIDNx1Wa; arc=none smtp.client-ip=209.127.230.114
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=bytedance.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=bytedance.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com
 header.b="PIDNx1Wa"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed;
 s=2212171451; d=bytedance.com; t=1782817930; h=from:subject:
 mime-version:from:date:message-id:subject:to:cc:reply-to:content-type:
 mime-version:in-reply-to:message-id;
 bh=nHCE0UyTQqS4wbzHYWkeAQMZngh8WErqk5IlwcwqZi8=;
 b=PIDNx1WaGrvA4epyPG6Jth5UIFauQttnUHcHs+Clpb+qY4cSXpBhBJshNur80Ql5PlFB+V
 jg3rfry8ENqjxwFz+VS0sCmG8V5SjgFNLeO0b2KA9bAPc65LcDdGU4uq2AKK/w8AAzwXW9
 da32koAXMFr3raC4LnPmkBibPCBRp5onuL1SSyVmdrlpD77TdXT8jtdN/WpmUirlsK8o1V
 EYwF3Y+GX7X973i1BmHzK6PY7LssmwvPsQZDyRuzV10ZiQ67f/mgSVOCxibFCclAHRR6Km
 3FJn/U10vruggATob6I6tZ1sF8/8ygJopYNBNzT+JrVJvwEm07GuGs2l3Ml0lg==
Cc: <linux-kernel@vger.kernel.org>, "Chuyi Zhou" <zhouchuyi@bytedance.com>
Date: Tue, 30 Jun 2026 19:10:00 +0800
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
Mime-Version: 1.0
X-Original-From: Chuyi Zhou <zhouchuyi@bytedance.com>
Subject: [PATCH v9 06/14] smp: Enable preemption early in
 smp_call_function_many_cond()
In-Reply-To: <20260630111008.2034376-1-zhouchuyi@bytedance.com>
X-Lms-Return-Path: 
 <lba+26a43a488+00a7df+vger.kernel.org+zhouchuyi@bytedance.com>
Message-Id: <20260630111008.2034376-7-zhouchuyi@bytedance.com>
Content-Transfer-Encoding: quoted-printable
References: <20260630111008.2034376-1-zhouchuyi@bytedance.com>
To: <tglx@kernel.org>, <mingo@redhat.com>, <luto@kernel.org>,
	<peterz@infradead.org>, <paulmck@kernel.org>, <muchun.song@linux.dev>,
	<bp@alien8.de>, <dave.hansen@linux.intel.com>, <pbonzini@redhat.com>,
	<bigeasy@linutronix.de>, <clrkwllms@kernel.org>, <rostedt@goodmis.org>,
	<nadav.amit@gmail.com>, <vkuznets@redhat.com>
From: "Chuyi Zhou" <zhouchuyi@bytedance.com>
X-Mailer: git-send-email 2.20.1
Content-Type: text/plain; charset="utf-8"

smp_call_function_many_cond() still has to keep the caller pinned to the
current CPU while the remote IPI request is built and dispatched. This
protects the queueing state and CPU-hotplug boundary that are required
before the synchronous wait starts:

 - It protects the current CPU's per-CPU scratch cpumask,
   cfd->cpumask_ipi. Another task running on the same CPU could otherwise
   enter smp_call_function_many_cond() and reuse that scratch cpumask
   before the current caller has finished building and sending the IPI
   request.

 - It provides the CPU-hotplug exclusion required by the CSD queueing
   side. New CSDs must not be queued after smpcfd_dying_cpu() has flushed
   the outgoing CPU's callback queue. Keeping preemption disabled until
   all required CSDs have been queued and the corresponding IPIs have
   been sent prevents CPU offline from crossing that boundary in the
   middle of the queueing operation.

The CSD acquisition side also relies on that caller-side CPU pinning.
csd_lock() waits for CSD_FLAG_LOCK to clear and then marks the CSD busy
with a regular store, so another task on the same CPU must not be
allowed to acquire and reinitialize the same per-CPU CSD concurrently.

After the callbacks have been queued and the IPIs have been sent, the
caller only performs the final csd_lock_wait() completion wait. If it is
preempted there, another task running on the original CPU may enter
smp_call_function_many_cond(), but any attempt to reuse the same per-CPU
CSD will block in csd_lock() until the previous callback clears
CSD_FLAG_LOCK. The final csd_lock_wait() does not acquire or reinitialize
the CSD, so it does not need the same caller-side preemption-disabled
protection.

The wait mask is task-local, so it cannot be overwritten by another task
on the original CPU. The per-CPU CSD storage also remains allocated
across CPU offline, so csd_lock_wait() can safely dereference it even if
the target CPU is offlined after the caller is unpinned.

With those requirements satisfied, enable preemption before the
synchronous csd_lock_wait() loop. This makes the potentially long wait
preemptible and migratable while keeping the CPU-pinned section around
the remote CPU selection and IPI dispatch.

Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
Tested-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 kernel/smp.c | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/kernel/smp.c b/kernel/smp.c
index e76de3010b30..92f984754139 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -858,15 +858,14 @@ static void smp_call_function_many_cond(const struct =
cpumask *mask,
 					unsigned int scf_flags,
 					smp_cond_func_t cond_func)
 {
-	int cpu, last_cpu, this_cpu =3D smp_processor_id();
 	struct cpumask *cpumask, *task_mask;
 	bool wait =3D scf_flags & SCF_WAIT;
 	struct call_function_data *cfd;
+	int cpu, last_cpu, this_cpu;
 	bool run_remote =3D false;
 	int nr_cpus =3D 0;
=20
-	lockdep_assert_preemption_disabled();
-
+	this_cpu =3D get_cpu();
 	cfd =3D this_cpu_ptr(&cfd_data);
 	task_mask =3D smp_task_ipi_mask(current);
 	if (task_mask)
@@ -952,6 +951,16 @@ static void smp_call_function_many_cond(const struct c=
pumask *mask,
 		local_irq_restore(flags);
 	}
=20
+	/*
+	 * The IPI work has been queued and dispatched. On PREEMPT kernels,
+	 * tasks created through dup_task_struct() have task-local wait masks.
+	 * The boot init_task can fall back to cfd->cpumask when the mask is
+	 * not inlined, but other tasks still use task-local masks and cannot
+	 * overwrite it. On !PREEMPT kernels, preempt_enable() cannot schedule
+	 * another task, so the per-CPU mask remains protected.
+	 */
+	put_cpu();
+
 	if (run_remote && wait) {
 		for_each_cpu(cpu, cpumask) {
 			call_single_data_t *csd;
@@ -964,15 +973,14 @@ static void smp_call_function_many_cond(const struct =
cpumask *mask,
=20
 /**
  * smp_call_function_many() - Run a function on a set of CPUs.
- * @mask: The set of cpus to run on (only runs on online subset).
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @wait: If true, wait (atomically) until function has completed
- *        on other CPUs.
+ * @mask:	The set of cpus to run on (only runs on online subset).
+ * @func:	The function to run. This must be fast and non-blocking.
+ * @info:	An arbitrary pointer to pass to the function.
+ * @wait:	If true, wait (atomically) until function has completed
+ *		on other CPUs.
  *
  * You must not call this function with disabled interrupts or from a
- * hardware interrupt handler or from a bottom half handler. Preemption
- * must be disabled when calling this function.
+ * hardware interrupt handler or from a bottom half handler.
  *
  * @func is not called on the local CPU even if @mask contains it.  Consid=
er
  * using on_each_cpu_cond_mask() instead if this is not desirable.
--=20
2.20.1
From nobody Wed Jul  1 04:37:00 2026
Received: from va-1-112.ptr.blmpb.com (va-1-112.ptr.blmpb.com
 [209.127.230.112])
	(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 1E7783EEAF9
	for <linux-kernel@vger.kernel.org>; Tue, 30 Jun 2026 11:12:29 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=209.127.230.112
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1782817951; cv=none;
 b=U9HH1s5LGh+KTsmNV2DSWasJ71ugv73tuUZNT2tYkFJAgLJ6FUtLTYxddYl4PQkMvJacyDfLVIvUDQEN3+zgVvJa6zPfG2d89nUvgaAdyz7rC2VhSjT+1vaKsMEHHsnrp1XTefiD7yvvJcUjPmIh3hWewRhq+oesdctyK7g+qfI=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1782817951; c=relaxed/simple;
	bh=YjRffYSkbruXhNEnghJAB35bvgNIWuHhxuo5JWkLJSg=;
	h=Content-Type:Subject:Date:Message-Id:Mime-Version:References:Cc:
	 From:To:In-Reply-To;
 b=JvNA3QPQE5grOqhkkXphO4UpI67d1WUmG4QjlpZutJZXCZv5g3sAFTmX1SgwCnFC8zBBSCR27JVx+yptl8I3y0+G6gLCY0b/HPRPC6Zip0DIKzq8WovhVxIIO3L0PGaZkR4k5Pw/XzBHpbo9EpS2n7E3Zlz3g5NjqkCUg6Y8DUk=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=bytedance.com;
 spf=pass smtp.mailfrom=bytedance.com;
 dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com
 header.b=V5eT4xpu; arc=none smtp.client-ip=209.127.230.112
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=bytedance.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=bytedance.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com
 header.b="V5eT4xpu"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed;
 s=2212171451; d=bytedance.com; t=1782817945; h=from:subject:
 mime-version:from:date:message-id:subject:to:cc:reply-to:content-type:
 mime-version:in-reply-to:message-id;
 bh=okuvsG9ZLsbYysyixOekPe28yfFvoZLcrmyOBpvo2Ek=;
 b=V5eT4xpuKF3FHHwSR8d6FzrD9Ib6I44cnf/MBYCuUmld0tEeDGGG/daIE29WY9VSeDf6tL
 +Jve3z877MRI6Gy8l4SZeT0FCLY1LI8bqqLY3fpmTepFq6X9h2d+djIri4PROELAL5EAeQ
 AXQ9As1s9GAnJmc/ol68g5Wojfzsp1RJ7OMgc6km3sCnuaF33qCECLkr3JSlJWapFc0hAx
 7Eb11jsxtvtXZ8yLyO6u1GWNOPVJijf7bT0BDPTDDhlk4gaZb6xsOvY4NCxO+p+vr63lh5
 DyI+kYATCmCDYnTERtDmxmxVsE1uIn4kWbydbwA1uRD/DraVgDZT15mYiT7EhA==
Content-Transfer-Encoding: quoted-printable
X-Mailer: git-send-email 2.20.1
X-Lms-Return-Path: 
 <lba+26a43a497+4463dd+vger.kernel.org+zhouchuyi@bytedance.com>
Subject: [PATCH v9 07/14] smp: Remove preempt_disable() from
 smp_call_function()
Date: Tue, 30 Jun 2026 19:10:01 +0800
X-Original-From: Chuyi Zhou <zhouchuyi@bytedance.com>
Message-Id: <20260630111008.2034376-8-zhouchuyi@bytedance.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
Mime-Version: 1.0
References: <20260630111008.2034376-1-zhouchuyi@bytedance.com>
Cc: <linux-kernel@vger.kernel.org>, "Chuyi Zhou" <zhouchuyi@bytedance.com>
From: "Chuyi Zhou" <zhouchuyi@bytedance.com>
To: <tglx@kernel.org>, <mingo@redhat.com>, <luto@kernel.org>,
	<peterz@infradead.org>, <paulmck@kernel.org>, <muchun.song@linux.dev>,
	<bp@alien8.de>, <dave.hansen@linux.intel.com>, <pbonzini@redhat.com>,
	<bigeasy@linutronix.de>, <clrkwllms@kernel.org>, <rostedt@goodmis.org>,
	<nadav.amit@gmail.com>, <vkuznets@redhat.com>
In-Reply-To: <20260630111008.2034376-1-zhouchuyi@bytedance.com>
Content-Type: text/plain; charset="utf-8"

smp_call_function_many_cond() handles the preemption and CPU pinning
requirements internally. smp_call_function() only forwards the request to
that helper for cpu_online_mask and does not access CPU-local state on
its own.

Remove the outer preempt_disable() and preempt_enable() pair from
smp_call_function(), and call smp_call_function_many_cond() directly
instead of going through smp_call_function_many().

Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
Tested-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Muchun Song <muchun.song@linux.dev>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 kernel/smp.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/kernel/smp.c b/kernel/smp.c
index 92f984754139..933a14dbb8f8 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -994,10 +994,10 @@ EXPORT_SYMBOL(smp_call_function_many);
=20
 /**
  * smp_call_function() - Run a function on all other CPUs.
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @wait: If true, wait (atomically) until function has completed
- *        on other CPUs.
+ * @func:	The function to run. This must be fast and non-blocking.
+ * @info:	An arbitrary pointer to pass to the function.
+ * @wait:	If true, wait (atomically) until function has completed
+ *		on other CPUs.
  *
  * If @wait is true, then returns once @func has returned; otherwise
  * it returns just before the target cpu calls @func.
@@ -1007,9 +1007,8 @@ EXPORT_SYMBOL(smp_call_function_many);
  */
 void smp_call_function(smp_call_func_t func, void *info, int wait)
 {
-	preempt_disable();
-	smp_call_function_many(cpu_online_mask, func, info, wait);
-	preempt_enable();
+	smp_call_function_many_cond(cpu_online_mask, func, info,
+				    wait ? SCF_WAIT : 0, NULL);
 }
 EXPORT_SYMBOL(smp_call_function);
=20
--=20
2.20.1
From nobody Wed Jul  1 04:37:00 2026
Received: from va-1-115.ptr.blmpb.com (va-1-115.ptr.blmpb.com
 [209.127.230.115])
	(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id DF0F63F39FD
	for <linux-kernel@vger.kernel.org>; Tue, 30 Jun 2026 11:12:44 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=209.127.230.115
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1782817966; cv=none;
 b=UWTE9XJnmz/4wVUPOJUt5R4y99uM3FaZ03CpssyNjhjJXbAQU9jBIjAZD6nsk7D4TnMXGoWzJD3rRpVZuJQp+owTORlsinJ3tSjvRAC2pgEb+h1y8jWWaD80FsWW6U8rJvzs9clc1pfIqWpWlAPtgdfewM0Rjl/Ma3l7mPgdmrc=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1782817966; c=relaxed/simple;
	bh=ukx5HvskabpAZjztpfhL5PkE0u5ZGJBfsBxCe+RGhuA=;
	h=Message-Id:To:From:Subject:References:In-Reply-To:Content-Type:Cc:
	 Date:Mime-Version;
 b=Df8JIuodDDt6TFk/IgpkSGJ93NC++uc1GTuyN7dYd0m+xU7Mrha75Pts0BOcdMeEpZ8q1mqkf6gJGhg/zfMi12mdc+qlre+eCkza5sRHwn5ds82b48tTw4DF+KpPhptp55v6g8+4YRir4ECMXS3zOnTi8SizgQ3NZu/8frEJcbw=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=bytedance.com;
 spf=pass smtp.mailfrom=bytedance.com;
 dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com
 header.b=FybfFnzC; arc=none smtp.client-ip=209.127.230.115
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=bytedance.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=bytedance.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com
 header.b="FybfFnzC"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed;
 s=2212171451; d=bytedance.com; t=1782817960; h=from:subject:
 mime-version:from:date:message-id:subject:to:cc:reply-to:content-type:
 mime-version:in-reply-to:message-id;
 bh=wOml0WtfCwOBElgf5Nw783xfpU6oJdBCsD9zvZyHN6A=;
 b=FybfFnzCrM9BS4d/3+tny0/LdNo7pPijVX2pITNrS+OsvAHK33MU+GdL5PctxxFT0pS1RW
 uuax+v/9zOzzqlVMp5jJY7Y7tJ7LkzaeNs4FWkDuGqrpt4iZRXKy6VqWNpcyuIBM/gKa6P
 0ApLRAm+AABt8WacAdpOy8SJN3RATmeC7omRIPyIuS7C2Polpa01L7Q+zhU1b6mw+Sab3s
 vnV8RoSik6Vvzte6gIkssY+wJtGG+cyV6WjpFM52MIyWCaqNE+cpz+lscUgu3QVBkpbnOp
 YE/1TzyJqaFtcdoA2OtKviLzrBG3Ld5FxOvrTVRh3jJxqTRd+4fOtx74dGi7TA==
Message-Id: <20260630111008.2034376-9-zhouchuyi@bytedance.com>
X-Original-From: Chuyi Zhou <zhouchuyi@bytedance.com>
X-Mailer: git-send-email 2.20.1
To: <tglx@kernel.org>, <mingo@redhat.com>, <luto@kernel.org>,
	<peterz@infradead.org>, <paulmck@kernel.org>, <muchun.song@linux.dev>,
	<bp@alien8.de>, <dave.hansen@linux.intel.com>, <pbonzini@redhat.com>,
	<bigeasy@linutronix.de>, <clrkwllms@kernel.org>, <rostedt@goodmis.org>,
	<nadav.amit@gmail.com>, <vkuznets@redhat.com>
Content-Transfer-Encoding: quoted-printable
From: "Chuyi Zhou" <zhouchuyi@bytedance.com>
Subject: [PATCH v9 08/14] smp: Remove preempt_disable() from
 on_each_cpu_cond_mask()
References: <20260630111008.2034376-1-zhouchuyi@bytedance.com>
In-Reply-To: <20260630111008.2034376-1-zhouchuyi@bytedance.com>
X-Lms-Return-Path: 
 <lba+26a43a4a6+688b28+vger.kernel.org+zhouchuyi@bytedance.com>
Cc: <linux-kernel@vger.kernel.org>, "Chuyi Zhou" <zhouchuyi@bytedance.com>
Date: Tue, 30 Jun 2026 19:10:02 +0800
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
Mime-Version: 1.0
Content-Type: text/plain; charset="utf-8"

smp_call_function_many_cond() handles the preemption and CPU pinning
requirements internally. on_each_cpu_cond_mask() only builds the call
flags and forwards the request to that helper.

Remove the outer preempt_disable() and preempt_enable() pair from
on_each_cpu_cond_mask().

Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
Tested-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Muchun Song <muchun.song@linux.dev>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 kernel/smp.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/smp.c b/kernel/smp.c
index 933a14dbb8f8..db0a123911d1 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -1131,9 +1131,7 @@ void on_each_cpu_cond_mask(smp_cond_func_t cond_func,=
 smp_call_func_t func,
 	if (wait)
 		scf_flags |=3D SCF_WAIT;
=20
-	preempt_disable();
 	smp_call_function_many_cond(mask, func, info, scf_flags, cond_func);
-	preempt_enable();
 }
 EXPORT_SYMBOL(on_each_cpu_cond_mask);
=20
--=20
2.20.1
From nobody Wed Jul  1 04:37:00 2026
Received: from va-1-115.ptr.blmpb.com (va-1-115.ptr.blmpb.com
 [209.127.230.115])
	(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 6F4743F5BC3
	for <linux-kernel@vger.kernel.org>; Tue, 30 Jun 2026 11:12:59 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=209.127.230.115
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1782817980; cv=none;
 b=TQHCnP3nlBvg6ZrRPHSuNCV0lwvqqOu8ti/fQtiX/xbNPEdkBRU0Msqz9TxzIxk/+Nt3RbYl3uip52C2nvQ2EKc+e+oDit9nw91S61bvZlmQq9TDilhB/41+Agju2WrFeDbX+uDSz8zjSd8fmpqJXIP6RzfsiBvfCAiQzLp8fbs=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1782817980; c=relaxed/simple;
	bh=yt7c0j6x77S2SizmfX/fsuNe7+AbUheUjDqYKCv5TRA=;
	h=References:Cc:Mime-Version:In-Reply-To:From:Subject:Date:
	 Content-Type:To:Message-Id;
 b=nZbkNrNQxZ8ansVWqb9+zGRpZ4xfrXJ40nbUGklg6CCH8y+7AwnDgjMXoUSeIxyT/CQnuUbknuBkDPyZsitmygaH42Ky89J/pIcflgQv3RUyGtn1wszO51WiOw1nTBf9ZtAurSnQ+amh5i7VPLBcgFiFqp8J/H9FmUjY2zvh4CU=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=bytedance.com;
 spf=pass smtp.mailfrom=bytedance.com;
 dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com
 header.b=XT+Ffsen; arc=none smtp.client-ip=209.127.230.115
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=bytedance.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=bytedance.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com
 header.b="XT+Ffsen"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed;
 s=2212171451; d=bytedance.com; t=1782817974; h=from:subject:
 mime-version:from:date:message-id:subject:to:cc:reply-to:content-type:
 mime-version:in-reply-to:message-id;
 bh=zn4ApWf7zS/cIePwAwfIzCjyw1S/eS0J2Gja9AjJtjc=;
 b=XT+FfsenDJRGFYsUm/z57ttwk2CU3lUFHcjFFJGVZbvDP5qT1NuMVN+SO/NQAbW9pA+LPr
 V7G8Put9SfNnAPD8Coo00+chMpfe4Iw2VHsSUD+KMDwP+ILsdk7Sf2tqDu0a5hy/y/uZEV
 DyMFpykEgSWdA+SaQIGRVlXEytgUCFb7MJYXLQ0isS/ydtHVqUp5SuwOW8kWCOjsZYFlzz
 VFPSnJmuVnYg5MD2XCoXMlb/Id9TYw7bR+BheCBHr0T5bDVDnpFUnVmfrKnSo0vgOJlhwD
 KafQus3cjWnY71KWikZDBg4iMRvYRK05lCDczb4wjpU8eYcLybD+HcmiL3q+qA==
References: <20260630111008.2034376-1-zhouchuyi@bytedance.com>
Cc: <linux-kernel@vger.kernel.org>, "Chuyi Zhou" <zhouchuyi@bytedance.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
Mime-Version: 1.0
Content-Transfer-Encoding: quoted-printable
In-Reply-To: <20260630111008.2034376-1-zhouchuyi@bytedance.com>
X-Lms-Return-Path: 
 <lba+26a43a4b4+34cb73+vger.kernel.org+zhouchuyi@bytedance.com>
From: "Chuyi Zhou" <zhouchuyi@bytedance.com>
Subject: [PATCH v9 09/14] scftorture: Remove preempt_disable() in
 scftorture_invoke_one()
Date: Tue, 30 Jun 2026 19:10:03 +0800
X-Original-From: Chuyi Zhou <zhouchuyi@bytedance.com>
X-Mailer: git-send-email 2.20.1
To: <tglx@kernel.org>, <mingo@redhat.com>, <luto@kernel.org>,
	<peterz@infradead.org>, <paulmck@kernel.org>, <muchun.song@linux.dev>,
	<bp@alien8.de>, <dave.hansen@linux.intel.com>, <pbonzini@redhat.com>,
	<bigeasy@linutronix.de>, <clrkwllms@kernel.org>, <rostedt@goodmis.org>,
	<nadav.amit@gmail.com>, <vkuznets@redhat.com>
Message-Id: <20260630111008.2034376-10-zhouchuyi@bytedance.com>
Content-Type: text/plain; charset="utf-8"

The smp_call*() functions handle their required preemption and CPU
pinning internally. The explicit preempt_disable() in
scftorture_invoke_one() is therefore no longer required for correctness.
Keeping the outer preempt_disable() would also prevent scftorture from
exercising the narrowed internal preemption-disabled regions during IPI
dispatch.

Removing the outer preemption protection can expose a CPU hotplug race in
the test validation when use_cpus_read_lock is false. For multicast
operations, SCF_PRIM_MANY or SCF_PRIM_ALL, if only one CPU is online,
smp_call_function_many() correctly skips sending IPIs and leaves scfc_out
false. Without preemption disabled, a CPU hotplug thread can preempt the
test thread, bring a second CPU online and increment num_online_cpus().
When the test thread resumes, the validation check can observe
num_online_cpus() > 1 and falsely trigger the memory-ordering warning,
leaking the scfcp structure.

Remove the preempt_disable() and preempt_enable() pairs around the
smp_call*() invocations in scftorture_invoke_one(). Restrict the
num_online_cpus() > 1 validation to the use_cpus_read_lock=3Dtrue case,
where the CPU count is stable during the evaluation. SCF_PRIM_SINGLE and
SCF_PRIM_SINGLE_RPC are validated regardless of the online CPU count.

Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
Tested-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 kernel/scftorture.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/kernel/scftorture.c b/kernel/scftorture.c
index 327c315f411c..2082f9b44370 100644
--- a/kernel/scftorture.c
+++ b/kernel/scftorture.c
@@ -348,6 +348,8 @@ static void scftorture_invoke_one(struct scf_statistics=
 *scfp, struct torture_ra
 	int ret =3D 0;
 	struct scf_check *scfcp =3D NULL;
 	struct scf_selector *scfsp =3D scf_sel_rand(trsp);
+	bool is_single =3D (scfsp->scfs_prim =3D=3D SCF_PRIM_SINGLE ||
+			  scfsp->scfs_prim =3D=3D SCF_PRIM_SINGLE_RPC);
=20
 	if (scfsp->scfs_prim =3D=3D SCF_PRIM_SINGLE || scfsp->scfs_wait) {
 		scfcp =3D kmalloc_obj(*scfcp, GFP_ATOMIC);
@@ -364,8 +366,6 @@ static void scftorture_invoke_one(struct scf_statistics=
 *scfp, struct torture_ra
 	}
 	if (use_cpus_read_lock)
 		cpus_read_lock();
-	else
-		preempt_disable();
 	switch (scfsp->scfs_prim) {
 	case SCF_PRIM_RESCHED:
 		if (IS_BUILTIN(CONFIG_SCF_TORTURE_TEST)) {
@@ -411,13 +411,10 @@ static void scftorture_invoke_one(struct scf_statisti=
cs *scfp, struct torture_ra
 		if (!ret) {
 			if (use_cpus_read_lock)
 				cpus_read_unlock();
-			else
-				preempt_enable();
+
 			wait_for_completion(&scfcp->scfc_completion);
 			if (use_cpus_read_lock)
 				cpus_read_lock();
-			else
-				preempt_disable();
 		} else {
 			scfp->n_single_rpc_ofl++;
 			scf_add_to_free_list(scfcp);
@@ -452,7 +449,7 @@ static void scftorture_invoke_one(struct scf_statistics=
 *scfp, struct torture_ra
 			scfcp->scfc_out =3D true;
 	}
 	if (scfcp && scfsp->scfs_wait) {
-		if (WARN_ON_ONCE((num_online_cpus() > 1 || scfsp->scfs_prim =3D=3D SCF_P=
RIM_SINGLE) &&
+		if (WARN_ON_ONCE(((use_cpus_read_lock && num_online_cpus() > 1) || is_si=
ngle) &&
 				 !scfcp->scfc_out)) {
 			pr_warn("%s: Memory-ordering failure, scfs_prim: %d.\n", __func__, scfs=
p->scfs_prim);
 			atomic_inc(&n_mb_out_errs); // Leak rather than trash!
@@ -463,8 +460,6 @@ static void scftorture_invoke_one(struct scf_statistics=
 *scfp, struct torture_ra
 	}
 	if (use_cpus_read_lock)
 		cpus_read_unlock();
-	else
-		preempt_enable();
 	if (allocfail)
 		schedule_timeout_idle((1 + longwait) * HZ);  // Let no-wait handlers com=
plete.
 	else if (!(torture_random(trsp) & 0xfff))
--=20
2.20.1
From nobody Wed Jul  1 04:37:00 2026
Received: from va-1-115.ptr.blmpb.com (va-1-115.ptr.blmpb.com
 [209.127.230.115])
	(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 334F93F5BC3
	for <linux-kernel@vger.kernel.org>; Tue, 30 Jun 2026 11:13:13 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=209.127.230.115
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1782817995; cv=none;
 b=Oo1jbEX2LZLQnRbau9721xwuvcNTXMlhQTGWwLDREGmCgxw4naaCsikh4q9kcV9IgEll13m2ltKsLi9KnGPQ3KAcM6ylz0nQ+tptz/bSsU75/gckNlqZTEk14uln++AlEE8GXEihj4jpds3t5qxcycCfLfpvqpUpuSF+Ws7RozU=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1782817995; c=relaxed/simple;
	bh=yApagF94LubP0VCgXvTAsnfvOBqN595gZ1GlcNZANWs=;
	h=Message-Id:To:Cc:In-Reply-To:Subject:Mime-Version:References:
	 Content-Type:From:Date;
 b=aDxlkBGFmTsZQ0fS+yh+Q/3Ek5L/MFhtQ1084cMV6ruDhJfN8i32Gdht3BQv7Vtv1MgDiRZcNU1yqgNlfs9Hwe+/Ji44kphG0Ci1j8HyJkeIRMND0lXETM3lnH2QtbWoWTREvb8BDAsrONO62Im/EuvDw79S7hrhwi0XsThbMaA=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=bytedance.com;
 spf=pass smtp.mailfrom=bytedance.com;
 dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com
 header.b=KmqsH0Ca; arc=none smtp.client-ip=209.127.230.115
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=bytedance.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=bytedance.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com
 header.b="KmqsH0Ca"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed;
 s=2212171451; d=bytedance.com; t=1782817989; h=from:subject:
 mime-version:from:date:message-id:subject:to:cc:reply-to:content-type:
 mime-version:in-reply-to:message-id;
 bh=s+Gplx3/f9PuZSrKLJEqjmNSeg4v0fOKBtkRf1ITjVo=;
 b=KmqsH0CaoTGInyax3MBx9yClyBiTpI1aARu7Y8CGikcuNN5ixMZoM26jjwamJ3+OHfqhkO
 mCkMNJCmmj5e8iBesIv/GLm/5swrhqY91xGUU8OYNsBTOIZziufaN/O8okPMrTPvm6fxfA
 sAtUu9PcXvnInt88kKTDYx5aOSkXBR+7MXaqe1ggN/9YFIoBbxa1cQ9bimsoSHa1FlLl51
 2FLX6sK6bTJh/+lMjOS/HfDZwNdb5bH3lftfI0Bq0kbAZJlfUxlIz5VQjKvjVZ6YaKT+td
 PZju5rn5vB0xXqg+m1QibPq0kSlsrQ7tR2ew0lFy+X5Zecy3Mah+uCBTv2fdgA==
Message-Id: <20260630111008.2034376-11-zhouchuyi@bytedance.com>
X-Original-From: Chuyi Zhou <zhouchuyi@bytedance.com>
X-Mailer: git-send-email 2.20.1
To: <tglx@kernel.org>, <mingo@redhat.com>, <luto@kernel.org>,
	<peterz@infradead.org>, <paulmck@kernel.org>, <muchun.song@linux.dev>,
	<bp@alien8.de>, <dave.hansen@linux.intel.com>, <pbonzini@redhat.com>,
	<bigeasy@linutronix.de>, <clrkwllms@kernel.org>, <rostedt@goodmis.org>,
	<nadav.amit@gmail.com>, <vkuznets@redhat.com>
Cc: <linux-kernel@vger.kernel.org>, "Chuyi Zhou" <zhouchuyi@bytedance.com>
In-Reply-To: <20260630111008.2034376-1-zhouchuyi@bytedance.com>
Subject: [PATCH v9 10/14] x86/mm: Factor out flush_tlb_info initialization
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
Mime-Version: 1.0
References: <20260630111008.2034376-1-zhouchuyi@bytedance.com>
Content-Transfer-Encoding: quoted-printable
From: "Chuyi Zhou" <zhouchuyi@bytedance.com>
Date: Tue, 30 Jun 2026 19:10:04 +0800
X-Lms-Return-Path: 
 <lba+26a43a4c3+ec31ce+vger.kernel.org+zhouchuyi@bytedance.com>
Content-Type: text/plain; charset="utf-8"

get_flush_tlb_info() has two responsibilities: it reserves the per-CPU
flush_tlb_info storage and it initializes the fields that describe the
flush operation. The per-CPU storage also carries the DEBUG_VM
reentrancy check and the matching put_flush_tlb_info() lifetime rules.

Moving flush_tlb_info back to caller-provided storage requires the same
field initialization without tying the caller to the per-CPU object.
Leaving the field setup embedded in get_flush_tlb_info() would either
keep those callers tied to the per-CPU object or duplicate the
initialization logic.

Split the field setup into init_flush_tlb_info(). Keep the per-CPU
storage selection, DEBUG_VM reentrancy check and put_flush_tlb_info()
lifetime rules in get_flush_tlb_info().

No functional change intended.

Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 arch/x86/mm/tlb.c | 42 +++++++++++++++++++++++++++---------------
 1 file changed, 27 insertions(+), 15 deletions(-)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index af43d177087e..f76f576d3899 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1379,22 +1379,12 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_t=
lb_info, flush_tlb_info);
 static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
 #endif
=20
-static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
-			unsigned long start, unsigned long end,
-			unsigned int stride_shift, bool freed_tables,
-			u64 new_tlb_gen)
+static void init_flush_tlb_info(struct flush_tlb_info *info,
+				struct mm_struct *mm,
+				unsigned long start, unsigned long end,
+				unsigned int stride_shift, bool freed_tables,
+				u64 new_tlb_gen)
 {
-	struct flush_tlb_info *info =3D this_cpu_ptr(&flush_tlb_info);
-
-#ifdef CONFIG_DEBUG_VM
-	/*
-	 * Ensure that the following code is non-reentrant and flush_tlb_info
-	 * is not overwritten. This means no TLB flushing is initiated by
-	 * interrupt handlers and machine-check exception handlers.
-	 */
-	BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) !=3D 1);
-#endif
-
 	/*
 	 * If the number of flushes is so large that a full flush
 	 * would be faster, do a full flush.
@@ -1412,6 +1402,28 @@ static struct flush_tlb_info *get_flush_tlb_info(str=
uct mm_struct *mm,
 	info->new_tlb_gen	=3D new_tlb_gen;
 	info->initiating_cpu	=3D smp_processor_id();
 	info->trim_cpumask	=3D 0;
+}
+
+static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
+						 unsigned long start,
+						 unsigned long end,
+						 unsigned int stride_shift,
+						 bool freed_tables,
+						 u64 new_tlb_gen)
+{
+	struct flush_tlb_info *info =3D this_cpu_ptr(&flush_tlb_info);
+
+#ifdef CONFIG_DEBUG_VM
+	/*
+	 * Ensure that the following code is non-reentrant and flush_tlb_info
+	 * is not overwritten. This means no TLB flushing is initiated by
+	 * interrupt handlers and machine-check exception handlers.
+	 */
+	BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) !=3D 1);
+#endif
+
+	init_flush_tlb_info(info, mm, start, end, stride_shift, freed_tables,
+			    new_tlb_gen);
=20
 	return info;
 }
--=20
2.20.1
From nobody Wed Jul  1 04:37:00 2026
Received: from va-1-113.ptr.blmpb.com (va-1-113.ptr.blmpb.com
 [209.127.230.113])
	(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id E8E6B388E51
	for <linux-kernel@vger.kernel.org>; Tue, 30 Jun 2026 11:13:28 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=209.127.230.113
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1782818010; cv=none;
 b=hOEaxCZM3pBw0phXgc390QwQskipmxQ2+8qPxg6NNc7Fd1eCuefRSsXcmYPIqvBqql4wI1Em7RqDb5AGBljVrHOmqJwt+htyyaNeRynVBOKO+UB0ym+U95WCSvaEY7UbJqLJZqHRSN2YJKBq6aARo3slvPHZZGm7q+JteR3nX1s=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1782818010; c=relaxed/simple;
	bh=PWuLewym18ywmLhT0X8MgjiVE8ytrH7YaJHgqXpgGPM=;
	h=References:To:Cc:Mime-Version:Content-Type:From:Subject:
	 Message-Id:In-Reply-To:Date;
 b=DIhuFS3xVNPKizUWY+4yMaYrUXLx0opAtKEA4zfr0tP+1mxXO3Py2sEOZhy+0b3EdJDc7Bnf6jUmtPriJZAFVIpkPTP26pnsYUh5frSh6g9P0gKD+xkddMCGTo/uLiTweaEpjdpjrWcUNkxL67dZi1A3+dz/2+i6BWlGNt4Hr3M=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=bytedance.com;
 spf=pass smtp.mailfrom=bytedance.com;
 dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com
 header.b=qQCK0CSd; arc=none smtp.client-ip=209.127.230.113
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=bytedance.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=bytedance.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com
 header.b="qQCK0CSd"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed;
 s=2212171451; d=bytedance.com; t=1782818003; h=from:subject:
 mime-version:from:date:message-id:subject:to:cc:reply-to:content-type:
 mime-version:in-reply-to:message-id;
 bh=q9KZzW4+JeMyav6u2qNF3CUJqHIhLeehRsZhpYVcQbI=;
 b=qQCK0CSdGspS0qMnN6lUupStpxWVqSaE1YCMoZMEljQNvr18gYxmpl+l3y14RsWdrLaPyy
 IWdsAICws2i99KdWHQlFFgxnmMv9Djn+L5TqgCUFecmcABtFlBTLJ2Byt6ruaSOdDm0Nqf
 tbk+Zv3/aQVAb7jVu+A6V+kC0vYAlNbojvOWhDHQJBJ11QrwG5B3ud6FqHuIpe+Zod9a3h
 cqaH2fosXzd51In39z4ZbO5yo7wu4ZJI0RO3mMu2am035emjdmL7HHcvt4USzoCujuYovX
 ++OlE4fW0SMKVId504h0tuHcvB/bTz5YAfmy488UwKbkUxHPJ3HNc3nM+hqGnw==
X-Original-From: Chuyi Zhou <zhouchuyi@bytedance.com>
X-Lms-Return-Path: 
 <lba+26a43a4d1+f42942+vger.kernel.org+zhouchuyi@bytedance.com>
References: <20260630111008.2034376-1-zhouchuyi@bytedance.com>
To: <tglx@kernel.org>, <mingo@redhat.com>, <luto@kernel.org>,
	<peterz@infradead.org>, <paulmck@kernel.org>, <muchun.song@linux.dev>,
	<bp@alien8.de>, <dave.hansen@linux.intel.com>, <pbonzini@redhat.com>,
	<bigeasy@linutronix.de>, <clrkwllms@kernel.org>, <rostedt@goodmis.org>,
	<nadav.amit@gmail.com>, <vkuznets@redhat.com>
Cc: <linux-kernel@vger.kernel.org>, "Chuyi Zhou" <zhouchuyi@bytedance.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
Mime-Version: 1.0
X-Mailer: git-send-email 2.20.1
From: "Chuyi Zhou" <zhouchuyi@bytedance.com>
Subject: [PATCH v9 11/14] x86/mm: Cap flush_tlb_info alignment at 64 bytes
Message-Id: <20260630111008.2034376-12-zhouchuyi@bytedance.com>
In-Reply-To: <20260630111008.2034376-1-zhouchuyi@bytedance.com>
Content-Transfer-Encoding: quoted-printable
Date: Tue, 30 Jun 2026 19:10:05 +0800
Content-Type: text/plain; charset="utf-8"

A stack-allocated flush_tlb_info should keep cacheline alignment to avoid
the regression that motivated the per-CPU storage, but using
SMP_CACHE_BYTES directly can make the stack frame grow excessively on
configurations with large cache lines. That problem is what led to
commit 780e0106d468 ("x86/mm/tlb: Revert "x86/mm: Align TLB
invalidation info""), which reverted the alignment after stack
consumption reached 320 bytes.

Add FLUSH_TLB_INFO_ALIGN and cap the type alignment at 64 bytes. The
existing per-CPU flush_tlb_info instance remains
DEFINE_PER_CPU_SHARED_ALIGNED(), so its per-CPU shared-cacheline
alignment is unchanged.

This prepares for moving flush_tlb_info back to stack storage without
reintroducing the old large-cacheline stack usage problem.

Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 arch/x86/include/asm/tlbflush.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflus=
h.h
index 0545fe75c3fa..70098d448e99 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -4,6 +4,7 @@
=20
 #include <linux/mm_types.h>
 #include <linux/mmu_notifier.h>
+#include <linux/minmax.h>
 #include <linux/sched.h>
=20
 #include <asm/barrier.h>
@@ -211,6 +212,12 @@ extern u16 invlpgb_count_max;
=20
 extern void initialize_tlbstate_and_flush(void);
=20
+/*
+ * Keep stack-allocated flush_tlb_info cacheline aligned, but cap the
+ * alignment to avoid excessive stack usage on large-cacheline systems.
+ */
+#define FLUSH_TLB_INFO_ALIGN MIN(SMP_CACHE_BYTES, 64)
+
 /*
  * TLB flushing:
  *
@@ -249,7 +256,7 @@ struct flush_tlb_info {
 	u8			stride_shift;
 	u8			freed_tables;
 	u8			trim_cpumask;
-};
+} __aligned(FLUSH_TLB_INFO_ALIGN);
=20
 void flush_tlb_local(void);
 void flush_tlb_one_user(unsigned long addr);
--=20
2.20.1
From nobody Wed Jul  1 04:37:00 2026
Received: from va-1-111.ptr.blmpb.com (va-1-111.ptr.blmpb.com
 [209.127.230.111])
	(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 40A053CB8F1
	for <linux-kernel@vger.kernel.org>; Tue, 30 Jun 2026 11:13:43 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=209.127.230.111
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1782818025; cv=none;
 b=ooZFqW1o3xI7IN6q97cU2FVpfQVY13es3c9WyZLFRZFYKh4FxLLXODERSL/bYCqOQmAJs5je4mHLh85eUBuHPXHV+thyg30G07VVy+RGuzSNT6k6q1aqagpS8Q/fi8Cz+jQkysNzoS1aiYUYSdl7jLu/yw7WD1rkLyqBv4jNAKo=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1782818025; c=relaxed/simple;
	bh=MwqqQTfIa57tnVBXYw4vF+8H216Pip+agR0sLS3Orh8=;
	h=Mime-Version:References:Date:From:Subject:Message-Id:Cc:
	 Content-Type:To:In-Reply-To;
 b=WMSOwv0GpdSan54Wa2vj77394ZgxElBE/RSZF5oKdzQhiUHh7a8V+NA+DZWuemSHQT+XPx8M9slyKCLZI97HjFkPNaH/CobM+HHVEixCxAA0jPzlVuOSUEXmjEsb9WJdHRsijfOFZkDG1awbQCnrRFwz/OeA5qs3GOTKGqkFN7A=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=bytedance.com;
 spf=pass smtp.mailfrom=bytedance.com;
 dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com
 header.b=erFrD2Vr; arc=none smtp.client-ip=209.127.230.111
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=bytedance.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=bytedance.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com
 header.b="erFrD2Vr"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed;
 s=2212171451; d=bytedance.com; t=1782818018; h=from:subject:
 mime-version:from:date:message-id:subject:to:cc:reply-to:content-type:
 mime-version:in-reply-to:message-id;
 bh=ZRdWhpGTbfevk9is9D4XlWRWuWyVfrkVAA4gPLda7Ns=;
 b=erFrD2VrDf/no1rmu6rEjDzCVkCmSjN4Xn7aPzqUsK8b9GQ/+tzOOULF8K8Yz44WOnuJcd
 6LNzizCsk2+56CxLHP7UhEx0I4SQKXtgqYPgpiD9poA7VAvDn8VmxU2lwDdCqgX0MBa7U+
 RYPSLdMxE0ZxoNQAVrzxkE3Njc4OlrwyvP2sZmeDEtskTIes7OOqoBbydzGdkL4Ylt8qNh
 XRLR+SFIfhSwYIMtkHQiqaJsAcBf1iK8sRUAV817tQHAh5A/rOjkCuP7JqdqRJo06ImJW6
 CSOG0hkhHwe83PXdqW2zS/PqwPEb5G89MJsn1mg7Rsw1T+mFRu0h6N9c7tVHYA==
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
Mime-Version: 1.0
References: <20260630111008.2034376-1-zhouchuyi@bytedance.com>
X-Original-From: Chuyi Zhou <zhouchuyi@bytedance.com>
Date: Tue, 30 Jun 2026 19:10:06 +0800
From: "Chuyi Zhou" <zhouchuyi@bytedance.com>
Subject: [PATCH v9 12/14] x86/mm: Move flush_tlb_info back to the stack
Message-Id: <20260630111008.2034376-13-zhouchuyi@bytedance.com>
Cc: <linux-kernel@vger.kernel.org>, "Chuyi Zhou" <zhouchuyi@bytedance.com>
X-Lms-Return-Path: 
 <lba+26a43a4e0+e7dd54+vger.kernel.org+zhouchuyi@bytedance.com>
To: <tglx@kernel.org>, <mingo@redhat.com>, <luto@kernel.org>,
	<peterz@infradead.org>, <paulmck@kernel.org>, <muchun.song@linux.dev>,
	<bp@alien8.de>, <dave.hansen@linux.intel.com>, <pbonzini@redhat.com>,
	<bigeasy@linutronix.de>, <clrkwllms@kernel.org>, <rostedt@goodmis.org>,
	<nadav.amit@gmail.com>, <vkuznets@redhat.com>
Content-Transfer-Encoding: quoted-printable
X-Mailer: git-send-email 2.20.1
In-Reply-To: <20260630111008.2034376-1-zhouchuyi@bytedance.com>
Content-Type: text/plain; charset="utf-8"

flush_tlb_info benefits from cacheline alignment, but using
cacheline-aligned stack storage directly can grow stack usage too much on
configurations with large SMP_CACHE_BYTES values. Commit 515ab7c41306
("x86/mm: Align TLB invalidation info") attempted to align stack storage,
and commit 780e0106d468 ("x86/mm/tlb: Revert "x86/mm: Align TLB
invalidation info"") reverted it because using SMP_CACHE_BYTES led to 320
bytes of stack consumption. Commit 3db6d5a5ecaf ("x86/mm/tlb: Remove
'struct flush_tlb_info' from the stack") moved flush_tlb_info to per-CPU
storage, which avoided the stack growth problem while preserving
cacheline alignment. That was a good fit while the callers kept
preemption disabled for the whole flush operation.

However, a single per-CPU flush_tlb_info also requires all flush_tlb*()
operations to keep preemption disabled while the object is in use, so
that it cannot be overwritten by another flush on the same CPU.
flush_tlb*() may send IPIs to remote CPUs and synchronously wait for all
remote CPUs to complete their local TLB flushes. That wait can take tens
of milliseconds when interrupts are disabled on a remote CPU or when a
large number of remote CPUs are involved.

To shorten the CPU-pinned and preemption-disabled section around those
remote TLB flush waits, move flush_tlb_info back to caller-private stack
storage. The caller then does not have to stay on the same CPU until the
remote flush completes.

The type alignment is capped at 64 bytes. This keeps the alignment
benefit for stack objects without reintroducing the old large-cacheline
stack usage problem.

To evaluate the performance impact, use the following script to reproduce
the microbenchmark mentioned in commit 3db6d5a5ecaf ("x86/mm/tlb: Remove
'struct flush_tlb_info' from the stack"). The test environment is an Ice
Lake system (Intel(R) Xeon(R) Platinum 8336C) with 128 CPUs and 2 NUMA
nodes. During the test, the threads were bound to specific CPUs, and both
pti and mitigations were disabled:

    #include <stdio.h>
    #include <stdlib.h>
    #include <pthread.h>
    #include <sys/mman.h>
    #include <sys/time.h>
    #include <unistd.h>

    #define NUM_OPS 1000000
    #define NUM_THREADS 3
    #define NUM_RUNS 5
    #define PAGE_SIZE 4096

    volatile int stop_threads =3D 0;

    void *busy_wait_thread(void *arg) {
        while (!stop_threads) {
            __asm__ volatile ("nop");
        }
        return NULL;
    }

    long long get_usec() {
        struct timeval tv;
        gettimeofday(&tv, NULL);
        return tv.tv_sec * 1000000LL + tv.tv_usec;
    }

    int main() {
        pthread_t threads[NUM_THREADS];
        char *addr;
        int i, r;
        addr =3D mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE
                | MAP_ANONYMOUS, -1, 0);

        if (addr =3D=3D MAP_FAILED) {
            perror("mmap");
            exit(1);
        }

        for (i =3D 0; i < NUM_THREADS; i++) {
            if (pthread_create(&threads[i], NULL, busy_wait_thread, NULL))
                exit(1);
        }

        printf("Running benchmark: %d runs, %d ops each, %d background\n"
               "threads\n", NUM_RUNS, NUM_OPS, NUM_THREADS);

        for (r =3D 0; r < NUM_RUNS; r++) {
            long long start, end;
            start =3D get_usec();
            for (i =3D 0; i < NUM_OPS; i++) {
                addr[0] =3D 1;
                if (madvise(addr, PAGE_SIZE, MADV_DONTNEED)) {
                    perror("madvise");
                    exit(1);
                }
            }
            end =3D get_usec();
            double duration =3D (double)(end - start);
            double avg_lat =3D duration / NUM_OPS;
            printf("Run %d: Total time %.2f us, Avg latency %.4f us/op\n",
                   r + 1, duration, avg_lat);
        }
        stop_threads =3D 1;
        for (i =3D 0; i < NUM_THREADS; i++)
            pthread_join(threads[i], NULL);
        munmap(addr, PAGE_SIZE);
        return 0;
    }

                   base    on-stack-aligned    on-stack-not-aligned
                  ------   ----------------    --------------------
avg (usec/op)     2.5278        2.5261               2.5508
stddev            0.0007        0.0027               0.0023

The benchmark results show that the average latency difference between
the baseline (base) and the properly aligned stack variable
(on-stack-aligned) is 0.0017 usec/op, which is smaller than the
on-stack-aligned standard deviation (0.0027). This indicates that the
variation is testing noise, and that reverting to a stack variable with
proper alignment causes no performance regression compared to the
per-CPU implementation. The unaligned version (on-stack-not-aligned)
shows a minor performance drop of roughly 0.9%. The
CPU-pinned/preemption-disabled section can therefore be shortened
without sacrificing performance.

With caller-private storage there is no shared per-CPU object to protect,
so remove the DEBUG_VM reentrancy counter as well.

Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
Tested-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Nadav Amit <nadav.amit@gmail.com>
---
 arch/x86/mm/tlb.c | 80 +++++++++++------------------------------------
 1 file changed, 18 insertions(+), 62 deletions(-)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index f76f576d3899..0620c001981f 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1373,12 +1373,6 @@ void flush_tlb_multi(const struct cpumask *cpumask,
  */
 unsigned long tlb_single_page_flush_ceiling __read_mostly =3D 33;
=20
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info=
);
-
-#ifdef CONFIG_DEBUG_VM
-static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
-#endif
-
 static void init_flush_tlb_info(struct flush_tlb_info *info,
 				struct mm_struct *mm,
 				unsigned long start, unsigned long end,
@@ -1404,52 +1398,19 @@ static void init_flush_tlb_info(struct flush_tlb_in=
fo *info,
 	info->trim_cpumask	=3D 0;
 }
=20
-static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
-						 unsigned long start,
-						 unsigned long end,
-						 unsigned int stride_shift,
-						 bool freed_tables,
-						 u64 new_tlb_gen)
-{
-	struct flush_tlb_info *info =3D this_cpu_ptr(&flush_tlb_info);
-
-#ifdef CONFIG_DEBUG_VM
-	/*
-	 * Ensure that the following code is non-reentrant and flush_tlb_info
-	 * is not overwritten. This means no TLB flushing is initiated by
-	 * interrupt handlers and machine-check exception handlers.
-	 */
-	BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) !=3D 1);
-#endif
-
-	init_flush_tlb_info(info, mm, start, end, stride_shift, freed_tables,
-			    new_tlb_gen);
-
-	return info;
-}
-
-static void put_flush_tlb_info(void)
-{
-#ifdef CONFIG_DEBUG_VM
-	/* Complete reentrancy prevention checks */
-	barrier();
-	this_cpu_dec(flush_tlb_info_idx);
-#endif
-}
-
 void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 				unsigned long end, unsigned int stride_shift,
 				bool freed_tables)
 {
-	struct flush_tlb_info *info;
+	struct flush_tlb_info info;
 	int cpu =3D get_cpu();
 	u64 new_tlb_gen;
=20
 	/* This is also a barrier that synchronizes with switch_mm(). */
 	new_tlb_gen =3D inc_mm_tlb_gen(mm);
=20
-	info =3D get_flush_tlb_info(mm, start, end, stride_shift, freed_tables,
-				  new_tlb_gen);
+	init_flush_tlb_info(&info, mm, start, end, stride_shift, freed_tables,
+			    new_tlb_gen);
=20
 	/*
 	 * flush_tlb_multi() is not optimized for the common case in which only
@@ -1457,19 +1418,18 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsig=
ned long start,
 	 * flush_tlb_func_local() directly in this case.
 	 */
 	if (mm_global_asid(mm)) {
-		broadcast_tlb_flush(info);
+		broadcast_tlb_flush(&info);
 	} else if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
-		info->trim_cpumask =3D should_trim_cpumask(mm);
-		flush_tlb_multi(mm_cpumask(mm), info);
+		info.trim_cpumask =3D should_trim_cpumask(mm);
+		flush_tlb_multi(mm_cpumask(mm), &info);
 		consider_global_asid(mm);
 	} else if (mm =3D=3D this_cpu_read(cpu_tlbstate.loaded_mm)) {
 		lockdep_assert_irqs_enabled();
 		local_irq_disable();
-		flush_tlb_func(info);
+		flush_tlb_func(&info);
 		local_irq_enable();
 	}
=20
-	put_flush_tlb_info();
 	put_cpu();
 	mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
 }
@@ -1539,19 +1499,16 @@ static void kernel_tlb_flush_range(struct flush_tlb=
_info *info)
=20
 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
-	struct flush_tlb_info *info;
+	struct flush_tlb_info info;
=20
 	guard(preempt)();
+	init_flush_tlb_info(&info, NULL, start, end, PAGE_SHIFT, false,
+			    TLB_GENERATION_INVALID);
=20
-	info =3D get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false,
-				  TLB_GENERATION_INVALID);
-
-	if (info->end =3D=3D TLB_FLUSH_ALL)
-		kernel_tlb_flush_all(info);
+	if (info.end =3D=3D TLB_FLUSH_ALL)
+		kernel_tlb_flush_all(&info);
 	else
-		kernel_tlb_flush_range(info);
-
-	put_flush_tlb_info();
+		kernel_tlb_flush_range(&info);
 }
=20
 /*
@@ -1719,12 +1676,12 @@ EXPORT_SYMBOL_FOR_KVM(__flush_tlb_all);
=20
 void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 {
-	struct flush_tlb_info *info;
+	struct flush_tlb_info info;
=20
 	int cpu =3D get_cpu();
=20
-	info =3D get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false,
-				  TLB_GENERATION_INVALID);
+	init_flush_tlb_info(&info, NULL, 0, TLB_FLUSH_ALL, 0, false,
+			    TLB_GENERATION_INVALID);
 	/*
 	 * flush_tlb_multi() is not optimized for the common case in which only
 	 * a local TLB flush is needed. Optimize this use-case by calling
@@ -1734,17 +1691,16 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap=
_batch *batch)
 		invlpgb_flush_all_nonglobals();
 		batch->unmapped_pages =3D false;
 	} else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
-		flush_tlb_multi(&batch->cpumask, info);
+		flush_tlb_multi(&batch->cpumask, &info);
 	} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
 		lockdep_assert_irqs_enabled();
 		local_irq_disable();
-		flush_tlb_func(info);
+		flush_tlb_func(&info);
 		local_irq_enable();
 	}
=20
 	cpumask_clear(&batch->cpumask);
=20
-	put_flush_tlb_info();
 	put_cpu();
 }
=20
--=20
2.20.1
From nobody Wed Jul  1 04:37:00 2026
Received: from va-1-114.ptr.blmpb.com (va-1-114.ptr.blmpb.com
 [209.127.230.114])
	(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id A2E6B3F39F1
	for <linux-kernel@vger.kernel.org>; Tue, 30 Jun 2026 11:13:57 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=209.127.230.114
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1782818039; cv=none;
 b=DHpL7Aq7BX7i4c7VLWFIimX7ZlmopmbMYdERq2orMQL0HKqP5q1eEviafNKpQXX7nrItLepnXK+RbFfq8CgTlLKH4WX/jKhk03CXosT6gmJrrK5ZyqzfOyRXYcoBwdsscuhO3rQgKb/pT2nCfCP7bdCe/lR2g9I1HJ++NsJuX4c=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1782818039; c=relaxed/simple;
	bh=mj5ukmF9sWVLeAxhVxQhz9WZfDPfPFBBfZgvy+36NvY=;
	h=From:In-Reply-To:Date:Message-Id:Mime-Version:To:Cc:Subject:
	 Content-Type:References;
 b=AjDqKqrOIA0oB4aunexHo8mzeaUvsAo1gD9e/FaP7X0pqydUq8jVVPH5s7GXDToB0viY8xt5f5Cig2Dmdu4YaR6buP1LuJovhuSsqU7z3tm5qCl/RxDqeI9D3hk/i3x1WMbDo6ow6gqETtl6/nfyvav8Vxfz6ATMVKs7Yo+MW8g=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=bytedance.com;
 spf=pass smtp.mailfrom=bytedance.com;
 dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com
 header.b=aRHjmyNB; arc=none smtp.client-ip=209.127.230.114
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=bytedance.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=bytedance.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com
 header.b="aRHjmyNB"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed;
 s=2212171451; d=bytedance.com; t=1782818032; h=from:subject:
 mime-version:from:date:message-id:subject:to:cc:reply-to:content-type:
 mime-version:in-reply-to:message-id;
 bh=DK+sryB3a9uToTiA4yUNiWFdWezipnFNU2xKmbiSeIw=;
 b=aRHjmyNBjMdhz+eFqDpcAiDQlF6q7VNn/k51ETLAbuWtFkVcri+j0ow6FhXTWO77qRWRyQ
 U8QbQD3ZMqMzFRdtMuTie6WGATPB5nb7AxP5VqvgJm04fJJYM2QQWoITV7mhMjnyEuAPrw
 pJ6ftK5sazoZzSUpqREIGzNtYdn50IZIU78FdHblT3y4zXyL98+YmZhf3W27gemnPu5pEX
 550BKbVQbMk86pAgzanIPSt5t5F64IqVpjdzzIy2hD5kj4lZnk3rfzAApLUXtCoH6cTjzR
 pnb1y97QyEil1G2dDQ3YoOiOpJ8JEe0YUDTuS2pTAI91XVWHQj0VDbvsOjHRZQ==
From: "Chuyi Zhou" <zhouchuyi@bytedance.com>
In-Reply-To: <20260630111008.2034376-1-zhouchuyi@bytedance.com>
Date: Tue, 30 Jun 2026 19:10:07 +0800
Message-Id: <20260630111008.2034376-14-zhouchuyi@bytedance.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
Mime-Version: 1.0
X-Original-From: Chuyi Zhou <zhouchuyi@bytedance.com>
X-Mailer: git-send-email 2.20.1
To: <tglx@kernel.org>, <mingo@redhat.com>, <luto@kernel.org>,
	<peterz@infradead.org>, <paulmck@kernel.org>, <muchun.song@linux.dev>,
	<bp@alien8.de>, <dave.hansen@linux.intel.com>, <pbonzini@redhat.com>,
	<bigeasy@linutronix.de>, <clrkwllms@kernel.org>, <rostedt@goodmis.org>,
	<nadav.amit@gmail.com>, <vkuznets@redhat.com>
Cc: <linux-kernel@vger.kernel.org>, "Chuyi Zhou" <zhouchuyi@bytedance.com>
Subject: [PATCH v9 13/14] x86/kvm: Disable preemption in kvm_flush_tlb_multi()
Content-Transfer-Encoding: quoted-printable
References: <20260630111008.2034376-1-zhouchuyi@bytedance.com>
X-Lms-Return-Path: 
 <lba+26a43a4ee+8d3563+vger.kernel.org+zhouchuyi@bytedance.com>
Content-Type: text/plain; charset="utf-8"

kvm_flush_tlb_multi() is installed as an x86 PV TLB flush backend, so
flush_tlb_multi() can reach it through pv_ops when running as a KVM
guest.

kvm_flush_tlb_multi() uses the per-CPU scratch cpumask __pv_cpu_mask.
That buffer must remain tied to the current CPU until the mask has been
copied, filtered, and consumed by native_flush_tlb_multi().

The x86/mm callers currently enter flush_tlb_multi() while pinned to a
CPU. To let those callers drop CPU pinning before issuing the remote TLB
flush, each PV backend must protect its own CPU-local scratch state.

Make the KVM backend protect its per-CPU scratch cpumask by disabling
preemption locally. This is harmless with the current callers, where the
preemption disable is nested, and makes the KVM pv_ops dependency
explicit before changing the x86/mm call sites.

Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 arch/x86/kernel/kvm.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 29226d112029..d540f54f4d16 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -662,8 +662,10 @@ static void kvm_flush_tlb_multi(const struct cpumask *=
cpumask,
 	u8 state;
 	int cpu;
 	struct kvm_steal_time *src;
-	struct cpumask *flushmask =3D this_cpu_cpumask_var_ptr(__pv_cpu_mask);
+	struct cpumask *flushmask;
=20
+	guard(preempt)();
+	flushmask =3D this_cpu_cpumask_var_ptr(__pv_cpu_mask);
 	cpumask_copy(flushmask, cpumask);
 	/*
 	 * We have to call flush only on online vCPUs. And
--=20
2.20.1
From nobody Wed Jul  1 04:37:00 2026
Received: from va-1-111.ptr.blmpb.com (va-1-111.ptr.blmpb.com
 [209.127.230.111])
	(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 6A2CC3F44D9
	for <linux-kernel@vger.kernel.org>; Tue, 30 Jun 2026 11:14:12 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=209.127.230.111
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1782818054; cv=none;
 b=fvRENrRx97JOHptvQQMvMQFGdx/x4eh7WZpwgm/r3pv6PdrtDCCXEMOv/VFWAgfmOBMw1ty4eVvmRGOQQce7KUk/RrNbEEFHcFNCm2oo6KnWj7oa7vPVeqYhizvsrIp4RucTS4o3bZVjJ5iq0a1qjvrk80mzg0f9oX6q3H9h0iI=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1782818054; c=relaxed/simple;
	bh=684092v8USn52YE9dhPvQWrgF3iuM0q4/2cRP/MDNdQ=;
	h=Date:Content-Type:From:Mime-Version:References:Cc:Message-Id:
	 In-Reply-To:Subject:To;
 b=T2n+YbPoRvJHAic2al/o/D4Wftz8A7tQzqwAQL6NuOCyiYz06YfsUTMUJDtChXn41zTGzX7rsYi+x5B3wuWvty96QahdtwPg1TmlQdNTwMPWiCWUpDMFQqKyyGW5uVelmRDC5Gr+C3/BcUso4d7Ym8XgB7A74R0LGyUEgViz4zA=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=bytedance.com;
 spf=pass smtp.mailfrom=bytedance.com;
 dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com
 header.b=JXIcuUnr; arc=none smtp.client-ip=209.127.230.111
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=bytedance.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=bytedance.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com
 header.b="JXIcuUnr"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed;
 s=2212171451; d=bytedance.com; t=1782818047; h=from:subject:
 mime-version:from:date:message-id:subject:to:cc:reply-to:content-type:
 mime-version:in-reply-to:message-id;
 bh=QTzMsEqhwIFZZs+LflmjtlnmuXheBRuOlTNHtK9CMCg=;
 b=JXIcuUnrY2CzkAfr2EcbGIpx9sQnIOXBUarY6BoEnortHN9owZwMPIKxoZyV5B2oW7tot0
 XSPYbpRL8StXIXc3s0BryXQ42q053Kd/qu/7+ZQOdP5EBxmjvP8c/u6vPlPn6kNasJLfAJ
 p+77vPD4R97m7LeFoIRgTQbcjhYVhE0a6AAVJolXDNIWk/ggh2+dS1L4UPXxsQ4FUR/0BU
 yhcmOflkcMnxkmjPw4SJyqLh/MtJE8RdPLG8iJ0mrdqcQaJoFCsihoeekepwIQmKkxLcVe
 WUZ6oaCltfLBe8e8qDf/HAE1s/o54B6vbJGOqJTaXUuKfPpCaOlnbDdq8C7M3A==
Date: Tue, 30 Jun 2026 19:10:08 +0800
From: "Chuyi Zhou" <zhouchuyi@bytedance.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
Mime-Version: 1.0
References: <20260630111008.2034376-1-zhouchuyi@bytedance.com>
X-Mailer: git-send-email 2.20.1
X-Original-From: Chuyi Zhou <zhouchuyi@bytedance.com>
Cc: <linux-kernel@vger.kernel.org>, "Chuyi Zhou" <zhouchuyi@bytedance.com>
Message-Id: <20260630111008.2034376-15-zhouchuyi@bytedance.com>
In-Reply-To: <20260630111008.2034376-1-zhouchuyi@bytedance.com>
Subject: [PATCH v9 14/14] x86/mm: Re-enable preemption before
 flush_tlb_multi()
Content-Transfer-Encoding: quoted-printable
X-Lms-Return-Path: 
 <lba+26a43a4fd+5e5be1+vger.kernel.org+zhouchuyi@bytedance.com>
To: <tglx@kernel.org>, <mingo@redhat.com>, <luto@kernel.org>,
	<peterz@infradead.org>, <paulmck@kernel.org>, <muchun.song@linux.dev>,
	<bp@alien8.de>, <dave.hansen@linux.intel.com>, <pbonzini@redhat.com>,
	<bigeasy@linutronix.de>, <clrkwllms@kernel.org>, <rostedt@goodmis.org>,
	<nadav.amit@gmail.com>, <vkuznets@redhat.com>
Content-Type: text/plain; charset="utf-8"

flush_tlb_mm_range() and arch_tlbbatch_flush() pin the current CPU while
they decide whether the flush can be handled locally or must be sent to
remote CPUs. The CPU pinning is needed for the current CPU number and for
the local TLB flush path, which reads per-CPU TLB state.

The caller does not need to remain pinned while waiting for a remote TLB
flush to complete. flush_tlb_info is caller-private stack storage rather
than a shared per-CPU object, so once the remote-flush path has been
selected the caller no longer has to stay on the same CPU to protect it.

flush_tlb_multi() may also route through x86 PV backends. Those backends
must protect their own CPU-local scratch state instead of relying on the
caller to stay pinned. Hyper-V already does this by disabling interrupts
while using hyperv_pcpu_input_arg, and Xen's multicall path brackets its
per-CPU multicall buffer with xen_mc_batch() and xen_mc_issue().
kvm_flush_tlb_multi() does so by disabling preemption while using
__pv_cpu_mask.

Remote TLB flushes may synchronously wait for many CPUs, and the wait can
take tens of milliseconds when remote CPUs have interrupts disabled or
when many CPUs are involved. Keeping preemption disabled for that whole
wait unnecessarily increases scheduling latency on the initiating CPU.

Drop the CPU pinning before calling flush_tlb_multi() in the remote paths
of flush_tlb_mm_range() and arch_tlbbatch_flush(). Keep the local paths
inside the pinned section because they still access this CPU's TLB state.

Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
Tested-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 arch/x86/mm/tlb.c | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 0620c001981f..3b021930cc69 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1403,6 +1403,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigne=
d long start,
 				bool freed_tables)
 {
 	struct flush_tlb_info info;
+	bool remote_flush =3D false;
 	int cpu =3D get_cpu();
 	u64 new_tlb_gen;
=20
@@ -1420,9 +1421,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigne=
d long start,
 	if (mm_global_asid(mm)) {
 		broadcast_tlb_flush(&info);
 	} else if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
-		info.trim_cpumask =3D should_trim_cpumask(mm);
-		flush_tlb_multi(mm_cpumask(mm), &info);
-		consider_global_asid(mm);
+		remote_flush =3D true;
 	} else if (mm =3D=3D this_cpu_read(cpu_tlbstate.loaded_mm)) {
 		lockdep_assert_irqs_enabled();
 		local_irq_disable();
@@ -1431,6 +1430,13 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsign=
ed long start,
 	}
=20
 	put_cpu();
+
+	if (remote_flush) {
+		info.trim_cpumask =3D should_trim_cpumask(mm);
+		flush_tlb_multi(mm_cpumask(mm), &info);
+		consider_global_asid(mm);
+	}
+
 	mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
 }
=20
@@ -1677,7 +1683,7 @@ EXPORT_SYMBOL_FOR_KVM(__flush_tlb_all);
 void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 {
 	struct flush_tlb_info info;
-
+	bool remote_flush =3D false;
 	int cpu =3D get_cpu();
=20
 	init_flush_tlb_info(&info, NULL, 0, TLB_FLUSH_ALL, 0, false,
@@ -1691,7 +1697,7 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_b=
atch *batch)
 		invlpgb_flush_all_nonglobals();
 		batch->unmapped_pages =3D false;
 	} else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
-		flush_tlb_multi(&batch->cpumask, &info);
+		remote_flush =3D true;
 	} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
 		lockdep_assert_irqs_enabled();
 		local_irq_disable();
@@ -1699,9 +1705,12 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_=
batch *batch)
 		local_irq_enable();
 	}
=20
-	cpumask_clear(&batch->cpumask);
-
 	put_cpu();
+
+	if (remote_flush)
+		flush_tlb_multi(&batch->cpumask, &info);
+
+	cpumask_clear(&batch->cpumask);
 }
=20
 /*
--=20
2.20.1