Subject: [PATCH 10/11] x86/mm: Enable preemption during native_flush_tlb_multi
From: "Chuyi Zhou"
Date: Tue, 3 Feb 2026 19:24:00 +0800
Message-Id: <20260203112401.3889029-11-zhouchuyi@bytedance.com>
In-Reply-To: <20260203112401.3889029-1-zhouchuyi@bytedance.com>
References: <20260203112401.3889029-1-zhouchuyi@bytedance.com>
Mime-Version: 1.0
Content-Type: text/plain; charset="utf-8"

flush_tlb_mm_range()/arch_tlbbatch_flush() -> native_flush_tlb_multi() is a
common triggering path in real production environments. When pages are
reclaimed or a process exits, native_flush_tlb_multi() sends IPIs to remote
CPUs and waits for all of them to complete their local TLB flushes. The
overall latency may reach tens of milliseconds when there is a large number
of remote CPUs, or due to other factors (such as interrupts being disabled
on some of those CPUs).

Since flush_tlb_mm_range()/arch_tlbbatch_flush() always disable preemption,
they can cause increased scheduling latency for other threads on the
current CPU. Previous patches converted flush_tlb_info from a per-CPU
variable to an on-stack variable. Additionally, it is no longer necessary
to explicitly disable preemption before calling the smp_call*() helpers,
since they handle preemption internally. It is now safe to enable
preemption during native_flush_tlb_multi().

Signed-off-by: Chuyi Zhou
---
 arch/x86/hyperv/mmu.c |  2 ++
 arch/x86/kernel/kvm.c |  4 +++-
 arch/x86/mm/tlb.c     | 23 +++++++++++++----------
 arch/x86/xen/mmu_pv.c |  1 +
 4 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
index cfcb60468b01..394f849af10a 100644
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -65,6 +65,8 @@ static void hyperv_flush_tlb_multi(const struct cpumask *cpus,
 	unsigned long flags;
 	bool do_lazy = !info->freed_tables;
 
+	guard(preempt)();
+
 	trace_hyperv_mmu_flush_tlb_multi(cpus, info);
 
 	if (!hv_hypercall_pg)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index df78ddee0abb..6b56dab28e66 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -654,8 +654,10 @@ static void kvm_flush_tlb_multi(const struct cpumask *cpumask,
 	u8 state;
 	int cpu;
 	struct kvm_steal_time *src;
-	struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
+	struct cpumask *flushmask;
 
+	guard(preempt)();
+	flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
 	cpumask_copy(flushmask, cpumask);
 	/*
 	 * We have to call flush only on online vCPUs. And
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 2d68297ed35b..4162d7ff024f 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1398,21 +1398,23 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 				unsigned long end, unsigned int stride_shift,
 				bool freed_tables)
 {
-	int cpu = get_cpu();
-
 	struct flush_tlb_info info = {
 		.mm = mm,
 		.stride_shift = stride_shift,
 		.freed_tables = freed_tables,
-		.trim_cpumask = 0,
-		.initiating_cpu = cpu
+		.trim_cpumask = 0
 	};
+	int cpu;
 
 	if ((end - start) >> stride_shift > tlb_single_page_flush_ceiling) {
 		start = 0;
 		end = TLB_FLUSH_ALL;
 	}
 
+	migrate_disable();
+
+	cpu = info.initiating_cpu = smp_processor_id();
+
 	/* This is also a barrier that synchronizes with switch_mm(). */
 	info.new_tlb_gen = inc_mm_tlb_gen(mm);
 
@@ -1425,6 +1427,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 	 * flush_tlb_func_local() directly in this case.
 	 */
 	if (mm_global_asid(mm)) {
+		guard(preempt)();
 		broadcast_tlb_flush(&info);
 	} else if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
 		info.trim_cpumask = should_trim_cpumask(mm);
@@ -1437,7 +1440,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 		local_irq_enable();
 	}
 
-	put_cpu();
+	migrate_enable();
 	mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
 }
 
@@ -1696,8 +1699,6 @@ EXPORT_SYMBOL_FOR_KVM(__flush_tlb_all);
 
 void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 {
-	int cpu = get_cpu();
-
 	struct flush_tlb_info info = {
 		.start = 0,
 		.end = TLB_FLUSH_ALL,
@@ -1705,9 +1706,13 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 		.stride_shift = 0,
 		.freed_tables = false,
 		.new_tlb_gen = TLB_GENERATION_INVALID,
-		.initiating_cpu = cpu,
 		.trim_cpumask = 0,
 	};
+	int cpu;
+
+	guard(migrate)();
+
+	info.initiating_cpu = cpu = smp_processor_id();
 
 	/*
 	 * flush_tlb_multi() is not optimized for the common case in which only
@@ -1727,8 +1732,6 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 	}
 
 	cpumask_clear(&batch->cpumask);
-
-	put_cpu();
 }
 
 /*
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 2a4a8deaf612..b801721050f7 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -1330,6 +1330,7 @@ static void xen_flush_tlb_multi(const struct cpumask *cpus,
 	const size_t mc_entry_size = sizeof(args->op) +
 		sizeof(args->mask[0]) * BITS_TO_LONGS(num_possible_cpus());
 
+	guard(preempt)();
 	trace_xen_mmu_flush_tlb_multi(cpus, info->mm, info->start, info->end);
 
 	if (cpumask_empty(cpus))
-- 
2.20.1
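
For reference, here is a minimal sketch of the pattern the patch applies. It
is illustrative only and not part of the patch: flush_old()/flush_new() are
made-up placeholder functions, and only the standard helpers from
<linux/preempt.h> and <linux/smp.h> are assumed.

#include <linux/preempt.h>	/* get_cpu()/put_cpu(), migrate_disable()/migrate_enable(), guards */
#include <linux/smp.h>		/* smp_processor_id() */

/*
 * Old pattern: get_cpu() keeps preemption disabled across the whole flush,
 * including the potentially long wait for remote CPUs to acknowledge the
 * IPIs, which is what hurts scheduling latency on the initiating CPU.
 */
static void flush_old(void)
{
	int cpu = get_cpu();	/* preempt_disable() + smp_processor_id() */

	/* ... fill flush_tlb_info, send IPIs, wait for remote CPUs ... */

	(void)cpu;		/* the real code records this in .initiating_cpu */
	put_cpu();		/* preempt_enable() */
}

/*
 * New pattern: migrate_disable() only pins the task to its current CPU, so
 * smp_processor_id() stays stable for .initiating_cpu, while the task can
 * still be preempted during the wait for the remote flushes.
 */
static void flush_new(void)
{
	int cpu;

	migrate_disable();
	cpu = smp_processor_id();

	/* ... fill the on-stack flush_tlb_info, call flush_tlb_multi() ... */

	(void)cpu;		/* the real code records this in .initiating_cpu */
	migrate_enable();
}

The guard(preempt)() / guard(migrate)() calls used in the hypervisor paths
and in arch_tlbbatch_flush() are the scope-based forms of the same pairs,
defined via DEFINE_LOCK_GUARD_0() in <linux/preempt.h>: the matching
*_enable() runs automatically on every exit from the enclosing scope.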