From nobody Mon Jun 15 23:18:30 2026
Received: from canpmsgout02.his.huawei.com (canpmsgout02.his.huawei.com
 [113.46.200.217])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 9C7E23C061E
	for <linux-kernel@vger.kernel.org>; Thu, 23 Apr 2026 08:26:34 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=113.46.200.217
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1776932800; cv=none;
 b=cWXrAeotzfRFavGTaFsDhnBvhNP+NbC1viSqPR/j9XH6zyWQeOAqEGgxu7joyfM3c8GjA21YXsUByZ45z9cs+pxLUPEpsHCZPFHZ1zxmVGdgmYYsxCZgepZyUP+eYm5uthY1IJIqzfv6+qiwNNdcRYvIsoluktO2njdtUYP/GYI=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1776932800; c=relaxed/simple;
	bh=MBIEwlh5JDcW0F4BCeDs1XH7o1SmRavMPQRrdXjU9NM=;
	h=From:To:CC:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version:Content-Type;
 b=PLt5b0moBluBZLbyAp4fc3SD2CjfXWE5cOdv6AamlFJgUFSNZO/GNYI48MHuSZIn/K+ttiqmrizmGKgrG++ryaGtrbiGXv5JQ7vD2kD967OL4aRYEfY3FFeTdlHNuWc7NgCpf9wPJQU4uBmTjrO4yGn96+lLP5ihhqihFVkTCjg=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=huawei.com;
 spf=pass smtp.mailfrom=huawei.com;
 dkim=pass (1024-bit key) header.d=huawei.com header.i=@huawei.com
 header.b=c6yy0bzJ; arc=none smtp.client-ip=113.46.200.217
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=pass (p=quarantine dis=none) header.from=huawei.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=huawei.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (1024-bit key) header.d=huawei.com header.i=@huawei.com
 header.b="c6yy0bzJ"
dkim-signature: v=1; a=rsa-sha256; d=huawei.com; s=dkim;
	c=relaxed/relaxed; q=dns/txt;
	h=From;
	bh=3QyCou5JNCc0vf8sNIbbCMM2dtrHiW+BJ5gqqVwioy4=;
	b=c6yy0bzJzh9NVT8Miuh4P51udPWq3fjWaq3SU8wkaUQQ9/JggDvbCTnZZowvA+XzMyFX64ibh
	0Gy5nlFYLzr5ptTnCnu7zaLRppVf2u1e4j/QZa92q1ZtImZoopcuFqKqdeexemEZlHH3K7EDMFw
	+XiE9MQEbLc1HU3NaBQ1UVk=
Received: from mail.maildlp.com (unknown [172.19.162.140])
	by canpmsgout02.his.huawei.com (SkyGuard) with ESMTPS id 4g1TXf578zzcb3p;
	Thu, 23 Apr 2026 16:19:46 +0800 (CST)
Received: from kwepemj100017.china.huawei.com (unknown [7.202.194.11])
	by mail.maildlp.com (Postfix) with ESMTPS id 540EC2012A;
	Thu, 23 Apr 2026 16:26:31 +0800 (CST)
Received: from huawei.com (10.67.174.193) by kwepemj100017.china.huawei.com
 (7.202.194.11) with Microsoft SMTP Server (version=TLS1_2,
 cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id 15.2.1544.36; Thu, 23 Apr
 2026 16:26:30 +0800
From: Luo Gengkun <luogengkun2@huawei.com>
To: <yu.c.chen@intel.com>
CC: <adamli@os.amperecomputing.com>, <aubrey.li@intel.com>,
	<bsegall@google.com>, <cyy@cyyself.name>, <dietmar.eggemann@arm.com>,
	<gavinguo@igalia.com>, <haoxing990@gmail.com>, <hdanton@sina.com>,
	<jianyong.wu@outlook.com>, <joshdon@google.com>, <juri.lelli@redhat.com>,
	<kprateek.nayak@amd.com>, <len.brown@intel.com>, <libchen@purestorage.com>,
	<linux-kernel@vger.kernel.org>, <luogengkun2@huawei.com>, <mgorman@suse.de>,
	<mingo@redhat.com>, <peterz@infradead.org>, <qyousef@layalina.io>,
	<rostedt@goodmis.org>, <sshegde@linux.ibm.com>, <tim.c.chen@intel.com>,
	<tim.c.chen@linux.intel.com>, <tingyin.duan@gmail.com>,
	<vernhao@tencent.com>, <vincent.guittot@linaro.org>,
	<vineethr@linux.ibm.com>, <vschneid@redhat.com>, <zhao1.liu@intel.com>,
	<ziqianlu@bytedance.com>
Subject: [PATCH v3] sched/cache: Reduce the overhead of task_cache_work by
 only scanning the visited cpus.
Date: Thu, 23 Apr 2026 08:54:14 +0000
Message-ID: <20260423085414.1389749-1-luogengkun2@huawei.com>
X-Mailer: git-send-email 2.34.1
In-Reply-To: <041d3afc-c308-481b-824f-46170f564707@intel.com>
References: <041d3afc-c308-481b-824f-46170f564707@intel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
X-ClientProxiedBy: kwepems500001.china.huawei.com (7.221.188.70) To
 kwepemj100017.china.huawei.com (7.202.194.11)
Content-Type: text/plain; charset="utf-8"

The overhead of task_cache_work is high, especially on multi-NUMA systems.
Currently, task_cache_work tries to find the pref_llc by scanning all cpus
in the system. However, most of these scans are meaningless, such as those
for cpus that have never been visited or were accessed a long time ago.

To address this problem, this patch introduces visited_cpus to track the
visited cpus and uses llc_epoch_affinity_timeout to evict cpus that have
timed out.

Signed-off-by: Luo Gengkun <luogengkun2@huawei.com>
---
Changes history
**v3 Changes:**
1. Remove the static key and enable this feature by default.
2. Reuse llc_epoch_affinity_timeout instead of introducing
llc_epoch_visited_timeout.
3. Move the calculation of rq->cpu_epoch - pcpu_sched->epoch into
fraction_mm_sched() to avoid a race between task_cache_work() and
__update_mm_sched().=20
4. Reset work->next at the end of task_cache_work() to prevent concurrent
executions by multiple threads within the same process.


**v2 Changes:**
1. Added a pre-check before set/clear visited_cpus to avoid C2C overhead.
2. Optimized llc_epoch_visited_timeout by using a static key to minimize ov=
erhead.
---
 include/linux/sched.h |  1 +
 kernel/sched/fair.c   | 38 ++++++++++++++++++++++++++++++--------
 2 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index dfa4bfd099c6..f2327a13fda8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2390,6 +2390,7 @@ struct sched_cache_time {
=20
 struct sched_cache_stat {
 	struct sched_cache_time __percpu *pcpu_sched;
+	struct cpumask visited_cpus;
 	raw_spinlock_t lock;
 	unsigned long epoch;
 	u64 nr_running_avg;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e4e22696a0b1..49369f656d53 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1466,6 +1466,7 @@ void mm_init_sched(struct mm_struct *mm,
 	raw_spin_lock_init(&mm->sc_stat.lock);
 	mm->sc_stat.epoch =3D epoch;
 	mm->sc_stat.cpu =3D -1;
+	cpumask_clear(&mm->sc_stat.visited_cpus);
=20
 	/*
 	 * The update to mm->sc_stat should not be reordered
@@ -1507,11 +1508,18 @@ static inline void __update_mm_sched(struct rq *rq,
 	}
 }
=20
-static unsigned long fraction_mm_sched(struct rq *rq,
-				       struct sched_cache_time *pcpu_sched)
+static unsigned long fraction_mm_sched(int cpu, struct mm_struct *mm)
 {
+	struct rq *rq =3D cpu_rq(cpu);
+	struct sched_cache_time *pcpu_sched =3D per_cpu_ptr(mm->sc_stat.pcpu_sche=
d, cpu);
 	guard(raw_spinlock_irqsave)(&rq->cpu_epoch_lock);
=20
+	/* Skip the rq that has not been hit for a long time */
+	if ((rq->cpu_epoch - pcpu_sched->epoch) > llc_epoch_affinity_timeout) {
+		cpumask_clear_cpu(cpu, &mm->sc_stat.visited_cpus);
+		return 0;
+	}
+
 	__update_mm_sched(rq, pcpu_sched);
=20
 	/*
@@ -1582,6 +1590,8 @@ void account_mm_sched(struct rq *rq, struct task_stru=
ct *p, s64 delta_exec)
 		pcpu_sched->runtime +=3D delta_exec;
 		rq->cpu_runtime +=3D delta_exec;
 		epoch =3D rq->cpu_epoch;
+		if (!cpumask_test_cpu(cpu_of(rq), &mm->sc_stat.visited_cpus))
+			cpumask_set_cpu(cpu_of(rq), &mm->sc_stat.visited_cpus);
 	}
=20
 	/*
@@ -1627,7 +1637,11 @@ static void task_tick_cache(struct rq *rq, struct ta=
sk_struct *p)
=20
 	guard(raw_spinlock)(&mm->sc_stat.lock);
=20
-	if (work->next =3D=3D work) {
+	/*
+	 * Pairs with smp_store_release in task_cache_work() to ensure that
+	 * task_cache_work() has finished before re-queueing the work.
+	 */
+	if (smp_load_acquire(&work->next) =3D=3D work) {
 		task_work_add(p, work, TWA_RESUME);
 		WRITE_ONCE(mm->sc_stat.epoch, epoch);
 	}
@@ -1695,6 +1709,8 @@ static inline void update_avg_scale(u64 *avg, u64 sam=
ple)
 	*avg +=3D div64_s64(diff, divisor);
 }
=20
+DEFINE_FREE(reset_work, struct callback_head *, smp_store_release(&_T->nex=
t, _T))
+
 static void task_cache_work(struct callback_head *work)
 {
 	int cpu, m_a_cpu =3D -1, nr_running =3D 0, curr_cpu;
@@ -1703,11 +1719,14 @@ static void task_cache_work(struct callback_head *w=
ork)
 	struct mm_struct *mm =3D p->mm;
 	unsigned long m_a_occ =3D 0;
 	cpumask_var_t cpus;
+	/*
+	 * Reset work->next at the end to avoid a race between threads
+	 * within a process.
+	 */
+	struct callback_head *_w __free(reset_work) =3D work;
=20
 	WARN_ON_ONCE(work !=3D &p->cache_work);
=20
-	work->next =3D work;
-
 	if (p->flags & PF_EXITING)
 		return;
=20
@@ -1725,6 +1744,7 @@ static void task_cache_work(struct callback_head *wor=
k)
=20
 	scoped_guard (cpus_read_lock) {
 		get_scan_cpumasks(cpus, p);
+		cpumask_and(cpus, cpus, &mm->sc_stat.visited_cpus);
=20
 		for_each_cpu(cpu, cpus) {
 			/* XXX sched_cluster_active */
@@ -1735,9 +1755,11 @@ static void task_cache_work(struct callback_head *wo=
rk)
 			if (!sd)
 				continue;
=20
-			for_each_cpu(i, sched_domain_span(sd)) {
-				occ =3D fraction_mm_sched(cpu_rq(i),
-							per_cpu_ptr(mm->sc_stat.pcpu_sched, i));
+			for_each_cpu_and(i, sched_domain_span(sd), &mm->sc_stat.visited_cpus) {
+				occ =3D fraction_mm_sched(i, mm);
+				if (occ =3D=3D 0)
+					continue;
+
 				a_occ +=3D occ;
 				if (occ > m_occ) {
 					m_occ =3D occ;
--=20
2.34.1