From nobody Mon Jun 15 19:28:19 2026
Received: from mailgw1.hygon.cn (unknown [101.204.27.37])
	by smtp.subspace.kernel.org (Postfix) with ESMTP id 74BEE391E6D
	for <linux-kernel@vger.kernel.org>; Mon, 13 Apr 2026 07:25:01 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=101.204.27.37
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1776065120; cv=none;
 b=ZRrtL7sIhkr/kPKT+5vAl8J774NkrGxm0TsODNCLmjaTSIyaiaJgg91/TzT9YsWLkI+rVmDFQkzI1OuIjBf0oY4J968r2sr1P4A4yLYyZmjs14azwVwFrLCeCOw+fUwBvhwJu8VPK0zNuGP8u8sp/x2eURqPycTWYVVYSQ1m3fw=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1776065120; c=relaxed/simple;
	bh=K/2kkPBJYdjHHxPmpNp3NeXG9trYzku1RUkN9FGJnws=;
	h=From:To:CC:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version:Content-Type;
 b=Ld8WaxPZ9ZoLHMXgcs4UU7EQ5NNsZo/skG0jbqkeRqcjsaIiKwoNrSdWGSJ8X+9RKvmbrGb33knJDP91Tp4Sm/1E2qhD4VbU4ymzaG8pyzTmjNheKPC7LIp57Itlsd5UkGCWQBrdmLHCcq0yT0ZOvABavEWBz49hTOqXLoWJR7o=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=pass (p=none dis=none) header.from=hygon.cn;
 spf=pass smtp.mailfrom=hygon.cn; arc=none smtp.client-ip=101.204.27.37
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=pass (p=none dis=none) header.from=hygon.cn
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=hygon.cn
Received: from maildlp2.hygon.cn (unknown [127.0.0.1])
	by mailgw1.hygon.cn (Postfix) with ESMTP id 4fvJp23lQrzg1ch;
	Mon, 13 Apr 2026 15:24:58 +0800 (CST)
Received: from maildlp2.hygon.cn (unknown [172.23.18.61])
	by mailgw1.hygon.cn (Postfix) with ESMTP id 4fvJp12W2Mzg1ch;
	Mon, 13 Apr 2026 15:24:57 +0800 (CST)
Received: from cncheex04.Hygon.cn (unknown [172.23.18.114])
	by maildlp2.hygon.cn (Postfix) with ESMTPS id 3DEF630004D3;
	Mon, 13 Apr 2026 15:23:04 +0800 (CST)
Received: from jianyong.hygon.cn (172.19.20.52) by cncheex04.Hygon.cn
 (172.23.18.114) with Microsoft SMTP Server (version=TLS1_2,
 cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id 15.2.1544.36; Mon, 13 Apr
 2026 15:24:55 +0800
From: Jianyong Wu <wujianyong@hygon.cn>
To: <tim.c.chen@linux.intel.com>, <yu.c.chen@intel.com>,
	<luogengkun2@huawei.com>
CC: <peterz@infradead.org>, <kprateek.nayak@amd.com>, <mingo@redhat.com>,
	<vincent.guittot@linaro.org>, <juri.lelli@redhat.com>,
	<dietmar.eggemann@arm.com>, <rostedt@goodmis.org>, <bsegall@google.com>,
	<mgorman@suse.de>, <vschneid@redhat.com>, <vineethr@linux.ibm.com>,
	<hdanton@sina.com>, <sshegde@linux.ibm.com>, <jianyong.wu@outlook.com>,
	<cyy@cyyself.name>, <tingyin.duan@gmail.com>, <vernhao@tencent.com>,
	<haoxing990@gmail.com>, <len.brown@intel.com>, <aubrey.li@intel.com>,
	<zhao1.liu@intel.com>, <adamli@os.amperecomputing.com>,
	<ziqianlu@bytedance.com>, <tim.c.chen@intel.com>, <joshdon@google.com>,
	<gavinguo@igalia.com>, <qyousef@layalina.io>, <libchen@purestorage.com>,
	<linux-kernel@vger.kernel.org>, <huangsj@hygon.cn>, <wujianyong@hygon.cn>
Subject: [RFC PATCH] sched/fair: dynamically scale the period of cache work
Date: Mon, 13 Apr 2026 15:23:09 +0800
Message-ID: <20260413072309.2663668-1-wujianyong@hygon.cn>
X-Mailer: git-send-email 2.43.0
In-Reply-To: <4fb7a6da-447d-452a-a920-7cd39b939ccb@intel.com>
References: <4fb7a6da-447d-452a-a920-7cd39b939ccb@intel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
X-ClientProxiedBy: cncheex05.Hygon.cn (172.23.18.115) To cncheex04.Hygon.cn
 (172.23.18.114)
Content-Type: text/plain; charset="utf-8"

When a preferred LLC has been selected and remains stable,
task_cache_work() does not need to run frequently. Because it scans all
CPUs in the system for its computation, running it at a high frequency
hurts performance. Reduce the scan rate in such cases.

On the other hand, if the preferred LLC becomes suboptimal, the scan
frequency should be increased to find a better placement quickly. The
scan period is therefore adjusted dynamically.

Signed-off-by: Jianyong Wu <wujianyong@hygon.cn>

---
Hi ChenYu, Tim, Gengkun,

I have another approach to address this issue, based on the observation
that the scan work can be canceled if the preferred node is stable. This
patch merely demonstrates the idea and still needs more testing to verify
its functionality. I'm sending it out early to gather feedback and
opinions.

Thanks
Jianyong
---
 include/linux/sched.h |  4 +++
 kernel/sched/debug.c  |  6 ++++
 kernel/sched/fair.c   | 69 ++++++++++++++++++++++++++++++++++++-------
 kernel/sched/sched.h  |  3 ++
 4 files changed, 72 insertions(+), 10 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index e24b2b86aba4..87ce70ba6552 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2393,7 +2393,11 @@ struct sched_cache_stat {
 	struct sched_cache_time __percpu *pcpu_time;
 	raw_spinlock_t lock;
 	unsigned long epoch;
+	unsigned long last_reset_tick;
+	unsigned long next_scan;
+	unsigned long scan_period;
 	u64 nr_running_avg;
+	int need_scan;
 	int cpu;
 } ____cacheline_aligned_in_smp;
=20
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4469e1c152c8..56ebc379127a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -679,6 +679,12 @@ static __init int sched_init_debug(void)
 			   &llc_overaggr_pct);
 	debugfs_create_u32("imb_pct", 0644, llc,
 			   &llc_imb_pct);
+	debugfs_create_u32("scan_period_max", 0644, llc,
+			   &llc_scan_period_max);
+	debugfs_create_u32("scan_period_min", 0644, llc,
+			   &llc_scan_period_min);
+	debugfs_create_u32("scan_period_threshold", 0644, llc,
+			   &llc_scan_period_threshold);
 #endif
=20
 	debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops=
);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f446d755f3c5..974fe4b992ca 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1287,6 +1287,9 @@ __read_mostly unsigned int llc_epoch_period	=3D EPOCH=
_PERIOD;
 __read_mostly unsigned int llc_epoch_affinity_timeout =3D EPOCH_LLC_AFFINI=
TY_TIMEOUT;
 __read_mostly unsigned int llc_imb_pct		=3D 20;
 __read_mostly unsigned int llc_overaggr_pct	=3D 50;
+__read_mostly unsigned int llc_scan_period_min =3D 1;
+__read_mostly unsigned int llc_scan_period_max =3D 64 * HZ;
+__read_mostly unsigned int llc_scan_period_threshold =3D HZ;
=20
 bool sched_cache_inuse(void)
 {
@@ -1486,6 +1489,7 @@ void mm_init_sched(struct mm_struct *mm,
 	raw_spin_lock_init(&mm->sc_stat.lock);
 	mm->sc_stat.epoch =3D epoch;
 	mm->sc_stat.cpu =3D -1;
+	mm->sc_stat.scan_period =3D llc_scan_period_min;
=20
 	/*
 	 * The update to mm->sc_stat should not be reordered
@@ -1611,15 +1615,13 @@ void account_mm_sched(struct rq *rq, struct task_st=
ruct *p, s64 delta_exec)
 		epoch =3D rq->cpu_epoch;
 	}
=20
-	/*
-	 * If this process hasn't hit task_cache_work() for a while, or it
-	 * has only 1 thread, invalidate its preferred state.
-	 */
+	/* Invalidate preferred state on timeout, 1 thread, or LLC limit exceeded */
 	if (time_after(epoch,
-		       READ_ONCE(mm->sc_stat.epoch) + llc_epoch_affinity_timeout) ||
-	    get_nr_threads(p) <=3D 1 ||
+			READ_ONCE(mm->sc_stat.epoch) + llc_epoch_affinity_timeout) ||
+		get_nr_threads(p) <=3D 1 ||
 	    exceed_llc_nr(mm, cpu_of(rq), p) ||
 	    exceed_llc_capacity(mm, cpu_of(rq), p)) {
+		mm->sc_stat.scan_period =3D llc_scan_period_min;
 		if (mm->sc_stat.cpu !=3D -1)
 			mm->sc_stat.cpu =3D -1;
 	}
@@ -1652,6 +1654,10 @@ static void task_tick_cache(struct rq *rq, struct ta=
sk_struct *p)
 	if (time_after_eq(mm->sc_stat.epoch, epoch))
 		return;
=20
+	if (llc_scan_period_min < llc_scan_period_max && time_before(jiffies, mm-=
>sc_stat.next_scan) &&
+		!mm->sc_stat.need_scan)
+		return;
+
 	guard(raw_spinlock)(&mm->sc_stat.lock);
=20
 	if (work->next =3D=3D work) {
@@ -1728,7 +1734,7 @@ static void task_cache_work(struct callback_head *wor=
k)
 	struct task_struct *p =3D current, *cur;
 	unsigned long curr_m_a_occ =3D 0;
 	struct mm_struct *mm =3D p->mm;
-	unsigned long m_a_occ =3D 0;
+	unsigned long m_a_occ =3D 0, need_scan =3D 0, now;
 	cpumask_var_t cpus;
 	u64 t0, scan_cost;
=20
@@ -1753,6 +1759,12 @@ static void task_cache_work(struct callback_head *wo=
rk)
=20
 	t0 =3D sched_clock_cpu(curr_cpu);
=20
+	now =3D jiffies;
+	if (time_before(now, READ_ONCE(mm->sc_stat.next_scan)))
+		return;
+
+	WRITE_ONCE(mm->sc_stat.next_scan, (now + mm->sc_stat.scan_period));
+
 	scoped_guard (cpus_read_lock) {
 		get_scan_cpumasks(cpus, p);
=20
@@ -1811,7 +1823,8 @@ static void task_cache_work(struct callback_head *wor=
k)
 	scan_cost =3D sched_clock_cpu(curr_cpu) - t0;
 	trace_sched_llc_scan(p, scan_cost);
=20
-	if (m_a_occ > (2 * curr_m_a_occ)) {
+	need_scan =3D READ_ONCE(mm->sc_stat.need_scan);
+	if (m_a_occ > (2 * curr_m_a_occ) || need_scan) {
 		/*
 		 * Avoid switching sc_stat.cpu too fast.
 		 * The reason to choose 2X is because:
@@ -1822,9 +1835,35 @@ static void task_cache_work(struct callback_head *wo=
rk)
 		 * 3. 2X is chosen based on test results, as it delivers
 		 *    the optimal performance gain so far.
 		 */
-		mm->sc_stat.cpu =3D m_a_cpu;
+		if (m_a_occ > (2 * curr_m_a_occ))
+			mm->sc_stat.cpu =3D m_a_cpu;
+
+		if (!mm->sc_stat.last_reset_tick)
+			mm->sc_stat.last_reset_tick =3D now;
+
+		/* Change scan_period when preferred LLC changed */
+		if (((mm->sc_stat.cpu !=3D -1) && (m_a_cpu !=3D -1)
+			&& (llc_id(mm->sc_stat.cpu) !=3D llc_id(m_a_cpu)))
+			|| need_scan) {
+			if (!need_scan)
+				need_scan =3D 1;
+
+			WRITE_ONCE(mm->sc_stat.scan_period,
+				max(mm->sc_stat.scan_period >> 1, llc_scan_period_min));
+			WRITE_ONCE(mm->sc_stat.last_reset_tick, now);
+               }
+	}
+
+	if ((now - READ_ONCE(mm->sc_stat.last_reset_tick) > llc_scan_period_thres=
hold)
+		&& !need_scan) {
+		WRITE_ONCE(mm->sc_stat.scan_period, min(mm->sc_stat.scan_period << 1,
+			llc_scan_period_max));
+		WRITE_ONCE(mm->sc_stat.last_reset_tick, now);
 	}
=20
+	if (READ_ONCE(mm->sc_stat.need_scan))
+		WRITE_ONCE(mm->sc_stat.need_scan, 0);
+
 	update_avg_scale(&mm->sc_stat.nr_running_avg, nr_running);
 	free_cpumask_var(cpus);
 }
@@ -10260,6 +10299,7 @@ static enum llc_mig can_migrate_llc_task(int src_cp=
u, int dst_cpu,
 	struct mm_struct *mm;
 	bool to_pref;
 	int cpu;
+	enum llc_mig ret;
=20
 	mm =3D p->mm;
 	if (!mm)
@@ -10287,8 +10327,17 @@ static enum llc_mig can_migrate_llc_task(int src_c=
pu, int dst_cpu,
 	else
 		return mig_unrestricted;
=20
-	return can_migrate_llc(src_cpu, dst_cpu,
+	ret =3D can_migrate_llc(src_cpu, dst_cpu,
 			       task_util(p), to_pref);
+
+	/*
+	 * If the preferred node cannot accommodate the process,
+	 * accelerate task_cache_work to find a better node.
+	 */
+	if (to_pref && ret =3D=3D mig_forbid)
+		mm->sc_stat.need_scan =3D 1;
+
+	return ret;
 }
=20
 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b757812725f7..08462175f73f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -4043,6 +4043,9 @@ extern unsigned int llc_epoch_period;
 extern unsigned int llc_epoch_affinity_timeout;
 extern unsigned int llc_imb_pct;
 extern unsigned int llc_overaggr_pct;
+extern unsigned int llc_scan_period_min;
+extern unsigned int llc_scan_period_max;
+extern unsigned int llc_scan_period_threshold;
=20
 static inline bool sched_cache_enabled(void)
 {
--=20
2.34.1