From nobody Sun Apr 12 21:02:22 2026
Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.15])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 0332F3A9612
	for <linux-kernel@vger.kernel.org>; Wed,  1 Apr 2026 21:47:11 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=198.175.65.15
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1775080033; cv=none;
 b=HswBCjxc6gCg2ktl4TA9jB6mvt98fXk9zdmodmEmoE6sxXX3isMzFAbhXLJRMLnNkNclPSUSbKjU++5ayznikI7taHTZdHkhoJzJt0QFvn7RnecMxkDyCynrmPcJc/A/C1XzHHRgXT+UPzxo1CBq067KWfrestr3IfYyfXhTbpE=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1775080033; c=relaxed/simple;
	bh=gGY3XN6kBgzH9PGIBhbAEjPcjnhT6OPGdmZ/VNkoToA=;
	h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References:
	 MIME-Version;
 b=dHnKU1ri/PkIZbrnCAXL+h8fDeSAr9nDB4w5F3CYNtQJ9yKclnY0yx1dstRkfvD1j5XT1yZHM+H/Z42Q53JwzXZqGrK2cmQrBf/loXqd/JQysRBfHbuVOWJfKLQdoFRKx1WuyrX34Tf0pn6d/RNrxGpeomkrM0Ncj8/50/C6+JQ=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=pass (p=none dis=none) header.from=linux.intel.com;
 spf=pass smtp.mailfrom=linux.intel.com;
 dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com
 header.b=WHITaK/P; arc=none smtp.client-ip=198.175.65.15
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=pass (p=none dis=none) header.from=linux.intel.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=linux.intel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com
 header.b="WHITaK/P"
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple;
  d=intel.com; i=@intel.com; q=dns/txt; s=Intel;
  t=1775080032; x=1806616032;
  h=from:to:cc:subject:date:message-id:in-reply-to:
   references:mime-version:content-transfer-encoding;
  bh=gGY3XN6kBgzH9PGIBhbAEjPcjnhT6OPGdmZ/VNkoToA=;
  b=WHITaK/P7WquAgXdEmbyDmXLYhuKe8Y+Crl7Da19PkeUZpoECWDDw1Md
   cN1p5mdpno6nGReUZ3KZHrGtPRmmgtA88B3e/XVRO1qbeYuUoGuvtJOZq
   JyAM3/jY66Jys9Z5zKtY44w/dwrJemYOPAVIuYQ83sLggaZryIIpin3Xc
   3bWoLFYoGHHF1xNGDmHa4/INsessUJKdZ9EDNE3ewSU3Y/XDLsLtPZLWs
   OhFzCdKTvEnliOPXFF9/6ERPmCyyDyYpKKRdqk38YZ9SelslfWF97bxFT
   FCL1eQImXMHIxm9FRbEpA4QmsbaeREFZEhD9TQ1yLJqceQhncpZlu3r53
   w==;
X-CSE-ConnectionGUID: xkk8yBYBQMmeFIn5fSdV4A==
X-CSE-MsgGUID: OXPraJaeSoue5OU/e/lDfQ==
X-IronPort-AV: E=McAfee;i="6800,10657,11746"; a="79740056"
X-IronPort-AV: E=Sophos;i="6.23,153,1770624000";
   d="scan'208";a="79740056"
Received: from fmviesa002.fm.intel.com ([10.60.135.142])
  by orvoesa107.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 01 Apr 2026 14:47:12 -0700
X-CSE-ConnectionGUID: zF7gI6SlRIqrHMkhLj7y4w==
X-CSE-MsgGUID: 591ggyH8TDupGQ6ab3AX9g==
X-ExtLoop1: 1
X-IronPort-AV: E=Sophos;i="6.23,153,1770624000";
   d="scan'208";a="249842487"
Received: from b04f130c83f2.jf.intel.com ([10.165.154.98])
  by fmviesa002.fm.intel.com with ESMTP; 01 Apr 2026 14:47:10 -0700
From: Tim Chen <tim.c.chen@linux.intel.com>
To: Peter Zijlstra <peterz@infradead.org>,
	Ingo Molnar <mingo@redhat.com>,
	K Prateek Nayak <kprateek.nayak@amd.com>,
	"Gautham R . Shenoy" <gautham.shenoy@amd.com>,
	Vincent Guittot <vincent.guittot@linaro.org>
Cc: Chen Yu <yu.c.chen@intel.com>,
	Juri Lelli <juri.lelli@redhat.com>,
	Dietmar Eggemann <dietmar.eggemann@arm.com>,
	Steven Rostedt <rostedt@goodmis.org>,
	Ben Segall <bsegall@google.com>,
	Mel Gorman <mgorman@suse.de>,
	Valentin Schneider <vschneid@redhat.com>,
	Madadi Vineeth Reddy <vineethr@linux.ibm.com>,
	Hillf Danton <hdanton@sina.com>,
	Shrikanth Hegde <sshegde@linux.ibm.com>,
	Jianyong Wu <jianyong.wu@outlook.com>,
	Yangyu Chen <cyy@cyyself.name>,
	Tingyin Duan <tingyin.duan@gmail.com>,
	Vern Hao <vernhao@tencent.com>,
	Vern Hao <haoxing990@gmail.com>,
	Len Brown <len.brown@intel.com>,
	Tim Chen <tim.c.chen@linux.intel.com>,
	Aubrey Li <aubrey.li@intel.com>,
	Zhao Liu <zhao1.liu@intel.com>,
	Chen Yu <yu.chen.surf@gmail.com>,
	Adam Li <adamli@os.amperecomputing.com>,
	Aaron Lu <ziqianlu@bytedance.com>,
	Tim Chen <tim.c.chen@intel.com>,
	Josh Don <joshdon@google.com>,
	Gavin Guo <gavinguo@igalia.com>,
	Qais Yousef <qyousef@layalina.io>,
	Libo Chen <libchen@purestorage.com>,
	linux-kernel@vger.kernel.org
Subject: [Patch v4 17/22] sched/cache: Avoid cache-aware scheduling for
 memory-heavy processes
Date: Wed,  1 Apr 2026 14:52:29 -0700
Message-Id: 
 <339bb2636c7306e17540268a9295a8e673b92804.1775065312.git.tim.c.chen@linux.intel.com>
X-Mailer: git-send-email 2.32.0
In-Reply-To: <cover.1775065312.git.tim.c.chen@linux.intel.com>
References: <cover.1775065312.git.tim.c.chen@linux.intel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

From: Chen Yu <yu.c.chen@intel.com>

Prateek and Tingyin reported that memory-intensive workloads (such as
stream) can saturate memory bandwidth and caches on the preferred LLC
when sched_cache aggregates too many threads.

To mitigate this, use a process's RSS (anonymous and shmem pages) as an
estimate of its memory footprint and compare it to the size of the LLC.
If the RSS exceeds the LLC size, skip cache-aware scheduling.

Note that RSS is only an approximation of the memory footprint.
By default, the comparison is strict, but a later patch will allow
users to provide a hint to adjust this threshold.

According to testing by Adam, some systems do not have a shared L3 but
instead have an L2 shared within each cluster. In this case, the L2
becomes the LLC[1].

Link[1]: https://lore.kernel.org/all/3cb6ebc7-a2fd-42b3-8739-b00e28a09cb6@o=
s.amperecomputing.com/

Suggested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Suggested-by: Vern Hao <vernhao@tencent.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Co-developed-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
---

Notes:
    v3->v4:
       No change.

 include/linux/cacheinfo.h | 21 ++++++++++-------
 kernel/sched/fair.c       | 48 +++++++++++++++++++++++++++++++++++----
 2 files changed, 56 insertions(+), 13 deletions(-)

diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h
index c8f4f0a0b874..82d0d59ca0e1 100644
--- a/include/linux/cacheinfo.h
+++ b/include/linux/cacheinfo.h
@@ -113,18 +113,11 @@ int acpi_get_cache_info(unsigned int cpu,
=20
 const struct attribute_group *cache_get_priv_group(struct cacheinfo *this_=
leaf);
=20
-/*
- * Get the cacheinfo structure for the cache associated with @cpu at
- * level @level.
- * cpuhp lock must be held.
- */
-static inline struct cacheinfo *get_cpu_cacheinfo_level(int cpu, int level)
+static inline struct cacheinfo *_get_cpu_cacheinfo_level(int cpu, int leve=
l)
 {
 	struct cpu_cacheinfo *ci =3D get_cpu_cacheinfo(cpu);
 	int i;
=20
-	lockdep_assert_cpus_held();
-
 	for (i =3D 0; i < ci->num_leaves; i++) {
 		if (ci->info_list[i].level =3D=3D level) {
 			if (ci->info_list[i].attributes & CACHE_ID)
@@ -136,6 +129,18 @@ static inline struct cacheinfo *get_cpu_cacheinfo_leve=
l(int cpu, int level)
 	return NULL;
 }
=20
+/*
+ * Get the cacheinfo structure for the cache associated with @cpu at
+ * level @level.
+ * cpuhp lock must be held.
+ */
+static inline struct cacheinfo *get_cpu_cacheinfo_level(int cpu, int level)
+{
+	lockdep_assert_cpus_held();
+
+	return _get_cpu_cacheinfo_level(cpu, level);
+}
+
 /*
  * Get the id of the cache associated with @cpu at level @level.
  * cpuhp lock must be held.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 077ae7875e2e..a2d1b8b2a188 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1316,6 +1316,37 @@ static inline bool valid_llc_buf(struct sched_domain=
 *sd,
 	return true;
 }
=20
+static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
+{
+	struct cacheinfo *ci;
+	u64 rss, llc;
+
+	/*
+	 * get_cpu_cacheinfo_level() can not be used
+	 * because it requires the cpu_hotplug_lock
+	 * to be held. Use _get_cpu_cacheinfo_level()
+	 * directly because the 'cpu' can not be
+	 * offlined at the moment.
+	 */
+	ci =3D _get_cpu_cacheinfo_level(cpu, 3);
+	if (!ci) {
+		/*
+		 * On systems without an L3 but with a shared
+		 * L2, the L2 becomes the LLC.
+		 */
+		ci =3D _get_cpu_cacheinfo_level(cpu, 2);
+		if (!ci)
+			return true;
+	}
+
+	llc =3D ci->size;
+
+	rss =3D get_mm_counter(mm, MM_ANONPAGES) +
+		get_mm_counter(mm, MM_SHMEMPAGES);
+
+	return (llc < (rss * PAGE_SIZE));
+}
+
 static bool exceed_llc_nr(struct mm_struct *mm, int cpu)
 {
 	return !fits_capacity((mm->sc_stat.nr_running_avg * cpu_smt_num_threads),
@@ -1514,7 +1545,8 @@ void account_mm_sched(struct rq *rq, struct task_stru=
ct *p, s64 delta_exec)
 	if (time_after(epoch,
 		       READ_ONCE(mm->sc_stat.epoch) + EPOCH_LLC_AFFINITY_TIMEOUT) ||
 	    get_nr_threads(p) <=3D 1 ||
-	    exceed_llc_nr(mm, cpu_of(rq))) {
+	    exceed_llc_nr(mm, cpu_of(rq)) ||
+	    exceed_llc_capacity(mm, cpu_of(rq))) {
 		if (mm->sc_stat.cpu !=3D -1)
 			mm->sc_stat.cpu =3D -1;
 	}
@@ -1619,8 +1651,8 @@ static inline void update_avg_scale(u64 *avg, u64 sam=
ple)
=20
 static void task_cache_work(struct callback_head *work)
 {
+	int cpu, m_a_cpu =3D -1, nr_running =3D 0, curr_cpu;
 	struct task_struct *p =3D current, *cur;
-	int cpu, m_a_cpu =3D -1, nr_running =3D 0;
 	unsigned long curr_m_a_occ =3D 0;
 	struct mm_struct *mm =3D p->mm;
 	unsigned long m_a_occ =3D 0;
@@ -1633,7 +1665,9 @@ static void task_cache_work(struct callback_head *wor=
k)
 	if (p->flags & PF_EXITING)
 		return;
=20
-	if (get_nr_threads(p) <=3D 1) {
+	curr_cpu =3D task_cpu(p);
+	if (get_nr_threads(p) <=3D 1 ||
+	    exceed_llc_capacity(mm, curr_cpu)) {
 		if (mm->sc_stat.cpu !=3D -1)
 			mm->sc_stat.cpu =3D -1;
=20
@@ -10144,8 +10178,12 @@ static enum llc_mig can_migrate_llc_task(int src_c=
pu, int dst_cpu,
 	if (cpu < 0 || cpus_share_cache(src_cpu, dst_cpu))
 		return mig_unrestricted;
=20
-	/* skip cache aware load balance for single/too many threads */
-	if (get_nr_threads(p) <=3D 1 || exceed_llc_nr(mm, dst_cpu)) {
+	/*
+	 * Skip cache aware load balance for single/too many threads
+	 * or large memory RSS.
+	 */
+	if (get_nr_threads(p) <=3D 1 || exceed_llc_nr(mm, dst_cpu) ||
+	    exceed_llc_capacity(mm, dst_cpu)) {
 		if (mm->sc_stat.cpu !=3D -1)
 			mm->sc_stat.cpu =3D -1;
 		return mig_unrestricted;
--=20
2.32.0