From nobody Thu Apr 2 17:15:13 2026
From: Tim Chen
To: Peter Zijlstra, Ingo Molnar, K Prateek Nayak, Gautham R. Shenoy,
	Vincent Guittot
Cc: Chen Yu, Juri Lelli, Dietmar Eggemann, Steven Rostedt, Ben Segall,
	Mel Gorman, Valentin Schneider, Madadi Vineeth Reddy, Hillf Danton,
	Shrikanth Hegde, Jianyong Wu, Yangyu Chen, Tingyin Duan, Vern Hao,
	Len Brown, Aubrey Li, Zhao Liu, Adam Li, Aaron Lu, Tim Chen,
	Josh Don, Gavin Guo, Qais Yousef, Libo Chen,
	linux-kernel@vger.kernel.org
Subject: [PATCH v3 03/21] sched/cache: Introduce helper functions to enforce LLC migration policy
Date: Tue, 10 Feb 2026 14:18:43 -0800
Message-Id: <7475922f6020abe5d458a136b0c88fe24e823091.1770760558.git.tim.c.chen@linux.intel.com>

From: Chen Yu

Cache-aware scheduling aggregates threads onto their preferred LLC,
mainly through load balancing. When the preferred LLC becomes
saturated, more threads are still placed there, increasing latency. A
mechanism is needed to limit aggregation so that the preferred LLC
does not become overloaded.

Introduce helper functions can_migrate_llc() and can_migrate_llc_task()
to enforce the LLC migration policy:

1. Aggregate a task into its preferred LLC if both the source and
   destination LLCs are not too busy, or if doing so will not leave
   the preferred LLC much more imbalanced than the non-preferred one
   (a utilization difference above 20%, a little higher than the LLC
   domain's imbalance_pct of 17%, to provide hysteresis).

2. Allow moving a task from an overloaded preferred LLC to a
   non-preferred LLC if this will not leave the non-preferred LLC so
   imbalanced that a later migration back is triggered.

3. If both LLCs are too busy, let generic load balancing spread the
   tasks.
Further (hysteresis) action could be taken in the future to prevent
tasks from being migrated into and out of the preferred LLC frequently
(back and forth): the threshold for migrating a task out of its
preferred LLC should be higher than that for migrating it into the
LLC.

Co-developed-by: Tim Chen
Signed-off-by: Tim Chen
Signed-off-by: Chen Yu
---
Notes:
    v2->v3: No change.

 kernel/sched/fair.c | 153 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 153 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dfeb107f2cfd..bf5f39a01017 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9689,6 +9689,27 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_
 }
 
 #ifdef CONFIG_SCHED_CACHE
+/*
+ * The margin used when comparing LLC utilization with CPU capacity.
+ * It determines the LLC load level where active LLC aggregation is
+ * done.
+ * Derived from fits_capacity().
+ *
+ * (default: ~50%)
+ */
+#define fits_llc_capacity(util, max) \
+	((util) * 2 < (max))
+
+/*
+ * The margin used when comparing utilization:
+ * is 'util1' noticeably greater than 'util2'?
+ * Derived from capacity_greater().
+ * Bias is in percentage.
+ */
+/* Allows dst util to be bigger than src util by up to bias percent */
+#define util_greater(util1, util2) \
+	((util1) * 100 > (util2) * 120)
+
 /* Called from load balancing paths with rcu_read_lock held */
 static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util,
 					 unsigned long *cap)
@@ -9704,6 +9725,138 @@ static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util,
 
 	return true;
 }
+
+/*
+ * Decision matrix based on LLC utilization, used to decide
+ * whether task aggregation across LLCs is allowed.
+ *
+ * By default, 50% is the threshold for treating the LLC
+ * as busy. The reason for choosing 50% is to avoid saturating
+ * SMT-2, and it is also a safe cutoff for other SMT-n
+ * platforms.
+ *
+ * 20% is the utilization imbalance percentage used to decide
+ * whether the preferred LLC is busier than the non-preferred LLC.
+ * 20 is a little higher than the LLC domain's imbalance_pct of
+ * 17. The hysteresis is used to avoid tasks bouncing between the
+ * preferred LLC and the non-preferred LLC.
+ *
+ * 1. Moving towards the preferred LLC; dst is the preferred
+ *    LLC, src is not:
+ *
+ *	src \ dst	30%	40%	50%	60%
+ *	30%		 Y	 Y	 Y	 N
+ *	40%		 Y	 Y	 Y	 Y
+ *	50%		 Y	 Y	 G	 G
+ *	60%		 Y	 Y	 G	 G
+ *
+ * 2. Moving out of the preferred LLC; src is the preferred
+ *    LLC, dst is not:
+ *
+ *	src \ dst	30%	40%	50%	60%
+ *	30%		 N	 N	 N	 N
+ *	40%		 N	 N	 N	 N
+ *	50%		 N	 N	 G	 G
+ *	60%		 Y	 N	 G	 G
+ *
+ * src : src_util
+ * dst : dst_util
+ * Y   : Yes, migrate
+ * N   : No, do not migrate
+ * G   : let Generic load balancing even out the load
+ *
+ * The intention is that if both LLCs are quite busy, cache-aware
+ * load balancing should not be performed and generic load balancing
+ * should take effect. However, if one is busy and the other is not,
+ * the preferred LLC capacity (50%) and imbalance criteria (20%)
+ * should be considered to determine whether LLC aggregation should
+ * be performed to bias the load towards the preferred LLC.
+ */
+
+/* Migration decision; the 3 states are mutually exclusive. */
+enum llc_mig {
+	mig_forbid = 0,		/* N: don't migrate task, respect LLC preference */
+	mig_llc,		/* Y: do LLC preference based migration */
+	mig_unrestricted	/* G: don't restrict generic load balance migration */
+};
+
+/*
+ * Check whether a task can be moved from the source LLC to the
+ * destination LLC without breaking the cache-aware preference.
+ * src_cpu and dst_cpu are arbitrary CPUs within the source
+ * and destination LLCs, respectively.
+ */
+static enum llc_mig can_migrate_llc(int src_cpu, int dst_cpu,
+				    unsigned long tsk_util,
+				    bool to_pref)
+{
+	unsigned long src_util, dst_util, src_cap, dst_cap;
+
+	if (!get_llc_stats(src_cpu, &src_util, &src_cap) ||
+	    !get_llc_stats(dst_cpu, &dst_util, &dst_cap))
+		return mig_unrestricted;
+
+	if (!fits_llc_capacity(dst_util, dst_cap) &&
+	    !fits_llc_capacity(src_util, src_cap))
+		return mig_unrestricted;
+
+	src_util = src_util < tsk_util ? 0 : src_util - tsk_util;
+	dst_util = dst_util + tsk_util;
+	if (to_pref) {
+		/*
+		 * Don't migrate if this would leave the preferred LLC
+		 * too heavily loaded and the destination much busier
+		 * than the source, in which case the migration would
+		 * increase the imbalance too much.
+		 */
+		if (!fits_llc_capacity(dst_util, dst_cap) &&
+		    util_greater(dst_util, src_util))
+			return mig_forbid;
+	} else {
+		/*
+		 * Don't migrate if this would leave the preferred LLC
+		 * too idle, or if this migration would leave the
+		 * non-preferred LLC within sysctl_aggr_imb percent
+		 * of the preferred LLC, triggering a later migration
+		 * back to the preferred LLC.
+		 */
+		if (fits_llc_capacity(src_util, src_cap) ||
+		    !util_greater(src_util, dst_util))
+			return mig_forbid;
+	}
+	return mig_llc;
+}
+
+/*
+ * Check whether task p can migrate from the source LLC to the
+ * destination LLC for cache-aware load balancing.
+ */
+static __maybe_unused enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu,
+							struct task_struct *p)
+{
+	struct mm_struct *mm;
+	bool to_pref;
+	int cpu;
+
+	mm = p->mm;
+	if (!mm)
+		return mig_unrestricted;
+
+	cpu = mm->sc_stat.cpu;
+	if (cpu < 0 || cpus_share_cache(src_cpu, dst_cpu))
+		return mig_unrestricted;
+
+	if (cpus_share_cache(dst_cpu, cpu))
+		to_pref = true;
+	else if (cpus_share_cache(src_cpu, cpu))
+		to_pref = false;
+	else
+		return mig_unrestricted;
+
+	return can_migrate_llc(src_cpu, dst_cpu,
+			       task_util(p), to_pref);
+}
+
 #else
 static inline bool get_llc_stats(int cpu, unsigned long *util,
 				 unsigned long *cap)
-- 
2.32.0