From: Tim Chen
To: Peter Zijlstra, Ingo Molnar, K Prateek Nayak, Gautham R. Shenoy,
    Vincent Guittot
Cc: Tim Chen, Juri Lelli, Dietmar Eggemann, Steven Rostedt, Ben Segall,
    Mel Gorman, Valentin Schneider, Madadi Vineeth Reddy, Hillf Danton,
    Shrikanth Hegde, Jianyong Wu, Yangyu Chen, Tingyin Duan, Vern Hao,
    Len Brown, Aubrey Li, Zhao Liu, Chen Yu, Adam Li, Aaron Lu,
    Josh Don, Gavin Guo, Qais Yousef, Libo Chen,
    linux-kernel@vger.kernel.org
Subject: [Patch v4 12/22] sched/cache: Prioritize tasks preferring destination LLC during balancing
Date: Wed, 1 Apr 2026 14:52:24 -0700
X-Mailer: git-send-email 2.32.0

During LLC load balancing, first check for tasks that prefer the
destination LLC and balance them to it before others.

Mark source sched groups containing tasks preferring non-local LLCs
with the group_llc_balance flag. This ensures the load balancer later
pulls or pushes these tasks toward their preferred LLCs.

The priority of group_llc_balance is lower than that of
group_overloaded and higher than that of all other group types. This
is because group_llc_balance may exacerbate load imbalance, and if the
LLC balancing attempt fails, the nr_balance_failed mechanism will
trigger other group types to rebalance the load.

The load balancer selects the busiest sched_group and migrates tasks
to less busy groups to distribute load across CPUs. With cache-aware
scheduling enabled, the busiest sched_group is the one with the most
tasks preferring the destination LLC. If the group has the llc_balance
flag set, cache-aware load balancing is triggered.

Introduce the helper function update_llc_busiest() to identify the
sched_group with the most tasks preferring the destination LLC.
Suggested-by: K Prateek Nayak
Suggested-by: Madadi Vineeth Reddy
Co-developed-by: Chen Yu
Signed-off-by: Chen Yu
Signed-off-by: Tim Chen
---
Notes:
    v3->v4: Add comments to explain LLC load balance priority.
    (Madadi Vineeth Reddy)

 kernel/sched/fair.c | 79 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 78 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4ed84086244c..c032eeebe191 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9747,6 +9747,16 @@ enum group_type {
 	 * from balancing the load across the system.
 	 */
 	group_imbalanced,
+	/*
+	 * There are tasks running on non-preferred LLC, possible to move
+	 * them to their preferred LLC without creating too much imbalance.
+	 * The priority of group_llc_balance is lower than that of
+	 * group_overloaded and higher than that of all other group types.
+	 * This is because group_llc_balance may exacerbate load imbalance.
+	 * If the LLC balancing attempt fails, the nr_balance_failed
+	 * mechanism will trigger other group types to rebalance the load.
+	 */
+	group_llc_balance,
 	/*
 	 * The CPU is overloaded and can't provide expected CPU cycles to all
 	 * tasks.
@@ -10676,6 +10686,7 @@ struct sg_lb_stats {
 	enum group_type group_type;
 	unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
 	unsigned int group_smt_balance; /* Task on busy SMT be moved */
+	unsigned int group_llc_balance; /* Tasks should be moved to preferred LLC */
 	unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
 #ifdef CONFIG_NUMA_BALANCING
 	unsigned int nr_numa_running;
@@ -10934,6 +10945,9 @@ group_type group_classify(unsigned int imbalance_pct,
 	if (group_is_overloaded(imbalance_pct, sgs))
 		return group_overloaded;
 
+	if (sgs->group_llc_balance)
+		return group_llc_balance;
+
 	if (sg_imbalanced(group))
 		return group_imbalanced;
 
@@ -11128,11 +11142,63 @@ static void record_sg_llc_stats(struct lb_env *env,
 	if (unlikely(READ_ONCE(sd_share->capacity) != sgs->group_capacity))
 		WRITE_ONCE(sd_share->capacity, sgs->group_capacity);
 }
+
+/*
+ * Do LLC balance on sched group that contains LLC, and have tasks preferring
+ * to run on LLC in idle dst_cpu.
+ */
+static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs,
+			       struct sched_group *group)
+{
+	if (!sched_cache_enabled())
+		return false;
+
+	if (env->sd->flags & SD_SHARE_LLC)
+		return false;
+
+	/*
+	 * Skip cache aware tagging if nr_balanced_failed is sufficiently high.
+	 * Threshold of cache_nice_tries is set to 1 higher than nr_balance_failed
+	 * to avoid excessive task migration at the same time.
+	 */
+	if (env->sd->nr_balance_failed >= env->sd->cache_nice_tries + 1)
+		return false;
+
+	if (sgs->nr_pref_dst_llc &&
+	    can_migrate_llc(cpumask_first(sched_group_span(group)),
+			    env->dst_cpu, 0, true) == mig_llc)
+		return true;
+
+	return false;
+}
+
+static bool update_llc_busiest(struct lb_env *env,
+			       struct sg_lb_stats *busiest,
+			       struct sg_lb_stats *sgs)
+{
+	/*
+	 * There are more tasks that want to run on dst_cpu's LLC.
+	 */
+	return sgs->nr_pref_dst_llc > busiest->nr_pref_dst_llc;
+}
 #else
 static inline void record_sg_llc_stats(struct lb_env *env, struct sg_lb_stats *sgs,
 				       struct sched_group *group)
 {
 }
+
+static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs,
+			       struct sched_group *group)
+{
+	return false;
+}
+
+static bool update_llc_busiest(struct lb_env *env,
+			       struct sg_lb_stats *busiest,
+			       struct sg_lb_stats *sgs)
+{
+	return false;
+}
 #endif
 
 /**
@@ -11237,6 +11303,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 		/* Check for loaded SMT group to be balanced to dst CPU */
 		if (smt_balance(env, sgs, group))
 			sgs->group_smt_balance = 1;
+
+		/* Check for tasks in this group can be moved to their preferred LLC */
+		if (llc_balance(env, sgs, group))
+			sgs->group_llc_balance = 1;
 	}
 
 	sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
@@ -11300,6 +11370,10 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 		/* Select the overloaded group with highest avg_load. */
 		return sgs->avg_load > busiest->avg_load;
 
+	case group_llc_balance:
+		/* Select the group with most tasks preferring dst LLC */
+		return update_llc_busiest(env, busiest, sgs);
+
 	case group_imbalanced:
 		/*
 		 * Select the 1st imbalanced group as we don't have any way to
@@ -11562,6 +11636,7 @@ static bool update_pick_idlest(struct sched_group *idlest,
 			return false;
 		break;
 
+	case group_llc_balance:
 	case group_imbalanced:
 	case group_asym_packing:
 	case group_smt_balance:
@@ -11694,6 +11769,7 @@ sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int
 			return NULL;
 		break;
 
+	case group_llc_balance:
 	case group_imbalanced:
 	case group_asym_packing:
 	case group_smt_balance:
@@ -12192,7 +12268,8 @@ static struct sched_group *sched_balance_find_src_group(struct lb_env *env)
 	 * group's child domain.
 	 */
 	if (sds.prefer_sibling && local->group_type == group_has_spare &&
-	    sibling_imbalance(env, &sds, busiest, local) > 1)
+	    (busiest->group_type == group_llc_balance ||
+	     sibling_imbalance(env, &sds, busiest, local) > 1))
 		goto force_balance;
 
 	if (busiest->group_type != group_overloaded) {
-- 
2.32.0