From: Tim Chen
To: Peter Zijlstra, Ingo Molnar, K Prateek Nayak, Gautham R. Shenoy,
 Vincent Guittot
Cc: Chen Yu, Juri Lelli, Dietmar Eggemann, Steven Rostedt, Ben Segall,
 Mel Gorman, Valentin Schneider, Madadi Vineeth Reddy, Hillf Danton,
 Shrikanth Hegde, Jianyong Wu, Yangyu Chen, Tingyin Duan, Vern Hao,
 Vern Hao, Len Brown, Tim Chen, Aubrey Li, Zhao Liu, Chen Yu, Adam Li,
 Aaron Lu, Tim Chen, Josh Don, Gavin Guo, Qais Yousef, Libo Chen,
 linux-kernel@vger.kernel.org
Subject: [Patch v4 22/22] -- DO NOT APPLY!!! -- sched/cache/debug: Add ftrace to track the load balance statistics
Date: Wed, 1 Apr 2026 14:52:34 -0700
Message-Id: 
X-Mailer: git-send-email 2.32.0
In-Reply-To: 
References: 
MIME-Version: 1.0
Content-Type: text/plain; charset="utf-8"

From: Chen Yu

Debug patch only.
To help investigate potential performance regressions caused by
cache-aware scheduling in the future, introduce these ftrace events.
Users can leverage these trace events (via bpftrace, etc.) to monitor
cache-aware load balancing activity: whether tasks are moved into
their preferred LLC or out of it, whether cache-aware load balancing
is skipped because the memory footprint limit is exceeded or there
are too many active tasks, and why an LLC-preferred migration is
allowed or not.

Together with the existing scheduler events, the newly introduced
events can be used to narrow down a performance regression. For
example, a regression could be caused by excessive task migrations
among CPUs, which can be tracked either by trace_sched_attach_task()
or by checking the return value of select_task_rq_fair().
Alternatively, it could be caused by over-aggregation within a single
LLC, which can be identified via context switch events.

The time spent scanning for the hottest LLC is also recorded, which
can be used to evaluate whether the statistics calculation for
cache-aware scheduling is costly.

Signed-off-by: Chen Yu
Signed-off-by: Tim Chen
---
Notes:
    v3->v4: Add more trace events.
 include/trace/events/sched.h | 140 +++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c          |  64 +++++++++++++---
 2 files changed, 192 insertions(+), 12 deletions(-)

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 7b2645b50e78..8d1d5fa32ad2 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -10,6 +10,146 @@
 #include
 #include
 
+#ifdef CONFIG_SCHED_CACHE
+TRACE_EVENT(sched_llc_mig,
+	TP_PROTO(unsigned long dst_util, unsigned long dst_cap,
+		 unsigned long src_util, unsigned long src_cap,
+		 int to_pref, int mig_hint),
+
+	TP_ARGS(dst_util, dst_cap, src_util, src_cap, to_pref, mig_hint),
+
+	TP_STRUCT__entry(
+		__field(unsigned long, dst_util)
+		__field(unsigned long, dst_cap)
+		__field(unsigned long, src_util)
+		__field(unsigned long, src_cap)
+		__field(int, to_pref)
+		__field(int, mig_hint)
+	),
+
+	TP_fast_assign(
+		__entry->dst_util = dst_util;
+		__entry->dst_cap = dst_cap;
+		__entry->src_util = src_util;
+		__entry->src_cap = src_cap;
+		__entry->to_pref = to_pref;
+		__entry->mig_hint = mig_hint;
+	),
+
+	TP_printk("dst_util=%lu dst_cap=%lu src_util=%lu src_cap=%lu to_pref=%d mig_hint=%d",
+		__entry->dst_util, __entry->dst_cap, __entry->src_util,
+		__entry->src_cap, __entry->to_pref, __entry->mig_hint)
+);
+
+TRACE_EVENT(sched_llc_scan,
+
+	TP_PROTO(struct task_struct *t, unsigned long cost),
+
+	TP_ARGS(t, cost),
+
+	TP_STRUCT__entry(
+		__array(char, comm, TASK_COMM_LEN)
+		__field(pid_t, pid)
+		__field(unsigned long, cost)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, t->comm, TASK_COMM_LEN);
+		__entry->pid = t->pid;
+		__entry->cost = cost;
+	),
+
+	TP_printk("comm=%s pid=%d scan_cost=%lu",
+		__entry->comm, __entry->pid,
+		__entry->cost)
+);
+
+TRACE_EVENT(sched_exceed_llc_cap,
+
+	TP_PROTO(struct task_struct *t, int exceeded, int scale,
+		 unsigned long llc, unsigned long rss),
+
+	TP_ARGS(t, exceeded, scale, llc, rss),
+
+	TP_STRUCT__entry(
+		__array(char, comm, TASK_COMM_LEN)
+		__field(pid_t, pid)
+		__field(int, exceeded)
+		__field(int, scale)
+		__field(unsigned long, llc)
+		__field(unsigned long, rss)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, t->comm, TASK_COMM_LEN);
+		__entry->pid = t->pid;
+		__entry->exceeded = exceeded;
+		__entry->scale = scale;
+		__entry->llc = llc;
+		__entry->rss = rss;
+	),
+
+	TP_printk("comm=%s pid=%d exceed_cap=%d scale=%d llc=%lu rss=%lu",
+		__entry->comm, __entry->pid,
+		__entry->exceeded, __entry->scale,
+		__entry->llc, __entry->rss)
+);
+
+TRACE_EVENT(sched_exceed_llc_nr,
+
+	TP_PROTO(struct task_struct *t, int exceeded),
+
+	TP_ARGS(t, exceeded),
+
+	TP_STRUCT__entry(
+		__array(char, comm, TASK_COMM_LEN)
+		__field(pid_t, pid)
+		__field(int, exceeded)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, t->comm, TASK_COMM_LEN);
+		__entry->pid = t->pid;
+		__entry->exceeded = exceeded;
+	),
+
+	TP_printk("comm=%s pid=%d exceed_nr=%d",
+		__entry->comm, __entry->pid,
+		__entry->exceeded)
+);
+
+TRACE_EVENT(sched_attach_task,
+
+	TP_PROTO(struct task_struct *t, int pref_cpu, int pref_llc,
+		 int attach_cpu, int attach_llc),
+
+	TP_ARGS(t, pref_cpu, pref_llc, attach_cpu, attach_llc),
+
+	TP_STRUCT__entry(
+		__array(char, comm, TASK_COMM_LEN)
+		__field(pid_t, pid)
+		__field(int, pref_cpu)
+		__field(int, pref_llc)
+		__field(int, attach_cpu)
+		__field(int, attach_llc)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, t->comm, TASK_COMM_LEN);
+		__entry->pid = t->pid;
+		__entry->pref_cpu = pref_cpu;
+		__entry->pref_llc = pref_llc;
+		__entry->attach_cpu = attach_cpu;
+		__entry->attach_llc = attach_llc;
+	),
+
+	TP_printk("comm=%s pid=%d pref_cpu=%d pref_llc=%d attach_cpu=%d attach_llc=%d",
+		__entry->comm, __entry->pid,
+		__entry->pref_cpu, __entry->pref_llc,
+		__entry->attach_cpu, __entry->attach_llc)
+);
+#endif
+
 /*
  * Tracepoint for calling kthread_stop, performed to end a kthread:
  */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2b12918b00fd..f446d755f3c5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1337,9 +1337,11 @@ static inline int get_sched_cache_scale(int mul)
 	return (1 + (llc_aggr_tolerance - 1) * mul);
 }
 
-static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
+static bool exceed_llc_capacity(struct mm_struct *mm, int cpu,
+				struct task_struct *p)
 {
 	struct cacheinfo *ci;
+	bool exceeded;
 	u64 rss, llc;
 	int scale;
 
@@ -1385,11 +1387,17 @@ static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
 	if (scale == INT_MAX)
 		return false;
 
-	return ((llc * (u64)scale) < (rss * PAGE_SIZE));
+	exceeded = ((llc * (u64)scale) < (rss * PAGE_SIZE));
+
+	trace_sched_exceed_llc_cap(p, exceeded, scale, llc, rss);
+
+	return exceeded;
 }
 
-static bool exceed_llc_nr(struct mm_struct *mm, int cpu)
+static bool exceed_llc_nr(struct mm_struct *mm, int cpu,
+			  struct task_struct *p)
 {
+	bool exceeded;
 	int scale;
 
 	/*
@@ -1400,8 +1408,12 @@ static bool exceed_llc_nr(struct mm_struct *mm, int cpu)
 	if (scale == INT_MAX)
 		return false;
 
-	return !fits_capacity((mm->sc_stat.nr_running_avg * cpu_smt_num_threads),
+	exceeded = !fits_capacity((mm->sc_stat.nr_running_avg * cpu_smt_num_threads),
 		(scale * per_cpu(sd_llc_size, cpu)));
+
+	trace_sched_exceed_llc_nr(p, exceeded);
+
+	return exceeded;
 }
 
 static void account_llc_enqueue(struct rq *rq, struct task_struct *p)
@@ -1606,8 +1618,8 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
 	if (time_after(epoch, READ_ONCE(mm->sc_stat.epoch) +
 		       llc_epoch_affinity_timeout) ||
 	    get_nr_threads(p) <= 1 ||
-	    exceed_llc_nr(mm, cpu_of(rq)) ||
-	    exceed_llc_capacity(mm, cpu_of(rq))) {
+	    exceed_llc_nr(mm, cpu_of(rq), p) ||
+	    exceed_llc_capacity(mm, cpu_of(rq), p)) {
 		if (mm->sc_stat.cpu != -1)
 			mm->sc_stat.cpu = -1;
 	}
@@ -1718,6 +1730,7 @@ static void task_cache_work(struct callback_head *work)
 	struct mm_struct *mm = p->mm;
 	unsigned long m_a_occ = 0;
 	cpumask_var_t cpus;
+	u64 t0, scan_cost;
 
 	WARN_ON_ONCE(work != &p->cache_work);
 
@@ -1728,7 +1741,7 @@ static void task_cache_work(struct callback_head *work)
 
 	curr_cpu = task_cpu(p);
 	if (get_nr_threads(p) <= 1 ||
-	    exceed_llc_capacity(mm, curr_cpu)) {
+	    exceed_llc_capacity(mm, curr_cpu, p)) {
 		if (mm->sc_stat.cpu != -1)
 			mm->sc_stat.cpu = -1;
 
@@ -1738,6 +1751,8 @@ static void task_cache_work(struct callback_head *work)
 	if (!zalloc_cpumask_var(&cpus, GFP_KERNEL))
 		return;
 
+	t0 = sched_clock_cpu(curr_cpu);
+
 	scoped_guard (cpus_read_lock) {
 		get_scan_cpumasks(cpus, p);
 
@@ -1793,6 +1808,9 @@ static void task_cache_work(struct callback_head *work)
 		}
 	}
 
+	scan_cost = sched_clock_cpu(curr_cpu) - t0;
+	trace_sched_llc_scan(p, scan_cost);
+
 	if (m_a_occ > (2 * curr_m_a_occ)) {
 		/*
 		 * Avoid switching sc_stat.cpu too fast.
@@ -10192,8 +10210,11 @@ static enum llc_mig can_migrate_llc(int src_cpu, int dst_cpu,
 		dst_util = dst_util + tsk_util;
 
 	if (!fits_llc_capacity(dst_util, dst_cap) &&
-	    !fits_llc_capacity(src_util, src_cap))
+	    !fits_llc_capacity(src_util, src_cap)) {
+		trace_sched_llc_mig(dst_util, dst_cap, src_util, src_cap,
+				    to_pref, mig_unrestricted);
 		return mig_unrestricted;
+	}
 
 	if (to_pref) {
 		/*
@@ -10203,8 +10224,11 @@ static enum llc_mig can_migrate_llc(int src_cpu, int dst_cpu,
 		 * increase the imbalance too much.
 		 */
 		if (!fits_llc_capacity(dst_util, dst_cap) &&
-		    util_greater(dst_util, src_util))
+		    util_greater(dst_util, src_util)) {
+			trace_sched_llc_mig(dst_util, dst_cap, src_util, src_cap,
+					    to_pref, mig_forbid);
 			return mig_forbid;
+		}
 	} else {
 		/*
 		 * Don't migrate if we will leave preferred LLC
@@ -10214,9 +10238,15 @@ static enum llc_mig can_migrate_llc(int src_cpu, int dst_cpu,
 		 * back to preferred LLC.
 		 */
 		if (fits_llc_capacity(src_util, src_cap) ||
-		    !util_greater(src_util, dst_util))
+		    !util_greater(src_util, dst_util)) {
+			trace_sched_llc_mig(dst_util, dst_cap, src_util, src_cap,
+					    to_pref, mig_forbid);
 			return mig_forbid;
+		}
 	}
+
+	trace_sched_llc_mig(dst_util, dst_cap, src_util, src_cap,
+			    to_pref, mig_llc);
 	return mig_llc;
 }
 
@@ -10243,8 +10273,8 @@ static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu,
 	 * Skip cache aware load balance for single/too many threads
 	 * or large memory RSS.
 	 */
-	if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu) ||
-	    exceed_llc_capacity(mm, dst_cpu)) {
+	if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu, p) ||
+	    exceed_llc_capacity(mm, dst_cpu, p)) {
 		if (mm->sc_stat.cpu != -1)
 			mm->sc_stat.cpu = -1;
 		return mig_unrestricted;
@@ -10722,6 +10752,16 @@ static void attach_task(struct rq *rq, struct task_struct *p)
 {
 	lockdep_assert_rq_held(rq);
 
+#ifdef CONFIG_SCHED_CACHE
+	if (p->mm) {
+		int pref_cpu = p->mm->sc_stat.cpu;
+
+		trace_sched_attach_task(p,
+			pref_cpu,
+			pref_cpu != -1 ? llc_id(pref_cpu) : -1,
+			cpu_of(rq), llc_id(cpu_of(rq)));
+	}
+#endif
 	WARN_ON_ONCE(task_rq(p) != rq);
 	activate_task(rq, p, ENQUEUE_NOCLOCK);
 	wakeup_preempt(rq, p, 0);
-- 
2.32.0