[PATCH v3 21/21] -- DO NOT APPLY!!! -- sched/cache/debug: Add ftrace to track the load balance statistics

Tim Chen posted 21 patches 1 month, 2 weeks ago
[PATCH v3 21/21] -- DO NOT APPLY!!! -- sched/cache/debug: Add ftrace to track the load balance statistics
Posted by Tim Chen 1 month, 2 weeks ago
From: Chen Yu <yu.c.chen@intel.com>

Debug patch only.

Users can leverage these trace events (via bpftrace, etc.)
to monitor cache-aware load balancing activity - specifically,
whether tasks are moved into or out of their preferred LLC,
or whether cache-aware load balancing is skipped because the
memory footprint limit is exceeded or there are too many
active tasks.

Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
---

Notes:
    v2->v3:
    Add more trace events for when the process exceeds the limits
    on LLC size or the number of active threads (moved from
    schedstat to trace events for better bpf tracking)

 include/trace/events/sched.h | 79 ++++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c          | 40 ++++++++++++++----
 2 files changed, 110 insertions(+), 9 deletions(-)

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 7b2645b50e78..b73327653e4b 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -10,6 +10,85 @@
 #include <linux/tracepoint.h>
 #include <linux/binfmts.h>
 
+#ifdef CONFIG_SCHED_CACHE
+TRACE_EVENT(sched_exceed_llc_cap,
+
+	TP_PROTO(struct task_struct *t, int exceeded),
+
+	TP_ARGS(t, exceeded),
+
+	TP_STRUCT__entry(
+		__array( char,	comm,	TASK_COMM_LEN	)
+		__field( pid_t,	pid			)
+		__field( int,	exceeded		)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, t->comm, TASK_COMM_LEN);
+		__entry->pid		= t->pid;
+		__entry->exceeded	= exceeded;
+	),
+
+	TP_printk("comm=%s pid=%d exceed_cap=%d",
+			__entry->comm, __entry->pid,
+			__entry->exceeded)
+);
+
+TRACE_EVENT(sched_exceed_llc_nr,
+
+	TP_PROTO(struct task_struct *t, int exceeded),
+
+	TP_ARGS(t, exceeded),
+
+	TP_STRUCT__entry(
+		__array( char,	comm,	TASK_COMM_LEN	)
+		__field( pid_t,	pid			)
+		__field( int,	exceeded		)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, t->comm, TASK_COMM_LEN);
+		__entry->pid		= t->pid;
+		__entry->exceeded	= exceeded;
+	),
+
+	TP_printk("comm=%s pid=%d exceed_nr=%d",
+			__entry->comm, __entry->pid,
+			__entry->exceeded)
+);
+
+TRACE_EVENT(sched_attach_task,
+
+	TP_PROTO(struct task_struct *t, int pref_cpu, int pref_llc,
+		 int attach_cpu, int attach_llc),
+
+	TP_ARGS(t, pref_cpu, pref_llc, attach_cpu, attach_llc),
+
+	TP_STRUCT__entry(
+		__array( char,	comm,	TASK_COMM_LEN	)
+		__field( pid_t,	pid			)
+		__field( int,	pref_cpu		)
+		__field( int,	pref_llc		)
+		__field( int,	attach_cpu		)
+		__field( int,	attach_llc		)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, t->comm, TASK_COMM_LEN);
+		__entry->pid		= t->pid;
+		__entry->pref_cpu	= pref_cpu;
+		__entry->pref_llc	= pref_llc;
+		__entry->attach_cpu	= attach_cpu;
+		__entry->attach_llc	= attach_llc;
+	),
+
+	TP_printk("comm=%s pid=%d pref_cpu=%d pref_llc=%d attach_cpu=%d attach_llc=%d",
+			__entry->comm, __entry->pid,
+			__entry->pref_cpu, __entry->pref_llc,
+			__entry->attach_cpu, __entry->attach_llc)
+);
+#endif
+
 /*
  * Tracepoint for calling kthread_stop, performed to end a kthread:
  */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 25cee3dd767c..977091fd0e49 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1245,9 +1245,11 @@ static inline int get_sched_cache_scale(int mul)
 	return (1 + (llc_aggr_tolerance - 1) * mul);
 }
 
-static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
+static bool exceed_llc_capacity(struct mm_struct *mm, int cpu,
+				struct task_struct *p)
 {
 	struct cacheinfo *ci;
+	bool exceeded;
 	u64 rss, llc;
 	int scale;
 
@@ -1293,12 +1295,18 @@ static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
 	if (scale == INT_MAX)
 		return false;
 
-	return ((llc * scale) <= (rss * PAGE_SIZE));
+	exceeded = ((llc * scale) <= (rss * PAGE_SIZE));
+
+	trace_sched_exceed_llc_cap(p, exceeded);
+
+	return exceeded;
 }
 
-static bool exceed_llc_nr(struct mm_struct *mm, int cpu)
+static bool exceed_llc_nr(struct mm_struct *mm, int cpu,
+			  struct task_struct *p)
 {
 	int smt_nr = 1, scale;
+	bool exceeded;
 
 #ifdef CONFIG_SCHED_SMT
 	if (sched_smt_active())
@@ -1313,8 +1321,12 @@ static bool exceed_llc_nr(struct mm_struct *mm, int cpu)
 	if (scale == INT_MAX)
 		return false;
 
-	return !fits_capacity((mm->sc_stat.nr_running_avg * smt_nr),
+	exceeded = !fits_capacity((mm->sc_stat.nr_running_avg * smt_nr),
 			(scale * per_cpu(sd_llc_size, cpu)));
+
+	trace_sched_exceed_llc_nr(p, exceeded);
+
+	return exceeded;
 }
 
 static void account_llc_enqueue(struct rq *rq, struct task_struct *p)
@@ -1522,8 +1534,8 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
 	if (time_after(epoch,
 		       READ_ONCE(mm->sc_stat.epoch) + llc_epoch_affinity_timeout) ||
 	    get_nr_threads(p) <= 1 ||
-	    exceed_llc_nr(mm, cpu_of(rq)) ||
-	    exceed_llc_capacity(mm, cpu_of(rq))) {
+	    exceed_llc_nr(mm, cpu_of(rq), p) ||
+	    exceed_llc_capacity(mm, cpu_of(rq), p)) {
 		if (mm->sc_stat.cpu != -1)
 			mm->sc_stat.cpu = -1;
 	}
@@ -1600,7 +1612,7 @@ static void task_cache_work(struct callback_head *work)
 
 	curr_cpu = task_cpu(p);
 	if (get_nr_threads(p) <= 1 ||
-	    exceed_llc_capacity(mm, curr_cpu)) {
+	    exceed_llc_capacity(mm, curr_cpu, p)) {
 		if (mm->sc_stat.cpu != -1)
 			mm->sc_stat.cpu = -1;
 
@@ -10159,8 +10171,8 @@ static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu,
 	 * Skip cache aware load balance for single/too many threads
 	 * or large memory RSS.
 	 */
-	if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu) ||
-	    exceed_llc_capacity(mm, dst_cpu)) {
+	if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu, p) ||
+	    exceed_llc_capacity(mm, dst_cpu, p)) {
 		if (mm->sc_stat.cpu != -1)
 			mm->sc_stat.cpu = -1;
 		return mig_unrestricted;
@@ -10602,6 +10614,16 @@ static void attach_task(struct rq *rq, struct task_struct *p)
 {
 	lockdep_assert_rq_held(rq);
 
+#ifdef CONFIG_SCHED_CACHE
+	if (p->mm) {
+		int pref_cpu = p->mm->sc_stat.cpu;
+
+		trace_sched_attach_task(p,
+					pref_cpu,
+					pref_cpu != -1 ? llc_id(pref_cpu) : -1,
+					cpu_of(rq), llc_id(cpu_of(rq)));
+	}
+#endif
 	WARN_ON_ONCE(task_rq(p) != rq);
 	activate_task(rq, p, ENQUEUE_NOCLOCK);
 	wakeup_preempt(rq, p, 0);
-- 
2.32.0