[RFC PATCH v5 9/9] sched/fair: Provide idle search schedstats

From: Steve Sistare <steven.sistare@oracle.com>

Add schedstats to measure the effectiveness of searching for idle CPUs
and stealing tasks.  This is a temporary patch intended for use during
development only.  SCHEDSTAT_VERSION is bumped to 16, and the following
fields are added to the per-CPU statistics of /proc/schedstat:

field 10: # of times select_idle_sibling "easily" found an idle CPU --
          prev or target is idle.
field 11: # of times select_idle_sibling searched via select_idle_capacity
          (asymmetric CPU capacity case).
field 12: # of times select_idle_sibling searched the LLC and found an
          idle CPU.
field 13: # of times select_idle_sibling failed to find anything idle.
field 14: time in nanoseconds spent in functions that search for idle
          CPUs and search for tasks to steal.
field 15: # of times an idle CPU steals a task from another CPU.
field 16: # of times try_steal finds overloaded CPUs but no task is
          migratable.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Signed-off-by: Chen Jinghuang <chenjinghuang2@huawei.com>
---
 kernel/sched/core.c  | 31 +++++++++++++++++++++++--
 kernel/sched/fair.c  | 54 ++++++++++++++++++++++++++++++++++++++++----
 kernel/sched/sched.h |  9 ++++++++
 kernel/sched/stats.c |  9 ++++++++
 kernel/sched/stats.h | 13 +++++++++++
 5 files changed, 109 insertions(+), 7 deletions(-)
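
Not part of the commit, just a review aid: a minimal userspace sketch
(hypothetical, untested) that prints the new per-CPU fields 10-16
described in the changelog above. It assumes the layout produced by
show_schedstat() below, i.e. seven new values appended after the nine
existing per-CPU fields.

#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *f = fopen("/proc/schedstat", "r");
	char line[1024];

	if (!f) {
		perror("/proc/schedstat");
		return 1;
	}

	while (fgets(line, sizeof(line), f)) {
		char cpu[16];
		unsigned long long v[16];
		int n;

		/* skip the version, timestamp and domain lines */
		if (strncmp(line, "cpu", 3))
			continue;

		n = sscanf(line,
			   "%15s %llu %llu %llu %llu %llu %llu %llu %llu %llu"
			   " %llu %llu %llu %llu %llu %llu %llu",
			   cpu, &v[0], &v[1], &v[2], &v[3], &v[4], &v[5],
			   &v[6], &v[7], &v[8], &v[9], &v[10], &v[11],
			   &v[12], &v[13], &v[14], &v[15]);
		if (n < 17)	/* kernel without this patch */
			continue;

		/* v[9]..v[15] are the new fields 10-16 */
		printf("%s easy=%llu capacity=%llu found=%llu none=%llu"
		       " find_ns=%llu steal=%llu steal_fail=%llu\n",
		       cpu, v[9], v[10], v[11], v[12], v[13], v[14], v[15]);
	}

	fclose(f);
	return 0;
}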

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 759777694c78..841a4ca7e173 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4505,17 +4505,44 @@ static int sysctl_numa_balancing(const struct ctl_table *table, int write,
 
 DEFINE_STATIC_KEY_FALSE(sched_schedstats);
 
+unsigned long schedstat_skid;
+
+static void compute_skid(void)
+{
+	int i, n = 0;
+	s64 t;
+	int skid = 0;
+
+	for (i = 0; i < 100; i++) {
+		t = local_clock();
+		t = local_clock() - t;
+		if (t > 0 && t < 1000) {	/* only use sane samples */
+			skid += (int) t;
+			n++;
+		}
+	}
+
+	if (n > 0)
+		schedstat_skid = skid / n;
+	else
+		schedstat_skid = 0;
+	pr_info("schedstat_skid = %lu\n", schedstat_skid);
+}
+
 static void set_schedstats(bool enabled)
 {
-	if (enabled)
+	if (enabled) {
+		compute_skid();
 		static_branch_enable(&sched_schedstats);
-	else
+	} else {
 		static_branch_disable(&sched_schedstats);
+	}
 }
 
 void force_schedstat_enabled(void)
 {
 	if (!schedstat_enabled()) {
+		compute_skid();
 		pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
 		static_branch_enable(&sched_schedstats);
 	}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 500215a57392..ba2b9f811135 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5091,29 +5091,35 @@ static inline void rq_idle_stamp_clear(struct rq *rq)
 static void overload_clear(struct rq *rq)
 {
 	struct sparsemask *overload_cpus;
+	unsigned long time;
 
 	if (!sched_feat(STEAL))
 		return;
 
+	time = schedstat_start_time();
 	rcu_read_lock();
 	overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
 	if (overload_cpus)
 		sparsemask_clear_elem(overload_cpus, rq->cpu);
 	rcu_read_unlock();
+	schedstat_end_time(rq->find_time, time);
 }
 
 static void overload_set(struct rq *rq)
 {
 	struct sparsemask *overload_cpus;
+	unsigned long time;
 
 	if (!sched_feat(STEAL))
 		return;
 
+	time = schedstat_start_time();
 	rcu_read_lock();
 	overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
 	if (overload_cpus)
 		sparsemask_set_elem(overload_cpus, rq->cpu);
 	rcu_read_unlock();
+	schedstat_end_time(rq->find_time, time);
 }
 
 static int try_steal(struct rq *this_rq, struct rq_flags *rf);
@@ -7830,6 +7836,16 @@ static inline bool asym_fits_cpu(unsigned long util,
 	return true;
 }
 
+#define SET_STAT(STAT)							\
+	do {								\
+		if (schedstat_enabled()) {				\
+			struct rq *rq = this_rq();			\
+									\
+			if (rq)						\
+				__schedstat_inc(rq->STAT);		\
+		}							\
+	} while (0)
+
 /*
  * Try and locate an idle core/thread in the LLC cache domain.
  */
@@ -7857,8 +7873,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	lockdep_assert_irqs_disabled();
 
 	if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
-	    asym_fits_cpu(task_util, util_min, util_max, target))
+	    asym_fits_cpu(task_util, util_min, util_max, target)) {
+		SET_STAT(found_idle_cpu_easy);
 		return target;
+	}
 
 	/*
 	 * If the previous CPU is cache affine and idle, don't be stupid:
@@ -7868,8 +7886,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	    asym_fits_cpu(task_util, util_min, util_max, prev)) {
 
 		if (!static_branch_unlikely(&sched_cluster_active) ||
-		    cpus_share_resources(prev, target))
+		    cpus_share_resources(prev, target)) {
+			SET_STAT(found_idle_cpu_easy);
 			return prev;
+		}
 
 		prev_aff = prev;
 	}
@@ -7887,6 +7907,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	    prev == smp_processor_id() &&
 	    this_rq()->nr_running <= 1 &&
 	    asym_fits_cpu(task_util, util_min, util_max, prev)) {
+		SET_STAT(found_idle_cpu_easy);
 		return prev;
 	}
 
@@ -7901,8 +7922,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	    asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
 
 		if (!static_branch_unlikely(&sched_cluster_active) ||
-		    cpus_share_resources(recent_used_cpu, target))
+		    cpus_share_resources(recent_used_cpu, target)) {
+			SET_STAT(found_idle_cpu_easy);
 			return recent_used_cpu;
+		}
 
 	} else {
 		recent_used_cpu = -1;
@@ -7924,13 +7947,16 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 		 */
 		if (sd) {
 			i = select_idle_capacity(p, sd, target);
+			SET_STAT(found_idle_cpu_capacity);
 			return ((unsigned)i < nr_cpumask_bits) ? i : target;
 		}
 	}
 
 	sd = rcu_dereference_all(per_cpu(sd_llc, target));
-	if (!sd)
+	if (!sd) {
+		SET_STAT(nofound_idle_cpu);
 		return target;
+	}
 
 	if (sched_smt_active()) {
 		has_idle_core = test_idle_cores(target);
@@ -7943,8 +7969,12 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	}
 
 	i = select_idle_cpu(p, sd, has_idle_core, target);
-	if ((unsigned)i < nr_cpumask_bits)
+	if ((unsigned)i < nr_cpumask_bits) {
+		SET_STAT(found_idle_cpu);
 		return i;
+	}
+
+	SET_STAT(nofound_idle_cpu);
 
 	/*
 	 * For cluster machines which have lower sharing cache like L2 or
@@ -8580,6 +8610,7 @@ static int
 select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 {
 	int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
+	unsigned long time;
 	struct sched_domain *tmp, *sd = NULL;
 	int cpu = smp_processor_id();
 	int new_cpu = prev_cpu;
@@ -8587,6 +8618,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 	/* SD_flags and WF_flags share the first nibble */
 	int sd_flag = wake_flags & 0xF;
 
+	time = schedstat_start_time();
+
 	/*
 	 * required for stable ->cpus_allowed
 	 */
@@ -8643,6 +8676,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 	}
 	rcu_read_unlock();
 
+	schedstat_end_time(cpu_rq(cpu)->find_time, time);
+
 	return new_cpu;
 }
 
@@ -8981,6 +9016,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 	struct sched_entity *se;
 	struct task_struct *p;
 	int new_tasks;
+	unsigned long time;
 
 again:
 	p = pick_task_fair(rq, rf);
@@ -9038,6 +9074,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 
 idle:
 	if (rf) {
+		time = schedstat_start_time();
 		/*
 		 * We must set idle_stamp _before_ calling try_steal() or
 		 * sched_balance_newidle(), such that we measure the duration
@@ -9052,6 +9089,8 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 		if (new_tasks)
 			rq_idle_stamp_clear(rq);
 
+		schedstat_end_time(rq->find_time, time);
+
 		/*
 		 * Because try_steal() and sched_balance_newidle() release
 		 * (and re-acquire) rq->lock, it is possible for any higher priority
@@ -13215,6 +13254,7 @@ static int steal_from(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked,
 		update_rq_clock(dst_rq);
 		attach_task(dst_rq, p);
 		stolen = 1;
+		schedstat_inc(dst_rq->steal);
 	}
 	local_irq_restore(rf.flags);
 
@@ -13239,6 +13279,7 @@ static int try_steal(struct rq *dst_rq, struct rq_flags *dst_rf)
 	int dst_cpu = dst_rq->cpu;
 	bool locked = true;
 	int stolen = 0;
+	bool any_overload = false;
 	struct sparsemask *overload_cpus;
 
 	if (!sched_feat(STEAL))
@@ -13281,6 +13322,7 @@ static int try_steal(struct rq *dst_rq, struct rq_flags *dst_rf)
 			stolen = 1;
 			goto out;
 		}
+		any_overload = true;
 	}
 
 out:
@@ -13292,6 +13334,8 @@ static int try_steal(struct rq *dst_rq, struct rq_flags *dst_rf)
 	stolen |= (dst_rq->cfs.h_nr_runnable > 0);
 	if (dst_rq->nr_running != dst_rq->cfs.h_nr_runnable)
 		stolen = -1;
+	if (!stolen && any_overload)
+		schedstat_inc(dst_rq->steal_fail);
 	return stolen;
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4989a92eeb9b..530b80fbf897 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1304,6 +1304,15 @@ struct rq {
 	/* try_to_wake_up() stats */
 	unsigned int		ttwu_count;
 	unsigned int		ttwu_local;
+
+	/* Idle search stats */
+	unsigned int		found_idle_cpu_capacity;
+	unsigned int		found_idle_cpu;
+	unsigned int		found_idle_cpu_easy;
+	unsigned int		nofound_idle_cpu;
+	unsigned long		find_time;
+	unsigned int		steal;
+	unsigned int		steal_fail;
 #endif
 
 #ifdef CONFIG_CPU_IDLE
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index d1c9429a4ac5..7063c9712f68 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -129,6 +129,15 @@ static int show_schedstat(struct seq_file *seq, void *v)
 		    rq->rq_cpu_time,
 		    rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
 
+		seq_printf(seq, " %u %u %u %u %lu %u %u",
+			   rq->found_idle_cpu_easy,
+			   rq->found_idle_cpu_capacity,
+			   rq->found_idle_cpu,
+			   rq->nofound_idle_cpu,
+			   rq->find_time,
+			   rq->steal,
+			   rq->steal_fail);
+
 		seq_printf(seq, "\n");
 
 		/* domain-specific stats */
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index a612cf253c87..55f31a4df8fa 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -43,6 +43,17 @@ rq_sched_info_dequeue(struct rq *rq, unsigned long long delta)
 #define   schedstat_set(var, val)	do { if (schedstat_enabled()) { var = (val); } } while (0)
 #define   schedstat_val(var)		(var)
 #define   schedstat_val_or_zero(var)	((schedstat_enabled()) ? (var) : 0)
+#define   schedstat_start_time()	schedstat_val_or_zero(local_clock())
+#define   schedstat_end_time(stat, time)			\
+	do {							\
+		unsigned long endtime;				\
+								\
+		if (schedstat_enabled() && (time)) {		\
+			endtime = local_clock() - (time) - schedstat_skid; \
+			schedstat_add((stat), endtime);		\
+		}						\
+	} while (0)
+extern unsigned long schedstat_skid;
 
 void __update_stats_wait_start(struct rq *rq, struct task_struct *p,
 			       struct sched_statistics *stats);
@@ -81,6 +92,8 @@ static inline void rq_sched_info_depart  (struct rq *rq, unsigned long long delt
 # define   schedstat_set(var, val)	do { } while (0)
 # define   schedstat_val(var)		0
 # define   schedstat_val_or_zero(var)	0
+# define   schedstat_start_time()	0
+# define   schedstat_end_time(stat, t)	do { } while (0)
 
 # define __update_stats_wait_start(rq, p, stats)       do { } while (0)
 # define __update_stats_wait_end(rq, p, stats)         do { } while (0)
-- 
2.34.1